X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=51061fbb2006bb58385173f235a35d4cbdc5f147;hp=e0437077ff0fc5642a68f89a82ca2b5777da7ea3;hb=b19770356272772c8c8ba75f351520eca186bd19;hpb=d1d24d86422c9e9805516190ea17a379201f9300 diff --git a/corpus.py b/corpus.py index e043707..51061fb 100644 --- a/corpus.py +++ b/corpus.py @@ -11,7 +11,6 @@ from time import time from functions import decoupercharact, ReadDicoAsDico, DoConf import re import sqlite3 -import numpy import itertools import logging from operator import itemgetter @@ -211,6 +210,28 @@ class Corpus : def getucesfrometoile(self, etoile) : return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + def getetoileuces(self) : + log.info('get uces etoiles') + etoileuces = {} + idpara = 0 + for uci in self.ucis : + etoiles = uci.etoiles[1:] + for et in etoiles : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces] + else : + etoileuces[et] = [uce.ident for uce in uci.uces] + if uci.paras != [] : + for et in uci.paras : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara] + else : + etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara] + idpara += 1 + else : + idpara += 1 + return etoileuces + def getucefromid(self, uceid) : if self.iduces is None : self.make_iduces() return self.iduces[uceid] @@ -260,8 +281,8 @@ class Corpus : for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) if len(get) > 1 : - return '2 variables sur la meme ligne' - elif get != [] : + log.info('2 variables sur une ligne') + if get != [] : etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] etuces = [set(val) for val in etuces] tab = [] @@ -531,7 +552,7 @@ class Corpus : def make_etoiles(self) : etoiles = set([]) for uci in self.ucis : - etoiles.update(uci.etoiles[1:] + uci.paras) + etoiles.update(uci.etoiles[1:]) return list(etoiles) def make_etoiles_dict(self) : @@ -542,16 +563,16 @@ class Corpus : if et[0] in det : try : endet = '_'.join(et[1:]) - if endet in det[et[0]] : - det[et[0]][endet] += 1 + if etoile in det[et[0]] : + det[et[0]][etoile] += 1 else : - det[et[0]][endet] = 1 + det[et[0]][etoile] = 1 except IndexError : det[et[0]] += 1 else : try : endet = '_'.join(et[1:]) - det[et[0]] = {endet :1} + det[et[0]] = {etoile :1} except IndexError : det[et[0]] = 1 return det @@ -565,13 +586,16 @@ class Corpus : elif get != [] : etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] return etuces - def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') - etoiles = self.make_etoiles() + etoileuces = self.getetoileuces() + etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1]) with open(fileout, 'w') as f : - f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) + f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding'])) + #etoiles = self.make_etoiles() + #with open(fileout, 'w') as f : + # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) def make_colored_corpus(self) : ucecl = {} @@ -762,6 +786,22 @@ class Corpus : with open('/tmp/testhapxuce.html','w') as f : f.write(txt) + def export_dictionary(self, fileout, syscoding) : + listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes] + listformes.sort(reverse = True) + listformes = [forme[1:] + [`forme[0]`] for forme in listformes] + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding)) + + def export_lems(self, fileout, syscoding) : + self.make_idformes() + listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems] + listlem.sort() + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding)) + + + class MakeUciStat : def __init__(self, corpus) : @@ -1092,7 +1132,6 @@ class BuildFromAlceste(BuildCorpus) : log.info(u'Empty text : %i' % linenb) iduci -= 1 self.corpus.ucis.pop() - #raise Exception("EmptyText %i" % linenb) self.corpus.ucis.append(Uci(iduci, line)) if self.dlg is not None : if not (iduci + 1) % 10 : @@ -1105,19 +1144,24 @@ class BuildFromAlceste(BuildCorpus) : idpara += 1 self.corpus.ucis[-1].paras.append(line.split()[0]) else : - raise Exception('paragrapheOT') + raise Exception('paragrapheOT %i' % linenb) elif line.strip() != '' and iduci != -1 : txt.append(line) if txt != [] and iduci != -1 : iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) del(txt) else : - raise Exception("EmptyText") + if iduci != -1 : + iduci -= 1 + self.corpus.ucis.pop() + log.info(Exception("Empty text %i" % linenb)) + else : + raise Exception('EmptyText %i' % linenb) if iduci != -1 and iduce != -1: self.backup_uce() else : log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) - raise Exception('TextBeforeTextMark') + raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1188,7 +1232,10 @@ class Builder : parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) ReadLexique(self.parent, lang = parametres['lang']) - self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): + self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + else : + self.parent.expressions = {} self.parametres = parametres else : if self.dlg is not None :