X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=d26a8a376dd96e3fa4d7be12a9c47bce6584a9ef;hp=d4357cf97559ce4c4b4cdcd8886494e469d06603;hb=bd8d0a889d1d393e64a6d768dc14e9c639a0df8c;hpb=ef45aa7e5e55a37956ce86dc4ce86471f11b018d;ds=sidebyside diff --git a/corpus.py b/corpus.py index d4357cf..d26a8a3 100644 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique import re import sqlite3 import itertools @@ -17,7 +17,6 @@ from operator import itemgetter from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref -from functions import ReadLexique, ReadDicoAsDico from colors import colors import datetime @@ -38,8 +37,7 @@ def copycorpus(corpus) : class Corpus : """Corpus class - list of uci - + list of text """ def __init__(self, parent, parametres = {}, read = False) : self.parent = parent @@ -77,10 +75,10 @@ class Corpus : gramtype = self.parent.lexique[word][1] lem = self.parent.lexique[word][0] elif word.isdigit() : - gramtype = 'num' + gramtype = u'num' lem = word else : - gramtype = 'nr' + gramtype = u'nr' lem = word self.formes[word] = Word(word, gramtype, len(self.formes), lem) self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} @@ -125,6 +123,10 @@ class Corpus : wordid = self.formes[wordid].ident res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,)) return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + + def getworducis(self, wordid) : + res = self.getworduces(wordid) + return list(set([self.getucefromid(uce).uci for uce in res])) def getformeuceseff(self, formeid) : if isinstance(formeid, basestring) : @@ -160,7 +162,7 @@ class Corpus : lemuceeff = {} for i, uce in enumerate(uces) : lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i] - return lemuceeff + return lemuceeff def getlemclustereff(self, lem, cluster) : return len(list(set(self.lc[cluster]).intersection(self.getlemuces(lem)))) @@ -197,6 +199,11 @@ class Corpus : def getconcorde(self, uces) : return self.cuces.execute('select * from uces where id IN (%s);' % ', '.join([`i` for i in uces])) + + def getuciconcorde(self, ucis) : + uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] + uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces] + return uces def getwordconcorde(self, word) : return self.getconcorde(self.getworduces(word)) @@ -206,20 +213,45 @@ class Corpus : def getalluces(self) : return self.cuces.execute('SELECT * FROM uces') - + + def getallucis(self): + uces = [row[1] for row in self.getalluces()] + return [[uci.ident, '\n'.join([uces[uce.ident] for uce in uci.uces])] for uci in self.ucis] + def getucesfrometoile(self, etoile) : return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] def getetoileuces(self) : log.info('get uces etoiles') etoileuces = {} + idpara = 0 for uci in self.ucis : - etoiles = uci.etoiles[1:] + uci.paras + etoiles = uci.etoiles[1:] for et in etoiles : if et in etoileuces : etoileuces[et] += [uce.ident for uce in uci.uces] else : etoileuces[et] = [uce.ident for uce in uci.uces] + if uci.paras != [] : + for et in uci.paras : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara] + else : + etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara] + idpara += 1 + else : + idpara += 1 + return etoileuces + + def getetoileucis(self): + etoileuces = {} + for uci in self.ucis : + etoiles = uci.etoiles[1:] + for et in etoiles : + if et in etoileuces : + etoileuces[et] += [uci.ident] + else : + etoileuces[et] = [uci.ident] return etoileuces def getucefromid(self, uceid) : @@ -265,14 +297,18 @@ class Corpus : if self.iduces is None : self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) - def make_lexitable(self, mineff, etoiles) : - tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff] + def make_lexitable(self, mineff, etoiles, gram = 0) : + if gram == 0 : + grams = {1:'', 2:''} + else : + grams = {gram :''} + tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams] etuces = [[] for et in etoiles] for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) if len(get) > 1 : - return '2 variables sur la meme ligne' - elif get != [] : + log.info('2 variables sur une ligne') + if get != [] : etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] etuces = [set(val) for val in etuces] tab = [] @@ -401,28 +437,84 @@ class Corpus : ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') - def export_corpus_classes(self, outf, alc = True, lem = False) : + def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : for uce in lc : ucecl[uce] = i + 1 for uce in self.lc0 : ucecl[uce] = 0 - res = self.getalluces() - self.make_iduces() + if not uci : + res = self.getalluces() + self.make_iduces() + else : + res = self.getallucis() with open(outf, 'w') as f : for uce in res : guce = uce[1] - actuci = self.iduces[uce[0]].uci + if not uci : + actuci = self.iduces[uce[0]].uci + else : + actuci = uce[0] if lem : guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) if alc : - etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) + etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) else : - etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]]) + etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]]) f.write(etline.encode(self.parametres['syscoding']) + '\n') f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + def export_classe(self, outf, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + if not uci : + f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n') + else : + f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n') + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + + def export_owledge(self, rep, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + for uce in res : + ident = uce[0] + guce = uce[1] + outf = '.'.join([`ident`, 'txt']) + outf = os.path.join(rep, outf) + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + with open(outf, 'w') as f : + f.write(guce.encode('cp1252', errors = 'replace')) + + def export_tropes(self, fileout, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + with open(fileout, 'w') as f : + for uce in res : + guce = uce[1] + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode('cp1252', errors = 'replace')) + f.write('\n') + def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) nbl = 0 @@ -475,15 +567,30 @@ class Corpus : ffin.write(line) os.remove(outfile + '~') - def make_table_with_classe(self, uces, list_act) : + def make_table_with_classe(self, uces, list_act, uci = False) : table_uce = [[0 for val in list_act] for line in range(0,len(uces))] uces = dict([[uce, i] for i, uce in enumerate(uces)]) + if uci : + getlem = self.getlemucis + else : + getlem = self.getlemuces for i, lem in enumerate(list_act) : - lemuces = list(set(self.getlemuces(lem)).intersection(uces)) + lemuces = list(set(getlem(lem)).intersection(uces)) for uce in lemuces : table_uce[uces[uce]][i] = 1 table_uce.insert(0, list_act) return table_uce + + def make_pondtable_with_classe(self, uces, list_act) : + table_uce = [[0 for val in list_act] for line in range(0,len(uces))] + uces = dict([[uce, i] for i, uce in enumerate(uces)]) + for i, lem in enumerate(list_act) : + uceseff = self.getlemuceseff(lem) + lemuces = list(set(uceseff.keys()).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = uceseff[uce] + table_uce.insert(0, list_act) + return table_uce def parse_active(self, gramact, gramsup = None) : log.info('parse actives') @@ -492,7 +599,7 @@ class Corpus : self.lems[lem].act = 2 elif self.lems[lem].gram in gramact : self.lems[lem].act = 1 - elif gramsup is not None : + elif gramsup is not None and self.lems[lem].gram not in gramact: if self.lems[lem].gram in gramsup : self.lems[lem].act = 2 else : @@ -512,6 +619,8 @@ class Corpus : allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3] self.activenb = len(allactives) allactives = sorted(allactives, reverse = True) + if self.activenb == 0 : + return [], 0 if len(allactives) <= nbmax : log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0])) return [val[1] for val in allactives], allactives[-1][0] @@ -532,9 +641,12 @@ class Corpus : log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim)) return [val[1] for val in allactives[0:stop + 1]], lim - def make_and_write_profile(self, actives, ucecl, fileout) : + def make_and_write_profile(self, actives, ucecl, fileout, uci = False) : log.info('formes/classes') - tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives] + if uci : + tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives] + else : + tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives] tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3] with open(fileout, 'w') as f : f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding'])) @@ -542,7 +654,7 @@ class Corpus : def make_etoiles(self) : etoiles = set([]) for uci in self.ucis : - etoiles.update(uci.etoiles[1:] + uci.paras) + etoiles.update(uci.etoiles[1:]) return list(etoiles) def make_etoiles_dict(self) : @@ -553,16 +665,16 @@ class Corpus : if et[0] in det : try : endet = '_'.join(et[1:]) - if endet in det[et[0]] : - det[et[0]][endet] += 1 + if etoile in det[et[0]] : + det[et[0]][etoile] += 1 else : - det[et[0]][endet] = 1 + det[et[0]][etoile] = 1 except IndexError : det[et[0]] += 1 else : try : endet = '_'.join(et[1:]) - det[et[0]] = {endet :1} + det[et[0]] = {etoile :1} except IndexError : det[et[0]] = 1 return det @@ -577,9 +689,12 @@ class Corpus : etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] return etuces - def make_and_write_profile_et(self, ucecl, fileout) : + def make_and_write_profile_et(self, ucecl, fileout, uci = False) : log.info('etoiles/classes') - etoileuces = self.getetoileuces() + if not uci : + etoileuces = self.getetoileuces() + else : + etoileuces = self.getetoileucis() etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1]) with open(fileout, 'w') as f : f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding'])) @@ -587,7 +702,7 @@ class Corpus : #with open(fileout, 'w') as f : # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) - def make_colored_corpus(self) : + def make_colored_corpus(self, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : for uce in lc : @@ -599,17 +714,28 @@ class Corpus : ''' % sys.getdefaultencoding() - res = self.getalluces() - self.make_iduces() - actuci = '' - actpara = False - for uce in res : - if self.iduces[uce[0]].uci != actuci : - actuci = self.iduces[uce[0]].uci - txt += '

' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '

' - txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' - else : - txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + if not uci : + res = self.getalluces() + self.make_iduces() + actuci = '' + actpara = False + for uce in res : + if self.iduces[uce[0]].uci != actuci : + actuci = self.iduces[uce[0]].uci + txt += '

' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '

' + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + res = self.getallucis() + actuci = '' + for uce in res : + if self.ucis[uce[0]].ident != actuci : + actuci = self.ucis[uce[0]].ident + txt += '

' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '

' + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' return txt + '\n' def count_from_list(self, l, d) : @@ -641,9 +767,13 @@ class Corpus : l = l[-taille_limite:] return l - def find_segments_in_classe(self, list_uce, taille_segment, taille_limite): + def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False): d={} - for uce in self.getconcorde(list_uce) : + if not uci : + concorde = self.getconcorde + else : + concorde = self.getuciconcorde + for uce in concorde(list_uce) : uce = uce[1].split() d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d) l = [[d[val], val, taille_segment] for val in d if d[val] >= 3] @@ -697,27 +827,37 @@ class Corpus : self.lc0 = self.lc.pop(0) #return ucecl - def get_stat_by_cluster(self, outf) : + def get_stat_by_cluster(self, outf, lclasses = None) : log.info('get_stat_by_cluster') + if lclasses is None : + lclasses = self.lc t1 = time() - occurrences = dict([[i + 1, 0] for i in range(len(self.lc))]) - formescl = dict([[i + 1, 0] for i in range(len(self.lc))]) - hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))]) - lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)]) - sets = [set(cl) for cl in self.lc] + occurrences = dict([[i + 1, 0] for i in range(len(lclasses))]) + formescl = dict([[i + 1, 0] for i in range(len(lclasses))]) + hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))]) + lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)]) + sets = [set(cl) for cl in lclasses] for forme in self.formes : formeuceeff = self.getformeuceseff(forme) - for i, classe in enumerate(self.lc) : + for i, classe in enumerate(lclasses) : concern = sets[i].intersection(formeuceeff.keys()) if len(concern) : occurrences[i+1] += sum([formeuceeff[uce] for uce in concern]) formescl[i+1] += 1 if self.formes[forme].freq == 1 : hapaxcl[i+1] += 1 - toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) - with open(outf, 'w') as f : - f.write(toprint) log.info('%f' % (time() - t1)) + if outf is not None : + toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) + with open(outf, 'w') as f : + f.write(toprint) + else : + return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences] + + def get_stat_by_et(self, outf, etoiles) : + lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles] + stats = self.get_stat_by_cluster(None, lclasses) + stats = [[etoiles[i]] + val for i, val in enumerate(stats)] def gethapaxbyet(self, etoiles) : hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] @@ -776,14 +916,29 @@ class Corpus : with open('/tmp/testhapxuce.html','w') as f : f.write(txt) + def export_dictionary(self, fileout, syscoding) : + listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes] + listformes.sort(reverse = True) + listformes = [forme[1:] + [`forme[0]`] for forme in listformes] + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding)) + + def export_lems(self, fileout, syscoding) : + self.make_idformes() + listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems] + listlem.sort() + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding)) + + + class MakeUciStat : def __init__(self, corpus) : ucinb = corpus.getucinb() ucisize = corpus.getucisize() ucimean = float(sum(ucisize))/float(ucinb) - detoile = corpus.make_etoiles_dict() - + detoile = corpus.make_etoiles_dict() class Uci : def __init__(self, iduci, line, paraset = None) : @@ -1011,10 +1166,10 @@ class BuildCorpus : self.cleans.append(self.dotiret) def make_expression(self,txt) : - for expression in self.expressions: + for expression in self.expressions: if expression in txt : txt = txt.replace(expression, self.expressions[expression][0]) - return txt + return txt def dolower(self, txt) : return txt.lower() @@ -1033,7 +1188,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'’',"'") txt = txt.replace(u'œ', u'oe') - return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ') + return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ') def make_cleans(self, txt) : for clean in self.cleans : @@ -1093,7 +1248,7 @@ class BuildFromAlceste(BuildCorpus) : try : with codecs.open(infile, 'r', self.encoding) as f : for linenb, line in enumerate(f) : - line = line.rstrip('\n\r') + line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8) if self.testuci(line) : iduci += 1 if txt != [] : @@ -1106,7 +1261,6 @@ class BuildFromAlceste(BuildCorpus) : log.info(u'Empty text : %i' % linenb) iduci -= 1 self.corpus.ucis.pop() - #raise Exception("EmptyText %i" % linenb) self.corpus.ucis.append(Uci(iduci, line)) if self.dlg is not None : if not (iduci + 1) % 10 : @@ -1119,19 +1273,24 @@ class BuildFromAlceste(BuildCorpus) : idpara += 1 self.corpus.ucis[-1].paras.append(line.split()[0]) else : - raise Exception('paragrapheOT') + raise Exception('paragrapheOT %i' % linenb) elif line.strip() != '' and iduci != -1 : txt.append(line) if txt != [] and iduci != -1 : iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) del(txt) else : - raise Exception("EmptyText") + if iduci != -1 : + iduci -= 1 + self.corpus.ucis.pop() + log.info(Exception("Empty text %i" % linenb)) + else : + raise Exception('EmptyText %i' % linenb) if iduci != -1 and iduce != -1: self.backup_uce() else : log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) - raise Exception('TextBeforeTextMark') + raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1202,7 +1361,10 @@ class Builder : parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) ReadLexique(self.parent, lang = parametres['lang']) - self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): + self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + else : + self.parent.expressions = {} self.parametres = parametres else : if self.dlg is not None : @@ -1215,6 +1377,6 @@ class Builder : if __name__ == '__main__' : t1 = time() - parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding} + parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'} intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes) print time() - t1