X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=080b980ff7bbe13c4ff0d665db751a2af92fbfd7;hp=e6836308acc16d00490c3a392f831c3016a91f99;hb=80f4bfad30ece8835cb1f91349b1dda36439e4ca;hpb=441459f18ef9957b592f26743eea037bca431e55 diff --git a/corpus.py b/corpus.py old mode 100644 new mode 100755 index e683630..080b980 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar import re import sqlite3 import itertools @@ -16,8 +16,8 @@ import logging from operator import itemgetter from uuid import uuid4 from chemins import PathOut -from dialog import CorpusPref -from functions import ReadLexique, ReadDicoAsDico +from dialog import CorpusPref, SubTextFromMetaDial, MergeClusterFrame +from copy import copy from colors import colors import datetime @@ -34,6 +34,17 @@ def copycorpus(corpus) : copy_corpus.conn_all() return copy_corpus +def CopyUce(uce) : + return Uce(uce.ident, uce.para, uce.uci) + + +def CopyUci(uci): + nuci = Uci(uci.ident, '') + nuci.etoiles = copy(uci.etoiles) + nuci.uces = [CopyUce(uce) for uce in uci.uces] + nuci.paras = copy(uci.paras) + return nuci + class Corpus : @@ -43,7 +54,7 @@ class Corpus : def __init__(self, parent, parametres = {}, read = False) : self.parent = parent self.parametres = parametres - self.cformes = None + self.cformes = None self.connformes = None self.connuces = None self.conncorpus = None @@ -84,6 +95,20 @@ class Corpus : self.formes[word] = Word(word, gramtype, len(self.formes), lem) self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} + def add_word_from_forme(self, word, stident): + if word.forme in self.formes : + self.formes[word.forme].freq += 1 + if self.formes[word.forme].ident in self.idformesuces : + if stident in self.idformesuces[self.formes[word.forme].ident] : + self.idformesuces[self.formes[word.forme].ident][stident] += 1 + else : + self.idformesuces[self.formes[word.forme].ident][stident] = 1 + else : + self.idformesuces[self.formes[word.forme].ident] = {stident: 1} + else : + self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem) + self.idformesuces[self.formes[word.forme].ident] = {stident : 1} + def conn_all(self): """connect corpus to db""" if self.connformes is None : @@ -118,13 +143,17 @@ class Corpus : res = self.ccorpus.execute('SELECT * FROM formes;') self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res]) self.ccorpus.close() - + def getworduces(self, wordid) : if isinstance(wordid, basestring) : wordid = self.formes[wordid].ident res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,)) return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + def getworducis(self, wordid) : + res = self.getworduces(wordid) + return list(set([self.getucefromid(uce).uci for uce in res])) + def getformeuceseff(self, formeid) : if isinstance(formeid, basestring) : formeid = self.formes[formeid].ident @@ -136,7 +165,7 @@ class Corpus : formeuceeff = {} for i, uce in enumerate(uces) : formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i] - return formeuceeff + return formeuceeff def getlemuces(self, lem) : formesid = ', '.join([`val` for val in self.lems[lem].formes]) @@ -144,6 +173,34 @@ class Corpus : res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + def gettgenst(self, tgen): + formesid = [] + for lem in tgen : + if lem in self.lems : + formesid += self.lems[lem].formes + else : + print 'abscent : %s' % lem + query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid)) + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgenstprof(self, tgen, classe, i, clnb): + tgenst = [] + for lem in tgen : + if lem in self.lems : + lemst = self.getlemuces(lem) + tgenst += lemst + if not lem in self.tgenlem : + self.tgenlem[lem] = [0] * clnb + self.tgenlem[lem][i] = len(set(lemst).intersection(classe)) + else : + print 'abscent: ',lem + return list(set(tgenst)) + + def gettgentxt(self, tgen): + sts = self.gettgenst(tgen) + return list(set([self.getucefromid(val).uci for val in sts])) + def getlemucis(self, lem) : uces = self.getlemuces(lem) return list(set([self.getucefromid(val).uci for val in uces])) @@ -159,7 +216,7 @@ class Corpus : lemuceeff = {} for i, uce in enumerate(uces) : lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i] - return lemuceeff + return lemuceeff def getlemclustereff(self, lem, cluster) : return len(list(set(self.lc[cluster]).intersection(self.getlemuces(lem)))) @@ -189,13 +246,23 @@ class Corpus : def getucisize(self) : ucesize = self.getucesize() return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis] - + def getucesize(self) : res = self.getalluces() return [len(uce[1].split()) for uce in res] def getconcorde(self, uces) : - return self.cuces.execute('select * from uces where id IN (%s);' % ', '.join([`i` for i in uces])) + return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([`i` for i in uces])) + + def getuciconcorde(self, ucis) : + uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] + uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces] + return uces + + def getuciconcorde_uces(self, uciid, uceid) : + uces = [uce.ident for uce in self.ucis[uciid].uces] + uces = [row for row in self.getconcorde(uces)] + return uces def getwordconcorde(self, word) : return self.getconcorde(self.getworduces(word)) @@ -206,6 +273,10 @@ class Corpus : def getalluces(self) : return self.cuces.execute('SELECT * FROM uces') + def getallucis(self): + uces = [row[1] for row in self.getalluces()] + return [[uci.ident, '\n'.join([uces[uce.ident] for uce in uci.uces])] for uci in self.ucis] + def getucesfrometoile(self, etoile) : return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] @@ -231,6 +302,17 @@ class Corpus : idpara += 1 return etoileuces + def getetoileucis(self): + etoileuces = {} + for uci in self.ucis : + etoiles = uci.etoiles[1:] + for et in etoiles : + if et in etoileuces : + etoileuces[et] += [uci.ident] + else : + etoileuces[et] = [uci.ident] + return etoileuces + def getucefromid(self, uceid) : if self.iduces is None : self.make_iduces() return self.iduces[uceid] @@ -266,7 +348,31 @@ class Corpus : self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme]) else : self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes]) - + + def make_lems_from_dict(self, dictionnaire, dolem = True) : + log.info('make lems from dict') + self.lems = {} + for forme in self.formes : + if self.formes[forme].forme in dictionnaire : + lem = dictionnaire[forme][0] + gram = dictionnaire[forme][1] + elif forme.isdigit() : + gram = u'num' + lem = forme + else : + gram = u'nr' + lem = forme + self.formes[forme].lem = lem + self.formes[forme].gram = gram + if dolem : + if self.formes[forme].lem in self.lems : + if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes : + self.lems[self.formes[forme].lem].add_forme(self.formes[forme]) + else : + self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme]) + else : + self.lems[forme] = Lem(self, self.formes[forme]) + def make_idformes(self) : self.idformes = dict([[self.formes[forme].ident, self.formes[forme]] for forme in self.formes]) @@ -275,6 +381,7 @@ class Corpus : self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) def make_lexitable(self, mineff, etoiles, gram = 0) : + log.info('making lexical table...') if gram == 0 : grams = {1:'', 2:''} else : @@ -292,9 +399,57 @@ class Corpus : for lem in tokeep : deff = self.getlemuceseff(lem) ucesk = deff.keys() - tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]) + line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + if sum(line[1:]) >= mineff : + tab.append(line) tab.insert(0, [''] + etoiles) return tab + + def make_tgen_table(self, tgen, etoiles, tot = None): + lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles] + sets = [set(cl) for cl in lclasses] + totoccurrences = dict([[val, 0] for val in etoiles]) + if tot is None : + for forme in self.formes : + formeuceeff = self.getformeuceseff(forme) + for i, classe in enumerate(lclasses) : + concern = sets[i].intersection(formeuceeff.keys()) + if len(concern) : + totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern]) + #tgenoccurrences = dict([[val, 0] for val in etoiles]) + tgenoccurrences = {} + for t in tgen.tgen : + tgenoccurrences[t] = dict([[val, 0] for val in etoiles]) + for lem in tgen[t] : + lemuceeff = self.getlemuceseff(lem) + for i, classe in enumerate(lclasses) : + concern = sets[i].intersection(lemuceeff.keys()) + if len(concern) : + tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern]) + return tgenoccurrences, totoccurrences + + def make_tgen_profile(self, tgen, ucecl, uci = False) : + log.info('tgen/classes') + self.tgenlem = {} + clnb = len(ucecl) + if uci : + #FIXME : NE MARCHE PLUS CHANGER CA + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] + else : + tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] + tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] + return tab + #i = 0 + #nam = 'total' + #while nam + `i` in tgen : + # i += 1 + #nam = nam + `i` + #last = [nam] + [`len(classe)` for classe in ucecl] + #tab += [last] + #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))] + #tab = [line0] + tab + #with open(fileout, 'w') as f : + # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding'])) def make_efftype_from_etoiles(self, etoiles) : dtype = {} @@ -414,39 +569,91 @@ class Corpus : ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') - def export_corpus_classes(self, outf, alc = True, lem = False) : + def export_meta_table(self, outf) : + metas = [[`i`] + text.etoiles[1:] for i, text in enumerate(self.ucis)] + longueur_max = max([len(val) for val in metas]) + first = ['column_%i' % i for i in range(longueur_max)] + metas.insert(0, first) + with open(outf, 'w') as f : + f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding'])) + + def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : for uce in lc : ucecl[uce] = i + 1 for uce in self.lc0 : ucecl[uce] = 0 - res = self.getalluces() - self.make_iduces() + if not uci : + res = self.getalluces() + self.make_iduces() + else : + res = self.getallucis() with open(outf, 'w') as f : for uce in res : guce = uce[1] - actuci = self.iduces[uce[0]].uci + if not uci : + actuci = self.iduces[uce[0]].uci + else : + actuci = uce[0] if lem : guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) if alc : - etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) + etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) else : - etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]]) + etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]]) f.write(etline.encode(self.parametres['syscoding']) + '\n') f.write(guce.encode(self.parametres['syscoding']) + '\n\n') - def export_classe(self, outf, classe, lem = False) : - sts = self.lc[classe - 1] - res = self.getconcorde(sts) - self.make_iduces() + def export_classe(self, outf, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) with open(outf, 'w') as f : for uce in res : guce = uce[1] - f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n') + if not uci : + f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n') + else : + f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n') if lem : guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + + def export_owledge(self, rep, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + for uce in res : + ident = uce[0] + guce = uce[1] + outf = '.'.join([`ident`, 'txt']) + outf = os.path.join(rep, outf) + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + with open(outf, 'w') as f : + f.write(guce.encode('cp1252', errors = 'replace')) + + def export_tropes(self, fileout, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + with open(fileout, 'w') as f : + for uce in res : + guce = uce[1] + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode('cp1252', errors = 'replace')) + f.write('\n') def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) @@ -457,7 +664,7 @@ class Corpus : nbl += 1 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n'])) f.seek(0) - with open(outfile, 'w') as ffin : + with open(outfile, 'w') as ffin : ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl)) for line in f : ffin.write(line) @@ -475,7 +682,7 @@ class Corpus : nbl += 1 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n'])) f.seek(0) - with open(outfile, 'w') as ffin : + with open(outfile, 'w') as ffin : ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl)) for line in f : ffin.write(line) @@ -483,7 +690,7 @@ class Corpus : if listuci : with open(listuci, 'w') as f : f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())])) - + def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) : log.info('make_and_write_sparse_matrix_from_classe %s' % outfile) nbl = 0 @@ -494,22 +701,26 @@ class Corpus : for uce in uces_ok : f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n'])) f.seek(0) - with open(outfile, 'w') as ffin : - ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl)) + with open(outfile, 'w') as ffin : + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl)) for line in f : ffin.write(line) os.remove(outfile + '~') - - def make_table_with_classe(self, uces, list_act) : + + def make_table_with_classe(self, uces, list_act, uci = False) : table_uce = [[0 for val in list_act] for line in range(0,len(uces))] uces = dict([[uce, i] for i, uce in enumerate(uces)]) + if uci : + getlem = self.getlemucis + else : + getlem = self.getlemuces for i, lem in enumerate(list_act) : - lemuces = list(set(self.getlemuces(lem)).intersection(uces)) + lemuces = list(set(getlem(lem)).intersection(uces)) for uce in lemuces : table_uce[uces[uce]][i] = 1 table_uce.insert(0, list_act) - return table_uce - + return table_uce + def make_pondtable_with_classe(self, uces, list_act) : table_uce = [[0 for val in list_act] for line in range(0,len(uces))] uces = dict([[uce, i] for i, uce in enumerate(uces)]) @@ -519,7 +730,7 @@ class Corpus : for uce in lemuces : table_uce[uces[uce]][i] = uceseff[uce] table_uce.insert(0, list_act) - return table_uce + return table_uce def parse_active(self, gramact, gramsup = None) : log.info('parse actives') @@ -540,7 +751,7 @@ class Corpus : if self.idformes is None : self.make_idformes() return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key] - + def make_actives_nb(self, nbmax, key) : log.info('make_actives_nb : %i - %i' % (nbmax,key)) if self.idformes is None : @@ -568,20 +779,30 @@ class Corpus : stop = nbmax - 1 lim = effs[stop] log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim)) - return [val[1] for val in allactives[0:stop + 1]], lim + return [val[1] for val in allactives[0:stop]], lim - def make_and_write_profile(self, actives, ucecl, fileout) : + def make_and_write_profile(self, actives, ucecl, fileout, uci = False) : log.info('formes/classes') - tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives] + if uci : + tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives] + else : + tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives] tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3] with open(fileout, 'w') as f : f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + f.write('\n') def make_etoiles(self) : etoiles = set([]) for uci in self.ucis : etoiles.update(uci.etoiles[1:]) return list(etoiles) + + def make_themes(self): + themes = set([]) + for uci in self.ucis : + themes.update(uci.paras) + return list(themes) def make_etoiles_dict(self) : etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]] @@ -604,6 +825,28 @@ class Corpus : except IndexError : det[et[0]] = 1 return det + + def make_theme_dict(self): + themes = [val for uci in self.ucis for val in uci.paras] + det = {} + for theme in themes : + th = theme.split('_') + if th[0] in det : + try : + endth = '_'.join(th[1:]) + if theme in det[th[0]] : + det[th[0]][theme] += 1 + else : + det[th[0]][theme] = 1 + except IndexError : + det[th[0]] += 1 + else : + try : + endth = '_'.join(th[1:]) + det[th[0]] = {theme:1} + except IndexError : + det[th[0]] = 1 + return det def make_etline(self, listet) : etuces = [[] for et in listet] @@ -615,41 +858,86 @@ class Corpus : etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] return etuces - def make_and_write_profile_et(self, ucecl, fileout) : + def make_and_write_profile_et(self, ucecl, fileout, uci = False) : log.info('etoiles/classes') - etoileuces = self.getetoileuces() - etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1]) + if not uci : + etoileuces = self.getetoileuces() + else : + etoileuces = self.getetoileucis() + print 'etoilesuces ok' + etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if + len(etoileuces[et]) > 1 ]) #and not et.startswith(u'*reference_') + print len(etoileuces) + print 'etoilesuces ok2' with open(fileout, 'w') as f : + print 'write...' f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding'])) #etoiles = self.make_etoiles() #with open(fileout, 'w') as f : # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) - def make_colored_corpus(self) : + def make_colored_corpus(self, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : - for uce in lc : + for uce in lc : ucecl[uce] = i + 1 for uce in self.lc0 : ucecl[uce] = 0 - color = ['black'] + colors[len(self.lc) - 1] + color = ['black'] + colors[len(self.lc) - 1] txt = ''' ''' % sys.getdefaultencoding() - res = self.getalluces() - self.make_iduces() - actuci = '' - actpara = False - for uce in res : - if self.iduces[uce[0]].uci != actuci : - actuci = self.iduces[uce[0]].uci - txt += '

' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '

' - txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' - else : - txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + if not uci : + res = self.getalluces() + self.make_iduces() + actuci = '' + actpara = False + for uce in res : + if self.iduces[uce[0]].uci != actuci : + actuci = self.iduces[uce[0]].uci + txt += '

' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '

' + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + res = self.getallucis() + actuci = '' + for uce in res : + if self.ucis[uce[0]].ident != actuci : + actuci = self.ucis[uce[0]].ident + txt += '

' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '

' + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' return txt + '\n' + def make_cut_corpus(self, uci = False) : + txt = u'' + if not uci : + res = self.getalluces() + self.make_iduces() + actuci = '' + actpara = False + for uce in res : + if self.iduces[uce[0]].uci != actuci : + actuci = self.iduces[uce[0]].uci + txt += u'\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + u'\n' + txt += ''.join([u'\n',uce[1],u'\n']) + else : + txt += ''.join([u'\n',uce[1],u'\n']) + else : + res = self.getallucis() + actuci = '' + for uce in res : + if self.ucis[uce[0]].ident != actuci : + actuci = self.ucis[uce[0]].ident + txt += u'\n' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + u'\n' + txt += ''.join([u'\n',uce[1],u'\n']) + else : + txt += ''.join([u'\n',uce[1],u'\n']) + return txt + def count_from_list(self, l, d) : for val in l : if val in d : @@ -679,9 +967,13 @@ class Corpus : l = l[-taille_limite:] return l - def find_segments_in_classe(self, list_uce, taille_segment, taille_limite): + def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False): d={} - for uce in self.getconcorde(list_uce) : + if not uci : + concorde = self.getconcorde + else : + concorde = self.getuciconcorde + for uce in concorde(list_uce) : uce = uce[1].split() d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d) l = [[d[val], val, taille_segment] for val in d if d[val] >= 3] @@ -690,7 +982,7 @@ class Corpus : if len(l) > taille_limite : l = l[-taille_limite:] return l - + def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) : d = {} for b, classe in enumerate(self.lc) : @@ -703,7 +995,7 @@ class Corpus : result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin] with open(fileout, 'w') as f : f.write('\n'.join([';'.join(line) for line in result])) - + def make_proftype(self, outf) : res = {} for lem in self.lems : @@ -734,7 +1026,7 @@ class Corpus : self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)] self.lc0 = self.lc.pop(0) #return ucecl - + def get_stat_by_cluster(self, outf, lclasses = None) : log.info('get_stat_by_cluster') if lclasses is None : @@ -754,7 +1046,7 @@ class Corpus : formescl[i+1] += 1 if self.formes[forme].freq == 1 : hapaxcl[i+1] += 1 - log.info('%f' % (time() - t1)) + log.info('%f' % (time() - t1)) if outf is not None : toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) with open(outf, 'w') as f : @@ -840,13 +1132,12 @@ class Corpus : - class MakeUciStat : def __init__(self, corpus) : ucinb = corpus.getucinb() ucisize = corpus.getucisize() ucimean = float(sum(ucisize))/float(ucinb) - detoile = corpus.make_etoiles_dict() + detoile = corpus.make_etoiles_dict() class Uci : def __init__(self, iduci, line, paraset = None) : @@ -897,12 +1188,12 @@ def decouperlist(chaine, longueur, longueurOptimale) : dsep = dict([[val[0],val[1]] for val in separateurs]) trouve = False # si on a trouvé un bon séparateur iDecoupe = 0 # indice du caractere ou il faut decouper - + longueur = min(longueur, len(chaine) - 1) chaineTravail = chaine[:longueur + 1] nbCar = longueur meilleur = ['', 0, 0] # type, poids et position du meilleur separateur - + try : indice = chaineTravail.index(u'$') trouve = True @@ -970,7 +1261,7 @@ class BuildCorpus : self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout']) self.corpus.pathout.createdir(parametres_corpus['pathout']) self.corpus.parametres['uuid'] = str(uuid4()) - self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1] + self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1] self.corpus.parametres['type'] = 'corpus' if self.corpus.parametres['keep_ponct'] : self.ponctuation_espace = [' ', ''] @@ -996,14 +1287,14 @@ class BuildCorpus : self.ucesize = self.corpus.parametres.get('ucesize', 240) log.info('method uce : %s' % method) - def dobuild(self) : + def dobuild(self) : t1 = time() try : self.read_corpus(self.infile) except Warning, args : log.info('pas kool %s' % args) raise Warning - else : + else : self.indexdb() self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira'] self.time = time() - t1 @@ -1040,7 +1331,7 @@ class BuildCorpus : self.cf.execute('CREATE INDEX ideff ON eff (id);') self.c.close() self.cf.close() - #backup corpora + #backup corpus self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db']) self.ccorpus = self.conn_corpus.cursor() self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);') @@ -1074,11 +1365,13 @@ class BuildCorpus : self.cleans.append(self.dotiret) def make_expression(self,txt) : - for expression in self.expressions: + exp = self.expressions.keys() + exp.sort(reverse=True) + for expression in exp : if expression in txt : txt = txt.replace(expression, self.expressions[expression][0]) - return txt - + return txt + def dolower(self, txt) : return txt.lower() @@ -1086,7 +1379,7 @@ class BuildCorpus : #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-" list_keep = u"[" + self.rule + "]+" return re.sub(list_keep, ' ', txt) - + def doapos(self, txt) : return txt.replace(u'\'', u' ') @@ -1110,7 +1403,7 @@ class BuildCorpus : toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces] self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce) self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff) - self.corpus.idformesuces = {} + self.corpus.idformesuces = {} self.count = 1 def backup_corpus(self) : @@ -1118,7 +1411,7 @@ class BuildCorpus : t = time() for uci in self.corpus.ucis : self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,))) - for uce in uci.uces : + for uce in uci.uces : self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,)) for forme in self.corpus.formes : self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,)) @@ -1138,6 +1431,109 @@ class BuildCorpus : pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc) +class BuildSubCorpus(BuildCorpus): + def __init__(self, corpus, parametres, dlg = None) : + log.info('begin subcorpus...') + self.dlg = dlg + self.ori = corpus + self.infile = None + self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']}) + self.last = 0 + self.parametres = parametres + self.encoding = corpus.parametres['encoding'] + self.corpus.parametres['corpus_name'] = parametres['corpus_name'] + self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout']) + self.corpus.pathout.createdir(parametres['pathout']) + self.corpus.parametres['pathout'] = parametres['pathout'] + self.corpus.parametres['meta'] = parametres.get('meta', False) + self.corpus.parametres['uuid'] = str(uuid4()) + if parametres.get('frommeta', False) : + print 'make subtexts' + self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] + elif parametres.get('fromtheme', False) : + print 'make subtexts from theme' + idpara = 0 + for uci in self.ori.ucis : + if uci.paras != [] : + newuce = [] + newpara = [] + for et in uci.paras : + if et in parametres['meta'] : + newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara] + newpara.append(et) + idpara += 1 + if newuce != [] : + nuci = CopyUci(uci) + nuci.uces = newuce + nuci.paras = newpara + self.corpus.ucis.append(nuci) + else : + idpara += 1 + elif parametres.get('fromclusters', False) : + self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]] + self.fromuceids() + elif parametres.get('fromuceids', False) : + self.fromuceids() + #create database + self.connect() + self.dobuild() + + def fromuceids(self): + print 'fromuceids' + dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep and uce.para == idpara] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + self.corpus.ucis.append(nuci) + + def read_corpus(self, infile = None): + self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces] + ident_uci = 0 + ident_uce = 0 + ident_para = -1 + lastpara = -1 + newuceident = {} + print 'redo text, para and st ident' + for uci in self.corpus.ucis : + uci.ident = ident_uci + ident_uci += 1 + for uce in uci.uces : + uce.uci = uci.ident + if uce.para != lastpara : + ident_para += 1 + lastpara = uce.para + uce.para = ident_para + else : + uce.para = ident_para + newuceident[uce.ident] = ident_uce + uce.ident = ident_uce + ident_uce += 1 + print 'backup st text and forms' + for row in self.ori.getconcorde(self.olduceid) : + self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1])) + for word in row[1].split() : + self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]]) + self.backup_uce() + print 'done' class BuildFromAlceste(BuildCorpus) : def read_corpus(self, infile) : @@ -1156,7 +1552,7 @@ class BuildFromAlceste(BuildCorpus) : try : with codecs.open(infile, 'r', self.encoding) as f : for linenb, line in enumerate(f) : - line = line.rstrip('\n\r') + line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8) if self.testuci(line) : iduci += 1 if txt != [] : @@ -1196,8 +1592,8 @@ class BuildFromAlceste(BuildCorpus) : raise Exception('EmptyText %i' % linenb) if iduci != -1 and iduce != -1: self.backup_uce() - else : - log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) + else : + log.info(_(u"No Text in corpus. Are you sure of the formatting ?")) raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1242,7 +1638,7 @@ class BuildFromAlceste(BuildCorpus) : out.append(uce) reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize) uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) - if uce != '' : + if uce != '' : out.append(uce) return out else : @@ -1257,34 +1653,266 @@ class Builder : def __init__(self, parent, dlg = None) : self.parent = parent self.dlg = dlg + parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() + parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] dial = CorpusPref(parent, parametres) dial.CenterOnParent() dial.txtpath.SetLabel(parent.filename) #dial.repout_choices.SetValue(parametres['pathout']) self.res = dial.ShowModal() + if self.dlg is not None : + self.dlg = progressbar(self.parent, self.dlg) if self.res == 5100 : parametres = dial.doparametres() parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) - ReadLexique(self.parent, lang = parametres['lang']) + if parametres.get('dictionary', False) : + filein = parametres['dictionary'] + else : + filein = None + if dial.corpusname.GetValue() != '' : + parametres['corpus_name'] = dial.corpusname.GetValue() + dial.Destroy() + ReadLexique(self.parent, lang = parametres['lang'], filein = filein) if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) else : self.parent.expressions = {} self.parametres = parametres else : + dial.Destroy() if self.dlg is not None : self.dlg.Destroy() - dial.Destroy() def doanalyse(self) : return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus +class SubBuilder : + def __init__(self, parent, corpus, parametres = None, dlg = None): + self.parent = parent + self.ori = corpus + self.dlg = dlg + corpus_name = 'Sub' + corpus.parametres['corpus_name'] + if dlg is not None : + busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self) + wx.SafeYield() + parametres['corpus_name'] = corpus_name + if parametres.get('frommeta', False) : + parametres['meta'] = corpus.make_etoiles() + elif parametres.get('fromtheme', False) : + parametres['meta'] = corpus.make_themes() + elif parametres.get('fromclusters', False) : + parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)] + else : + parametres['meta'] = [] + if 'fromclusters' not in parametres : + parametres['meta'].sort() + if dlg is not None : + del busy + dial = SubTextFromMetaDial(parent, parametres) + self.res = dial.ShowModal() + if self.res == 5100 : + if dial.subcorpusname.GetValue() != '' : + corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']]) + if corpus_name != '' : + parametres['corpus_name'] = corpus_name + else : + parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name'] + pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name']) + i = 1 + while os.path.exists(pathout + '_%i' % i) : + i += 1 + parametres['pathout'] = pathout + '_%i' % i + meta = dial.m_listBox1.GetSelections() + if not 'fromclusters' in parametres : + parametres['meta'] = [parametres['meta'][val] for val in meta] + else : + parametres['meta'] = meta + self.parametres = parametres + dial.Destroy() + else : + dial.Destroy() + + def doanalyse(self): + return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus + +class BuildMergeFromClusters(BuildCorpus): + def __init__(self, analyses, parametres, dlg = None) : + log.info('begin subcorpus...') + self.dlg = dlg + self.infile = None + self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : 'MergeFromClusters', 'encoding' : 'merge'}) + self.last = 0 + self.analyses = analyses + self.lcl = [] + self.parametres = parametres + #self.encoding = corpus.parametres['encoding'] + self.corpus.parametres['corpus_name'] = parametres['corpus_name'] + self.corpus.pathout = PathOut(filename = 'MFC', dirout = parametres['pathout']) + self.corpus.pathout.createdir(parametres['pathout']) + self.corpus.parametres['pathout'] = parametres['pathout'] + self.corpus.parametres['meta'] = parametres.get('meta', False) + self.corpus.parametres['uuid'] = str(uuid4()) + for i, analyse in enumerate(analyses) : + self.lcl.append([]) + self.analyseid = i + corpus_uuid = analyse['corpus'] + #if corpus_uuid not in self.parent.history.openedcorpus : + irapath = parametres['corpusira'][i] + corpus = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True) + ucepath = os.path.join(analyse['pathout'], 'uce.csv') + corpus.make_ucecl_from_R(ucepath) + self.ori = corpus + for j, cl in enumerate(parametres['clusters'][i]) : + #print cl, self.ori.lc[cl-1] + self.parametres['uceids'] = self.ori.lc[cl-1]#[st for st in self.ori['lc'][cl-1]] + self.lcl[i] += self.ori.lc[cl-1] + self.et = parametres['newet'][i][j] + self.fromuceids() + #create database + self.connect() + self.dobuild() + + def fromuceids(self): + print 'fromuceids' + dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + nuci.etoiles.append(self.et) + nuci.analyseid = self.analyseid + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + nuci.etoiles.append(self.et) + nuci.analyseid = self.analyseid + self.corpus.ucis.append(nuci) + #print nuci.etoiles, nuci.ident, nuci.uces + + def read_corpus(self, infile = None): + #self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces] + ident_uci = 0 + ident_uce = 0 + ident_para = -1 + lastpara = -1 + newuceident = {} + print 'redo text, para and st ident' + for uci in self.corpus.ucis : + #print uci.ident, ident_uci, [uce.ident for uce in uci.uces], uci.etoiles + uci.ident = ident_uci + ident_uci += 1 + for uce in uci.uces : + uce.uci = uci.ident + if uce.para != lastpara : + ident_para += 1 + lastpara = uce.para + uce.para = ident_para + else : + uce.para = ident_para + newuceident['%i-%i' %(uci.analyseid, uce.ident)] = ident_uce + uce.ident = ident_uce + #print uce.ident + ident_uce += 1 + print 'backup st text and forms' + rowid = 0 + for i, analyse in enumerate(self.analyses) : + #print analyse, self.parametres['corpusira'] + irapath = self.parametres['corpusira'][i] + old = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True) + for row in old.getconcorde(self.lcl[i]) : + self.c.execute('INSERT INTO uces VALUES(?,?);', (newuceident['%i-%i' % (i,row[0])], row[1])) + for word in row[1].split() : + self.corpus.add_word_from_forme(old.formes[word], newuceident['%i-%i' % (i,row[0])]) + rowid += 1 + self.backup_uce() + print 'done' + + +class MergeClusters : + def __init__(self, parent, parametres = None, dlg = None): + self.parent = parent + #self.ori = corpus + self.dlg = dlg + corpus_name = 'MergeFromClusters' + if dlg is not None : + busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self) + wx.SafeYield() + parametres['corpus_name'] = corpus_name + if dlg is not None : + del busy + dial = MergeClusterFrame(parent) + dial.m_textCtrl4.SetValue(corpus_name) + self.res = dial.ShowModal() + if self.res == 5100 : + self.analyses = {} + self.clusters = {} + self.newet = {} + self.corpusira = {} + if dial.m_textCtrl4.GetValue() != '' : + corpus_name = ''.join([l for l in dial.m_textCtrl4.GetValue() if l.isalnum() or l in ['_']]) + if corpus_name != '' : + parametres['corpus_name'] = corpus_name + else : + parametres['corpus_name'] = 'MergeFromClusters' + for cl in dial.selected : + corpus_uuid = cl[1] + #if corpus_uuid not in self.parent.history.openedcorpus : + irapath = self.parent.history.corpus[corpus_uuid]['ira'] + #corpus = Corpus(self.parent, parametres = DoConf(irapath).getoptions('corpus'), read = True) + #self.parent.history.openedcorpus[corpus_uuid] = corpus + if cl[0] not in self.analyses : + analyse = DoConf(dial.irapath[cl[0]]).getoptions() + #ucepath = os.path.join(os.path.dirname(dial.irapath[cl[0]]), 'uce.csv') + #corpus = copycorpus(self.parent.history.openedcorpus[corpus_uuid]) + #corpus.make_ucecl_from_R(ucepath) + self.analyses[cl[0]] = analyse + self.clusters[cl[0]] = [cl[2]] + self.newet[cl[0]] = [dial.selected[cl]] + self.corpusira[cl[0]] = irapath + else : + self.clusters[cl[0]].append(cl[2]) + self.newet[cl[0]].append(dial.selected[cl]) + + + analyses = [val for val in self.clusters] + clusters = [self.clusters[val] for val in analyses] + self.newet = [self.newet[val] for val in analyses] + corpusira = [self.corpusira[val] for val in analyses] + analyses = [self.analyses[val] for val in analyses] + pathout = os.path.dirname(os.path.dirname(analyses[0]['pathout'])) + self.analyses = analyses + + pathout = os.path.join(pathout, parametres['corpus_name']) + i = 1 + while os.path.exists(pathout + '_%i' % i) : + i += 1 + parametres['pathout'] = pathout + '_%i' % i + self.parametres = parametres + self.parametres['clusters'] = clusters + self.parametres['newet'] = self.newet + self.parametres['corpusira'] = corpusira + dial.Destroy() + else : + dial.Destroy() + + def doanalyse(self): + return BuildMergeFromClusters(self.analyses, parametres = self.parametres, dlg = self.dlg).corpus -if __name__ == '__main__' : - t1 = time() - parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding} - intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes) - print time() - t1