X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;h=fa13a8b916b3762c806c9096fe0ff21722c76337;hp=eb55b0857be095bbfca46a7ebf13f6fd029851b6;hb=1fb687c23b19ae4cc88146acf393041356c1df3a;hpb=b0333175cc68917ceb33589b0b354bf931fec245 diff --git a/corpusNG.py b/corpusNG.py index eb55b08..fa13a8b 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -3,6 +3,9 @@ import codecs import os +import gettext +_ = gettext.gettext +import locale import sys from time import time from functions import decoupercharact, ReadDicoAsDico, DoConf @@ -16,6 +19,7 @@ from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref from functions import ReadLexique, ReadDicoAsDico +from colors import colors import datetime @@ -123,6 +127,19 @@ class Corpus : res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,)) return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + def getformeuceseff(self, formeid) : + if isinstance(formeid, basestring) : + formeid = self.formes[formeid].ident + res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,)) + uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid + res = self.cformes.execute(query) + eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + formeuceeff = {} + for i, uce in enumerate(uces) : + formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i] + return formeuceeff + def getlemuces(self, lem) : formesid = ', '.join([`val` for val in self.lems[lem].formes]) query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid @@ -133,7 +150,7 @@ class Corpus : uces = self.getlemuces(lem) return list(set([self.getucefromid(val).uci for val in uces])) - def getlemuceseff(self, lem) : + def getlemuceseff(self, lem, luces = None) : formesid = ', '.join([`val` for val in self.lems[lem].formes]) query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) @@ -146,6 +163,9 @@ class Corpus : lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i] return lemuceeff + def getlemclustereff(self, lem, cluster) : + return len(list(set(self.lc[cluster]).intersection(self.getlemuces(lem)))) + def getlemeff(self, lem) : return self.lems[lem].freq @@ -247,7 +267,7 @@ class Corpus : self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) def make_lexitable(self, mineff, etoiles) : - tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff] + tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff] etuces = [[] for et in etoiles] for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) @@ -381,6 +401,28 @@ class Corpus : actpara = self.iduces[uce[0]].para ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') + + def export_corpus_classes(self, outf, alc = True, lem = False) : + ucecl = {} + for i, lc in enumerate(self.lc) : + for uce in lc : + ucecl[uce] = i + 1 + for uce in self.lc0 : + ucecl[uce] = 0 + res = self.getalluces() + self.make_iduces() + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + actuci = self.iduces[uce[0]].uci + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + if alc : + etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) + else : + etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]]) + f.write(etline.encode(self.parametres['syscoding']) + '\n') + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) @@ -433,11 +475,23 @@ class Corpus : for line in f : ffin.write(line) os.remove(outfile + '~') + + def make_table_with_classe(self, uces, list_act) : + table_uce = [[0 for val in list_act] for line in range(0,len(uces))] + uces = dict([[uce, i] for i, uce in enumerate(uces)]) + for i, lem in enumerate(list_act) : + lemuces = list(set(self.getlemuces(lem)).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = 1 + table_uce.insert(0, list_act) + return table_uce def parse_active(self, gramact, gramsup = None) : log.info('parse actives') for lem in self.lems : - if self.lems[lem].gram in gramact : + if lem.startswith('_') and lem.endswith('_') : + self.lems[lem].act = 2 + elif self.lems[lem].gram in gramact : self.lems[lem].act = 1 elif gramsup is not None : if self.lems[lem].gram in gramsup : @@ -447,10 +501,10 @@ class Corpus : else : self.lems[lem].act = 2 - def make_actives_limit(self, limit) : + def make_actives_limit(self, limit, key = 1) : if self.idformes is None : self.make_idformes() - return [lem for lem in self.lems if self.getlemeff(lem) >= limit] + return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key] def make_actives_nb(self, nbmax, key) : log.info('make_actives_nb : %i - %i' % (nbmax,key)) @@ -475,7 +529,8 @@ class Corpus : lim -= 1 else : stop = nbmax - 1 - log.info('nb actives = %i - eff min = %i ' % (stop, lim)) + lim = effs[stop] + log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim)) return [val[1] for val in allactives[0:stop + 1]], lim def make_and_write_profile(self, actives, ucecl, fileout) : @@ -491,12 +546,70 @@ class Corpus : etoiles.update(uci.etoiles[1:] + uci.paras) return list(etoiles) + def make_etoiles_dict(self) : + etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]] + det = {} + for etoile in etoiles : + et = etoile.split('_') + if et[0] in det : + try : + endet = '_'.join(et[1:]) + if endet in det[et[0]] : + det[et[0]][endet] += 1 + else : + det[et[0]][endet] = 1 + except IndexError : + det[et[0]] += 1 + else : + try : + endet = '_'.join(et[1:]) + det[et[0]] = {endet :1} + except IndexError : + det[et[0]] = 1 + return det + + def make_etline(self, listet) : + etuces = [[] for et in listet] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(listet)) + if len(get) > 1 : + return '2 variables sur la meme ligne' + elif get != [] : + etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] + return etuces + + def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') etoiles = self.make_etoiles() with open(fileout, 'w') as f : f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) + def make_colored_corpus(self) : + ucecl = {} + for i, lc in enumerate(self.lc) : + for uce in lc : + ucecl[uce] = i + 1 + for uce in self.lc0 : + ucecl[uce] = 0 + color = ['black'] + colors[len(self.lc) - 1] + txt = ''' + + +''' % sys.getdefaultencoding() + res = self.getalluces() + self.make_iduces() + actuci = '' + actpara = False + for uce in res : + if self.iduces[uce[0]].uci != actuci : + actuci = self.iduces[uce[0]].uci + txt += '

' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '

' + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + return txt + '\n' + def count_from_list(self, l, d) : for val in l : if val in d : @@ -505,6 +618,15 @@ class Corpus : d[val] = 1 return d + def count_from_list_cl(self, l, d, a, clnb) : + for val in l : + if val in d : + d[val][a] += 1 + else : + d[val] = [0] * clnb + d[val][a] = 1 + return d + def find_segments(self, taille_segment, taille_limite) : d = {} for uce in self.getalluces() : @@ -516,7 +638,48 @@ class Corpus : if len(l) > taille_limite : l = l[-taille_limite:] return l - + + def find_segments_in_classe(self, list_uce, taille_segment, taille_limite): + d={} + for uce in self.getconcorde(list_uce) : + uce = uce[1].split() + d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d) + l = [[d[val], val, taille_segment] for val in d if d[val] >= 3] + del(d) + l.sort() + if len(l) > taille_limite : + l = l[-taille_limite:] + return l + + def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) : + d = {} + for b, classe in enumerate(self.lc) : + for uce in self.getconcorde(classe) : + uce = uce[1].split() + if lem : + uce = [self.formes[forme].lem for forme in uce] + for taille_segment in range(lenmin,lenmax) : + d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc)) + result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin] + with open(fileout, 'w') as f : + f.write('\n'.join([';'.join(line) for line in result])) + + def make_proftype(self, outf) : + res = {} + for lem in self.lems : + gram = self.lems[lem].gram + if not gram in res : + res[gram] = [0 for val in self.lc] + lemuceeff = self.getlemuceseff(lem) + for i, classe in enumerate(self.lc) : + concern = set(classe).intersection(lemuceeff.keys()) + res[gram][i] += sum([lemuceeff[uce] for uce in concern]) + res = [[gram] + [`val` for val in res[gram]] for gram in res] + res.sort() + with open(outf, 'w') as f : + f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding'])) + + def make_ucecl_from_R(self, filein) : with open(filein, 'rU') as f : c = f.readlines() @@ -531,6 +694,28 @@ class Corpus : self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)] self.lc0 = self.lc.pop(0) #return ucecl + + def get_stat_by_cluster(self, outf) : + log.info('get_stat_by_cluster') + t1 = time() + occurrences = dict([[i + 1, 0] for i in range(len(self.lc))]) + formescl = dict([[i + 1, 0] for i in range(len(self.lc))]) + hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))]) + lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)]) + sets = [set(cl) for cl in self.lc] + for forme in self.formes : + formeuceeff = self.getformeuceseff(forme) + for i, classe in enumerate(self.lc) : + concern = sets[i].intersection(formeuceeff.keys()) + if len(concern) : + occurrences[i+1] += sum([formeuceeff[uce] for uce in concern]) + formescl[i+1] += 1 + if self.formes[forme].freq == 1 : + hapaxcl[i+1] += 1 + toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) + with open(outf, 'w') as f : + f.write(toprint) + log.info('%f' % (time() - t1)) def gethapaxbyet(self, etoiles) : hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] @@ -590,6 +775,14 @@ class Corpus : f.write(txt) +class MakeUciStat : + def __init__(self, corpus) : + ucinb = corpus.getucinb() + ucisize = corpus.getucisize() + ucimean = float(sum(ucisize))/float(ucinb) + detoile = corpus.make_etoiles_dict() + + class Uci : def __init__(self, iduci, line, paraset = None) : self.ident = iduci @@ -635,7 +828,7 @@ def decouperlist(chaine, longueur, longueurOptimale) : Si on trouve un '$', c'est fini. Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important. """ - separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]] + separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]] dsep = dict([[val[0],val[1]] for val in separateurs]) trouve = False # si on a trouvé un bon séparateur iDecoupe = 0 # indice du caractere ou il faut decouper @@ -648,7 +841,7 @@ def decouperlist(chaine, longueur, longueurOptimale) : try : indice = chaineTravail.index(u'$') trouve = True - iDecoupe = indice + iDecoupe = indice - 1 except ValueError : pass if not trouve: @@ -665,7 +858,7 @@ def decouperlist(chaine, longueur, longueurOptimale) : iDecoupe = nbCar else : if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) : - meilleur[0] = caractere + meilleur[0] = ' ' meilleur[1] = dsep[' '] meilleur[2] = nbCar trouve = True @@ -673,8 +866,12 @@ def decouperlist(chaine, longueur, longueurOptimale) : nbCar = nbCar - 1 # si on a trouvé if trouve: + #if meilleur[0] != ' ' : + # fin = chaine[iDecoupe + 1:] + # retour = chaineTravail[:iDecoupe] + #else : fin = chaine[iDecoupe + 1:] - retour = chaineTravail[:iDecoupe] + retour = chaineTravail[:iDecoupe + 1] return len(retour) > 0, retour, fin # si on a rien trouvé return False, chaine, '' @@ -713,7 +910,7 @@ class BuildCorpus : if self.corpus.parametres['keep_ponct'] : self.ponctuation_espace = [' ', ''] else : - self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':',''] + self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':',''] self.cleans = [] self.tolist = self.corpus.parametres.get('tolist', 0) self.buildcleans() @@ -736,13 +933,18 @@ class BuildCorpus : def dobuild(self) : t1 = time() - self.read_corpus(self.infile) - self.indexdb() - self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira'] - self.time = time() - t1 - self.dofinish() - DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira']) - log.info('time : %f' % (time() - t1)) + try : + self.read_corpus(self.infile) + except Warning, args : + log.info('pas kool %s' % args) + raise Warning + else : + self.indexdb() + self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira'] + self.time = time() - t1 + self.dofinish() + DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira']) + log.info('time : %f' % (time() - t1)) def connect(self) : self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db']) @@ -796,8 +998,9 @@ class BuildCorpus : self.cleans.append(self.dolower) if self.corpus.parametres.get('firstclean', 1) : self.cleans.append(self.firstclean) - self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-") - self.cleans.append(self.docharact) + if self.corpus.parametres['charact'] : + self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_") + self.cleans.append(self.docharact) if self.corpus.parametres.get('expressions', 1) : self.cleans.append(self.make_expression) if self.corpus.parametres.get('apos', 1) : @@ -828,7 +1031,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'’',"'") txt = txt.replace(u'œ', u'oe') - return txt.replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ') + return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ') def make_cleans(self, txt) : for clean in self.cleans : @@ -872,11 +1075,9 @@ class BuildCorpus : class BuildFromAlceste(BuildCorpus) : - #def __init___(self, infile, parametres_corpus) : - # BuildCorpus.__init__(self, infile, parametres_corpus) - - def read_corpus(self, infile) : + if self.dlg is not None : + self.dlg.Pulse('textes : 0 - segments : 0') self.limitshow = 0 self.count = 1 if self.corpus.parametres['ucimark'] == 0 : @@ -887,37 +1088,61 @@ class BuildFromAlceste(BuildCorpus) : iduci = -1 idpara = -1 iduce = -1 - with codecs.open(infile, 'rU', self.encoding) as f : - for line in f : - if self.testuci(line) : - iduci += 1 - if txt != [] : - iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1) - txt = [] - self.corpus.ucis.append(Uci(iduci, line)) - else : - self.corpus.ucis.append(Uci(iduci, line)) - elif line.startswith(u'-*') : - if txt != [] : - iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) - txt = [] - idpara += 1 - self.corpus.ucis[-1].paras.append(line.split()[0]) - elif line.strip() != '' and iduci != -1 : - txt.append(line) - if txt != [] : - iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) - del(txt) - self.backup_uce() + try : + with codecs.open(infile, 'r', self.encoding) as f : + for linenb, line in enumerate(f) : + line = line.rstrip('\n\r') + if self.testuci(line) : + iduci += 1 + if txt != [] : + iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1) + txt = [] + self.corpus.ucis.append(Uci(iduci, line)) + else : + if iduci > 0 : + if self.corpus.ucis[-1].uces == [] : + log.info(u'Empty text : %i' % linenb) + iduci -= 1 + self.corpus.ucis.pop() + #raise Exception("EmptyText %i" % linenb) + self.corpus.ucis.append(Uci(iduci, line)) + if self.dlg is not None : + if not (iduci + 1) % 10 : + self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1)) + elif line.startswith(u'-*') : + if iduci != -1 : + if txt != [] : + iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) + txt = [] + idpara += 1 + self.corpus.ucis[-1].paras.append(line.split()[0]) + else : + raise Exception('paragrapheOT') + elif line.strip() != '' and iduci != -1 : + txt.append(line) + if txt != [] and iduci != -1 : + iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) + del(txt) + else : + raise Exception("EmptyText") + if iduci != -1 and iduce != -1: + self.backup_uce() + else : + log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) + raise Exception('TextBeforeTextMark') + except UnicodeDecodeError : + raise Exception("CorpusEncoding") def treattxt(self, txt, iduce, idpara, iduci) : - txt = ' '.join(txt) - #log.debug('ATTENTION CHINOIS -> charactères') - #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret] - #log.debug('ATTENTION CHINOIS -> list(text)') - #txt = ' '.join(list(txt)) - txt = self.make_cleans(txt)#, clean_chinois) - ucetxt = self.make_uces(txt, self.corpus.parametres['douce']) + if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']: + txt = 'laphrasepoursplitter'.join(txt) + txt = self.make_cleans(txt) + txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace]) + ucetxt = txt.split('laphrasepoursplitter') + else : + txt = ' '.join(txt) + txt = self.make_cleans(txt) + ucetxt = self.make_uces(txt, self.corpus.parametres['douce']) if self.corpus.ucis[-1].paras == [] : idpara += 1 for uce in ucetxt : @@ -931,14 +1156,7 @@ class BuildFromAlceste(BuildCorpus) : for word in uce : self.last += 1 self.corpus.add_word(word) - if self.dlg is not None : - if self.limitshow > self.count : - self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1)) - self.count += 1 - self.limitshow = 0 - else : - self.limitshow = self.last / 100000 - log.debug(`iduci`, `idpara`, `iduce`) + log.debug(' '.join([`iduci`,`idpara`,`iduce`])) if self.last > self.lim : self.backup_uce() self.last = 0 @@ -949,27 +1167,13 @@ class BuildFromAlceste(BuildCorpus) : if douce : out = [] reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize) -# print 'reste' -# print reste -# print 'texte_uce' -# print texte_uce -# print 'suite' -# print suite while reste : uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : out.append(uce) reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize) -# print 'reste' -# print reste -# print 'texte_uce' -# print texte_uce -# print 'suite' -# print suite - uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : - print 'RESTEE UUCEEEEEEEEEEEEE', uce out.append(uce) return out else : @@ -998,6 +1202,9 @@ class Builder : ReadLexique(self.parent, lang = parametres['lang']) self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) self.parametres = parametres + else : + if self.dlg is not None : + self.dlg.Destroy() dial.Destroy() def doanalyse(self) :