X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;h=7354463e4f2e0293a5f4ef15a7793781d6d14242;hp=380b9a2c8fc1a129ee74fadd4dea4c47382d1d7d;hb=ab23968410d4e2eff482fd16a639801b457d5063;hpb=0bb1e9556fdbb07e171b663ffcea149692a8a49f diff --git a/corpusNG.py b/corpusNG.py index 380b9a2..7354463 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -3,6 +3,9 @@ import codecs import os +import gettext +_ = gettext.gettext +import locale import sys from time import time from functions import decoupercharact, ReadDicoAsDico, DoConf @@ -16,6 +19,7 @@ from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref from functions import ReadLexique, ReadDicoAsDico +from colors import colors import datetime @@ -457,10 +461,10 @@ class Corpus : else : self.lems[lem].act = 2 - def make_actives_limit(self, limit) : + def make_actives_limit(self, limit, key = 1) : if self.idformes is None : self.make_idformes() - return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == 1] + return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key] def make_actives_nb(self, nbmax, key) : log.info('make_actives_nb : %i - %i' % (nbmax,key)) @@ -508,19 +512,21 @@ class Corpus : et = etoile.split('_') if et[0] in det : try : - if et[1] in det[et[0]] : - det[et[0]][et[1]] += 1 + endet = '_'.join(et[1:]) + if endet in det[et[0]] : + det[et[0]][endet] += 1 else : - det[et[0]][et[1]] = 1 + det[et[0]][endet] = 1 except IndexError : det[et[0]] += 1 else : try : - det[et[0]] = {et[1] :1} + endet = '_'.join(et[1:]) + det[et[0]] = {endet :1} except IndexError : det[et[0]] = 1 print det - + return det def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') @@ -528,6 +534,31 @@ class Corpus : with open(fileout, 'w') as f : f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) + def make_colored_corpus(self) : + ucecl = {} + for i, lc in enumerate(self.lc) : + for uce in lc : + ucecl[uce] = i + 1 + for uce in self.lc0 : + ucecl[uce] = 0 + color = ['black'] + colors[len(self.lc) - 1] + txt = ''' + + +''' % sys.getdefaultencoding() + res = self.getalluces() + self.make_iduces() + actuci = '' + actpara = False + for uce in res : + if self.iduces[uce[0]].uci != actuci : + actuci = self.iduces[uce[0]].uci + txt += '

' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '

' + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + else : + txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '

' + return txt + '\n' + def count_from_list(self, l, d) : for val in l : if val in d : @@ -536,6 +567,15 @@ class Corpus : d[val] = 1 return d + def count_from_list_cl(self, l, d, a, clnb) : + for val in l : + if val in d : + d[val][a] += 1 + else : + d[val] = [0] * clnb + d[val][a] = 1 + return d + def find_segments(self, taille_segment, taille_limite) : d = {} for uce in self.getalluces() : @@ -547,6 +587,31 @@ class Corpus : if len(l) > taille_limite : l = l[-taille_limite:] return l + + def find_segments_in_classe(self, list_uce, taille_segment, taille_limite): + d={} + for uce in self.getconcorde(list_uce) : + uce = uce[1].split() + d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d) + l = [[d[val], val, taille_segment] for val in d if d[val] >= 3] + del(d) + l.sort() + if len(l) > taille_limite : + l = l[-taille_limite:] + return l + + def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) : + d = {} + for b, classe in enumerate(self.lc) : + for uce in self.getconcorde(classe) : + uce = uce[1].split() + if lem : + uce = [self.formes[forme].lem for forme in uce] + for taille_segment in range(lenmin,lenmax) : + d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc)) + result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin] + with open(fileout, 'w') as f : + f.write('\n'.join([';'.join(line) for line in result])) def make_ucecl_from_R(self, filein) : with open(filein, 'rU') as f : @@ -674,7 +739,7 @@ def decouperlist(chaine, longueur, longueurOptimale) : Si on trouve un '$', c'est fini. Sinon, on cherche le meilleur candidat. C'est-Ã -dire le rapport poids/distance le plus important. """ - separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'Â£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]] + separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'Â£$Â£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]] dsep = dict([[val[0],val[1]] for val in separateurs]) trouve = False # si on a trouvÃ© un bon sÃ©parateur iDecoupe = 0 # indice du caractere ou il faut decouper @@ -756,7 +821,7 @@ class BuildCorpus : if self.corpus.parametres['keep_ponct'] : self.ponctuation_espace = [' ', ''] else : - self.ponctuation_espace = [' ','.', u'Â£', ';', '?', '!', ',', ':',''] + self.ponctuation_espace = [' ','.', u'Â£$Â£', ';', '?', '!', ',', ':',''] self.cleans = [] self.tolist = self.corpus.parametres.get('tolist', 0) self.buildcleans() @@ -779,13 +844,18 @@ class BuildCorpus : def dobuild(self) : t1 = time() - self.read_corpus(self.infile) - self.indexdb() - self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira'] - self.time = time() - t1 - self.dofinish() - DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira']) - log.info('time : %f' % (time() - t1)) + try : + self.read_corpus(self.infile) + except Warning, args : + log.info('pas kool %s' % args) + raise Warning + else : + self.indexdb() + self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira'] + self.time = time() - t1 + self.dofinish() + DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira']) + log.info('time : %f' % (time() - t1)) def connect(self) : self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db']) @@ -839,8 +909,9 @@ class BuildCorpus : self.cleans.append(self.dolower) if self.corpus.parametres.get('firstclean', 1) : self.cleans.append(self.firstclean) - self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9Ã ÃÃ¢ÃÃ¤ÃÃ¡ÃÃ©ÃÃ¨ÃÃªÃÃ«ÃÃ¬ÃÃ®ÃÃ¯ÃÃ²ÃÃ´ÃÃ¶ÃÃ¹ÃÃ»ÃÃ¼ÃÃ§ÃÃÅÅâÃ±.:,;!?*'_-") - self.cleans.append(self.docharact) + if self.corpus.parametres['charact'] : + self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9Ã ÃÃ¢ÃÃ¤ÃÃ¡ÃÃ©ÃÃ¨ÃÃªÃÃ«ÃÃ¬ÃÃ®ÃÃ¯ÃÃ²ÃÃ´ÃÃ¶ÃÃ¹ÃÃ»ÃÃ¼ÃÃ§ÃÃÅÅâÃ±.:,;!?*'_") + self.cleans.append(self.docharact) if self.corpus.parametres.get('expressions', 1) : self.cleans.append(self.make_expression) if self.corpus.parametres.get('apos', 1) : @@ -871,7 +942,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'â',"'") txt = txt.replace(u'Å', u'oe') - return txt.replace('...',u' Â£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ') + return txt.replace('...',u' Â£$Â£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'â¦', ' Â£$Â£ ') def make_cleans(self, txt) : for clean in self.cleans : @@ -930,28 +1001,47 @@ class BuildFromAlceste(BuildCorpus) : iduci = -1 idpara = -1 iduce = -1 - with codecs.open(infile, 'rU', self.encoding) as f : - for line in f : - if self.testuci(line) : - iduci += 1 - if txt != [] : - iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1) - txt = [] - self.corpus.ucis.append(Uci(iduci, line)) - else : - self.corpus.ucis.append(Uci(iduci, line)) - elif line.startswith(u'-*') : - if txt != [] : - iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) - txt = [] - idpara += 1 - self.corpus.ucis[-1].paras.append(line.split()[0]) - elif line.strip() != '' and iduci != -1 : - txt.append(line) - if txt != [] : - iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) - del(txt) - self.backup_uce() + linenb = 0 + try : + with codecs.open(infile, 'r', self.encoding) as f : + for line in f : + linenb += 1 + line = line.rstrip('\n\r') + if self.testuci(line) : + iduci += 1 + if txt != [] : + iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1) + txt = [] + self.corpus.ucis.append(Uci(iduci, line)) + else : + if iduci > 0 : + if self.corpus.ucis[-1].uces == [] : + log.info('linenb : %i' % linenb) + raise Exception("EmptyText %i" % linenb) + self.corpus.ucis.append(Uci(iduci, line)) + elif line.startswith(u'-*') : + if iduci != -1 : + if txt != [] : + iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) + txt = [] + idpara += 1 + self.corpus.ucis[-1].paras.append(line.split()[0]) + else : + raise Exception('paragrapheOT') + elif line.strip() != '' and iduci != -1 : + txt.append(line) + if txt != [] and iduci != -1 : + iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) + del(txt) + else : + raise Exception("EmptyText") + if iduci != -1 and iduce != -1: + self.backup_uce() + else : + log.info(_(u"No Texte in corpora. Are you sure of the formatting ?")) + raise Exception('TextBeforeTextMark') + except UnicodeDecodeError : + raise Exception("CorpusEncoding") def treattxt(self, txt, iduce, idpara, iduci) : if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']: @@ -961,7 +1051,11 @@ class BuildFromAlceste(BuildCorpus) : ucetxt = txt.split('laphrasepoursplitter') else : txt = ' '.join(txt) + + print txt txt = self.make_cleans(txt) + + print txt ucetxt = self.make_uces(txt, self.corpus.parametres['douce']) if self.corpus.ucis[-1].paras == [] : idpara += 1 @@ -983,7 +1077,7 @@ class BuildFromAlceste(BuildCorpus) : self.limitshow = 0 else : self.limitshow = self.last / 100000 - log.debug(`iduci`, `idpara`, `iduce`) + log.debug(' '.join([`iduci`,`idpara`,`iduce`])) if self.last > self.lim : self.backup_uce() self.last = 0 @@ -1014,7 +1108,7 @@ class BuildFromAlceste(BuildCorpus) : uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : - print 'RESTEE UUCEEEEEEEEEEEEE', uce + #print 'RESTEE UUCEEEEEEEEEEEEE', uce out.append(uce) return out else :