X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;h=380b9a2c8fc1a129ee74fadd4dea4c47382d1d7d;hp=6dc48806e7fd8a43fa527507c7823ed6ba6bbf75;hb=a5fa23767c01368804b2fbb1e2915bc332c6f932;hpb=8fa853a25a9d62b1446e1bc543e5a3a4d0e03dcf diff --git a/corpusNG.py b/corpusNG.py index 6dc4880..380b9a2 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -21,23 +21,6 @@ import datetime log = logging.getLogger('iramuteq.corpus') -#expressions = ReadDicoAsDico('dictionnaires/expression_fr.txt') -#lexique = ReadDicoAsDico('dictionnaires/lexique_fr.txt') -#infile = '/home/pierre/workspace/iramuteq/corpus/lru2.txt' -#infile = '/home/pierre/workspace/iramuteq/corpus/corpussab_cor.txt' -#encoding = 'utf8' -#infile = '/home/pierre/fac/identite/identite_sans_doublons_ok.txt' -#encoding = 'cp1252' -#infile = '/home/pierre/workspace/iramuteq/corpus/Natacha.txt' -#infile = '/home/pierre/fac/cablegate/allcables-all.txt' -#infile = '/home/pierre/fac/cablegate/allcables-08290338.txt' -#tar_in = '/home/pierre/fac/identite/uce.tar.gz -#tar_in = '/home/pierre/fac/cablegate/uce-cable-test.tar.gz' -#tar_infouce = '/home/pierre/fac/identite/info_uce.tar.gz' -#tar_infouce = '/home/pierre/fac/cablegate/info_uce.tar.gz' -#tar_formes = '/home/pierre/fac/identite/tar_formes.tar.gz' -#tar_formes = '/home/pierre/fac/cablegate/tar_formes.tar.gz' - def copycorpus(corpus) : log.info('copy corpus') @@ -71,6 +54,7 @@ class Corpus : self.idformesuces = {} self.iduces = None self.idformes = None + self.uceuci = None if read : self.pathout = PathOut(dirout = parametres['pathout']) self.read_corpus() @@ -238,6 +222,10 @@ class Corpus : # else : # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0} + def getetbyuceid(self, uceid) : + if self.uceuci is None : self.uceuci = dict([[uce.ident,uci.ident] for uci in self.ucis for uce in uci.uces]) + return self.ucis[self.uceuci[uceid]].etoiles + def make_lems(self, lem = True) : log.info('make lems') self.lems = {} @@ -445,6 +433,16 @@ class Corpus : for line in f : ffin.write(line) os.remove(outfile + '~') + + def make_table_with_classe(self, uces, list_act) : + table_uce = [[0 for val in list_act] for line in range(0,len(uces))] + uces = dict([[uce, i] for i, uce in enumerate(uces)]) + for i, lem in enumerate(list_act) : + lemuces = list(set(self.getlemuces(lem)).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = 1 + table_uce.insert(0, list_act) + return table_uce def parse_active(self, gramact, gramsup = None) : log.info('parse actives') @@ -462,7 +460,7 @@ class Corpus : def make_actives_limit(self, limit) : if self.idformes is None : self.make_idformes() - return [lem for lem in self.lems if self.getlemeff(lem) >= limit] + return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == 1] def make_actives_nb(self, nbmax, key) : log.info('make_actives_nb : %i - %i' % (nbmax,key)) @@ -503,6 +501,27 @@ class Corpus : etoiles.update(uci.etoiles[1:] + uci.paras) return list(etoiles) + def make_etoiles_dict(self) : + etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]] + det = {} + for etoile in etoiles : + et = etoile.split('_') + if et[0] in det : + try : + if et[1] in det[et[0]] : + det[et[0]][et[1]] += 1 + else : + det[et[0]][et[1]] = 1 + except IndexError : + det[et[0]] += 1 + else : + try : + det[et[0]] = {et[1] :1} + except IndexError : + det[et[0]] = 1 + print det + + def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') etoiles = self.make_etoiles() @@ -544,6 +563,72 @@ class Corpus : self.lc0 = self.lc.pop(0) #return ucecl + def gethapaxbyet(self, etoiles) : + hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] + hucesdict = {} + for uce in hapaxuces : + if uce in hucesdict : + hucesdict[uce] += 1 + else : + hucesdict[uce] = 1 + etuces = [[] for et in etoiles] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(etoiles)) + if len(get) > 1 : + return '2 variables sur la meme ligne' + elif get != [] : + etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] + etuces = [set(val) for val in etuces] + return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces] + + def gethapaxuces(self) : + hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] + hapax = [forme for forme in self.lems if self.lems[forme].freq == 1] + hucesdict = {} + for i,uce in enumerate(hapaxuces) : + if uce in hucesdict : + hucesdict[uce][0] += 1 + hucesdict[uce][1].append(hapax[i]) + else : + hucesdict[uce] = [1,[hapax[i]]] + huces = {} + for uce in hucesdict : + if hucesdict[uce][0] in huces : + huces[hucesdict[uce][0]].append(uce) + else : + huces[hucesdict[uce][0]] = [uce] + huces = zip(huces, huces.values()) + huces.sort(reverse=True) + txt = """ + + """ + for nb in huces[0:4] : + txt += "

%i hapax par uce

\n" % nb[0] + for uce in nb[1] : + res = self.getconcorde([uce]) + for row in res : + ucetxt = ' ' + row[1] + ' ' + uceid = row[0] + for hap in hucesdict[uce][1] : + laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme + ucetxt = ucetxt.replace(' '+laforme+' ', ' '+laforme+' ') + txt += '

' + ' '.join(self.getetbyuceid(uceid)) + '

' + txt += '

'+ucetxt+'

\n' + txt += """ + + """ + with open('/tmp/testhapxuce.html','w') as f : + f.write(txt) + + +class MakeUciStat : + def __init__(self, corpus) : + ucinb = corpus.getucinb() + ucisize = corpus.getucisize() + ucimean = float(sum(ucisize))/float(ucinb) + detoile = corpus.make_etoiles_dict() + + class Uci : def __init__(self, iduci, line, paraset = None) : self.ident = iduci @@ -602,7 +687,7 @@ def decouperlist(chaine, longueur, longueurOptimale) : try : indice = chaineTravail.index(u'$') trouve = True - iDecoupe = indice + iDecoupe = indice - 1 except ValueError : pass if not trouve: @@ -619,7 +704,7 @@ def decouperlist(chaine, longueur, longueurOptimale) : iDecoupe = nbCar else : if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) : - meilleur[0] = caractere + meilleur[0] = ' ' meilleur[1] = dsep[' '] meilleur[2] = nbCar trouve = True @@ -627,8 +712,12 @@ def decouperlist(chaine, longueur, longueurOptimale) : nbCar = nbCar - 1 # si on a trouvé if trouve: + #if meilleur[0] != ' ' : + # fin = chaine[iDecoupe + 1:] + # retour = chaineTravail[:iDecoupe] + #else : fin = chaine[iDecoupe + 1:] - retour = chaineTravail[:iDecoupe] + retour = chaineTravail[:iDecoupe + 1] return len(retour) > 0, retour, fin # si on a rien trouvé return False, chaine, '' @@ -647,7 +736,7 @@ def prep_txtcharact(txt) : class BuildCorpus : """ - Class for building a corpora + Class for building a corpus """ def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) : log.info('begin building corpus...') @@ -865,13 +954,15 @@ class BuildFromAlceste(BuildCorpus) : self.backup_uce() def treattxt(self, txt, iduce, idpara, iduci) : - txt = ' '.join(txt) - #log.debug('ATTENTION CHINOIS -> charactères') - #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret] - #log.debug('ATTENTION CHINOIS -> list(text)') - #txt = ' '.join(list(txt)) - txt = self.make_cleans(txt)#, clean_chinois) - ucetxt = self.make_uces(txt, self.corpus.parametres['douce']) + if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']: + txt = 'laphrasepoursplitter'.join(txt) + txt = self.make_cleans(txt) + txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace]) + ucetxt = txt.split('laphrasepoursplitter') + else : + txt = ' '.join(txt) + txt = self.make_cleans(txt) + ucetxt = self.make_uces(txt, self.corpus.parametres['douce']) if self.corpus.ucis[-1].paras == [] : idpara += 1 for uce in ucetxt :