log = logging.getLogger('iramuteq.corpus')
-#expressions = ReadDicoAsDico('dictionnaires/expression_fr.txt')
-#lexique = ReadDicoAsDico('dictionnaires/lexique_fr.txt')
-#infile = '/home/pierre/workspace/iramuteq/corpus/lru2.txt'
-#infile = '/home/pierre/workspace/iramuteq/corpus/corpussab_cor.txt'
-#encoding = 'utf8'
-#infile = '/home/pierre/fac/identite/identite_sans_doublons_ok.txt'
-#encoding = 'cp1252'
-#infile = '/home/pierre/workspace/iramuteq/corpus/Natacha.txt'
-#infile = '/home/pierre/fac/cablegate/allcables-all.txt'
-#infile = '/home/pierre/fac/cablegate/allcables-08290338.txt'
-#tar_in = '/home/pierre/fac/identite/uce.tar.gz
-#tar_in = '/home/pierre/fac/cablegate/uce-cable-test.tar.gz'
-#tar_infouce = '/home/pierre/fac/identite/info_uce.tar.gz'
-#tar_infouce = '/home/pierre/fac/cablegate/info_uce.tar.gz'
-#tar_formes = '/home/pierre/fac/identite/tar_formes.tar.gz'
-#tar_formes = '/home/pierre/fac/cablegate/tar_formes.tar.gz'
-
def copycorpus(corpus) :
log.info('copy corpus')
self.idformesuces = {}
self.iduces = None
self.idformes = None
+ self.uceuci = None
if read :
self.pathout = PathOut(dirout = parametres['pathout'])
self.read_corpus()
# else :
# self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the etoiles of the uci that contains the uce `uceid`.

    The uce-ident -> uci-ident mapping is built once, on first use, from
    ``self.ucis`` and cached in ``self.uceuci``; later calls reuse it.

    :param uceid: ident of an uce (as found in ``uce.ident``).
    :returns: the ``etoiles`` attribute of the owning uci.
    :raises KeyError: if `uceid` is not the ident of any known uce.
    """
    if self.uceuci is None:
        # Lazy cache: one pass over every uce of every uci.
        self.uceuci = dict((uce.ident, uci.ident)
                           for uci in self.ucis
                           for uce in uci.uces)
    # NOTE(review): the uci ident is used as an index into self.ucis, so
    # this relies on uci.ident matching the position of the uci.
    return self.ucis[self.uceuci[uceid]].etoiles
+
def make_lems(self, lem = True) :
log.info('make lems')
self.lems = {}
for line in f :
ffin.write(line)
os.remove(outfile + '~')
+
def make_table_with_classe(self, uces, list_act):
    """Build a presence/absence table of lems (columns) by uces (rows).

    :param uces: sequence of uce ids; row ``i + 1`` of the result describes
        ``uces[i]``.
    :param list_act: list of lems; it is used as the header row (the very
        same list object is placed first in the result).
    :returns: ``[list_act, row_0, row_1, ...]`` where ``row_i[j]`` is 1 when
        ``uces[i]`` is among ``self.getlemuces(list_act[j])`` and 0 otherwise.
    """
    # Row index of every uce id; if an id is repeated the last position wins.
    row_of = dict((uce, row) for row, uce in enumerate(uces))
    width = len(list_act)
    rows = [[0] * width for _ in range(len(uces))]
    for col, lem in enumerate(list_act):
        for uce in self.getlemuces(lem):
            row = row_of.get(uce)
            if row is not None:
                rows[row][col] = 1
    # Header first, built in one go instead of an O(n) insert(0, ...).
    return [list_act] + rows
def parse_active(self, gramact, gramsup = None) :
log.info('parse actives')
def make_actives_limit(self, limit):
    """Return the lems that are active and reach the threshold `limit`.

    A lem is kept when ``self.getlemeff(lem) >= limit`` and its ``act``
    attribute equals 1.  ``self.make_idformes()`` is called first when
    ``self.idformes`` has not been built yet.

    :param limit: minimum value of ``getlemeff`` for a lem to be kept.
    :returns: list of the selected keys of ``self.lems``.
    """
    if self.idformes is None:
        self.make_idformes()
    # Only the replacement return is kept: leaving the superseded one in
    # front of it would make the `act == 1` filter unreachable.
    return [lem for lem in self.lems
            if self.getlemeff(lem) >= limit and self.lems[lem].act == 1]
def make_actives_nb(self, nbmax, key) :
log.info('make_actives_nb : %i - %i' % (nbmax,key))
etoiles.update(uci.etoiles[1:] + uci.paras)
return list(etoiles)
def make_etoiles_dict(self):
    """Count the etoiles of the corpus, grouped by variable.

    Every etoile of every uci except the first one of each uci
    (``uci.etoiles[1:]``) is split on ``'_'``: the first piece is the
    variable, the second piece (if any) the modality.

    :returns: a dict keyed by variable.  The value is
        * ``{modality: count}`` when the variable is used with modalities,
        * a plain ``int`` count when it only ever appears without one.
        When a variable appears both with and without a modality the value
        is a dict and the bare occurrences are counted under the
        empty-string key ``''`` (this case used to raise ``TypeError``).
    """
    det = {}
    for uci in self.ucis:
        for etoile in uci.etoiles[1:]:
            pieces = etoile.split('_')
            variable = pieces[0]
            # NOTE(review): only the piece right after the first '_' is
            # used, so '*v_a_b' and '*v_a_c' are both counted under 'a'.
            modality = pieces[1] if len(pieces) > 1 else None
            current = det.get(variable)
            if modality is None:
                if current is None:
                    det[variable] = 1
                elif isinstance(current, dict):
                    current[''] = current.get('', 0) + 1
                else:
                    det[variable] = current + 1
            else:
                if current is None:
                    current = det[variable] = {}
                elif not isinstance(current, dict):
                    # Variable previously seen without modality: promote
                    # the bare counter into the per-modality dict.
                    current = det[variable] = {'': current}
                current[modality] = current.get(modality, 0) + 1
    # The result is returned (it used to be printed and lost) so that
    # callers can actually use it.
    return det
+
+
def make_and_write_profile_et(self, ucecl, fileout) :
log.info('etoiles/classes')
etoiles = self.make_etoiles()
self.lc0 = self.lc.pop(0)
#return ucecl
def gethapaxbyet(self, etoiles):
    """Count hapax per etoile.

    A hapax is a key of ``self.lems`` whose ``freq`` is 1; it is attributed
    to the first uce returned by ``self.getlemuces``.  Each uci that carries
    one of the requested etoiles contributes all of its uces to that etoile.

    :param etoiles: list of etoiles to report on.
    :returns: a list aligned with `etoiles` giving, for each one, the number
        of hapax located in the uces of the ucis carrying it.  If a uci
        carries more than one of the requested etoiles, the string
        ``'2 variables sur la meme ligne'`` is returned instead (kept as is
        for existing callers).
    """
    # Number of hapax found in each uce.
    hapax_per_uce = {}
    for forme in self.lems:
        if self.lems[forme].freq == 1:
            uce = self.getlemuces(forme)[0]
            hapax_per_uce[uce] = hapax_per_uce.get(uce, 0) + 1

    # Position of each requested etoile (first occurrence, like list.index),
    # computed once instead of scanning the list for every uci.
    position = {}
    for index, etoile in enumerate(etoiles):
        position.setdefault(etoile, index)
    wanted = set(position)

    uces_by_etoile = [set() for _ in etoiles]
    for uci in self.ucis:
        found = wanted.intersection(uci.etoiles)
        if len(found) > 1:
            return '2 variables sur la meme ligne'
        if found:
            uces_by_etoile[position[found.pop()]].update(
                uce.ident for uce in uci.uces)

    # Only uces that hold at least one hapax contribute to the totals.
    return [sum(hapax_per_uce.get(uce, 0) for uce in uces)
            for uces in uces_by_etoile]
+
def gethapaxuces(self, fileout='/tmp/testhapxuce.html'):
    """Write an HTML report of the uces that contain the most hapax.

    A hapax is a key of ``self.lems`` whose ``freq`` is 1; it is attributed
    to the first uce returned by ``self.getlemuces``.  Uces are grouped by
    the number of hapax they hold and the four highest counts are reported.
    For every reported uce the text returned by ``self.getconcorde`` is
    written with each hapax form wrapped in a red ``<font>`` tag, preceded
    by the etoiles given by ``self.getetbyuceid``.

    :param fileout: path of the HTML file to write.  Defaults to the
        location that used to be hard-coded.
    :returns: None; the report is written to `fileout` as UTF-8.
    """
    # uce -> list of the hapax lems it contains.
    hapax_by_uce = {}
    for forme in self.lems:
        if self.lems[forme].freq == 1:
            uce = self.getlemuces(forme)[0]
            hapax_by_uce.setdefault(uce, []).append(forme)

    # hapax count -> uces holding exactly that many hapax.
    uces_by_count = {}
    for uce, hapax in hapax_by_uce.items():
        uces_by_count.setdefault(len(hapax), []).append(uce)
    # Counts are unique dict keys, so the tuples sort on the count alone.
    ranking = sorted(uces_by_count.items(), reverse=True)

    parts = ["\n<html><body>\n"]
    for count, uces in ranking[:4]:
        parts.append("<p><h2>%i hapax par uce</h2><p>\n" % count)
        for uce in uces:
            for row in self.getconcorde([uce]):
                uceid = row[0]
                # Padding lets forms at either end of the text match the
                # space-delimited search below.
                ucetxt = ' ' + row[1] + ' '
                for hap in hapax_by_uce[uce]:
                    # Surface form of the first forme attached to the lem.
                    laforme = self.getforme(list(self.lems[hap].formes)[0]).forme
                    ucetxt = ucetxt.replace(
                        ' ' + laforme + ' ',
                        ' <font color=red>' + laforme + '</font> ')
                # NOTE(review): the uce text is inserted without HTML
                # escaping; confirm the corpus text cannot contain markup.
                parts.append('<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>')
                parts.append('<p>' + ucetxt + '</p>\n')
    parts.append("\n</body></html>\n")

    txt = ''.join(parts)
    # Encode explicitly: writing non-ASCII unicode text through a plain
    # text-mode file fails on Python 2; byte strings are written unchanged.
    if not isinstance(txt, bytes):
        txt = txt.encode('utf-8')
    with open(fileout, 'wb') as f:
        f.write(txt)
+
+
class MakeUciStat:
    """Basic per-uci statistics of a corpus.

    The values are read from `corpus` at construction time and kept as
    attributes (they used to be computed and immediately discarded):

    * ``ucinb``   -- result of ``corpus.getucinb()``
    * ``ucisize`` -- result of ``corpus.getucisize()``
    * ``ucimean`` -- ``sum(ucisize) / ucinb`` as a float, 0.0 when
      ``ucinb`` is 0
    * ``detoile`` -- result of ``corpus.make_etoiles_dict()``
    """

    def __init__(self, corpus):
        self.ucinb = corpus.getucinb()
        self.ucisize = corpus.getucisize()
        # Guard against an empty corpus instead of raising ZeroDivisionError.
        if self.ucinb:
            self.ucimean = float(sum(self.ucisize)) / float(self.ucinb)
        else:
            self.ucimean = 0.0
        self.detoile = corpus.make_etoiles_dict()
+
+
class Uci :
def __init__(self, iduci, line, paraset = None) :
self.ident = iduci
try :
indice = chaineTravail.index(u'$')
trouve = True
- iDecoupe = indice
+ iDecoupe = indice - 1
except ValueError :
pass
if not trouve:
iDecoupe = nbCar
else :
if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
- meilleur[0] = caractere
+ meilleur[0] = ' '
meilleur[1] = dsep[' ']
meilleur[2] = nbCar
trouve = True
nbCar = nbCar - 1
# si on a trouvé
if trouve:
+ #if meilleur[0] != ' ' :
+ # fin = chaine[iDecoupe + 1:]
+ # retour = chaineTravail[:iDecoupe]
+ #else :
fin = chaine[iDecoupe + 1:]
- retour = chaineTravail[:iDecoupe]
+ retour = chaineTravail[:iDecoupe + 1]
return len(retour) > 0, retour, fin
# si on a rien trouvé
return False, chaine, ''
class BuildCorpus :
"""
- Class for building a corpora
+ Class for building a corpus
"""
def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
log.info('begin building corpus...')
self.backup_uce()
def treattxt(self, txt, iduce, idpara, iduci) :
- txt = ' '.join(txt)
- #log.debug('ATTENTION CHINOIS -> charactères')
- #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret]
- #log.debug('ATTENTION CHINOIS -> list(text)')
- #txt = ' '.join(list(txt))
- txt = self.make_cleans(txt)#, clean_chinois)
- ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
+ if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
+ txt = 'laphrasepoursplitter'.join(txt)
+ txt = self.make_cleans(txt)
+ txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
+ ucetxt = txt.split('laphrasepoursplitter')
+ else :
+ txt = ' '.join(txt)
+ txt = self.make_cleans(txt)
+ ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
if self.corpus.ucis[-1].paras == [] :
idpara += 1
for uce in ucetxt :