import codecs
import os
+import gettext
+_ = gettext.gettext
+import locale
import sys
from time import time
from functions import decoupercharact, ReadDicoAsDico, DoConf
from chemins import PathOut
from dialog import CorpusPref
from functions import ReadLexique, ReadDicoAsDico
+from colors import colors
import datetime
log = logging.getLogger('iramuteq.corpus')
-#expressions = ReadDicoAsDico('dictionnaires/expression_fr.txt')
-#lexique = ReadDicoAsDico('dictionnaires/lexique_fr.txt')
-#infile = '/home/pierre/workspace/iramuteq/corpus/lru2.txt'
-#infile = '/home/pierre/workspace/iramuteq/corpus/corpussab_cor.txt'
-#encoding = 'utf8'
-#infile = '/home/pierre/fac/identite/identite_sans_doublons_ok.txt'
-#encoding = 'cp1252'
-#infile = '/home/pierre/workspace/iramuteq/corpus/Natacha.txt'
-#infile = '/home/pierre/fac/cablegate/allcables-all.txt'
-#infile = '/home/pierre/fac/cablegate/allcables-08290338.txt'
-#tar_in = '/home/pierre/fac/identite/uce.tar.gz
-#tar_in = '/home/pierre/fac/cablegate/uce-cable-test.tar.gz'
-#tar_infouce = '/home/pierre/fac/identite/info_uce.tar.gz'
-#tar_infouce = '/home/pierre/fac/cablegate/info_uce.tar.gz'
-#tar_formes = '/home/pierre/fac/identite/tar_formes.tar.gz'
-#tar_formes = '/home/pierre/fac/cablegate/tar_formes.tar.gz'
-
def copycorpus(corpus) :
log.info('copy corpus')
self.idformesuces = {}
self.iduces = None
self.idformes = None
+ self.uceuci = None
if read :
self.pathout = PathOut(dirout = parametres['pathout'])
self.read_corpus()
# else :
# self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
+ def getetbyuceid(self, uceid) :
+ # Return the `etoiles` of the uci that owns the uce whose ident is `uceid`.
+ # The uce ident -> uci ident mapping is built on first use and cached in self.uceuci.
+ if self.uceuci is None : self.uceuci = dict([[uce.ident,uci.ident] for uci in self.ucis for uce in uci.uces])
+ # self.ucis is indexed with the uci ident here.
+ # NOTE(review): this assumes a uci's ident equals its position in self.ucis - confirm.
+ return self.ucis[self.uceuci[uceid]].etoiles
+
def make_lems(self, lem = True) :
log.info('make lems')
self.lems = {}
for line in f :
ffin.write(line)
os.remove(outfile + '~')
+
+ def make_table_with_classe(self, uces, list_act) :
+ # Build a presence/absence table: one row per entry of `uces`, one column per
+ # entry of `list_act`; a cell is 1 when self.getlemuces(lem) contains that uce.
+ # The returned list has `list_act` itself prepended as a header row.
+ table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
+ # Rebind `uces` to a uce -> row index mapping for direct row lookup.
+ uces = dict([[uce, i] for i, uce in enumerate(uces)])
+ for i, lem in enumerate(list_act) :
+ # Keep only the uces of this lem that belong to the requested set.
+ lemuces = list(set(self.getlemuces(lem)).intersection(uces))
+ for uce in lemuces :
+ table_uce[uces[uce]][i] = 1
+ table_uce.insert(0, list_act)
+ return table_uce
def parse_active(self, gramact, gramsup = None) :
log.info('parse actives')
else :
self.lems[lem].act = 2
- def make_actives_limit(self, limit) :
+ def make_actives_limit(self, limit, key = 1) :
+ # Return the lems whose self.getlemeff(lem) is at least `limit` and whose
+ # `act` attribute equals `key` (default 1).
+ # self.idformes is built first if it has not been built yet.
if self.idformes is None :
self.make_idformes()
- return [lem for lem in self.lems if self.getlemeff(lem) >= limit]
+ return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
def make_actives_nb(self, nbmax, key) :
log.info('make_actives_nb : %i - %i' % (nbmax,key))
etoiles.update(uci.etoiles[1:] + uci.paras)
return list(etoiles)
+ def make_etoiles_dict(self) :
+ etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
+ det = {}
+ for etoile in etoiles :
+ et = etoile.split('_')
+ if et[0] in det :
+ try :
+ endet = '_'.join(et[1:])
+ if endet in det[et[0]] :
+ det[et[0]][endet] += 1
+ else :
+ det[et[0]][endet] = 1
+ except IndexError :
+ det[et[0]] += 1
+ else :
+ try :
+ endet = '_'.join(et[1:])
+ det[et[0]] = {endet :1}
+ except IndexError :
+ det[et[0]] = 1
+ return det
+
+ def make_etline(self, listet) :
+ # For each etoile of `listet`, collect the idents of the uces of every uci
+ # carrying that etoile. The result is a list of lists aligned with `listet`.
+ # If one uci carries more than one etoile of `listet`, the error message
+ # string below is returned instead of the list.
+ etuces = [[] for et in listet]
+ for uci in self.ucis :
+ get = list(set(uci.etoiles).intersection(listet))
+ if len(get) > 1 :
+ return '2 variables sur la meme ligne'
+ elif get != [] :
+ etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
+ return etuces
+
+
def make_and_write_profile_et(self, ucecl, fileout) :
log.info('etoiles/classes')
etoiles = self.make_etoiles()
with open(fileout, 'w') as f :
f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
+ def make_colored_corpus(self) :
+ ucecl = {}
+ for i, lc in enumerate(self.lc) :
+ for uce in lc :
+ ucecl[uce] = i + 1
+ for uce in self.lc0 :
+ ucecl[uce] = 0
+ color = ['black'] + colors[len(self.lc) - 1]
+ txt = '''<html>
+ <meta http-equiv="content-Type" content="text/html; charset=%s" />
+ <body>
+''' % sys.getdefaultencoding()
+ res = self.getalluces()
+ self.make_iduces()
+ actuci = ''
+ actpara = False
+ for uce in res :
+ if self.iduces[uce[0]].uci != actuci :
+ actuci = self.iduces[uce[0]].uci
+ txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
+ txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
+ else :
+ txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
+ return txt + '\n</body></html>'
+
def count_from_list(self, l, d) :
for val in l :
if val in d :
d[val] = 1
return d
+ def count_from_list_cl(self, l, d, a, clnb) :
+ # Per-class counter update: for each value of `l`, increment slot `a` of the
+ # list stored in d[value]. Unseen values get a fresh list of `clnb` zeros first.
+ # `d` is modified in place and also returned.
+ for val in l :
+ if val in d :
+ d[val][a] += 1
+ else :
+ d[val] = [0] * clnb
+ d[val][a] = 1
+ return d
+
def find_segments(self, taille_segment, taille_limite) :
d = {}
for uce in self.getalluces() :
if len(l) > taille_limite :
l = l[-taille_limite:]
return l
+
+ def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
+ # Count the word sequences of `taille_segment` words found in the text of the
+ # uces returned by self.getconcorde(list_uce).
+ # Returns at most `taille_limite` entries [count, segment, taille_segment],
+ # sorted in ascending order, keeping only segments counted at least 3 times.
+ d={}
+ for uce in self.getconcorde(list_uce) :
+ # uce[1] is split on whitespace to get the words of the uce.
+ uce = uce[1].split()
+ d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
+ l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
+ del(d)
+ l.sort()
+ # After the ascending sort the largest counts are at the end of the list.
+ if len(l) > taille_limite :
+ l = l[-taille_limite:]
+ return l
+
+ def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
+ # Write to `fileout` one ';'-separated line per segment: the segment followed
+ # by its count in each class of self.lc. Only segments whose total count over
+ # all classes is at least `effmin` are written.
+ # When `lem` is true, each word is replaced by self.formes[word].lem first.
+ d = {}
+ for b, classe in enumerate(self.lc) :
+ for uce in self.getconcorde(classe) :
+ uce = uce[1].split()
+ if lem :
+ uce = [self.formes[forme].lem for forme in uce]
+ # NOTE(review): range(lenmin, lenmax) stops at lenmax - 1, so segments of
+ # exactly `lenmax` words are never counted - confirm this is intended.
+ for taille_segment in range(lenmin,lenmax) :
+ d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
+ result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
+ with open(fileout, 'w') as f :
+ f.write('\n'.join([';'.join(line) for line in result]))
def make_ucecl_from_R(self, filein) :
with open(filein, 'rU') as f :
self.lc0 = self.lc.pop(0)
#return ucecl
+ def gethapaxbyet(self, etoiles) :
+ # Return, for each etoile of `etoiles`, the number of hapax (lems with
+ # freq == 1) located in the uces of the ucis carrying that etoile.
+ # If one uci carries more than one of the given etoiles, the error message
+ # string below is returned instead of the list of counts.
+ # First uce returned by getlemuces for every lem of frequency 1.
+ hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
+ # uce -> number of hapax it contains.
+ hucesdict = {}
+ for uce in hapaxuces :
+ if uce in hucesdict :
+ hucesdict[uce] += 1
+ else :
+ hucesdict[uce] = 1
+ etuces = [[] for et in etoiles]
+ for uci in self.ucis :
+ get = list(set(uci.etoiles).intersection(etoiles))
+ if len(get) > 1 :
+ return '2 variables sur la meme ligne'
+ elif get != [] :
+ etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
+ etuces = [set(val) for val in etuces]
+ return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
+
+ def gethapaxuces(self) :
+ hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
+ hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
+ hucesdict = {}
+ for i,uce in enumerate(hapaxuces) :
+ if uce in hucesdict :
+ hucesdict[uce][0] += 1
+ hucesdict[uce][1].append(hapax[i])
+ else :
+ hucesdict[uce] = [1,[hapax[i]]]
+ huces = {}
+ for uce in hucesdict :
+ if hucesdict[uce][0] in huces :
+ huces[hucesdict[uce][0]].append(uce)
+ else :
+ huces[hucesdict[uce][0]] = [uce]
+ huces = zip(huces, huces.values())
+ huces.sort(reverse=True)
+ txt = """
+ <html><body>
+ """
+ for nb in huces[0:4] :
+ txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
+ for uce in nb[1] :
+ res = self.getconcorde([uce])
+ for row in res :
+ ucetxt = ' ' + row[1] + ' '
+ uceid = row[0]
+ for hap in hucesdict[uce][1] :
+ laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
+ ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
+ txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
+ txt += '<p>'+ucetxt+'</p>\n'
+ txt += """
+ </body></html>
+ """
+ with open('/tmp/testhapxuce.html','w') as f :
+ f.write(txt)
+
+
+class MakeUciStat :
+ # Computes basic statistics about the ucis of `corpus`: their number, their
+ # sizes, the mean size and the etoiles dictionary.
+ # NOTE(review): every result is held in a local variable and nothing is stored
+ # on the instance or returned, so the values are lost when __init__ ends.
+ # NOTE(review): the mean raises ZeroDivisionError if getucinb() returns 0.
+ def __init__(self, corpus) :
+ ucinb = corpus.getucinb()
+ ucisize = corpus.getucisize()
+ ucimean = float(sum(ucisize))/float(ucinb)
+ detoile = corpus.make_etoiles_dict()
+
+
class Uci :
def __init__(self, iduci, line, paraset = None) :
self.ident = iduci
Si on trouve un '$', c'est fini.
Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
"""
- separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
+ separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
dsep = dict([[val[0],val[1]] for val in separateurs])
trouve = False # si on a trouvé un bon séparateur
iDecoupe = 0 # indice du caractere ou il faut decouper
try :
indice = chaineTravail.index(u'$')
trouve = True
- iDecoupe = indice
+ iDecoupe = indice - 1
except ValueError :
pass
if not trouve:
iDecoupe = nbCar
else :
if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
- meilleur[0] = caractere
+ meilleur[0] = ' '
meilleur[1] = dsep[' ']
meilleur[2] = nbCar
trouve = True
nbCar = nbCar - 1
# si on a trouvé
if trouve:
+ #if meilleur[0] != ' ' :
+ # fin = chaine[iDecoupe + 1:]
+ # retour = chaineTravail[:iDecoupe]
+ #else :
fin = chaine[iDecoupe + 1:]
- retour = chaineTravail[:iDecoupe]
+ retour = chaineTravail[:iDecoupe + 1]
return len(retour) > 0, retour, fin
# si on a rien trouvé
return False, chaine, ''
class BuildCorpus :
"""
- Class for building a corpora
+ Class for building a corpus
"""
def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
log.info('begin building corpus...')
if self.corpus.parametres['keep_ponct'] :
self.ponctuation_espace = [' ', '']
else :
- self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
+ self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
self.cleans = []
self.tolist = self.corpus.parametres.get('tolist', 0)
self.buildcleans()
def dobuild(self) :
t1 = time()
- self.read_corpus(self.infile)
- self.indexdb()
- self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
- self.time = time() - t1
- self.dofinish()
- DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
- log.info('time : %f' % (time() - t1))
+ try :
+ self.read_corpus(self.infile)
+ except Warning, args :
+ log.info('pas kool %s' % args)
+ raise Warning
+ else :
+ self.indexdb()
+ self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
+ self.time = time() - t1
+ self.dofinish()
+ DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
+ log.info('time : %f' % (time() - t1))
def connect(self) :
self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
self.cleans.append(self.dolower)
if self.corpus.parametres.get('firstclean', 1) :
self.cleans.append(self.firstclean)
- self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-")
- self.cleans.append(self.docharact)
+ if self.corpus.parametres['charact'] :
+ self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
+ self.cleans.append(self.docharact)
if self.corpus.parametres.get('expressions', 1) :
self.cleans.append(self.make_expression)
if self.corpus.parametres.get('apos', 1) :
def firstclean(self, txt) :
+ # Normalise `txt`: the typographic apostrophe becomes "'", the 'oe' ligature
+ # becomes 'oe', and punctuation marks are surrounded by spaces.
+ # Ellipses ('...' and the single ellipsis character) become the ' £$£ ' marker;
+ # '...' is replaced before '.' so it is not split into three dots.
txt = txt.replace(u'’',"'")
txt = txt.replace(u'œ', u'oe')
- return txt.replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ')
+ return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ')
def make_cleans(self, txt) :
for clean in self.cleans :
iduci = -1
idpara = -1
iduce = -1
- with codecs.open(infile, 'rU', self.encoding) as f :
- for line in f :
- if self.testuci(line) :
- iduci += 1
- if txt != [] :
- iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
- txt = []
- self.corpus.ucis.append(Uci(iduci, line))
- else :
- self.corpus.ucis.append(Uci(iduci, line))
- elif line.startswith(u'-*') :
- if txt != [] :
- iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
- txt = []
- idpara += 1
- self.corpus.ucis[-1].paras.append(line.split()[0])
- elif line.strip() != '' and iduci != -1 :
- txt.append(line)
- if txt != [] :
- iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
- del(txt)
- self.backup_uce()
+ linenb = 0
+ try :
+ with codecs.open(infile, 'r', self.encoding) as f :
+ for line in f :
+ linenb += 1
+ line = line.rstrip('\n\r')
+ if self.testuci(line) :
+ iduci += 1
+ if txt != [] :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
+ txt = []
+ self.corpus.ucis.append(Uci(iduci, line))
+ else :
+ if iduci > 0 :
+ if self.corpus.ucis[-1].uces == [] :
+ log.info('linenb : %i' % linenb)
+ raise Exception("EmptyText %i" % linenb)
+ self.corpus.ucis.append(Uci(iduci, line))
+ elif line.startswith(u'-*') :
+ if iduci != -1 :
+ if txt != [] :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
+ txt = []
+ idpara += 1
+ self.corpus.ucis[-1].paras.append(line.split()[0])
+ else :
+ raise Exception('paragrapheOT')
+ elif line.strip() != '' and iduci != -1 :
+ txt.append(line)
+ if txt != [] and iduci != -1 :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
+ del(txt)
+ else :
+ raise Exception("EmptyText")
+ if iduci != -1 and iduce != -1:
+ self.backup_uce()
+ else :
+ log.info(_(u"No Texte in corpora. Are you sure of the formatting ?"))
+ raise Exception('TextBeforeTextMark')
+ except UnicodeDecodeError :
+ raise Exception("CorpusEncoding")
def treattxt(self, txt, iduce, idpara, iduci) :
- txt = ' '.join(txt)
- #log.debug('ATTENTION CHINOIS -> charactères')
- #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret]
- #log.debug('ATTENTION CHINOIS -> list(text)')
- #txt = ' '.join(list(txt))
- txt = self.make_cleans(txt)#, clean_chinois)
- ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
+ if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
+ txt = 'laphrasepoursplitter'.join(txt)
+ txt = self.make_cleans(txt)
+ txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
+ ucetxt = txt.split('laphrasepoursplitter')
+ else :
+ txt = ' '.join(txt)
+ txt = self.make_cleans(txt)
+ ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
if self.corpus.ucis[-1].paras == [] :
idpara += 1
for uce in ucetxt :
self.limitshow = 0
else :
self.limitshow = self.last / 100000
- log.debug(`iduci`, `idpara`, `iduce`)
+ log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
if self.last > self.lim :
self.backup_uce()
self.last = 0
uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
if uce != '' :
- print 'RESTEE UUCEEEEEEEEEEEEE', uce
+ #print 'RESTEE UUCEEEEEEEEEEEEE', uce
out.append(uce)
return out
else :