1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
8 from functions import decoupercharact, ReadDicoAsDico, DoConf
14 from operator import itemgetter
15 from uuid import uuid4
16 from chemins import PathOut
17 from dialog import CorpusPref
18 from functions import ReadLexique, ReadDicoAsDico
22 log = logging.getLogger('iramuteq.corpus')
24 #expressions = ReadDicoAsDico('dictionnaires/expression_fr.txt')
25 #lexique = ReadDicoAsDico('dictionnaires/lexique_fr.txt')
26 #infile = '/home/pierre/workspace/iramuteq/corpus/lru2.txt'
27 #infile = '/home/pierre/workspace/iramuteq/corpus/corpussab_cor.txt'
29 #infile = '/home/pierre/fac/identite/identite_sans_doublons_ok.txt'
31 #infile = '/home/pierre/workspace/iramuteq/corpus/Natacha.txt'
32 #infile = '/home/pierre/fac/cablegate/allcables-all.txt'
33 #infile = '/home/pierre/fac/cablegate/allcables-08290338.txt'
34 #tar_in = '/home/pierre/fac/identite/uce.tar.gz
35 #tar_in = '/home/pierre/fac/cablegate/uce-cable-test.tar.gz'
36 #tar_infouce = '/home/pierre/fac/identite/info_uce.tar.gz'
37 #tar_infouce = '/home/pierre/fac/cablegate/info_uce.tar.gz'
38 #tar_formes = '/home/pierre/fac/identite/tar_formes.tar.gz'
39 #tar_formes = '/home/pierre/fac/cablegate/tar_formes.tar.gz'
42 def copycorpus(corpus) :
43 log.info('copy corpus')
44 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
45 copy_corpus.ucis = corpus.ucis
46 copy_corpus.formes = corpus.formes
47 copy_corpus.pathout = corpus.pathout
48 copy_corpus.conn_all()
58 def __init__(self, parent, parametres = {}, read = False) :
60 self.parametres = parametres
62 self.connformes = None
64 self.conncorpus = None
71 self.idformesuces = {}
75 self.pathout = PathOut(dirout = parametres['pathout'])
78 def add_word(self, word) :
79 if word in self.formes :
80 self.formes[word].freq += 1
81 if self.formes[word].ident in self.idformesuces :
82 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
83 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
85 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
89 if word in self.parent.lexique :
90 gramtype = self.parent.lexique[word][1]
91 lem = self.parent.lexique[word][0]
98 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
99 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
102 """connect corpus to db"""
103 if self.connformes is None :
104 log.info('connexion corpus')
105 self.connuces = sqlite3.connect(self.pathout['uces.db'])
106 self.cuces = self.connuces.cursor()
107 self.connformes = sqlite3.connect(self.pathout['formes.db'])
108 self.cformes = self.connformes.cursor()
109 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
110 self.ccorpus = self.conncorpus.cursor()
111 self.cformes.execute('PRAGMA temp_store=MEMORY;')
112 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
113 self.cformes.execute('PRAGMA synchronous = OFF;')
114 self.cuces.execute('PRAGMA temp_store=MEMORY;')
115 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
116 self.cuces.execute('PRAGMA synchronous = OFF;')
117 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
118 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
119 self.ccorpus.execute('PRAGMA synchronous = OFF;')
121 def read_corpus(self) :
122 log.info('read corpus')
123 self.parametres['syscoding'] = sys.getdefaultencoding()
124 if self.conncorpus is None :
126 res = self.ccorpus.execute('SELECT * FROM etoiles;')
128 self.ucis.append(Uci(row[0], row[1], row[2]))
129 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
131 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
132 res = self.ccorpus.execute('SELECT * FROM formes;')
133 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded for one form in the formes database.

    wordid: the form's id, or the form itself as a string; a string is
        translated to its id through ``self.formes``.

    Each row of the ``uces`` table holds either a space-separated string of
    ids or a bare integer; both shapes are flattened into one list of ints,
    in the order the rows are returned.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # repr() is the portable spelling of the old backtick syntax, which
    # Python 3 removed; the bound value is the same string as before.
    res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in res:
        if isinstance(row[0], int):
            uces.append(row[0])
        else:
            uces.extend(int(val) for val in row[0].split())
    return uces
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for all the forms of a lemma.

    lem: key of ``self.lems``; the ids of its forms are the keys of
        ``self.lems[lem].formes``.

    The ids are interpolated into the SQL text (they come from the corpus
    itself, not from user input). Rows hold either a space-separated string
    of uce ids or a bare integer. The result is de-duplicated through a set,
    so its order is unspecified.
    """
    # repr() replaces the Python-2-only backtick syntax.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    flat = []
    for row in self.cformes.execute(query):
        if isinstance(row[0], int):
            flat.append(row[0])
        else:
            flat.extend(int(val) for val in row[0].split())
    return list(set(flat))
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces returned for a lemma.

    The uce ids come from ``getlemuces`` and each is resolved with
    ``getucefromid``; the order of the result is unspecified.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
152 def getlemuceseff(self, lem) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry of ``self.lems`` keyed by ``lem``."""
    lemme = self.lems[lem]
    return lemme.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` for ``formeid``.

    The id index is built on first use by calling ``make_idformes``.
    """
    if self.idformes is None:
        self.make_idformes()
    by_ident = self.idformes
    return by_ident[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` of every entry of ``self.formes``."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
182 return self.ucis[-1].uces[-1].ident + 1
185 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed sizes of its uces.

    The sizes come from ``getucesize`` and are sliced from the ident of the
    uci's first uce to the ident of its last uce (inclusive), so idents are
    used as positions in that list.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:last + 1]))
    return sizes
def getucesize(self):
    """Return the number of whitespace-separated tokens of each uce.

    Rows come from ``getalluces``; the text is read from column 1.
    """
    sizes = []
    for uce in self.getalluces():
        sizes.append(len(uce[1].split()))
    return sizes
195 # def getlemseff(self) :
196 # if self.idformes is None :
197 # self.make_idformes()
198 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
200 # def getlemsefftype(self) :
201 # if self.idformes is None :
202 # self.make_idformes()
203 # if self.lems is None :
205 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Return the cursor over the rows of the ``uces`` table for the given ids.

    uces: iterable of uce ids. They are interpolated into the SQL text, so
        they are expected to be integers produced by the corpus itself.
    """
    # repr() replaces the Python-2-only backtick syntax.
    ids = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % ids)
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uce ids of ``word``."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uce ids of the lemma ``lem``."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Return the cursor over every row of the ``uces`` table."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose ``etoiles`` contain ``etoile``.

    Idents are listed in corpus order (uci by uci, then uce by uce).
    """
    idents = []
    for uci in self.ucis:
        if etoile not in uci.etoiles:
            continue
        for uce in uci.uces:
            idents.append(uce.ident)
    return idents
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` for ``uceid``.

    The id index is built on first use by calling ``make_iduces``.
    """
    if self.iduces is None:
        self.make_iduces()
    by_ident = self.iduces
    return by_ident[uceid]
def gethapaxnb(self):
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    hapax = 0
    for word in self.formes.values():
        if word.freq == 1:
            hapax += 1
    return hapax
def getactivesnb(self, key):
    """Return how many entries of ``self.lems`` have ``act`` equal to ``key``."""
    count = 0
    for lemme in self.lems.values():
        if lemme.act == key:
            count += 1
    return count
231 # def make_lems(self, lem = True) :
232 # log.info('make lems')
234 # for forme in self.formes :
235 # if self.formes[forme].lem in self.lems :
236 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
237 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
239 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
241 def make_lems(self, lem = True) :
242 log.info('make lems')
245 for forme in self.formes :
246 if self.formes[forme].lem in self.lems :
247 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
248 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
250 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
252 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to its entry in ``self.formes``."""
    by_ident = {}
    for word in self.formes.values():
        by_ident[word.ident] = word
    self.idformes = by_ident
def make_iduces(self):
    """Build ``self.iduces`` (uce ident -> uce) unless it already exists."""
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
261 def make_lexitable(self, mineff, etoiles) :
262 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
263 etuces = [[] for et in etoiles]
264 for uci in self.ucis :
265 get = list(set(uci.etoiles).intersection(etoiles))
267 return '2 variables sur la meme ligne'
269 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
270 etuces = [set(val) for val in etuces]
273 deff = self.getlemuceseff(lem)
275 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
276 tab.insert(0, [''] + etoiles)
279 def make_efftype_from_etoiles(self, etoiles) :
281 etuces = [[] for et in etoiles]
282 for uci in self.ucis :
283 get = list(set(uci.etoiles).intersection(etoiles))
285 return '2 variables sur la meme ligne'
287 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
288 etuces = [set(val) for val in etuces]
289 for lem in self.lems :
290 deff = self.getlemuceseff(lem)
292 gram = self.lems[lem].gram
294 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
296 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
297 tabout = [[gram] + dtype[gram] for gram in dtype]
298 tabout.insert(0, [''] + etoiles)
301 def make_uceactsize(self, actives) :
302 res = self.getalluces()
305 deff = self.getlemuceseff(lem)
307 ucesize[uce] = ucesize.get(uce, 0) + 1
310 def make_uc(self, actives, lim1, lim2) :
311 uceactsize = self.make_uceactsize(actives)
317 for uce in [uce for uci in self.ucis for uce in uci.uces] :
318 if uce.para == lastpara :
320 last1 += uceactsize.get(uce.ident,0)
321 uc1[-1].append(uce.ident)
323 uc1.append([uce.ident])
326 last2 += uceactsize.get(uce.ident, 0)
327 uc2[-1].append(uce.ident)
329 uc2.append([uce.ident])
332 last1 = uceactsize.get(uce.ident, 0)
333 last2 = uceactsize.get(uce.ident, 0)
335 uc1.append([uce.ident])
336 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings and write their matrices and uce listings.

    actives: passed through to ``make_uc`` and ``write_ucmatrix``.
    sizeuc1, sizeuc2: the two limits handed to ``make_uc``.
    uc1out, uc2out: output paths given to ``write_ucmatrix``.
    listuce1out, listuce2out: paths of the ``uce;uc`` listings, one line per
        uce giving the index of the uc that contains it.

    Returns the number of ucs in each grouping as a pair.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both listings share one format, so they are produced by the same loop.
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        for i, ucl in enumerate(uc):
            for uce in ucl:
                # repr() replaces the Python-2-only backtick syntax.
                lines.append(';'.join([repr(uce), repr(i)]))
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
352 def write_ucmatrix(self, uc, actives, fileout) :
353 log.info('write uc matrix %s' % fileout)
354 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
357 with open(fileout + '~', 'w+') as f :
358 for i, lem in enumerate(actives) :
359 for uce in self.getlemuces(lem):
360 if (uces_uc[uce], i) not in deja_la :
362 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
363 deja_la[(uces_uc[uce], i)] = 0
365 with open(fileout, 'w') as ffin :
366 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
369 os.remove(fileout + '~')
372 def export_corpus(self, outf) :
373 #outf = 'export_corpus.txt'
375 res = self.getalluces()
379 with open(outf,'w') as f :
381 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
382 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
383 elif self.iduces[uce[0]].uci != actuci :
384 actuci = self.iduces[uce[0]].uci
385 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
386 actpara = self.iduces[uce[0]].para
387 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
390 actpara = self.iduces[uce[0]].para
391 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
392 elif self.iduces[uce[0]].para != actpara :
393 actpara = self.iduces[uce[0]].para
395 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
397 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
398 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
400 with open(outfile + '~', 'w+') as f :
401 for i, lem in enumerate(actives) :
402 for uce in sorted(self.getlemuces(lem)) :
404 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
406 with open(outfile, 'w') as ffin :
407 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
410 os.remove(outfile + '~')
412 with open(listuce, 'w') as f :
413 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
415 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
416 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
418 with open(outfile + '~', 'w+') as f :
419 for i, lem in enumerate(actives) :
420 for uci in sorted(self.getlemucis(lem)) :
422 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
424 with open(outfile, 'w') as ffin :
425 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
428 os.remove(outfile + '~')
430 with open(listuci, 'w') as f :
431 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
433 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
434 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
436 duces = dict([[uce, i] for i, uce in enumerate(uces)])
437 with open(outfile + '~', 'w+') as f :
438 for i, lem in enumerate(actives) :
439 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
441 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
443 with open(outfile, 'w') as ffin :
444 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
447 os.remove(outfile + '~')
449 def parse_active(self, gramact, gramsup = None) :
450 log.info('parse actives')
451 for lem in self.lems :
452 if self.lems[lem].gram in gramact :
453 self.lems[lem].act = 1
454 elif gramsup is not None :
455 if self.lems[lem].gram in gramsup :
456 self.lems[lem].act = 2
458 self.lems[lem].act = 0
460 self.lems[lem].act = 2
462 def make_actives_limit(self, limit) :
463 if self.idformes is None :
465 return [lem for lem in self.lems if self.getlemeff(lem) >= limit]
467 def make_actives_nb(self, nbmax, key) :
468 log.info('make_actives_nb : %i - %i' % (nbmax,key))
469 if self.idformes is None :
471 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
472 self.activenb = len(allactives)
473 allactives = sorted(allactives, reverse = True)
474 if len(allactives) <= nbmax :
475 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
476 return [val[1] for val in allactives], allactives[-1][0]
478 effs = [val[0] for val in allactives]
479 if effs.count(effs[nbmax - 1]) > 1 :
480 lim = effs[nbmax - 1] + 1
484 stop = effs.index(lim)
490 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
491 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class count table to ``fileout``.

    actives: lemmas to tabulate.
    ucecl: one collection of uce ids per class.
    fileout: path of the ``;``-separated output file.

    Each line holds a lemma followed by, for every class, the number of its
    uces that are in that class. Lemmas whose counts sum to less than 3 are
    left out. The text is encoded with ``self.parametres['syscoding']``.
    """
    log.info('formes/classes')
    tab = []
    for lem in actives:
        # One database query per lemma instead of one per (lemma, class).
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() replaces the Python-2-only backtick syntax.
            tab.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
500 def make_etoiles(self) :
502 for uci in self.ucis :
503 etoiles.update(uci.etoiles[1:] + uci.paras)
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class count table to ``fileout``.

    ucecl: one collection of uce ids per class.
    fileout: path of the ``;``-separated output file.

    Each line holds an etoile (from ``make_etoiles``) followed by, for every
    class, the number of its uces that are in that class. The text is
    encoded with ``self.parametres['syscoding']``.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    lines = []
    for etoile in etoiles:
        # Collect the uces of an etoile once instead of once per class.
        etuces = set(self.getucesfrometoile(etoile))
        # repr() replaces the Python-2-only backtick syntax.
        counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
        lines.append(';'.join([etoile] + counts))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
512 def count_from_list(self, l, d) :
520 def find_segments(self, taille_segment, taille_limite) :
522 for uce in self.getalluces() :
524 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
525 l = [[d[val], val] for val in d if d[val] >= 3]
528 if len(l) > taille_limite :
529 l = l[-taille_limite:]
532 def make_ucecl_from_R(self, filein) :
533 with open(filein, 'rU') as f :
538 line = line.replace('\n', '').replace('"', '').split(';')
539 self.lc.append([int(line[0]) - 1, int(line[1])])
540 classesl = [val[1] for val in self.lc]
542 self.lc = sorted(self.lc, key=itemgetter(1))
543 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
544 self.lc0 = self.lc.pop(0)
548 def __init__(self, iduci, line, paraset = None) :
550 self.etoiles = line.split()
552 if paraset is not None :
553 self.paras = paraset.split()
558 def __init__(self, iduce, idpara, iduci) :
564 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
570 if freq is not None :
576 def __init__(self, parent, forme) :
577 self.formes = {forme.ident : forme.freq}
578 self.gram = forme.gram
579 self.freq = forme.freq
def add_forme(self, forme):
    """Record ``forme`` under its ident and add its frequency to ``self.freq``."""
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
586 def decouperlist(chaine, longueur, longueurOptimale) :
588 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
589 Si on trouve un '$', c'est fini.
590 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
592 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
593 dsep = dict([[val[0],val[1]] for val in separateurs])
594 trouve = False # si on a trouvé un bon séparateur
595 iDecoupe = 0 # indice du caractere ou il faut decouper
597 longueur = min(longueur, len(chaine) - 1)
598 chaineTravail = chaine[:longueur + 1]
600 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
603 indice = chaineTravail.index(u'$')
610 caractere = chaineTravail[nbCar]
611 distance = abs(longueurOptimale - nbCar) + 1
612 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
613 if caractere in dsep :
614 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
615 meilleur[0] = caractere
616 meilleur[1] = dsep[caractere]
621 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
622 meilleur[0] = caractere
623 meilleur[1] = dsep[' ']
630 fin = chaine[iDecoupe + 1:]
631 retour = chaineTravail[:iDecoupe]
632 return len(retour) > 0, retour, fin
633 # si on a rien trouvé
634 return False, chaine, ''
636 def testetoile(line) :
637 return line.startswith(u'****')
640 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append a ``$`` token at the end."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
645 def prep_txtcharact(txt) :
650 Class for building a corpora
652 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
653 log.info('begin building corpus...')
654 self.lexique = lexique
655 self.expressions = expressions
657 self.corpus = Corpus(self, parametres_corpus)
660 self.lim = parametres_corpus.get('lim', 1000000)
661 self.encoding = parametres_corpus['encoding']
662 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
663 self.corpus.pathout.createdir(parametres_corpus['pathout'])
664 self.corpus.parametres['uuid'] = str(uuid4())
665 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
666 self.corpus.parametres['type'] = 'corpus'
667 if self.corpus.parametres['keep_ponct'] :
668 self.ponctuation_espace = [' ', '']
670 self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
672 self.tolist = self.corpus.parametres.get('tolist', 0)
679 def prep_makeuce(self) :
680 method = self.corpus.parametres.get('ucemethod', 0)
682 self.decouper = decouperlist
683 self.prep_txt = prep_txtlist
684 self.ucesize = self.corpus.parametres.get('ucesize', 40)
686 self.decouper = decoupercharact
687 self.prep_txt = prep_txtcharact
688 self.ucesize = self.corpus.parametres.get('ucesize', 240)
689 log.info('method uce : %s' % method)
693 self.read_corpus(self.infile)
695 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
696 self.time = time() - t1
698 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
699 log.info('time : %f' % (time() - t1))
702 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
703 self.cf = self.conn_f.cursor()
704 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
705 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
707 self.cf = self.conn_f.cursor()
708 self.cf.execute('PRAGMA temp_store=MEMORY;')
709 self.cf.execute('PRAGMA journal_mode=MEMORY;')
710 self.cf.execute('PRAGMA synchronous = OFF;')
711 self.cf.execute('begin')
712 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
713 self.c = self.conn.cursor()
714 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
716 self.c = self.conn.cursor()
717 self.c.execute('PRAGMA temp_store=MEMORY;')
718 self.c.execute('PRAGMA journal_mode=MEMORY;')
719 self.c.execute('PRAGMA synchronous = OFF;')
720 self.c.execute('begin')
723 #commit index and close db
726 self.cf.execute('CREATE INDEX iduces ON uces (id);')
727 self.cf.execute('CREATE INDEX ideff ON eff (id);')
731 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
732 self.ccorpus = self.conn_corpus.cursor()
733 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
734 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
735 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
736 self.conn_corpus.commit()
737 self.ccorpus = self.conn_corpus.cursor()
738 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
739 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
740 self.ccorpus.execute('PRAGMA synchronous = OFF;')
741 self.ccorpus.execute('begin')
743 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
744 self.conn_corpus.commit()
745 self.conn_corpus.close()
746 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
748 def buildcleans(self) :
749 if self.corpus.parametres.get('lower', 1) :
750 self.cleans.append(self.dolower)
751 if self.corpus.parametres.get('firstclean', 1) :
752 self.cleans.append(self.firstclean)
753 self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-")
754 self.cleans.append(self.docharact)
755 if self.corpus.parametres.get('expressions', 1) :
756 self.cleans.append(self.make_expression)
757 if self.corpus.parametres.get('apos', 1) :
758 self.cleans.append(self.doapos)
759 if self.corpus.parametres.get('tiret', 1):
760 self.cleans.append(self.dotiret)
762 def make_expression(self,txt) :
763 for expression in self.expressions:
764 if expression in txt :
765 txt = txt.replace(expression, self.expressions[expression][0])
768 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matching the class ``[self.rule]`` with one space.

    ``self.rule`` is the inside of a regex character class.
    """
    pattern = u"[%s]+" % self.rule
    return re.sub(pattern, ' ', txt)
def doapos(self, txt):
    """Return ``txt`` with every apostrophe turned into a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt):
    """Return ``txt`` with every hyphen turned into a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt):
    """Normalise two special characters and put spaces around punctuation.

    The substitutions run in the listed order; the three-dot ellipsis must be
    handled before the single dot so that it becomes one pound-sign token.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
787 def make_cleans(self, txt) :
788 for clean in self.cleans :
792 def backup_uce(self) :
793 if self.corpus.idformesuces != {} :
794 log.info('backup %i' % len(self.corpus.idformesuces))
795 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
796 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
797 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
798 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
799 self.corpus.idformesuces = {}
802 def backup_corpus(self) :
803 log.info('start backup corpus')
805 for uci in self.corpus.ucis :
806 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
807 for uce in uci.uces :
808 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
809 for forme in self.corpus.formes :
810 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
811 log.info('%f' % (time() - t))
814 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
815 minutes, seconds = divmod(self.time, 60)
816 hours, minutes = divmod(minutes, 60)
817 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
818 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
819 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
820 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
821 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
822 hapaxnb = self.corpus.gethapaxnb()
823 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
824 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
825 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
828 class BuildFromAlceste(BuildCorpus) :
829 #def __init___(self, infile, parametres_corpus) :
830 # BuildCorpus.__init__(self, infile, parametres_corpus)
833 def read_corpus(self, infile) :
836 if self.corpus.parametres['ucimark'] == 0 :
837 self.testuci = testetoile
838 elif self.corpus.parametres['ucimark'] == 1 :
839 self.testuci = testint
844 with codecs.open(infile, 'rU', self.encoding) as f :
846 if self.testuci(line) :
849 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
851 self.corpus.ucis.append(Uci(iduci, line))
853 self.corpus.ucis.append(Uci(iduci, line))
854 elif line.startswith(u'-*') :
856 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
859 self.corpus.ucis[-1].paras.append(line.split()[0])
860 elif line.strip() != '' and iduci != -1 :
863 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
867 def treattxt(self, txt, iduce, idpara, iduci) :
869 #log.debug('ATTENTION CHINOIS -> charactères')
870 #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret]
871 #log.debug('ATTENTION CHINOIS -> list(text)')
872 #txt = ' '.join(list(txt))
873 txt = self.make_cleans(txt)#, clean_chinois)
874 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
875 if self.corpus.ucis[-1].paras == [] :
879 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
880 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
887 self.corpus.add_word(word)
888 if self.dlg is not None :
889 if self.limitshow > self.count :
890 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
894 self.limitshow = self.last / 100000
895 log.debug(`iduci`, `idpara`, `iduce`)
896 if self.last > self.lim :
901 def make_uces(self, txt, douce = True, keep_ponct = False) :
902 txt = ' '.join(txt.split())
905 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
913 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
916 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
924 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
926 print 'RESTEE UUCEEEEEEEEEEEEE', uce
930 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
933 #make_uces (decouper)
934 #treat_txt (make_uces)
938 def __init__(self, parent, dlg = None) :
941 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
942 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
943 dial = CorpusPref(parent, parametres)
944 dial.CenterOnParent()
945 dial.txtpath.SetLabel(parent.filename)
946 #dial.repout_choices.SetValue(parametres['pathout'])
947 self.res = dial.ShowModal()
948 if self.res == 5100 :
949 parametres = dial.doparametres()
950 parametres['originalpath'] = parent.filename
951 PathOut().createdir(parametres['pathout'])
952 ReadLexique(self.parent, lang = parametres['lang'])
953 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
954 self.parametres = parametres
def doanalyse(self):
    """Build the corpus with ``BuildFromAlceste`` and return its ``corpus`` attribute.

    The builder receives the parent's file name, lexique and expressions,
    the stored parameters and the progress dialog.
    """
    parent = self.parent
    builder = BuildFromAlceste(
        parent.filename,
        self.parametres,
        parent.lexique,
        parent.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
961 if __name__ == '__main__' :
963 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
964 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)