1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
# Build a second Corpus that shares the data of `corpus`.
# The new object is created from corpus.parent and corpus.parametres; its
# ucis, formes and pathout attributes are bound to the very same objects as
# the source corpus (nothing is duplicated), then conn_all() is called on it.
# NOTE(review): this listing shows no return statement - check the full file.
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
# Constructor (only part of its body appears in this listing).
# Visible effects: stores `parametres`, sets the connformes and conncorpus
# handles to None, starts idformesuces as an empty dict and builds
# self.pathout from parametres['pathout'].
# NOTE(review): `parametres = {}` is a mutable default shared between calls,
# and parametres['pathout'] raises KeyError when that default is used.
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
# Record one occurrence of `word` in the last uce of the last uci
# (self.ucis[-1].uces[-1]).
# Known form: its freq is incremented and the counter
# idformesuces[form id][uce id] is incremented or created with value 1.
# New form: gramtype and lemma are read from self.parent.lexique when the
# word is listed there, then a Word is created with len(self.formes) as its
# identifier and its idformesuces entry is initialised.
# NOTE(review): the `else` branches are not part of this listing.
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
# Rebuild the in-memory corpus from the corpus database.
# Rows of table 'etoiles' become Uci objects, rows of table 'luces' for the
# current uci become its Uce objects, and table 'formes' is loaded into
# self.formes (form -> Word). sys.getdefaultencoding() is stored under
# parametres['syscoding'].
# NOTE(review): the loop headers and the body of the
# `if self.conncorpus is None` test are missing from this listing.
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the ids of the uces recorded for one form.

    wordid -- the form itself (a string, resolved through self.formes) or
    its identifier.
    A cell of the 'uces' column is either a string of space-separated ids or
    a single int; both shapes are flattened into one list of ints.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    # repr() is the portable spelling of the Python-2-only backtick syntax;
    # the value bound to the query is unchanged.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            uces.append(cell)
        else :
            uces.extend([int(val) for val in cell.split()])
    return uces
# Per-uce frequency of one form.
# `formeid` may be the form string (resolved through self.formes) or its id.
# The 'uces' and 'eff' rows stored for that id are flattened into two
# parallel lists, and eff[i] is accumulated under uces[i] in formeuceeff.
# NOTE(review): the creation and the return of formeuceeff are not part of
# this listing.
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem) :
    """Return the distinct ids of the uces recorded for the forms of a lemma.

    lem -- key of self.lems; the ids of its forms select the rows of table
    'uces' read through self.cformes.
    The order of the returned list is unspecified (it comes from a set).
    """
    # repr() is the portable spelling of the Python-2-only backtick syntax
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    found = []
    for row in self.cformes.execute(query) :
        cell = row[0]
        if isinstance(cell, int) :
            found.append(cell)
        else :
            # a text cell holds space-separated uce ids
            found.extend([int(val) for val in cell.split()])
    return list(set(found))
def getlemucis(self, lem) :
    """Return the distinct `uci` values of the uces containing the lemma.

    The order of the returned list is unspecified (it comes from a set).
    """
    ucis = [self.getucefromid(uceid).uci for uceid in self.getlemuces(lem)]
    return list(set(ucis))
# Per-uce frequency of a lemma, all of its forms taken together.
# The ids of the lemma's forms select the 'uces' and 'eff' rows; both are
# flattened into parallel lists and eff[i] is accumulated under uces[i] in
# lemuceeff.
# NOTE(review): the creation and return of lemuceeff are not part of this
# listing, and no use of `luces` appears in the visible lines.
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster) :
    """Return how many uces of self.lc[cluster] contain the lemma."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem) :
    """Return the `freq` attribute of the entry stored for `lem` in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the entry of self.idformes for `formeid`.

    The id -> form index is built through make_idformes() when it is still
    None.
    """
    if self.idformes is None :
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self) :
    """Return the sum of the `freq` of every entry of self.formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return, for each uci, the summed sizes of its uces.

    Sizes come from getucesize() and are sliced with the idents of the first
    and last uce of each uci.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:last + 1]))
    return sizes
def getucesize(self) :
    """Return the number of whitespace-separated tokens of each uce text.

    Rows come from getalluces(); the text is the second column of a row.
    """
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces) :
    """Select the rows of table 'uces' whose id is listed in `uces`.

    uces -- iterable of uce ids; their repr is inlined in the SQL text.
    Returns the result of self.cuces.execute(), to be iterated by the caller.
    """
    # repr() is the portable spelling of the Python-2-only backtick syntax
    idlist = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word) :
    """Return getconcorde() applied to the uces found by getworduces(word)."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem) :
    """Return getconcorde() applied to the uces found by getlemuces(lem)."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self) :
    """Run 'SELECT * FROM uces' on self.cuces and return the result."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile) :
    """Return the idents of the uces of every uci whose etoiles contain `etoile`."""
    idents = []
    for uci in self.ucis :
        for uce in uci.uces :
            if etoile in uci.etoiles :
                idents.append(uce.ident)
    return idents
# Map etoiles to uce ids.
# Visible logic: for each uci, the etoiles after the first one
# (uci.etoiles[1:]) receive the idents of all the uces of that uci, and the
# entries of uci.paras receive the idents of the uces whose `para` equals
# idpara.
# NOTE(review): the creation of etoileuces, the inner loop headers, the
# handling of idpara and the return are absent from this listing.
213 def getetoileuces(self) :
214 log.info('get uces etoiles')
217 for uci in self.ucis :
218 etoiles = uci.etoiles[1:]
220 if et in etoileuces :
221 etoileuces[et] += [uce.ident for uce in uci.uces]
223 etoileuces[et] = [uce.ident for uce in uci.uces]
225 for et in uci.paras :
226 if et in etoileuces :
227 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
229 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid) :
    """Return the entry of self.iduces for `uceid`.

    The id -> uce index is built through make_iduces() when it is still None.
    """
    if self.iduces is None :
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self) :
    """Return the number of entries of self.formes whose freq equals 1."""
    count = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            count += 1
    return count
def getactivesnb(self, key) :
    """Return the number of entries of self.lems whose `act` equals `key`."""
    count = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            count += 1
    return count
244 # def make_lems(self, lem = True) :
245 # log.info('make lems')
247 # for forme in self.formes :
248 # if self.formes[forme].lem in self.lems :
249 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
250 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
252 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the etoiles of the uci that owns the uce `uceid`.

    The uce ident -> uci ident map (self.uceuci) is built on first use; the
    uci ident is then used as an index into self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
# Build self.lems from self.formes.
# Visible logic: forms sharing the same `.lem` are grouped - the first one
# creates a Lem, later ones are attached with add_forme() when their ident is
# not yet registered; the last statement instead creates one Lem per form,
# keyed by the form itself.
# NOTE(review): the test choosing between the two strategies (presumably on
# `lem`), the reset of self.lems and the `else` lines are not in this listing.
258 def make_lems(self, lem = True) :
259 log.info('make lems')
262 for forme in self.formes :
263 if self.formes[forme].lem in self.lems :
264 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
265 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
267 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
269 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes, mapping each form's ident to its entry in self.formes."""
    index = {}
    for forme in self.formes :
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self) :
    """Build self.iduces (uce ident -> uce) unless it already exists."""
    if self.iduces is not None :
        return
    index = {}
    for uci in self.ucis :
        for uce in uci.uces :
            index[uce.ident] = uce
    self.iduces = index
# Build a table of lemma frequencies per etoile.
# Lemmas with freq >= mineff are kept; each uci adds the idents of its uces
# to the column of the etoile it shares with `etoiles`; each table line is
# the lemma followed by the summed frequencies (from getlemuceseff) over the
# uces of each column. A header row [''] + etoiles is inserted first.
# The string '2 variables sur la meme ligne' is returned in a case whose
# condition is not shown here (presumably a uci matching several etoiles -
# confirm).
# NOTE(review): several lines, including the final return, are missing from
# this listing.
278 def make_lexitable(self, mineff, etoiles) :
279 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
280 etuces = [[] for et in etoiles]
281 for uci in self.ucis :
282 get = list(set(uci.etoiles).intersection(etoiles))
284 return '2 variables sur la meme ligne'
286 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
287 etuces = [set(val) for val in etuces]
290 deff = self.getlemuceseff(lem)
292 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
293 tab.insert(0, [''] + etoiles)
# Build a table of frequencies per grammatical type and per etoile.
# Uce idents are grouped by the etoile each uci shares with `etoiles`; for
# every lemma the per-uce frequencies (getlemuceseff) are summed per group
# and accumulated in dtype under the lemma's `gram`. The output table has one
# row per gram and a header row [''] + etoiles.
# The string '2 variables sur la meme ligne' is returned in a case whose
# condition is not shown here.
# NOTE(review): several lines, including the final return, are missing from
# this listing.
296 def make_efftype_from_etoiles(self, etoiles) :
298 etuces = [[] for et in etoiles]
299 for uci in self.ucis :
300 get = list(set(uci.etoiles).intersection(etoiles))
302 return '2 variables sur la meme ligne'
304 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
305 etuces = [set(val) for val in etuces]
306 for lem in self.lems :
307 deff = self.getlemuceseff(lem)
309 gram = self.lems[lem].gram
311 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
313 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
314 tabout = [[gram] + dtype[gram] for gram in dtype]
315 tabout.insert(0, [''] + etoiles)
# Build ucesize, a per-uce counter incremented by one inside loops that are
# only partly visible here (they use getlemuceseff() on lemmas).
# NOTE(review): `res` is assigned from getalluces() but no use of it appears
# in the visible lines; the loop headers and the return are missing from this
# listing.
318 def make_uceactsize(self, actives) :
319 res = self.getalluces()
322 deff = self.getlemuceseff(lem)
324 ucesize[uce] = ucesize.get(uce, 0) + 1
# Group the uces, taken in corpus order, into two series of units, uc1 and
# uc2 (lists of lists of uce idents).
# Visible logic: while a uce belongs to the same paragraph as the previous
# one it is either appended to the current unit (adding its size from
# make_uceactsize to the running total last1 / last2) or starts a new unit;
# a new paragraph resets both totals and starts a new unit in both series.
# NOTE(review): the conditions involving lim1 / lim2, the initialisations and
# the return are not part of this listing.
327 def make_uc(self, actives, lim1, lim2) :
328 uceactsize = self.make_uceactsize(actives)
334 for uce in [uce for uci in self.ucis for uce in uci.uces] :
335 if uce.para == lastpara :
337 last1 += uceactsize.get(uce.ident,0)
338 uc1[-1].append(uce.ident)
340 uc1.append([uce.ident])
343 last2 += uceactsize.get(uce.ident, 0)
344 uc2[-1].append(uce.ident)
346 uc2.append([uce.ident])
349 last1 = uceactsize.get(uce.ident, 0)
350 last2 = uceactsize.get(uce.ident, 0)
352 uc1.append([uce.ident])
353 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build the two uc groupings and write their matrices and uce lists.

    actives          -- lemmas used as matrix columns.
    sizeuc1, sizeuc2 -- limits handed to make_uc().
    uc1out, uc2out   -- paths given to write_ucmatrix() for each grouping.
    listuce1out, listuce2out -- paths of the 'uce;uc' files, one line per
                        uce giving the index of the unit that holds it.
    Returns (len(uc1), len(uc2)).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    for groups, matrixout in ((uc1, uc1out), (uc2, uc2out)) :
        self.write_ucmatrix(groups, actives, matrixout)
    for groups, listout in ((uc1, listuce1out), (uc2, listuce2out)) :
        lines = ['uce;uc']
        # repr() is the portable spelling of the Python-2-only backtick syntax
        lines.extend([';'.join([repr(uce), repr(i)]) for i, ucl in enumerate(groups) for uce in ucl])
        with open(listout, 'w') as f :
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
# Write the unit x lemma presence matrix in MatrixMarket coordinate format.
# uces_uc maps each uce to the index of the unit holding it. For every lemma
# of `actives` and every uce from getlemuces(), a line "unit column 1"
# (1-based indices) is written to the temporary file fileout + '~', once per
# (unit, lemma) pair thanks to deja_la. The final file starts with the
# MatrixMarket header giving len(uc), len(actives) and nbl; the temporary
# file is then removed.
# NOTE(review): the initialisation and update of nbl and deja_la and the copy
# of the temporary file into `fileout` are not part of this listing.
369 def write_ucmatrix(self, uc, actives, fileout) :
370 log.info('write uc matrix %s' % fileout)
371 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
374 with open(fileout + '~', 'w+') as f :
375 for i, lem in enumerate(actives) :
376 for uce in self.getlemuces(lem):
377 if (uces_uc[uce], i) not in deja_la :
379 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
380 deja_la[(uces_uc[uce], i)] = 0
382 with open(fileout, 'w') as ffin :
383 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
386 os.remove(fileout + '~')
# Write the corpus as text to `outf`, every piece encoded with
# parametres['syscoding'].
# Visible logic: a uce in the current uci and paragraph is written on its own
# line; when the uci changes, the etoiles line of the new uci is written
# first (followed by the paragraph marker when the uci has paras); when only
# the paragraph changes, the paragraph marker is written before the uce.
# NOTE(review): the loop header, the initial values of actuci / actpara, the
# computation of `ident` and the `else` lines are missing from this listing.
389 def export_corpus(self, outf) :
390 #outf = 'export_corpus.txt'
392 res = self.getalluces()
396 with open(outf,'w') as f :
398 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
399 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
400 elif self.iduces[uce[0]].uci != actuci :
401 actuci = self.iduces[uce[0]].uci
402 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
403 actpara = self.iduces[uce[0]].para
404 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
407 actpara = self.iduces[uce[0]].para
408 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
409 elif self.iduces[uce[0]].para != actpara :
410 actpara = self.iduces[uce[0]].para
412 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
# Write the uces to `outf`, each preceded by a line describing it.
# Visible logic: the line is either the etoiles of the uce's uci plus
# '*classe_N' (N read from ucecl) or the etoiles rewritten as <name=value>
# tags; the uce text may be replaced by the lemmas of its forms. Both lines
# are encoded with parametres['syscoding'].
# NOTE(review): the construction of ucecl from self.lc / self.lc0, the loop
# over the uces and the tests on `alc` and `lem` are not part of this listing.
414 def export_corpus_classes(self, outf, alc = True, lem = False) :
416 for i, lc in enumerate(self.lc) :
419 for uce in self.lc0 :
421 res = self.getalluces()
423 with open(outf, 'w') as f :
426 actuci = self.iduces[uce[0]].uci
428 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
430 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
432 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
433 f.write(etline.encode(self.parametres['syscoding']) + '\n')
434 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
# Write the uce x lemma presence matrix in MatrixMarket coordinate format.
# For each lemma of `actives` and each of its uces (sorted), a line
# "uce+1 column+1 1" goes to the temporary file outfile + '~'; the final file
# starts with the header giving getucenb(), len(actives) and nbl, and the
# temporary file is removed. A 'uce;uc' file mapping every uce index to
# itself is written to `listuce`.
# NOTE(review): the counting of nbl, the copy of the temporary file and the
# test guarding the `listuce` output are not part of this listing.
436 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
437 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
439 with open(outfile + '~', 'w+') as f :
440 for i, lem in enumerate(actives) :
441 for uce in sorted(self.getlemuces(lem)) :
443 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
445 with open(outfile, 'w') as ffin :
446 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
449 os.remove(outfile + '~')
451 with open(listuce, 'w') as f :
452 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
# Write the uci x lemma presence matrix in MatrixMarket coordinate format.
# Same layout as the uce variant, with rows taken from getlemucis() and the
# row count from getucinb(); a 'uci;uc' file mapping every uci index to
# itself is written to `listuci`.
# NOTE(review): the counting of nbl, the copy of the temporary file and the
# test guarding the `listuci` output are not part of this listing.
454 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
455 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
457 with open(outfile + '~', 'w+') as f :
458 for i, lem in enumerate(actives) :
459 for uci in sorted(self.getlemucis(lem)) :
461 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
463 with open(outfile, 'w') as ffin :
464 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
467 os.remove(outfile + '~')
469 with open(listuci, 'w') as f :
470 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
# Write the presence matrix restricted to the uces of one class.
# `duces` renumbers the given uces by position; for each lemma only the uces
# that belong to `uces` are written, as "position+1 column+1 1".
# NOTE(review): the header declares getucenb() rows although the row indices
# come from positions in `uces` - confirm this is intended. The counting of
# nbl, the inner loop and the copy of the temporary file are not part of
# this listing.
472 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
473 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
475 duces = dict([[uce, i] for i, uce in enumerate(uces)])
476 with open(outfile + '~', 'w+') as f :
477 for i, lem in enumerate(actives) :
478 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
480 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
482 with open(outfile, 'w') as ffin :
483 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
486 os.remove(outfile + '~')
# Build a dense 0/1 table with one row per uce of `uces` and one column per
# lemma of `list_act`; a cell is set to 1 for the uces containing the lemma.
# `list_act` itself is inserted as the header row.
# NOTE(review): the inner loop header and the return are not part of this
# listing.
488 def make_table_with_classe(self, uces, list_act) :
489 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
490 uces = dict([[uce, i] for i, uce in enumerate(uces)])
491 for i, lem in enumerate(list_act) :
492 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
494 table_uce[uces[uce]][i] = 1
495 table_uce.insert(0, list_act)
# Set the `act` flag of every lemma.
# Visible rules: a lemma starting and ending with '_' gets 2; a lemma whose
# gram is in `gramact` gets 1; when `gramsup` is given, a gram listed in it
# gets 2; the values 0 and 2 are assigned in branches whose conditions are
# not shown.
# NOTE(review): the `else` lines are not part of this listing.
498 def parse_active(self, gramact, gramsup = None) :
499 log.info('parse actives')
500 for lem in self.lems :
501 if lem.startswith('_') and lem.endswith('_') :
502 self.lems[lem].act = 2
503 elif self.lems[lem].gram in gramact :
504 self.lems[lem].act = 1
505 elif gramsup is not None :
506 if self.lems[lem].gram in gramsup :
507 self.lems[lem].act = 2
509 self.lems[lem].act = 0
511 self.lems[lem].act = 2
# Return the lemmas whose frequency (getlemeff) is at least `limit` and whose
# `act` flag equals `key`.
# NOTE(review): the body of the `if self.idformes is None` test is not part
# of this listing.
513 def make_actives_limit(self, limit, key = 1) :
514 if self.idformes is None :
516 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
# Select lemmas with act == key and freq >= 3, most frequent first.
# self.activenb records how many candidates exist. With at most `nbmax`
# candidates all of them are returned together with the lowest frequency;
# otherwise a frequency limit `lim` and a cut position `stop` are computed
# and the lemmas up to `stop` are returned with `lim`.
# NOTE(review): allactives[-1] raises IndexError when there is no candidate.
# Most of the computation of `lim` and `stop` is not part of this listing.
518 def make_actives_nb(self, nbmax, key) :
519 log.info('make_actives_nb : %i - %i' % (nbmax,key))
520 if self.idformes is None :
522 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
523 self.activenb = len(allactives)
524 allactives = sorted(allactives, reverse = True)
525 if len(allactives) <= nbmax :
526 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
527 return [val[1] for val in allactives], allactives[-1][0]
529 effs = [val[0] for val in allactives]
530 if effs.count(effs[nbmax - 1]) > 1 :
531 lim = effs[nbmax - 1] + 1
535 stop = effs.index(lim)
542 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
543 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma x class contingency table as a ';'-separated file.

    actives -- lemmas to profile (keys accepted by getlemuces).
    ucecl   -- one collection of uce ids per class.
    fileout -- path of the file to write.
    Each line is the lemma followed by the number of its uces found in each
    class; lemmas whose counts sum to less than 3 are left out. The text is
    encoded with parametres['syscoding'].
    """
    log.info('formes/classes')
    lines = []
    for lem in actives :
        # one database lookup per lemma instead of one per (lemma, class) pair
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            # repr() is the portable spelling of the backtick syntax
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
# Collect into `etoiles` the etoiles of every uci except the first one of
# each (uci.etoiles[1:]).
# NOTE(review): the creation of `etoiles` (it needs an update() method,
# presumably a set) and the return are not part of this listing.
552 def make_etoiles(self) :
554 for uci in self.ucis :
555 etoiles.update(uci.etoiles[1:])
# Count etoile values per variable.
# Each etoile (first one of each uci excluded) is split on '_': the first
# piece is the key of `det`, the rest joined back with '_' is the modality
# whose counter is incremented or created with value 1.
# NOTE(review): the creation of `det`, the tests on the number of pieces and
# the return are not part of this listing.
558 def make_etoiles_dict(self) :
559 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
561 for etoile in etoiles :
562 et = etoile.split('_')
565 endet = '_'.join(et[1:])
566 if endet in det[et[0]] :
567 det[et[0]][endet] += 1
569 det[et[0]][endet] = 1
574 endet = '_'.join(et[1:])
575 det[et[0]] = {endet :1}
# Group uce idents by etoile: each uci adds the idents of its uces to the
# slot of the etoile it shares with `listet`.
# The string '2 variables sur la meme ligne' is returned in a case whose
# condition is not shown here.
# NOTE(review): the tests on `get` and the final return are not part of this
# listing.
580 def make_etline(self, listet) :
581 etuces = [[] for et in listet]
582 for uci in self.ucis :
583 get = list(set(uci.etoiles).intersection(listet))
585 return '2 variables sur la meme ligne'
587 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile x class contingency table as a ';'-separated file.

    ucecl   -- one collection of uce ids per class.
    fileout -- path of the file to write.
    Only the etoiles covering more than one uce (per getetoileuces) are
    kept; each line is the etoile followed by the number of its uces found in
    each class. The text is encoded with parametres['syscoding'].
    """
    log.info('etoiles/classes')
    etoileuces = self.getetoileuces()
    etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
    lines = []
    for et in etoileuces :
        # build the set once per etoile instead of once per class
        etset = set(etoileuces[et])
        # repr() is the portable spelling of the backtick syntax
        lines.append(';'.join([et] + [repr(len(etset.intersection(classe))) for classe in ucecl]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
596 #etoiles = self.make_etoiles()
597 #with open(fileout, 'w') as f :
598 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
# Build an HTML page of the corpus in which each uce is wrapped in a
# <font color=...> tag chosen from `color` by the class of the uce.
# 'black' is the first colour, followed by colors[len(self.lc) - 1]. When
# the uci changes, a rule and the etoiles of the new uci are inserted. The
# charset announced in the page is sys.getdefaultencoding().
# NOTE(review): the construction of ucecl, the start of the HTML template and
# the loop over the uces are not part of this listing.
600 def make_colored_corpus(self) :
602 for i, lc in enumerate(self.lc) :
605 for uce in self.lc0 :
607 color = ['black'] + colors[len(self.lc) - 1]
609 <meta http-equiv="content-Type" content="text/html; charset=%s" />
611 ''' % sys.getdefaultencoding()
612 res = self.getalluces()
617 if self.iduces[uce[0]].uci != actuci :
618 actuci = self.iduces[uce[0]].uci
619 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
620 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
622 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
623 return txt + '\n</body></html>'
625 def count_from_list(self, l, d) :
633 def count_from_list_cl(self, l, d, a, clnb) :
# Count the segments of `taille_segment` consecutive items found in the uces
# returned by getalluces(); segments seen at least 3 times are kept as
# [count, segment] and only the last `taille_limite` entries of the list are
# retained when it is longer.
# NOTE(review): the preparation of `uce` before slicing, the sorting of `l`
# and the return are not part of this listing.
642 def find_segments(self, taille_segment, taille_limite) :
644 for uce in self.getalluces() :
646 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
647 l = [[d[val], val] for val in d if d[val] >= 3]
650 if len(l) > taille_limite :
651 l = l[-taille_limite:]
# Same counting as find_segments, restricted to the uces of `list_uce`
# (fetched with getconcorde); kept entries are
# [count, segment, taille_segment].
# NOTE(review): the preparation of `uce` before slicing, the sorting of `l`
# and the return are not part of this listing.
654 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
656 for uce in self.getconcorde(list_uce) :
658 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
659 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
662 if len(l) > taille_limite :
663 l = l[-taille_limite:]
# Write to `fileout` the per-class counts of repeated segments.
# For every class of self.lc and every uce of it, segments of length lenmin
# to lenmax - 1 (range() excludes lenmax) are counted per class with
# count_from_list_cl; segments whose counts sum to at least `effmin` are
# written as ';'-separated lines.
# NOTE(review): the creation of `d`, the preparation of `uce` and the test on
# `lem` are not part of this listing.
666 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
668 for b, classe in enumerate(self.lc) :
669 for uce in self.getconcorde(classe) :
672 uce = [self.formes[forme].lem for forme in uce]
673 for taille_segment in range(lenmin,lenmax) :
674 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
675 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
676 with open(fileout, 'w') as f :
677 f.write('\n'.join([';'.join(line) for line in result]))
# Write to `outf` the frequencies of each grammatical type per class.
# For every lemma, its per-uce frequencies (getlemuceseff) restricted to the
# uces of each class of self.lc are added to res[gram]; the result is written
# as ';'-separated lines encoded with parametres['syscoding'].
# NOTE(review): the creation of `res` and the test guarding the
# initialisation of res[gram] are not part of this listing.
679 def make_proftype(self, outf) :
681 for lem in self.lems :
682 gram = self.lems[lem].gram
684 res[gram] = [0 for val in self.lc]
685 lemuceeff = self.getlemuceseff(lem)
686 for i, classe in enumerate(self.lc) :
687 concern = set(classe).intersection(lemuceeff.keys())
688 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
689 res = [[gram] + [`val` for val in res[gram]] for gram in res]
691 with open(outf, 'w') as f :
692 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
# Load the class of each uce from the ';'-separated file `filein`.
# Each parsed line gives [first field - 1, class number]. self.lc is then
# regrouped into one list of uce ids per class number from 0 to clnb, and
# class 0 is popped into self.lc0.
# NOTE(review): the reading loop, the reset of self.lc and the computation of
# clnb are not part of this listing.
695 def make_ucecl_from_R(self, filein) :
696 with open(filein, 'rU') as f :
701 line = line.replace('\n', '').replace('"', '').split(';')
702 self.lc.append([int(line[0]) - 1, int(line[1])])
703 classesl = [val[1] for val in self.lc]
705 self.lc = sorted(self.lc, key=itemgetter(1))
706 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
707 self.lc0 = self.lc.pop(0)
# Write per-class statistics to `outf`: class number, occurrences, number of
# forms, number of hapax, number of uces and the hapax / forms ratio.
# Counters are keyed by class number starting at 1.
# NOTE(review): float(hapaxcl[i])/float(formescl[i]) raises ZeroDivisionError
# for a class with no form. The updates of formescl and hapaxcl, the
# definition of t1 and the write of `toprint` are not part of this listing.
710 def get_stat_by_cluster(self, outf) :
711 log.info('get_stat_by_cluster')
713 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
714 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
715 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
716 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
717 sets = [set(cl) for cl in self.lc]
718 for forme in self.formes :
719 formeuceeff = self.getformeuceseff(forme)
720 for i, classe in enumerate(self.lc) :
721 concern = sets[i].intersection(formeuceeff.keys())
723 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
725 if self.formes[forme].freq == 1 :
727 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
728 with open(outf, 'w') as f :
730 log.info('%f' % (time() - t1))
# Return, for each etoile of `etoiles`, a sum of hucesdict values over the
# uces of that etoile that hold a hapax (lemma with freq == 1).
# hapaxuces holds the first uce of each such lemma; uce idents are grouped
# per etoile as in make_etline.
# The string '2 variables sur la meme ligne' is returned in a case whose
# condition is not shown here.
# NOTE(review): the construction of hucesdict is not part of this listing.
732 def gethapaxbyet(self, etoiles) :
733 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
735 for uce in hapaxuces :
736 if uce in hucesdict :
740 etuces = [[] for et in etoiles]
741 for uci in self.ucis :
742 get = list(set(uci.etoiles).intersection(etoiles))
744 return '2 variables sur la meme ligne'
746 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
747 etuces = [set(val) for val in etuces]
748 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
# Build an HTML report of the uces holding the most hapax.
# hucesdict maps a uce to [number of hapax, list of hapax lemmas]; huces
# groups the uces by that number and is sorted in decreasing order; the first
# four groups are rendered, each hapax form being wrapped in a red <font>
# tag inside the uce text.
# NOTE(review): the report is written to the hard-coded path
# '/tmp/testhapxuce.html'. `huces.sort()` on the result of zip() only works
# on Python 2, where zip() returns a list. Several lines are missing from
# this listing.
750 def gethapaxuces(self) :
751 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
752 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
754 for i,uce in enumerate(hapaxuces) :
755 if uce in hucesdict :
756 hucesdict[uce][0] += 1
757 hucesdict[uce][1].append(hapax[i])
759 hucesdict[uce] = [1,[hapax[i]]]
761 for uce in hucesdict :
762 if hucesdict[uce][0] in huces :
763 huces[hucesdict[uce][0]].append(uce)
765 huces[hucesdict[uce][0]] = [uce]
766 huces = zip(huces, huces.values())
767 huces.sort(reverse=True)
771 for nb in huces[0:4] :
772 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
774 res = self.getconcorde([uce])
776 ucetxt = ' ' + row[1] + ' '
778 for hap in hucesdict[uce][1] :
779 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
780 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
781 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
782 txt += '<p>'+ucetxt+'</p>\n'
786 with open('/tmp/testhapxuce.html','w') as f :
# Compute statistics on the ucis of `corpus`: their number, their sizes, the
# mean size (as a float) and the etoiles dictionary from make_etoiles_dict().
# NOTE(review): ucimean divides by ucinb and fails on a corpus without uci;
# what is done with these values is not visible in this listing.
791 def __init__(self, corpus) :
792 ucinb = corpus.getucinb()
793 ucisize = corpus.getucisize()
794 ucimean = float(sum(ucisize))/float(ucinb)
795 detoile = corpus.make_etoiles_dict()
# Constructor: self.etoiles is `line` split on whitespace and, when `paraset`
# is given, self.paras is `paraset` split on whitespace.
# NOTE(review): the other attribute assignments are not part of this listing.
799 def __init__(self, iduci, line, paraset = None) :
801 self.etoiles = line.split()
803 if paraset is not None :
804 self.paras = paraset.split()
809 def __init__(self, iduce, idpara, iduci) :
# Constructor taking the form, its grammatical type, its identifier and,
# optionally, a lemma and a frequency.
# NOTE(review): only the `freq is not None` test is visible in this listing;
# the attribute assignments are missing.
815 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
821 if freq is not None :
# Create a lemma from its first form: self.formes maps the form ident to the
# form frequency, and gram / freq are copied from the form.
# NOTE(review): the listing may omit further assignments; `parent` is not
# used in the visible lines.
827 def __init__(self, parent, forme) :
828 self.formes = {forme.ident : forme.freq}
829 self.gram = forme.gram
830 self.freq = forme.freq
def add_forme(self, forme) :
    """Attach one more form to the lemma.

    The form frequency is added to self.freq and recorded in self.formes
    under the form ident.
    """
    freq = forme.freq
    self.freq += freq
    self.formes[forme.ident] = freq
# Cut `chaine` at the best separator found within its first `longueur`
# items. The original French description kept below says: start from the
# last character and walk back to the start of the string; if a '$' is
# found, stop there; otherwise look for the best candidate, i.e. the highest
# weight / distance ratio.
# Weights come from `separateurs`; the distance is measured from
# `longueurOptimale`. Returns (found, head, rest), or (False, chaine, '')
# when no cut point was found.
# NOTE(review): the docstring delimiters, the loop and several assignments
# are not part of this listing.
837 def decouperlist(chaine, longueur, longueurOptimale) :
839 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
840 Si on trouve un '$', c'est fini.
841 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
843 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
844 dsep = dict([[val[0],val[1]] for val in separateurs])
845 trouve = False # whether a good separator was found
846 iDecoupe = 0 # index of the character where to cut
848 longueur = min(longueur, len(chaine) - 1)
849 chaineTravail = chaine[:longueur + 1]
851 meilleur = ['', 0, 0] # type, weight and position of the best separator
854 indice = chaineTravail.index(u'$')
856 iDecoupe = indice - 1
861 caractere = chaineTravail[nbCar]
862 distance = abs(longueurOptimale - nbCar) + 1
863 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
864 if caractere in dsep :
865 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
866 meilleur[0] = caractere
867 meilleur[1] = dsep[caractere]
872 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
874 meilleur[1] = dsep[' ']
881 #if meilleur[0] != ' ' :
882 # fin = chaine[iDecoupe + 1:]
883 # retour = chaineTravail[:iDecoupe]
885 fin = chaine[iDecoupe + 1:]
886 retour = chaineTravail[:iDecoupe + 1]
887 return len(retour) > 0, retour, fin
888 # nothing was found
889 return False, chaine, ''
891 def testetoile(line) :
892 return line.startswith(u'****')
895 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split `txt` on whitespace and append the end marker u'$'."""
    words = txt.split()
    words.append(u'$')
    return words
900 def prep_txtcharact(txt) :
905 Class for building a corpus
# Constructor: prepares the build of a corpus from `infile`.
# Visible effects: keeps the lexique and expressions, creates the Corpus,
# reads 'lim' (default 1000000) and 'encoding' from the parameters, creates
# the output directory through PathOut, stores a fresh uuid, the corpus name
# (last component of 'pathout') and the type 'corpus' in the corpus
# parameters, and chooses the list of punctuation marks from 'keep_ponct'.
# NOTE(review): parametres_corpus must provide 'encoding', 'originalpath',
# 'pathout' and 'keep_ponct' (read with []); several lines are missing from
# this listing.
907 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
908 log.info('begin building corpus...')
909 self.lexique = lexique
910 self.expressions = expressions
912 self.corpus = Corpus(self, parametres_corpus)
915 self.lim = parametres_corpus.get('lim', 1000000)
916 self.encoding = parametres_corpus['encoding']
917 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
918 self.corpus.pathout.createdir(parametres_corpus['pathout'])
919 self.corpus.parametres['uuid'] = str(uuid4())
920 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
921 self.corpus.parametres['type'] = 'corpus'
922 if self.corpus.parametres['keep_ponct'] :
923 self.ponctuation_espace = [' ', '']
925 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
927 self.tolist = self.corpus.parametres.get('tolist', 0)
# Choose how texts are cut into uces from the 'ucemethod' parameter
# (default 0): either decouperlist / prep_txtlist with a default 'ucesize'
# of 40, or decoupercharact / prep_txtcharact with a default of 240.
# NOTE(review): the test on `method` selecting the branch is not part of
# this listing.
934 def prep_makeuce(self) :
935 method = self.corpus.parametres.get('ucemethod', 0)
937 self.decouper = decouperlist
938 self.prep_txt = prep_txtlist
939 self.ucesize = self.corpus.parametres.get('ucesize', 40)
941 self.decouper = decoupercharact
942 self.prep_txt = prep_txtcharact
943 self.ucesize = self.corpus.parametres.get('ucesize', 240)
944 log.info('method uce : %s' % method)
949 self.read_corpus(self.infile)
950 except Warning, args :
951 log.info('pas kool %s' % args)
955 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
956 self.time = time() - t1
958 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
959 log.info('time : %f' % (time() - t1))
962 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
963 self.cf = self.conn_f.cursor()
964 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
965 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
967 self.cf = self.conn_f.cursor()
968 self.cf.execute('PRAGMA temp_store=MEMORY;')
969 self.cf.execute('PRAGMA journal_mode=MEMORY;')
970 self.cf.execute('PRAGMA synchronous = OFF;')
971 self.cf.execute('begin')
972 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
973 self.c = self.conn.cursor()
974 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
976 self.c = self.conn.cursor()
977 self.c.execute('PRAGMA temp_store=MEMORY;')
978 self.c.execute('PRAGMA journal_mode=MEMORY;')
979 self.c.execute('PRAGMA synchronous = OFF;')
980 self.c.execute('begin')
983 #commit index and close db
986 self.cf.execute('CREATE INDEX iduces ON uces (id);')
987 self.cf.execute('CREATE INDEX ideff ON eff (id);')
991 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
992 self.ccorpus = self.conn_corpus.cursor()
993 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
994 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
995 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
996 self.conn_corpus.commit()
997 self.ccorpus = self.conn_corpus.cursor()
998 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
999 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1000 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1001 self.ccorpus.execute('begin')
1002 self.backup_corpus()
1003 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1004 self.conn_corpus.commit()
1005 self.conn_corpus.close()
1006 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill self.cleans with the cleaning steps enabled in the corpus parameters.

    Steps are appended in this order: dolower, firstclean, docharact,
    make_expression, doapos, dotiret. Every option counts as enabled when
    absent, except 'charact' which must be present (KeyError otherwise).
    When 'charact' is set, self.rule is read from 'keep_caract' or falls
    back to the built-in character class.
    """
    parametres = self.corpus.parametres
    for option, stepname in (('lower', 'dolower'), ('firstclean', 'firstclean')) :
        if parametres.get(option, 1) :
            self.cleans.append(getattr(self, stepname))
    if parametres['charact'] :
        self.rule = parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    for option, stepname in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')) :
        if parametres.get(option, 1) :
            self.cleans.append(getattr(self, stepname))
# Replace in `txt` every key of self.expressions found in it by the first
# item of the value stored for that key.
# NOTE(review): the return statement is not part of this listing.
1023 def make_expression(self,txt) :
1024 for expression in self.expressions:
1025 if expression in txt :
1026 txt = txt.replace(expression, self.expressions[expression][0])
1029 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace each run of characters matching the class in self.rule by one space.

    self.rule is the inside of a regular-expression character class; the
    default set by buildcleans starts with '^', so it is the characters NOT
    listed that get replaced.
    """
    pattern = u"[" + self.rule + "]+"
    cleaned = re.sub(pattern, ' ', txt)
    return cleaned
def doapos(self, txt) :
    """Return `txt` with every apostrophe (') turned into a space."""
    apostrophe = u'\''
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return ``txt`` with every hyphen turned into a space."""
    hyphen = u'-'
    blank = u' '
    # Splitting on the hyphen and re-joining with a space is the same
    # substitution as a direct replace.
    return blank.join(txt.split(hyphen))
def firstclean(self, txt):
    """Normalise typography and isolate punctuation marks as separate tokens.

    The typographic apostrophe becomes a straight one and the "oe" ligature
    is expanded.  An ellipsis (three dots, or the single ellipsis character)
    is replaced by a placeholder token surrounded by spaces, and each of
    ``? . ! , ; :`` is surrounded by spaces.

    Every pattern and replacement is a unicode literal.  The final
    replacement used to be a non-ASCII byte string, which Python 2 has to
    decode with the default codec when mixing it with unicode text; that
    raises ``UnicodeDecodeError`` whenever the default encoding is ASCII.

    :param txt: unicode text to clean.
    :return: the cleaned unicode text.
    """
    # Order matters: the three-dot ellipsis must be consumed before single
    # full stops are spaced out.
    substitutions = (
        (u'’', u"'"),
        (u'œ', u'oe'),
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
# Iterate over the cleaning callables collected in self.cleans.
# NOTE(review): the loop body and the return statement are not shown in this
# listing; presumably each callable is applied to txt in turn and the final
# text is returned -- confirm against the full file.
1048 def make_cleans(self, txt) :
1049 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory form/segment index through the ``self.cf`` cursor.

    ``self.corpus.idformesuces`` maps a form identifier to a mapping of
    segment identifier -> count.  For every form, one row goes to table
    ``uces`` (repr of the form identifier, space-separated reprs of the
    segment identifiers) and one row to table ``eff`` (same key,
    space-separated reprs of the counts, in the same order).  The index is
    then reset to an empty dict.  Nothing is written when the index is
    already empty.

    ``repr()`` replaces the backtick shorthand, which produces the same
    strings but is not valid syntax on Python 3.
    """
    index = self.corpus.idformesuces
    if index != {}:
        log.info('backup %i' % len(index))
        touce = []
        toeff = []
        # Single pass over the index: both rows of a form share the same key
        # and the same iteration order of its segment mapping.
        for forme, occurrences in index.items():
            key = repr(forme)
            touce.append((key, ' '.join([repr(uce) for uce in occurrences])))
            toeff.append((key, ' '.join([repr(eff) for eff in occurrences.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
# Write the in-memory corpus structure through the self.ccorpus cursor:
#   - one 'etoiles' row per uci: ident, space-joined etoiles, space-joined paras;
#   - one 'luces' row per uce: repr of the uci ident, of the uce paragraph and
#     of the uce ident (backticks are Python 2 repr);
#   - one 'formes' row per form: repr of ident, the form, its lem, its gram,
#     repr of its freq.
# The elapsed time is logged at the end.
# NOTE(review): `t`, used by the final log call, is assigned on a line that is
# not shown in this listing (presumably `t = time()`) -- confirm.
# NOTE(review): indentation is lost here; the nesting of the `for uce` loop
# inside the `for uci` loop is assumed from the use of `uci.ident` -- confirm.
1063 def backup_corpus(self) :
1064 log.info('start backup corpus')
1066 for uci in self.corpus.ucis :
1067 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1068 for uce in uci.uces :
1069 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1070 for forme in self.corpus.formes :
1071 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1072 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the summary statistics of the built corpus in its parametres.

    Fills ``self.corpus.parametres`` with the completion date, the build
    duration (``self.time`` split by 60 twice and rendered as
    ``'<h>h <m>m <s>s'``), the uci / uce / occurrence / form counts and a
    hapax summary string.

    The two hapax percentages are reported as 0 when the corpus has no form
    or no occurrence, instead of raising ``ZeroDivisionError``.
    """
    stats = self.corpus.parametres
    stats['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    stats['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    stats['ucinb'] = self.corpus.getucinb()
    stats['ucenb'] = self.corpus.getucenb()
    stats['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    stats['formesnb'] = formesnb
    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios: an empty corpus has zero forms and zero occurrences.
    if formesnb:
        pourhapaxf = (float(hapaxnb) / formesnb) * 100
    else:
        pourhapaxf = 0.0
    if stats['occurrences']:
        pourhapaxocc = (float(hapaxnb) / stats['occurrences']) * 100
    else:
        pourhapaxocc = 0.0
    stats['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# BuildFromAlceste: BuildCorpus subclass whose read_corpus parses `infile`
# line by line (opened with codecs.open using self.encoding).
#
# read_corpus, as far as this listing shows:
#   - self.testuci, the predicate recognising a text-start line, is chosen from
#     the 'ucimark' parametre (0 -> testetoile, 1 -> testint);
#   - a line accepted by self.testuci leads to a new Uci being appended to
#     self.corpus.ucis; pending text is first sent to self.treattxt;
#   - a line starting with u'-*' contributes its first whitespace-separated
#     token to the paras of the current uci;
#   - other non-blank lines are handled only once a text has been opened
#     (iduci != -1);
#   - the progress dialog, when present, is pulsed every 10 texts.
# Raises Exception with the messages 'paragrapheOT', "EmptyText",
# 'TextBeforeTextMark', and "CorpusEncoding" (the latter on UnicodeDecodeError).
# NOTE(review): indentation and many lines (initialisation of txt / iduci /
# iduce / idpara, the `try`, several `else` branches) are missing from this
# listing, so the exact nesting of the statements below cannot be read off
# here -- confirm against the full file before changing anything.
1089 class BuildFromAlceste(BuildCorpus) :
1090 def read_corpus(self, infile) :
1091 if self.dlg is not None :
1092 self.dlg.Pulse('textes : 0 - segments : 0')
# Select the text-start predicate from the 'ucimark' setting.
1095 if self.corpus.parametres['ucimark'] == 0 :
1096 self.testuci = testetoile
1097 elif self.corpus.parametres['ucimark'] == 1 :
1098 self.testuci = testint
1104 with codecs.open(infile, 'r', self.encoding) as f :
1105 for linenb, line in enumerate(f) :
1106 line = line.rstrip('\n\r')
1107 if self.testuci(line) :
1110 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1112 self.corpus.ucis.append(Uci(iduci, line))
# A previous text that ended up with no uce is logged and dropped.
1115 if self.corpus.ucis[-1].uces == [] :
1116 log.info(u'Empty text : %i' % linenb)
1118 self.corpus.ucis.pop()
1119 #raise Exception("EmptyText %i" % linenb)
1120 self.corpus.ucis.append(Uci(iduci, line))
1121 if self.dlg is not None :
1122 if not (iduci + 1) % 10 :
1123 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Lines starting with '-*': the first token is recorded in the uci's paras.
1124 elif line.startswith(u'-*') :
1127 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1130 self.corpus.ucis[-1].paras.append(line.split()[0])
1132 raise Exception('paragrapheOT')
1133 elif line.strip() != '' and iduci != -1 :
1135 if txt != [] and iduci != -1 :
1136 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1139 raise Exception("EmptyText")
1140 if iduci != -1 and iduce != -1:
1143 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1144 raise Exception('TextBeforeTextMark')
# Decoding failures are reported as a generic "CorpusEncoding" exception.
1145 except UnicodeDecodeError :
1146 raise Exception("CorpusEncoding")
# Turn the accumulated text `txt` into uces attached to the last uci.
#   - When 'ucemethod' is 2 and 'douce' is truthy, the elements of txt are
#     joined with the sentinel 'laphrasepoursplitter', cleaned, stripped of the
#     tokens listed in self.ponctuation_espace, then split back on the sentinel.
#   - The other visible path cleans txt and delegates the segmentation to
#     self.make_uces.
#   - For each resulting uce, a Uce(iduce, idpara, iduci) is appended to the
#     last uci, a row (repr of iduce, uce text) is inserted into table 'uces'
#     through self.c, and words are registered with self.corpus.add_word.
# Returns the (iduce, idpara) pair.
# NOTE(review): 'douce' is read by plain indexing and must be present.
# NOTE(review): indentation and several lines (the `else`, the loop over
# ucetxt, the updates of iduce / idpara, the self.last / self.lim handling)
# are missing from this listing -- confirm the nesting against the full file.
1148 def treattxt(self, txt, iduce, idpara, iduci) :
1149 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1150 txt = 'laphrasepoursplitter'.join(txt)
1151 txt = self.make_cleans(txt)
1152 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1153 ucetxt = txt.split('laphrasepoursplitter')
1156 txt = self.make_cleans(txt)
1157 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1158 if self.corpus.ucis[-1].paras == [] :
1162 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1163 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1164 if not self.tolist :
1170 self.corpus.add_word(word)
1171 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1172 if self.last > self.lim :
1175 return iduce, idpara
# Segment `txt` into uce strings.
#   - Whitespace is first collapsed to single spaces.
#   - Segments are cut with self.decouper, called with self.ucesize + 15 and
#     self.ucesize; the first call receives self.prep_txt(txt), the next one the
#     remaining `suite`.  Tokens listed in self.ponctuation_espace are removed
#     from each segment before it is joined into a string.
#   - The last visible return yields a one-element list holding the whole text
#     without those punctuation tokens.
# NOTE(review): the conditions on `douce` / `keep_ponct`, the loop around the
# second decouper call and the list accumulating the segments are on lines not
# shown in this listing -- confirm against the full file.
1177 def make_uces(self, txt, douce = True, keep_ponct = False) :
1178 txt = ' '.join(txt.split())
1181 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1183 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1186 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1187 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1192 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1194 #decouper (list_sep)
1195 #make_uces (decouper)
1196 #treat_txt (make_uces)
# Constructor of the corpus-building front end (the enclosing class header is
# not shown in this listing).
#   - Loads the 'corpus' section of corpus.cfg from the parent's
#     UserConfigPath and derives an output directory from parent.filename.
#   - Shows the CorpusPref dialog; its result is kept in self.res.
#   - When the result equals 5100, the parameters are taken from the dialog,
#     the original path is recorded, the output directory is created, and the
#     lexique and the expressions dictionary for the chosen language are loaded
#     onto the parent; the parameters are stored in self.parametres.
# NOTE(review): 5100 looks like the numeric value of wx.ID_OK -- confirm, and
# prefer the named constant if so.
# NOTE(review): the fallback of DictPath.get(...) is the literal string
# 'french_exp', not a DictPath entry, so a missing language hands that bare
# string to ReadDicoAsDico -- confirm this is intended.
# NOTE(review): self.dlg is read below but its assignment is on a line not
# shown in this listing (presumably `self.dlg = dlg`); the statements following
# the last `if` are not shown either.
1200 def __init__(self, parent, dlg = None) :
1201 self.parent = parent
1203 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1204 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1205 dial = CorpusPref(parent, parametres)
1206 dial.CenterOnParent()
1207 dial.txtpath.SetLabel(parent.filename)
1208 #dial.repout_choices.SetValue(parametres['pathout'])
1209 self.res = dial.ShowModal()
1210 if self.res == 5100 :
1211 parametres = dial.doparametres()
1212 parametres['originalpath'] = parent.filename
1213 PathOut().createdir(parametres['pathout'])
1214 ReadLexique(self.parent, lang = parametres['lang'])
1215 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1216 self.parametres = parametres
1218 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    A ``BuildFromAlceste`` is created from the parent's filename, the stored
    parameters, and the parent's lexique and expressions (with ``self.dlg``
    passed as ``dlg``); its ``corpus`` attribute is returned.
    """
    owner = self.parent
    builder = BuildFromAlceste(
        owner.filename,
        self.parametres,
        owner.lexique,
        owner.expressions,
        dlg=self.dlg
    )
    return builder.corpus
1226 if __name__ == '__main__' :
1228 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1229 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)