1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce identifiers stored for one form.

    `wordid` is either a form identifier or the form itself as a string;
    a string is first resolved to its identifier through `self.formes`.
    Each matching row holds either a single integer or a
    whitespace-separated string of integers; both are flattened into one
    list of ints.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return found
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce identifiers in which lemma `lem` occurs.

    The form identifiers of the lemma are read from `self.lems[lem].formes`
    and looked up in the `uces` table. The result is de-duplicated through
    a set, so its order is unspecified.
    """
    id_list = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % id_list
    collected = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            collected.append(cell)
        else:
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces containing lemma `lem`.

    The result goes through a set, so its order is unspecified.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many uces of `self.lc[cluster]` contain lemma `lem`."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored for `lem` in `self.lems`."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of `self.idformes` for `formeid`.

    The index is built on first use by calling `self.make_idformes()`.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attributes of every entry in `self.formes`."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return `gettotocc()` divided by `getucenb()` as a float."""
    occurrences = float(self.gettotocc())
    uce_count = self.getucenb()
    return occurrences / uce_count
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one size per uci, obtained by summing the sizes of its uces.

    The per-uce sizes come from `getucesize()`; for each uci the slice
    running from its first uce ident to its last uce ident (inclusive)
    is summed.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return, for every row of `getalluces()`, the number of whitespace-separated tokens in its second column."""
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run a query on the `uces` table for the rows whose id is in `uces`.

    Returns whatever `self.cuces.execute` returns for that query.
    """
    id_list = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % id_list
    return self.cuces.execute(query)
def getwordconcorde(self, word):
    """Return `getconcorde()` applied to the uces returned by `getworduces(word)`."""
    uce_ids = self.getworduces(word)
    return self.getconcorde(uce_ids)
def getlemconcorde(self, lem):
    """Return `getconcorde()` applied to the uces returned by `getlemuces(lem)`."""
    uce_ids = self.getlemuces(lem)
    return self.getconcorde(uce_ids)
def getalluces(self):
    """Run `SELECT * FROM uces` on `self.cuces` and return the result of `execute`."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile):
    """Return the idents of all uces belonging to a uci whose `etoiles` contain `etoile`."""
    selected = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                selected.append(uce.ident)
    return selected
213 def getetoileuces(self) :
214 log.info('get uces etoiles')
217 for uci in self.ucis :
218 etoiles = uci.etoiles[1:]
220 if et in etoileuces :
221 etoileuces[et] += [uce.ident for uce in uci.uces]
223 etoileuces[et] = [uce.ident for uce in uci.uces]
225 for et in uci.paras :
226 if et in etoileuces :
227 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
229 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid):
    """Return the entry of `self.iduces` for `uceid`.

    The index is built on first use by calling `self.make_iduces()`.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries in `self.formes` whose `freq` equals 1."""
    count = 0
    for word in self.formes.values():
        if word.freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries in `self.lems` whose `act` attribute equals `key`."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
244 # def make_lems(self, lem = True) :
245 # log.info('make lems')
247 # for forme in self.formes :
248 # if self.formes[forme].lem in self.lems :
249 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
250 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
252 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the `etoiles` of the uci that owns the uce `uceid`.

    The uce-ident -> uci-ident mapping is cached in `self.uceuci` and
    built on the first call. The uci ident is then used as an index
    into `self.ucis`.
    """
    if self.uceuci is None:
        owners = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owners[uce.ident] = uci.ident
        self.uceuci = owners
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
258 def make_lems(self, lem = True) :
259 log.info('make lems')
262 for forme in self.formes :
263 if self.formes[forme].lem in self.lems :
264 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
265 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
267 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
269 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build `self.idformes`, mapping each form's `ident` to its entry in `self.formes`."""
    by_ident = {}
    for forme in self.formes:
        word = self.formes[forme]
        by_ident[word.ident] = word
    self.idformes = by_ident
def make_iduces(self):
    """Build `self.iduces` (uce ident -> uce object) unless it is already set."""
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
278 def make_lexitable(self, mineff, etoiles) :
279 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
280 etuces = [[] for et in etoiles]
281 for uci in self.ucis :
282 get = list(set(uci.etoiles).intersection(etoiles))
284 log.info('2 variables sur une ligne')
286 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
287 etuces = [set(val) for val in etuces]
290 deff = self.getlemuceseff(lem)
292 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
293 tab.insert(0, [''] + etoiles)
296 def make_efftype_from_etoiles(self, etoiles) :
298 etuces = [[] for et in etoiles]
299 for uci in self.ucis :
300 get = list(set(uci.etoiles).intersection(etoiles))
302 return '2 variables sur la meme ligne'
304 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
305 etuces = [set(val) for val in etuces]
306 for lem in self.lems :
307 deff = self.getlemuceseff(lem)
309 gram = self.lems[lem].gram
311 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
313 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
314 tabout = [[gram] + dtype[gram] for gram in dtype]
315 tabout.insert(0, [''] + etoiles)
318 def make_uceactsize(self, actives) :
319 res = self.getalluces()
322 deff = self.getlemuceseff(lem)
324 ucesize[uce] = ucesize.get(uce, 0) + 1
327 def make_uc(self, actives, lim1, lim2) :
328 uceactsize = self.make_uceactsize(actives)
334 for uce in [uce for uci in self.ucis for uce in uci.uces] :
335 if uce.para == lastpara :
337 last1 += uceactsize.get(uce.ident,0)
338 uc1[-1].append(uce.ident)
340 uc1.append([uce.ident])
343 last2 += uceactsize.get(uce.ident, 0)
344 uc2[-1].append(uce.ident)
346 uc2.append([uce.ident])
349 last1 = uceactsize.get(uce.ident, 0)
350 last2 = uceactsize.get(uce.ident, 0)
352 uc1.append([uce.ident])
353 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc tables.

    `make_uc` yields two lists of uce groups. Each grouping is written as a
    matrix through `write_ucmatrix`, then as a `;`-separated table with a
    `uce;uc` header and one `uce;group index` line per uce.
    Returns the number of groups in each grouping.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)

    def table(groups):
        # header followed by one [uce, group index] pair per uce, as strings
        rows = [['uce', 'uc']]
        for position, members in enumerate(groups):
            for uce in members:
                rows.append([repr(uce), repr(position)])
        return rows

    # both tables are built before any file is opened
    outputs = [(listuce1out, table(uc1)), (listuce2out, table(uc2))]
    for path, rows in outputs:
        with open(path, 'w') as f:
            f.write('\n'.join([';'.join(row) for row in rows]))
    return len(uc1), len(uc2)
369 def write_ucmatrix(self, uc, actives, fileout) :
370 log.info('write uc matrix %s' % fileout)
371 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
374 with open(fileout + '~', 'w+') as f :
375 for i, lem in enumerate(actives) :
376 for uce in self.getlemuces(lem):
377 if (uces_uc[uce], i) not in deja_la :
379 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
380 deja_la[(uces_uc[uce], i)] = 0
382 with open(fileout, 'w') as ffin :
383 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
386 os.remove(fileout + '~')
389 def export_corpus(self, outf) :
390 #outf = 'export_corpus.txt'
392 res = self.getalluces()
396 with open(outf,'w') as f :
398 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
399 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
400 elif self.iduces[uce[0]].uci != actuci :
401 actuci = self.iduces[uce[0]].uci
402 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
403 actpara = self.iduces[uce[0]].para
404 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
407 actpara = self.iduces[uce[0]].para
408 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
409 elif self.iduces[uce[0]].para != actpara :
410 actpara = self.iduces[uce[0]].para
412 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
414 def export_corpus_classes(self, outf, alc = True, lem = False) :
416 for i, lc in enumerate(self.lc) :
419 for uce in self.lc0 :
421 res = self.getalluces()
423 with open(outf, 'w') as f :
426 actuci = self.iduces[uce[0]].uci
428 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
430 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
432 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
433 f.write(etline.encode(self.parametres['syscoding']) + '\n')
434 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
436 def export_classe(self, outf, classe, lem = False) :
437 sts = self.lc[classe]
438 res = self.getconcorde(sts)
440 with open(outf, 'w') as f :
443 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
445 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
446 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
448 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
449 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
451 with open(outfile + '~', 'w+') as f :
452 for i, lem in enumerate(actives) :
453 for uce in sorted(self.getlemuces(lem)) :
455 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
457 with open(outfile, 'w') as ffin :
458 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
461 os.remove(outfile + '~')
463 with open(listuce, 'w') as f :
464 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
466 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
467 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
469 with open(outfile + '~', 'w+') as f :
470 for i, lem in enumerate(actives) :
471 for uci in sorted(self.getlemucis(lem)) :
473 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
475 with open(outfile, 'w') as ffin :
476 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
479 os.remove(outfile + '~')
481 with open(listuci, 'w') as f :
482 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
484 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
485 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
487 duces = dict([[uce, i] for i, uce in enumerate(uces)])
488 with open(outfile + '~', 'w+') as f :
489 for i, lem in enumerate(actives) :
490 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
492 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
494 with open(outfile, 'w') as ffin :
495 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
498 os.remove(outfile + '~')
500 def make_table_with_classe(self, uces, list_act) :
501 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
502 uces = dict([[uce, i] for i, uce in enumerate(uces)])
503 for i, lem in enumerate(list_act) :
504 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
506 table_uce[uces[uce]][i] = 1
507 table_uce.insert(0, list_act)
510 def parse_active(self, gramact, gramsup = None) :
511 log.info('parse actives')
512 for lem in self.lems :
513 if lem.startswith('_') and lem.endswith('_') :
514 self.lems[lem].act = 2
515 elif self.lems[lem].gram in gramact :
516 self.lems[lem].act = 1
517 elif gramsup is not None and self.lems[lem].gram not in gramact:
518 if self.lems[lem].gram in gramsup :
519 self.lems[lem].act = 2
521 self.lems[lem].act = 0
523 self.lems[lem].act = 2
525 def make_actives_limit(self, limit, key = 1) :
526 if self.idformes is None :
528 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
530 def make_actives_nb(self, nbmax, key) :
531 log.info('make_actives_nb : %i - %i' % (nbmax,key))
532 if self.idformes is None :
534 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
535 self.activenb = len(allactives)
536 allactives = sorted(allactives, reverse = True)
537 if len(allactives) <= nbmax :
538 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
539 return [val[1] for val in allactives], allactives[-1][0]
541 effs = [val[0] for val in allactives]
542 if effs.count(effs[nbmax - 1]) > 1 :
543 lim = effs[nbmax - 1] + 1
547 stop = effs.index(lim)
554 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
555 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma/class contingency table to `fileout`.

    For each lemma in `actives`, count in how many uces of every class of
    `ucecl` it occurs. Lemmas whose counts sum to less than 3 are dropped.
    Each remaining lemma gives one `;`-separated line: the lemma followed
    by its per-class counts. The text is encoded with
    `self.parametres['syscoding']` before being written.
    """
    log.info('formes/classes')
    rows = []
    for lem in actives:
        # getlemuces() queries the database: call it once per lemma,
        # not once per (lemma, class) pair.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            rows.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in rows]).encode(self.parametres['syscoding']))
564 def make_etoiles(self) :
566 for uci in self.ucis :
567 etoiles.update(uci.etoiles[1:])
570 def make_etoiles_dict(self) :
571 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
573 for etoile in etoiles :
574 et = etoile.split('_')
577 endet = '_'.join(et[1:])
578 if etoile in det[et[0]] :
579 det[et[0]][etoile] += 1
581 det[et[0]][etoile] = 1
586 endet = '_'.join(et[1:])
587 det[et[0]] = {etoile :1}
592 def make_etline(self, listet) :
593 etuces = [[] for et in listet]
594 for uci in self.ucis :
595 get = list(set(uci.etoiles).intersection(listet))
597 return '2 variables sur la meme ligne'
599 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile/class contingency table to `fileout`.

    Etoiles attached to at most one uce are ignored. Each remaining etoile
    gives one `;`-separated line: the etoile followed by the number of its
    uces found in each class of `ucecl`. The text is encoded with
    `self.parametres['syscoding']` before being written.
    """
    log.info('etoiles/classes')
    every = self.getetoileuces()
    kept = dict([[et, every[et]] for et in every if len(every[et]) > 1])
    lines = []
    for et in kept:
        members = set(kept[et])
        cells = [et]
        for classe in ucecl:
            cells.append(repr(len(members.intersection(classe))))
        lines.append(';'.join(cells))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
608 #etoiles = self.make_etoiles()
609 #with open(fileout, 'w') as f :
610 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
612 def make_colored_corpus(self) :
614 for i, lc in enumerate(self.lc) :
617 for uce in self.lc0 :
619 color = ['black'] + colors[len(self.lc) - 1]
621 <meta http-equiv="content-Type" content="text/html; charset=%s" />
623 ''' % sys.getdefaultencoding()
624 res = self.getalluces()
629 if self.iduces[uce[0]].uci != actuci :
630 actuci = self.iduces[uce[0]].uci
631 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
632 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
634 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
635 return txt + '\n</body></html>'
637 def count_from_list(self, l, d) :
645 def count_from_list_cl(self, l, d, a, clnb) :
654 def find_segments(self, taille_segment, taille_limite) :
656 for uce in self.getalluces() :
658 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
659 l = [[d[val], val] for val in d if d[val] >= 3]
662 if len(l) > taille_limite :
663 l = l[-taille_limite:]
666 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
668 for uce in self.getconcorde(list_uce) :
670 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
671 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
674 if len(l) > taille_limite :
675 l = l[-taille_limite:]
678 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
680 for b, classe in enumerate(self.lc) :
681 for uce in self.getconcorde(classe) :
684 uce = [self.formes[forme].lem for forme in uce]
685 for taille_segment in range(lenmin,lenmax) :
686 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
687 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
688 with open(fileout, 'w') as f :
689 f.write('\n'.join([';'.join(line) for line in result]))
691 def make_proftype(self, outf) :
693 for lem in self.lems :
694 gram = self.lems[lem].gram
696 res[gram] = [0 for val in self.lc]
697 lemuceeff = self.getlemuceseff(lem)
698 for i, classe in enumerate(self.lc) :
699 concern = set(classe).intersection(lemuceeff.keys())
700 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
701 res = [[gram] + [`val` for val in res[gram]] for gram in res]
703 with open(outf, 'w') as f :
704 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
707 def make_ucecl_from_R(self, filein) :
708 with open(filein, 'rU') as f :
713 line = line.replace('\n', '').replace('"', '').split(';')
714 self.lc.append([int(line[0]) - 1, int(line[1])])
715 classesl = [val[1] for val in self.lc]
717 self.lc = sorted(self.lc, key=itemgetter(1))
718 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
719 self.lc0 = self.lc.pop(0)
722 def get_stat_by_cluster(self, outf, lclasses = None) :
723 log.info('get_stat_by_cluster')
724 if lclasses is None :
727 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
728 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
729 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
730 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
731 sets = [set(cl) for cl in lclasses]
732 for forme in self.formes :
733 formeuceeff = self.getformeuceseff(forme)
734 for i, classe in enumerate(lclasses) :
735 concern = sets[i].intersection(formeuceeff.keys())
737 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
739 if self.formes[forme].freq == 1 :
741 log.info('%f' % (time() - t1))
742 if outf is not None :
743 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
744 with open(outf, 'w') as f :
747 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Return the per-cluster statistics computed for each etoile.

    The uces of every etoile are treated as one cluster and passed to
    `get_stat_by_cluster` (with no output file). Each resulting row is
    prefixed with the matching etoile.

    `outf` is accepted for interface compatibility but is not used here.
    The table used to be computed and then dropped; it is now returned.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    return [[etoiles[i]] + val for i, val in enumerate(stats)]
754 def gethapaxbyet(self, etoiles) :
755 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
757 for uce in hapaxuces :
758 if uce in hucesdict :
762 etuces = [[] for et in etoiles]
763 for uci in self.ucis :
764 get = list(set(uci.etoiles).intersection(etoiles))
766 return '2 variables sur la meme ligne'
768 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
769 etuces = [set(val) for val in etuces]
770 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
772 def gethapaxuces(self) :
773 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
774 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
776 for i,uce in enumerate(hapaxuces) :
777 if uce in hucesdict :
778 hucesdict[uce][0] += 1
779 hucesdict[uce][1].append(hapax[i])
781 hucesdict[uce] = [1,[hapax[i]]]
783 for uce in hucesdict :
784 if hucesdict[uce][0] in huces :
785 huces[hucesdict[uce][0]].append(uce)
787 huces[hucesdict[uce][0]] = [uce]
788 huces = zip(huces, huces.values())
789 huces.sort(reverse=True)
793 for nb in huces[0:4] :
794 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
796 res = self.getconcorde([uce])
798 ucetxt = ' ' + row[1] + ' '
800 for hap in hucesdict[uce][1] :
801 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
802 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
803 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
804 txt += '<p>'+ucetxt+'</p>\n'
808 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms dictionary to `fileout`, most frequent form first.

    Each line holds, tab-separated: the form, its lemma, its grammatical
    type and its frequency. The text is encoded with `syscoding`.
    """
    entries = []
    for forme in self.formes:
        word = self.formes[forme]
        entries.append([word.freq, forme, word.lem, word.gram])
    entries = sorted(entries, reverse=True)
    # move the frequency, used as sort key, to the end of each record
    lines = ['\t'.join(entry[1:] + [repr(entry[0])]) for entry in entries]
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(syscoding))
818 def export_lems(self, fileout, syscoding) :
820 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
822 with open(fileout, 'w') as f :
823 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
829 def __init__(self, corpus) :
830 ucinb = corpus.getucinb()
831 ucisize = corpus.getucisize()
832 ucimean = float(sum(ucisize))/float(ucinb)
833 detoile = corpus.make_etoiles_dict()
837 def __init__(self, iduci, line, paraset = None) :
839 self.etoiles = line.split()
841 if paraset is not None :
842 self.paras = paraset.split()
847 def __init__(self, iduce, idpara, iduci) :
853 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
859 if freq is not None :
865 def __init__(self, parent, forme) :
866 self.formes = {forme.ident : forme.freq}
867 self.gram = forme.gram
868 self.freq = forme.freq
def add_forme(self, forme):
    """Register `forme` under its ident in `self.formes` and add its frequency to `self.freq`."""
    occurrences = forme.freq
    self.formes[forme.ident] = occurrences
    self.freq += occurrences
875 def decouperlist(chaine, longueur, longueurOptimale) :
877 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
878 Si on trouve un '$', c'est fini.
879 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
881 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
882 dsep = dict([[val[0],val[1]] for val in separateurs])
883 trouve = False # si on a trouvé un bon séparateur
884 iDecoupe = 0 # indice du caractere ou il faut decouper
886 longueur = min(longueur, len(chaine) - 1)
887 chaineTravail = chaine[:longueur + 1]
889 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
892 indice = chaineTravail.index(u'$')
894 iDecoupe = indice - 1
899 caractere = chaineTravail[nbCar]
900 distance = abs(longueurOptimale - nbCar) + 1
901 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
902 if caractere in dsep :
903 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
904 meilleur[0] = caractere
905 meilleur[1] = dsep[caractere]
910 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
912 meilleur[1] = dsep[' ']
919 #if meilleur[0] != ' ' :
920 # fin = chaine[iDecoupe + 1:]
921 # retour = chaineTravail[:iDecoupe]
923 fin = chaine[iDecoupe + 1:]
924 retour = chaineTravail[:iDecoupe + 1]
925 return len(retour) > 0, retour, fin
926 # si on a rien trouvé
927 return False, chaine, ''
929 def testetoile(line) :
930 return line.startswith(u'****')
933 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split `txt` on whitespace and append a final `$` token."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
938 def prep_txtcharact(txt) :
943 Class for building a corpus
945 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
946 log.info('begin building corpus...')
947 self.lexique = lexique
948 self.expressions = expressions
950 self.corpus = Corpus(self, parametres_corpus)
953 self.lim = parametres_corpus.get('lim', 1000000)
954 self.encoding = parametres_corpus['encoding']
955 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
956 self.corpus.pathout.createdir(parametres_corpus['pathout'])
957 self.corpus.parametres['uuid'] = str(uuid4())
958 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
959 self.corpus.parametres['type'] = 'corpus'
960 if self.corpus.parametres['keep_ponct'] :
961 self.ponctuation_espace = [' ', '']
963 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
965 self.tolist = self.corpus.parametres.get('tolist', 0)
972 def prep_makeuce(self) :
973 method = self.corpus.parametres.get('ucemethod', 0)
975 self.decouper = decouperlist
976 self.prep_txt = prep_txtlist
977 self.ucesize = self.corpus.parametres.get('ucesize', 40)
979 self.decouper = decoupercharact
980 self.prep_txt = prep_txtcharact
981 self.ucesize = self.corpus.parametres.get('ucesize', 240)
982 log.info('method uce : %s' % method)
987 self.read_corpus(self.infile)
988 except Warning, args :
989 log.info('pas kool %s' % args)
993 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
994 self.time = time() - t1
996 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
997 log.info('time : %f' % (time() - t1))
1000 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1001 self.cf = self.conn_f.cursor()
1002 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1003 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1004 self.conn_f.commit()
1005 self.cf = self.conn_f.cursor()
1006 self.cf.execute('PRAGMA temp_store=MEMORY;')
1007 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1008 self.cf.execute('PRAGMA synchronous = OFF;')
1009 self.cf.execute('begin')
1010 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1011 self.c = self.conn.cursor()
1012 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1014 self.c = self.conn.cursor()
1015 self.c.execute('PRAGMA temp_store=MEMORY;')
1016 self.c.execute('PRAGMA journal_mode=MEMORY;')
1017 self.c.execute('PRAGMA synchronous = OFF;')
1018 self.c.execute('begin')
1021 #commit index and close db
1023 self.conn_f.commit()
1024 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1025 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1029 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1030 self.ccorpus = self.conn_corpus.cursor()
1031 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1032 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1033 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1034 self.conn_corpus.commit()
1035 self.ccorpus = self.conn_corpus.cursor()
1036 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1037 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1038 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1039 self.ccorpus.execute('begin')
1040 self.backup_corpus()
1041 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1042 self.conn_corpus.commit()
1043 self.conn_corpus.close()
1044 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Populate self.cleans with the text-cleaning steps enabled in the corpus parameters.

    Steps are appended in a fixed order: lower-casing, first clean,
    character filtering, expression replacement, apostrophe removal and
    hyphen removal.  Every switch is read with a default of 1 (enabled)
    except 'charact', which is read directly and therefore raises
    KeyError when it is absent from the parameters.

    When character filtering is enabled, self.rule is set to the
    'keep_caract' parameter (or the built-in default character class body).
    """
    options = self.corpus.parametres
    register = self.cleans.append
    if options.get('lower', 1) :
        register(self.dolower)
    if options.get('firstclean', 1) :
        register(self.firstclean)
    if options['charact'] :
        # The rule starts with '^': it is the body of a negated character class.
        default_rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_"
        self.rule = options.get('keep_caract', default_rule)
        register(self.docharact)
    if options.get('expressions', 1) :
        register(self.make_expression)
    if options.get('apos', 1) :
        register(self.doapos)
    if options.get('tiret', 1) :
        register(self.dotiret)
1061 def make_expression(self,txt) :
1062 for expression in self.expressions:
1063 if expression in txt :
1064 txt = txt.replace(expression, self.expressions[expression][0])
1067 def dolower(self, txt) :
def docharact(self, txt) :
    """Return txt with every run of characters matched by [self.rule] replaced by one space.

    self.rule is the body of a character class; the default rule set by
    buildcleans starts with '^', so in that case the characters NOT
    listed in the rule are the ones that get blanked out.
    """
    #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = u"[" + self.rule + "]+"
    matcher = re.compile(pattern)
    return matcher.sub(' ', txt)
def doapos(self, txt) :
    """Return txt with every straight apostrophe turned into a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt) :
    """Return txt with every hyphen ('-') turned into a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt) :
    """Normalise typographic characters and isolate punctuation with spaces.

    The substitutions are applied one after the other, in the order of
    the table below.  Order matters: the three-dot sequence is rewritten
    to its marker before single dots are spaced out.
    """
    substitutions = (
        (u'’', "'"),            # typographic apostrophe -> straight apostrophe
        (u'œ', u'oe'),          # expand the ligature
        ('...', u' £$£ '),      # three dots -> marker token
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),       # single-character ellipsis -> same marker
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
1086 def make_cleans(self, txt) :
1087 for clean in self.cleans :
def backup_uce(self) :
    """Flush the in-memory form/segment index to the formes database.

    self.corpus.idformesuces maps a form identifier to a dict of
    {segment identifier: count}.  For each form, one row goes into the
    'uces' table (space-separated segment identifiers) and one row into
    the 'eff' table (space-separated counts, in the same order).  The
    rows are written through the self.cf cursor; no commit is issued
    here.  The in-memory index is reset to an empty dict afterwards.
    """
    index = self.corpus.idformesuces
    if index :
        log.info('backup %i' % len(index))
        touce = []
        toeff = []
        # Single pass over the index; repr() is the portable spelling of
        # the backtick operator, which no longer exists in Python 3.
        for forme, counts in index.items() :
            key = repr(forme)
            touce.append((key, ' '.join([repr(uce) for uce in counts.keys()])))
            toeff.append((key, ' '.join([repr(eff) for eff in counts.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
    self.corpus.idformesuces = {}
1101 def backup_corpus(self) :
1102 log.info('start backup corpus')
1104 for uci in self.corpus.ucis :
1105 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1106 for uce in uci.uces :
1107 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1108 for forme in self.corpus.formes :
1109 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1110 log.info('%f' % (time() - t))
def dofinish(self) :
    """Store the summary statistics of the finished build in the corpus parameters.

    Written keys: 'date' (current time, ctime format), 'time' (elapsed
    build time from self.time, as hours/minutes/seconds), 'ucinb',
    'ucenb', 'occurrences', 'formesnb' and 'hapax' (hapax count with its
    share of the forms and of the occurrences).

    The two percentages fall back to 0 when the corpus has no form or
    no occurrence, instead of raising ZeroDivisionError.
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    formesnb = len(corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = corpus.gethapaxnb()
    # Guard both ratios: an empty corpus must not abort the build summary.
    if formesnb :
        pourhapaxf = (float(hapaxnb) / formesnb) * 100
    else :
        pourhapaxf = 0.0
    if params['occurrences'] :
        pourhapaxocc = (float(hapaxnb) / params['occurrences']) * 100
    else :
        pourhapaxocc = 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1127 class BuildFromAlceste(BuildCorpus) :
1128 def read_corpus(self, infile) :
1129 if self.dlg is not None :
1130 self.dlg.Pulse('textes : 0 - segments : 0')
1133 if self.corpus.parametres['ucimark'] == 0 :
1134 self.testuci = testetoile
1135 elif self.corpus.parametres['ucimark'] == 1 :
1136 self.testuci = testint
1142 with codecs.open(infile, 'r', self.encoding) as f :
1143 for linenb, line in enumerate(f) :
1144 line = line.rstrip('\n\r')
1145 if self.testuci(line) :
1148 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1150 self.corpus.ucis.append(Uci(iduci, line))
1153 if self.corpus.ucis[-1].uces == [] :
1154 log.info(u'Empty text : %i' % linenb)
1156 self.corpus.ucis.pop()
1157 self.corpus.ucis.append(Uci(iduci, line))
1158 if self.dlg is not None :
1159 if not (iduci + 1) % 10 :
1160 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1161 elif line.startswith(u'-*') :
1164 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1167 self.corpus.ucis[-1].paras.append(line.split()[0])
1169 raise Exception('paragrapheOT %i' % linenb)
1170 elif line.strip() != '' and iduci != -1 :
1172 if txt != [] and iduci != -1 :
1173 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1178 self.corpus.ucis.pop()
1179 log.info(Exception("Empty text %i" % linenb))
1181 raise Exception('EmptyText %i' % linenb)
1182 if iduci != -1 and iduce != -1:
1185 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1186 raise Exception('TextBeforeTextMark %i' % linenb)
1187 except UnicodeDecodeError :
1188 raise Exception("CorpusEncoding")
1190 def treattxt(self, txt, iduce, idpara, iduci) :
1191 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1192 txt = 'laphrasepoursplitter'.join(txt)
1193 txt = self.make_cleans(txt)
1194 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1195 ucetxt = txt.split('laphrasepoursplitter')
1198 txt = self.make_cleans(txt)
1199 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1200 if self.corpus.ucis[-1].paras == [] :
1204 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1205 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1206 if not self.tolist :
1212 self.corpus.add_word(word)
1213 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1214 if self.last > self.lim :
1217 return iduce, idpara
1219 def make_uces(self, txt, douce = True, keep_ponct = False) :
1220 txt = ' '.join(txt.split())
1223 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1225 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1228 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1229 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1234 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1236 #decouper (list_sep)
1237 #make_uces (decouper)
1238 #treat_txt (make_uces)
1242 def __init__(self, parent, dlg = None) :
1243 self.parent = parent
1245 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1246 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1247 dial = CorpusPref(parent, parametres)
1248 dial.CenterOnParent()
1249 dial.txtpath.SetLabel(parent.filename)
1250 #dial.repout_choices.SetValue(parametres['pathout'])
1251 self.res = dial.ShowModal()
1252 if self.res == 5100 :
1253 parametres = dial.doparametres()
1254 parametres['originalpath'] = parent.filename
1255 PathOut().createdir(parametres['pathout'])
1256 ReadLexique(self.parent, lang = parametres['lang'])
1257 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1258 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1260 self.parent.expressions = {}
1261 self.parametres = parametres
1263 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus with BuildFromAlceste and return it.

    The builder receives the parent's file name, the parameters stored
    on this object, the parent's lexique and expressions, and the
    dialog held in self.dlg; its 'corpus' attribute is the result.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
1271 if __name__ == '__main__' :
1273 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1274 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)