1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids stored for a form in the formes database.

    `wordid` is either the numeric identifier of the form or the form
    itself as a string; a string is resolved to its identifier through
    ``self.formes``.  Each matching row of the ``uces`` table holds
    either a single integer or a whitespace-separated list of integers;
    all of them are returned in one flat list.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return found
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids stored for every form of `lem`.

    The form identifiers are the keys of ``self.lems[lem].formes``; the
    matching rows of the ``uces`` table are flattened and de-duplicated.
    The order of the returned list is that of a set, i.e. unspecified.
    """
    idlist = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist)
    collected = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            collected.append(cell)
        else:
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces returned by getlemuces(lem)."""
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many uces of ``self.lc[cluster]`` are also in getlemuces(lem)."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry ``self.lems[lem]``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` for `formeid`.

    The ``idformes`` index is built on first use by calling
    ``self.make_idformes()``.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` of every entry of ``self.formes``."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    nbuces = self.getucenb()
    return occurrences / nbuces
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The sizes come from getucesize() and are sliced from the ident of
    the uci's first uce up to and including the ident of its last one.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of each row of getalluces().

    The text is read from the second column (index 1) of every row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run a query selecting the rows of the ``uces`` table whose id is in `uces`.

    The ids are written into the SQL text through ``repr``; the result
    of ``self.cuces.execute`` is returned as is.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uce ids given by getworduces(word)."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uce ids given by getlemuces(lem)."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run ``SELECT * FROM uces`` on ``self.cuces`` and return the result of execute()."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose ``etoiles`` contain `etoile`."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
213 def getetoileuces(self) :
214 log.info('get uces etoiles')
217 for uci in self.ucis :
218 etoiles = uci.etoiles[1:]
220 if et in etoileuces :
221 etoileuces[et] += [uce.ident for uce in uci.uces]
223 etoileuces[et] = [uce.ident for uce in uci.uces]
225 for et in uci.paras :
226 if et in etoileuces :
227 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
229 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` for `uceid`.

    The ``iduces`` index is built on first use by calling
    ``self.make_iduces()``.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of ``self.formes`` whose ``freq`` equals 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries of ``self.lems`` whose ``act`` equals `key`."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
244 # def make_lems(self, lem = True) :
245 # log.info('make lems')
247 # for forme in self.formes :
248 # if self.formes[forme].lem in self.lems :
249 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
250 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
252 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce `uceid`.

    The uce-ident to uci-ident mapping ``self.uceuci`` is built on first
    use; the uci ident is then used as an index into ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    return self.ucis[self.uceuci[uceid]].etoiles
258 def make_lems(self, lem = True) :
259 log.info('make lems')
262 for forme in self.formes :
263 if self.formes[forme].lem in self.lems :
264 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
265 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
267 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
269 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to its entry in ``self.formes``."""
    index = {}
    for forme in self.formes:
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces``, mapping each uce ``ident`` to its uce, unless already built."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
278 def make_lexitable(self, mineff, etoiles) :
279 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
280 etuces = [[] for et in etoiles]
281 for uci in self.ucis :
282 get = list(set(uci.etoiles).intersection(etoiles))
284 log.info('2 variables sur une ligne')
286 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
287 etuces = [set(val) for val in etuces]
290 deff = self.getlemuceseff(lem)
292 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
293 tab.insert(0, [''] + etoiles)
296 def make_efftype_from_etoiles(self, etoiles) :
298 etuces = [[] for et in etoiles]
299 for uci in self.ucis :
300 get = list(set(uci.etoiles).intersection(etoiles))
302 return '2 variables sur la meme ligne'
304 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
305 etuces = [set(val) for val in etuces]
306 for lem in self.lems :
307 deff = self.getlemuceseff(lem)
309 gram = self.lems[lem].gram
311 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
313 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
314 tabout = [[gram] + dtype[gram] for gram in dtype]
315 tabout.insert(0, [''] + etoiles)
318 def make_uceactsize(self, actives) :
319 res = self.getalluces()
322 deff = self.getlemuceseff(lem)
324 ucesize[uce] = ucesize.get(uce, 0) + 1
327 def make_uc(self, actives, lim1, lim2) :
328 uceactsize = self.make_uceactsize(actives)
334 for uce in [uce for uci in self.ucis for uce in uci.uces] :
335 if uce.para == lastpara :
337 last1 += uceactsize.get(uce.ident,0)
338 uc1[-1].append(uce.ident)
340 uc1.append([uce.ident])
343 last2 += uceactsize.get(uce.ident, 0)
344 uc2[-1].append(uce.ident)
346 uc2.append([uce.ident])
349 last1 = uceactsize.get(uce.ident, 0)
350 last2 = uceactsize.get(uce.ident, 0)
352 uc1.append([uce.ident])
353 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings, write their matrices and their uce/uc listings.

    make_uc() provides the two groupings; each is written through
    write_ucmatrix() to `uc1out` / `uc2out`.  A ``uce;uc`` listing (one
    line per uce, paired with the index of its group) is then written to
    `listuce1out` / `listuce2out`.

    Returns the tuple (number of groups in uc1, number of groups in uc2).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    # Both matrices are written before either listing, as in the original order.
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    listings = []
    for groups in (uc1, uc2):
        lines = ['uce;uc']
        for num, members in enumerate(groups):
            for uce in members:
                lines.append(repr(uce) + ';' + repr(num))
        listings.append('\n'.join(lines))
    for path, content in zip((listuce1out, listuce2out), listings):
        with open(path, 'w') as f:
            f.write(content)
    return len(uc1), len(uc2)
369 def write_ucmatrix(self, uc, actives, fileout) :
370 log.info('write uc matrix %s' % fileout)
371 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
374 with open(fileout + '~', 'w+') as f :
375 for i, lem in enumerate(actives) :
376 for uce in self.getlemuces(lem):
377 if (uces_uc[uce], i) not in deja_la :
379 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
380 deja_la[(uces_uc[uce], i)] = 0
382 with open(fileout, 'w') as ffin :
383 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
386 os.remove(fileout + '~')
389 def export_corpus(self, outf) :
390 #outf = 'export_corpus.txt'
392 res = self.getalluces()
396 with open(outf,'w') as f :
398 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
399 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
400 elif self.iduces[uce[0]].uci != actuci :
401 actuci = self.iduces[uce[0]].uci
402 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
403 actpara = self.iduces[uce[0]].para
404 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
407 actpara = self.iduces[uce[0]].para
408 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
409 elif self.iduces[uce[0]].para != actpara :
410 actpara = self.iduces[uce[0]].para
412 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
414 def export_corpus_classes(self, outf, alc = True, lem = False) :
416 for i, lc in enumerate(self.lc) :
419 for uce in self.lc0 :
421 res = self.getalluces()
423 with open(outf, 'w') as f :
426 actuci = self.iduces[uce[0]].uci
428 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
430 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
432 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
433 f.write(etline.encode(self.parametres['syscoding']) + '\n')
434 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
436 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
437 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
439 with open(outfile + '~', 'w+') as f :
440 for i, lem in enumerate(actives) :
441 for uce in sorted(self.getlemuces(lem)) :
443 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
445 with open(outfile, 'w') as ffin :
446 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
449 os.remove(outfile + '~')
451 with open(listuce, 'w') as f :
452 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
454 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
455 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
457 with open(outfile + '~', 'w+') as f :
458 for i, lem in enumerate(actives) :
459 for uci in sorted(self.getlemucis(lem)) :
461 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
463 with open(outfile, 'w') as ffin :
464 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
467 os.remove(outfile + '~')
469 with open(listuci, 'w') as f :
470 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
472 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
473 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
475 duces = dict([[uce, i] for i, uce in enumerate(uces)])
476 with open(outfile + '~', 'w+') as f :
477 for i, lem in enumerate(actives) :
478 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
480 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
482 with open(outfile, 'w') as ffin :
483 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
486 os.remove(outfile + '~')
488 def make_table_with_classe(self, uces, list_act) :
489 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
490 uces = dict([[uce, i] for i, uce in enumerate(uces)])
491 for i, lem in enumerate(list_act) :
492 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
494 table_uce[uces[uce]][i] = 1
495 table_uce.insert(0, list_act)
498 def parse_active(self, gramact, gramsup = None) :
499 log.info('parse actives')
500 for lem in self.lems :
501 if lem.startswith('_') and lem.endswith('_') :
502 self.lems[lem].act = 2
503 elif self.lems[lem].gram in gramact :
504 self.lems[lem].act = 1
505 elif gramsup is not None :
506 if self.lems[lem].gram in gramsup :
507 self.lems[lem].act = 2
509 self.lems[lem].act = 0
511 self.lems[lem].act = 2
513 def make_actives_limit(self, limit, key = 1) :
514 if self.idformes is None :
516 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
518 def make_actives_nb(self, nbmax, key) :
519 log.info('make_actives_nb : %i - %i' % (nbmax,key))
520 if self.idformes is None :
522 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
523 self.activenb = len(allactives)
524 allactives = sorted(allactives, reverse = True)
525 if len(allactives) <= nbmax :
526 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
527 return [val[1] for val in allactives], allactives[-1][0]
529 effs = [val[0] for val in allactives]
530 if effs.count(effs[nbmax - 1]) > 1 :
531 lim = effs[nbmax - 1] + 1
535 stop = effs.index(lim)
542 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
543 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma/class contingency table to `fileout`.

    For each lemma of `actives`, counts how many of its uces (as given
    by getlemuces) fall in each class of `ucecl`.  Lemmas whose counts
    sum to less than 3 are dropped.  The remaining lines are written as
    ``lem;count;count;...`` joined by newlines and encoded with
    ``self.parametres['syscoding']``.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # getlemuces() queries the database: fetch once per lemma
        # instead of once per (lemma, class) pair.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
552 def make_etoiles(self) :
554 for uci in self.ucis :
555 etoiles.update(uci.etoiles[1:])
558 def make_etoiles_dict(self) :
559 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
561 for etoile in etoiles :
562 et = etoile.split('_')
565 endet = '_'.join(et[1:])
566 if etoile in det[et[0]] :
567 det[et[0]][etoile] += 1
569 det[et[0]][etoile] = 1
574 endet = '_'.join(et[1:])
575 det[et[0]] = {etoile :1}
580 def make_etline(self, listet) :
581 etuces = [[] for et in listet]
582 for uci in self.ucis :
583 get = list(set(uci.etoiles).intersection(listet))
585 return '2 variables sur la meme ligne'
587 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile/class contingency table to `fileout`.

    Uses the mapping returned by getetoileuces(), keeping only the
    etoiles attached to more than one uce.  Each line is
    ``etoile;count;count;...`` where a count is the number of that
    etoile's uces present in the corresponding class of `ucecl`.  The
    text is encoded with ``self.parametres['syscoding']``.
    """
    log.info('etoiles/classes')
    alluces = self.getetoileuces()
    kept = dict([[et, alluces[et]] for et in alluces if len(alluces[et]) > 1])
    lines = []
    for et in kept:
        cells = [et]
        for classe in ucecl:
            cells.append(repr(len(set(kept[et]).intersection(classe))))
        lines.append(';'.join(cells))
    content = '\n'.join(lines).encode(self.parametres['syscoding'])
    with open(fileout, 'w') as f:
        f.write(content)
596 #etoiles = self.make_etoiles()
597 #with open(fileout, 'w') as f :
598 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
600 def make_colored_corpus(self) :
602 for i, lc in enumerate(self.lc) :
605 for uce in self.lc0 :
607 color = ['black'] + colors[len(self.lc) - 1]
609 <meta http-equiv="content-Type" content="text/html; charset=%s" />
611 ''' % sys.getdefaultencoding()
612 res = self.getalluces()
617 if self.iduces[uce[0]].uci != actuci :
618 actuci = self.iduces[uce[0]].uci
619 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
620 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
622 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
623 return txt + '\n</body></html>'
625 def count_from_list(self, l, d) :
633 def count_from_list_cl(self, l, d, a, clnb) :
642 def find_segments(self, taille_segment, taille_limite) :
644 for uce in self.getalluces() :
646 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
647 l = [[d[val], val] for val in d if d[val] >= 3]
650 if len(l) > taille_limite :
651 l = l[-taille_limite:]
654 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
656 for uce in self.getconcorde(list_uce) :
658 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
659 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
662 if len(l) > taille_limite :
663 l = l[-taille_limite:]
666 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
668 for b, classe in enumerate(self.lc) :
669 for uce in self.getconcorde(classe) :
672 uce = [self.formes[forme].lem for forme in uce]
673 for taille_segment in range(lenmin,lenmax) :
674 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
675 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
676 with open(fileout, 'w') as f :
677 f.write('\n'.join([';'.join(line) for line in result]))
679 def make_proftype(self, outf) :
681 for lem in self.lems :
682 gram = self.lems[lem].gram
684 res[gram] = [0 for val in self.lc]
685 lemuceeff = self.getlemuceseff(lem)
686 for i, classe in enumerate(self.lc) :
687 concern = set(classe).intersection(lemuceeff.keys())
688 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
689 res = [[gram] + [`val` for val in res[gram]] for gram in res]
691 with open(outf, 'w') as f :
692 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
695 def make_ucecl_from_R(self, filein) :
696 with open(filein, 'rU') as f :
701 line = line.replace('\n', '').replace('"', '').split(';')
702 self.lc.append([int(line[0]) - 1, int(line[1])])
703 classesl = [val[1] for val in self.lc]
705 self.lc = sorted(self.lc, key=itemgetter(1))
706 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
707 self.lc0 = self.lc.pop(0)
710 def get_stat_by_cluster(self, outf) :
711 log.info('get_stat_by_cluster')
713 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
714 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
715 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
716 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
717 sets = [set(cl) for cl in self.lc]
718 for forme in self.formes :
719 formeuceeff = self.getformeuceseff(forme)
720 for i, classe in enumerate(self.lc) :
721 concern = sets[i].intersection(formeuceeff.keys())
723 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
725 if self.formes[forme].freq == 1 :
727 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
728 with open(outf, 'w') as f :
730 log.info('%f' % (time() - t1))
732 def gethapaxbyet(self, etoiles) :
733 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
735 for uce in hapaxuces :
736 if uce in hucesdict :
740 etuces = [[] for et in etoiles]
741 for uci in self.ucis :
742 get = list(set(uci.etoiles).intersection(etoiles))
744 return '2 variables sur la meme ligne'
746 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
747 etuces = [set(val) for val in etuces]
748 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
750 def gethapaxuces(self) :
751 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
752 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
754 for i,uce in enumerate(hapaxuces) :
755 if uce in hucesdict :
756 hucesdict[uce][0] += 1
757 hucesdict[uce][1].append(hapax[i])
759 hucesdict[uce] = [1,[hapax[i]]]
761 for uce in hucesdict :
762 if hucesdict[uce][0] in huces :
763 huces[hucesdict[uce][0]].append(uce)
765 huces[hucesdict[uce][0]] = [uce]
766 huces = zip(huces, huces.values())
767 huces.sort(reverse=True)
771 for nb in huces[0:4] :
772 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
774 res = self.getconcorde([uce])
776 ucetxt = ' ' + row[1] + ' '
778 for hap in hucesdict[uce][1] :
779 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
780 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
781 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
782 txt += '<p>'+ucetxt+'</p>\n'
786 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms to `fileout`, one tab-separated line per form.

    Each line holds the form, its lemma, its grammatical type and its
    frequency.  Lines are sorted in reverse order on
    [freq, forme, lem, gram] and the text is encoded with `syscoding`.
    """
    entries = []
    for forme in self.formes:
        word = self.formes[forme]
        entries.append([word.freq, forme, word.lem, word.gram])
    entries.sort(reverse = True)
    lines = []
    for entry in entries:
        lines.append('\t'.join(entry[1:] + [repr(entry[0])]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(syscoding))
796 def export_lems(self, fileout, syscoding) :
798 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
800 with open(fileout, 'w') as f :
801 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
807 def __init__(self, corpus) :
808 ucinb = corpus.getucinb()
809 ucisize = corpus.getucisize()
810 ucimean = float(sum(ucisize))/float(ucinb)
811 detoile = corpus.make_etoiles_dict()
815 def __init__(self, iduci, line, paraset = None) :
817 self.etoiles = line.split()
819 if paraset is not None :
820 self.paras = paraset.split()
825 def __init__(self, iduce, idpara, iduci) :
831 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
837 if freq is not None :
843 def __init__(self, parent, forme) :
844 self.formes = {forme.ident : forme.freq}
845 self.gram = forme.gram
846 self.freq = forme.freq
def add_forme(self, forme):
    """Record `forme` under its ident and add its frequency to ``self.freq``."""
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
853 def decouperlist(chaine, longueur, longueurOptimale) :
855 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
856 Si on trouve un '$', c'est fini.
857 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
859 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
860 dsep = dict([[val[0],val[1]] for val in separateurs])
861 trouve = False # si on a trouvé un bon séparateur
862 iDecoupe = 0 # indice du caractere ou il faut decouper
864 longueur = min(longueur, len(chaine) - 1)
865 chaineTravail = chaine[:longueur + 1]
867 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
870 indice = chaineTravail.index(u'$')
872 iDecoupe = indice - 1
877 caractere = chaineTravail[nbCar]
878 distance = abs(longueurOptimale - nbCar) + 1
879 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
880 if caractere in dsep :
881 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
882 meilleur[0] = caractere
883 meilleur[1] = dsep[caractere]
888 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
890 meilleur[1] = dsep[' ']
897 #if meilleur[0] != ' ' :
898 # fin = chaine[iDecoupe + 1:]
899 # retour = chaineTravail[:iDecoupe]
901 fin = chaine[iDecoupe + 1:]
902 retour = chaineTravail[:iDecoupe + 1]
903 return len(retour) > 0, retour, fin
904 # si on a rien trouvé
905 return False, chaine, ''
907 def testetoile(line) :
908 return line.startswith(u'****')
911 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split `txt` on whitespace and append a ``u'$'`` item to the resulting list."""
    words = txt.split()
    words.append(u'$')
    return words
916 def prep_txtcharact(txt) :
921 Class for building a corpus
923 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
924 log.info('begin building corpus...')
925 self.lexique = lexique
926 self.expressions = expressions
928 self.corpus = Corpus(self, parametres_corpus)
931 self.lim = parametres_corpus.get('lim', 1000000)
932 self.encoding = parametres_corpus['encoding']
933 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
934 self.corpus.pathout.createdir(parametres_corpus['pathout'])
935 self.corpus.parametres['uuid'] = str(uuid4())
936 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
937 self.corpus.parametres['type'] = 'corpus'
938 if self.corpus.parametres['keep_ponct'] :
939 self.ponctuation_espace = [' ', '']
941 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
943 self.tolist = self.corpus.parametres.get('tolist', 0)
950 def prep_makeuce(self) :
951 method = self.corpus.parametres.get('ucemethod', 0)
953 self.decouper = decouperlist
954 self.prep_txt = prep_txtlist
955 self.ucesize = self.corpus.parametres.get('ucesize', 40)
957 self.decouper = decoupercharact
958 self.prep_txt = prep_txtcharact
959 self.ucesize = self.corpus.parametres.get('ucesize', 240)
960 log.info('method uce : %s' % method)
965 self.read_corpus(self.infile)
966 except Warning, args :
967 log.info('pas kool %s' % args)
971 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
972 self.time = time() - t1
974 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
975 log.info('time : %f' % (time() - t1))
978 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
979 self.cf = self.conn_f.cursor()
980 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
981 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
983 self.cf = self.conn_f.cursor()
984 self.cf.execute('PRAGMA temp_store=MEMORY;')
985 self.cf.execute('PRAGMA journal_mode=MEMORY;')
986 self.cf.execute('PRAGMA synchronous = OFF;')
987 self.cf.execute('begin')
988 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
989 self.c = self.conn.cursor()
990 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
992 self.c = self.conn.cursor()
993 self.c.execute('PRAGMA temp_store=MEMORY;')
994 self.c.execute('PRAGMA journal_mode=MEMORY;')
995 self.c.execute('PRAGMA synchronous = OFF;')
996 self.c.execute('begin')
999 #commit index and close db
1001 self.conn_f.commit()
1002 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1003 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1007 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1008 self.ccorpus = self.conn_corpus.cursor()
1009 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1010 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1011 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1012 self.conn_corpus.commit()
1013 self.ccorpus = self.conn_corpus.cursor()
1014 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1015 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1016 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1017 self.ccorpus.execute('begin')
1018 self.backup_corpus()
1019 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1020 self.conn_corpus.commit()
1021 self.conn_corpus.close()
1022 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the cleaning callables enabled in the corpus parameters.

    Options ``lower``, ``firstclean``, ``expressions``, ``apos`` and
    ``tiret`` default to enabled when absent; ``charact`` must be
    present in the parameters.  When ``charact`` is enabled,
    ``self.rule`` is set from ``keep_caract`` (or a built-in default)
    before ``docharact`` is registered.  The registration order is
    fixed: lower, firstclean, charact, expressions, apos, tiret.
    """
    params = self.corpus.parametres
    if params.get('lower', 1):
        self.cleans.append(self.dolower)
    if params.get('firstclean', 1):
        self.cleans.append(self.firstclean)
    if params['charact']:
        default_rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_"
        self.rule = params.get('keep_caract', default_rule)
        self.cleans.append(self.docharact)
    for option, cleaner_name in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')):
        if params.get(option, 1):
            self.cleans.append(getattr(self, cleaner_name))
def make_expression(self, txt):
    """Replace every known multi-word expression found in *txt*.

    ``self.expressions`` maps an expression to a sequence whose first item
    is the replacement text.  Like the other cleaners, the (possibly
    modified) text is returned so the cleaners can be chained.
    """
    for expression in self.expressions:
        if expression in txt:
            txt = txt.replace(expression, self.expressions[expression][0])
    # Returning the text is required: the result of each cleaner feeds the next.
    return txt
def dolower(self, txt):
    """Return *txt* converted to lower case."""
    return txt.lower()
def docharact(self, txt):
    """Replace each run of characters matched by ``[<self.rule>]+`` with one space.

    ``self.rule`` is the body of a regular-expression character class; it is
    assigned in ``buildcleans`` when character filtering is enabled.
    """
    char_class = u"".join([u"[", self.rule, u"]+"])
    return re.compile(char_class).sub(' ', txt)
def doapos(self, txt):
    """Return *txt* with every ASCII apostrophe replaced by a space."""
    apostrophe = u"'"
    return txt.replace(apostrophe, u" ")
def dotiret(self, txt):
    """Return *txt* with every hyphen-minus character replaced by a space."""
    hyphen = u"-"
    return txt.replace(hyphen, u" ")
def firstclean(self, txt):
    """Normalise typography and isolate punctuation marks with spaces.

    The typographic apostrophe becomes an ASCII apostrophe and the ligature
    'œ' becomes 'oe'.  Ellipses (three dots or the single '…' character) are
    replaced by the token ' £$£ ', and each of ``? . ! , ; :`` is surrounded
    by spaces.

    All literals are unicode: passing a non-ASCII byte string to a unicode
    method triggers an implicit ASCII decode under Python 2.
    """
    txt = txt.replace(u'’', u"'")
    txt = txt.replace(u'œ', u'oe')
    # Order matters: the three-dot ellipsis must be handled before the
    # single full stop, otherwise it would be split into three ' . '.
    substitutions = (
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
def make_cleans(self, txt):
    """Run *txt* through every cleaner in ``self.cleans`` and return the result.

    The cleaners are applied in list order; each one receives the output of
    the previous one.
    """
    for clean in self.cleans:
        txt = clean(txt)
    return txt
def backup_uce(self):
    """Flush the in-memory form -> segment counts to the forms database.

    For each form identifier in ``self.corpus.idformesuces`` one row goes
    into table ``uces`` (space-separated segment identifiers) and one into
    table ``eff`` (the matching space-separated counts, same order).  The
    in-memory mapping is then reset.  Nothing happens when it is empty.
    """
    pending = self.corpus.idformesuces
    if not pending:
        return
    log.info('backup %i' % len(pending))
    touce = []
    toeff = []
    for forme, counts in pending.items():
        # repr() replaces the backtick syntax, which Python 3 removed.
        # keys() and values() of an unmodified dict iterate in the same
        # order, so both rows stay aligned.
        touce.append((repr(forme), ' '.join([repr(val) for val in counts.keys()])))
        toeff.append((repr(forme), ' '.join([repr(val) for val in counts.values()])))
    self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
    self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
    self.corpus.idformesuces = {}
def backup_corpus(self):
    """Write texts, segments and forms to the corpus database.

    For each text in ``self.corpus.ucis`` one row goes into ``etoiles``
    (identifier, space-joined ``etoiles``, space-joined ``paras``) and one
    row per segment into ``luces`` (text, paragraph and segment
    identifiers).  Every entry of ``self.corpus.formes`` is then written to
    ``formes``.  The elapsed time is logged at the end.
    """
    log.info('start backup corpus')
    # Start of the timed section; read by the final log call.
    t = time()
    cursor = self.ccorpus
    for uci in self.corpus.ucis:
        cursor.execute('INSERT INTO etoiles VALUES (?,?,?);',
                       (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras)))
        # repr() replaces the backtick syntax, which Python 3 removed.
        cursor.executemany('INSERT INTO luces VALUES (?,?,?);',
                           [(repr(uci.ident), repr(uce.para), repr(uce.ident))
                            for uce in uci.uces])
    cursor.executemany('INSERT INTO formes VALUES (?,?,?,?,?);',
                       [(repr(word.ident), forme, word.lem, word.gram, repr(word.freq))
                        for forme, word in self.corpus.formes.items()])
    log.info('%f' % (time() - t))
def dofinish(self):
    """Store the final corpus statistics in ``self.corpus.parametres``.

    Keys written: ``date`` (current date/time), ``time`` (``self.time``
    taken as a number of seconds and formatted as hours, minutes and
    seconds), ``ucinb``, ``ucenb``, ``occurrences``, ``formesnb`` and
    ``hapax`` (hapax count with its share of forms and of occurrences).

    The percentages are reported as 0 when the corpus has no form or no
    occurrence, instead of raising ``ZeroDivisionError``.
    """
    def percent(part, whole):
        # Guard against an empty corpus (whole == 0).
        if not whole:
            return 0.0
        return (float(part) / whole) * 100

    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    params['formesnb'] = len(self.corpus.formes)
    hapaxnb = self.corpus.gethapaxnb()
    pourhapaxf = percent(hapaxnb, len(self.corpus.formes))
    pourhapaxocc = percent(hapaxnb, params['occurrences'])
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Corpus builder that reads a marked-up text file.
# NOTE(review): several statements of read_corpus (initialisation of txt,
# iduci, iduce and idpara, the opening `try`, some `else` branches) are not
# visible in this excerpt; the comments below describe only the lines shown.
1105 class BuildFromAlceste(BuildCorpus) :
# Read `infile` line by line and dispatch each line on its kind: text marker
# (self.testuci), paragraph marker (line starting with '-*') or any other
# non-blank line.
1106 def read_corpus(self, infile) :
# Initialise the progress dialog, when one was supplied.
1107 if self.dlg is not None :
1108 self.dlg.Pulse('textes : 0 - segments : 0')
# Choose the predicate that recognises a text-marker line from 'ucimark'.
1111 if self.corpus.parametres['ucimark'] == 0 :
1112 self.testuci = testetoile
1113 elif self.corpus.parametres['ucimark'] == 1 :
1114 self.testuci = testint
# The file is decoded with self.encoding while it is read.
1120 with codecs.open(infile, 'r', self.encoding) as f :
1121 for linenb, line in enumerate(f) :
# Drop the trailing end-of-line characters only.
1122 line = line.rstrip('\n\r')
# Text marker: the pending text is handed to treattxt (with iduci - 1) and a
# new Uci is appended for this marker line.
1123 if self.testuci(line) :
1126 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1128 self.corpus.ucis.append(Uci(iduci, line))
# A previous text without any segment is logged, removed and replaced.
1131 if self.corpus.ucis[-1].uces == [] :
1132 log.info(u'Empty text : %i' % linenb)
1134 self.corpus.ucis.pop()
1135 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog when (iduci + 1) is a multiple of 10.
1136 if self.dlg is not None :
1137 if not (iduci + 1) % 10 :
1138 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph marker: pending text is processed and the first token of the
# line is recorded in the current text's `paras`.
1139 elif line.startswith(u'-*') :
1142 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1145 self.corpus.ucis[-1].paras.append(line.split()[0])
1147 raise Exception('paragrapheOT %i' % linenb)
# Any other non-blank line is considered only once iduci differs from -1.
1148 elif line.strip() != '' and iduci != -1 :
# Remaining buffered text is processed here.
1150 if txt != [] and iduci != -1 :
1151 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1156 self.corpus.ucis.pop()
1157 log.info(Exception("Empty text %i" % linenb))
1159 raise Exception('EmptyText %i' % linenb)
1160 if iduci != -1 and iduce != -1:
1163 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1164 raise Exception('TextBeforeTextMark %i' % linenb)
# A decoding failure is re-raised as a generic Exception("CorpusEncoding").
1165 except UnicodeDecodeError :
1166 raise Exception("CorpusEncoding")
# Clean `txt`, cut it into segments, register each segment and return the
# updated (iduce, idpara) pair.
# NOTE(review): the loop over the segments, the `else` branch and the counter
# updates are not visible in this excerpt.
1168 def treattxt(self, txt, iduce, idpara, iduci) :
# With 'ucemethod' == 2 and a truthy 'douce', the items of txt are joined
# with a sentinel string, cleaned, stripped of the tokens listed in
# self.ponctuation_espace, then split back on the sentinel.
1169 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1170 txt = 'laphrasepoursplitter'.join(txt)
1171 txt = self.make_cleans(txt)
1172 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1173 ucetxt = txt.split('laphrasepoursplitter')
# Other path shown here: clean the text, then let make_uces cut it.
1176 txt = self.make_cleans(txt)
1177 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1178 if self.corpus.ucis[-1].paras == [] :
# Each segment is attached to the last text and its text is stored in the
# `uces` table under the repr of its identifier.
1182 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1183 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1184 if not self.tolist :
# Every word is counted through the corpus.
1190 self.corpus.add_word(word)
1191 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): the action taken when self.last exceeds self.lim is not
# visible here.
1192 if self.last > self.lim :
1195 return iduce, idpara
# Cut `txt` into segments and return them as a list of strings.
# NOTE(review): the branching on `douce`, the loop around self.decouper and
# the use of `keep_ponct` are not visible in this excerpt.
1197 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1198 txt = ' '.join(txt.split())
# self.decouper is called with self.ucesize + 15 and self.ucesize; it returns
# a (reste, texte_uce, suite) triple.
1201 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Tokens listed in self.ponctuation_espace are dropped from the segment text.
1203 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# The next cut starts from `suite`, the third item of the previous result.
1206 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1207 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-segment result: the whole text minus the ponctuation_espace tokens.
1212 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1214 #decouper (list_sep)
1215 #make_uces (decouper)
1216 #treat_txt (make_uces)
# Load the default corpus options, show the CorpusPref dialog and, when the
# dialog is validated, prepare the output directory, lexicon and expressions.
# NOTE(review): the enclosing class header, the assignment of self.dlg and
# some `else` lines are not visible in this excerpt.
1220 def __init__(self, parent, dlg = None) :
1221 self.parent = parent
# Default options come from the 'corpus' section of the user's corpus.cfg.
1223 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1224 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1225 dial = CorpusPref(parent, parametres)
1226 dial.CenterOnParent()
1227 dial.txtpath.SetLabel(parent.filename)
1228 #dial.repout_choices.SetValue(parametres['pathout'])
1229 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the dialog's "OK" return code; confirm.
1230 if self.res == 5100 :
# The options chosen in the dialog replace the defaults.
1231 parametres = dial.doparametres()
1232 parametres['originalpath'] = parent.filename
1233 PathOut().createdir(parametres['pathout'])
1234 ReadLexique(self.parent, lang = parametres['lang'])
# The expression dictionary is loaded only when the language is not 'other'
# and the path looked up under '<lang>_exp' exists.
# NOTE(review): the .get() fallback 'french_exp' looks like a key name, not
# a path; verify that this default is intended.
1235 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1236 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1238 self.parent.expressions = {}
1239 self.parametres = parametres
1241 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    A ``BuildFromAlceste`` instance is created from the parent's file name,
    the stored parameters, and the parent's lexicon and expressions; its
    ``corpus`` attribute is returned.
    """
    owner = self.parent
    builder = BuildFromAlceste(
        owner.filename,
        self.parametres,
        owner.lexique,
        owner.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
# Script entry point: build a corpus directly from a file.
# NOTE(review): `infile` and `encoding` are not assigned in the lines visible
# here; confirm they are defined before use.
1249 if __name__ == '__main__' :
# Database file names and encodings handed to the builder.
1251 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1252 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)