1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the list of uce ids stored for one form in the ``uces`` table.

    wordid -- the numeric ident of the form, or the form itself as a string,
              in which case its ident is looked up in self.formes.

    Each matching row holds either a single integer or a whitespace
    separated string of integers; both shapes are flattened into one list.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            found.append(cell)
        else :
            found.extend([int(val) for val in cell.split()])
    return found
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem) :
    """Return the distinct uce ids recorded for every form of lemma `lem`.

    The form ids are the keys of self.lems[lem].formes; their rows are read
    from the ``uces`` table through self.cformes. Duplicates are removed by
    passing through a set, so the order of the result is not meaningful.
    """
    idlist = ', '.join([repr(formeid) for formeid in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist)
    collected = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            collected.append(cell)
        else :
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem) :
    """Return the distinct ``uci`` values of the uces returned by getlemuces(lem).

    Each uce id is resolved with self.getucefromid and its ``uci`` attribute
    is collected; the result is an unordered list without duplicates.
    """
    ucis = set()
    for uceid in self.getlemuces(lem) :
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster) :
    """Return how many uces of self.lc[cluster] are also returned by getlemuces(lem)."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem) :
    """Return the ``freq`` attribute of the entry stored for `lem` in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the entry of self.idformes registered under `formeid`.

    self.idformes is built on first use by calling self.make_idformes().
    """
    if self.idformes is None :
        # lazy construction of the ident -> form index
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self) :
    """Return the sum of the ``freq`` attribute over every entry of self.formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return, for each uci of self.ucis, the summed sizes of its uces.

    The per-uce sizes come from getucesize(); for one uci the slice running
    from the ident of its first uce to the ident of its last uce (inclusive)
    is summed, so uce idents are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:(last + 1)]))
    return totals
def getucesize(self) :
    """Return the number of whitespace separated tokens of each row of getalluces().

    The text is taken from the second column (index 1) of every row.
    """
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces) :
    """Run a select on the ``uces`` table for the given ids and return the cursor.

    uces -- iterable of ids; their repr() is joined into the IN (...) clause.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word) :
    """Return getconcorde() applied to the uces given by getworduces(word)."""
    worduces = self.getworduces(word)
    return self.getconcorde(worduces)
def getlemconcorde(self, lem) :
    """Return getconcorde() applied to the uces given by getlemuces(lem)."""
    lemuces = self.getlemuces(lem)
    return self.getconcorde(lemuces)
def getalluces(self) :
    """Select every row of the ``uces`` table through self.cuces and return the cursor."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile) :
    """Return the idents of all uces belonging to the ucis whose ``etoiles`` contain `etoile`."""
    idents = []
    for uci in self.ucis :
        if etoile not in uci.etoiles :
            continue
        for uce in uci.uces :
            idents.append(uce.ident)
    return idents
213 def getetoileuces(self) :
214 log.info('get uces etoiles')
216 for uci in self.ucis :
217 etoiles = uci.etoiles[1:] + uci.paras
219 if et in etoileuces :
220 etoileuces[et] += [uce.ident for uce in uci.uces]
222 etoileuces[et] = [uce.ident for uce in uci.uces]
def getucefromid(self, uceid) :
    """Return the entry of self.iduces registered under `uceid`.

    self.iduces is built on first use by calling self.make_iduces().
    """
    if self.iduces is None :
        # lazy construction of the ident -> uce index
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self) :
    """Return the number of entries of self.formes whose ``freq`` equals 1."""
    count = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            count += 1
    return count
def getactivesnb(self, key) :
    """Return the number of entries of self.lems whose ``act`` attribute equals `key`."""
    count = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            count += 1
    return count
234 # def make_lems(self, lem = True) :
235 # log.info('make lems')
237 # for forme in self.formes :
238 # if self.formes[forme].lem in self.lems :
239 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
240 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
242 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the ``etoiles`` of the uci that contains the uce `uceid`.

    A uce ident -> uci ident mapping is cached in self.uceuci on first call.
    The uci ident is then used as a position in self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
248 def make_lems(self, lem = True) :
249 log.info('make lems')
252 for forme in self.formes :
253 if self.formes[forme].lem in self.lems :
254 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
255 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
257 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
259 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes, mapping the ``ident`` of every entry of self.formes to that entry."""
    index = {}
    for forme in self.formes :
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self) :
    """Build self.iduces (uce ident -> uce) from self.ucis, unless it already exists."""
    if self.iduces is not None :
        return
    index = {}
    for uci in self.ucis :
        for uce in uci.uces :
            index[uce.ident] = uce
    self.iduces = index
268 def make_lexitable(self, mineff, etoiles) :
269 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
270 etuces = [[] for et in etoiles]
271 for uci in self.ucis :
272 get = list(set(uci.etoiles).intersection(etoiles))
274 return '2 variables sur la meme ligne'
276 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
277 etuces = [set(val) for val in etuces]
280 deff = self.getlemuceseff(lem)
282 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
283 tab.insert(0, [''] + etoiles)
286 def make_efftype_from_etoiles(self, etoiles) :
288 etuces = [[] for et in etoiles]
289 for uci in self.ucis :
290 get = list(set(uci.etoiles).intersection(etoiles))
292 return '2 variables sur la meme ligne'
294 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
295 etuces = [set(val) for val in etuces]
296 for lem in self.lems :
297 deff = self.getlemuceseff(lem)
299 gram = self.lems[lem].gram
301 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
303 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
304 tabout = [[gram] + dtype[gram] for gram in dtype]
305 tabout.insert(0, [''] + etoiles)
308 def make_uceactsize(self, actives) :
309 res = self.getalluces()
312 deff = self.getlemuceseff(lem)
314 ucesize[uce] = ucesize.get(uce, 0) + 1
317 def make_uc(self, actives, lim1, lim2) :
318 uceactsize = self.make_uceactsize(actives)
324 for uce in [uce for uci in self.ucis for uce in uci.uces] :
325 if uce.para == lastpara :
327 last1 += uceactsize.get(uce.ident,0)
328 uc1[-1].append(uce.ident)
330 uc1.append([uce.ident])
333 last2 += uceactsize.get(uce.ident, 0)
334 uc2[-1].append(uce.ident)
336 uc2.append([uce.ident])
339 last1 = uceactsize.get(uce.ident, 0)
340 last2 = uceactsize.get(uce.ident, 0)
342 uc1.append([uce.ident])
343 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build two uc partitions, write their matrices and their uce/uc listings.

    actives          -- passed through to make_uc() and write_ucmatrix()
    sizeuc1, sizeuc2 -- the two limits handed to make_uc()
    uc1out, uc2out   -- output paths given to write_ucmatrix()
    listuce1out, listuce2out -- paths of the ';' separated listings; each one
                        starts with the header 'uce;uc' followed by one
                        "uce;position of its uc" line per uce

    Returns the tuple (len(uc1), len(uc2)).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    # both matrices are written before any listing, as in the original flow
    for ucs, matrixout in ((uc1, uc1out), (uc2, uc2out)) :
        self.write_ucmatrix(ucs, actives, matrixout)
    listings = []
    for ucs in (uc1, uc2) :
        rows = [';'.join(['uce', 'uc'])]
        for ucid, ucl in enumerate(ucs) :
            for uce in ucl :
                rows.append(';'.join([repr(uce), repr(ucid)]))
        listings.append(rows)
    for listout, rows in zip((listuce1out, listuce2out), listings) :
        with open(listout, 'w') as f :
            f.write('\n'.join(rows))
    return len(uc1), len(uc2)
359 def write_ucmatrix(self, uc, actives, fileout) :
360 log.info('write uc matrix %s' % fileout)
361 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
364 with open(fileout + '~', 'w+') as f :
365 for i, lem in enumerate(actives) :
366 for uce in self.getlemuces(lem):
367 if (uces_uc[uce], i) not in deja_la :
369 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
370 deja_la[(uces_uc[uce], i)] = 0
372 with open(fileout, 'w') as ffin :
373 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
376 os.remove(fileout + '~')
379 def export_corpus(self, outf) :
380 #outf = 'export_corpus.txt'
382 res = self.getalluces()
386 with open(outf,'w') as f :
388 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
389 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
390 elif self.iduces[uce[0]].uci != actuci :
391 actuci = self.iduces[uce[0]].uci
392 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
393 actpara = self.iduces[uce[0]].para
394 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
397 actpara = self.iduces[uce[0]].para
398 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
399 elif self.iduces[uce[0]].para != actpara :
400 actpara = self.iduces[uce[0]].para
402 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
404 def export_corpus_classes(self, outf, alc = True, lem = False) :
406 for i, lc in enumerate(self.lc) :
409 for uce in self.lc0 :
411 res = self.getalluces()
413 with open(outf, 'w') as f :
416 actuci = self.iduces[uce[0]].uci
418 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
420 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
422 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
423 f.write(etline.encode(self.parametres['syscoding']) + '\n')
424 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
426 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
427 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
429 with open(outfile + '~', 'w+') as f :
430 for i, lem in enumerate(actives) :
431 for uce in sorted(self.getlemuces(lem)) :
433 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
435 with open(outfile, 'w') as ffin :
436 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
439 os.remove(outfile + '~')
441 with open(listuce, 'w') as f :
442 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
444 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
445 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
447 with open(outfile + '~', 'w+') as f :
448 for i, lem in enumerate(actives) :
449 for uci in sorted(self.getlemucis(lem)) :
451 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
453 with open(outfile, 'w') as ffin :
454 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
457 os.remove(outfile + '~')
459 with open(listuci, 'w') as f :
460 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
462 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
463 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
465 duces = dict([[uce, i] for i, uce in enumerate(uces)])
466 with open(outfile + '~', 'w+') as f :
467 for i, lem in enumerate(actives) :
468 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
470 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
472 with open(outfile, 'w') as ffin :
473 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
476 os.remove(outfile + '~')
478 def make_table_with_classe(self, uces, list_act) :
479 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
480 uces = dict([[uce, i] for i, uce in enumerate(uces)])
481 for i, lem in enumerate(list_act) :
482 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
484 table_uce[uces[uce]][i] = 1
485 table_uce.insert(0, list_act)
488 def parse_active(self, gramact, gramsup = None) :
489 log.info('parse actives')
490 for lem in self.lems :
491 if lem.startswith('_') and lem.endswith('_') :
492 self.lems[lem].act = 2
493 elif self.lems[lem].gram in gramact :
494 self.lems[lem].act = 1
495 elif gramsup is not None :
496 if self.lems[lem].gram in gramsup :
497 self.lems[lem].act = 2
499 self.lems[lem].act = 0
501 self.lems[lem].act = 2
503 def make_actives_limit(self, limit, key = 1) :
504 if self.idformes is None :
506 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
508 def make_actives_nb(self, nbmax, key) :
509 log.info('make_actives_nb : %i - %i' % (nbmax,key))
510 if self.idformes is None :
512 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
513 self.activenb = len(allactives)
514 allactives = sorted(allactives, reverse = True)
515 if len(allactives) <= nbmax :
516 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
517 return [val[1] for val in allactives], allactives[-1][0]
519 effs = [val[0] for val in allactives]
520 if effs.count(effs[nbmax - 1]) > 1 :
521 lim = effs[nbmax - 1] + 1
525 stop = effs.index(lim)
532 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
533 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma x class contingency table to `fileout`.

    actives -- lemmas to tabulate
    ucecl   -- iterable of classes, each one an iterable of uce ids
    fileout -- path of the ';' separated output file

    For every lemma, the number of its uces found in each class is counted.
    Lemmas whose counts sum to less than 3 are left out. The text is encoded
    with self.parametres['syscoding'] before being written.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives :
        # getlemuces() queries the database: call it once per lemma and
        # reuse the set for every class instead of once per (lemma, class)
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
542 def make_etoiles(self) :
544 for uci in self.ucis :
545 etoiles.update(uci.etoiles[1:] + uci.paras)
548 def make_etoiles_dict(self) :
549 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
551 for etoile in etoiles :
552 et = etoile.split('_')
555 endet = '_'.join(et[1:])
556 if endet in det[et[0]] :
557 det[et[0]][endet] += 1
559 det[et[0]][endet] = 1
564 endet = '_'.join(et[1:])
565 det[et[0]] = {endet :1}
570 def make_etline(self, listet) :
571 etuces = [[] for et in listet]
572 for uci in self.ucis :
573 get = list(set(uci.etoiles).intersection(listet))
575 return '2 variables sur la meme ligne'
577 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile x class contingency table to `fileout`.

    ucecl   -- iterable of classes, each one an iterable of uce ids
    fileout -- path of the ';' separated output file

    The etoile -> uces mapping comes from getetoileuces(); etoiles with a
    single uce (or none) are dropped. Each line holds the etoile followed by
    the number of its uces found in every class. The text is encoded with
    self.parametres['syscoding'] before being written.
    """
    log.info('etoiles/classes')
    alluces = self.getetoileuces()
    kept = {}
    for et in alluces :
        if len(alluces[et]) > 1 :
            kept[et] = alluces[et]
    with open(fileout, 'w') as f :
        lines = []
        for et in kept :
            etset = set(kept[et])
            counts = [repr(len(etset.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([et] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
586 #etoiles = self.make_etoiles()
587 #with open(fileout, 'w') as f :
588 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
590 def make_colored_corpus(self) :
592 for i, lc in enumerate(self.lc) :
595 for uce in self.lc0 :
597 color = ['black'] + colors[len(self.lc) - 1]
599 <meta http-equiv="content-Type" content="text/html; charset=%s" />
601 ''' % sys.getdefaultencoding()
602 res = self.getalluces()
607 if self.iduces[uce[0]].uci != actuci :
608 actuci = self.iduces[uce[0]].uci
609 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
610 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
612 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
613 return txt + '\n</body></html>'
615 def count_from_list(self, l, d) :
623 def count_from_list_cl(self, l, d, a, clnb) :
632 def find_segments(self, taille_segment, taille_limite) :
634 for uce in self.getalluces() :
636 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
637 l = [[d[val], val] for val in d if d[val] >= 3]
640 if len(l) > taille_limite :
641 l = l[-taille_limite:]
644 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
646 for uce in self.getconcorde(list_uce) :
648 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
649 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
652 if len(l) > taille_limite :
653 l = l[-taille_limite:]
656 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
658 for b, classe in enumerate(self.lc) :
659 for uce in self.getconcorde(classe) :
662 uce = [self.formes[forme].lem for forme in uce]
663 for taille_segment in range(lenmin,lenmax) :
664 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
665 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
666 with open(fileout, 'w') as f :
667 f.write('\n'.join([';'.join(line) for line in result]))
669 def make_proftype(self, outf) :
671 for lem in self.lems :
672 gram = self.lems[lem].gram
674 res[gram] = [0 for val in self.lc]
675 lemuceeff = self.getlemuceseff(lem)
676 for i, classe in enumerate(self.lc) :
677 concern = set(classe).intersection(lemuceeff.keys())
678 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
679 res = [[gram] + [`val` for val in res[gram]] for gram in res]
681 with open(outf, 'w') as f :
682 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
685 def make_ucecl_from_R(self, filein) :
686 with open(filein, 'rU') as f :
691 line = line.replace('\n', '').replace('"', '').split(';')
692 self.lc.append([int(line[0]) - 1, int(line[1])])
693 classesl = [val[1] for val in self.lc]
695 self.lc = sorted(self.lc, key=itemgetter(1))
696 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
697 self.lc0 = self.lc.pop(0)
700 def get_stat_by_cluster(self, outf) :
701 log.info('get_stat_by_cluster')
703 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
704 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
705 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
706 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
707 sets = [set(cl) for cl in self.lc]
708 for forme in self.formes :
709 formeuceeff = self.getformeuceseff(forme)
710 for i, classe in enumerate(self.lc) :
711 concern = sets[i].intersection(formeuceeff.keys())
713 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
715 if self.formes[forme].freq == 1 :
717 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
718 with open(outf, 'w') as f :
720 log.info('%f' % (time() - t1))
722 def gethapaxbyet(self, etoiles) :
723 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
725 for uce in hapaxuces :
726 if uce in hucesdict :
730 etuces = [[] for et in etoiles]
731 for uci in self.ucis :
732 get = list(set(uci.etoiles).intersection(etoiles))
734 return '2 variables sur la meme ligne'
736 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
737 etuces = [set(val) for val in etuces]
738 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
740 def gethapaxuces(self) :
741 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
742 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
744 for i,uce in enumerate(hapaxuces) :
745 if uce in hucesdict :
746 hucesdict[uce][0] += 1
747 hucesdict[uce][1].append(hapax[i])
749 hucesdict[uce] = [1,[hapax[i]]]
751 for uce in hucesdict :
752 if hucesdict[uce][0] in huces :
753 huces[hucesdict[uce][0]].append(uce)
755 huces[hucesdict[uce][0]] = [uce]
756 huces = zip(huces, huces.values())
757 huces.sort(reverse=True)
761 for nb in huces[0:4] :
762 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
764 res = self.getconcorde([uce])
766 ucetxt = ' ' + row[1] + ' '
768 for hap in hucesdict[uce][1] :
769 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
770 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
771 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
772 txt += '<p>'+ucetxt+'</p>\n'
776 with open('/tmp/testhapxuce.html','w') as f :
781 def __init__(self, corpus) :
782 ucinb = corpus.getucinb()
783 ucisize = corpus.getucisize()
784 ucimean = float(sum(ucisize))/float(ucinb)
785 detoile = corpus.make_etoiles_dict()
789 def __init__(self, iduci, line, paraset = None) :
791 self.etoiles = line.split()
793 if paraset is not None :
794 self.paras = paraset.split()
799 def __init__(self, iduce, idpara, iduci) :
805 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
811 if freq is not None :
817 def __init__(self, parent, forme) :
818 self.formes = {forme.ident : forme.freq}
819 self.gram = forme.gram
820 self.freq = forme.freq
def add_forme(self, forme) :
    """Register `forme` under its ident in self.formes and add its freq to self.freq."""
    ident, freq = forme.ident, forme.freq
    self.formes[ident] = freq
    self.freq = self.freq + freq
827 def decouperlist(chaine, longueur, longueurOptimale) :
829 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
830 Si on trouve un '$', c'est fini.
831 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
833 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
834 dsep = dict([[val[0],val[1]] for val in separateurs])
835 trouve = False # si on a trouvé un bon séparateur
836 iDecoupe = 0 # indice du caractere ou il faut decouper
838 longueur = min(longueur, len(chaine) - 1)
839 chaineTravail = chaine[:longueur + 1]
841 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
844 indice = chaineTravail.index(u'$')
846 iDecoupe = indice - 1
851 caractere = chaineTravail[nbCar]
852 distance = abs(longueurOptimale - nbCar) + 1
853 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
854 if caractere in dsep :
855 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
856 meilleur[0] = caractere
857 meilleur[1] = dsep[caractere]
862 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
864 meilleur[1] = dsep[' ']
871 #if meilleur[0] != ' ' :
872 # fin = chaine[iDecoupe + 1:]
873 # retour = chaineTravail[:iDecoupe]
875 fin = chaine[iDecoupe + 1:]
876 retour = chaineTravail[:iDecoupe + 1]
877 return len(retour) > 0, retour, fin
878 # si on a rien trouvé
879 return False, chaine, ''
881 def testetoile(line) :
882 return line.startswith(u'****')
885 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split `txt` on whitespace and return the tokens followed by a final u'$'."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
890 def prep_txtcharact(txt) :
895 Class for building a corpus
897 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
898 log.info('begin building corpus...')
899 self.lexique = lexique
900 self.expressions = expressions
902 self.corpus = Corpus(self, parametres_corpus)
905 self.lim = parametres_corpus.get('lim', 1000000)
906 self.encoding = parametres_corpus['encoding']
907 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
908 self.corpus.pathout.createdir(parametres_corpus['pathout'])
909 self.corpus.parametres['uuid'] = str(uuid4())
910 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
911 self.corpus.parametres['type'] = 'corpus'
912 if self.corpus.parametres['keep_ponct'] :
913 self.ponctuation_espace = [' ', '']
915 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
917 self.tolist = self.corpus.parametres.get('tolist', 0)
924 def prep_makeuce(self) :
925 method = self.corpus.parametres.get('ucemethod', 0)
927 self.decouper = decouperlist
928 self.prep_txt = prep_txtlist
929 self.ucesize = self.corpus.parametres.get('ucesize', 40)
931 self.decouper = decoupercharact
932 self.prep_txt = prep_txtcharact
933 self.ucesize = self.corpus.parametres.get('ucesize', 240)
934 log.info('method uce : %s' % method)
939 self.read_corpus(self.infile)
940 except Warning, args :
941 log.info('pas kool %s' % args)
945 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
946 self.time = time() - t1
948 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
949 log.info('time : %f' % (time() - t1))
952 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
953 self.cf = self.conn_f.cursor()
954 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
955 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
957 self.cf = self.conn_f.cursor()
958 self.cf.execute('PRAGMA temp_store=MEMORY;')
959 self.cf.execute('PRAGMA journal_mode=MEMORY;')
960 self.cf.execute('PRAGMA synchronous = OFF;')
961 self.cf.execute('begin')
962 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
963 self.c = self.conn.cursor()
964 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
966 self.c = self.conn.cursor()
967 self.c.execute('PRAGMA temp_store=MEMORY;')
968 self.c.execute('PRAGMA journal_mode=MEMORY;')
969 self.c.execute('PRAGMA synchronous = OFF;')
970 self.c.execute('begin')
973 #commit index and close db
976 self.cf.execute('CREATE INDEX iduces ON uces (id);')
977 self.cf.execute('CREATE INDEX ideff ON eff (id);')
981 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
982 self.ccorpus = self.conn_corpus.cursor()
983 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
984 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
985 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
986 self.conn_corpus.commit()
987 self.ccorpus = self.conn_corpus.cursor()
988 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
989 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
990 self.ccorpus.execute('PRAGMA synchronous = OFF;')
991 self.ccorpus.execute('begin')
993 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
994 self.conn_corpus.commit()
995 self.conn_corpus.close()
996 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Append to self.cleans the cleaning steps enabled in the corpus parameters.

    Steps are appended in this order, each when its parameter is true:
    'lower' -> dolower, 'firstclean' -> firstclean, 'charact' -> docharact,
    'expressions' -> make_expression, 'apos' -> doapos, 'tiret' -> dotiret.
    All parameters default to 1 except 'charact', which must be present.
    When 'charact' is set, self.rule receives 'keep_caract' or, failing that,
    the built-in character class body.
    """
    params = self.corpus.parametres
    register = self.cleans.append
    if params.get('lower', 1) :
        register(self.dolower)
    if params.get('firstclean', 1) :
        register(self.firstclean)
    if params['charact'] :
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        register(self.docharact)
    if params.get('expressions', 1) :
        register(self.make_expression)
    if params.get('apos', 1) :
        register(self.doapos)
    if params.get('tiret', 1) :
        register(self.dotiret)
1013 def make_expression(self,txt) :
1014 for expression in self.expressions:
1015 if expression in txt :
1016 txt = txt.replace(expression, self.expressions[expression][0])
1019 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace every run of characters matching the class [self.rule] by one space.

    self.rule is the body of a regex character class; it is wrapped in
    "[" ... "]+" before substitution.
    """
    #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return `txt` with every apostrophe (') replaced by a space."""
    apostrophe, blank = u'\'', u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt) :
    """Return `txt` with every hyphen (-) replaced by a space."""
    hyphen, blank = u'-', u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt) :
    """Normalise a few characters and isolate punctuation marks with spaces.

    txt -- the text to clean (unicode)

    The typographic apostrophe becomes ', the oe ligature becomes 'oe',
    both '...' and the ellipsis character become u' £$£ ', and each of
    ? . ! , ; : is surrounded by spaces. Returns the cleaned text.
    """
    txt = txt.replace(u'’', "'")
    txt = txt.replace(u'œ', u'oe')
    # '...' has to be consumed before the single '.' is padded with spaces
    txt = txt.replace('...', u' £$£ ')
    for mark in ('?', '.', '!', ',', ';', ':') :
        txt = txt.replace(mark, ' ' + mark + ' ')
    # unicode literal, like the '...' replacement above: a plain byte string
    # holding non-ASCII characters would need an implicit decode on each call
    return txt.replace(u'…', u' £$£ ')
def make_cleans(self, txt):
    """Run *txt* through every cleaner in ``self.cleans``, in order.

    Each cleaner is a callable taking the text and returning the
    transformed text; the output of one is the input of the next. The final
    text is returned (callers do ``txt = self.make_cleans(txt)``). With no
    cleaner registered, *txt* is returned unchanged.
    """
    for clean in self.cleans:
        txt = clean(txt)
    return txt
def backup_uce(self):
    """Write the in-memory form -> segment counts to the database, then clear them.

    For every form id in ``self.corpus.idformesuces`` one row goes to
    ``uces`` (space-separated segment ids) and one to ``eff`` (the matching
    space-separated counts). Ids and counts are stored as their ``repr``.
    Nothing happens when the mapping is empty.
    """
    pending = self.corpus.idformesuces
    if pending == {}:
        return
    log.info('backup %i' % len(pending))
    touce = []
    toeff = []
    for forme in pending:
        counts = pending[forme]
        # keys() and values() of the same unmodified dict iterate in matching order.
        touce.append((repr(forme), ' '.join([repr(val) for val in counts.keys()])))
    for forme in pending:
        counts = pending[forme]
        toeff.append((repr(forme), ' '.join([repr(val) for val in counts.values()])))
    self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
    self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
    self.corpus.idformesuces = {}
def backup_corpus(self):
    """Write texts, segments and forms of the corpus to the corpus database.

    For each text (uci) one row is inserted in ``etoiles`` (id,
    space-joined ``etoiles``, space-joined ``paras``), and one row per
    segment in ``luces`` (text id, paragraph id, segment id). Each form
    gets one row in ``formes`` (id, form, lemma, grammatical type,
    frequency). Numeric ids and frequencies are stored as their ``repr``,
    except the text id in ``etoiles``, which is passed as is. The elapsed
    time is logged at the end.
    """
    log.info('start backup corpus')
    # Start of the timed section; the final log line reports time() - t.
    t = time()
    insert = self.ccorpus.execute
    for uci in self.corpus.ucis:
        insert('INSERT INTO etoiles VALUES (?,?,?);', (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras)))
        for uce in uci.uces:
            insert('INSERT INTO luces VALUES (?,?,?);', (repr(uci.ident), repr(uce.para), repr(uce.ident)))
    for forme in self.corpus.formes:
        word = self.corpus.formes[forme]
        insert('INSERT INTO formes VALUES (?,?,?,?,?);', (repr(word.ident), forme, word.lem, word.gram, repr(word.freq)))
    log.info('%f' % (time() - t))
def dofinish(self):
    """Store the final corpus statistics in ``self.corpus.parametres``.

    Sets ``date`` (current time), ``time`` (``self.time`` seconds formatted
    as hours/minutes/seconds), ``ucinb``, ``ucenb``, ``occurrences``,
    ``formesnb`` and ``hapax`` (hapax count with its share of forms and of
    occurrences). The percentages are reported as 0 when the corpus has no
    form or no occurrence, instead of raising ZeroDivisionError.
    """
    parametres = self.corpus.parametres
    parametres['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    parametres['ucinb'] = self.corpus.getucinb()
    parametres['ucenb'] = self.corpus.getucenb()
    parametres['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    parametres['formesnb'] = formesnb
    hapaxnb = self.corpus.gethapaxnb()
    occurrences = parametres['occurrences']
    # Guard both ratios: an empty corpus must not crash the final report.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Corpus builder for a text file made of text-header lines, optional "-*"
# paragraph lines and free text. Texts are stored as Uci objects, segments
# as Uce objects, and words are registered through corpus.add_word.
# NOTE(review): the gaps in the numbering below show that some statements of
# this class are missing here; the comments describe only what is visible.
1079 class BuildFromAlceste(BuildCorpus) :
# Read `infile` line by line and fill self.corpus.ucis.
# Raises Exception("CorpusEncoding") when the file cannot be decoded with
# self.encoding; the other exceptions raised below ('paragrapheOT',
# "EmptyText", 'TextBeforeTextMark') signal formatting problems.
1080 def read_corpus(self, infile) :
# Progress dialog is optional.
1081 if self.dlg is not None :
1082 self.dlg.Pulse('textes : 0 - segments : 0')
# Choose the predicate recognising a text-header line from 'ucimark':
# 0 -> testetoile, 1 -> testint.
1085 if self.corpus.parametres['ucimark'] == 0 :
1086 self.testuci = testetoile
1087 elif self.corpus.parametres['ucimark'] == 1 :
1088 self.testuci = testint
1094 with codecs.open(infile, 'r', self.encoding) as f :
1095 for linenb, line in enumerate(f) :
1096 line = line.rstrip('\n\r')
# Text-header line: a new text starts here.
1097 if self.testuci(line) :
# Process the text collected so far; it is attributed to the previous text
# (iduci - 1). NOTE(review): presumably guarded by a test on `txt` — confirm.
1100 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1102 self.corpus.ucis.append(Uci(iduci, line))
# The last registered text has no segment: log it, drop it and register the
# new text in its place.
1105 if self.corpus.ucis[-1].uces == [] :
1106 log.info(u'Empty text : %i' % linenb)
1108 self.corpus.ucis.pop()
1109 #raise Exception("EmptyText %i" % linenb)
1110 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog each time (iduci + 1) is a multiple of 10.
1111 if self.dlg is not None :
1112 if not (iduci + 1) % 10 :
1113 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Line starting with "-*": its first whitespace-separated token is recorded
# in the current text's `paras`.
1114 elif line.startswith(u'-*') :
1117 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1120 self.corpus.ucis[-1].paras.append(line.split()[0])
# NOTE(review): the condition leading to this error is not visible here.
1122 raise Exception('paragrapheOT')
# Non-blank line once a first text header has been seen (iduci != -1).
1123 elif line.strip() != '' and iduci != -1 :
# Process the text still pending for the current text.
1125 if txt != [] and iduci != -1 :
1126 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1129 raise Exception("EmptyText")
1130 if iduci != -1 and iduce != -1:
1133 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1134 raise Exception('TextBeforeTextMark')
# Decoding failure is reported as a corpus-encoding error.
1135 except UnicodeDecodeError :
1136 raise Exception("CorpusEncoding")
# Clean `txt`, cut it into segments, register the segments and their words.
# Returns the updated (iduce, idpara) counters.
1138 def treattxt(self, txt, iduce, idpara, iduci) :
# ucemethod == 2 with 'douce' set: the items of `txt` are joined with the
# sentinel word 'laphrasepoursplitter', cleaned as one string, stripped of
# punctuation tokens, then split again on the sentinel to get the segments.
1139 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1140 txt = 'laphrasepoursplitter'.join(txt)
1141 txt = self.make_cleans(txt)
1142 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1143 ucetxt = txt.split('laphrasepoursplitter')
# Otherwise: clean the text and let make_uces cut it.
1146 txt = self.make_cleans(txt)
1147 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1148 if self.corpus.ucis[-1].paras == [] :
# Register the segment on the current text and store its text in the `uces`
# table (the id is stored as its repr).
1152 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1153 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1154 if not self.tolist :
1160 self.corpus.add_word(word)
1161 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): presumably triggers a backup once self.last exceeds
# self.lim — the body is not visible, confirm.
1162 if self.last > self.lim :
1165 return iduce, idpara
# Cut `txt` into segments.
# NOTE(review): `keep_ponct` is not used in the visible lines.
1167 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1168 txt = ' '.join(txt.split())
# self.decouper is called with self.ucesize + 15 and self.ucesize; it returns
# (reste, texte_uce, suite) and is called again on `suite`.
1171 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Segment text = the returned tokens minus punctuation tokens.
1173 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1176 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1177 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-segment result: the whole text minus punctuation tokens.
# NOTE(review): presumably the branch taken when `douce` is false — confirm.
1182 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1184 #decouper (list_sep)
1185 #make_uces (decouper)
1186 #treat_txt (make_uces)
# Ask the user for the corpus import settings and prepare the application
# (lexicon, expressions) for the build. `parent` is the application object;
# `dlg` is an optional progress dialog.
# NOTE(review): the gaps in the numbering show that some statements of this
# method are missing here (self.dlg is not assigned in the visible lines).
1190 def __init__(self, parent, dlg = None) :
1191 self.parent = parent
# Default parameters: section 'corpus' of corpus.cfg in the user config path.
1193 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
# Output directory derived from the input file name.
1194 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
# Show the preferences dialog modally, centred on the parent window.
1195 dial = CorpusPref(parent, parametres)
1196 dial.CenterOnParent()
1197 dial.txtpath.SetLabel(parent.filename)
1198 #dial.repout_choices.SetValue(parametres['pathout'])
1199 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the dialog's OK return code (wx.ID_OK) — confirm.
1200 if self.res == 5100 :
# Take the parameters chosen in the dialog and remember the source file.
1201 parametres = dial.doparametres()
1202 parametres['originalpath'] = parent.filename
1203 PathOut().createdir(parametres['pathout'])
# Load the lexicon and the expressions dictionary for the chosen language.
1204 ReadLexique(self.parent, lang = parametres['lang'])
# NOTE(review): when '<lang>_exp' is missing from DictPath, the fallback is
# the literal string 'french_exp', not DictPath['french_exp'] — confirm this
# is intended.
1205 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1206 self.parametres = parametres
1208 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    A ``BuildFromAlceste`` is created with the file name, the chosen
    parameters, the parent's lexicon and expressions, and the optional
    progress dialog; its ``corpus`` attribute is the result.
    """
    app = self.parent
    builder = BuildFromAlceste(app.filename, self.parametres, app.lexique, app.expressions, dlg=self.dlg)
    return builder.corpus
1216 if __name__ == '__main__' :
1218 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1219 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)