1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the ids of the uces in which a form occurs.

    wordid is either the id of a form or the form itself as a string; in
    the latter case the id is looked up in self.formes.  Each matching row
    of the ``uces`` table holds either a single integer or a
    whitespace-separated string of integers; all of them are flattened into
    one list of ints, in row order.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # repr() is the function spelling of the backtick operator, which no
    # longer exists in Python 3; the bound value is unchanged.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend(int(val) for val in cell.split())
    return uces
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct ids of the uces containing any form of a lemma.

    The form ids are the keys of self.lems[lem].formes.  Each matching row
    of the ``uces`` table holds either a single integer or a
    whitespace-separated string of integers.  The result is a list without
    duplicates; its order is that of a set and must not be relied upon.
    """
    # repr() replaces the backtick operator, which no longer exists in
    # Python 3; the generated SQL text is unchanged.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            uces.add(cell)
        else:
            uces.update(int(val) for val in cell.split())
    return list(uces)
def getlemucis(self, lem):
    """Return the distinct uci values of the uces containing a lemma.

    Every uce id returned by getlemuces() is resolved with getucefromid()
    and its ``uci`` attribute is collected.  The list has no duplicates
    and no guaranteed order.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many uces of self.lc[cluster] contain the lemma."""
    cluster_uces = set(self.lc[cluster])
    shared = cluster_uces.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the lemma stored under ``lem``."""
    lemma = self.lems[lem]
    return lemma.freq
def getforme(self, formeid):
    """Return the entry of self.idformes stored under ``formeid``.

    The self.idformes index is built on first use by make_idformes().
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attribute over all of self.formes."""
    total = 0
    for forme in self.formes:
        total = total + self.formes[forme].freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    nbuces = self.getucenb()
    return occurrences / nbuces
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one size per uci, in the order of self.ucis.

    The size of a uci is the sum of the getucesize() values found between
    the ident of its first uce and the ident of its last uce, both
    included; uce idents are used as positions in that list.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:last + 1]))
    return sizes
def getucesize(self):
    """Return, for each row of getalluces(), the number of
    whitespace-separated tokens in its second column."""
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces):
    """Return the cursor over the rows of the ``uces`` table whose id is in
    ``uces`` (an iterable of uce ids)."""
    # repr() replaces the backtick operator, which no longer exists in
    # Python 3; the generated SQL text is unchanged.
    ids = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % ids)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces given by getworduces(word)."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces given by getlemuces(lem)."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Return the result of selecting every row of the ``uces`` table
    through the self.cuces cursor."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the idents of all uces belonging to a uci whose ``etoiles``
    contain ``etoile``, in the order of self.ucis."""
    uces = []
    for uci in self.ucis:
        # The membership test does not depend on the uce, so it is done
        # once per uci instead of once per uce.
        if etoile in uci.etoiles:
            uces.extend(uce.ident for uce in uci.uces)
    return uces
def getucefromid(self, uceid):
    """Return the entry of self.iduces stored under ``uceid``.

    The self.iduces index is built on first use by make_iduces().
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of self.formes whose ``freq`` is 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries of self.lems whose ``act`` equals ``key``."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
222 # def make_lems(self, lem = True) :
223 # log.info('make lems')
225 # for forme in self.formes :
226 # if self.formes[forme].lem in self.lems :
227 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
228 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
230 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce ``uceid``.

    A uce ident -> uci ident map is built on first use and cached in
    self.uceuci; the uci ident is then used as a position in self.ucis.
    """
    if self.uceuci is None:
        owners = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owners[uce.ident] = uci.ident
        self.uceuci = owners
    uci_ident = self.uceuci[uceid]
    return self.ucis[uci_ident].etoiles
236 def make_lems(self, lem = True) :
237 log.info('make lems')
240 for forme in self.formes :
241 if self.formes[forme].lem in self.lems :
242 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
243 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
245 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
247 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build self.idformes, mapping each form's ``ident`` to the form
    object stored in self.formes."""
    by_ident = {}
    for key in self.formes:
        forme = self.formes[key]
        by_ident[forme.ident] = forme
    self.idformes = by_ident
def make_iduces(self):
    """Build self.iduces, mapping each uce ``ident`` to its uce object.

    Nothing is done when self.iduces has already been set.
    """
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
256 def make_lexitable(self, mineff, etoiles) :
257 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
258 etuces = [[] for et in etoiles]
259 for uci in self.ucis :
260 get = list(set(uci.etoiles).intersection(etoiles))
262 return '2 variables sur la meme ligne'
264 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
265 etuces = [set(val) for val in etuces]
268 deff = self.getlemuceseff(lem)
270 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
271 tab.insert(0, [''] + etoiles)
274 def make_efftype_from_etoiles(self, etoiles) :
276 etuces = [[] for et in etoiles]
277 for uci in self.ucis :
278 get = list(set(uci.etoiles).intersection(etoiles))
280 return '2 variables sur la meme ligne'
282 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
283 etuces = [set(val) for val in etuces]
284 for lem in self.lems :
285 deff = self.getlemuceseff(lem)
287 gram = self.lems[lem].gram
289 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
291 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
292 tabout = [[gram] + dtype[gram] for gram in dtype]
293 tabout.insert(0, [''] + etoiles)
296 def make_uceactsize(self, actives) :
297 res = self.getalluces()
300 deff = self.getlemuceseff(lem)
302 ucesize[uce] = ucesize.get(uce, 0) + 1
305 def make_uc(self, actives, lim1, lim2) :
306 uceactsize = self.make_uceactsize(actives)
312 for uce in [uce for uci in self.ucis for uce in uci.uces] :
313 if uce.para == lastpara :
315 last1 += uceactsize.get(uce.ident,0)
316 uc1[-1].append(uce.ident)
318 uc1.append([uce.ident])
321 last2 += uceactsize.get(uce.ident, 0)
322 uc2[-1].append(uce.ident)
324 uc2.append([uce.ident])
327 last1 = uceactsize.get(uce.ident, 0)
328 last2 = uceactsize.get(uce.ident, 0)
330 uc1.append([uce.ident])
331 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc partitions, write their matrices and uce/uc listings.

    make_uc() returns two lists of uc, each uc being a list of uce ids.
    write_ucmatrix() writes one matrix per partition to uc1out / uc2out.
    For each partition a listing is written (listuce1out / listuce2out):
    a ``uce;uc`` header line followed by one ``<uce>;<uc index>`` line per
    uce, with no trailing newline.

    Returns the pair (number of uc in partition 1, number in partition 2).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both listings are built before any file is opened, as before.
    # repr() replaces the backtick operator, which no longer exists in
    # Python 3.
    listings = []
    for ucs in (uc1, uc2):
        lines = [';'.join(['uce', 'uc'])]
        for i, ucl in enumerate(ucs):
            lines.extend(';'.join([repr(uce), repr(i)]) for uce in ucl)
        listings.append('\n'.join(lines))
    for fileout, listing in zip((listuce1out, listuce2out), listings):
        with open(fileout, 'w') as f:
            f.write(listing)
    return len(uc1), len(uc2)
347 def write_ucmatrix(self, uc, actives, fileout) :
348 log.info('write uc matrix %s' % fileout)
349 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
352 with open(fileout + '~', 'w+') as f :
353 for i, lem in enumerate(actives) :
354 for uce in self.getlemuces(lem):
355 if (uces_uc[uce], i) not in deja_la :
357 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
358 deja_la[(uces_uc[uce], i)] = 0
360 with open(fileout, 'w') as ffin :
361 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
364 os.remove(fileout + '~')
367 def export_corpus(self, outf) :
368 #outf = 'export_corpus.txt'
370 res = self.getalluces()
374 with open(outf,'w') as f :
376 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
377 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
378 elif self.iduces[uce[0]].uci != actuci :
379 actuci = self.iduces[uce[0]].uci
380 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
381 actpara = self.iduces[uce[0]].para
382 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
385 actpara = self.iduces[uce[0]].para
386 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
387 elif self.iduces[uce[0]].para != actpara :
388 actpara = self.iduces[uce[0]].para
390 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
392 def export_corpus_classes(self, outf, alc = True, lem = False) :
394 for i, lc in enumerate(self.lc) :
397 for uce in self.lc0 :
399 res = self.getalluces()
401 with open(outf, 'w') as f :
404 actuci = self.iduces[uce[0]].uci
406 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
408 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
410 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
411 f.write(etline.encode(self.parametres['syscoding']) + '\n')
412 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
414 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
415 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
417 with open(outfile + '~', 'w+') as f :
418 for i, lem in enumerate(actives) :
419 for uce in sorted(self.getlemuces(lem)) :
421 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
423 with open(outfile, 'w') as ffin :
424 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
427 os.remove(outfile + '~')
429 with open(listuce, 'w') as f :
430 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
432 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
433 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
435 with open(outfile + '~', 'w+') as f :
436 for i, lem in enumerate(actives) :
437 for uci in sorted(self.getlemucis(lem)) :
439 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
441 with open(outfile, 'w') as ffin :
442 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
445 os.remove(outfile + '~')
447 with open(listuci, 'w') as f :
448 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
450 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
451 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
453 duces = dict([[uce, i] for i, uce in enumerate(uces)])
454 with open(outfile + '~', 'w+') as f :
455 for i, lem in enumerate(actives) :
456 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
458 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
460 with open(outfile, 'w') as ffin :
461 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
464 os.remove(outfile + '~')
466 def make_table_with_classe(self, uces, list_act) :
467 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
468 uces = dict([[uce, i] for i, uce in enumerate(uces)])
469 for i, lem in enumerate(list_act) :
470 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
472 table_uce[uces[uce]][i] = 1
473 table_uce.insert(0, list_act)
476 def parse_active(self, gramact, gramsup = None) :
477 log.info('parse actives')
478 for lem in self.lems :
479 if lem.startswith('_') and lem.endswith('_') :
480 self.lems[lem].act = 2
481 elif self.lems[lem].gram in gramact :
482 self.lems[lem].act = 1
483 elif gramsup is not None :
484 if self.lems[lem].gram in gramsup :
485 self.lems[lem].act = 2
487 self.lems[lem].act = 0
489 self.lems[lem].act = 2
491 def make_actives_limit(self, limit, key = 1) :
492 if self.idformes is None :
494 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
496 def make_actives_nb(self, nbmax, key) :
497 log.info('make_actives_nb : %i - %i' % (nbmax,key))
498 if self.idformes is None :
500 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
501 self.activenb = len(allactives)
502 allactives = sorted(allactives, reverse = True)
503 if len(allactives) <= nbmax :
504 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
505 return [val[1] for val in allactives], allactives[-1][0]
507 effs = [val[0] for val in allactives]
508 if effs.count(effs[nbmax - 1]) > 1 :
509 lim = effs[nbmax - 1] + 1
513 stop = effs.index(lim)
520 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
521 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma x class contingency table to ``fileout``.

    For every lemma of ``actives`` the number of its uces found in each
    class of ``ucecl`` is counted.  Lemmas whose counts sum to less than 3
    are dropped.  Each kept lemma gives one ``lem;n1;n2;...`` line; lines
    are joined with newlines and encoded with
    self.parametres['syscoding'] before being written.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # getlemuces() queries the database: call it once per lemma rather
        # than once per (lemma, class) pair.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() replaces the backtick operator, which no longer
            # exists in Python 3.
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
530 def make_etoiles(self) :
532 for uci in self.ucis :
533 etoiles.update(uci.etoiles[1:] + uci.paras)
536 def make_etoiles_dict(self) :
537 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
539 for etoile in etoiles :
540 et = etoile.split('_')
543 endet = '_'.join(et[1:])
544 if endet in det[et[0]] :
545 det[et[0]][endet] += 1
547 det[et[0]][endet] = 1
552 endet = '_'.join(et[1:])
553 det[et[0]] = {endet :1}
558 def make_etline(self, listet) :
559 etuces = [[] for et in listet]
560 for uci in self.ucis :
561 get = list(set(uci.etoiles).intersection(listet))
563 return '2 variables sur la meme ligne'
565 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile x class contingency table to ``fileout``.

    For every etoile returned by make_etoiles(), the number of its uces
    found in each class of ``ucecl`` is counted.  Each etoile gives one
    ``etoile;n1;n2;...`` line; lines are joined with newlines and encoded
    with self.parametres['syscoding'] before being written.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f:
        lines = []
        for etoile in etoiles:
            # The uces of an etoile do not depend on the class: compute
            # them once per etoile rather than once per class.
            etuces = set(self.getucesfrometoile(etoile))
            # repr() replaces the backtick operator, which no longer
            # exists in Python 3.
            counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
575 def make_colored_corpus(self) :
577 for i, lc in enumerate(self.lc) :
580 for uce in self.lc0 :
582 color = ['black'] + colors[len(self.lc) - 1]
584 <meta http-equiv="content-Type" content="text/html; charset=%s" />
586 ''' % sys.getdefaultencoding()
587 res = self.getalluces()
592 if self.iduces[uce[0]].uci != actuci :
593 actuci = self.iduces[uce[0]].uci
594 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
595 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
597 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
598 return txt + '\n</body></html>'
600 def count_from_list(self, l, d) :
608 def count_from_list_cl(self, l, d, a, clnb) :
617 def find_segments(self, taille_segment, taille_limite) :
619 for uce in self.getalluces() :
621 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
622 l = [[d[val], val] for val in d if d[val] >= 3]
625 if len(l) > taille_limite :
626 l = l[-taille_limite:]
629 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
631 for uce in self.getconcorde(list_uce) :
633 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
634 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
637 if len(l) > taille_limite :
638 l = l[-taille_limite:]
641 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
643 for b, classe in enumerate(self.lc) :
644 for uce in self.getconcorde(classe) :
647 uce = [self.formes[forme].lem for forme in uce]
648 for taille_segment in range(lenmin,lenmax) :
649 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
650 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
651 with open(fileout, 'w') as f :
652 f.write('\n'.join([';'.join(line) for line in result]))
654 def make_proftype(self, outf) :
656 for lem in self.lems :
657 gram = self.lems[lem].gram
659 res[gram] = [0 for val in self.lc]
660 lemuceeff = self.getlemuceseff(lem)
661 for i, classe in enumerate(self.lc) :
662 concern = set(classe).intersection(lemuceeff.keys())
663 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
664 res = [[gram] + [`val` for val in res[gram]] for gram in res]
666 with open(outf, 'w') as f :
667 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
670 def make_ucecl_from_R(self, filein) :
671 with open(filein, 'rU') as f :
676 line = line.replace('\n', '').replace('"', '').split(';')
677 self.lc.append([int(line[0]) - 1, int(line[1])])
678 classesl = [val[1] for val in self.lc]
680 self.lc = sorted(self.lc, key=itemgetter(1))
681 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
682 self.lc0 = self.lc.pop(0)
685 def get_stat_by_cluster(self, outf) :
686 log.info('get_stat_by_cluster')
688 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
689 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
690 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
691 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
692 sets = [set(cl) for cl in self.lc]
693 for forme in self.formes :
694 formeuceeff = self.getformeuceseff(forme)
695 for i, classe in enumerate(self.lc) :
696 concern = sets[i].intersection(formeuceeff.keys())
698 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
700 if self.formes[forme].freq == 1 :
702 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
703 with open(outf, 'w') as f :
705 log.info('%f' % (time() - t1))
707 def gethapaxbyet(self, etoiles) :
708 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
710 for uce in hapaxuces :
711 if uce in hucesdict :
715 etuces = [[] for et in etoiles]
716 for uci in self.ucis :
717 get = list(set(uci.etoiles).intersection(etoiles))
719 return '2 variables sur la meme ligne'
721 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
722 etuces = [set(val) for val in etuces]
723 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
725 def gethapaxuces(self) :
726 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
727 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
729 for i,uce in enumerate(hapaxuces) :
730 if uce in hucesdict :
731 hucesdict[uce][0] += 1
732 hucesdict[uce][1].append(hapax[i])
734 hucesdict[uce] = [1,[hapax[i]]]
736 for uce in hucesdict :
737 if hucesdict[uce][0] in huces :
738 huces[hucesdict[uce][0]].append(uce)
740 huces[hucesdict[uce][0]] = [uce]
741 huces = zip(huces, huces.values())
742 huces.sort(reverse=True)
746 for nb in huces[0:4] :
747 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
749 res = self.getconcorde([uce])
751 ucetxt = ' ' + row[1] + ' '
753 for hap in hucesdict[uce][1] :
754 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
755 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
756 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
757 txt += '<p>'+ucetxt+'</p>\n'
761 with open('/tmp/testhapxuce.html','w') as f :
766 def __init__(self, corpus) :
767 ucinb = corpus.getucinb()
768 ucisize = corpus.getucisize()
769 ucimean = float(sum(ucisize))/float(ucinb)
770 detoile = corpus.make_etoiles_dict()
774 def __init__(self, iduci, line, paraset = None) :
776 self.etoiles = line.split()
778 if paraset is not None :
779 self.paras = paraset.split()
784 def __init__(self, iduce, idpara, iduci) :
790 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
796 if freq is not None :
802 def __init__(self, parent, forme) :
803 self.formes = {forme.ident : forme.freq}
804 self.gram = forme.gram
805 self.freq = forme.freq
def add_forme(self, forme):
    """Register one more form under this lemma.

    The form's frequency is stored in self.formes under the form's ident
    and added to the lemma's total frequency.
    """
    ident = forme.ident
    self.formes[ident] = forme.freq
    self.freq += forme.freq
812 def decouperlist(chaine, longueur, longueurOptimale) :
814 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
815 Si on trouve un '$', c'est fini.
816 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
818 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
819 dsep = dict([[val[0],val[1]] for val in separateurs])
820 trouve = False # si on a trouvé un bon séparateur
821 iDecoupe = 0 # indice du caractere ou il faut decouper
823 longueur = min(longueur, len(chaine) - 1)
824 chaineTravail = chaine[:longueur + 1]
826 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
829 indice = chaineTravail.index(u'$')
831 iDecoupe = indice - 1
836 caractere = chaineTravail[nbCar]
837 distance = abs(longueurOptimale - nbCar) + 1
838 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
839 if caractere in dsep :
840 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
841 meilleur[0] = caractere
842 meilleur[1] = dsep[caractere]
847 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
849 meilleur[1] = dsep[' ']
856 #if meilleur[0] != ' ' :
857 # fin = chaine[iDecoupe + 1:]
858 # retour = chaineTravail[:iDecoupe]
860 fin = chaine[iDecoupe + 1:]
861 retour = chaineTravail[:iDecoupe + 1]
862 return len(retour) > 0, retour, fin
863 # si on a rien trouvé
864 return False, chaine, ''
866 def testetoile(line) :
867 return line.startswith(u'****')
870 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append a ``$`` token to the list."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
875 def prep_txtcharact(txt) :
880 Class for building a corpus
882 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
883 log.info('begin building corpus...')
884 self.lexique = lexique
885 self.expressions = expressions
887 self.corpus = Corpus(self, parametres_corpus)
890 self.lim = parametres_corpus.get('lim', 1000000)
891 self.encoding = parametres_corpus['encoding']
892 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
893 self.corpus.pathout.createdir(parametres_corpus['pathout'])
894 self.corpus.parametres['uuid'] = str(uuid4())
895 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
896 self.corpus.parametres['type'] = 'corpus'
897 if self.corpus.parametres['keep_ponct'] :
898 self.ponctuation_espace = [' ', '']
900 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
902 self.tolist = self.corpus.parametres.get('tolist', 0)
909 def prep_makeuce(self) :
910 method = self.corpus.parametres.get('ucemethod', 0)
912 self.decouper = decouperlist
913 self.prep_txt = prep_txtlist
914 self.ucesize = self.corpus.parametres.get('ucesize', 40)
916 self.decouper = decoupercharact
917 self.prep_txt = prep_txtcharact
918 self.ucesize = self.corpus.parametres.get('ucesize', 240)
919 log.info('method uce : %s' % method)
924 self.read_corpus(self.infile)
925 except Warning, args :
926 log.info('pas kool %s' % args)
930 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
931 self.time = time() - t1
933 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
934 log.info('time : %f' % (time() - t1))
937 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
938 self.cf = self.conn_f.cursor()
939 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
940 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
942 self.cf = self.conn_f.cursor()
943 self.cf.execute('PRAGMA temp_store=MEMORY;')
944 self.cf.execute('PRAGMA journal_mode=MEMORY;')
945 self.cf.execute('PRAGMA synchronous = OFF;')
946 self.cf.execute('begin')
947 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
948 self.c = self.conn.cursor()
949 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
951 self.c = self.conn.cursor()
952 self.c.execute('PRAGMA temp_store=MEMORY;')
953 self.c.execute('PRAGMA journal_mode=MEMORY;')
954 self.c.execute('PRAGMA synchronous = OFF;')
955 self.c.execute('begin')
958 #commit index and close db
961 self.cf.execute('CREATE INDEX iduces ON uces (id);')
962 self.cf.execute('CREATE INDEX ideff ON eff (id);')
966 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
967 self.ccorpus = self.conn_corpus.cursor()
968 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
969 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
970 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
971 self.conn_corpus.commit()
972 self.ccorpus = self.conn_corpus.cursor()
973 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
974 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
975 self.ccorpus.execute('PRAGMA synchronous = OFF;')
976 self.ccorpus.execute('begin')
978 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
979 self.conn_corpus.commit()
980 self.conn_corpus.close()
981 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill self.cleans with the cleaning callables enabled in the corpus
    parameters.

    The callables are appended in a fixed order: dolower, firstclean,
    docharact, make_expression, doapos, dotiret.  Every option except
    'charact' defaults to enabled when it is absent; 'charact' must be
    present.  When 'charact' is enabled, self.rule is set from the
    'keep_caract' parameter or from the built-in default.
    """
    settings = self.corpus.parametres
    for option, cleaner in (('lower', 'dolower'), ('firstclean', 'firstclean')):
        if settings.get(option, 1):
            self.cleans.append(getattr(self, cleaner))
    if settings['charact']:
        self.rule = settings.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    for option, cleaner in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')):
        if settings.get(option, 1):
            self.cleans.append(getattr(self, cleaner))
998 def make_expression(self,txt) :
999 for expression in self.expressions:
1000 if expression in txt :
1001 txt = txt.replace(expression, self.expressions[expression][0])
1004 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by the class built from
    self.rule with a single space.

    self.rule is placed between ``[`` and ``]+`` to form the pattern.
    """
    pattern = u"[" + self.rule + "]+"
    return re.compile(pattern).sub(' ', txt)
def doapos(self, txt):
    """Return ``txt`` with every apostrophe replaced by a space."""
    apostrophe = u"'"
    return txt.replace(apostrophe, u' ')
def dotiret(self, txt):
    """Return ``txt`` with every hyphen replaced by a space."""
    hyphen = u'-'
    return txt.replace(hyphen, u' ')
def firstclean(self, txt):
    """Normalise typography and isolate punctuation marks with spaces.

    In order: the typographic apostrophe becomes a plain one, the ``œ``
    ligature becomes ``oe``, three dots become the ``£$£`` marker, each of
    ``? . ! , ; :`` is surrounded by spaces, and the single-character
    ellipsis becomes the ``£$£`` marker as well.
    """
    # Every literal is unicode: mixing a non-ASCII byte string into a
    # unicode replace makes Python 2 depend on the default encoding.
    txt = txt.replace(u'’', u"'")
    txt = txt.replace(u'œ', u'oe')
    # Three dots must be handled before the single '.' below.
    txt = txt.replace(u'...', u' £$£ ')
    for mark in (u'?', u'.', u'!', u',', u';', u':'):
        txt = txt.replace(mark, u' ' + mark + u' ')
    return txt.replace(u'…', u' £$£ ')
1023 def make_cleans(self, txt) :
1024 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory form/segment index to the database.

    self.corpus.idformesuces maps a form id to a dict of
    {segment id: count}. For every form two rows are written through
    self.cf: one in 'uces' with the space-separated segment ids and one
    in 'eff' with the matching space-separated counts. The index is then
    reset to an empty dict. Nothing is written when the index is empty.

    repr() replaces the backtick syntax: it yields the same text and,
    unlike backticks, is still valid in Python 3.
    """
    index = self.corpus.idformesuces
    if index != {}:
        log.info('backup %i' % len(index))
        touce = []
        toeff = []
        for forme, counts in index.items():
            # keys() and values() of one dict are in matching order, so
            # the n-th id in 'uces' pairs with the n-th count in 'eff'.
            touce.append((repr(forme), ' '.join([repr(val) for val in counts.keys()])))
            toeff.append((repr(forme), ' '.join([repr(val) for val in counts.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
def backup_corpus(self):
    """Write texts, segment links and forms to the corpus database.

    Through self.ccorpus, inserts:
      * one 'etoiles' row per text: its ident, its space-joined etoiles
        and its space-joined paras;
      * one 'luces' row per segment: repr of text ident, paragraph and
        segment ident;
      * one 'formes' row per form: repr of ident, the form itself, its
        lem, its gram and repr of its freq.
    The elapsed time is logged at the end.

    The start time is taken here so the final log line always has a
    value to subtract from, and repr() replaces the backtick syntax
    (same text, still valid in Python 3).
    """
    log.info('start backup corpus')
    t = time()
    execute = self.ccorpus.execute
    for uci in self.corpus.ucis:
        execute('INSERT INTO etoiles VALUES (?,?,?);',
                (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras)))
        for uce in uci.uces:
            execute('INSERT INTO luces VALUES (?,?,?);',
                    (repr(uci.ident), repr(uce.para), repr(uce.ident)))
    formes = self.corpus.formes
    for forme in formes:
        word = formes[forme]
        execute('INSERT INTO formes VALUES (?,?,?,?,?);',
                (repr(word.ident), forme, word.lem, word.gram, repr(word.freq)))
    log.info('%f' % (time() - t))
def dofinish(self):
    """Store the summary statistics of the build in the corpus parameters.

    Sets 'date' (current time in ctime format), 'time' (self.time,
    read as seconds, rendered as hours/minutes/seconds), 'ucinb',
    'ucenb', 'occurrences', 'formesnb' and 'hapax' (hapax count with its
    share of the forms and of the occurrences).

    An empty corpus (no forms or no occurrences) reports 0.00 % instead
    of raising ZeroDivisionError.
    """
    corpus = self.corpus
    parametres = corpus.parametres

    parametres['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    parametres['ucinb'] = corpus.getucinb()
    parametres['ucenb'] = corpus.getucenb()
    parametres['occurrences'] = corpus.gettotocc()
    formesnb = len(corpus.formes)
    parametres['formesnb'] = formesnb

    hapaxnb = corpus.gethapaxnb()
    occurrences = parametres['occurrences']
    # Guard both ratios: an empty corpus has nothing to divide by.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Corpus builder that reads a text file line by line: lines accepted by
# self.testuci open a new text (Uci), lines starting with '-*' add a
# paragraph marker to the current text, and accumulated text is handed to
# treattxt to be cleaned, segmented and stored.
# NOTE(review): this listing is an excerpt -- several statements between the
# visible ones (initialisations, else branches, loop headers) are not shown,
# so the exact nesting has to be confirmed against the full file.
1064 class BuildFromAlceste(BuildCorpus) :
# Read infile and fill self.corpus.ucis.
# Exceptions raised by the visible lines: Exception('paragrapheOT'),
# Exception("EmptyText"), Exception('TextBeforeTextMark') and, when decoding
# the file fails, Exception("CorpusEncoding").
1065 def read_corpus(self, infile) :
# Progress feedback only when a dialog was supplied.
1066 if self.dlg is not None :
1067 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' picks the predicate that recognises a text-mark line:
# 0 -> testetoile, 1 -> testint.
1070 if self.corpus.parametres['ucimark'] == 0 :
1071 self.testuci = testetoile
1072 elif self.corpus.parametres['ucimark'] == 1 :
1073 self.testuci = testint
# The file is decoded with self.encoding while it is iterated.
1079 with codecs.open(infile, 'r', self.encoding) as f :
1080 for linenb, line in enumerate(f) :
# Drop the line terminator only; other whitespace is kept.
1081 line = line.rstrip('\n\r')
# Text-mark line: a new text starts here.
1082 if self.testuci(line) :
# Pending text is processed with iduci - 1, i.e. for the previous text.
1085 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1087 self.corpus.ucis.append(Uci(iduci, line))
# A previous text without any segment is logged and removed.
1090 if self.corpus.ucis[-1].uces == [] :
1091 log.info(u'Empty text : %i' % linenb)
1093 self.corpus.ucis.pop()
1094 #raise Exception("EmptyText %i" % linenb)
1095 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog when (iduci + 1) is a multiple of 10.
1096 if self.dlg is not None :
1097 if not (iduci + 1) % 10 :
1098 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph-mark line.
1099 elif line.startswith(u'-*') :
1102 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
# Only the first whitespace-separated token of the line is kept as marker.
1105 self.corpus.ucis[-1].paras.append(line.split()[0])
# NOTE(review): presumably raised when a paragraph mark comes before any
# text mark -- the guarding condition is not visible; confirm.
1107 raise Exception('paragrapheOT')
# Non-blank line once a text has been opened (iduci != -1).
1108 elif line.strip() != '' and iduci != -1 :
# Process the text still pending for the current text.
1110 if txt != [] and iduci != -1 :
1111 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1114 raise Exception("EmptyText")
# Both counters left -1 means that no text and no segment was recorded.
1115 if iduci != -1 and iduce != -1:
1118 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1119 raise Exception('TextBeforeTextMark')
# A decoding failure is reported as a generic Exception("CorpusEncoding").
1120 except UnicodeDecodeError :
1121 raise Exception("CorpusEncoding")
# Clean txt, cut it into segments and record them.
# Takes the current segment and paragraph counters plus the text index;
# returns the (iduce, idpara) pair.
1123 def treattxt(self, txt, iduce, idpara, iduci) :
# 'ucemethod' == 2 with 'douce' set: the items of txt are joined with a
# marker word, cleaned as one string, and split again on that marker.
1124 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1125 txt = 'laphrasepoursplitter'.join(txt)
1126 txt = self.make_cleans(txt)
# Tokens listed in self.ponctuation_espace are dropped.
1127 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1128 ucetxt = txt.split('laphrasepoursplitter')
# Other case: clean first, then let make_uces do the segmentation.
1131 txt = self.make_cleans(txt)
1132 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1133 if self.corpus.ucis[-1].paras == [] :
# Each segment is attached to the last text and stored in the 'uces' table.
1137 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1138 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1139 if not self.tolist :
# Updates the form counts of the corpus for this word.
1145 self.corpus.add_word(word)
1146 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): presumably triggers an intermediate backup once self.last
# exceeds self.lim -- the branch body is not visible; confirm.
1147 if self.last > self.lim :
1150 return iduce, idpara
# Cut txt into segments with self.decouper.
1152 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1153 txt = ' '.join(txt.split())
# decouper receives self.ucesize + 15 and self.ucesize as its two size
# arguments and returns a (reste, texte_uce, suite) triple.
1156 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Tokens listed in self.ponctuation_espace are left out of the segment text.
1158 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Next cut, starting from the remainder returned by the previous one.
1161 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1162 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-segment result: the whole text minus the punctuation tokens.
# NOTE(review): presumably the branch taken when douce is false -- confirm.
1167 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
# Dependency notes -- each method is followed by the helper it relies on:
# decouper (list_sep)
# make_uces (decouper)
# treattxt (make_uces)
# Collect the corpus import settings from the user before a build.
# parent supplies filename, UserConfigPath and DictPath; dlg is an optional
# dialog (checked through self.dlg at the end).
# NOTE(review): some statements of this method are not part of this listing
# (the assignment of self.dlg among them), and the nesting under the
# 'if self.res == 5100' test has to be confirmed against the full file.
1175 def __init__(self, parent, dlg = None) :
1176 self.parent = parent
# Defaults come from the 'corpus' section of corpus.cfg in the user config dir.
1178 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
# Output location derived from the input file name.
1179 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
# Preferences dialog, centred on the parent and labelled with the input path.
1180 dial = CorpusPref(parent, parametres)
1181 dial.CenterOnParent()
1182 dial.txtpath.SetLabel(parent.filename)
1183 #dial.repout_choices.SetValue(parametres['pathout'])
1184 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the toolkit's "OK" return code
# (wx.ID_OK) -- confirm.
1185 if self.res == 5100 :
# The settings chosen in the dialog replace the defaults read above.
1186 parametres = dial.doparametres()
1187 parametres['originalpath'] = parent.filename
1188 PathOut().createdir(parametres['pathout'])
# Load the lexicon for the selected language.
1189 ReadLexique(self.parent, lang = parametres['lang'])
# Expression dictionary: the DictPath entry '<lang>_exp', falling back to
# the literal 'french_exp' when that key is absent.
1190 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1191 self.parametres = parametres
1193 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's input file and return it.

    A BuildFromAlceste builder is given the parent's filename, the stored
    parameters, the parent's lexicon and expressions, and the dialog.
    The result is the builder's corpus attribute.
    """
    app = self.parent
    builder = BuildFromAlceste(
        app.filename,
        self.parametres,
        app.lexique,
        app.expressions,
        dlg = self.dlg,
    )
    return builder.corpus
# Script entry point: builds a corpus with hard-coded database file names.
# NOTE(review): infile and encoding are not defined in the visible lines;
# presumably they are set by a statement not shown in this listing -- confirm.
1201 if __name__ == '__main__' :
# Database file names plus system and input encodings.
1203 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1204 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)