1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids stored in the formes database for one form.

    wordid -- a form identifier, or the form itself as a string; a string
              is first translated to its identifier through self.formes.

    Each selected row of the 'uces' table holds either a single integer or
    a whitespace-separated list of integers. All rows are flattened into
    one list of ints, in row order.
    """
    # basestring only exists on Python 2; fall back to str elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(wordid, string_types):
        wordid = self.formes[wordid].ident
    # repr() is the portable spelling of the Python 2-only backtick operator.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        if isinstance(row[0], int):
            uces.append(row[0])
        else:
            uces.extend(int(val) for val in row[0].split())
    return uces
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for all the forms of a lemma.

    lem -- key into self.lems; the identifiers of its forms are the keys
           of self.lems[lem].formes.

    The rows of the 'uces' table matching those identifiers hold either a
    single integer or a whitespace-separated list of integers. They are
    flattened and de-duplicated; the order of the returned list is not
    specified.
    """
    # repr() is the portable spelling of the Python 2-only backtick operator.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = set()
    for row in self.cformes.execute(query):
        if isinstance(row[0], int):
            uces.add(row[0])
        else:
            uces.update(int(val) for val in row[0].split())
    return list(uces)
def getlemucis(self, lem):
    """Return the distinct uci values of the uces found for a lemma.

    Every uce id given by getlemuces(lem) is resolved through
    getucefromid() and its 'uci' attribute collected. The order of the
    returned list is not specified.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the uces of self.lc[cluster] that getlemuces(lem) also returns."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the 'freq' attribute of the entry stored for lem in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of self.idformes for formeid.

    The self.idformes index is built on first use by make_idformes().
    """
    index_missing = self.idformes is None
    if index_missing:
        self.make_idformes()
    return self.idformes[formeid]
def gettotocc(self):
    """Return the sum of the 'freq' attribute over every entry of self.formes."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed sizes of its uces.

    The sizes come from getucesize() and are sliced by uce ident, from the
    ident of the uci's first uce to the ident of its last uce (inclusive).
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:(last + 1)]))
    return sizes
def getucesize(self):
    """Return the number of whitespace-separated tokens of each uce.

    Rows come from getalluces(); the text is read from column 1 of each row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Run a query selecting the rows of the 'uces' table whose id is in uces.

    uces -- iterable of uce ids; they are interpolated into the SQL text,
            so they are expected to be integers from the corpus itself.

    Returns whatever self.cuces.execute() returns for that query.
    """
    # repr() is the portable spelling of the Python 2-only backtick operator.
    idlist = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces that getworduces(word) gives."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces that getlemuces(lem) gives."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Run 'SELECT * FROM uces' on self.cuces and return the result of execute()."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose etoiles contain etoile."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
213 def getetoileuces(self) :
214 log.info('get uces etoiles')
217 for uci in self.ucis :
218 etoiles = uci.etoiles[1:]
220 if et in etoileuces :
221 etoileuces[et] += [uce.ident for uce in uci.uces]
223 etoileuces[et] = [uce.ident for uce in uci.uces]
225 for et in uci.paras :
226 if et in etoileuces :
227 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
229 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid):
    """Return the entry of self.iduces for uceid.

    The self.iduces index is built on first use by make_iduces().
    """
    index_missing = self.iduces is None
    if index_missing:
        self.make_iduces()
    return self.iduces[uceid]
def gethapaxnb(self):
    """Count the entries of self.formes whose freq equals 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Count the entries of self.lems whose 'act' attribute equals key."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
244 # def make_lems(self, lem = True) :
245 # log.info('make lems')
247 # for forme in self.formes :
248 # if self.formes[forme].lem in self.lems :
249 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
250 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
252 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the etoiles of the uci that owns the uce identified by uceid.

    A uce ident -> uci ident map is built once and cached in self.uceuci;
    the uci ident is then used as an index into self.ucis.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    return self.ucis[self.uceuci[uceid]].etoiles
258 def make_lems(self, lem = True) :
259 log.info('make lems')
262 for forme in self.formes :
263 if self.formes[forme].lem in self.lems :
264 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
265 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
267 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
269 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build self.idformes, mapping each form's ident to the form object."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build self.iduces (uce ident -> uce object) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
278 def make_lexitable(self, mineff, etoiles, gram = 0) :
283 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
284 etuces = [[] for et in etoiles]
285 for uci in self.ucis :
286 get = list(set(uci.etoiles).intersection(etoiles))
288 log.info('2 variables sur une ligne')
290 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
291 etuces = [set(val) for val in etuces]
294 deff = self.getlemuceseff(lem)
296 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
297 tab.insert(0, [''] + etoiles)
300 def make_efftype_from_etoiles(self, etoiles) :
302 etuces = [[] for et in etoiles]
303 for uci in self.ucis :
304 get = list(set(uci.etoiles).intersection(etoiles))
306 return '2 variables sur la meme ligne'
308 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
309 etuces = [set(val) for val in etuces]
310 for lem in self.lems :
311 deff = self.getlemuceseff(lem)
313 gram = self.lems[lem].gram
315 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
317 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
318 tabout = [[gram] + dtype[gram] for gram in dtype]
319 tabout.insert(0, [''] + etoiles)
322 def make_uceactsize(self, actives) :
323 res = self.getalluces()
326 deff = self.getlemuceseff(lem)
328 ucesize[uce] = ucesize.get(uce, 0) + 1
331 def make_uc(self, actives, lim1, lim2) :
332 uceactsize = self.make_uceactsize(actives)
338 for uce in [uce for uci in self.ucis for uce in uci.uces] :
339 if uce.para == lastpara :
341 last1 += uceactsize.get(uce.ident,0)
342 uc1[-1].append(uce.ident)
344 uc1.append([uce.ident])
347 last2 += uceactsize.get(uce.ident, 0)
348 uc2[-1].append(uce.ident)
350 uc2.append([uce.ident])
353 last1 = uceactsize.get(uce.ident, 0)
354 last2 = uceactsize.get(uce.ident, 0)
356 uc1.append([uce.ident])
357 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and uce/uc listings.

    actives          -- passed to make_uc() and write_ucmatrix()
    sizeuc1, sizeuc2 -- the two limits passed to make_uc()
    uc1out, uc2out   -- output paths given to write_ucmatrix()
    listuce1out, listuce2out -- paths of the 'uce;uc' listings

    Each listing has a 'uce;uc' header followed by one 'uce;index' line per
    uce, where index is the position of the group holding that uce.
    Returns the number of groups in each of the two groupings.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # repr() is the portable spelling of the Python 2-only backtick operator.
    listuce1 = [['uce', 'uc']] + [[repr(uce), repr(i)] for i, ucl in enumerate(uc1) for uce in ucl]
    listuce2 = [['uce', 'uc']] + [[repr(uce), repr(i)] for i, ucl in enumerate(uc2) for uce in ucl]
    with open(listuce1out, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in listuce1]))
    with open(listuce2out, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in listuce2]))
    return len(uc1), len(uc2)
373 def write_ucmatrix(self, uc, actives, fileout) :
374 log.info('write uc matrix %s' % fileout)
375 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
378 with open(fileout + '~', 'w+') as f :
379 for i, lem in enumerate(actives) :
380 for uce in self.getlemuces(lem):
381 if (uces_uc[uce], i) not in deja_la :
383 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
384 deja_la[(uces_uc[uce], i)] = 0
386 with open(fileout, 'w') as ffin :
387 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
390 os.remove(fileout + '~')
393 def export_corpus(self, outf) :
394 #outf = 'export_corpus.txt'
396 res = self.getalluces()
400 with open(outf,'w') as f :
402 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
403 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
404 elif self.iduces[uce[0]].uci != actuci :
405 actuci = self.iduces[uce[0]].uci
406 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
407 actpara = self.iduces[uce[0]].para
408 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
411 actpara = self.iduces[uce[0]].para
412 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
413 elif self.iduces[uce[0]].para != actpara :
414 actpara = self.iduces[uce[0]].para
416 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
418 def export_corpus_classes(self, outf, alc = True, lem = False) :
420 for i, lc in enumerate(self.lc) :
423 for uce in self.lc0 :
425 res = self.getalluces()
427 with open(outf, 'w') as f :
430 actuci = self.iduces[uce[0]].uci
432 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
434 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
436 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
437 f.write(etline.encode(self.parametres['syscoding']) + '\n')
438 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
440 def export_classe(self, outf, classe, lem = False) :
441 sts = self.lc[classe - 1]
442 res = self.getconcorde(sts)
444 with open(outf, 'w') as f :
447 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
449 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
450 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
452 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
453 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
455 with open(outfile + '~', 'w+') as f :
456 for i, lem in enumerate(actives) :
457 for uce in sorted(self.getlemuces(lem)) :
459 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
461 with open(outfile, 'w') as ffin :
462 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
465 os.remove(outfile + '~')
467 with open(listuce, 'w') as f :
468 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
470 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
471 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
473 with open(outfile + '~', 'w+') as f :
474 for i, lem in enumerate(actives) :
475 for uci in sorted(self.getlemucis(lem)) :
477 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
479 with open(outfile, 'w') as ffin :
480 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
483 os.remove(outfile + '~')
485 with open(listuci, 'w') as f :
486 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
488 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
489 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
491 duces = dict([[uce, i] for i, uce in enumerate(uces)])
492 with open(outfile + '~', 'w+') as f :
493 for i, lem in enumerate(actives) :
494 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
496 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
498 with open(outfile, 'w') as ffin :
499 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
502 os.remove(outfile + '~')
504 def make_table_with_classe(self, uces, list_act) :
505 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
506 uces = dict([[uce, i] for i, uce in enumerate(uces)])
507 for i, lem in enumerate(list_act) :
508 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
510 table_uce[uces[uce]][i] = 1
511 table_uce.insert(0, list_act)
514 def parse_active(self, gramact, gramsup = None) :
515 log.info('parse actives')
516 for lem in self.lems :
517 if lem.startswith('_') and lem.endswith('_') :
518 self.lems[lem].act = 2
519 elif self.lems[lem].gram in gramact :
520 self.lems[lem].act = 1
521 elif gramsup is not None and self.lems[lem].gram not in gramact:
522 if self.lems[lem].gram in gramsup :
523 self.lems[lem].act = 2
525 self.lems[lem].act = 0
527 self.lems[lem].act = 2
529 def make_actives_limit(self, limit, key = 1) :
530 if self.idformes is None :
532 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
534 def make_actives_nb(self, nbmax, key) :
535 log.info('make_actives_nb : %i - %i' % (nbmax,key))
536 if self.idformes is None :
538 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
539 self.activenb = len(allactives)
540 allactives = sorted(allactives, reverse = True)
541 if len(allactives) <= nbmax :
542 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
543 return [val[1] for val in allactives], allactives[-1][0]
545 effs = [val[0] for val in allactives]
546 if effs.count(effs[nbmax - 1]) > 1 :
547 lim = effs[nbmax - 1] + 1
551 stop = effs.index(lim)
558 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
559 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class count table to fileout.

    actives -- lemmas to tabulate
    ucecl   -- one collection of uce ids per class
    fileout -- path of the output file

    One line is written per lemma: the lemma followed by, for each class,
    the number of that class's uces returned by getlemuces(lem), joined
    with ';'. Lemmas whose counts sum to less than 3 are left out. The text
    is encoded with self.parametres['syscoding'] before being written.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # Query the uces once per lemma rather than once per class.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() is the portable spelling of the Python 2-only backticks.
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
568 def make_etoiles(self) :
570 for uci in self.ucis :
571 etoiles.update(uci.etoiles[1:])
574 def make_etoiles_dict(self) :
575 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
577 for etoile in etoiles :
578 et = etoile.split('_')
581 endet = '_'.join(et[1:])
582 if etoile in det[et[0]] :
583 det[et[0]][etoile] += 1
585 det[et[0]][etoile] = 1
590 endet = '_'.join(et[1:])
591 det[et[0]] = {etoile :1}
596 def make_etline(self, listet) :
597 etuces = [[] for et in listet]
598 for uci in self.ucis :
599 get = list(set(uci.etoiles).intersection(listet))
601 return '2 variables sur la meme ligne'
603 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class count table to fileout.

    ucecl   -- one collection of uce ids per class
    fileout -- path of the output file

    Etoiles come from getetoileuces(); those with a single uce or none are
    dropped. One line is written per remaining etoile: its name followed
    by, for each class, the number of its uces found in that class, joined
    with ';'. The text is encoded with self.parametres['syscoding'].
    """
    log.info('etoiles/classes')
    etoileuces = self.getetoileuces()
    etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
    lines = []
    for et in etoileuces:
        # Build the set once per etoile rather than once per class.
        etset = set(etoileuces[et])
        # repr() is the portable spelling of the Python 2-only backticks.
        lines.append(';'.join([et] + [repr(len(etset.intersection(classe))) for classe in ucecl]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
612 #etoiles = self.make_etoiles()
613 #with open(fileout, 'w') as f :
614 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
616 def make_colored_corpus(self) :
618 for i, lc in enumerate(self.lc) :
621 for uce in self.lc0 :
623 color = ['black'] + colors[len(self.lc) - 1]
625 <meta http-equiv="content-Type" content="text/html; charset=%s" />
627 ''' % sys.getdefaultencoding()
628 res = self.getalluces()
633 if self.iduces[uce[0]].uci != actuci :
634 actuci = self.iduces[uce[0]].uci
635 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
636 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
638 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
639 return txt + '\n</body></html>'
641 def count_from_list(self, l, d) :
649 def count_from_list_cl(self, l, d, a, clnb) :
658 def find_segments(self, taille_segment, taille_limite) :
660 for uce in self.getalluces() :
662 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
663 l = [[d[val], val] for val in d if d[val] >= 3]
666 if len(l) > taille_limite :
667 l = l[-taille_limite:]
670 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
672 for uce in self.getconcorde(list_uce) :
674 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
675 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
678 if len(l) > taille_limite :
679 l = l[-taille_limite:]
682 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
684 for b, classe in enumerate(self.lc) :
685 for uce in self.getconcorde(classe) :
688 uce = [self.formes[forme].lem for forme in uce]
689 for taille_segment in range(lenmin,lenmax) :
690 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
691 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
692 with open(fileout, 'w') as f :
693 f.write('\n'.join([';'.join(line) for line in result]))
695 def make_proftype(self, outf) :
697 for lem in self.lems :
698 gram = self.lems[lem].gram
700 res[gram] = [0 for val in self.lc]
701 lemuceeff = self.getlemuceseff(lem)
702 for i, classe in enumerate(self.lc) :
703 concern = set(classe).intersection(lemuceeff.keys())
704 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
705 res = [[gram] + [`val` for val in res[gram]] for gram in res]
707 with open(outf, 'w') as f :
708 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
711 def make_ucecl_from_R(self, filein) :
712 with open(filein, 'rU') as f :
717 line = line.replace('\n', '').replace('"', '').split(';')
718 self.lc.append([int(line[0]) - 1, int(line[1])])
719 classesl = [val[1] for val in self.lc]
721 self.lc = sorted(self.lc, key=itemgetter(1))
722 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
723 self.lc0 = self.lc.pop(0)
726 def get_stat_by_cluster(self, outf, lclasses = None) :
727 log.info('get_stat_by_cluster')
728 if lclasses is None :
731 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
732 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
733 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
734 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
735 sets = [set(cl) for cl in lclasses]
736 for forme in self.formes :
737 formeuceeff = self.getformeuceseff(forme)
738 for i, classe in enumerate(lclasses) :
739 concern = sets[i].intersection(formeuceeff.keys())
741 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
743 if self.formes[forme].freq == 1 :
745 log.info('%f' % (time() - t1))
746 if outf is not None :
747 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
748 with open(outf, 'w') as f :
751 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Return the per-etoile statistics computed by get_stat_by_cluster().

    outf    -- accepted for interface compatibility; not used here
               (get_stat_by_cluster is called with None as its output)
    etoiles -- list of etoiles; the uces of each one, as given by
               getucesfrometoile(), form one class

    Returns one row per etoile: the etoile followed by the values that
    get_stat_by_cluster() returned for its class. The table used to be
    computed and then discarded.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
    return stats
758 def gethapaxbyet(self, etoiles) :
759 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
761 for uce in hapaxuces :
762 if uce in hucesdict :
766 etuces = [[] for et in etoiles]
767 for uci in self.ucis :
768 get = list(set(uci.etoiles).intersection(etoiles))
770 return '2 variables sur la meme ligne'
772 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
773 etuces = [set(val) for val in etuces]
774 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
776 def gethapaxuces(self) :
777 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
778 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
780 for i,uce in enumerate(hapaxuces) :
781 if uce in hucesdict :
782 hucesdict[uce][0] += 1
783 hucesdict[uce][1].append(hapax[i])
785 hucesdict[uce] = [1,[hapax[i]]]
787 for uce in hucesdict :
788 if hucesdict[uce][0] in huces :
789 huces[hucesdict[uce][0]].append(uce)
791 huces[hucesdict[uce][0]] = [uce]
792 huces = zip(huces, huces.values())
793 huces.sort(reverse=True)
797 for nb in huces[0:4] :
798 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
800 res = self.getconcorde([uce])
802 ucetxt = ' ' + row[1] + ' '
804 for hap in hucesdict[uce][1] :
805 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
806 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
807 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
808 txt += '<p>'+ucetxt+'</p>\n'
812 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write every form with its lemma, gram type and frequency to fileout.

    fileout   -- path of the output file
    syscoding -- encoding applied to the text before writing

    Lines are tab-separated (form, lem, gram, freq) and sorted by
    decreasing frequency; ties follow the descending order of the
    remaining fields.
    """
    listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes]
    listformes.sort(reverse=True)
    # repr() is the portable spelling of the Python 2-only backtick operator.
    lines = ['\t'.join(forme[1:] + [repr(forme[0])]) for forme in listformes]
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(syscoding))
822 def export_lems(self, fileout, syscoding) :
824 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
826 with open(fileout, 'w') as f :
827 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
833 def __init__(self, corpus) :
834 ucinb = corpus.getucinb()
835 ucisize = corpus.getucisize()
836 ucimean = float(sum(ucisize))/float(ucinb)
837 detoile = corpus.make_etoiles_dict()
841 def __init__(self, iduci, line, paraset = None) :
843 self.etoiles = line.split()
845 if paraset is not None :
846 self.paras = paraset.split()
851 def __init__(self, iduce, idpara, iduci) :
857 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
863 if freq is not None :
869 def __init__(self, parent, forme) :
870 self.formes = {forme.ident : forme.freq}
871 self.gram = forme.gram
872 self.freq = forme.freq
def add_forme(self, forme):
    """Register a form in self.formes and add its frequency to self.freq.

    self.formes maps the form's ident to its freq.
    """
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
879 def decouperlist(chaine, longueur, longueurOptimale) :
881 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
882 Si on trouve un '$', c'est fini.
883 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
885 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
886 dsep = dict([[val[0],val[1]] for val in separateurs])
887 trouve = False # si on a trouvé un bon séparateur
888 iDecoupe = 0 # indice du caractere ou il faut decouper
890 longueur = min(longueur, len(chaine) - 1)
891 chaineTravail = chaine[:longueur + 1]
893 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
896 indice = chaineTravail.index(u'$')
898 iDecoupe = indice - 1
903 caractere = chaineTravail[nbCar]
904 distance = abs(longueurOptimale - nbCar) + 1
905 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
906 if caractere in dsep :
907 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
908 meilleur[0] = caractere
909 meilleur[1] = dsep[caractere]
914 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
916 meilleur[1] = dsep[' ']
923 #if meilleur[0] != ' ' :
924 # fin = chaine[iDecoupe + 1:]
925 # retour = chaineTravail[:iDecoupe]
927 fin = chaine[iDecoupe + 1:]
928 retour = chaineTravail[:iDecoupe + 1]
929 return len(retour) > 0, retour, fin
930 # si on a rien trouvé
931 return False, chaine, ''
933 def testetoile(line) :
934 return line.startswith(u'****')
937 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split txt on whitespace and append the u'$' end marker to the tokens."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
942 def prep_txtcharact(txt) :
947 Class for building a corpus
949 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
950 log.info('begin building corpus...')
951 self.lexique = lexique
952 self.expressions = expressions
954 self.corpus = Corpus(self, parametres_corpus)
957 self.lim = parametres_corpus.get('lim', 1000000)
958 self.encoding = parametres_corpus['encoding']
959 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
960 self.corpus.pathout.createdir(parametres_corpus['pathout'])
961 self.corpus.parametres['uuid'] = str(uuid4())
962 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
963 self.corpus.parametres['type'] = 'corpus'
964 if self.corpus.parametres['keep_ponct'] :
965 self.ponctuation_espace = [' ', '']
967 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
969 self.tolist = self.corpus.parametres.get('tolist', 0)
976 def prep_makeuce(self) :
977 method = self.corpus.parametres.get('ucemethod', 0)
979 self.decouper = decouperlist
980 self.prep_txt = prep_txtlist
981 self.ucesize = self.corpus.parametres.get('ucesize', 40)
983 self.decouper = decoupercharact
984 self.prep_txt = prep_txtcharact
985 self.ucesize = self.corpus.parametres.get('ucesize', 240)
986 log.info('method uce : %s' % method)
991 self.read_corpus(self.infile)
992 except Warning, args :
993 log.info('pas kool %s' % args)
997 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
998 self.time = time() - t1
1000 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1001 log.info('time : %f' % (time() - t1))
1004 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1005 self.cf = self.conn_f.cursor()
1006 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1007 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1008 self.conn_f.commit()
1009 self.cf = self.conn_f.cursor()
1010 self.cf.execute('PRAGMA temp_store=MEMORY;')
1011 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1012 self.cf.execute('PRAGMA synchronous = OFF;')
1013 self.cf.execute('begin')
1014 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1015 self.c = self.conn.cursor()
1016 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1018 self.c = self.conn.cursor()
1019 self.c.execute('PRAGMA temp_store=MEMORY;')
1020 self.c.execute('PRAGMA journal_mode=MEMORY;')
1021 self.c.execute('PRAGMA synchronous = OFF;')
1022 self.c.execute('begin')
1025 #commit index and close db
1027 self.conn_f.commit()
1028 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1029 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1033 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1034 self.ccorpus = self.conn_corpus.cursor()
1035 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1036 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1037 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1038 self.conn_corpus.commit()
1039 self.ccorpus = self.conn_corpus.cursor()
1040 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1041 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1042 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1043 self.ccorpus.execute('begin')
1044 self.backup_corpus()
1045 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1046 self.conn_corpus.commit()
1047 self.conn_corpus.close()
1048 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill self.cleans with the cleaning steps enabled in the corpus options.

    Each step is a bound method taking a text and returning the cleaned
    text. Steps are appended in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, hyphens. Every option
    is read with a default of 1, so a step is enabled unless the option
    dict switches it off explicitly.
    """
    options = self.corpus.parametres
    if options.get('lower', 1) :
        self.cleans.append(self.dolower)
    if options.get('firstclean', 1) :
        self.cleans.append(self.firstclean)
    # 'charact' is read with .get() like its siblings: a plain subscript
    # raised KeyError for option dicts that do not define the key.
    if options.get('charact', 1) :
        # Negated character class body consumed by docharact.
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    if options.get('expressions', 1) :
        self.cleans.append(self.make_expression)
    if options.get('apos', 1) :
        self.cleans.append(self.doapos)
    if options.get('tiret', 1) :
        self.cleans.append(self.dotiret)
def make_expression(self,txt) :
    """Replace every known multi-word expression found in txt.

    self.expressions maps an expression to a sequence whose first item is
    the replacement text. The rewritten text is returned so the method can
    be chained like the other cleaning steps.
    """
    for expression, entry in self.expressions.items() :
        # Cheap containment test first: most expressions are absent.
        if expression in txt :
            txt = txt.replace(expression, entry[0])
    return txt
def dolower(self, txt) :
    """Return txt converted to lower case."""
    return txt.lower()
def docharact(self, txt) :
    """Replace each run of characters matched by self.rule with one space.

    self.rule is the body of a regex character class; it is wrapped in
    ``[...]+`` so that a whole run of matching characters collapses into a
    single space.
    """
    pattern = u"[" + self.rule + u"]+"
    return re.sub(pattern, ' ', txt)
def doapos(self, txt) :
    """Return txt with every straight apostrophe replaced by a space."""
    return txt.replace(u"'", u' ')
def dotiret(self, txt) :
    """Return txt with every hyphen replaced by a space."""
    return txt.replace(u'-', u' ')
def firstclean(self, txt) :
    """Normalise typographic characters and isolate punctuation marks.

    The curly apostrophe becomes a straight one, the oe ligature is
    expanded, both ellipsis forms become the ' £$£ ' marker, and the marks
    ? . ! , ; : are surrounded by spaces so they split off as tokens.
    """
    # Order matters: the three-dot ellipsis must be rewritten before the
    # single full stop, otherwise '...' would become three isolated dots.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
def make_cleans(self, txt) :
    """Run txt through every step in self.cleans, in order.

    Each step is a callable taking the current text and returning the
    transformed text. The output of the last step is returned; when
    self.cleans is empty, txt comes back unchanged.
    """
    for clean in self.cleans :
        txt = clean(txt)
    return txt
def backup_uce(self) :
    """Flush the in-memory forme -> {uce: count} index through self.cf.

    For each forme id, one row goes to table ``uces`` (space-separated uce
    ids) and one row to table ``eff`` (the matching counts in the same
    order). The in-memory index is emptied afterwards. Nothing is written
    when the index is already empty.
    """
    index = self.corpus.idformesuces
    if not index :
        return
    log.info('backup %i' % len(index))
    # repr() replaces the backtick shorthand, which Python 3 removed.
    # Keys and values come from the same unmodified dict, so the uce ids
    # and their counts stay aligned position by position.
    touce = [(repr(forme), ' '.join([repr(uce) for uce in postings.keys()])) for forme, postings in index.items()]
    toeff = [(repr(forme), ' '.join([repr(eff) for eff in postings.values()])) for forme, postings in index.items()]
    self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
    self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
    self.corpus.idformesuces = {}
def backup_corpus(self) :
    """Write texts, segments and forms through the self.ccorpus cursor.

    One ``etoiles`` row per uci (ident, space-joined etoiles, space-joined
    paras), one ``luces`` row per uce (uci ident, para, uce ident) and one
    ``formes`` row per form (ident, forme, lem, gram, freq). The elapsed
    time is logged at the end. No commit is issued here.
    """
    log.info('start backup corpus')
    # Start of the timed section; read by the final log call.
    t = time()
    insert = self.ccorpus.execute
    for uci in self.corpus.ucis :
        insert('INSERT INTO etoiles VALUES (?,?,?);', (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras)))
        for uce in uci.uces :
            # repr() replaces the backtick shorthand removed in Python 3.
            insert('INSERT INTO luces VALUES (?,?,?);', (repr(uci.ident), repr(uce.para), repr(uce.ident)))
    for forme, word in self.corpus.formes.items() :
        insert('INSERT INTO formes VALUES (?,?,?,?,?);', (repr(word.ident), forme, word.lem, word.gram, repr(word.freq)))
    log.info('%f' % (time() - t))
def dofinish(self) :
    """Store the summary statistics of the build in the corpus options.

    Keys written: 'date', 'time', 'ucinb', 'ucenb', 'occurrences',
    'formesnb' and 'hapax'. self.time must hold the elapsed build time in
    seconds.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    # Round to whole seconds before splitting: formatting a fractional
    # remainder with %.0f could otherwise print "1m 60s".
    minutes, seconds = divmod(int(round(self.time)), 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios so an empty corpus reports 0 % rather than
    # raising ZeroDivisionError.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    occurrences = params['occurrences']
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# BuildFromAlceste: BuildCorpus subclass providing read_corpus, treattxt and
# make_uces.
# NOTE(review): the numbering embedded in these lines skips values, so some
# statements of this class appear to be missing from this view; the comments
# below describe only what the visible lines do.
1131 class BuildFromAlceste(BuildCorpus) :
# read_corpus: scan `infile` line by line and feed text-header lines,
# '-*' lines and ordinary text lines into the corpus being built.
1132 def read_corpus(self, infile) :
# Initialise the progress dialog when one was supplied.
1133 if self.dlg is not None :
1134 self.dlg.Pulse('textes : 0 - segments : 0')
# parametres['ucimark'] selects the predicate that recognises a text header
# line: 0 -> testetoile, 1 -> testint.
1137 if self.corpus.parametres['ucimark'] == 0 :
1138 self.testuci = testetoile
1139 elif self.corpus.parametres['ucimark'] == 1 :
1140 self.testuci = testint
# The input is decoded with self.encoding while it is read.
1146 with codecs.open(infile, 'r', self.encoding) as f :
1147 for linenb, line in enumerate(f) :
1148 line = line.rstrip('\n\r')
# Text header line: pending text is passed to treattxt under the previous
# text index (iduci - 1) and a new Uci is appended.
1149 if self.testuci(line) :
1152 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1154 self.corpus.ucis.append(Uci(iduci, line))
# A last Uci without any uce is logged as empty, dropped, and the new Uci is
# appended in its place.
1157 if self.corpus.ucis[-1].uces == [] :
1158 log.info(u'Empty text : %i' % linenb)
1160 self.corpus.ucis.pop()
1161 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog whenever (iduci + 1) is a multiple of 10.
1162 if self.dlg is not None :
1163 if not (iduci + 1) % 10 :
1164 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Line starting with '-*': its first whitespace-separated token is appended
# to the paras of the current Uci (presumably a paragraph marker - confirm).
1165 elif line.startswith(u'-*') :
1168 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1171 self.corpus.ucis[-1].paras.append(line.split()[0])
1173 raise Exception('paragrapheOT %i' % linenb)
# Non-blank line seen after a first text header (iduci != -1).
1174 elif line.strip() != '' and iduci != -1 :
# Pending text that belongs to an opened text is passed to treattxt.
1176 if txt != [] and iduci != -1 :
1177 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1182 self.corpus.ucis.pop()
1183 log.info(Exception("Empty text %i" % linenb))
1185 raise Exception('EmptyText %i' % linenb)
1186 if iduci != -1 and iduce != -1:
1189 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1190 raise Exception('TextBeforeTextMark %i' % linenb)
# A decoding failure is re-raised as a generic Exception('CorpusEncoding').
1191 except UnicodeDecodeError :
1192 raise Exception("CorpusEncoding")
# treattxt: clean `txt`, split it into uces, record them, and return the
# updated (iduce, idpara) counters.
1194 def treattxt(self, txt, iduce, idpara, iduci) :
# ucemethod == 2 with 'douce' set: the items of txt are joined with the
# sentinel 'laphrasepoursplitter', cleaned, stripped of the tokens listed in
# self.ponctuation_espace, then split back on the sentinel.
1195 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1196 txt = 'laphrasepoursplitter'.join(txt)
1197 txt = self.make_cleans(txt)
1198 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1199 ucetxt = txt.split('laphrasepoursplitter')
# Otherwise the cleaned text is handed to make_uces.
1202 txt = self.make_cleans(txt)
1203 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1204 if self.corpus.ucis[-1].paras == [] :
# Attach the new Uce to the current Uci and store its text through self.c;
# the id is bound as its repr() string (backtick syntax).
1208 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1209 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1210 if not self.tolist :
1216 self.corpus.add_word(word)
1217 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1218 if self.last > self.lim :
1221 return iduce, idpara
# make_uces: turn a cleaned text into a list of uce strings.
1223 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1224 txt = ' '.join(txt.split())
# self.decouper is called with the prepared text, self.ucesize + 15 and
# self.ucesize, and returns a (reste, texte_uce, suite) triple.
1227 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Tokens listed in self.ponctuation_espace are left out of the uce text.
1229 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# The same split is applied again to the returned `suite`.
1232 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1233 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-element result: the whole text minus the ponctuation_espace tokens
# (presumably the branch taken when `douce` is false - confirm).
1238 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1240 #decouper (list_sep)
1241 #make_uces (decouper)
1242 #treat_txt (make_uces)
# Constructor: loads the 'corpus' options from corpus.cfg, shows the
# CorpusPref dialog and, when the dialog is accepted, stores the chosen
# options in self.parametres and loads the lexicon and expression dictionary
# on `parent`.
# NOTE(review): the numbering embedded in these lines skips values, so some
# statements appear to be missing from this view.
1246 def __init__(self, parent, dlg = None) :
1247 self.parent = parent
# Default options come from corpus.cfg under parent.UserConfigPath.
1249 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1250 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1251 dial = CorpusPref(parent, parametres)
1252 dial.CenterOnParent()
1253 dial.txtpath.SetLabel(parent.filename)
1254 #dial.repout_choices.SetValue(parametres['pathout'])
1255 self.res = dial.ShowModal()
# 5100 is presumably the numeric value of wx.ID_OK - TODO confirm.
1256 if self.res == 5100 :
1257 parametres = dial.doparametres()
1258 parametres['originalpath'] = parent.filename
1259 PathOut().createdir(parametres['pathout'])
1260 ReadLexique(self.parent, lang = parametres['lang'])
# NOTE(review): if DictPath.get behaves like dict.get, the fallback is the
# literal string 'french_exp', not the path stored under that key - confirm.
1261 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1262 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1264 self.parent.expressions = {}
1265 self.parametres = parametres
# NOTE(review): self.dlg is read here, but no assignment to it is visible in
# this view.
1267 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus from parent.filename and return it.

    A BuildFromAlceste instance is created from the parent's file name,
    the stored options, the parent's lexicon and expression dictionary,
    and the progress dialog; its ``corpus`` attribute is returned.
    """
    builder = BuildFromAlceste(
        self.parent.filename,
        self.parametres,
        self.parent.lexique,
        self.parent.expressions,
        dlg = self.dlg)
    return builder.corpus
# Script entry point: builds a corpus directly with a minimal option dict.
# NOTE(review): `infile` and `encoding` are not assigned in the visible lines,
# and the dict defines neither 'pathout' nor 'charact' - confirm they are
# supplied before BuildCorpus needs them.
1275 if __name__ == '__main__' :
1277 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1278 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)