1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the list of uce ids recorded for one form.

    `wordid` is either a form identifier or the form itself given as a
    string; a string is first resolved to its identifier via `self.formes`.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            # the cell is already an integer: keep it unchanged
            found.append(cell)
        else:
            # otherwise the cell is a whitespace-separated string of ids
            found.extend([int(tok) for tok in cell.split()])
    return found
130 def getformeuceseff(self, formeid) :
131 if isinstance(formeid, basestring) :
132 formeid = self.formes[formeid].ident
133 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
134 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
135 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
136 res = self.cformes.execute(query)
137 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
139 for i, uce in enumerate(uces) :
140 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for any form of lemma `lem`.

    The form identifiers are the keys of `self.lems[lem].formes`; the
    order of the returned list is not defined (it comes from a set).
    """
    id_list = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % id_list
    flat = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            flat.append(cell)
        else:
            # whitespace-separated string of ids
            flat.extend([int(tok) for tok in cell.split()])
    return list(set(flat))
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces recorded for lemma `lem`."""
    owners = set()
    for uceid in self.getlemuces(lem):
        owners.add(self.getucefromid(uceid).uci)
    return list(owners)
153 def getlemuceseff(self, lem, luces = None) :
154 formesid = ', '.join([`val` for val in self.lems[lem].formes])
155 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
156 res = self.cformes.execute(query)
157 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
158 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
159 res = self.cformes.execute(query)
160 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
162 for i, uce in enumerate(uces) :
163 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many distinct uces of `self.lc[cluster]` are recorded for `lem`."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored for `lem` in `self.lems`."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the form whose identifier is `formeid`.

    The identifier index `self.idformes` is built on first use.
    """
    index = self.idformes
    if index is None:
        self.make_idformes()
        index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` of every entry of `self.formes`."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return `gettotocc()` divided by `getucenb()`, as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
186 return self.ucis[-1].uces[-1].ident + 1
189 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    Sizes come from `getucesize()` and are sliced with the identifiers of
    the first and last uce of each uci, so the slice bounds are only as
    meaningful as those identifiers are positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:(last + 1)]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of each row of `getalluces()`.

    The text is read from the second column of each row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
199 # def getlemseff(self) :
200 # if self.idformes is None :
201 # self.make_idformes()
202 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
204 # def getlemsefftype(self) :
205 # if self.idformes is None :
206 # self.make_idformes()
207 # if self.lems is None :
209 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Run, through `self.cuces`, a query selecting the `uces` rows whose id is in `uces`.

    The result of `execute` is returned as is.
    """
    id_list = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % id_list
    return self.cuces.execute(query)
def getwordconcorde(self, word):
    """Return `getconcorde()` applied to the uces found by `getworduces(word)`."""
    word_uces = self.getworduces(word)
    return self.getconcorde(word_uces)
def getlemconcorde(self, lem):
    """Return `getconcorde()` applied to the uces found by `getlemuces(lem)`."""
    lem_uces = self.getlemuces(lem)
    return self.getconcorde(lem_uces)
def getalluces(self):
    """Run, through `self.cuces`, a query selecting every row of the `uces` table."""
    everything = 'SELECT * FROM uces'
    return self.cuces.execute(everything)
def getucesfrometoile(self, etoile):
    """Return the identifiers of the uces of every uci whose `etoiles` contain `etoile`."""
    selected = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                selected.append(uce.ident)
    return selected
def getucefromid(self, uceid):
    """Return the uce whose identifier is `uceid`.

    The identifier index `self.iduces` is built on first use.
    """
    index = self.iduces
    if index is None:
        self.make_iduces()
        index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of `self.formes` whose `freq` equals 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries of `self.lems` whose `act` equals `key`."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
235 # def make_lems(self, lem = True) :
236 # log.info('make lems')
238 # for forme in self.formes :
239 # if self.formes[forme].lem in self.lems :
240 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
241 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
243 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the `etoiles` of the uci that owns the uce `uceid`.

    The uce-id -> uci-id mapping is built on first use and kept in
    `self.uceuci`; the uci identifier is then used as an index into
    `self.ucis`.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
249 def make_lems(self, lem = True) :
250 log.info('make lems')
253 for forme in self.formes :
254 if self.formes[forme].lem in self.lems :
255 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
256 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
258 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
260 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build `self.idformes`, mapping each form's `ident` to the form object."""
    by_ident = {}
    for forme in self.formes:
        entry = self.formes[forme]
        by_ident[entry.ident] = entry
    self.idformes = by_ident
def make_iduces(self):
    """Build `self.iduces`, mapping each uce's `ident` to the uce object.

    Does nothing when the index already exists.
    """
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
269 def make_lexitable(self, mineff, etoiles) :
270 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
271 etuces = [[] for et in etoiles]
272 for uci in self.ucis :
273 get = list(set(uci.etoiles).intersection(etoiles))
275 return '2 variables sur la meme ligne'
277 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
278 etuces = [set(val) for val in etuces]
281 deff = self.getlemuceseff(lem)
283 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
284 tab.insert(0, [''] + etoiles)
287 def make_efftype_from_etoiles(self, etoiles) :
289 etuces = [[] for et in etoiles]
290 for uci in self.ucis :
291 get = list(set(uci.etoiles).intersection(etoiles))
293 return '2 variables sur la meme ligne'
295 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
296 etuces = [set(val) for val in etuces]
297 for lem in self.lems :
298 deff = self.getlemuceseff(lem)
300 gram = self.lems[lem].gram
302 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
304 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
305 tabout = [[gram] + dtype[gram] for gram in dtype]
306 tabout.insert(0, [''] + etoiles)
309 def make_uceactsize(self, actives) :
310 res = self.getalluces()
313 deff = self.getlemuceseff(lem)
315 ucesize[uce] = ucesize.get(uce, 0) + 1
318 def make_uc(self, actives, lim1, lim2) :
319 uceactsize = self.make_uceactsize(actives)
325 for uce in [uce for uci in self.ucis for uce in uci.uces] :
326 if uce.para == lastpara :
328 last1 += uceactsize.get(uce.ident,0)
329 uc1[-1].append(uce.ident)
331 uc1.append([uce.ident])
334 last2 += uceactsize.get(uce.ident, 0)
335 uc2[-1].append(uce.ident)
337 uc2.append([uce.ident])
340 last1 = uceactsize.get(uce.ident, 0)
341 last2 = uceactsize.get(uce.ident, 0)
343 uc1.append([uce.ident])
344 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc listings.

    The groupings come from `make_uc(actives, sizeuc1, sizeuc2)`; each one
    is passed to `write_ucmatrix` and then listed in a `uce;uc` text file
    (one line per uce, paired with the index of its group).
    Returns the number of groups of each grouping.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # both listings are prepared before any listing file is opened
    listings = []
    for groups in (uc1, uc2):
        lines = ['uce;uc']
        for pos, group in enumerate(groups):
            for uce in group:
                lines.append(';'.join([repr(uce), repr(pos)]))
        listings.append('\n'.join(lines))
    for target, text in zip((listuce1out, listuce2out), listings):
        with open(target, 'w') as f:
            f.write(text)
    return len(uc1), len(uc2)
360 def write_ucmatrix(self, uc, actives, fileout) :
361 log.info('write uc matrix %s' % fileout)
362 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
365 with open(fileout + '~', 'w+') as f :
366 for i, lem in enumerate(actives) :
367 for uce in self.getlemuces(lem):
368 if (uces_uc[uce], i) not in deja_la :
370 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
371 deja_la[(uces_uc[uce], i)] = 0
373 with open(fileout, 'w') as ffin :
374 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
377 os.remove(fileout + '~')
380 def export_corpus(self, outf) :
381 #outf = 'export_corpus.txt'
383 res = self.getalluces()
387 with open(outf,'w') as f :
389 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
390 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
391 elif self.iduces[uce[0]].uci != actuci :
392 actuci = self.iduces[uce[0]].uci
393 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
394 actpara = self.iduces[uce[0]].para
395 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
398 actpara = self.iduces[uce[0]].para
399 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
400 elif self.iduces[uce[0]].para != actpara :
401 actpara = self.iduces[uce[0]].para
403 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
405 def export_corpus_classes(self, outf, alc = True, lem = False) :
407 for i, lc in enumerate(self.lc) :
410 for uce in self.lc0 :
412 res = self.getalluces()
414 with open(outf, 'w') as f :
417 actuci = self.iduces[uce[0]].uci
419 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
421 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
423 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
424 f.write(etline.encode(self.parametres['syscoding']) + '\n')
425 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
427 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
428 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
430 with open(outfile + '~', 'w+') as f :
431 for i, lem in enumerate(actives) :
432 for uce in sorted(self.getlemuces(lem)) :
434 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
436 with open(outfile, 'w') as ffin :
437 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
440 os.remove(outfile + '~')
442 with open(listuce, 'w') as f :
443 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
445 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
446 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
448 with open(outfile + '~', 'w+') as f :
449 for i, lem in enumerate(actives) :
450 for uci in sorted(self.getlemucis(lem)) :
452 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
454 with open(outfile, 'w') as ffin :
455 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
458 os.remove(outfile + '~')
460 with open(listuci, 'w') as f :
461 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
463 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
464 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
466 duces = dict([[uce, i] for i, uce in enumerate(uces)])
467 with open(outfile + '~', 'w+') as f :
468 for i, lem in enumerate(actives) :
469 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
471 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
473 with open(outfile, 'w') as ffin :
474 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
477 os.remove(outfile + '~')
479 def make_table_with_classe(self, uces, list_act) :
480 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
481 uces = dict([[uce, i] for i, uce in enumerate(uces)])
482 for i, lem in enumerate(list_act) :
483 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
485 table_uce[uces[uce]][i] = 1
486 table_uce.insert(0, list_act)
489 def parse_active(self, gramact, gramsup = None) :
490 log.info('parse actives')
491 for lem in self.lems :
492 if lem.startswith('_') and lem.endswith('_') :
493 self.lems[lem].act = 2
494 elif self.lems[lem].gram in gramact :
495 self.lems[lem].act = 1
496 elif gramsup is not None :
497 if self.lems[lem].gram in gramsup :
498 self.lems[lem].act = 2
500 self.lems[lem].act = 0
502 self.lems[lem].act = 2
504 def make_actives_limit(self, limit, key = 1) :
505 if self.idformes is None :
507 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
509 def make_actives_nb(self, nbmax, key) :
510 log.info('make_actives_nb : %i - %i' % (nbmax,key))
511 if self.idformes is None :
513 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
514 self.activenb = len(allactives)
515 allactives = sorted(allactives, reverse = True)
516 if len(allactives) <= nbmax :
517 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
518 return [val[1] for val in allactives], allactives[-1][0]
520 effs = [val[0] for val in allactives]
521 if effs.count(effs[nbmax - 1]) > 1 :
522 lim = effs[nbmax - 1] + 1
526 stop = effs.index(lim)
533 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
534 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class profile table to `fileout`.

    For each lemma of `actives`, counts how many of its uces fall in each
    class of `ucecl`. Lemmas whose counts sum to less than 3 are dropped.
    Each kept lemma gives one `;`-separated line (lemma, then one count per
    class); the text is encoded with `self.parametres['syscoding']`.
    """
    log.info('formes/classes')
    rows = []
    for lem in actives:
        # query the uces of the lemma once, not once per class
        lem_uces = set(self.getlemuces(lem))
        counts = [len(lem_uces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            rows.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in rows]).encode(self.parametres['syscoding']))
543 def make_etoiles(self) :
545 for uci in self.ucis :
546 etoiles.update(uci.etoiles[1:] + uci.paras)
549 def make_etoiles_dict(self) :
550 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
552 for etoile in etoiles :
553 et = etoile.split('_')
556 endet = '_'.join(et[1:])
557 if endet in det[et[0]] :
558 det[et[0]][endet] += 1
560 det[et[0]][endet] = 1
565 endet = '_'.join(et[1:])
566 det[et[0]] = {endet :1}
571 def make_etline(self, listet) :
572 etuces = [[] for et in listet]
573 for uci in self.ucis :
574 get = list(set(uci.etoiles).intersection(listet))
576 return '2 variables sur la meme ligne'
578 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class profile table to `fileout`.

    For each value returned by `make_etoiles()`, counts how many of its
    uces fall in each class of `ucecl` and writes one `;`-separated line
    (etoile, then one count per class). The text is encoded with
    `self.parametres['syscoding']`.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f:
        lines = []
        for etoile in etoiles:
            # collect the uces of the etoile once, not once per class
            et_uces = set(self.getucesfrometoile(etoile))
            counts = [repr(len(et_uces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
588 def make_colored_corpus(self) :
590 for i, lc in enumerate(self.lc) :
593 for uce in self.lc0 :
595 color = ['black'] + colors[len(self.lc) - 1]
597 <meta http-equiv="content-Type" content="text/html; charset=%s" />
599 ''' % sys.getdefaultencoding()
600 res = self.getalluces()
605 if self.iduces[uce[0]].uci != actuci :
606 actuci = self.iduces[uce[0]].uci
607 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
608 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
610 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
611 return txt + '\n</body></html>'
613 def count_from_list(self, l, d) :
621 def count_from_list_cl(self, l, d, a, clnb) :
630 def find_segments(self, taille_segment, taille_limite) :
632 for uce in self.getalluces() :
634 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
635 l = [[d[val], val] for val in d if d[val] >= 3]
638 if len(l) > taille_limite :
639 l = l[-taille_limite:]
642 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
644 for uce in self.getconcorde(list_uce) :
646 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
647 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
650 if len(l) > taille_limite :
651 l = l[-taille_limite:]
654 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
656 for b, classe in enumerate(self.lc) :
657 for uce in self.getconcorde(classe) :
660 uce = [self.formes[forme].lem for forme in uce]
661 for taille_segment in range(lenmin,lenmax) :
662 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
663 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
664 with open(fileout, 'w') as f :
665 f.write('\n'.join([';'.join(line) for line in result]))
667 def make_proftype(self, outf) :
669 for lem in self.lems :
670 gram = self.lems[lem].gram
672 res[gram] = [0 for val in self.lc]
673 lemuceeff = self.getlemuceseff(lem)
674 for i, classe in enumerate(self.lc) :
675 concern = set(classe).intersection(lemuceeff.keys())
676 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
677 res = [[gram] + [`val` for val in res[gram]] for gram in res]
679 with open(outf, 'w') as f :
680 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
683 def make_ucecl_from_R(self, filein) :
684 with open(filein, 'rU') as f :
689 line = line.replace('\n', '').replace('"', '').split(';')
690 self.lc.append([int(line[0]) - 1, int(line[1])])
691 classesl = [val[1] for val in self.lc]
693 self.lc = sorted(self.lc, key=itemgetter(1))
694 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
695 self.lc0 = self.lc.pop(0)
698 def get_stat_by_cluster(self, outf) :
699 log.info('get_stat_by_cluster')
701 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
702 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
703 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
704 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
705 sets = [set(cl) for cl in self.lc]
706 for forme in self.formes :
707 formeuceeff = self.getformeuceseff(forme)
708 for i, classe in enumerate(self.lc) :
709 concern = sets[i].intersection(formeuceeff.keys())
711 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
713 if self.formes[forme].freq == 1 :
715 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
716 with open(outf, 'w') as f :
718 log.info('%f' % (time() - t1))
720 def gethapaxbyet(self, etoiles) :
721 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
723 for uce in hapaxuces :
724 if uce in hucesdict :
728 etuces = [[] for et in etoiles]
729 for uci in self.ucis :
730 get = list(set(uci.etoiles).intersection(etoiles))
732 return '2 variables sur la meme ligne'
734 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
735 etuces = [set(val) for val in etuces]
736 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
738 def gethapaxuces(self) :
739 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
740 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
742 for i,uce in enumerate(hapaxuces) :
743 if uce in hucesdict :
744 hucesdict[uce][0] += 1
745 hucesdict[uce][1].append(hapax[i])
747 hucesdict[uce] = [1,[hapax[i]]]
749 for uce in hucesdict :
750 if hucesdict[uce][0] in huces :
751 huces[hucesdict[uce][0]].append(uce)
753 huces[hucesdict[uce][0]] = [uce]
754 huces = zip(huces, huces.values())
755 huces.sort(reverse=True)
759 for nb in huces[0:4] :
760 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
762 res = self.getconcorde([uce])
764 ucetxt = ' ' + row[1] + ' '
766 for hap in hucesdict[uce][1] :
767 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
768 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
769 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
770 txt += '<p>'+ucetxt+'</p>\n'
774 with open('/tmp/testhapxuce.html','w') as f :
779 def __init__(self, corpus) :
780 ucinb = corpus.getucinb()
781 ucisize = corpus.getucisize()
782 ucimean = float(sum(ucisize))/float(ucinb)
783 detoile = corpus.make_etoiles_dict()
787 def __init__(self, iduci, line, paraset = None) :
789 self.etoiles = line.split()
791 if paraset is not None :
792 self.paras = paraset.split()
797 def __init__(self, iduce, idpara, iduci) :
803 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
809 if freq is not None :
815 def __init__(self, parent, forme) :
816 self.formes = {forme.ident : forme.freq}
817 self.gram = forme.gram
818 self.freq = forme.freq
def add_forme(self, forme):
    """Record `forme` under its `ident` and add its `freq` to the running total."""
    self.formes.update({forme.ident: forme.freq})
    self.freq = self.freq + forme.freq
825 def decouperlist(chaine, longueur, longueurOptimale) :
827 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
828 Si on trouve un '$', c'est fini.
829 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
831 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
832 dsep = dict([[val[0],val[1]] for val in separateurs])
833 trouve = False # si on a trouvé un bon séparateur
834 iDecoupe = 0 # indice du caractere ou il faut decouper
836 longueur = min(longueur, len(chaine) - 1)
837 chaineTravail = chaine[:longueur + 1]
839 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
842 indice = chaineTravail.index(u'$')
844 iDecoupe = indice - 1
849 caractere = chaineTravail[nbCar]
850 distance = abs(longueurOptimale - nbCar) + 1
851 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
852 if caractere in dsep :
853 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
854 meilleur[0] = caractere
855 meilleur[1] = dsep[caractere]
860 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
862 meilleur[1] = dsep[' ']
869 #if meilleur[0] != ' ' :
870 # fin = chaine[iDecoupe + 1:]
871 # retour = chaineTravail[:iDecoupe]
873 fin = chaine[iDecoupe + 1:]
874 retour = chaineTravail[:iDecoupe + 1]
875 return len(retour) > 0, retour, fin
876 # si on a rien trouvé
877 return False, chaine, ''
879 def testetoile(line) :
880 return line.startswith(u'****')
883 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split `txt` on whitespace and append a final u'$' token."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
888 def prep_txtcharact(txt) :
893 Class for building a corpus
895 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
896 log.info('begin building corpus...')
897 self.lexique = lexique
898 self.expressions = expressions
900 self.corpus = Corpus(self, parametres_corpus)
903 self.lim = parametres_corpus.get('lim', 1000000)
904 self.encoding = parametres_corpus['encoding']
905 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
906 self.corpus.pathout.createdir(parametres_corpus['pathout'])
907 self.corpus.parametres['uuid'] = str(uuid4())
908 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
909 self.corpus.parametres['type'] = 'corpus'
910 if self.corpus.parametres['keep_ponct'] :
911 self.ponctuation_espace = [' ', '']
913 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
915 self.tolist = self.corpus.parametres.get('tolist', 0)
922 def prep_makeuce(self) :
923 method = self.corpus.parametres.get('ucemethod', 0)
925 self.decouper = decouperlist
926 self.prep_txt = prep_txtlist
927 self.ucesize = self.corpus.parametres.get('ucesize', 40)
929 self.decouper = decoupercharact
930 self.prep_txt = prep_txtcharact
931 self.ucesize = self.corpus.parametres.get('ucesize', 240)
932 log.info('method uce : %s' % method)
937 self.read_corpus(self.infile)
938 except Warning, args :
939 log.info('pas kool %s' % args)
943 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
944 self.time = time() - t1
946 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
947 log.info('time : %f' % (time() - t1))
950 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
951 self.cf = self.conn_f.cursor()
952 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
953 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
955 self.cf = self.conn_f.cursor()
956 self.cf.execute('PRAGMA temp_store=MEMORY;')
957 self.cf.execute('PRAGMA journal_mode=MEMORY;')
958 self.cf.execute('PRAGMA synchronous = OFF;')
959 self.cf.execute('begin')
960 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
961 self.c = self.conn.cursor()
962 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
964 self.c = self.conn.cursor()
965 self.c.execute('PRAGMA temp_store=MEMORY;')
966 self.c.execute('PRAGMA journal_mode=MEMORY;')
967 self.c.execute('PRAGMA synchronous = OFF;')
968 self.c.execute('begin')
971 #commit index and close db
974 self.cf.execute('CREATE INDEX iduces ON uces (id);')
975 self.cf.execute('CREATE INDEX ideff ON eff (id);')
979 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
980 self.ccorpus = self.conn_corpus.cursor()
981 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
982 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
983 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
984 self.conn_corpus.commit()
985 self.ccorpus = self.conn_corpus.cursor()
986 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
987 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
988 self.ccorpus.execute('PRAGMA synchronous = OFF;')
989 self.ccorpus.execute('begin')
991 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
992 self.conn_corpus.commit()
993 self.conn_corpus.close()
994 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Append to `self.cleans` the cleaning steps enabled in the corpus parameters.

    Steps are appended in a fixed order: lower, firstclean, charact,
    expressions, apos, tiret. Every option except 'charact' defaults to
    enabled when absent; 'charact' must be present in the parameters.
    When 'charact' is set, `self.rule` receives the 'keep_caract' value
    (or the built-in default character class).
    """
    params = self.corpus.parametres
    register = self.cleans.append
    if params.get('lower', 1):
        register(self.dolower)
    if params.get('firstclean', 1):
        register(self.firstclean)
    if params['charact']:
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        register(self.docharact)
    if params.get('expressions', 1):
        register(self.make_expression)
    if params.get('apos', 1):
        register(self.doapos)
    if params.get('tiret', 1):
        register(self.dotiret)
1011 def make_expression(self,txt) :
1012 for expression in self.expressions:
1013 if expression in txt :
1014 txt = txt.replace(expression, self.expressions[expression][0])
1017 def dolower(self, txt) :
def docharact(self, txt):
    """Replace each run of characters matching the class built from `self.rule` by one space."""
    #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = u"[" + self.rule + "]+"
    return re.sub(pattern, ' ', txt)
def doapos(self, txt):
    """Return `txt` with every apostrophe replaced by a space."""
    apostrophe = u"'"
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return `txt` with every hyphen replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise a few characters and put spaces around punctuation.

    The typographic apostrophe becomes "'", the ligature becomes 'oe',
    three dots and the ellipsis character become ' £$£ ', and each of
    ? . ! , ; : is surrounded by spaces. Three dots are handled before
    the single dot so that they are not split into three separate dots.
    """
    txt = txt.replace(u'’', "'")
    txt = txt.replace(u'œ', u'oe')
    # The last pair now uses a unicode literal: a non-ASCII byte string
    # passed to unicode.replace is decoded with the default codec under
    # Python 2, which fails when that codec is ASCII.
    spaced = (
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for mark, padded in spaced:
        txt = txt.replace(mark, padded)
    return txt
1036 def make_cleans(self, txt) :
1037 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory form -> segment index to the database cursor.

    When ``self.corpus.idformesuces`` is not empty, two rows are produced
    for each form identifier:

    * in table ``uces``: (repr of the form id, space-separated repr of the
      segment ids in which the form occurs);
    * in table ``eff``: (repr of the form id, space-separated repr of the
      matching counts, in the same order as the segment ids).

    The rows are inserted through ``self.cf.executemany`` and the in-memory
    index is then reset to an empty dict. Nothing is done when the index is
    already empty.
    """
    index = self.corpus.idformesuces
    if index != {}:
        log.info('backup %i' % len(index))
        touce = []
        toeff = []
        for forme, counts in index.items():
            # repr() replaces the backtick syntax, which was removed from
            # the language; it yields the same strings.
            key = repr(forme)
            touce.append((key, ' '.join([repr(uce) for uce in counts])))
            toeff.append((key, ' '.join([repr(eff) for eff in counts.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
1051 def backup_corpus(self) :
1052 log.info('start backup corpus')
1054 for uci in self.corpus.ucis :
1055 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1056 for uce in uci.uces :
1057 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1058 for forme in self.corpus.formes :
1059 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1060 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the summary statistics of the build in ``self.corpus.parametres``.

    Keys written: ``'date'`` (current local time, ``ctime`` format),
    ``'time'`` (``self.time`` seconds rendered as hours/minutes/seconds),
    ``'ucinb'``, ``'ucenb'``, ``'occurrences'``, ``'formesnb'`` and
    ``'hapax'`` (hapax count with its percentage of forms and of
    occurrences).

    Raises ``ZeroDivisionError`` when the corpus has no form or no
    occurrence, since the percentages divide by those totals.
    """
    corpus = self.corpus
    summary = corpus.parametres

    summary['date'] = datetime.datetime.now().ctime()

    # Break the elapsed seconds down into h / m / s.
    total_minutes, secs = divmod(self.time, 60)
    hrs, mins = divmod(total_minutes, 60)
    summary['time'] = '%.0fh %.0fm %.0fs' % (hrs, mins, secs)

    summary['ucinb'] = corpus.getucinb()
    summary['ucenb'] = corpus.getucenb()
    summary['occurrences'] = corpus.gettotocc()
    summary['formesnb'] = len(corpus.formes)

    hapax_count = corpus.gethapaxnb()
    share_of_forms = (float(hapax_count) / len(corpus.formes)) * 100
    share_of_occurrences = (float(hapax_count) / summary['occurrences']) * 100
    summary['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (
        hapax_count, share_of_forms, share_of_occurrences)
1077 class BuildFromAlceste(BuildCorpus) :
1078 def read_corpus(self, infile) :
1079 if self.dlg is not None :
1080 self.dlg.Pulse('textes : 0 - segments : 0')
1083 if self.corpus.parametres['ucimark'] == 0 :
1084 self.testuci = testetoile
1085 elif self.corpus.parametres['ucimark'] == 1 :
1086 self.testuci = testint
1092 with codecs.open(infile, 'r', self.encoding) as f :
1093 for linenb, line in enumerate(f) :
1094 line = line.rstrip('\n\r')
1095 if self.testuci(line) :
1098 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1100 self.corpus.ucis.append(Uci(iduci, line))
1103 if self.corpus.ucis[-1].uces == [] :
1104 log.info(u'Empty text : %i' % linenb)
1106 self.corpus.ucis.pop()
1107 #raise Exception("EmptyText %i" % linenb)
1108 self.corpus.ucis.append(Uci(iduci, line))
1109 if self.dlg is not None :
1110 if not (iduci + 1) % 10 :
1111 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1112 elif line.startswith(u'-*') :
1115 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1118 self.corpus.ucis[-1].paras.append(line.split()[0])
1120 raise Exception('paragrapheOT')
1121 elif line.strip() != '' and iduci != -1 :
1123 if txt != [] and iduci != -1 :
1124 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1127 raise Exception("EmptyText")
1128 if iduci != -1 and iduce != -1:
1131 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1132 raise Exception('TextBeforeTextMark')
1133 except UnicodeDecodeError :
1134 raise Exception("CorpusEncoding")
1136 def treattxt(self, txt, iduce, idpara, iduci) :
1137 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1138 txt = 'laphrasepoursplitter'.join(txt)
1139 txt = self.make_cleans(txt)
1140 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1141 ucetxt = txt.split('laphrasepoursplitter')
1144 txt = self.make_cleans(txt)
1145 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1146 if self.corpus.ucis[-1].paras == [] :
1150 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1151 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1152 if not self.tolist :
1158 self.corpus.add_word(word)
1159 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1160 if self.last > self.lim :
1163 return iduce, idpara
1165 def make_uces(self, txt, douce = True, keep_ponct = False) :
1166 txt = ' '.join(txt.split())
1169 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1171 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1174 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1175 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1180 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1182 #decouper (list_sep)
1183 #make_uces (decouper)
1184 #treat_txt (make_uces)
1188 def __init__(self, parent, dlg = None) :
1189 self.parent = parent
1191 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1192 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1193 dial = CorpusPref(parent, parametres)
1194 dial.CenterOnParent()
1195 dial.txtpath.SetLabel(parent.filename)
1196 #dial.repout_choices.SetValue(parametres['pathout'])
1197 self.res = dial.ShowModal()
1198 if self.res == 5100 :
1199 parametres = dial.doparametres()
1200 parametres['originalpath'] = parent.filename
1201 PathOut().createdir(parametres['pathout'])
1202 ReadLexique(self.parent, lang = parametres['lang'])
1203 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1204 self.parametres = parametres
1206 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's input file and return it.

    A ``BuildFromAlceste`` instance is created from the parent's file name,
    the stored parameters, the parent's lexicon and expressions, and the
    progress dialog; its ``corpus`` attribute is the return value.
    """
    owner = self.parent
    builder = BuildFromAlceste(
        owner.filename,
        self.parametres,
        owner.lexique,
        owner.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
1214 if __name__ == '__main__' :
1216 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1217 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)