1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the ids of the uces in which a form occurs.

    ``wordid`` is either the form itself (a string key of ``self.formes``)
    or its numeric identifier.  Every matching row of the ``uces`` table is
    decoded: the cell holds either a space-separated string of uce ids or a
    bare integer.  The decoded ids of all rows are concatenated into one
    list of ints.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # Bind the identifier itself rather than its backtick repr: backticks
    # are removed syntax, and the repr of a long ('5L') would never match.
    res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (wordid,))
    return list(itertools.chain.from_iterable(
        [row[0]] if isinstance(row[0], int) else [int(val) for val in row[0].split()]
        for row in res))
130 def getformeuceseff(self, formeid) :
131 if isinstance(formeid, basestring) :
132 formeid = self.formes[formeid].ident
133 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
134 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
135 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
136 res = self.cformes.execute(query)
137 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
139 for i, uce in enumerate(uces) :
140 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct ids of the uces containing any form of ``lem``.

    The identifiers of the lemma's forms (keys of ``self.lems[lem].formes``)
    select rows of the ``uces`` table; each cell holds uce ids as a
    space-separated string or as a bare integer.  Duplicates are removed
    through a set, so the order of the returned list is unspecified.
    """
    # repr() replaces the removed backtick syntax.  The ids are interpolated
    # into the IN (...) list; presumably they are always internal integers.
    formesid = ', '.join(repr(val) for val in self.lems[lem].formes)
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = set()
    for row in self.cformes.execute(query):
        if isinstance(row[0], int):
            uces.add(row[0])
        else:
            uces.update(int(val) for val in row[0].split())
    return list(uces)
def getlemucis(self, lem):
    """Return the distinct uci identifiers of the uces that contain ``lem``.

    The uce ids come from ``getlemuces``; each is resolved through
    ``getucefromid`` and its ``uci`` attribute is collected.  The order of
    the result is unspecified (set-based de-duplication).
    """
    found = set()
    for uceid in self.getlemuces(lem):
        found.add(self.getucefromid(uceid).uci)
    return list(found)
153 def getlemuceseff(self, lem, luces = None) :
154 formesid = ', '.join([`val` for val in self.lems[lem].formes])
155 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
156 res = self.cformes.execute(query)
157 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
158 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
159 res = self.cformes.execute(query)
160 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
162 for i, uce in enumerate(uces) :
163 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many uces listed in ``self.lc[cluster]`` contain ``lem``."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the frequency stored on the lemma ``lem``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the form registered under the identifier ``formeid``.

    The id -> form index (``self.idformes``) is built on first use by
    ``make_idformes``.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the total number of occurrences (sum of all form frequencies)."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return the mean number of occurrences per uce, as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
186 return self.ucis[-1].uces[-1].ident + 1
189 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The sizes returned by ``getucesize`` are sliced from the ident of the
    uci's first uce to the ident of its last uce (inclusive), which assumes
    that list positions coincide with uce idents.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    Rows come from ``getalluces``; the text is read from the second column.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Return a cursor over the rows of the ``uces`` table for the given ids.

    ``uces`` is an iterable of uce identifiers.  They are interpolated into
    an ``IN (...)`` list; presumably they are always internal integers.
    """
    # repr() replaces the removed backtick syntax with identical output.
    idlist = ', '.join(repr(i) for i in uces)
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word):
    """Return the uce rows (via ``getconcorde``) in which ``word`` occurs."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return the uce rows (via ``getconcorde``) in which ``lem`` occurs."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Return a cursor over every row of the ``uces`` table."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the idents of all uces whose uci lists ``etoile`` in its etoiles."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
def getucefromid(self, uceid):
    """Return the uce object whose ident is ``uceid``.

    The ident -> uce index (``self.iduces``) is built on first use by
    ``make_iduces``.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of hapaxes, i.e. forms whose frequency is exactly 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of lemmas whose ``act`` attribute equals ``key``."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
223 # def make_lems(self, lem = True) :
224 # log.info('make lems')
226 # for forme in self.formes :
227 # if self.formes[forme].lem in self.lems :
228 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
229 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
231 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the etoiles of the uci that owns the uce ``uceid``.

    The uce ident -> uci ident mapping (``self.uceuci``) is built on first
    use; the uci ident is then used as a position in ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
237 def make_lems(self, lem = True) :
238 log.info('make lems')
241 for forme in self.formes :
242 if self.formes[forme].lem in self.lems :
243 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
244 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
246 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
248 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ident to the form object."""
    index = {}
    for forme in self.formes:
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces`` (uce ident -> uce object) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
257 def make_lexitable(self, mineff, etoiles) :
258 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
259 etuces = [[] for et in etoiles]
260 for uci in self.ucis :
261 get = list(set(uci.etoiles).intersection(etoiles))
263 return '2 variables sur la meme ligne'
265 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
266 etuces = [set(val) for val in etuces]
269 deff = self.getlemuceseff(lem)
271 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
272 tab.insert(0, [''] + etoiles)
275 def make_efftype_from_etoiles(self, etoiles) :
277 etuces = [[] for et in etoiles]
278 for uci in self.ucis :
279 get = list(set(uci.etoiles).intersection(etoiles))
281 return '2 variables sur la meme ligne'
283 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
284 etuces = [set(val) for val in etuces]
285 for lem in self.lems :
286 deff = self.getlemuceseff(lem)
288 gram = self.lems[lem].gram
290 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
292 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
293 tabout = [[gram] + dtype[gram] for gram in dtype]
294 tabout.insert(0, [''] + etoiles)
297 def make_uceactsize(self, actives) :
298 res = self.getalluces()
301 deff = self.getlemuceseff(lem)
303 ucesize[uce] = ucesize.get(uce, 0) + 1
306 def make_uc(self, actives, lim1, lim2) :
307 uceactsize = self.make_uceactsize(actives)
313 for uce in [uce for uci in self.ucis for uce in uci.uces] :
314 if uce.para == lastpara :
316 last1 += uceactsize.get(uce.ident,0)
317 uc1[-1].append(uce.ident)
319 uc1.append([uce.ident])
322 last2 += uceactsize.get(uce.ident, 0)
323 uc2[-1].append(uce.ident)
325 uc2.append([uce.ident])
328 last1 = uceactsize.get(uce.ident, 0)
329 last2 = uceactsize.get(uce.ident, 0)
331 uc1.append([uce.ident])
332 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and uce/uc listings.

    ``make_uc`` returns the two groupings (lists of lists of uce ids).  Each
    is written with ``write_ucmatrix`` to ``uc1out`` / ``uc2out``, and a
    ``uce;uc`` listing (one ``uce;uc-index`` line per uce, after a header
    line, no trailing newline) is written to ``listuce1out`` /
    ``listuce2out``.

    Returns the tuple ``(len(uc1), len(uc2))``.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both listings share the same layout, so build and write them in one
    # loop; repr() replaces the removed backtick syntax.
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        lines.extend(';'.join([repr(uce), repr(i)]) for i, ucl in enumerate(uc) for uce in ucl)
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
348 def write_ucmatrix(self, uc, actives, fileout) :
349 log.info('write uc matrix %s' % fileout)
350 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
353 with open(fileout + '~', 'w+') as f :
354 for i, lem in enumerate(actives) :
355 for uce in self.getlemuces(lem):
356 if (uces_uc[uce], i) not in deja_la :
358 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
359 deja_la[(uces_uc[uce], i)] = 0
361 with open(fileout, 'w') as ffin :
362 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
365 os.remove(fileout + '~')
368 def export_corpus(self, outf) :
369 #outf = 'export_corpus.txt'
371 res = self.getalluces()
375 with open(outf,'w') as f :
377 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
378 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
379 elif self.iduces[uce[0]].uci != actuci :
380 actuci = self.iduces[uce[0]].uci
381 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
382 actpara = self.iduces[uce[0]].para
383 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
386 actpara = self.iduces[uce[0]].para
387 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
388 elif self.iduces[uce[0]].para != actpara :
389 actpara = self.iduces[uce[0]].para
391 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
393 def export_corpus_classes(self, outf, alc = True, lem = False) :
395 for i, lc in enumerate(self.lc) :
398 for uce in self.lc0 :
400 res = self.getalluces()
402 with open(outf, 'w') as f :
405 actuci = self.iduces[uce[0]].uci
407 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
409 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
411 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
412 f.write(etline.encode(self.parametres['syscoding']) + '\n')
413 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
415 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
416 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
418 with open(outfile + '~', 'w+') as f :
419 for i, lem in enumerate(actives) :
420 for uce in sorted(self.getlemuces(lem)) :
422 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
424 with open(outfile, 'w') as ffin :
425 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
428 os.remove(outfile + '~')
430 with open(listuce, 'w') as f :
431 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
433 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
434 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
436 with open(outfile + '~', 'w+') as f :
437 for i, lem in enumerate(actives) :
438 for uci in sorted(self.getlemucis(lem)) :
440 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
442 with open(outfile, 'w') as ffin :
443 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
446 os.remove(outfile + '~')
448 with open(listuci, 'w') as f :
449 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
451 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
452 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
454 duces = dict([[uce, i] for i, uce in enumerate(uces)])
455 with open(outfile + '~', 'w+') as f :
456 for i, lem in enumerate(actives) :
457 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
459 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
461 with open(outfile, 'w') as ffin :
462 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
465 os.remove(outfile + '~')
467 def make_table_with_classe(self, uces, list_act) :
468 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
469 uces = dict([[uce, i] for i, uce in enumerate(uces)])
470 for i, lem in enumerate(list_act) :
471 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
473 table_uce[uces[uce]][i] = 1
474 table_uce.insert(0, list_act)
477 def parse_active(self, gramact, gramsup = None) :
478 log.info('parse actives')
479 for lem in self.lems :
480 if lem.startswith('_') and lem.endswith('_') :
481 self.lems[lem].act = 2
482 elif self.lems[lem].gram in gramact :
483 self.lems[lem].act = 1
484 elif gramsup is not None :
485 if self.lems[lem].gram in gramsup :
486 self.lems[lem].act = 2
488 self.lems[lem].act = 0
490 self.lems[lem].act = 2
492 def make_actives_limit(self, limit, key = 1) :
493 if self.idformes is None :
495 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
497 def make_actives_nb(self, nbmax, key) :
498 log.info('make_actives_nb : %i - %i' % (nbmax,key))
499 if self.idformes is None :
501 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
502 self.activenb = len(allactives)
503 allactives = sorted(allactives, reverse = True)
504 if len(allactives) <= nbmax :
505 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
506 return [val[1] for val in allactives], allactives[-1][0]
508 effs = [val[0] for val in allactives]
509 if effs.count(effs[nbmax - 1]) > 1 :
510 lim = effs[nbmax - 1] + 1
514 stop = effs.index(lim)
521 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
522 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma x class contingency table to ``fileout``.

    For each lemma of ``actives`` the number of its uces present in each
    class of ``ucecl`` is counted.  Lemmas whose counts sum to less than 3
    are dropped.  Rows are written as ``lem;n1;n2;...`` joined by newlines
    and encoded with ``self.parametres['syscoding']``.
    """
    log.info('formes/classes')
    rows = []
    for lem in actives:
        # Fetch the lemma's uces once instead of once per class: every
        # call to getlemuces runs a database query.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() replaces the removed backtick syntax.
            rows.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in rows]).encode(self.parametres['syscoding']))
531 def make_etoiles(self) :
533 for uci in self.ucis :
534 etoiles.update(uci.etoiles[1:] + uci.paras)
537 def make_etoiles_dict(self) :
538 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
540 for etoile in etoiles :
541 et = etoile.split('_')
544 endet = '_'.join(et[1:])
545 if endet in det[et[0]] :
546 det[et[0]][endet] += 1
548 det[et[0]][endet] = 1
553 endet = '_'.join(et[1:])
554 det[et[0]] = {endet :1}
559 def make_etline(self, listet) :
560 etuces = [[] for et in listet]
561 for uci in self.ucis :
562 get = list(set(uci.etoiles).intersection(listet))
564 return '2 variables sur la meme ligne'
566 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile x class contingency table to ``fileout``.

    For each etoile returned by ``make_etoiles`` the number of its uces
    present in each class of ``ucecl`` is counted.  Rows are written as
    ``etoile;n1;n2;...`` joined by newlines and encoded with
    ``self.parametres['syscoding']``.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    lines = []
    for etoile in etoiles:
        # The uces of an etoile do not depend on the class: compute the set
        # once per etoile rather than once per (etoile, class) pair.
        etuces = set(self.getucesfrometoile(etoile))
        # repr() replaces the removed backtick syntax.
        counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
        lines.append(';'.join([etoile] + counts))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
576 def make_colored_corpus(self) :
578 for i, lc in enumerate(self.lc) :
581 for uce in self.lc0 :
583 color = ['black'] + colors[len(self.lc) - 1]
585 <meta http-equiv="content-Type" content="text/html; charset=%s" />
587 ''' % sys.getdefaultencoding()
588 res = self.getalluces()
593 if self.iduces[uce[0]].uci != actuci :
594 actuci = self.iduces[uce[0]].uci
595 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
596 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
598 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
599 return txt + '\n</body></html>'
601 def count_from_list(self, l, d) :
609 def count_from_list_cl(self, l, d, a, clnb) :
618 def find_segments(self, taille_segment, taille_limite) :
620 for uce in self.getalluces() :
622 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
623 l = [[d[val], val] for val in d if d[val] >= 3]
626 if len(l) > taille_limite :
627 l = l[-taille_limite:]
630 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
632 for uce in self.getconcorde(list_uce) :
634 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
635 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
638 if len(l) > taille_limite :
639 l = l[-taille_limite:]
642 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
644 for b, classe in enumerate(self.lc) :
645 for uce in self.getconcorde(classe) :
648 uce = [self.formes[forme].lem for forme in uce]
649 for taille_segment in range(lenmin,lenmax) :
650 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
651 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
652 with open(fileout, 'w') as f :
653 f.write('\n'.join([';'.join(line) for line in result]))
655 def make_proftype(self, outf) :
657 for lem in self.lems :
658 gram = self.lems[lem].gram
660 res[gram] = [0 for val in self.lc]
661 lemuceeff = self.getlemuceseff(lem)
662 for i, classe in enumerate(self.lc) :
663 concern = set(classe).intersection(lemuceeff.keys())
664 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
665 res = [[gram] + [`val` for val in res[gram]] for gram in res]
667 with open(outf, 'w') as f :
668 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
671 def make_ucecl_from_R(self, filein) :
672 with open(filein, 'rU') as f :
677 line = line.replace('\n', '').replace('"', '').split(';')
678 self.lc.append([int(line[0]) - 1, int(line[1])])
679 classesl = [val[1] for val in self.lc]
681 self.lc = sorted(self.lc, key=itemgetter(1))
682 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
683 self.lc0 = self.lc.pop(0)
686 def get_stat_by_cluster(self, outf) :
687 log.info('get_stat_by_cluster')
689 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
690 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
691 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
692 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
693 sets = [set(cl) for cl in self.lc]
694 for forme in self.formes :
695 formeuceeff = self.getformeuceseff(forme)
696 for i, classe in enumerate(self.lc) :
697 concern = sets[i].intersection(formeuceeff.keys())
699 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
701 if self.formes[forme].freq == 1 :
703 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
704 with open(outf, 'w') as f :
706 log.info('%f' % (time() - t1))
708 def gethapaxbyet(self, etoiles) :
709 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
711 for uce in hapaxuces :
712 if uce in hucesdict :
716 etuces = [[] for et in etoiles]
717 for uci in self.ucis :
718 get = list(set(uci.etoiles).intersection(etoiles))
720 return '2 variables sur la meme ligne'
722 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
723 etuces = [set(val) for val in etuces]
724 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
726 def gethapaxuces(self) :
727 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
728 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
730 for i,uce in enumerate(hapaxuces) :
731 if uce in hucesdict :
732 hucesdict[uce][0] += 1
733 hucesdict[uce][1].append(hapax[i])
735 hucesdict[uce] = [1,[hapax[i]]]
737 for uce in hucesdict :
738 if hucesdict[uce][0] in huces :
739 huces[hucesdict[uce][0]].append(uce)
741 huces[hucesdict[uce][0]] = [uce]
742 huces = zip(huces, huces.values())
743 huces.sort(reverse=True)
747 for nb in huces[0:4] :
748 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
750 res = self.getconcorde([uce])
752 ucetxt = ' ' + row[1] + ' '
754 for hap in hucesdict[uce][1] :
755 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
756 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
757 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
758 txt += '<p>'+ucetxt+'</p>\n'
762 with open('/tmp/testhapxuce.html','w') as f :
767 def __init__(self, corpus) :
768 ucinb = corpus.getucinb()
769 ucisize = corpus.getucisize()
770 ucimean = float(sum(ucisize))/float(ucinb)
771 detoile = corpus.make_etoiles_dict()
775 def __init__(self, iduci, line, paraset = None) :
777 self.etoiles = line.split()
779 if paraset is not None :
780 self.paras = paraset.split()
785 def __init__(self, iduce, idpara, iduci) :
791 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
797 if freq is not None :
803 def __init__(self, parent, forme) :
804 self.formes = {forme.ident : forme.freq}
805 self.gram = forme.gram
806 self.freq = forme.freq
def add_forme(self, forme):
    """Register ``forme`` under its ident and add its frequency to the total."""
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
813 def decouperlist(chaine, longueur, longueurOptimale) :
815 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
816 Si on trouve un '$', c'est fini.
817 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
819 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
820 dsep = dict([[val[0],val[1]] for val in separateurs])
821 trouve = False # si on a trouvé un bon séparateur
822 iDecoupe = 0 # indice du caractere ou il faut decouper
824 longueur = min(longueur, len(chaine) - 1)
825 chaineTravail = chaine[:longueur + 1]
827 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
830 indice = chaineTravail.index(u'$')
832 iDecoupe = indice - 1
837 caractere = chaineTravail[nbCar]
838 distance = abs(longueurOptimale - nbCar) + 1
839 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
840 if caractere in dsep :
841 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
842 meilleur[0] = caractere
843 meilleur[1] = dsep[caractere]
848 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
850 meilleur[1] = dsep[' ']
857 #if meilleur[0] != ' ' :
858 # fin = chaine[iDecoupe + 1:]
859 # retour = chaineTravail[:iDecoupe]
861 fin = chaine[iDecoupe + 1:]
862 retour = chaineTravail[:iDecoupe + 1]
863 return len(retour) > 0, retour, fin
864 # si on a rien trouvé
865 return False, chaine, ''
867 def testetoile(line) :
868 return line.startswith(u'****')
871 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append the ``$`` end marker."""
    words = txt.split()
    words.append(u'$')
    return words
876 def prep_txtcharact(txt) :
881 Class for building a corpus
883 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
884 log.info('begin building corpus...')
885 self.lexique = lexique
886 self.expressions = expressions
888 self.corpus = Corpus(self, parametres_corpus)
891 self.lim = parametres_corpus.get('lim', 1000000)
892 self.encoding = parametres_corpus['encoding']
893 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
894 self.corpus.pathout.createdir(parametres_corpus['pathout'])
895 self.corpus.parametres['uuid'] = str(uuid4())
896 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
897 self.corpus.parametres['type'] = 'corpus'
898 if self.corpus.parametres['keep_ponct'] :
899 self.ponctuation_espace = [' ', '']
901 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
903 self.tolist = self.corpus.parametres.get('tolist', 0)
910 def prep_makeuce(self) :
911 method = self.corpus.parametres.get('ucemethod', 0)
913 self.decouper = decouperlist
914 self.prep_txt = prep_txtlist
915 self.ucesize = self.corpus.parametres.get('ucesize', 40)
917 self.decouper = decoupercharact
918 self.prep_txt = prep_txtcharact
919 self.ucesize = self.corpus.parametres.get('ucesize', 240)
920 log.info('method uce : %s' % method)
925 self.read_corpus(self.infile)
926 except Warning, args :
927 log.info('pas kool %s' % args)
931 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
932 self.time = time() - t1
934 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
935 log.info('time : %f' % (time() - t1))
938 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
939 self.cf = self.conn_f.cursor()
940 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
941 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
943 self.cf = self.conn_f.cursor()
944 self.cf.execute('PRAGMA temp_store=MEMORY;')
945 self.cf.execute('PRAGMA journal_mode=MEMORY;')
946 self.cf.execute('PRAGMA synchronous = OFF;')
947 self.cf.execute('begin')
948 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
949 self.c = self.conn.cursor()
950 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
952 self.c = self.conn.cursor()
953 self.c.execute('PRAGMA temp_store=MEMORY;')
954 self.c.execute('PRAGMA journal_mode=MEMORY;')
955 self.c.execute('PRAGMA synchronous = OFF;')
956 self.c.execute('begin')
959 #commit index and close db
962 self.cf.execute('CREATE INDEX iduces ON uces (id);')
963 self.cf.execute('CREATE INDEX ideff ON eff (id);')
967 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
968 self.ccorpus = self.conn_corpus.cursor()
969 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
970 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
971 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
972 self.conn_corpus.commit()
973 self.ccorpus = self.conn_corpus.cursor()
974 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
975 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
976 self.ccorpus.execute('PRAGMA synchronous = OFF;')
977 self.ccorpus.execute('begin')
979 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
980 self.conn_corpus.commit()
981 self.conn_corpus.close()
982 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Append the enabled cleaning functions to ``self.cleans``, in order.

    Each step is switched by an entry of ``self.corpus.parametres``.  All
    options default to enabled except ``'charact'``, which must be present.
    When ``'charact'`` is set, ``self.rule`` is also initialised from
    ``'keep_caract'`` (or the built-in character class).
    """
    options = self.corpus.parametres
    cleans = self.cleans
    if options.get('lower', 1):
        cleans.append(self.dolower)
    if options.get('firstclean', 1):
        cleans.append(self.firstclean)
    if options['charact']:
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        cleans.append(self.docharact)
    if options.get('expressions', 1):
        cleans.append(self.make_expression)
    if options.get('apos', 1):
        cleans.append(self.doapos)
    if options.get('tiret', 1):
        cleans.append(self.dotiret)
999 def make_expression(self,txt) :
1000 for expression in self.expressions:
1001 if expression in txt :
1002 txt = txt.replace(expression, self.expressions[expression][0])
1005 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matching ``[<self.rule>]+`` by a space.

    ``self.rule`` is the content of a regex character class.
    """
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Replace every apostrophe in ``txt`` by a space."""
    apostrophe = u"'"
    return txt.replace(apostrophe, u' ')
def dotiret(self, txt):
    """Replace every hyphen in ``txt`` by a space."""
    hyphen = u'-'
    return txt.replace(hyphen, u' ')
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation in ``txt``.

    The typographic apostrophe becomes ``'`` and the ``oe`` ligature becomes
    ``oe``.  Ellipses (three dots or the single ellipsis character) become
    the `` £$£ `` token, and ``? . ! , ; :`` are surrounded by spaces.
    Replacements run in the listed order; the three-dot ellipsis must be
    handled before the single dot.
    """
    # Every literal is unicode.  The original final replacement used a plain
    # byte-string ' £$£ ', which on Python 2 is only decodable when the
    # process default encoding has been changed from ASCII.
    replacements = (
        (u'’', u"'"),
        (u'œ', u'oe'),
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in replacements:
        txt = txt.replace(old, new)
    return txt
1024 def make_cleans(self, txt) :
1025 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory forme/uce index to the database and reset it.

    ``self.corpus.idformesuces`` maps a forme identifier to a dictionary
    ``{uce identifier: count}``. For every forme, one row is written to the
    'uces' table (space-separated uce identifiers) and one to the 'eff'
    table (the matching space-separated counts). Both rows are built from
    the same pass over the inner dictionary, so the n-th identifier always
    lines up with the n-th count. Nothing is written when the index is
    empty.

    ``repr()`` replaces the backtick syntax, which gives the same strings
    under Python 2 and is the only spelling accepted by Python 3.
    """
    pending = self.corpus.idformesuces
    if not pending:
        return
    log.info('backup %i' % len(pending))
    touce = []
    toeff = []
    for forme, counts in pending.items():
        key = repr(forme)
        touce.append((key, ' '.join([repr(uce) for uce in counts.keys()])))
        toeff.append((key, ' '.join([repr(eff) for eff in counts.values()])))
    self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
    self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
    # Start a fresh index once everything has been handed to the cursor.
    self.corpus.idformesuces = {}
# Write the in-memory corpus description through the self.ccorpus cursor:
#   - 'etoiles': one row per uci (ident, joined etoiles, joined paras);
#   - 'luces'  : one row per uce (uci ident, uce para, uce ident), as reprs;
#   - 'formes' : one row per forme (ident, forme, lem, gram, freq).
1039 def backup_corpus(self) :
1040 log.info('start backup corpus')
1042 for uci in self.corpus.ucis :
1043 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1044 for uce in uci.uces :
1045 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1046 for forme in self.corpus.formes :
1047 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
# Log the elapsed time.
# NOTE(review): t is not assigned in the visible lines; presumably a start
# timestamp taken at the top of this method. Confirm.
1048 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the summary statistics of the built corpus in its parameters.

    The following keys of ``self.corpus.parametres`` are written:

    * 'date'        -- current date and time (``ctime`` format);
    * 'time'        -- ``self.time`` (seconds) rendered as hours/minutes/seconds;
    * 'ucinb'       -- result of ``corpus.getucinb()``;
    * 'ucenb'       -- result of ``corpus.getucenb()``;
    * 'occurrences' -- result of ``corpus.gettotocc()``;
    * 'formesnb'    -- number of entries in ``corpus.formes``;
    * 'hapax'       -- hapax count with its share of forms and occurrences.

    The percentages are reported as 0.00 when the corpus has no form or no
    occurrence, instead of raising ZeroDivisionError.
    """
    def percent(part, total):
        # Guard against an empty corpus: no total means no share.
        return (float(part) / total) * 100 if total else 0.0

    corpus = self.corpus
    params = corpus.parametres

    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    params['formesnb'] = len(corpus.formes)

    hapaxnb = corpus.gethapaxnb()
    pourhapaxf = percent(hapaxnb, params['formesnb'])
    pourhapaxocc = percent(hapaxnb, params['occurrences'])
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Corpus builder that reads a text file line by line: lines accepted by
# self.testuci open a new text (Uci), lines starting with '-*' are recorded
# as paragraph markers, and other non-blank lines are handled as text.
1065 class BuildFromAlceste(BuildCorpus) :
# Read infile and populate self.corpus.ucis, delegating text chunks to
# self.treattxt. Raises Exception with a short code string ('paragrapheOT',
# 'EmptyText', 'TextBeforeTextMark', 'CorpusEncoding') on malformed input.
1066 def read_corpus(self, infile) :
1067 if self.dlg is not None :
1068 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' selects the predicate that recognises a text-header line:
# 0 -> testetoile, 1 -> testint.
1071 if self.corpus.parametres['ucimark'] == 0 :
1072 self.testuci = testetoile
1073 elif self.corpus.parametres['ucimark'] == 1 :
1074 self.testuci = testint
# The file is decoded with self.encoding; a UnicodeDecodeError is re-raised
# at the end of this method as Exception("CorpusEncoding").
1080 with codecs.open(infile, 'r', self.encoding) as f :
1081 for linenb, line in enumerate(f) :
1082 line = line.rstrip('\n\r')
# Text-header line: pending text is passed to treattxt (for the previous
# text, hence iduci - 1) and a new Uci is appended.
1083 if self.testuci(line) :
1086 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1088 self.corpus.ucis.append(Uci(iduci, line))
# A text that produced no uce is logged and removed before the new Uci is
# appended in its place.
1091 if self.corpus.ucis[-1].uces == [] :
1092 log.info(u'Empty text : %i' % linenb)
1094 self.corpus.ucis.pop()
1095 #raise Exception("EmptyText %i" % linenb)
1096 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog whenever (iduci + 1) is a multiple of 10.
1097 if self.dlg is not None :
1098 if not (iduci + 1) % 10 :
1099 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph marker: its first whitespace-separated token is stored in the
# paras list of the current Uci.
1100 elif line.startswith(u'-*') :
1103 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1106 self.corpus.ucis[-1].paras.append(line.split()[0])
1108 raise Exception('paragrapheOT')
# Non-blank line seen after at least one text header (iduci != -1).
1109 elif line.strip() != '' and iduci != -1 :
1111 if txt != [] and iduci != -1 :
1112 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1115 raise Exception("EmptyText")
1116 if iduci != -1 and iduce != -1:
1119 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1120 raise Exception('TextBeforeTextMark')
1121 except UnicodeDecodeError :
1122 raise Exception("CorpusEncoding")
# Clean a chunk of text, split it into uces, store each uce through self.c
# in the 'uces' table and register its words with self.corpus.add_word.
# Returns the pair (iduce, idpara).
1124 def treattxt(self, txt, iduce, idpara, iduci) :
# ucemethod == 2 with 'douce' set: the pieces of txt are joined with a
# sentinel token, cleaned as one string, then split back on the sentinel.
# NOTE(review): presumably yields one uce per input line; confirm.
1125 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1126 txt = 'laphrasepoursplitter'.join(txt)
1127 txt = self.make_cleans(txt)
# Drop the tokens listed in self.ponctuation_espace.
1128 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1129 ucetxt = txt.split('laphrasepoursplitter')
# Otherwise the cleaned text is segmented by make_uces.
1132 txt = self.make_cleans(txt)
1133 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1134 if self.corpus.ucis[-1].paras == [] :
# Each uce is attached to the current Uci and its text is inserted in the
# 'uces' table, keyed by the repr of iduce.
1138 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1139 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1140 if not self.tolist :
1146 self.corpus.add_word(word)
1147 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): self.last / self.lim look like a flush threshold; the action
# taken is not visible here. Confirm.
1148 if self.last > self.lim :
1151 return iduce, idpara
# Split txt into a list of uce strings.
# NOTE(review): douce presumably toggles the splitting and keep_ponct the
# removal of punctuation tokens; the branches using them are not visible
# here. Confirm.
1153 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1154 txt = ' '.join(txt.split())
# self.decouper is called with self.ucesize + 15 and self.ucesize and
# returns (reste, texte_uce, suite); suite is fed back into the next call.
1157 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Tokens listed in self.ponctuation_espace are left out of the uce text.
1159 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1162 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1163 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-uce result: the whole text, minus self.ponctuation_espace tokens.
1168 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1170 #decouper (list_sep)
1171 #make_uces (decouper)
1172 #treat_txt (make_uces)
# Load the 'corpus' options from the user's corpus.cfg, show the CorpusPref
# dialog and, when the dialog is accepted, load the lexicon and expression
# dictionary for the chosen language and keep the resulting parameters.
1176 def __init__(self, parent, dlg = None) :
1177 self.parent = parent
1179 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1180 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1181 dial = CorpusPref(parent, parametres)
1182 dial.CenterOnParent()
1183 dial.txtpath.SetLabel(parent.filename)
1184 #dial.repout_choices.SetValue(parametres['pathout'])
1185 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the toolkit's "OK" return code (wx.ID_OK);
# confirm and prefer the named constant.
1186 if self.res == 5100 :
1187 parametres = dial.doparametres()
1188 parametres['originalpath'] = parent.filename
1189 PathOut().createdir(parametres['pathout'])
1190 ReadLexique(self.parent, lang = parametres['lang'])
# The expression dictionary is looked up under '<lang>_exp'; 'french_exp' is
# the fallback passed to DictPath.get.
1191 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1192 self.parametres = parametres
1194 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    A ``BuildFromAlceste`` builder is created with the parent's file name,
    the parameters kept on this object, the parent's lexicon and expression
    dictionary, and the progress dialog; its ``corpus`` attribute is the
    result.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
# Script entry point: build a corpus directly with a minimal parameter set.
# NOTE(review): infile and encoding are not assigned in the visible lines;
# presumably set just above. Confirm.
1202 if __name__ == '__main__' :
1204 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1205 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)