1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded in the formes database for one form.

    wordid -- either the form itself (a string, resolved to its numeric
              ident through ``self.formes``) or the ident directly.

    Every matching row of the ``uces`` table holds either a
    whitespace-separated string of ids or a single integer; both shapes
    are flattened into one list of ints, in row order.
    """
    # (str, type(u'')) is (str, unicode) on Python 2 -- exactly what
    # ``basestring`` covered -- and stays valid on Python 3.
    if isinstance(wordid, (str, type(u''))):
        wordid = self.formes[wordid].ident
    # repr() replaces the deprecated backtick syntax; the bound text is
    # identical to what the backticks produced.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend(int(val) for val in cell.split())
    return uces
def getlemuces(self, lem):
    """Return the distinct uce ids in which any form of *lem* is recorded.

    The form idents are the keys of ``self.lems[lem].formes``; the
    matching rows of the ``uces`` table are flattened (a cell is either a
    whitespace-separated string of ids or a single integer) and
    de-duplicated. The order of the returned list is unspecified.
    """
    # repr() replaces the deprecated backtick syntax (same text for ints).
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    found = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            found.add(cell)
        else:
            found.update(int(val) for val in cell.split())
    return list(found)
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces found for *lem*.

    Each uce id given by ``getlemuces(lem)`` is looked up with
    ``getucefromid`` and its ``uci`` attribute collected; the order of
    the returned list is unspecified.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
140 def getlemuceseff(self, lem) :
141 formesid = ', '.join([`val` for val in self.lems[lem].formes])
142 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
143 res = self.cformes.execute(query)
144 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
146 res = self.cformes.execute(query)
147 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
149 for i, uce in enumerate(uces) :
150 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry stored for *lem* in ``self.lems``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the ``self.idformes`` entry for *formeid*.

    The index is built on first use by calling ``make_idformes()`` when
    ``self.idformes`` is still None.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attribute over every entry of ``self.formes``."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()``, as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
170 return self.ucis[-1].uces[-1].ident + 1
173 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for every uci, the summed size of its uces.

    ``getucesize()`` gives one size per uce; for each uci the slice
    running from the ident of its first uce to the ident of its last uce
    (inclusive) is summed.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens in column 1 of each row given by ``getalluces()``."""
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
183 # def getlemseff(self) :
184 # if self.idformes is None :
185 # self.make_idformes()
186 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
188 # def getlemsefftype(self) :
189 # if self.idformes is None :
190 # self.make_idformes()
191 # if self.lems is None :
193 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Run a SELECT on the ``uces`` table for the given ids.

    uces -- iterable of uce ids; their repr() texts are joined into the
            ``IN (...)`` list of the query.

    Returns what ``self.cuces.execute`` returns for that query.
    """
    # repr() replaces the deprecated backtick syntax (same text for ints).
    idlist = ', '.join([repr(uceid) for uceid in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uce ids that ``getworduces`` gives for *word*."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uce ids that ``getlemuces`` gives for *lem*."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Execute ``SELECT * FROM uces`` on ``self.cuces`` and return the result of that call."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the idents of all uces belonging to a uci whose ``etoiles`` contain *etoile*."""
    selected = []
    for uci in self.ucis:
        if etoile not in uci.etoiles:
            continue
        for uce in uci.uces:
            selected.append(uce.ident)
    return selected
def getucefromid(self, uceid):
    """Return the ``self.iduces`` entry for *uceid*.

    ``make_iduces()`` is called first when ``self.iduces`` is still None.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    return sum(1 for word in self.formes.values() if word.freq == 1)
def getactivesnb(self, key):
    """Return how many entries of ``self.lems`` have an ``act`` attribute equal to *key*."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
219 # def make_lems(self, lem = True) :
220 # log.info('make lems')
222 # for forme in self.formes :
223 # if self.formes[forme].lem in self.lems :
224 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
225 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
227 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that holds the uce *uceid*.

    The uce-ident -> uci-ident mapping is cached in ``self.uceuci`` and
    built on first use; the uci ident is then used as an index into
    ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    return self.ucis[self.uceuci[uceid]].etoiles
233 def make_lems(self, lem = True) :
234 log.info('make lems')
237 for forme in self.formes :
238 if self.formes[forme].lem in self.lems :
239 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
240 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
242 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
244 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``: a dict mapping each entry's ``ident`` to the entry itself, for all values of ``self.formes``."""
    index = {}
    for word in self.formes.values():
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces`` (uce ident -> uce object) from every uce of every uci, unless it already exists."""
    if self.iduces is not None:
        return
    self.iduces = dict((uce.ident, uce) for uci in self.ucis for uce in uci.uces)
253 def make_lexitable(self, mineff, etoiles) :
254 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
255 etuces = [[] for et in etoiles]
256 for uci in self.ucis :
257 get = list(set(uci.etoiles).intersection(etoiles))
259 return '2 variables sur la meme ligne'
261 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
262 etuces = [set(val) for val in etuces]
265 deff = self.getlemuceseff(lem)
267 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
268 tab.insert(0, [''] + etoiles)
271 def make_efftype_from_etoiles(self, etoiles) :
273 etuces = [[] for et in etoiles]
274 for uci in self.ucis :
275 get = list(set(uci.etoiles).intersection(etoiles))
277 return '2 variables sur la meme ligne'
279 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
280 etuces = [set(val) for val in etuces]
281 for lem in self.lems :
282 deff = self.getlemuceseff(lem)
284 gram = self.lems[lem].gram
286 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
288 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
289 tabout = [[gram] + dtype[gram] for gram in dtype]
290 tabout.insert(0, [''] + etoiles)
293 def make_uceactsize(self, actives) :
294 res = self.getalluces()
297 deff = self.getlemuceseff(lem)
299 ucesize[uce] = ucesize.get(uce, 0) + 1
302 def make_uc(self, actives, lim1, lim2) :
303 uceactsize = self.make_uceactsize(actives)
309 for uce in [uce for uci in self.ucis for uce in uci.uces] :
310 if uce.para == lastpara :
312 last1 += uceactsize.get(uce.ident,0)
313 uc1[-1].append(uce.ident)
315 uc1.append([uce.ident])
318 last2 += uceactsize.get(uce.ident, 0)
319 uc2[-1].append(uce.ident)
321 uc2.append([uce.ident])
324 last1 = uceactsize.get(uce.ident, 0)
325 last2 = uceactsize.get(uce.ident, 0)
327 uc1.append([uce.ident])
328 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and uce/uc lists.

    ``make_uc(actives, sizeuc1, sizeuc2)`` supplies the two groupings
    (each a list of lists of uce ids). Each grouping is written with
    ``write_ucmatrix`` to *uc1out* / *uc2out*; then *listuce1out* /
    *listuce2out* receive a ``uce;uc`` header followed by one
    ``<uce>;<group index>`` line per uce.

    Returns the tuple (number of groups in uc1, number of groups in uc2).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for groups, target in ((uc1, listuce1out), (uc2, listuce2out)):
        rows = ['uce;uc']
        for position, members in enumerate(groups):
            # repr() replaces the deprecated backtick syntax.
            rows.extend(';'.join([repr(uce), repr(position)]) for uce in members)
        with open(target, 'w') as f:
            f.write('\n'.join(rows))
    return len(uc1), len(uc2)
344 def write_ucmatrix(self, uc, actives, fileout) :
345 log.info('write uc matrix %s' % fileout)
346 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
349 with open(fileout + '~', 'w+') as f :
350 for i, lem in enumerate(actives) :
351 for uce in self.getlemuces(lem):
352 if (uces_uc[uce], i) not in deja_la :
354 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
355 deja_la[(uces_uc[uce], i)] = 0
357 with open(fileout, 'w') as ffin :
358 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
361 os.remove(fileout + '~')
364 def export_corpus(self, outf) :
365 #outf = 'export_corpus.txt'
367 res = self.getalluces()
371 with open(outf,'w') as f :
373 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
374 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
375 elif self.iduces[uce[0]].uci != actuci :
376 actuci = self.iduces[uce[0]].uci
377 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
378 actpara = self.iduces[uce[0]].para
379 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
382 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
384 elif self.iduces[uce[0]].para != actpara :
385 actpara = self.iduces[uce[0]].para
387 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
389 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
390 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
392 with open(outfile + '~', 'w+') as f :
393 for i, lem in enumerate(actives) :
394 for uce in sorted(self.getlemuces(lem)) :
396 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
398 with open(outfile, 'w') as ffin :
399 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
402 os.remove(outfile + '~')
404 with open(listuce, 'w') as f :
405 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
407 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
408 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
410 with open(outfile + '~', 'w+') as f :
411 for i, lem in enumerate(actives) :
412 for uci in sorted(self.getlemucis(lem)) :
414 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
416 with open(outfile, 'w') as ffin :
417 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
420 os.remove(outfile + '~')
422 with open(listuci, 'w') as f :
423 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
425 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
426 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
428 duces = dict([[uce, i] for i, uce in enumerate(uces)])
429 with open(outfile + '~', 'w+') as f :
430 for i, lem in enumerate(actives) :
431 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
433 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
435 with open(outfile, 'w') as ffin :
436 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
439 os.remove(outfile + '~')
441 def make_table_with_classe(self, uces, list_act) :
442 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
443 uces = dict([[uce, i] for i, uce in enumerate(uces)])
444 for i, lem in enumerate(list_act) :
445 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
447 table_uce[uces[uce]][i] = 1
448 table_uce.insert(0, list_act)
451 def parse_active(self, gramact, gramsup = None) :
452 log.info('parse actives')
453 for lem in self.lems :
454 if self.lems[lem].gram in gramact :
455 self.lems[lem].act = 1
456 elif gramsup is not None :
457 if self.lems[lem].gram in gramsup :
458 self.lems[lem].act = 2
460 self.lems[lem].act = 0
462 self.lems[lem].act = 2
464 def make_actives_limit(self, limit, key = 1) :
465 if self.idformes is None :
467 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
469 def make_actives_nb(self, nbmax, key) :
470 log.info('make_actives_nb : %i - %i' % (nbmax,key))
471 if self.idformes is None :
473 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
474 self.activenb = len(allactives)
475 allactives = sorted(allactives, reverse = True)
476 if len(allactives) <= nbmax :
477 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
478 return [val[1] for val in allactives], allactives[-1][0]
480 effs = [val[0] for val in allactives]
481 if effs.count(effs[nbmax - 1]) > 1 :
482 lim = effs[nbmax - 1] + 1
486 stop = effs.index(lim)
492 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
493 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class count table to *fileout*.

    actives -- lemmas to tabulate.
    ucecl   -- one collection of uce ids per class.

    For each lemma, the number of its uces (``getlemuces``) falling in
    each class is counted; lemmas whose counts sum to less than 3 are
    dropped. Each kept lemma gives one ``lem;n1;n2;...`` line; the joined
    text is encoded with ``self.parametres['syscoding']`` before writing.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # Fetch the lemma's uces once instead of once per class.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() replaces the deprecated backtick syntax.
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
502 def make_etoiles(self) :
504 for uci in self.ucis :
505 etoiles.update(uci.etoiles[1:] + uci.paras)
508 def make_etoiles_dict(self) :
509 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
511 for etoile in etoiles :
512 et = etoile.split('_')
515 endet = '_'.join(et[1:])
516 if endet in det[et[0]] :
517 det[et[0]][endet] += 1
519 det[et[0]][endet] = 1
524 endet = '_'.join(et[1:])
525 det[et[0]] = {endet :1}
530 def make_etline(self, listet) :
531 etuces = [[] for et in listet]
532 for uci in self.ucis :
533 get = list(set(uci.etoiles).intersection(listet))
535 return '2 variables sur la meme ligne'
537 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class count table to *fileout*.

    ucecl -- one collection of uce ids per class.

    For each etoile given by ``make_etoiles()``, the number of its uces
    (``getucesfrometoile``) falling in each class is written as one
    ``etoile;n1;n2;...`` line; the joined text is encoded with
    ``self.parametres['syscoding']`` before writing.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f:
        lines = []
        for etoile in etoiles:
            # Collect the etoile's uces once instead of once per class.
            etuces = set(self.getucesfrometoile(etoile))
            # repr() replaces the deprecated backtick syntax.
            counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
547 def make_colored_corpus(self) :
549 for i, lc in enumerate(self.lc) :
552 for uce in self.lc0 :
554 color = ['black'] + colors[len(self.lc) - 1]
556 <meta http-equiv="content-Type" content="text/html; charset=%s" />
558 ''' % sys.getdefaultencoding()
559 res = self.getalluces()
564 if self.iduces[uce[0]].uci != actuci :
565 actuci = self.iduces[uce[0]].uci
566 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
567 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
569 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
570 return txt + '\n</body></html>'
572 def count_from_list(self, l, d) :
580 def count_from_list_cl(self, l, d, a, clnb) :
589 def find_segments(self, taille_segment, taille_limite) :
591 for uce in self.getalluces() :
593 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
594 l = [[d[val], val] for val in d if d[val] >= 3]
597 if len(l) > taille_limite :
598 l = l[-taille_limite:]
601 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
603 for uce in self.getconcorde(list_uce) :
605 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
606 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
609 if len(l) > taille_limite :
610 l = l[-taille_limite:]
613 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
615 for b, classe in enumerate(self.lc) :
616 for uce in self.getconcorde(classe) :
619 uce = [self.formes[forme].lem for forme in uce]
620 for taille_segment in range(lenmin,lenmax) :
621 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
622 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
623 with open(fileout, 'w') as f :
624 f.write('\n'.join([';'.join(line) for line in result]))
626 def make_ucecl_from_R(self, filein) :
627 with open(filein, 'rU') as f :
632 line = line.replace('\n', '').replace('"', '').split(';')
633 self.lc.append([int(line[0]) - 1, int(line[1])])
634 classesl = [val[1] for val in self.lc]
636 self.lc = sorted(self.lc, key=itemgetter(1))
637 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
638 self.lc0 = self.lc.pop(0)
641 def gethapaxbyet(self, etoiles) :
642 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
644 for uce in hapaxuces :
645 if uce in hucesdict :
649 etuces = [[] for et in etoiles]
650 for uci in self.ucis :
651 get = list(set(uci.etoiles).intersection(etoiles))
653 return '2 variables sur la meme ligne'
655 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
656 etuces = [set(val) for val in etuces]
657 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
659 def gethapaxuces(self) :
660 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
661 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
663 for i,uce in enumerate(hapaxuces) :
664 if uce in hucesdict :
665 hucesdict[uce][0] += 1
666 hucesdict[uce][1].append(hapax[i])
668 hucesdict[uce] = [1,[hapax[i]]]
670 for uce in hucesdict :
671 if hucesdict[uce][0] in huces :
672 huces[hucesdict[uce][0]].append(uce)
674 huces[hucesdict[uce][0]] = [uce]
675 huces = zip(huces, huces.values())
676 huces.sort(reverse=True)
680 for nb in huces[0:4] :
681 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
683 res = self.getconcorde([uce])
685 ucetxt = ' ' + row[1] + ' '
687 for hap in hucesdict[uce][1] :
688 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
689 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
690 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
691 txt += '<p>'+ucetxt+'</p>\n'
695 with open('/tmp/testhapxuce.html','w') as f :
700 def __init__(self, corpus) :
701 ucinb = corpus.getucinb()
702 ucisize = corpus.getucisize()
703 ucimean = float(sum(ucisize))/float(ucinb)
704 detoile = corpus.make_etoiles_dict()
708 def __init__(self, iduci, line, paraset = None) :
710 self.etoiles = line.split()
712 if paraset is not None :
713 self.paras = paraset.split()
718 def __init__(self, iduce, idpara, iduci) :
724 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
730 if freq is not None :
736 def __init__(self, parent, forme) :
737 self.formes = {forme.ident : forme.freq}
738 self.gram = forme.gram
739 self.freq = forme.freq
def add_forme(self, forme):
    """Attach *forme* to this object: store its ``freq`` in ``self.formes`` under its ``ident`` and add that ``freq`` to ``self.freq``."""
    effectif = forme.freq
    self.formes[forme.ident] = effectif
    self.freq += effectif
746 def decouperlist(chaine, longueur, longueurOptimale) :
748 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
749 Si on trouve un '$', c'est fini.
750 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
752 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
753 dsep = dict([[val[0],val[1]] for val in separateurs])
754 trouve = False # si on a trouvé un bon séparateur
755 iDecoupe = 0 # indice du caractere ou il faut decouper
757 longueur = min(longueur, len(chaine) - 1)
758 chaineTravail = chaine[:longueur + 1]
760 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
763 indice = chaineTravail.index(u'$')
765 iDecoupe = indice - 1
770 caractere = chaineTravail[nbCar]
771 distance = abs(longueurOptimale - nbCar) + 1
772 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
773 if caractere in dsep :
774 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
775 meilleur[0] = caractere
776 meilleur[1] = dsep[caractere]
781 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
783 meilleur[1] = dsep[' ']
790 #if meilleur[0] != ' ' :
791 # fin = chaine[iDecoupe + 1:]
792 # retour = chaineTravail[:iDecoupe]
794 fin = chaine[iDecoupe + 1:]
795 retour = chaineTravail[:iDecoupe + 1]
796 return len(retour) > 0, retour, fin
797 # si on a rien trouvé
798 return False, chaine, ''
800 def testetoile(line) :
801 return line.startswith(u'****')
804 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split *txt* on whitespace and return the tokens followed by a final ``u'$'`` element."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
809 def prep_txtcharact(txt) :
814 Class for building a corpus
816 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
817 log.info('begin building corpus...')
818 self.lexique = lexique
819 self.expressions = expressions
821 self.corpus = Corpus(self, parametres_corpus)
824 self.lim = parametres_corpus.get('lim', 1000000)
825 self.encoding = parametres_corpus['encoding']
826 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
827 self.corpus.pathout.createdir(parametres_corpus['pathout'])
828 self.corpus.parametres['uuid'] = str(uuid4())
829 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
830 self.corpus.parametres['type'] = 'corpus'
831 if self.corpus.parametres['keep_ponct'] :
832 self.ponctuation_espace = [' ', '']
834 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
836 self.tolist = self.corpus.parametres.get('tolist', 0)
843 def prep_makeuce(self) :
844 method = self.corpus.parametres.get('ucemethod', 0)
846 self.decouper = decouperlist
847 self.prep_txt = prep_txtlist
848 self.ucesize = self.corpus.parametres.get('ucesize', 40)
850 self.decouper = decoupercharact
851 self.prep_txt = prep_txtcharact
852 self.ucesize = self.corpus.parametres.get('ucesize', 240)
853 log.info('method uce : %s' % method)
858 self.read_corpus(self.infile)
859 except Warning, args :
860 log.info('pas kool %s' % args)
864 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
865 self.time = time() - t1
867 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
868 log.info('time : %f' % (time() - t1))
871 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
872 self.cf = self.conn_f.cursor()
873 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
874 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
876 self.cf = self.conn_f.cursor()
877 self.cf.execute('PRAGMA temp_store=MEMORY;')
878 self.cf.execute('PRAGMA journal_mode=MEMORY;')
879 self.cf.execute('PRAGMA synchronous = OFF;')
880 self.cf.execute('begin')
881 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
882 self.c = self.conn.cursor()
883 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
885 self.c = self.conn.cursor()
886 self.c.execute('PRAGMA temp_store=MEMORY;')
887 self.c.execute('PRAGMA journal_mode=MEMORY;')
888 self.c.execute('PRAGMA synchronous = OFF;')
889 self.c.execute('begin')
892 #commit index and close db
895 self.cf.execute('CREATE INDEX iduces ON uces (id);')
896 self.cf.execute('CREATE INDEX ideff ON eff (id);')
900 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
901 self.ccorpus = self.conn_corpus.cursor()
902 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
903 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
904 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
905 self.conn_corpus.commit()
906 self.ccorpus = self.conn_corpus.cursor()
907 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
908 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
909 self.ccorpus.execute('PRAGMA synchronous = OFF;')
910 self.ccorpus.execute('begin')
912 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
913 self.conn_corpus.commit()
914 self.conn_corpus.close()
915 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the cleaning callables enabled in the corpus parameters.

    The options are examined in a fixed order -- lower, firstclean,
    charact, expressions, apos, tiret -- and each enabled one appends its
    handler. All options except ``charact`` default to enabled when
    absent; ``charact`` must be present. When ``charact`` is on,
    ``self.rule`` is also set from ``keep_caract`` (or its default).
    """
    params = self.corpus.parametres
    if params.get('lower', 1):
        self.cleans.append(self.dolower)
    if params.get('firstclean', 1):
        self.cleans.append(self.firstclean)
    if params['charact']:
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    if params.get('expressions', 1):
        self.cleans.append(self.make_expression)
    if params.get('apos', 1):
        self.cleans.append(self.doapos)
    if params.get('tiret', 1):
        self.cleans.append(self.dotiret)
932 def make_expression(self,txt) :
933 for expression in self.expressions:
934 if expression in txt :
935 txt = txt.replace(expression, self.expressions[expression][0])
938 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matching the class ``[<self.rule>]`` in *txt* with a single space."""
    pattern = u"[" + self.rule + "]+"
    return re.compile(pattern).sub(' ', txt)
def doapos(self, txt):
    """Return *txt* with every apostrophe (``'``) replaced by a space."""
    apostrophe = u"'"
    return txt.replace(apostrophe, u' ')
def dotiret(self, txt):
    """Return *txt* with every hyphen (``-``) replaced by a space."""
    hyphen = u'-'
    return txt.replace(hyphen, u' ')
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation in *txt*.

    The substitutions are applied in order: the curly apostrophe becomes
    a straight one, ``œ`` becomes ``oe``, then ``...`` and ``…`` become
    the `` £$£ `` marker and each of ``? . ! , ; :`` is surrounded by
    spaces. ``...`` is handled before the single ``.`` so an ellipsis is
    not split into three dots.
    """
    # Every literal is unicode: the original passed the non-ASCII marker
    # ' £$£ ' as a byte string for the last replacement, which can only
    # be mixed with unicode text when the default encoding is not ASCII.
    substitutions = (
        (u'’', u"'"),
        (u'œ', u'oe'),
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
957 def make_cleans(self, txt) :
958 for clean in self.cleans :
962 def backup_uce(self) :
963 if self.corpus.idformesuces != {} :
964 log.info('backup %i' % len(self.corpus.idformesuces))
965 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
966 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
967 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
968 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
969 self.corpus.idformesuces = {}
972 def backup_corpus(self) :
973 log.info('start backup corpus')
975 for uci in self.corpus.ucis :
976 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
977 for uce in uci.uces :
978 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
979 for forme in self.corpus.formes :
980 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
981 log.info('%f' % (time() - t))
984 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
985 minutes, seconds = divmod(self.time, 60)
986 hours, minutes = divmod(minutes, 60)
987 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
988 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
989 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
990 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
991 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
992 hapaxnb = self.corpus.gethapaxnb()
993 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
994 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
995 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
998 class BuildFromAlceste(BuildCorpus) :
999 def read_corpus(self, infile) :
1000 if self.dlg is not None :
1001 self.dlg.Pulse('textes : 0 - segments : 0')
1004 if self.corpus.parametres['ucimark'] == 0 :
1005 self.testuci = testetoile
1006 elif self.corpus.parametres['ucimark'] == 1 :
1007 self.testuci = testint
1013 with codecs.open(infile, 'r', self.encoding) as f :
1014 for linenb, line in enumerate(f) :
1015 line = line.rstrip('\n\r')
1016 if self.testuci(line) :
1019 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1021 self.corpus.ucis.append(Uci(iduci, line))
1024 if self.corpus.ucis[-1].uces == [] :
1025 log.info(u'Empty text : %i' % linenb)
1027 self.corpus.ucis.pop()
1028 #raise Exception("EmptyText %i" % linenb)
1029 self.corpus.ucis.append(Uci(iduci, line))
1030 if self.dlg is not None :
1031 if not (iduci + 1) % 10 :
1032 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1033 elif line.startswith(u'-*') :
1036 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1039 self.corpus.ucis[-1].paras.append(line.split()[0])
1041 raise Exception('paragrapheOT')
1042 elif line.strip() != '' and iduci != -1 :
1044 if txt != [] and iduci != -1 :
1045 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1048 raise Exception("EmptyText")
1049 if iduci != -1 and iduce != -1:
1052 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1053 raise Exception('TextBeforeTextMark')
1054 except UnicodeDecodeError :
1055 raise Exception("CorpusEncoding")
1057 def treattxt(self, txt, iduce, idpara, iduci) :
1058 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1059 txt = 'laphrasepoursplitter'.join(txt)
1060 txt = self.make_cleans(txt)
1061 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1062 ucetxt = txt.split('laphrasepoursplitter')
1065 txt = self.make_cleans(txt)
1066 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1067 if self.corpus.ucis[-1].paras == [] :
1071 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1072 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1073 if not self.tolist :
1079 self.corpus.add_word(word)
1080 #if self.dlg is not None :
1081 # if self.limitshow > self.count :
1082 # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1084 # self.limitshow = 0
1086 # self.limitshow = self.last / 100000
1087 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1088 if self.last > self.lim :
1091 return iduce, idpara
1093 def make_uces(self, txt, douce = True, keep_ponct = False) :
1094 txt = ' '.join(txt.split())
1097 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1105 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1108 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1116 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1118 #print 'RESTEE UUCEEEEEEEEEEEEE', uce
1122 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1124 #decouper (list_sep)
1125 #make_uces (decouper)
1126 #treat_txt (make_uces)
1130 def __init__(self, parent, dlg = None) :
1131 self.parent = parent
1133 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1134 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1135 dial = CorpusPref(parent, parametres)
1136 dial.CenterOnParent()
1137 dial.txtpath.SetLabel(parent.filename)
1138 #dial.repout_choices.SetValue(parametres['pathout'])
1139 self.res = dial.ShowModal()
1140 if self.res == 5100 :
1141 parametres = dial.doparametres()
1142 parametres['originalpath'] = parent.filename
1143 PathOut().createdir(parametres['pathout'])
1144 ReadLexique(self.parent, lang = parametres['lang'])
1145 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1146 self.parametres = parametres
def doanalyse(self):
    """Instantiate ``BuildFromAlceste`` for the parent's file and return its ``corpus`` attribute.

    The builder receives the parent's filename, ``self.parametres``, the
    parent's lexique and expressions, and ``self.dlg`` as ``dlg``.
    """
    source = self.parent
    builder = BuildFromAlceste(source.filename, self.parametres, source.lexique, source.expressions, dlg = self.dlg)
    return builder.corpus
1153 if __name__ == '__main__' :
1155 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1156 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)