1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the list of uce ids stored for one form.

    wordid -- the integer id of a form, or the form itself as a string, in
              which case its id is taken from self.formes[wordid].ident.

    Rows are read from the 'uces' table through the self.cformes cursor.
    A row value is either a single integer or a string of space separated
    integers; both are flattened into one list of ints.
    """
    try :
        string_types = basestring  # Python 2
    except NameError :
        string_types = str  # basestring does not exist on Python 3
    if isinstance(wordid, string_types) :
        wordid = self.formes[wordid].ident
    # repr() is the portable spelling of the former backtick expression
    res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in res :
        if isinstance(row[0], int) :
            uces.append(row[0])
        else :
            uces.extend(int(val) for val in row[0].split())
    return uces
def getlemuces(self, lem) :
    """Return the distinct uce ids stored for all the forms of a lemma.

    lem -- key of self.lems; the ids of its forms are the keys of
           self.lems[lem].formes.

    The result is a list without duplicates, in no particular order.
    """
    # '%d' guarantees that only plain integers end up in the SQL text
    # (backtick repr gave '5L' for Python 2 longs and is invalid on Python 3).
    formesid = ', '.join(['%d' % val for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = set()
    for row in self.cformes.execute(query) :
        if isinstance(row[0], int) :
            uces.add(row[0])
        else :
            # a row holds space separated uce ids
            uces.update(int(val) for val in row[0].split())
    return list(uces)
def getlemucis(self, lem) :
    """Return the distinct 'uci' attributes of the uces found for a lemma.

    Every uce id returned by self.getlemuces(lem) is resolved with
    self.getucefromid(); the result is a list without duplicates, in no
    particular order.
    """
    ucis = set()
    for uceid in self.getlemuces(lem) :
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
140 def getlemuceseff(self, lem) :
141 formesid = ', '.join([`val` for val in self.lems[lem].formes])
142 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
143 res = self.cformes.execute(query)
144 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
146 res = self.cformes.execute(query)
147 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
149 for i, uce in enumerate(uces) :
150 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem) :
    """Return the 'freq' attribute of the lemma stored under key lem in self.lems."""
    lemme = self.lems[lem]
    return lemme.freq
def getforme(self, formeid) :
    """Return the entry of self.idformes stored under formeid.

    self.idformes is built on first use by calling self.make_idformes().
    """
    if self.idformes is None :
        self.make_idformes()
    return self.idformes[formeid]
def gettotocc(self) :
    """Return the sum of the 'freq' attributes of every value of self.formes."""
    return sum(word.freq for word in self.formes.values())
def getucemean(self) :
    """Return self.gettotocc() divided by self.getucenb(), as a float."""
    total = float(self.gettotocc())
    nb = self.getucenb()
    return total / nb
170 return self.ucis[-1].uces[-1].ident + 1
173 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return one size per uci of self.ucis.

    The size of a uci is the sum of the values of self.getucesize() between
    the ident of its first uce and the ident of its last uce (inclusive).
    """
    sizes = self.getucesize()
    ucisizes = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        ucisizes.append(sum(sizes[first:last + 1]))
    return ucisizes
def getucesize(self) :
    """Return, for every row of self.getalluces(), the number of
    whitespace separated items in its second column."""
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
183 # def getlemseff(self) :
184 # if self.idformes is None :
185 # self.make_idformes()
186 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
188 # def getlemsefftype(self) :
189 # if self.idformes is None :
190 # self.make_idformes()
191 # if self.lems is None :
193 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces) :
    """Run a select on the 'uces' table for the given uce ids.

    uces -- iterable of integer uce ids.

    Returns what self.cuces.execute() returns for the query, so the rows
    can be iterated by the caller.
    """
    # '%d' replaces the Python-2-only backtick repr and ensures that only
    # integers are interpolated into the SQL text.
    idlist = ', '.join(['%d' % uceid for uceid in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word) :
    """Return self.getconcorde() applied to the uces of self.getworduces(word)."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem) :
    """Return self.getconcorde() applied to the uces of self.getlemuces(lem)."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self) :
    """Run 'SELECT * FROM uces' on the self.cuces cursor and return the result."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile) :
    """Return the idents of the uces of every uci whose 'etoiles' contain etoile."""
    idents = []
    for uci in self.ucis :
        if etoile not in uci.etoiles :
            continue
        idents.extend(uce.ident for uce in uci.uces)
    return idents
def getucefromid(self, uceid) :
    """Return the entry of self.iduces stored under uceid.

    self.iduces is built on first use by calling self.make_iduces().
    """
    if self.iduces is None :
        self.make_iduces()
    return self.iduces[uceid]
def gethapaxnb(self) :
    """Return the number of values of self.formes whose 'freq' equals 1."""
    return sum(1 for word in self.formes.values() if word.freq == 1)
def getactivesnb(self, key) :
    """Return the number of values of self.lems whose 'act' equals key."""
    return sum(1 for lemme in self.lems.values() if lemme.act == key)
219 # def make_lems(self, lem = True) :
220 # log.info('make lems')
222 # for forme in self.formes :
223 # if self.formes[forme].lem in self.lems :
224 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
225 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
227 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the 'etoiles' of the uci that owns the uce with id uceid.

    The uce ident -> uci ident mapping is stored in self.uceuci and built
    the first time it is needed. The uci ident is used as an index into
    self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    uciid = self.uceuci[uceid]
    return self.ucis[uciid].etoiles
233 def make_lems(self, lem = True) :
234 log.info('make lems')
237 for forme in self.formes :
238 if self.formes[forme].lem in self.lems :
239 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
240 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
242 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
244 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes: the values of self.formes indexed by their 'ident'."""
    self.idformes = dict((word.ident, word) for word in self.formes.values())
def make_iduces(self) :
    """Build self.iduces (uce ident -> uce) unless it already exists."""
    if self.iduces is not None :
        return
    index = {}
    for uci in self.ucis :
        for uce in uci.uces :
            index[uce.ident] = uce
    self.iduces = index
253 def make_lexitable(self, mineff, etoiles) :
254 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
255 etuces = [[] for et in etoiles]
256 for uci in self.ucis :
257 get = list(set(uci.etoiles).intersection(etoiles))
259 return '2 variables sur la meme ligne'
261 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
262 etuces = [set(val) for val in etuces]
265 deff = self.getlemuceseff(lem)
267 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
268 tab.insert(0, [''] + etoiles)
271 def make_efftype_from_etoiles(self, etoiles) :
273 etuces = [[] for et in etoiles]
274 for uci in self.ucis :
275 get = list(set(uci.etoiles).intersection(etoiles))
277 return '2 variables sur la meme ligne'
279 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
280 etuces = [set(val) for val in etuces]
281 for lem in self.lems :
282 deff = self.getlemuceseff(lem)
284 gram = self.lems[lem].gram
286 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
288 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
289 tabout = [[gram] + dtype[gram] for gram in dtype]
290 tabout.insert(0, [''] + etoiles)
293 def make_uceactsize(self, actives) :
294 res = self.getalluces()
297 deff = self.getlemuceseff(lem)
299 ucesize[uce] = ucesize.get(uce, 0) + 1
302 def make_uc(self, actives, lim1, lim2) :
303 uceactsize = self.make_uceactsize(actives)
309 for uce in [uce for uci in self.ucis for uce in uci.uces] :
310 if uce.para == lastpara :
312 last1 += uceactsize.get(uce.ident,0)
313 uc1[-1].append(uce.ident)
315 uc1.append([uce.ident])
318 last2 += uceactsize.get(uce.ident, 0)
319 uc2[-1].append(uce.ident)
321 uc2.append([uce.ident])
324 last1 = uceactsize.get(uce.ident, 0)
325 last2 = uceactsize.get(uce.ident, 0)
327 uc1.append([uce.ident])
328 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build two uc partitions and write their matrices and uce/uc lists.

    actives          -- passed to self.make_uc() and self.write_ucmatrix()
    sizeuc1, sizeuc2 -- the two limits given to self.make_uc()
    uc1out, uc2out   -- output paths handed to self.write_ucmatrix()
    listuce1out, listuce2out -- paths of the 'uce;uc' list files

    Each list file starts with the header 'uce;uc' followed by one
    'uce;index' line per uce, where index is the position of the uc that
    contains the uce.

    Returns the tuple (len(uc1), len(uc2)).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)) :
        # '%d' replaces the Python-2-only backtick repr of the integer ids
        lines = ['uce;uc'] + ['%d;%d' % (uce, i) for i, ucl in enumerate(uc) for uce in ucl]
        with open(listout, 'w') as f :
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
344 def write_ucmatrix(self, uc, actives, fileout) :
345 log.info('write uc matrix %s' % fileout)
346 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
349 with open(fileout + '~', 'w+') as f :
350 for i, lem in enumerate(actives) :
351 for uce in self.getlemuces(lem):
352 if (uces_uc[uce], i) not in deja_la :
354 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
355 deja_la[(uces_uc[uce], i)] = 0
357 with open(fileout, 'w') as ffin :
358 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
361 os.remove(fileout + '~')
364 def export_corpus(self, outf) :
365 #outf = 'export_corpus.txt'
367 res = self.getalluces()
371 with open(outf,'w') as f :
373 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
374 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
375 elif self.iduces[uce[0]].uci != actuci :
376 actuci = self.iduces[uce[0]].uci
377 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
378 actpara = self.iduces[uce[0]].para
379 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
382 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
384 elif self.iduces[uce[0]].para != actpara :
385 actpara = self.iduces[uce[0]].para
387 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
389 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
390 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
392 with open(outfile + '~', 'w+') as f :
393 for i, lem in enumerate(actives) :
394 for uce in sorted(self.getlemuces(lem)) :
396 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
398 with open(outfile, 'w') as ffin :
399 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
402 os.remove(outfile + '~')
404 with open(listuce, 'w') as f :
405 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
407 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
408 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
410 with open(outfile + '~', 'w+') as f :
411 for i, lem in enumerate(actives) :
412 for uci in sorted(self.getlemucis(lem)) :
414 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
416 with open(outfile, 'w') as ffin :
417 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
420 os.remove(outfile + '~')
422 with open(listuci, 'w') as f :
423 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
425 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
426 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
428 duces = dict([[uce, i] for i, uce in enumerate(uces)])
429 with open(outfile + '~', 'w+') as f :
430 for i, lem in enumerate(actives) :
431 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
433 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
435 with open(outfile, 'w') as ffin :
436 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
439 os.remove(outfile + '~')
441 def make_table_with_classe(self, uces, list_act) :
442 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
443 uces = dict([[uce, i] for i, uce in enumerate(uces)])
444 for i, lem in enumerate(list_act) :
445 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
447 table_uce[uces[uce]][i] = 1
448 table_uce.insert(0, list_act)
451 def parse_active(self, gramact, gramsup = None) :
452 log.info('parse actives')
453 for lem in self.lems :
454 if self.lems[lem].gram in gramact :
455 self.lems[lem].act = 1
456 elif gramsup is not None :
457 if self.lems[lem].gram in gramsup :
458 self.lems[lem].act = 2
460 self.lems[lem].act = 0
462 self.lems[lem].act = 2
464 def make_actives_limit(self, limit, key = 1) :
465 if self.idformes is None :
467 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
469 def make_actives_nb(self, nbmax, key) :
470 log.info('make_actives_nb : %i - %i' % (nbmax,key))
471 if self.idformes is None :
473 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
474 self.activenb = len(allactives)
475 allactives = sorted(allactives, reverse = True)
476 if len(allactives) <= nbmax :
477 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
478 return [val[1] for val in allactives], allactives[-1][0]
480 effs = [val[0] for val in allactives]
481 if effs.count(effs[nbmax - 1]) > 1 :
482 lim = effs[nbmax - 1] + 1
486 stop = effs.index(lim)
492 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
493 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma/class contingency table to fileout.

    actives -- iterable of lemmas
    ucecl   -- list of classes, each an iterable of uce ids
    fileout -- path of the file to write

    One ';' separated line is written per lemma: the lemma followed by the
    number of its uces found in each class. Lemmas whose counts sum to less
    than 3 are left out. The text is encoded with
    self.parametres['syscoding'].
    """
    log.info('formes/classes')
    lines = []
    for lem in actives :
        # self.getlemuces() runs a query: call it once per lemma, not once per class
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            lines.append(';'.join([lem] + ['%d' % val for val in counts]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
502 def make_etoiles(self) :
504 for uci in self.ucis :
505 etoiles.update(uci.etoiles[1:] + uci.paras)
508 def make_etoiles_dict(self) :
509 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
511 for etoile in etoiles :
512 et = etoile.split('_')
515 endet = '_'.join(et[1:])
516 if endet in det[et[0]] :
517 det[et[0]][endet] += 1
519 det[et[0]][endet] = 1
524 endet = '_'.join(et[1:])
525 det[et[0]] = {endet :1}
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile/class contingency table to fileout.

    ucecl   -- list of classes, each an iterable of uce ids
    fileout -- path of the file to write

    One ';' separated line is written per item of self.make_etoiles(): the
    etoile followed by the number of its uces found in each class. The text
    is encoded with self.parametres['syscoding'].
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    lines = []
    for etoile in etoiles :
        # the uces of an etoile do not depend on the class: compute them once
        etuces = set(self.getucesfrometoile(etoile))
        lines.append(';'.join([etoile] + ['%d' % len(etuces.intersection(classe)) for classe in ucecl]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
537 def make_colored_corpus(self) :
539 for i, lc in enumerate(self.lc) :
542 for uce in self.lc0 :
544 color = ['black'] + colors[len(self.lc) - 1]
546 <meta http-equiv="content-Type" content="text/html; charset=%s" />
548 ''' % sys.getdefaultencoding()
549 res = self.getalluces()
554 if self.iduces[uce[0]].uci != actuci :
555 actuci = self.iduces[uce[0]].uci
556 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
557 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
559 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
560 return txt + '\n</body></html>'
562 def count_from_list(self, l, d) :
570 def count_from_list_cl(self, l, d, a, clnb) :
579 def find_segments(self, taille_segment, taille_limite) :
581 for uce in self.getalluces() :
583 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
584 l = [[d[val], val] for val in d if d[val] >= 3]
587 if len(l) > taille_limite :
588 l = l[-taille_limite:]
591 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
593 for uce in self.getconcorde(list_uce) :
595 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
596 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
599 if len(l) > taille_limite :
600 l = l[-taille_limite:]
603 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
605 for b, classe in enumerate(self.lc) :
606 for uce in self.getconcorde(classe) :
609 uce = [self.formes[forme].lem for forme in uce]
610 for taille_segment in range(lenmin,lenmax) :
611 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
612 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
613 with open(fileout, 'w') as f :
614 f.write('\n'.join([';'.join(line) for line in result]))
616 def make_ucecl_from_R(self, filein) :
617 with open(filein, 'rU') as f :
622 line = line.replace('\n', '').replace('"', '').split(';')
623 self.lc.append([int(line[0]) - 1, int(line[1])])
624 classesl = [val[1] for val in self.lc]
626 self.lc = sorted(self.lc, key=itemgetter(1))
627 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
628 self.lc0 = self.lc.pop(0)
631 def gethapaxbyet(self, etoiles) :
632 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
634 for uce in hapaxuces :
635 if uce in hucesdict :
639 etuces = [[] for et in etoiles]
640 for uci in self.ucis :
641 get = list(set(uci.etoiles).intersection(etoiles))
643 return '2 variables sur la meme ligne'
645 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
646 etuces = [set(val) for val in etuces]
647 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
649 def gethapaxuces(self) :
650 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
651 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
653 for i,uce in enumerate(hapaxuces) :
654 if uce in hucesdict :
655 hucesdict[uce][0] += 1
656 hucesdict[uce][1].append(hapax[i])
658 hucesdict[uce] = [1,[hapax[i]]]
660 for uce in hucesdict :
661 if hucesdict[uce][0] in huces :
662 huces[hucesdict[uce][0]].append(uce)
664 huces[hucesdict[uce][0]] = [uce]
665 huces = zip(huces, huces.values())
666 huces.sort(reverse=True)
670 for nb in huces[0:4] :
671 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
673 res = self.getconcorde([uce])
675 ucetxt = ' ' + row[1] + ' '
677 for hap in hucesdict[uce][1] :
678 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
679 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
680 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
681 txt += '<p>'+ucetxt+'</p>\n'
685 with open('/tmp/testhapxuce.html','w') as f :
690 def __init__(self, corpus) :
691 ucinb = corpus.getucinb()
692 ucisize = corpus.getucisize()
693 ucimean = float(sum(ucisize))/float(ucinb)
694 detoile = corpus.make_etoiles_dict()
698 def __init__(self, iduci, line, paraset = None) :
700 self.etoiles = line.split()
702 if paraset is not None :
703 self.paras = paraset.split()
708 def __init__(self, iduce, idpara, iduci) :
714 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
720 if freq is not None :
726 def __init__(self, parent, forme) :
727 self.formes = {forme.ident : forme.freq}
728 self.gram = forme.gram
729 self.freq = forme.freq
def add_forme(self, forme) :
    """Register a form: store its freq under its ident in self.formes and
    add that freq to self.freq."""
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
736 def decouperlist(chaine, longueur, longueurOptimale) :
738 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
739 Si on trouve un '$', c'est fini.
740 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
742 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
743 dsep = dict([[val[0],val[1]] for val in separateurs])
744 trouve = False # si on a trouvé un bon séparateur
745 iDecoupe = 0 # indice du caractere ou il faut decouper
747 longueur = min(longueur, len(chaine) - 1)
748 chaineTravail = chaine[:longueur + 1]
750 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
753 indice = chaineTravail.index(u'$')
755 iDecoupe = indice - 1
760 caractere = chaineTravail[nbCar]
761 distance = abs(longueurOptimale - nbCar) + 1
762 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
763 if caractere in dsep :
764 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
765 meilleur[0] = caractere
766 meilleur[1] = dsep[caractere]
771 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
773 meilleur[1] = dsep[' ']
780 #if meilleur[0] != ' ' :
781 # fin = chaine[iDecoupe + 1:]
782 # retour = chaineTravail[:iDecoupe]
784 fin = chaine[iDecoupe + 1:]
785 retour = chaineTravail[:iDecoupe + 1]
786 return len(retour) > 0, retour, fin
787 # si on a rien trouvé
788 return False, chaine, ''
790 def testetoile(line) :
791 return line.startswith(u'****')
794 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split txt on whitespace and append the marker u'$' to the resulting list."""
    mots = txt.split()
    mots.append(u'$')
    return mots
799 def prep_txtcharact(txt) :
804 Class for building a corpus
806 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
807 log.info('begin building corpus...')
808 self.lexique = lexique
809 self.expressions = expressions
811 self.corpus = Corpus(self, parametres_corpus)
814 self.lim = parametres_corpus.get('lim', 1000000)
815 self.encoding = parametres_corpus['encoding']
816 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
817 self.corpus.pathout.createdir(parametres_corpus['pathout'])
818 self.corpus.parametres['uuid'] = str(uuid4())
819 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
820 self.corpus.parametres['type'] = 'corpus'
821 if self.corpus.parametres['keep_ponct'] :
822 self.ponctuation_espace = [' ', '']
824 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
826 self.tolist = self.corpus.parametres.get('tolist', 0)
833 def prep_makeuce(self) :
834 method = self.corpus.parametres.get('ucemethod', 0)
836 self.decouper = decouperlist
837 self.prep_txt = prep_txtlist
838 self.ucesize = self.corpus.parametres.get('ucesize', 40)
840 self.decouper = decoupercharact
841 self.prep_txt = prep_txtcharact
842 self.ucesize = self.corpus.parametres.get('ucesize', 240)
843 log.info('method uce : %s' % method)
848 self.read_corpus(self.infile)
849 except Warning, args :
850 log.info('pas kool %s' % args)
854 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
855 self.time = time() - t1
857 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
858 log.info('time : %f' % (time() - t1))
861 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
862 self.cf = self.conn_f.cursor()
863 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
864 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
866 self.cf = self.conn_f.cursor()
867 self.cf.execute('PRAGMA temp_store=MEMORY;')
868 self.cf.execute('PRAGMA journal_mode=MEMORY;')
869 self.cf.execute('PRAGMA synchronous = OFF;')
870 self.cf.execute('begin')
871 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
872 self.c = self.conn.cursor()
873 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
875 self.c = self.conn.cursor()
876 self.c.execute('PRAGMA temp_store=MEMORY;')
877 self.c.execute('PRAGMA journal_mode=MEMORY;')
878 self.c.execute('PRAGMA synchronous = OFF;')
879 self.c.execute('begin')
882 #commit index and close db
885 self.cf.execute('CREATE INDEX iduces ON uces (id);')
886 self.cf.execute('CREATE INDEX ideff ON eff (id);')
890 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
891 self.ccorpus = self.conn_corpus.cursor()
892 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
893 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
894 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
895 self.conn_corpus.commit()
896 self.ccorpus = self.conn_corpus.cursor()
897 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
898 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
899 self.ccorpus.execute('PRAGMA synchronous = OFF;')
900 self.ccorpus.execute('begin')
902 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
903 self.conn_corpus.commit()
904 self.conn_corpus.close()
905 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill self.cleans with the cleaning methods enabled in the parameters.

    The options are read from self.corpus.parametres. 'lower', 'firstclean',
    'expressions', 'apos' and 'tiret' default to enabled when absent;
    'charact' must be present. When 'charact' is set, self.rule is taken from
    'keep_caract' (or a default character class) before self.docharact is
    appended. Methods are appended in the order: dolower, firstclean,
    docharact, make_expression, doapos, dotiret.
    """
    options = self.corpus.parametres
    for key, cleaner in (('lower', 'dolower'), ('firstclean', 'firstclean')) :
        if options.get(key, 1) :
            self.cleans.append(getattr(self, cleaner))
    if options['charact'] :
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    for key, cleaner in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')) :
        if options.get(key, 1) :
            self.cleans.append(getattr(self, cleaner))
922 def make_expression(self,txt) :
923 for expression in self.expressions:
924 if expression in txt :
925 txt = txt.replace(expression, self.expressions[expression][0])
928 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace every run of characters matching the class [self.rule] by one space."""
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return txt with every apostrophe (') turned into a space."""
    return u' '.join(txt.split(u'\''))
def dotiret(self, txt) :
    """Return txt with every hyphen (-) turned into a space."""
    return u' '.join(txt.split(u'-'))
def firstclean(self, txt) :
    """Normalise a few characters and put spaces around punctuation marks.

    The typographic apostrophe becomes "'", the ligature 'œ' becomes 'oe',
    the punctuation marks ? . ! , ; : are surrounded by spaces, and both
    '...' and the ellipsis character become the marker ' £$£ '.

    Every literal is a unicode string: a byte string containing '£' would
    need an implicit ASCII decoding when passed to unicode.replace() on
    Python 2.
    """
    txt = txt.replace(u'’', u"'")
    txt = txt.replace(u'œ', u'oe')
    # order matters: '...' must become the marker before single dots are spaced
    replacements = (
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in replacements :
        txt = txt.replace(old, new)
    return txt
947 def make_cleans(self, txt) :
948 for clean in self.cleans :
952 def backup_uce(self) :
953 if self.corpus.idformesuces != {} :
954 log.info('backup %i' % len(self.corpus.idformesuces))
955 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
956 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
957 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
958 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
959 self.corpus.idformesuces = {}
962 def backup_corpus(self) :
963 log.info('start backup corpus')
965 for uci in self.corpus.ucis :
966 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
967 for uce in uci.uces :
968 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
969 for forme in self.corpus.formes :
970 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
971 log.info('%f' % (time() - t))
974 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
975 minutes, seconds = divmod(self.time, 60)
976 hours, minutes = divmod(minutes, 60)
977 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
978 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
979 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
980 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
981 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
982 hapaxnb = self.corpus.gethapaxnb()
983 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
984 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
985 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
988 class BuildFromAlceste(BuildCorpus) :
989 #def __init___(self, infile, parametres_corpus) :
990 # BuildCorpus.__init__(self, infile, parametres_corpus)
993 def read_corpus(self, infile) :
996 if self.corpus.parametres['ucimark'] == 0 :
997 self.testuci = testetoile
998 elif self.corpus.parametres['ucimark'] == 1 :
999 self.testuci = testint
1006 with codecs.open(infile, 'r', self.encoding) as f :
1009 line = line.rstrip('\n\r')
1010 if self.testuci(line) :
1013 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1015 self.corpus.ucis.append(Uci(iduci, line))
1018 if self.corpus.ucis[-1].uces == [] :
1019 log.info('linenb : %i' % linenb)
1020 raise Exception("EmptyText %i" % linenb)
1021 self.corpus.ucis.append(Uci(iduci, line))
1022 elif line.startswith(u'-*') :
1025 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1028 self.corpus.ucis[-1].paras.append(line.split()[0])
1030 raise Exception('paragrapheOT')
1031 elif line.strip() != '' and iduci != -1 :
1033 if txt != [] and iduci != -1 :
1034 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1037 raise Exception("EmptyText")
1038 if iduci != -1 and iduce != -1:
1041 log.info(_(u"No Texte in corpora. Are you sure of the formatting ?"))
1042 raise Exception('TextBeforeTextMark')
1043 except UnicodeDecodeError :
1044 raise Exception("CorpusEncoding")
1046 def treattxt(self, txt, iduce, idpara, iduci) :
1047 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1048 txt = 'laphrasepoursplitter'.join(txt)
1049 txt = self.make_cleans(txt)
1050 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1051 ucetxt = txt.split('laphrasepoursplitter')
1056 txt = self.make_cleans(txt)
1059 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1060 if self.corpus.ucis[-1].paras == [] :
1064 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1065 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1066 if not self.tolist :
1072 self.corpus.add_word(word)
1073 if self.dlg is not None :
1074 if self.limitshow > self.count :
1075 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
1079 self.limitshow = self.last / 100000
1080 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1081 if self.last > self.lim :
1084 return iduce, idpara
1086 def make_uces(self, txt, douce = True, keep_ponct = False) :
1087 txt = ' '.join(txt.split())
1090 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1098 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1101 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1109 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1111 #print 'RESTEE UUCEEEEEEEEEEEEE', uce
1115 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1117 #decouper (list_sep)
1118 #make_uces (decouper)
1119 #treat_txt (make_uces)
1123 def __init__(self, parent, dlg = None) :
1124 self.parent = parent
1126 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1127 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1128 dial = CorpusPref(parent, parametres)
1129 dial.CenterOnParent()
1130 dial.txtpath.SetLabel(parent.filename)
1131 #dial.repout_choices.SetValue(parametres['pathout'])
1132 self.res = dial.ShowModal()
1133 if self.res == 5100 :
1134 parametres = dial.doparametres()
1135 parametres['originalpath'] = parent.filename
1136 PathOut().createdir(parametres['pathout'])
1137 ReadLexique(self.parent, lang = parametres['lang'])
1138 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1139 self.parametres = parametres
def doanalyse(self) :
    """Build a BuildFromAlceste from the parent's filename, lexique and
    expressions plus self.parametres and self.dlg, and return its 'corpus'."""
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
1146 if __name__ == '__main__' :
1148 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1149 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)