1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the list of uce ids stored for one form.

    ``wordid`` is either a form identifier or the form itself (a string);
    a string is resolved to its identifier through ``self.formes``.
    Each matching row of the ``uces`` table read through ``self.cformes``
    contributes either a single integer or every whitespace-separated
    integer of its text value.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(wordid, string_types):
        wordid = self.formes[wordid].ident
    # repr() is the portable spelling of the former backtick expression.
    res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in res:
        if isinstance(row[0], int):
            uces.append(row[0])
        else:
            uces.extend([int(val) for val in row[0].split()])
    return uces
def getlemuces(self, lem):
    """Return the distinct uce ids stored for every form of a lemma.

    The form identifiers are taken from ``self.lems[lem].formes`` and
    interpolated into the query text; the rows are decoded the same way as
    in ``getworduces`` (single integer, or whitespace-separated integers).
    The order of the returned list is unspecified.
    """
    # repr() is the portable spelling of the former backtick expression.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = set()
    for row in self.cformes.execute(query):
        if isinstance(row[0], int):
            uces.add(row[0])
        else:
            uces.update([int(val) for val in row[0].split()])
    return list(uces)
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces found for a lemma.

    The uce ids come from ``getlemuces``; each one is resolved with
    ``getucefromid``. The order of the returned list is unspecified.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
140 def getlemuceseff(self, lem) :
141 formesid = ', '.join([`val` for val in self.lems[lem].formes])
142 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
143 res = self.cformes.execute(query)
144 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
146 res = self.cformes.execute(query)
147 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
149 for i, uce in enumerate(uces) :
150 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry stored under ``lem`` in ``self.lems``."""
    lemma = self.lems[lem]
    return lemma.freq
def getforme(self, formeid):
    """Return the form registered under ``formeid``.

    The identifier index ``self.idformes`` is built with ``make_idformes``
    the first time it is needed.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` of every entry of ``self.formes``."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
170 return self.ucis[-1].uces[-1].ident + 1
173 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The sizes come from ``getucesize``; for each uci the slice running from
    the ident of its first uce to the ident of its last uce (inclusive) is
    summed, so uce idents are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens in column 1 of each row of ``getalluces()``."""
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
183 # def getlemseff(self) :
184 # if self.idformes is None :
185 # self.make_idformes()
186 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
188 # def getlemsefftype(self) :
189 # if self.idformes is None :
190 # self.make_idformes()
191 # if self.lems is None :
193 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Run a select on the ``uces`` table for the given ids and return the result of ``execute``.

    The ids are interpolated into the query text with ``repr()`` (the
    portable spelling of the former backtick expression), so ``uces`` is
    expected to hold plain integers.
    """
    idlist = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uce ids that ``getworduces`` gives for ``word``."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uce ids that ``getlemuces`` gives for ``lem``."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Select every row of the ``uces`` table through ``self.cuces`` and return the result of ``execute``."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile):
    """Return the ident of every uce belonging to a uci whose ``etoiles`` contains ``etoile``."""
    idents = []
    for uci in self.ucis:
        if etoile not in uci.etoiles:
            continue
        for uce in uci.uces:
            idents.append(uce.ident)
    return idents
def getucefromid(self, uceid):
    """Return the entry stored under ``uceid`` in ``self.iduces``.

    The index is built with ``make_iduces`` when it is still ``None``.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return how many entries of ``self.lems`` have an ``act`` attribute equal to ``key``."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
219 # def make_lems(self, lem = True) :
220 # log.info('make lems')
222 # for forme in self.formes :
223 # if self.formes[forme].lem in self.lems :
224 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
225 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
227 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce ``uceid``.

    A uce-ident to uci-ident map is cached in ``self.uceuci`` on first use;
    the uci ident is then used as a position in ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    uci_index = self.uceuci[uceid]
    return self.ucis[uci_index].etoiles
233 def make_lems(self, lem = True) :
234 log.info('make lems')
237 for forme in self.formes :
238 if self.formes[forme].lem in self.lems :
239 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
240 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
242 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
244 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to the form object of ``self.formes``."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces`` (uce ident -> uce object) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
253 def make_lexitable(self, mineff, etoiles) :
254 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
255 etuces = [[] for et in etoiles]
256 for uci in self.ucis :
257 get = list(set(uci.etoiles).intersection(etoiles))
259 return '2 variables sur la meme ligne'
261 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
262 etuces = [set(val) for val in etuces]
265 deff = self.getlemuceseff(lem)
267 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
268 tab.insert(0, [''] + etoiles)
271 def make_efftype_from_etoiles(self, etoiles) :
273 etuces = [[] for et in etoiles]
274 for uci in self.ucis :
275 get = list(set(uci.etoiles).intersection(etoiles))
277 return '2 variables sur la meme ligne'
279 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
280 etuces = [set(val) for val in etuces]
281 for lem in self.lems :
282 deff = self.getlemuceseff(lem)
284 gram = self.lems[lem].gram
286 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
288 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
289 tabout = [[gram] + dtype[gram] for gram in dtype]
290 tabout.insert(0, [''] + etoiles)
293 def make_uceactsize(self, actives) :
294 res = self.getalluces()
297 deff = self.getlemuceseff(lem)
299 ucesize[uce] = ucesize.get(uce, 0) + 1
302 def make_uc(self, actives, lim1, lim2) :
303 uceactsize = self.make_uceactsize(actives)
309 for uce in [uce for uci in self.ucis for uce in uci.uces] :
310 if uce.para == lastpara :
312 last1 += uceactsize.get(uce.ident,0)
313 uc1[-1].append(uce.ident)
315 uc1.append([uce.ident])
318 last2 += uceactsize.get(uce.ident, 0)
319 uc2[-1].append(uce.ident)
321 uc2.append([uce.ident])
324 last1 = uceactsize.get(uce.ident, 0)
325 last2 = uceactsize.get(uce.ident, 0)
327 uc1.append([uce.ident])
328 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc listings.

    ``make_uc`` provides the two groupings; each one is written with
    ``write_ucmatrix`` and then listed in a ';'-separated text file headed
    ``uce;uc`` holding one ``uce;index-of-its-uc`` line per uce.
    Returns the sizes of the two groupings as a tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        # repr() is the portable spelling of the former backtick expression.
        lines = ['uce;uc'] + [';'.join([repr(uce), repr(i)]) for i, ucl in enumerate(uc) for uce in ucl]
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
344 def write_ucmatrix(self, uc, actives, fileout) :
345 log.info('write uc matrix %s' % fileout)
346 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
349 with open(fileout + '~', 'w+') as f :
350 for i, lem in enumerate(actives) :
351 for uce in self.getlemuces(lem):
352 if (uces_uc[uce], i) not in deja_la :
354 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
355 deja_la[(uces_uc[uce], i)] = 0
357 with open(fileout, 'w') as ffin :
358 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
361 os.remove(fileout + '~')
364 def export_corpus(self, outf) :
365 #outf = 'export_corpus.txt'
367 res = self.getalluces()
371 with open(outf,'w') as f :
373 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
374 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
375 elif self.iduces[uce[0]].uci != actuci :
376 actuci = self.iduces[uce[0]].uci
377 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
378 actpara = self.iduces[uce[0]].para
379 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
382 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
384 elif self.iduces[uce[0]].para != actpara :
385 actpara = self.iduces[uce[0]].para
387 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
389 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
390 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
392 with open(outfile + '~', 'w+') as f :
393 for i, lem in enumerate(actives) :
394 for uce in sorted(self.getlemuces(lem)) :
396 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
398 with open(outfile, 'w') as ffin :
399 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
402 os.remove(outfile + '~')
404 with open(listuce, 'w') as f :
405 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
407 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
408 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
410 with open(outfile + '~', 'w+') as f :
411 for i, lem in enumerate(actives) :
412 for uci in sorted(self.getlemucis(lem)) :
414 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
416 with open(outfile, 'w') as ffin :
417 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
420 os.remove(outfile + '~')
422 with open(listuci, 'w') as f :
423 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
425 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
426 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
428 duces = dict([[uce, i] for i, uce in enumerate(uces)])
429 with open(outfile + '~', 'w+') as f :
430 for i, lem in enumerate(actives) :
431 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
433 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
435 with open(outfile, 'w') as ffin :
436 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
439 os.remove(outfile + '~')
441 def make_table_with_classe(self, uces, list_act) :
442 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
443 uces = dict([[uce, i] for i, uce in enumerate(uces)])
444 for i, lem in enumerate(list_act) :
445 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
447 table_uce[uces[uce]][i] = 1
448 table_uce.insert(0, list_act)
451 def parse_active(self, gramact, gramsup = None) :
452 log.info('parse actives')
453 for lem in self.lems :
454 if self.lems[lem].gram in gramact :
455 self.lems[lem].act = 1
456 elif gramsup is not None :
457 if self.lems[lem].gram in gramsup :
458 self.lems[lem].act = 2
460 self.lems[lem].act = 0
462 self.lems[lem].act = 2
464 def make_actives_limit(self, limit, key = 1) :
465 if self.idformes is None :
467 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
469 def make_actives_nb(self, nbmax, key) :
470 log.info('make_actives_nb : %i - %i' % (nbmax,key))
471 if self.idformes is None :
473 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
474 self.activenb = len(allactives)
475 allactives = sorted(allactives, reverse = True)
476 if len(allactives) <= nbmax :
477 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
478 return [val[1] for val in allactives], allactives[-1][0]
480 effs = [val[0] for val in allactives]
481 if effs.count(effs[nbmax - 1]) > 1 :
482 lim = effs[nbmax - 1] + 1
486 stop = effs.index(lim)
492 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
493 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma/class contingency table to ``fileout``.

    For each lemma of ``actives`` the number of its uces found in each
    class of ``ucecl`` is counted; lemmas whose counts sum to less than 3
    are dropped. Lines are ';'-separated (lemma first) and the text is
    encoded with ``self.parametres['syscoding']`` before being written.
    """
    log.info('formes/classes')
    tab = []
    for lem in actives:
        # One lookup per lemma instead of one per (lemma, class) pair.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() is the portable spelling of the former backtick expression.
            tab.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
502 def make_etoiles(self) :
504 for uci in self.ucis :
505 etoiles.update(uci.etoiles[1:] + uci.paras)
508 def make_etoiles_dict(self) :
509 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
511 for etoile in etoiles :
512 et = etoile.split('_')
515 endet = '_'.join(et[1:])
516 if endet in det[et[0]] :
517 det[et[0]][endet] += 1
519 det[et[0]][endet] = 1
524 endet = '_'.join(et[1:])
525 det[et[0]] = {endet :1}
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile/class contingency table to ``fileout``.

    For each value returned by ``make_etoiles`` the number of its uces
    (from ``getucesfrometoile``) found in each class of ``ucecl`` is
    counted. Lines are ';'-separated (etoile first) and the text is encoded
    with ``self.parametres['syscoding']`` before being written.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    lines = []
    for etoile in etoiles:
        # One scan of the corpus per etoile instead of one per (etoile, class) pair.
        etuces = set(self.getucesfrometoile(etoile))
        # repr() is the portable spelling of the former backtick expression.
        lines.append(';'.join([etoile] + [repr(len(etuces.intersection(classe))) for classe in ucecl]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
536 def make_colored_corpus(self) :
538 for i, lc in enumerate(self.lc) :
541 for uce in self.lc0 :
543 color = ['black'] + colors[len(self.lc) - 1]
545 <meta http-equiv="content-Type" content="text/html; charset=%s" />
547 ''' % sys.getdefaultencoding()
548 res = self.getalluces()
553 if self.iduces[uce[0]].uci != actuci :
554 actuci = self.iduces[uce[0]].uci
555 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
556 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
558 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
559 return txt + '\n</body></html>'
561 def count_from_list(self, l, d) :
569 def count_from_list_cl(self, l, d, a, clnb) :
578 def find_segments(self, taille_segment, taille_limite) :
580 for uce in self.getalluces() :
582 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
583 l = [[d[val], val] for val in d if d[val] >= 3]
586 if len(l) > taille_limite :
587 l = l[-taille_limite:]
590 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
592 for uce in self.getconcorde(list_uce) :
594 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
595 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
598 if len(l) > taille_limite :
599 l = l[-taille_limite:]
602 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
604 for b, classe in enumerate(self.lc) :
605 for uce in self.getconcorde(classe) :
608 uce = [self.formes[forme].lem for forme in uce]
609 for taille_segment in range(lenmin,lenmax) :
610 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
611 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
612 with open(fileout, 'w') as f :
613 f.write('\n'.join([';'.join(line) for line in result]))
615 def make_ucecl_from_R(self, filein) :
616 with open(filein, 'rU') as f :
621 line = line.replace('\n', '').replace('"', '').split(';')
622 self.lc.append([int(line[0]) - 1, int(line[1])])
623 classesl = [val[1] for val in self.lc]
625 self.lc = sorted(self.lc, key=itemgetter(1))
626 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
627 self.lc0 = self.lc.pop(0)
630 def gethapaxbyet(self, etoiles) :
631 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
633 for uce in hapaxuces :
634 if uce in hucesdict :
638 etuces = [[] for et in etoiles]
639 for uci in self.ucis :
640 get = list(set(uci.etoiles).intersection(etoiles))
642 return '2 variables sur la meme ligne'
644 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
645 etuces = [set(val) for val in etuces]
646 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
648 def gethapaxuces(self) :
649 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
650 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
652 for i,uce in enumerate(hapaxuces) :
653 if uce in hucesdict :
654 hucesdict[uce][0] += 1
655 hucesdict[uce][1].append(hapax[i])
657 hucesdict[uce] = [1,[hapax[i]]]
659 for uce in hucesdict :
660 if hucesdict[uce][0] in huces :
661 huces[hucesdict[uce][0]].append(uce)
663 huces[hucesdict[uce][0]] = [uce]
664 huces = zip(huces, huces.values())
665 huces.sort(reverse=True)
669 for nb in huces[0:4] :
670 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
672 res = self.getconcorde([uce])
674 ucetxt = ' ' + row[1] + ' '
676 for hap in hucesdict[uce][1] :
677 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
678 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
679 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
680 txt += '<p>'+ucetxt+'</p>\n'
684 with open('/tmp/testhapxuce.html','w') as f :
689 def __init__(self, corpus) :
690 ucinb = corpus.getucinb()
691 ucisize = corpus.getucisize()
692 ucimean = float(sum(ucisize))/float(ucinb)
693 detoile = corpus.make_etoiles_dict()
697 def __init__(self, iduci, line, paraset = None) :
699 self.etoiles = line.split()
701 if paraset is not None :
702 self.paras = paraset.split()
707 def __init__(self, iduce, idpara, iduci) :
713 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
719 if freq is not None :
725 def __init__(self, parent, forme) :
726 self.formes = {forme.ident : forme.freq}
727 self.gram = forme.gram
728 self.freq = forme.freq
def add_forme(self, forme):
    """Record ``forme`` under its ident in ``self.formes`` and add its frequency to ``self.freq``."""
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
735 def decouperlist(chaine, longueur, longueurOptimale) :
737 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
738 Si on trouve un '$', c'est fini.
739 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
741 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
742 dsep = dict([[val[0],val[1]] for val in separateurs])
743 trouve = False # si on a trouvé un bon séparateur
744 iDecoupe = 0 # indice du caractere ou il faut decouper
746 longueur = min(longueur, len(chaine) - 1)
747 chaineTravail = chaine[:longueur + 1]
749 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
752 indice = chaineTravail.index(u'$')
754 iDecoupe = indice - 1
759 caractere = chaineTravail[nbCar]
760 distance = abs(longueurOptimale - nbCar) + 1
761 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
762 if caractere in dsep :
763 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
764 meilleur[0] = caractere
765 meilleur[1] = dsep[caractere]
770 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
772 meilleur[1] = dsep[' ']
779 #if meilleur[0] != ' ' :
780 # fin = chaine[iDecoupe + 1:]
781 # retour = chaineTravail[:iDecoupe]
783 fin = chaine[iDecoupe + 1:]
784 retour = chaineTravail[:iDecoupe + 1]
785 return len(retour) > 0, retour, fin
786 # si on a rien trouvé
787 return False, chaine, ''
789 def testetoile(line) :
790 return line.startswith(u'****')
793 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append a ``'$'`` token at the end."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
798 def prep_txtcharact(txt) :
803 Class for building a corpus
805 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
806 log.info('begin building corpus...')
807 self.lexique = lexique
808 self.expressions = expressions
810 self.corpus = Corpus(self, parametres_corpus)
813 self.lim = parametres_corpus.get('lim', 1000000)
814 self.encoding = parametres_corpus['encoding']
815 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
816 self.corpus.pathout.createdir(parametres_corpus['pathout'])
817 self.corpus.parametres['uuid'] = str(uuid4())
818 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
819 self.corpus.parametres['type'] = 'corpus'
820 if self.corpus.parametres['keep_ponct'] :
821 self.ponctuation_espace = [' ', '']
823 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
825 self.tolist = self.corpus.parametres.get('tolist', 0)
832 def prep_makeuce(self) :
833 method = self.corpus.parametres.get('ucemethod', 0)
835 self.decouper = decouperlist
836 self.prep_txt = prep_txtlist
837 self.ucesize = self.corpus.parametres.get('ucesize', 40)
839 self.decouper = decoupercharact
840 self.prep_txt = prep_txtcharact
841 self.ucesize = self.corpus.parametres.get('ucesize', 240)
842 log.info('method uce : %s' % method)
847 self.read_corpus(self.infile)
848 except Warning, args :
849 log.info('pas kool %s' % args)
853 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
854 self.time = time() - t1
856 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
857 log.info('time : %f' % (time() - t1))
860 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
861 self.cf = self.conn_f.cursor()
862 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
863 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
865 self.cf = self.conn_f.cursor()
866 self.cf.execute('PRAGMA temp_store=MEMORY;')
867 self.cf.execute('PRAGMA journal_mode=MEMORY;')
868 self.cf.execute('PRAGMA synchronous = OFF;')
869 self.cf.execute('begin')
870 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
871 self.c = self.conn.cursor()
872 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
874 self.c = self.conn.cursor()
875 self.c.execute('PRAGMA temp_store=MEMORY;')
876 self.c.execute('PRAGMA journal_mode=MEMORY;')
877 self.c.execute('PRAGMA synchronous = OFF;')
878 self.c.execute('begin')
881 #commit index and close db
884 self.cf.execute('CREATE INDEX iduces ON uces (id);')
885 self.cf.execute('CREATE INDEX ideff ON eff (id);')
889 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
890 self.ccorpus = self.conn_corpus.cursor()
891 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
892 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
893 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
894 self.conn_corpus.commit()
895 self.ccorpus = self.conn_corpus.cursor()
896 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
897 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
898 self.ccorpus.execute('PRAGMA synchronous = OFF;')
899 self.ccorpus.execute('begin')
901 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
902 self.conn_corpus.commit()
903 self.conn_corpus.close()
904 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Append the enabled cleaning steps to ``self.cleans``, in a fixed order.

    Every step except ``charact`` is enabled unless its option in
    ``self.corpus.parametres`` is present and false. ``charact`` must be
    present in the parameters; when it is true, ``self.rule`` is set from
    ``keep_caract`` (with a built-in default) before ``docharact`` is added.
    """
    options = self.corpus.parametres
    pipeline = self.cleans

    def register(option, method_name):
        # The method is looked up only when its option is switched on.
        if options.get(option, 1):
            pipeline.append(getattr(self, method_name))

    register('lower', 'dolower')
    register('firstclean', 'firstclean')
    if options['charact']:
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        pipeline.append(self.docharact)
    for option, method_name in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')):
        register(option, method_name)
921 def make_expression(self,txt) :
922 for expression in self.expressions:
923 if expression in txt :
924 txt = txt.replace(expression, self.expressions[expression][0])
927 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by the class ``[<self.rule>]`` with one space."""
    # Former hard-coded rule, kept for reference:
    # rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = u"[%s]+" % (self.rule,)
    return re.sub(pattern, ' ', txt)
def doapos(self, txt):
    """Return ``txt`` with every apostrophe replaced by a space."""
    apostrophe = u'\''
    return txt.replace(apostrophe, u' ')
def dotiret(self, txt):
    """Return ``txt`` with every hyphen replaced by a space."""
    hyphen = u'-'
    return txt.replace(hyphen, u' ')
def firstclean(self, txt):
    """Normalise a few characters and put spaces around punctuation marks.

    The typographic apostrophe becomes ``'`` and the ligature ``œ`` becomes
    ``oe``. Three dots and the ellipsis character both become `` £$£ ``;
    ``? . ! , ; :`` are each surrounded by spaces. The replacements are
    applied in the order listed below: ``...`` must be handled before the
    single ``.``.
    """
    # Every literal is unicode: the ellipsis replacement used to be a
    # non-unicode literal holding non-ASCII characters, which cannot be
    # mixed with unicode text on Python 2 under the default ASCII codec.
    replacements = (
        (u'’', u"'"),
        (u'œ', u'oe'),
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in replacements:
        txt = txt.replace(old, new)
    return txt
946 def make_cleans(self, txt) :
947 for clean in self.cleans :
951 def backup_uce(self) :
952 if self.corpus.idformesuces != {} :
953 log.info('backup %i' % len(self.corpus.idformesuces))
954 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
955 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
956 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
957 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
958 self.corpus.idformesuces = {}
961 def backup_corpus(self) :
962 log.info('start backup corpus')
964 for uci in self.corpus.ucis :
965 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
966 for uce in uci.uces :
967 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
968 for forme in self.corpus.formes :
969 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
970 log.info('%f' % (time() - t))
973 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
974 minutes, seconds = divmod(self.time, 60)
975 hours, minutes = divmod(minutes, 60)
976 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
977 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
978 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
979 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
980 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
981 hapaxnb = self.corpus.gethapaxnb()
982 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
983 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
984 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
987 class BuildFromAlceste(BuildCorpus) :
988 #def __init___(self, infile, parametres_corpus) :
989 # BuildCorpus.__init__(self, infile, parametres_corpus)
992 def read_corpus(self, infile) :
995 if self.corpus.parametres['ucimark'] == 0 :
996 self.testuci = testetoile
997 elif self.corpus.parametres['ucimark'] == 1 :
998 self.testuci = testint
1005 with codecs.open(infile, 'r', self.encoding) as f :
1008 line = line.rstrip('\n\r')
1009 if self.testuci(line) :
1012 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1014 self.corpus.ucis.append(Uci(iduci, line))
1017 if self.corpus.ucis[-1].uces == [] :
1018 log.info('linenb : %i' % linenb)
1019 raise Exception("EmptyText %i" % linenb)
1020 self.corpus.ucis.append(Uci(iduci, line))
1021 elif line.startswith(u'-*') :
1024 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1027 self.corpus.ucis[-1].paras.append(line.split()[0])
1029 raise Exception('paragrapheOT')
1030 elif line.strip() != '' and iduci != -1 :
1032 if txt != [] and iduci != -1 :
1033 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1036 raise Exception("EmptyText")
1037 if iduci != -1 and iduce != -1:
1040 log.info(_(u"No Texte in corpora. Are you sure of the formatting ?"))
1041 raise Exception('TextBeforeTextMark')
1042 except UnicodeDecodeError :
1043 raise Exception("CorpusEncoding")
1045 def treattxt(self, txt, iduce, idpara, iduci) :
1046 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1047 txt = 'laphrasepoursplitter'.join(txt)
1048 txt = self.make_cleans(txt)
1049 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1050 ucetxt = txt.split('laphrasepoursplitter')
1053 txt = self.make_cleans(txt)
1054 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1055 if self.corpus.ucis[-1].paras == [] :
1059 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1060 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1061 if not self.tolist :
1067 self.corpus.add_word(word)
1068 if self.dlg is not None :
1069 if self.limitshow > self.count :
1070 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
1074 self.limitshow = self.last / 100000
1075 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1076 if self.last > self.lim :
1079 return iduce, idpara
1081 def make_uces(self, txt, douce = True, keep_ponct = False) :
1082 txt = ' '.join(txt.split())
1085 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1093 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1096 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1104 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1106 #print 'RESTEE UUCEEEEEEEEEEEEE', uce
1110 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1112 #decouper (list_sep)
1113 #make_uces (decouper)
1114 #treat_txt (make_uces)
1118 def __init__(self, parent, dlg = None) :
1119 self.parent = parent
1121 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1122 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1123 dial = CorpusPref(parent, parametres)
1124 dial.CenterOnParent()
1125 dial.txtpath.SetLabel(parent.filename)
1126 #dial.repout_choices.SetValue(parametres['pathout'])
1127 self.res = dial.ShowModal()
1128 if self.res == 5100 :
1129 parametres = dial.doparametres()
1130 parametres['originalpath'] = parent.filename
1131 PathOut().createdir(parametres['pathout'])
1132 ReadLexique(self.parent, lang = parametres['lang'])
1133 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1134 self.parametres = parametres
def doanalyse(self):
    """Build the corpus with ``BuildFromAlceste`` and return its ``corpus`` attribute.

    The builder receives the parent's filename, the stored parameters, the
    parent's lexique and expressions, and the dialog held in ``self.dlg``.
    """
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
1141 if __name__ == '__main__' :
1143 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1144 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)