1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the list of uce ids recorded for one form in the formes database.

    wordid is either the identifier of the form or the form itself as a
    string; a string is first resolved to its identifier through self.formes.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uceids = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            # the cell may already be a single integer
            uceids.append(cell)
        else :
            # otherwise it is a whitespace-separated list of integers
            uceids.extend([int(val) for val in cell.split()])
    return uceids
def getlemuces(self, lem) :
    """Return the distinct uce ids in which any form of the lemma `lem` occurs.

    The form ids are read from self.lems[lem].formes and looked up in the
    `uces` table of the formes database. Duplicates are removed through a
    set, so the order of the returned list is not meaningful.
    """
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    flat = []
    for row in self.cformes.execute(query) :
        cell = row[0]
        if isinstance(cell, int) :
            flat.append(cell)
        else :
            flat.extend([int(val) for val in cell.split()])
    return list(set(flat))
def getlemucis(self, lem) :
    """Return the distinct `uci` values of the uces that contain the lemma `lem`."""
    ucis = set()
    for uceid in self.getlemuces(lem) :
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
140 def getlemuceseff(self, lem) :
141 formesid = ', '.join([`val` for val in self.lems[lem].formes])
142 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
143 res = self.cformes.execute(query)
144 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
146 res = self.cformes.execute(query)
147 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
149 for i, uce in enumerate(uces) :
150 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem) :
    """Return the `freq` attribute of the lemma `lem` stored in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the entry of self.idformes for `formeid`.

    The index is built on first use by calling self.make_idformes().
    """
    if self.idformes is None :
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self) :
    """Return the sum of the `freq` attributes of every entry of self.formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return self.gettotocc() divided by self.getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
170 return self.ucis[-1].uces[-1].ident + 1
173 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return, for each uci of self.ucis, the summed sizes of its uces.

    The per-uce sizes come from self.getucesize(); each uci takes the slice
    running from the ident of its first uce to the ident of its last uce,
    both included.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:last + 1]))
    return sizes
def getucesize(self) :
    """Return one value per row of self.getalluces(): the number of whitespace-separated tokens in the row's second column."""
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
183 # def getlemseff(self) :
184 # if self.idformes is None :
185 # self.make_idformes()
186 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
188 # def getlemsefftype(self) :
189 # if self.idformes is None :
190 # self.make_idformes()
191 # if self.lems is None :
193 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces) :
    """Run a SELECT on the `uces` table for the given uce ids and return what self.cuces.execute returns.

    The ids are inserted into the query through repr(), joined with ', '.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word) :
    """Return self.getconcorde() applied to the uce ids given by self.getworduces(word)."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem) :
    """Return self.getconcorde() applied to the uce ids given by self.getlemuces(lem)."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self) :
    """Run 'SELECT * FROM uces' on the self.cuces cursor and return the result of execute()."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile) :
    """Return the idents of all uces belonging to the ucis whose `etoiles` contain `etoile`."""
    idents = []
    for uci in self.ucis :
        for uce in uci.uces :
            if etoile in uci.etoiles :
                idents.append(uce.ident)
    return idents
def getucefromid(self, uceid) :
    """Return the entry of self.iduces for `uceid`.

    The index is built on first use by calling self.make_iduces().
    """
    if self.iduces is None :
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self) :
    """Return the number of entries of self.formes whose `freq` equals 1."""
    count = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            count += 1
    return count
def getactivesnb(self, key) :
    """Return the number of entries of self.lems whose `act` attribute equals `key`."""
    count = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            count += 1
    return count
219 # def make_lems(self, lem = True) :
220 # log.info('make lems')
222 # for forme in self.formes :
223 # if self.formes[forme].lem in self.lems :
224 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
225 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
227 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the `etoiles` of the uci that owns the uce `uceid`.

    The uce ident -> uci ident mapping (self.uceuci) is built on first use.
    The uci ident is then used as a position in self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
233 def make_lems(self, lem = True) :
234 log.info('make lems')
237 for forme in self.formes :
238 if self.formes[forme].lem in self.lems :
239 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
240 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
242 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
244 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes: a dict mapping each form's `ident` to the form object held in self.formes."""
    index = {}
    for forme in self.formes :
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self) :
    """Build self.iduces (uce ident -> uce object) from self.ucis, unless it is already set."""
    if self.iduces is not None :
        return
    index = {}
    for uci in self.ucis :
        for uce in uci.uces :
            index[uce.ident] = uce
    self.iduces = index
253 def make_lexitable(self, mineff, etoiles) :
254 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
255 etuces = [[] for et in etoiles]
256 for uci in self.ucis :
257 get = list(set(uci.etoiles).intersection(etoiles))
259 return '2 variables sur la meme ligne'
261 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
262 etuces = [set(val) for val in etuces]
265 deff = self.getlemuceseff(lem)
267 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
268 tab.insert(0, [''] + etoiles)
271 def make_efftype_from_etoiles(self, etoiles) :
273 etuces = [[] for et in etoiles]
274 for uci in self.ucis :
275 get = list(set(uci.etoiles).intersection(etoiles))
277 return '2 variables sur la meme ligne'
279 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
280 etuces = [set(val) for val in etuces]
281 for lem in self.lems :
282 deff = self.getlemuceseff(lem)
284 gram = self.lems[lem].gram
286 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
288 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
289 tabout = [[gram] + dtype[gram] for gram in dtype]
290 tabout.insert(0, [''] + etoiles)
293 def make_uceactsize(self, actives) :
294 res = self.getalluces()
297 deff = self.getlemuceseff(lem)
299 ucesize[uce] = ucesize.get(uce, 0) + 1
302 def make_uc(self, actives, lim1, lim2) :
303 uceactsize = self.make_uceactsize(actives)
309 for uce in [uce for uci in self.ucis for uce in uci.uces] :
310 if uce.para == lastpara :
312 last1 += uceactsize.get(uce.ident,0)
313 uc1[-1].append(uce.ident)
315 uc1.append([uce.ident])
318 last2 += uceactsize.get(uce.ident, 0)
319 uc2[-1].append(uce.ident)
321 uc2.append([uce.ident])
324 last1 = uceactsize.get(uce.ident, 0)
325 last2 = uceactsize.get(uce.ident, 0)
327 uc1.append([uce.ident])
328 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build the two uc groupings, write their matrices and their uce/uc tables.

    self.make_uc(actives, sizeuc1, sizeuc2) provides two lists of uce-id lists.
    Each is passed to self.write_ucmatrix (to uc1out / uc2out), then a
    ';'-separated table with the header 'uce;uc' and one 'uce;uc-index' line
    per uce is written to listuce1out / listuce2out.

    Returns the tuple (number of uc in the first grouping, number in the second).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    tables = []
    for grouping in (uc1, uc2) :
        rows = [';'.join(['uce', 'uc'])]
        for i, ucl in enumerate(grouping) :
            for uce in ucl :
                rows.append(';'.join([repr(uce), repr(i)]))
        tables.append('\n'.join(rows))
    for path, content in zip((listuce1out, listuce2out), tables) :
        with open(path, 'w') as f :
            f.write(content)
    return len(uc1), len(uc2)
344 def write_ucmatrix(self, uc, actives, fileout) :
345 log.info('write uc matrix %s' % fileout)
346 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
349 with open(fileout + '~', 'w+') as f :
350 for i, lem in enumerate(actives) :
351 for uce in self.getlemuces(lem):
352 if (uces_uc[uce], i) not in deja_la :
354 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
355 deja_la[(uces_uc[uce], i)] = 0
357 with open(fileout, 'w') as ffin :
358 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
361 os.remove(fileout + '~')
364 def export_corpus(self, outf) :
365 #outf = 'export_corpus.txt'
367 res = self.getalluces()
371 with open(outf,'w') as f :
373 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
374 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
375 elif self.iduces[uce[0]].uci != actuci :
376 actuci = self.iduces[uce[0]].uci
377 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
378 actpara = self.iduces[uce[0]].para
379 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
382 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
384 elif self.iduces[uce[0]].para != actpara :
385 actpara = self.iduces[uce[0]].para
387 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
389 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
390 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
392 with open(outfile + '~', 'w+') as f :
393 for i, lem in enumerate(actives) :
394 for uce in sorted(self.getlemuces(lem)) :
396 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
398 with open(outfile, 'w') as ffin :
399 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
402 os.remove(outfile + '~')
404 with open(listuce, 'w') as f :
405 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
407 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
408 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
410 with open(outfile + '~', 'w+') as f :
411 for i, lem in enumerate(actives) :
412 for uci in sorted(self.getlemucis(lem)) :
414 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
416 with open(outfile, 'w') as ffin :
417 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
420 os.remove(outfile + '~')
422 with open(listuci, 'w') as f :
423 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
425 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
426 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
428 duces = dict([[uce, i] for i, uce in enumerate(uces)])
429 with open(outfile + '~', 'w+') as f :
430 for i, lem in enumerate(actives) :
431 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
433 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
435 with open(outfile, 'w') as ffin :
436 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
439 os.remove(outfile + '~')
441 def make_table_with_classe(self, uces, list_act) :
442 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
443 uces = dict([[uce, i] for i, uce in enumerate(uces)])
444 for i, lem in enumerate(list_act) :
445 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
447 table_uce[uces[uce]][i] = 1
448 table_uce.insert(0, list_act)
451 def parse_active(self, gramact, gramsup = None) :
452 log.info('parse actives')
453 for lem in self.lems :
454 if self.lems[lem].gram in gramact :
455 self.lems[lem].act = 1
456 elif gramsup is not None :
457 if self.lems[lem].gram in gramsup :
458 self.lems[lem].act = 2
460 self.lems[lem].act = 0
462 self.lems[lem].act = 2
464 def make_actives_limit(self, limit, key = 1) :
465 if self.idformes is None :
467 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
469 def make_actives_nb(self, nbmax, key) :
470 log.info('make_actives_nb : %i - %i' % (nbmax,key))
471 if self.idformes is None :
473 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
474 self.activenb = len(allactives)
475 allactives = sorted(allactives, reverse = True)
476 if len(allactives) <= nbmax :
477 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
478 return [val[1] for val in allactives], allactives[-1][0]
480 effs = [val[0] for val in allactives]
481 if effs.count(effs[nbmax - 1]) > 1 :
482 lim = effs[nbmax - 1] + 1
486 stop = effs.index(lim)
492 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
493 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma/class contingency table to `fileout`.

    actives : iterable of lemmas
    ucecl   : list of classes, each a collection of uce ids
    fileout : path of the output file

    One ';'-separated line is written per lemma: the lemma followed by, for
    each class, the number of uces of that class containing the lemma.
    Lemmas whose counts sum to less than 3 are left out. The text is encoded
    with self.parametres['syscoding'] before being written.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives :
        # query the uces of the lemma once, not once per class
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
502 def make_etoiles(self) :
504 for uci in self.ucis :
505 etoiles.update(uci.etoiles[1:] + uci.paras)
508 def make_etoiles_dict(self) :
509 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
511 for etoile in etoiles :
512 et = etoile.split('_')
515 if et[1] in det[et[0]] :
516 det[et[0]][et[1]] += 1
518 det[et[0]][et[1]] = 1
523 det[et[0]] = {et[1] :1}
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile/class contingency table to `fileout`.

    ucecl   : list of classes, each a collection of uce ids
    fileout : path of the output file

    One ';'-separated line is written per item returned by
    self.make_etoiles(): the etoile followed by, for each class, the number
    of uces of that class found by self.getucesfrometoile(etoile). The text
    is encoded with self.parametres['syscoding'] before being written.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    lines = []
    for etoile in etoiles :
        # collect the uces of the etoile once, not once per class
        etuces = set(self.getucesfrometoile(etoile))
        counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
        lines.append(';'.join([etoile] + counts))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
534 def make_colored_corpus(self) :
536 for i, lc in enumerate(self.lc) :
539 for uce in self.lc0 :
541 color = ['black'] + colors[len(self.lc) - 1]
543 <meta http-equiv="content-Type" content="text/html; charset=%s" />
545 ''' % sys.getdefaultencoding()
546 res = self.getalluces()
551 if self.iduces[uce[0]].uci != actuci :
552 actuci = self.iduces[uce[0]].uci
553 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
554 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
556 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
557 return txt + '\n</body></html>'
559 def count_from_list(self, l, d) :
567 def count_from_list_cl(self, l, d, a, clnb) :
576 def find_segments(self, taille_segment, taille_limite) :
578 for uce in self.getalluces() :
580 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
581 l = [[d[val], val] for val in d if d[val] >= 3]
584 if len(l) > taille_limite :
585 l = l[-taille_limite:]
588 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
590 for uce in self.getconcorde(list_uce) :
592 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
593 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
596 if len(l) > taille_limite :
597 l = l[-taille_limite:]
600 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
602 for b, classe in enumerate(self.lc) :
603 for uce in self.getconcorde(classe) :
606 uce = [self.formes[forme].lem for forme in uce]
607 for taille_segment in range(lenmin,lenmax) :
608 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
609 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
610 with open(fileout, 'w') as f :
611 f.write('\n'.join([';'.join(line) for line in result]))
613 def make_ucecl_from_R(self, filein) :
614 with open(filein, 'rU') as f :
619 line = line.replace('\n', '').replace('"', '').split(';')
620 self.lc.append([int(line[0]) - 1, int(line[1])])
621 classesl = [val[1] for val in self.lc]
623 self.lc = sorted(self.lc, key=itemgetter(1))
624 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
625 self.lc0 = self.lc.pop(0)
628 def gethapaxbyet(self, etoiles) :
629 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
631 for uce in hapaxuces :
632 if uce in hucesdict :
636 etuces = [[] for et in etoiles]
637 for uci in self.ucis :
638 get = list(set(uci.etoiles).intersection(etoiles))
640 return '2 variables sur la meme ligne'
642 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
643 etuces = [set(val) for val in etuces]
644 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
646 def gethapaxuces(self) :
647 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
648 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
650 for i,uce in enumerate(hapaxuces) :
651 if uce in hucesdict :
652 hucesdict[uce][0] += 1
653 hucesdict[uce][1].append(hapax[i])
655 hucesdict[uce] = [1,[hapax[i]]]
657 for uce in hucesdict :
658 if hucesdict[uce][0] in huces :
659 huces[hucesdict[uce][0]].append(uce)
661 huces[hucesdict[uce][0]] = [uce]
662 huces = zip(huces, huces.values())
663 huces.sort(reverse=True)
667 for nb in huces[0:4] :
668 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
670 res = self.getconcorde([uce])
672 ucetxt = ' ' + row[1] + ' '
674 for hap in hucesdict[uce][1] :
675 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
676 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
677 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
678 txt += '<p>'+ucetxt+'</p>\n'
682 with open('/tmp/testhapxuce.html','w') as f :
687 def __init__(self, corpus) :
688 ucinb = corpus.getucinb()
689 ucisize = corpus.getucisize()
690 ucimean = float(sum(ucisize))/float(ucinb)
691 detoile = corpus.make_etoiles_dict()
695 def __init__(self, iduci, line, paraset = None) :
697 self.etoiles = line.split()
699 if paraset is not None :
700 self.paras = paraset.split()
705 def __init__(self, iduce, idpara, iduci) :
711 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
717 if freq is not None :
723 def __init__(self, parent, forme) :
724 self.formes = {forme.ident : forme.freq}
725 self.gram = forme.gram
726 self.freq = forme.freq
def add_forme(self, forme) :
    """Register `forme` under its ident in self.formes and add its frequency to self.freq."""
    self.formes.update({forme.ident : forme.freq})
    self.freq = self.freq + forme.freq
733 def decouperlist(chaine, longueur, longueurOptimale) :
735 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
736 Si on trouve un '$', c'est fini.
737 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
739 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
740 dsep = dict([[val[0],val[1]] for val in separateurs])
741 trouve = False # si on a trouvé un bon séparateur
742 iDecoupe = 0 # indice du caractere ou il faut decouper
744 longueur = min(longueur, len(chaine) - 1)
745 chaineTravail = chaine[:longueur + 1]
747 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
750 indice = chaineTravail.index(u'$')
752 iDecoupe = indice - 1
757 caractere = chaineTravail[nbCar]
758 distance = abs(longueurOptimale - nbCar) + 1
759 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
760 if caractere in dsep :
761 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
762 meilleur[0] = caractere
763 meilleur[1] = dsep[caractere]
768 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
770 meilleur[1] = dsep[' ']
777 #if meilleur[0] != ' ' :
778 # fin = chaine[iDecoupe + 1:]
779 # retour = chaineTravail[:iDecoupe]
781 fin = chaine[iDecoupe + 1:]
782 retour = chaineTravail[:iDecoupe + 1]
783 return len(retour) > 0, retour, fin
784 # si on a rien trouvé
785 return False, chaine, ''
787 def testetoile(line) :
788 return line.startswith(u'****')
791 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split `txt` on whitespace and return the tokens followed by a final u'$' element."""
    tokens = txt.split()
    return tokens + [u'$']
796 def prep_txtcharact(txt) :
801 Class for building a corpus
803 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
804 log.info('begin building corpus...')
805 self.lexique = lexique
806 self.expressions = expressions
808 self.corpus = Corpus(self, parametres_corpus)
811 self.lim = parametres_corpus.get('lim', 1000000)
812 self.encoding = parametres_corpus['encoding']
813 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
814 self.corpus.pathout.createdir(parametres_corpus['pathout'])
815 self.corpus.parametres['uuid'] = str(uuid4())
816 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
817 self.corpus.parametres['type'] = 'corpus'
818 if self.corpus.parametres['keep_ponct'] :
819 self.ponctuation_espace = [' ', '']
821 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
823 self.tolist = self.corpus.parametres.get('tolist', 0)
830 def prep_makeuce(self) :
831 method = self.corpus.parametres.get('ucemethod', 0)
833 self.decouper = decouperlist
834 self.prep_txt = prep_txtlist
835 self.ucesize = self.corpus.parametres.get('ucesize', 40)
837 self.decouper = decoupercharact
838 self.prep_txt = prep_txtcharact
839 self.ucesize = self.corpus.parametres.get('ucesize', 240)
840 log.info('method uce : %s' % method)
845 self.read_corpus(self.infile)
846 except Warning, args :
847 log.info('pas kool %s' % args)
851 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
852 self.time = time() - t1
854 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
855 log.info('time : %f' % (time() - t1))
858 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
859 self.cf = self.conn_f.cursor()
860 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
861 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
863 self.cf = self.conn_f.cursor()
864 self.cf.execute('PRAGMA temp_store=MEMORY;')
865 self.cf.execute('PRAGMA journal_mode=MEMORY;')
866 self.cf.execute('PRAGMA synchronous = OFF;')
867 self.cf.execute('begin')
868 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
869 self.c = self.conn.cursor()
870 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
872 self.c = self.conn.cursor()
873 self.c.execute('PRAGMA temp_store=MEMORY;')
874 self.c.execute('PRAGMA journal_mode=MEMORY;')
875 self.c.execute('PRAGMA synchronous = OFF;')
876 self.c.execute('begin')
879 #commit index and close db
882 self.cf.execute('CREATE INDEX iduces ON uces (id);')
883 self.cf.execute('CREATE INDEX ideff ON eff (id);')
887 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
888 self.ccorpus = self.conn_corpus.cursor()
889 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
890 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
891 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
892 self.conn_corpus.commit()
893 self.ccorpus = self.conn_corpus.cursor()
894 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
895 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
896 self.ccorpus.execute('PRAGMA synchronous = OFF;')
897 self.ccorpus.execute('begin')
899 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
900 self.conn_corpus.commit()
901 self.conn_corpus.close()
902 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
904 def buildcleans(self) :
905 if self.corpus.parametres.get('lower', 1) :
906 self.cleans.append(self.dolower)
907 if self.corpus.parametres.get('firstclean', 1) :
908 self.cleans.append(self.firstclean)
909 self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
910 self.cleans.append(self.docharact)
911 if self.corpus.parametres.get('expressions', 1) :
912 self.cleans.append(self.make_expression)
913 if self.corpus.parametres.get('apos', 1) :
914 self.cleans.append(self.doapos)
915 if self.corpus.parametres.get('tiret', 1):
916 self.cleans.append(self.dotiret)
918 def make_expression(self,txt) :
919 for expression in self.expressions:
920 if expression in txt :
921 txt = txt.replace(expression, self.expressions[expression][0])
924 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace each run of characters matched by the class [self.rule] with one space.

    self.rule is the inside of a regular-expression character class; the
    default assigned in buildcleans starts with '^', which makes it a negated
    class.
    """
    pattern = u"[" + self.rule + "]+"
    cleaned = re.sub(pattern, ' ', txt)
    return cleaned
def doapos(self, txt) :
    """Return `txt` with every apostrophe replaced by a space."""
    apostrophe = u'\''
    return txt.replace(apostrophe, u' ')
def dotiret(self, txt) :
    """Return `txt` with every hyphen replaced by a space."""
    hyphen = u'-'
    return txt.replace(hyphen, u' ')
def firstclean(self, txt) :
    """Normalise typographic characters and isolate punctuation with spaces.

    The typographic apostrophe becomes "'" and the 'oe' ligature becomes
    'oe'. Both '...' and the single-character ellipsis become ' £$£ ', and
    each of ? . ! , ; : is surrounded by spaces.

    All literals are unicode, so the result does not depend on an implicit
    byte-string decode when `txt` is unicode.
    """
    txt = txt.replace(u'’', u"'")
    txt = txt.replace(u'œ', u'oe')
    # order matters: '...' must be consumed before the single '.'
    replacements = [
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    ]
    for old, new in replacements :
        txt = txt.replace(old, new)
    return txt
943 def make_cleans(self, txt) :
944 for clean in self.cleans :
948 def backup_uce(self) :
949 if self.corpus.idformesuces != {} :
950 log.info('backup %i' % len(self.corpus.idformesuces))
951 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
952 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
953 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
954 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
955 self.corpus.idformesuces = {}
958 def backup_corpus(self) :
959 log.info('start backup corpus')
961 for uci in self.corpus.ucis :
962 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
963 for uce in uci.uces :
964 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
965 for forme in self.corpus.formes :
966 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
967 log.info('%f' % (time() - t))
970 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
971 minutes, seconds = divmod(self.time, 60)
972 hours, minutes = divmod(minutes, 60)
973 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
974 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
975 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
976 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
977 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
978 hapaxnb = self.corpus.gethapaxnb()
979 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
980 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
981 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
984 class BuildFromAlceste(BuildCorpus) :
985 #def __init___(self, infile, parametres_corpus) :
986 # BuildCorpus.__init__(self, infile, parametres_corpus)
989 def read_corpus(self, infile) :
992 if self.corpus.parametres['ucimark'] == 0 :
993 self.testuci = testetoile
994 elif self.corpus.parametres['ucimark'] == 1 :
995 self.testuci = testint
1002 with codecs.open(infile, 'r', self.encoding) as f :
1005 line = line.rstrip('\n\r')
1006 if self.testuci(line) :
1009 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1011 self.corpus.ucis.append(Uci(iduci, line))
1014 if self.corpus.ucis[-1].uces == [] :
1015 log.info('linenb : %i' % linenb)
1016 raise Exception("EmptyText %i" % linenb)
1017 self.corpus.ucis.append(Uci(iduci, line))
1018 elif line.startswith(u'-*') :
1021 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1024 self.corpus.ucis[-1].paras.append(line.split()[0])
1026 raise Exception('paragrapheOT')
1027 elif line.strip() != '' and iduci != -1 :
1029 if txt != [] and iduci != -1 :
1030 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1033 raise Exception("EmptyText")
1034 if iduci != -1 and iduce != -1:
1037 log.info(_(u"No Texte in corpora. Are you sure of the formatting ?"))
1038 raise Exception('TextBeforeTextMark')
1039 except UnicodeDecodeError :
1040 raise Exception("CorpusEncoding")
1042 def treattxt(self, txt, iduce, idpara, iduci) :
1043 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1044 txt = 'laphrasepoursplitter'.join(txt)
1045 txt = self.make_cleans(txt)
1046 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1047 ucetxt = txt.split('laphrasepoursplitter')
1050 txt = self.make_cleans(txt)
1051 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1052 if self.corpus.ucis[-1].paras == [] :
1056 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1057 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1058 if not self.tolist :
1064 self.corpus.add_word(word)
1065 if self.dlg is not None :
1066 if self.limitshow > self.count :
1067 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
1071 self.limitshow = self.last / 100000
1072 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1073 if self.last > self.lim :
1076 return iduce, idpara
1078 def make_uces(self, txt, douce = True, keep_ponct = False) :
1079 txt = ' '.join(txt.split())
1082 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1090 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1093 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1101 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1103 #print 'RESTEE UUCEEEEEEEEEEEEE', uce
1107 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1109 #decouper (list_sep)
1110 #make_uces (decouper)
1111 #treat_txt (make_uces)
1115 def __init__(self, parent, dlg = None) :
1116 self.parent = parent
1118 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1119 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1120 dial = CorpusPref(parent, parametres)
1121 dial.CenterOnParent()
1122 dial.txtpath.SetLabel(parent.filename)
1123 #dial.repout_choices.SetValue(parametres['pathout'])
1124 self.res = dial.ShowModal()
1125 if self.res == 5100 :
1126 parametres = dial.doparametres()
1127 parametres['originalpath'] = parent.filename
1128 PathOut().createdir(parametres['pathout'])
1129 ReadLexique(self.parent, lang = parametres['lang'])
1130 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1131 self.parametres = parametres
def doanalyse(self) :
    """Build a corpus with BuildFromAlceste and return its `corpus` attribute.

    The builder receives self.parent.filename, self.parametres, the parent's
    lexique and expressions, and self.dlg as `dlg`.
    """
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
1138 if __name__ == '__main__' :
1140 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1141 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)