1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded for one form in the formes database.

    wordid -- either the form itself (a string, looked up in self.formes to
              get its numeric ident) or directly the numeric ident.

    Each row of the `uces` table holds either a single integer or a
    whitespace-separated string of integers; both shapes are flattened into
    one list of ints, in row order.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # repr() is the function form of the old backtick syntax (same result).
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend([int(val) for val in cell.split()])
    return uces
130 def getformeuceseff(self, formeid) :
131 if isinstance(formeid, basestring) :
132 formeid = self.formes[formeid].ident
133 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
134 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
135 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
136 res = self.cformes.execute(query)
137 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
139 for i, uce in enumerate(uces) :
140 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for all the forms of a lemma.

    lem -- key into self.lems; the idents of the forms are the keys of
           self.lems[lem].formes.

    The result is built through a set, so duplicates are removed and the
    order of the returned list is not meaningful.
    """
    # repr() replaces the backtick syntax; idents are inlined in the IN (...)
    # clause exactly as before.
    formesid = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    found = []
    for row in self.cformes.execute(query):
        cell = row[0]
        # A cell is either one integer or a space-separated list of integers.
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return list(set(found))
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces in which a lemma occurs.

    The order of the returned list is not meaningful (it comes from a set).
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
153 def getlemuceseff(self, lem, luces = None) :
154 formesid = ', '.join([`val` for val in self.lems[lem].formes])
155 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
156 res = self.cformes.execute(query)
157 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
158 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
159 res = self.cformes.execute(query)
160 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
162 for i, uce in enumerate(uces) :
163 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the `freq` attribute stored on the lemma `lem`."""
    lemme = self.lems[lem]
    return lemme.freq
def getforme(self, formeid):
    """Return the entry of self.idformes stored under `formeid`.

    The ident -> form index is built on first use through make_idformes().
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` of every form in self.formes."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
183 return self.ucis[-1].uces[-1].ident + 1
186 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    Sizes come from getucesize(); the slice for one uci runs from the ident
    of its first uce to the ident of its last uce, inclusive.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:last + 1]))
    return sizes
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    Rows come from getalluces(); the text is read from column 1 of each row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
196 # def getlemseff(self) :
197 # if self.idformes is None :
198 # self.make_idformes()
199 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
201 # def getlemsefftype(self) :
202 # if self.idformes is None :
203 # self.make_idformes()
204 # if self.lems is None :
206 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Run a select on the `uces` table for the given ids.

    uces -- iterable of uce ids; their repr() is inlined in the IN (...)
            clause of the query.

    Returns whatever self.cuces.execute() returns for that query.
    """
    # repr() replaces the backtick syntax and yields the same text.
    idlist = ', '.join([repr(uceid) for uceid in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces of the form `word`."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces of the lemma `lem`."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Execute 'SELECT * FROM uces' on self.cuces and return its result."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the idents of the uces whose uci lists `etoile` in its etoiles."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
def getucefromid(self, uceid):
    """Return the entry of self.iduces stored under `uceid`.

    The ident -> uce index is built on first use through make_iduces().
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of forms in self.formes whose `freq` equals 1."""
    count = 0
    for word in self.formes.values():
        if word.freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of lemmas in self.lems whose `act` equals `key`."""
    count = 0
    for lemme in self.lems.values():
        if lemme.act == key:
            count += 1
    return count
232 # def make_lems(self, lem = True) :
233 # log.info('make lems')
235 # for forme in self.formes :
236 # if self.formes[forme].lem in self.lems :
237 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
238 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
240 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the etoiles of the uci that contains the uce `uceid`.

    The uce ident -> uci ident map (self.uceuci) is built on first use; the
    uci ident is then used as an index into self.ucis.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
246 def make_lems(self, lem = True) :
247 log.info('make lems')
250 for forme in self.formes :
251 if self.formes[forme].lem in self.lems :
252 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
253 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
255 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
257 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build self.idformes, mapping each form's ident to the form object."""
    index = {}
    for word in self.formes.values():
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build self.iduces (uce ident -> uce) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
266 def make_lexitable(self, mineff, etoiles) :
267 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
268 etuces = [[] for et in etoiles]
269 for uci in self.ucis :
270 get = list(set(uci.etoiles).intersection(etoiles))
272 return '2 variables sur la meme ligne'
274 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
275 etuces = [set(val) for val in etuces]
278 deff = self.getlemuceseff(lem)
280 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
281 tab.insert(0, [''] + etoiles)
284 def make_efftype_from_etoiles(self, etoiles) :
286 etuces = [[] for et in etoiles]
287 for uci in self.ucis :
288 get = list(set(uci.etoiles).intersection(etoiles))
290 return '2 variables sur la meme ligne'
292 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
293 etuces = [set(val) for val in etuces]
294 for lem in self.lems :
295 deff = self.getlemuceseff(lem)
297 gram = self.lems[lem].gram
299 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
301 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
302 tabout = [[gram] + dtype[gram] for gram in dtype]
303 tabout.insert(0, [''] + etoiles)
306 def make_uceactsize(self, actives) :
307 res = self.getalluces()
310 deff = self.getlemuceseff(lem)
312 ucesize[uce] = ucesize.get(uce, 0) + 1
315 def make_uc(self, actives, lim1, lim2) :
316 uceactsize = self.make_uceactsize(actives)
322 for uce in [uce for uci in self.ucis for uce in uci.uces] :
323 if uce.para == lastpara :
325 last1 += uceactsize.get(uce.ident,0)
326 uc1[-1].append(uce.ident)
328 uc1.append([uce.ident])
331 last2 += uceactsize.get(uce.ident, 0)
332 uc2[-1].append(uce.ident)
334 uc2.append([uce.ident])
337 last1 = uceactsize.get(uce.ident, 0)
338 last2 = uceactsize.get(uce.ident, 0)
340 uc1.append([uce.ident])
341 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc partitions, write their matrices and uce/uc lists.

    actives          -- passed to make_uc() and write_ucmatrix()
    sizeuc1, sizeuc2 -- the two limits passed to make_uc()
    uc1out, uc2out   -- output paths handed to write_ucmatrix()
    listuce1out, listuce2out -- output paths of the 'uce;uc' listings

    Each listing starts with the header line 'uce;uc', followed by one
    'uce;uc-index' line per uce. Returns (len(uc1), len(uc2)).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for ucs, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        for ucid, ucl in enumerate(ucs):
            for uce in ucl:
                # repr() replaces the backtick syntax and yields the same text.
                lines.append(';'.join([repr(uce), repr(ucid)]))
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
357 def write_ucmatrix(self, uc, actives, fileout) :
358 log.info('write uc matrix %s' % fileout)
359 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
362 with open(fileout + '~', 'w+') as f :
363 for i, lem in enumerate(actives) :
364 for uce in self.getlemuces(lem):
365 if (uces_uc[uce], i) not in deja_la :
367 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
368 deja_la[(uces_uc[uce], i)] = 0
370 with open(fileout, 'w') as ffin :
371 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
374 os.remove(fileout + '~')
377 def export_corpus(self, outf) :
378 #outf = 'export_corpus.txt'
380 res = self.getalluces()
384 with open(outf,'w') as f :
386 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
387 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
388 elif self.iduces[uce[0]].uci != actuci :
389 actuci = self.iduces[uce[0]].uci
390 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
391 actpara = self.iduces[uce[0]].para
392 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
395 actpara = self.iduces[uce[0]].para
396 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
397 elif self.iduces[uce[0]].para != actpara :
398 actpara = self.iduces[uce[0]].para
400 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
402 def export_corpus_classes(self, outf, alc = True, lem = False) :
404 for i, lc in enumerate(self.lc) :
407 for uce in self.lc0 :
409 res = self.getalluces()
411 with open(outf, 'w') as f :
414 actuci = self.iduces[uce[0]].uci
416 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
418 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
420 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
421 f.write(etline.encode(self.parametres['syscoding']) + '\n')
422 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
424 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
425 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
427 with open(outfile + '~', 'w+') as f :
428 for i, lem in enumerate(actives) :
429 for uce in sorted(self.getlemuces(lem)) :
431 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
433 with open(outfile, 'w') as ffin :
434 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
437 os.remove(outfile + '~')
439 with open(listuce, 'w') as f :
440 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
442 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
443 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
445 with open(outfile + '~', 'w+') as f :
446 for i, lem in enumerate(actives) :
447 for uci in sorted(self.getlemucis(lem)) :
449 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
451 with open(outfile, 'w') as ffin :
452 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
455 os.remove(outfile + '~')
457 with open(listuci, 'w') as f :
458 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
460 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
461 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
463 duces = dict([[uce, i] for i, uce in enumerate(uces)])
464 with open(outfile + '~', 'w+') as f :
465 for i, lem in enumerate(actives) :
466 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
468 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
470 with open(outfile, 'w') as ffin :
471 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
474 os.remove(outfile + '~')
476 def make_table_with_classe(self, uces, list_act) :
477 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
478 uces = dict([[uce, i] for i, uce in enumerate(uces)])
479 for i, lem in enumerate(list_act) :
480 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
482 table_uce[uces[uce]][i] = 1
483 table_uce.insert(0, list_act)
486 def parse_active(self, gramact, gramsup = None) :
487 log.info('parse actives')
488 for lem in self.lems :
489 if lem.startswith('_') and lem.endswith('_') :
490 self.lems[lem].act = 2
491 elif self.lems[lem].gram in gramact :
492 self.lems[lem].act = 1
493 elif gramsup is not None :
494 if self.lems[lem].gram in gramsup :
495 self.lems[lem].act = 2
497 self.lems[lem].act = 0
499 self.lems[lem].act = 2
501 def make_actives_limit(self, limit, key = 1) :
502 if self.idformes is None :
504 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
506 def make_actives_nb(self, nbmax, key) :
507 log.info('make_actives_nb : %i - %i' % (nbmax,key))
508 if self.idformes is None :
510 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
511 self.activenb = len(allactives)
512 allactives = sorted(allactives, reverse = True)
513 if len(allactives) <= nbmax :
514 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
515 return [val[1] for val in allactives], allactives[-1][0]
517 effs = [val[0] for val in allactives]
518 if effs.count(effs[nbmax - 1]) > 1 :
519 lim = effs[nbmax - 1] + 1
523 stop = effs.index(lim)
529 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
530 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma x class count table to `fileout`.

    actives -- lemmas to tabulate
    ucecl   -- one collection of uce ids per class
    fileout -- path of the output file

    For each lemma, the cell for a class is the number of the lemma's uces
    that belong to that class. Lemmas whose counts sum to less than 3 are
    dropped. Lines are 'lem;n1;n2;...' joined by newlines and encoded with
    self.parametres['syscoding'].
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # Fetch the lemma's uces once instead of once per class.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
539 def make_etoiles(self) :
541 for uci in self.ucis :
542 etoiles.update(uci.etoiles[1:] + uci.paras)
545 def make_etoiles_dict(self) :
546 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
548 for etoile in etoiles :
549 et = etoile.split('_')
552 endet = '_'.join(et[1:])
553 if endet in det[et[0]] :
554 det[et[0]][endet] += 1
556 det[et[0]][endet] = 1
561 endet = '_'.join(et[1:])
562 det[et[0]] = {endet :1}
567 def make_etline(self, listet) :
568 etuces = [[] for et in listet]
569 for uci in self.ucis :
570 get = list(set(uci.etoiles).intersection(listet))
572 return '2 variables sur la meme ligne'
574 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the star x class count table to `fileout`.

    ucecl   -- one collection of uce ids per class
    fileout -- path of the output file

    For each value returned by make_etoiles(), the cell for a class is the
    number of uces given by getucesfrometoile() that belong to that class.
    Lines are 'etoile;n1;n2;...' joined by newlines and encoded with
    self.parametres['syscoding'].
    """
    log.info('etoiles/classes')
    lines = []
    for etoile in self.make_etoiles():
        # Collect the uces of this star once instead of once per class.
        etuces = set(self.getucesfrometoile(etoile))
        counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
        lines.append(';'.join([etoile] + counts))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
584 def make_colored_corpus(self) :
586 for i, lc in enumerate(self.lc) :
589 for uce in self.lc0 :
591 color = ['black'] + colors[len(self.lc) - 1]
593 <meta http-equiv="content-Type" content="text/html; charset=%s" />
595 ''' % sys.getdefaultencoding()
596 res = self.getalluces()
601 if self.iduces[uce[0]].uci != actuci :
602 actuci = self.iduces[uce[0]].uci
603 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
604 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
606 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
607 return txt + '\n</body></html>'
609 def count_from_list(self, l, d) :
617 def count_from_list_cl(self, l, d, a, clnb) :
626 def find_segments(self, taille_segment, taille_limite) :
628 for uce in self.getalluces() :
630 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
631 l = [[d[val], val] for val in d if d[val] >= 3]
634 if len(l) > taille_limite :
635 l = l[-taille_limite:]
638 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
640 for uce in self.getconcorde(list_uce) :
642 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
643 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
646 if len(l) > taille_limite :
647 l = l[-taille_limite:]
650 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
652 for b, classe in enumerate(self.lc) :
653 for uce in self.getconcorde(classe) :
656 uce = [self.formes[forme].lem for forme in uce]
657 for taille_segment in range(lenmin,lenmax) :
658 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
659 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
660 with open(fileout, 'w') as f :
661 f.write('\n'.join([';'.join(line) for line in result]))
663 def make_proftype(self, outf) :
665 for lem in self.lems :
666 gram = self.lems[lem].gram
668 res[gram] = [0 for val in self.lc]
669 lemuceeff = self.getlemuceseff(lem)
670 for i, classe in enumerate(self.lc) :
671 concern = set(classe).intersection(lemuceeff.keys())
672 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
673 res = [[gram] + [`val` for val in res[gram]] for gram in res]
675 with open(outf, 'w') as f :
676 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
679 def make_ucecl_from_R(self, filein) :
680 with open(filein, 'rU') as f :
685 line = line.replace('\n', '').replace('"', '').split(';')
686 self.lc.append([int(line[0]) - 1, int(line[1])])
687 classesl = [val[1] for val in self.lc]
689 self.lc = sorted(self.lc, key=itemgetter(1))
690 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
691 self.lc0 = self.lc.pop(0)
694 def get_stat_by_cluster(self, outf) :
695 log.info('get_stat_by_cluster')
697 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
698 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
699 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
700 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
701 sets = [set(cl) for cl in self.lc]
702 for forme in self.formes :
703 formeuceeff = self.getformeuceseff(forme)
704 for i, classe in enumerate(self.lc) :
705 concern = sets[i].intersection(formeuceeff.keys())
707 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
709 if self.formes[forme].freq == 1 :
711 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
712 with open(outf, 'w') as f :
714 log.info('%f' % (time() - t1))
716 def gethapaxbyet(self, etoiles) :
717 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
719 for uce in hapaxuces :
720 if uce in hucesdict :
724 etuces = [[] for et in etoiles]
725 for uci in self.ucis :
726 get = list(set(uci.etoiles).intersection(etoiles))
728 return '2 variables sur la meme ligne'
730 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
731 etuces = [set(val) for val in etuces]
732 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
734 def gethapaxuces(self) :
735 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
736 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
738 for i,uce in enumerate(hapaxuces) :
739 if uce in hucesdict :
740 hucesdict[uce][0] += 1
741 hucesdict[uce][1].append(hapax[i])
743 hucesdict[uce] = [1,[hapax[i]]]
745 for uce in hucesdict :
746 if hucesdict[uce][0] in huces :
747 huces[hucesdict[uce][0]].append(uce)
749 huces[hucesdict[uce][0]] = [uce]
750 huces = zip(huces, huces.values())
751 huces.sort(reverse=True)
755 for nb in huces[0:4] :
756 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
758 res = self.getconcorde([uce])
760 ucetxt = ' ' + row[1] + ' '
762 for hap in hucesdict[uce][1] :
763 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
764 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
765 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
766 txt += '<p>'+ucetxt+'</p>\n'
770 with open('/tmp/testhapxuce.html','w') as f :
775 def __init__(self, corpus) :
776 ucinb = corpus.getucinb()
777 ucisize = corpus.getucisize()
778 ucimean = float(sum(ucisize))/float(ucinb)
779 detoile = corpus.make_etoiles_dict()
783 def __init__(self, iduci, line, paraset = None) :
785 self.etoiles = line.split()
787 if paraset is not None :
788 self.paras = paraset.split()
793 def __init__(self, iduce, idpara, iduci) :
799 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
805 if freq is not None :
811 def __init__(self, parent, forme) :
812 self.formes = {forme.ident : forme.freq}
813 self.gram = forme.gram
814 self.freq = forme.freq
def add_forme(self, forme):
    """Register one more form: record its freq under its ident and add that
    freq to the running total self.freq."""
    effectif = forme.freq
    self.freq += effectif
    self.formes[forme.ident] = effectif
821 def decouperlist(chaine, longueur, longueurOptimale) :
823 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
824 Si on trouve un '$', c'est fini.
825 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
827 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
828 dsep = dict([[val[0],val[1]] for val in separateurs])
829 trouve = False # si on a trouvé un bon séparateur
830 iDecoupe = 0 # indice du caractere ou il faut decouper
832 longueur = min(longueur, len(chaine) - 1)
833 chaineTravail = chaine[:longueur + 1]
835 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
838 indice = chaineTravail.index(u'$')
840 iDecoupe = indice - 1
845 caractere = chaineTravail[nbCar]
846 distance = abs(longueurOptimale - nbCar) + 1
847 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
848 if caractere in dsep :
849 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
850 meilleur[0] = caractere
851 meilleur[1] = dsep[caractere]
856 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
858 meilleur[1] = dsep[' ']
865 #if meilleur[0] != ' ' :
866 # fin = chaine[iDecoupe + 1:]
867 # retour = chaineTravail[:iDecoupe]
869 fin = chaine[iDecoupe + 1:]
870 retour = chaineTravail[:iDecoupe + 1]
871 return len(retour) > 0, retour, fin
872 # si on a rien trouvé
873 return False, chaine, ''
875 def testetoile(line) :
876 return line.startswith(u'****')
879 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split `txt` on whitespace and append the end marker u'$'."""
    words = txt.split()
    words.append(u'$')
    return words
884 def prep_txtcharact(txt) :
889 Class for building a corpus
891 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
892 log.info('begin building corpus...')
893 self.lexique = lexique
894 self.expressions = expressions
896 self.corpus = Corpus(self, parametres_corpus)
899 self.lim = parametres_corpus.get('lim', 1000000)
900 self.encoding = parametres_corpus['encoding']
901 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
902 self.corpus.pathout.createdir(parametres_corpus['pathout'])
903 self.corpus.parametres['uuid'] = str(uuid4())
904 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
905 self.corpus.parametres['type'] = 'corpus'
906 if self.corpus.parametres['keep_ponct'] :
907 self.ponctuation_espace = [' ', '']
909 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
911 self.tolist = self.corpus.parametres.get('tolist', 0)
918 def prep_makeuce(self) :
919 method = self.corpus.parametres.get('ucemethod', 0)
921 self.decouper = decouperlist
922 self.prep_txt = prep_txtlist
923 self.ucesize = self.corpus.parametres.get('ucesize', 40)
925 self.decouper = decoupercharact
926 self.prep_txt = prep_txtcharact
927 self.ucesize = self.corpus.parametres.get('ucesize', 240)
928 log.info('method uce : %s' % method)
933 self.read_corpus(self.infile)
934 except Warning, args :
935 log.info('pas kool %s' % args)
939 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
940 self.time = time() - t1
942 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
943 log.info('time : %f' % (time() - t1))
946 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
947 self.cf = self.conn_f.cursor()
948 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
949 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
951 self.cf = self.conn_f.cursor()
952 self.cf.execute('PRAGMA temp_store=MEMORY;')
953 self.cf.execute('PRAGMA journal_mode=MEMORY;')
954 self.cf.execute('PRAGMA synchronous = OFF;')
955 self.cf.execute('begin')
956 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
957 self.c = self.conn.cursor()
958 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
960 self.c = self.conn.cursor()
961 self.c.execute('PRAGMA temp_store=MEMORY;')
962 self.c.execute('PRAGMA journal_mode=MEMORY;')
963 self.c.execute('PRAGMA synchronous = OFF;')
964 self.c.execute('begin')
967 #commit index and close db
970 self.cf.execute('CREATE INDEX iduces ON uces (id);')
971 self.cf.execute('CREATE INDEX ideff ON eff (id);')
975 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
976 self.ccorpus = self.conn_corpus.cursor()
977 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
978 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
979 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
980 self.conn_corpus.commit()
981 self.ccorpus = self.conn_corpus.cursor()
982 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
983 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
984 self.ccorpus.execute('PRAGMA synchronous = OFF;')
985 self.ccorpus.execute('begin')
987 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
988 self.conn_corpus.commit()
989 self.conn_corpus.close()
990 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill self.cleans with the cleaning callables enabled in the parameters.

    Callables are appended in a fixed order: dolower, firstclean, docharact,
    make_expression, doapos, dotiret. Every option defaults to enabled except
    'charact', which must be present in the parameters. When 'charact' is
    set, self.rule is taken from 'keep_caract' (or the built-in default).
    """
    params = self.corpus.parametres
    register = self.cleans.append
    if params.get('lower', 1):
        register(self.dolower)
    if params.get('firstclean', 1):
        register(self.firstclean)
    if params['charact']:
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        register(self.docharact)
    if params.get('expressions', 1):
        register(self.make_expression)
    if params.get('apos', 1):
        register(self.doapos)
    if params.get('tiret', 1):
        register(self.dotiret)
1007 def make_expression(self,txt) :
1008 for expression in self.expressions:
1009 if expression in txt :
1010 txt = txt.replace(expression, self.expressions[expression][0])
1013 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by the class [self.rule]
    with a single space and return the result."""
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return `txt` with every apostrophe replaced by a space."""
    apostrophe = u'\''
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return `txt` with every hyphen replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise quotes and ligatures and isolate punctuation with spaces.

    The typographic apostrophe becomes a plain one and the oe ligature
    becomes 'oe'. Then, in this order, '...' and the ellipsis character are
    turned into the ' £$£ ' marker and ? . ! , ; : are surrounded by spaces.
    '...' is handled before '.', so a three-dot ellipsis is not split.
    """
    txt = txt.replace(u'’', "'")
    txt = txt.replace(u'œ', u'oe')
    # The ellipsis marker is a unicode literal in both places: a non-ASCII
    # byte string passed to unicode.replace needs an implicit ASCII decode,
    # which fails under Python 2's default encoding.
    replacements = (
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for mark, spaced in replacements:
        txt = txt.replace(mark, spaced)
    return txt
1032 def make_cleans(self, txt) :
1033 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory form -> {uce: count} index to the formes database.

    For every form ident in self.corpus.idformesuces, one row goes to the
    `uces` table (space-separated uce ids) and one row to the `eff` table
    (the matching space-separated counts), both through self.cf. The index
    is then reset to an empty dict. Nothing is written when it is empty.
    """
    pending = self.corpus.idformesuces
    if pending != {}:
        log.info('backup %i' % len(pending))
        touce = []
        toeff = []
        for forme, ucefreq in pending.items():
            # repr() replaces the backtick syntax and yields the same text.
            # keys() and values() of one unmodified dict share the same order,
            # so both rows stay aligned.
            touce.append((repr(forme), ' '.join([repr(uce) for uce in ucefreq.keys()])))
            toeff.append((repr(forme), ' '.join([repr(eff) for eff in ucefreq.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
# backup_corpus: writes the in-memory corpus description to the database
# through the self.ccorpus cursor (tables etoiles, luces and formes).
1047 def backup_corpus(self) :
1048 log.info('start backup corpus')
# One 'etoiles' row per uci: its ident, its etoiles joined by spaces, its paras joined by spaces.
1050 for uci in self.corpus.ucis :
1051 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
# One 'luces' row per uce of that uci: repr of the uci ident, of the uce para and of the uce ident.
1052 for uce in uci.uces :
1053 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
# One 'formes' row per forme: repr(ident), the forme key itself, lem, gram, repr(freq).
1054 for forme in self.corpus.formes :
1055 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
# Logs a duration. NOTE(review): 't' is presumably a time() value taken at the start of this method - confirm.
1056 log.info('%f' % (time() - t))
def dofinish(self) :
    """Store the summary figures of the built corpus in ``self.corpus.parametres``.

    Keys written: 'date' (current date as a ctime string), 'time'
    (``self.time`` rendered as hours/minutes/seconds), 'ucinb', 'ucenb',
    'occurrences', 'formesnb' and 'hapax' (hapax count with its
    percentage of the formes and of the occurrences).
    Raises ZeroDivisionError when there are no formes or no occurrences.
    """
    corpus = self.corpus
    stats = corpus.parametres
    stats['date'] = datetime.datetime.now().ctime()
    # Break the elapsed time (in seconds) down into h / m / s.
    total_minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(total_minutes, 60)
    stats['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    stats['ucinb'] = corpus.getucinb()
    stats['ucenb'] = corpus.getucenb()
    stats['occurrences'] = corpus.gettotocc()
    stats['formesnb'] = len(corpus.formes)
    hapaxnb = corpus.gethapaxnb()
    share_of_formes = (float(hapaxnb) / len(corpus.formes)) * 100
    share_of_occurrences = (float(hapaxnb) / stats['occurrences']) * 100
    summary = (hapaxnb, share_of_formes, share_of_occurrences)
    stats['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % summary
# BuildFromAlceste: BuildCorpus subclass whose read_corpus parses a marked-up
# text file line by line (text-start lines, '-*' lines, and content lines).
1073 class BuildFromAlceste(BuildCorpus) :
# read_corpus: reads `infile` with self.encoding, fills self.corpus.ucis and
# hands accumulated text to self.treattxt.
# Raises Exception with one of the messages 'paragrapheOT', "EmptyText",
# 'TextBeforeTextMark' or "CorpusEncoding" (the last one on UnicodeDecodeError).
1074 def read_corpus(self, infile) :
1075 if self.dlg is not None :
1076 self.dlg.Pulse('textes : 0 - segments : 0')
# The 'ucimark' parameter selects the predicate used to recognise a text-start line.
1079 if self.corpus.parametres['ucimark'] == 0 :
1080 self.testuci = testetoile
1081 elif self.corpus.parametres['ucimark'] == 1 :
1082 self.testuci = testint
1088 with codecs.open(infile, 'r', self.encoding) as f :
1089 for linenb, line in enumerate(f) :
# Drop the end-of-line characters only; other whitespace is kept.
1090 line = line.rstrip('\n\r')
# Case 1: the line starts a new text (uci).
1091 if self.testuci(line) :
# Pending text is attributed to the previous uci, hence iduci - 1.
1094 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1096 self.corpus.ucis.append(Uci(iduci, line))
# A previous uci that received no uce is logged and removed before the new one is added.
1099 if self.corpus.ucis[-1].uces == [] :
1100 log.info(u'Empty text : %i' % linenb)
1102 self.corpus.ucis.pop()
1103 #raise Exception("EmptyText %i" % linenb)
1104 self.corpus.ucis.append(Uci(iduci, line))
# Progress feedback: the dialog is pulsed whenever (iduci + 1) is a multiple of 10.
1105 if self.dlg is not None :
1106 if not (iduci + 1) % 10 :
1107 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Case 2: the line starts with '-*' (presumably a paragraph marker - confirm).
1108 elif line.startswith(u'-*') :
1111 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
# Only the first whitespace-separated token of the line is recorded in paras.
1114 self.corpus.ucis[-1].paras.append(line.split()[0])
1116 raise Exception('paragrapheOT')
# Case 3: a non-blank line met after a first text mark (iduci != -1).
1117 elif line.strip() != '' and iduci != -1 :
# Remaining accumulated text is processed for the current uci.
1119 if txt != [] and iduci != -1 :
1120 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1123 raise Exception("EmptyText")
1124 if iduci != -1 and iduce != -1:
1127 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1128 raise Exception('TextBeforeTextMark')
# Decoding failures are reported as a generic Exception carrying "CorpusEncoding".
1129 except UnicodeDecodeError :
1130 raise Exception("CorpusEncoding")
# treattxt: cleans the accumulated text, splits it into uces, registers each
# uce on the last uci of the corpus and in the 'uces' table, and returns the
# (iduce, idpara) pair.
1132 def treattxt(self, txt, iduce, idpara, iduci) :
# With ucemethod == 2 and 'douce' set, the items of txt are joined with the
# sentinel 'laphrasepoursplitter', cleaned in one pass, then split back on it.
1133 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1134 txt = 'laphrasepoursplitter'.join(txt)
1135 txt = self.make_cleans(txt)
# Tokens listed in self.ponctuation_espace are dropped and whitespace is normalised.
1136 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1137 ucetxt = txt.split('laphrasepoursplitter')
# Other path: clean the text and let make_uces do the splitting.
1140 txt = self.make_cleans(txt)
1141 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1142 if self.corpus.ucis[-1].paras == [] :
1146 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
# The uce text is stored under repr(iduce).
1147 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1148 if not self.tolist :
1154 self.corpus.add_word(word)
# Disabled progress-reporting code, kept as left by the author.
1155 #if self.dlg is not None :
1156 # if self.limitshow > self.count :
1157 # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1159 # self.limitshow = 0
1161 # self.limitshow = self.last / 100000
1162 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): self.last / self.lim look like a counter and its flush threshold - confirm.
1163 if self.last > self.lim :
1166 return iduce, idpara
# make_uces: turns a cleaned text into a list of uce strings, using
# self.decouper with a window of self.ucesize + 15 and a target of self.ucesize.
# NOTE(review): keep_ponct is not used in the statements below - confirm.
1168 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1169 txt = ' '.join(txt.split())
# decouper returns a 3-tuple (reste, texte_uce, suite); `suite` is passed to the next call.
1172 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
# Tokens listed in self.ponctuation_espace are left out of the uce text.
1180 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1183 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1191 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1193 #print 'RESTEE UUCEEEEEEEEEEEEE', uce
# Single-element result: the whole text, minus the tokens in self.ponctuation_espace
# (presumably the path taken when douce is false - confirm).
1197 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
# Author's notes; presumably "caller (callee)" pairs - confirm.
1199 #decouper (list_sep)
1200 #make_uces (decouper)
1201 #treat_txt (make_uces)
# __init__: loads the 'corpus' options from corpus.cfg, shows the CorpusPref
# dialog and, when the dialog result equals 5100, prepares everything needed
# to build the corpus (parameters, output directory, lexicon, expressions).
1205 def __init__(self, parent, dlg = None) :
1206 self.parent = parent
# Default parameters come from the 'corpus' section of the user's corpus.cfg.
1208 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1209 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1210 dial = CorpusPref(parent, parametres)
1211 dial.CenterOnParent()
1212 dial.txtpath.SetLabel(parent.filename)
1213 #dial.repout_choices.SetValue(parametres['pathout'])
1214 self.res = dial.ShowModal()
# NOTE(review): 5100 looks like the numeric value of wx.ID_OK - confirm.
1215 if self.res == 5100 :
# The dialog's parameters replace the defaults loaded above.
1216 parametres = dial.doparametres()
1217 parametres['originalpath'] = parent.filename
1218 PathOut().createdir(parametres['pathout'])
1219 ReadLexique(self.parent, lang = parametres['lang'])
# NOTE(review): when '<lang>_exp' is missing from DictPath, the literal string
# 'french_exp' (not DictPath['french_exp']) is passed to ReadDicoAsDico - confirm this is intended.
1220 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1221 self.parametres = parametres
def doanalyse(self) :
    """Build the corpus from the parent's file and return it.

    A BuildFromAlceste is created with the parent's filename, the
    parameters stored on this object, the parent's lexique and
    expressions, and this object's dialog; its ``corpus`` attribute is
    returned.
    """
    app = self.parent
    builder = BuildFromAlceste(
        app.filename,
        self.parametres,
        app.lexique,
        app.expressions,
        dlg = self.dlg,
    )
    return builder.corpus
# Script entry point: builds a corpus directly with a hand-written parameter dict.
# NOTE(review): `encoding` and `infile` must be defined before these statements - confirm.
1228 if __name__ == '__main__' :
1230 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1231 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)