1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
17 from operator import itemgetter
18 from uuid import uuid4
19 from chemins import PathOut
20 from dialog import CorpusPref
21 from functions import ReadLexique, ReadDicoAsDico
22 from colors import colors
26 log = logging.getLogger('iramuteq.corpus')
29 def copycorpus(corpus) :
30 log.info('copy corpus')
31 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
32 copy_corpus.ucis = corpus.ucis
33 copy_corpus.formes = corpus.formes
34 copy_corpus.pathout = corpus.pathout
35 copy_corpus.conn_all()
45 def __init__(self, parent, parametres = {}, read = False) :
47 self.parametres = parametres
49 self.connformes = None
51 self.conncorpus = None
58 self.idformesuces = {}
63 self.pathout = PathOut(dirout = parametres['pathout'])
66 def add_word(self, word) :
67 if word in self.formes :
68 self.formes[word].freq += 1
69 if self.formes[word].ident in self.idformesuces :
70 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
73 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
75 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
77 if word in self.parent.lexique :
78 gramtype = self.parent.lexique[word][1]
79 lem = self.parent.lexique[word][0]
86 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
87 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
90 """connect corpus to db"""
91 if self.connformes is None :
92 log.info('connexion corpus')
93 self.connuces = sqlite3.connect(self.pathout['uces.db'])
94 self.cuces = self.connuces.cursor()
95 self.connformes = sqlite3.connect(self.pathout['formes.db'])
96 self.cformes = self.connformes.cursor()
97 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
98 self.ccorpus = self.conncorpus.cursor()
99 self.cformes.execute('PRAGMA temp_store=MEMORY;')
100 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
101 self.cformes.execute('PRAGMA synchronous = OFF;')
102 self.cuces.execute('PRAGMA temp_store=MEMORY;')
103 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
104 self.cuces.execute('PRAGMA synchronous = OFF;')
105 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
106 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
107 self.ccorpus.execute('PRAGMA synchronous = OFF;')
109 def read_corpus(self) :
110 log.info('read corpus')
111 self.parametres['syscoding'] = sys.getdefaultencoding()
112 if self.conncorpus is None :
114 res = self.ccorpus.execute('SELECT * FROM etoiles;')
116 self.ucis.append(Uci(row[0], row[1], row[2]))
117 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
119 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
120 res = self.ccorpus.execute('SELECT * FROM formes;')
121 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the uce ids stored in the ``uces`` table for one form.

    `wordid` is either a form id or a form string; a string is first
    translated to its id through ``self.formes``.  Each selected cell is
    either a single integer or a whitespace separated string of integers;
    all values are returned as one flat list of ints.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            found.append(cell)
        else :
            found.extend([int(val) for val in cell.split()])
    return found
130 def getformeuceseff(self, formeid) :
131 if isinstance(formeid, basestring) :
132 formeid = self.formes[formeid].ident
133 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
134 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
135 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
136 res = self.cformes.execute(query)
137 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
139 for i, uce in enumerate(uces) :
140 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem) :
    """Return the distinct uce ids recorded for every form of lemma `lem`.

    The form ids come from ``self.lems[lem].formes`` and the rows are read
    from the ``uces`` table through ``self.cformes``.  A cell is either a
    single integer or a whitespace separated string of integers.  The
    result goes through a set, so its order is not defined.
    """
    id_list = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % id_list)
    collected = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            collected.append(cell)
        else :
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem) :
    """Return the distinct ``uci`` values of the uces found for lemma `lem`.

    The uce ids come from ``getlemuces`` and each one is resolved with
    ``getucefromid``.  The result goes through a set: order is not defined.
    """
    ucis = []
    for uceid in self.getlemuces(lem) :
        ucis.append(self.getucefromid(uceid).uci)
    return list(set(ucis))
153 def getlemuceseff(self, lem, luces = None) :
154 formesid = ', '.join([`val` for val in self.lems[lem].formes])
155 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
156 res = self.cformes.execute(query)
157 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
158 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
159 res = self.cformes.execute(query)
160 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
162 for i, uce in enumerate(uces) :
163 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem) :
    """Return the ``freq`` attribute of the entry stored for `lem` in ``self.lems``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the entry of ``self.idformes`` stored under `formeid`.

    The id index is built with ``make_idformes`` the first time it is
    needed (while ``self.idformes`` is still None).
    """
    if self.idformes is None :
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self) :
    """Return the sum of the ``freq`` attribute over all entries of ``self.formes``."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    uce_count = self.getucenb()
    return occurrences / uce_count
183 return self.ucis[-1].uces[-1].ident + 1
186 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return one size per uci: the sum of the sizes of its uces.

    The per-uce sizes come from ``getucesize``; for each uci the slice
    going from the ident of its first uce to the ident of its last uce
    (inclusive) is summed, so uce idents are used as list positions.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self) :
    """Return, for each row of ``getalluces()``, the number of whitespace separated tokens of its second column."""
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
196 # def getlemseff(self) :
197 # if self.idformes is None :
198 # self.make_idformes()
199 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
201 # def getlemsefftype(self) :
202 # if self.idformes is None :
203 # self.make_idformes()
204 # if self.lems is None :
206 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces) :
    """Run a select on the ``uces`` table for the ids in `uces` and return the ``self.cuces`` execute result."""
    id_list = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % id_list
    return self.cuces.execute(query)
def getwordconcorde(self, word) :
    """Return ``getconcorde`` applied to the uce ids given by ``getworduces(word)``."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem) :
    """Return ``getconcorde`` applied to the uce ids given by ``getlemuces(lem)``."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self) :
    """Select every row of the ``uces`` table through ``self.cuces`` and return the execute result."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile) :
    """Return the idents of the uces of every uci whose ``etoiles`` contains `etoile`, in corpus order."""
    idents = []
    for uci in self.ucis :
        for uce in uci.uces :
            if etoile in uci.etoiles :
                idents.append(uce.ident)
    return idents
def getucefromid(self, uceid) :
    """Return the entry of ``self.iduces`` stored under `uceid`.

    The index is built with ``make_iduces`` while ``self.iduces`` is None.
    """
    if self.iduces is None :
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self) :
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    count = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            count += 1
    return count
def getactivesnb(self, key) :
    """Return how many entries of ``self.lems`` have an ``act`` attribute equal to `key`."""
    count = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            count += 1
    return count
232 # def make_lems(self, lem = True) :
233 # log.info('make lems')
235 # for forme in self.formes :
236 # if self.formes[forme].lem in self.lems :
237 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
238 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
240 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the ``etoiles`` of the uci that owns the uce `uceid`.

    A uce ident -> uci ident mapping is built once and kept in
    ``self.uceuci``.  The uci ident is then used as a position in
    ``self.ucis``.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
246 def make_lems(self, lem = True) :
247 log.info('make lems')
250 for forme in self.formes :
251 if self.formes[forme].lem in self.lems :
252 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
253 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
255 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
257 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build ``self.idformes``: a dict mapping each form's ``ident`` to the form object of ``self.formes``."""
    index = {}
    for forme in self.formes :
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self) :
    """Build ``self.iduces`` (uce ident -> uce object) unless it already exists."""
    if self.iduces is None :
        index = {}
        for uci in self.ucis :
            for uce in uci.uces :
                index[uce.ident] = uce
        self.iduces = index
266 def make_lexitable(self, mineff, etoiles) :
267 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
268 etuces = [[] for et in etoiles]
269 for uci in self.ucis :
270 get = list(set(uci.etoiles).intersection(etoiles))
272 return '2 variables sur la meme ligne'
274 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
275 etuces = [set(val) for val in etuces]
278 deff = self.getlemuceseff(lem)
280 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
281 tab.insert(0, [''] + etoiles)
284 def make_efftype_from_etoiles(self, etoiles) :
286 etuces = [[] for et in etoiles]
287 for uci in self.ucis :
288 get = list(set(uci.etoiles).intersection(etoiles))
290 return '2 variables sur la meme ligne'
292 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
293 etuces = [set(val) for val in etuces]
294 for lem in self.lems :
295 deff = self.getlemuceseff(lem)
297 gram = self.lems[lem].gram
299 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
301 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
302 tabout = [[gram] + dtype[gram] for gram in dtype]
303 tabout.insert(0, [''] + etoiles)
306 def make_uceactsize(self, actives) :
307 res = self.getalluces()
310 deff = self.getlemuceseff(lem)
312 ucesize[uce] = ucesize.get(uce, 0) + 1
315 def make_uc(self, actives, lim1, lim2) :
316 uceactsize = self.make_uceactsize(actives)
322 for uce in [uce for uci in self.ucis for uce in uci.uces] :
323 if uce.para == lastpara :
325 last1 += uceactsize.get(uce.ident,0)
326 uc1[-1].append(uce.ident)
328 uc1.append([uce.ident])
331 last2 += uceactsize.get(uce.ident, 0)
332 uc2[-1].append(uce.ident)
334 uc2.append([uce.ident])
337 last1 = uceactsize.get(uce.ident, 0)
338 last2 = uceactsize.get(uce.ident, 0)
340 uc1.append([uce.ident])
341 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build two uc groupings, write their matrices and their uce/uc tables.

    ``make_uc`` returns the two groupings (lists of lists of uce ids).
    Each grouping is written with ``write_ucmatrix`` and, in a separate
    ';' separated file headed ``uce;uc``, every uce id is listed with the
    position of the group that contains it.

    Returns the number of groups of each grouping as a 2-tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)

    def as_rows(groups) :
        # one [uce, uc] pair per uce, preceded by the header row
        rows = [['uce', 'uc']]
        for position, members in enumerate(groups) :
            for uce in members :
                rows.append([repr(uce), repr(position)])
        return rows

    listuce1 = as_rows(uc1)
    listuce2 = as_rows(uc2)
    for target, rows in ((listuce1out, listuce1), (listuce2out, listuce2)) :
        with open(target, 'w') as f :
            f.write('\n'.join([';'.join(row) for row in rows]))
    return len(uc1), len(uc2)
357 def write_ucmatrix(self, uc, actives, fileout) :
358 log.info('write uc matrix %s' % fileout)
359 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
362 with open(fileout + '~', 'w+') as f :
363 for i, lem in enumerate(actives) :
364 for uce in self.getlemuces(lem):
365 if (uces_uc[uce], i) not in deja_la :
367 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
368 deja_la[(uces_uc[uce], i)] = 0
370 with open(fileout, 'w') as ffin :
371 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
374 os.remove(fileout + '~')
377 def export_corpus(self, outf) :
378 #outf = 'export_corpus.txt'
380 res = self.getalluces()
384 with open(outf,'w') as f :
386 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
387 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
388 elif self.iduces[uce[0]].uci != actuci :
389 actuci = self.iduces[uce[0]].uci
390 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
391 actpara = self.iduces[uce[0]].para
392 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
395 actpara = self.iduces[uce[0]].para
396 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
397 elif self.iduces[uce[0]].para != actpara :
398 actpara = self.iduces[uce[0]].para
400 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
402 def export_corpus_classes(self, outf, alc = True, lem = False) :
404 for i, lc in enumerate(self.lc) :
407 for uce in self.lc0 :
409 res = self.getalluces()
411 with open(outf, 'w') as f :
414 actuci = self.iduces[uce[0]].uci
416 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
418 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
420 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
421 f.write(etline.encode(self.parametres['syscoding']) + '\n')
422 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
424 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
425 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
427 with open(outfile + '~', 'w+') as f :
428 for i, lem in enumerate(actives) :
429 for uce in sorted(self.getlemuces(lem)) :
431 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
433 with open(outfile, 'w') as ffin :
434 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
437 os.remove(outfile + '~')
439 with open(listuce, 'w') as f :
440 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
442 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
443 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
445 with open(outfile + '~', 'w+') as f :
446 for i, lem in enumerate(actives) :
447 for uci in sorted(self.getlemucis(lem)) :
449 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
451 with open(outfile, 'w') as ffin :
452 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
455 os.remove(outfile + '~')
457 with open(listuci, 'w') as f :
458 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
460 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
461 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
463 duces = dict([[uce, i] for i, uce in enumerate(uces)])
464 with open(outfile + '~', 'w+') as f :
465 for i, lem in enumerate(actives) :
466 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
468 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
470 with open(outfile, 'w') as ffin :
471 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
474 os.remove(outfile + '~')
476 def make_table_with_classe(self, uces, list_act) :
477 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
478 uces = dict([[uce, i] for i, uce in enumerate(uces)])
479 for i, lem in enumerate(list_act) :
480 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
482 table_uce[uces[uce]][i] = 1
483 table_uce.insert(0, list_act)
486 def parse_active(self, gramact, gramsup = None) :
487 log.info('parse actives')
488 for lem in self.lems :
489 if lem.startswith('_') and lem.endswith('_') :
490 self.lems[lem].act = 2
491 elif self.lems[lem].gram in gramact :
492 self.lems[lem].act = 1
493 elif gramsup is not None :
494 if self.lems[lem].gram in gramsup :
495 self.lems[lem].act = 2
497 self.lems[lem].act = 0
499 self.lems[lem].act = 2
501 def make_actives_limit(self, limit, key = 1) :
502 if self.idformes is None :
504 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
506 def make_actives_nb(self, nbmax, key) :
507 log.info('make_actives_nb : %i - %i' % (nbmax,key))
508 if self.idformes is None :
510 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
511 self.activenb = len(allactives)
512 allactives = sorted(allactives, reverse = True)
513 if len(allactives) <= nbmax :
514 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
515 return [val[1] for val in allactives], allactives[-1][0]
517 effs = [val[0] for val in allactives]
518 if effs.count(effs[nbmax - 1]) > 1 :
519 lim = effs[nbmax - 1] + 1
523 stop = effs.index(lim)
530 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
531 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma x class contingency table to `fileout`.

    For each lemma of `actives`, counts how many of its uces (from
    ``getlemuces``) belong to each class of `ucecl`.  Lemmas whose counts
    sum to less than 3 are left out.  Lines are ';' separated, start with
    the lemma and are encoded with ``self.parametres['syscoding']``.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives :
        # getlemuces queries the db: fetch once per lemma, not once per class
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
540 def make_etoiles(self) :
542 for uci in self.ucis :
543 etoiles.update(uci.etoiles[1:] + uci.paras)
546 def make_etoiles_dict(self) :
547 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
549 for etoile in etoiles :
550 et = etoile.split('_')
553 endet = '_'.join(et[1:])
554 if endet in det[et[0]] :
555 det[et[0]][endet] += 1
557 det[et[0]][endet] = 1
562 endet = '_'.join(et[1:])
563 det[et[0]] = {endet :1}
568 def make_etline(self, listet) :
569 etuces = [[] for et in listet]
570 for uci in self.ucis :
571 get = list(set(uci.etoiles).intersection(listet))
573 return '2 variables sur la meme ligne'
575 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile x class contingency table to `fileout`.

    For each value returned by ``make_etoiles``, counts how many of its
    uces (from ``getucesfrometoile``) belong to each class of `ucecl`.
    Lines are ';' separated, start with the etoile and are encoded with
    ``self.parametres['syscoding']``.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f :
        lines = []
        for etoile in etoiles :
            # scan the corpus once per etoile instead of once per class
            etuces = set(self.getucesfrometoile(etoile))
            counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
585 def make_colored_corpus(self) :
587 for i, lc in enumerate(self.lc) :
590 for uce in self.lc0 :
592 color = ['black'] + colors[len(self.lc) - 1]
594 <meta http-equiv="content-Type" content="text/html; charset=%s" />
596 ''' % sys.getdefaultencoding()
597 res = self.getalluces()
602 if self.iduces[uce[0]].uci != actuci :
603 actuci = self.iduces[uce[0]].uci
604 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
605 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
607 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
608 return txt + '\n</body></html>'
610 def count_from_list(self, l, d) :
618 def count_from_list_cl(self, l, d, a, clnb) :
627 def find_segments(self, taille_segment, taille_limite) :
629 for uce in self.getalluces() :
631 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
632 l = [[d[val], val] for val in d if d[val] >= 3]
635 if len(l) > taille_limite :
636 l = l[-taille_limite:]
639 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
641 for uce in self.getconcorde(list_uce) :
643 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
644 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
647 if len(l) > taille_limite :
648 l = l[-taille_limite:]
651 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
653 for b, classe in enumerate(self.lc) :
654 for uce in self.getconcorde(classe) :
657 uce = [self.formes[forme].lem for forme in uce]
658 for taille_segment in range(lenmin,lenmax) :
659 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
660 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
661 with open(fileout, 'w') as f :
662 f.write('\n'.join([';'.join(line) for line in result]))
664 def make_proftype(self, outf) :
666 for lem in self.lems :
667 gram = self.lems[lem].gram
669 res[gram] = [0 for val in self.lc]
670 lemuceeff = self.getlemuceseff(lem)
671 for i, classe in enumerate(self.lc) :
672 concern = set(classe).intersection(lemuceeff.keys())
673 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
674 res = [[gram] + [`val` for val in res[gram]] for gram in res]
676 with open(outf, 'w') as f :
677 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
680 def make_ucecl_from_R(self, filein) :
681 with open(filein, 'rU') as f :
686 line = line.replace('\n', '').replace('"', '').split(';')
687 self.lc.append([int(line[0]) - 1, int(line[1])])
688 classesl = [val[1] for val in self.lc]
690 self.lc = sorted(self.lc, key=itemgetter(1))
691 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
692 self.lc0 = self.lc.pop(0)
695 def get_stat_by_cluster(self, outf) :
696 log.info('get_stat_by_cluster')
698 occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
699 formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
700 hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
701 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
702 sets = [set(cl) for cl in self.lc]
703 for forme in self.formes :
704 formeuceeff = self.getformeuceseff(forme)
705 for i, classe in enumerate(self.lc) :
706 concern = sets[i].intersection(formeuceeff.keys())
708 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
710 if self.formes[forme].freq == 1 :
712 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
713 with open(outf, 'w') as f :
715 log.info('%f' % (time() - t1))
717 def gethapaxbyet(self, etoiles) :
718 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
720 for uce in hapaxuces :
721 if uce in hucesdict :
725 etuces = [[] for et in etoiles]
726 for uci in self.ucis :
727 get = list(set(uci.etoiles).intersection(etoiles))
729 return '2 variables sur la meme ligne'
731 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
732 etuces = [set(val) for val in etuces]
733 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
735 def gethapaxuces(self) :
736 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
737 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
739 for i,uce in enumerate(hapaxuces) :
740 if uce in hucesdict :
741 hucesdict[uce][0] += 1
742 hucesdict[uce][1].append(hapax[i])
744 hucesdict[uce] = [1,[hapax[i]]]
746 for uce in hucesdict :
747 if hucesdict[uce][0] in huces :
748 huces[hucesdict[uce][0]].append(uce)
750 huces[hucesdict[uce][0]] = [uce]
751 huces = zip(huces, huces.values())
752 huces.sort(reverse=True)
756 for nb in huces[0:4] :
757 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
759 res = self.getconcorde([uce])
761 ucetxt = ' ' + row[1] + ' '
763 for hap in hucesdict[uce][1] :
764 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
765 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
766 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
767 txt += '<p>'+ucetxt+'</p>\n'
771 with open('/tmp/testhapxuce.html','w') as f :
776 def __init__(self, corpus) :
777 ucinb = corpus.getucinb()
778 ucisize = corpus.getucisize()
779 ucimean = float(sum(ucisize))/float(ucinb)
780 detoile = corpus.make_etoiles_dict()
784 def __init__(self, iduci, line, paraset = None) :
786 self.etoiles = line.split()
788 if paraset is not None :
789 self.paras = paraset.split()
794 def __init__(self, iduce, idpara, iduci) :
800 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
806 if freq is not None :
812 def __init__(self, parent, forme) :
813 self.formes = {forme.ident : forme.freq}
814 self.gram = forme.gram
815 self.freq = forme.freq
def add_forme(self, forme) :
    """Register `forme` under its ident in ``self.formes`` and add its frequency to ``self.freq``."""
    self.formes.update({forme.ident : forme.freq})
    self.freq = self.freq + forme.freq
822 def decouperlist(chaine, longueur, longueurOptimale) :
824 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
825 Si on trouve un '$', c'est fini.
826 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
828 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
829 dsep = dict([[val[0],val[1]] for val in separateurs])
830 trouve = False # si on a trouvé un bon séparateur
831 iDecoupe = 0 # indice du caractere ou il faut decouper
833 longueur = min(longueur, len(chaine) - 1)
834 chaineTravail = chaine[:longueur + 1]
836 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
839 indice = chaineTravail.index(u'$')
841 iDecoupe = indice - 1
846 caractere = chaineTravail[nbCar]
847 distance = abs(longueurOptimale - nbCar) + 1
848 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
849 if caractere in dsep :
850 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
851 meilleur[0] = caractere
852 meilleur[1] = dsep[caractere]
857 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
859 meilleur[1] = dsep[' ']
866 #if meilleur[0] != ' ' :
867 # fin = chaine[iDecoupe + 1:]
868 # retour = chaineTravail[:iDecoupe]
870 fin = chaine[iDecoupe + 1:]
871 retour = chaineTravail[:iDecoupe + 1]
872 return len(retour) > 0, retour, fin
873 # si on a rien trouvé
874 return False, chaine, ''
876 def testetoile(line) :
877 return line.startswith(u'****')
880 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split `txt` on whitespace and return the tokens followed by a final u'$' item."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
885 def prep_txtcharact(txt) :
890 Class for building a corpus
892 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
893 log.info('begin building corpus...')
894 self.lexique = lexique
895 self.expressions = expressions
897 self.corpus = Corpus(self, parametres_corpus)
900 self.lim = parametres_corpus.get('lim', 1000000)
901 self.encoding = parametres_corpus['encoding']
902 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
903 self.corpus.pathout.createdir(parametres_corpus['pathout'])
904 self.corpus.parametres['uuid'] = str(uuid4())
905 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
906 self.corpus.parametres['type'] = 'corpus'
907 if self.corpus.parametres['keep_ponct'] :
908 self.ponctuation_espace = [' ', '']
910 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
912 self.tolist = self.corpus.parametres.get('tolist', 0)
919 def prep_makeuce(self) :
920 method = self.corpus.parametres.get('ucemethod', 0)
922 self.decouper = decouperlist
923 self.prep_txt = prep_txtlist
924 self.ucesize = self.corpus.parametres.get('ucesize', 40)
926 self.decouper = decoupercharact
927 self.prep_txt = prep_txtcharact
928 self.ucesize = self.corpus.parametres.get('ucesize', 240)
929 log.info('method uce : %s' % method)
934 self.read_corpus(self.infile)
935 except Warning, args :
936 log.info('pas kool %s' % args)
940 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
941 self.time = time() - t1
943 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
944 log.info('time : %f' % (time() - t1))
947 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
948 self.cf = self.conn_f.cursor()
949 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
950 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
952 self.cf = self.conn_f.cursor()
953 self.cf.execute('PRAGMA temp_store=MEMORY;')
954 self.cf.execute('PRAGMA journal_mode=MEMORY;')
955 self.cf.execute('PRAGMA synchronous = OFF;')
956 self.cf.execute('begin')
957 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
958 self.c = self.conn.cursor()
959 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
961 self.c = self.conn.cursor()
962 self.c.execute('PRAGMA temp_store=MEMORY;')
963 self.c.execute('PRAGMA journal_mode=MEMORY;')
964 self.c.execute('PRAGMA synchronous = OFF;')
965 self.c.execute('begin')
968 #commit index and close db
971 self.cf.execute('CREATE INDEX iduces ON uces (id);')
972 self.cf.execute('CREATE INDEX ideff ON eff (id);')
976 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
977 self.ccorpus = self.conn_corpus.cursor()
978 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
979 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
980 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
981 self.conn_corpus.commit()
982 self.ccorpus = self.conn_corpus.cursor()
983 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
984 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
985 self.ccorpus.execute('PRAGMA synchronous = OFF;')
986 self.ccorpus.execute('begin')
988 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
989 self.conn_corpus.commit()
990 self.conn_corpus.close()
991 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Append to ``self.cleans`` the cleaning methods enabled in the corpus parameters.

    Order of registration: dolower, firstclean, docharact, make_expression,
    doapos, dotiret.  Every option defaults to enabled except 'charact',
    which is read without a default and must be present.  When 'charact' is
    set, ``self.rule`` receives the character class body used by docharact
    (the 'keep_caract' parameter, or the built-in default).
    """
    options = self.corpus.parametres
    for key, cleaner in (('lower', 'dolower'), ('firstclean', 'firstclean')) :
        if options.get(key, 1) :
            self.cleans.append(getattr(self, cleaner))
    if options['charact'] :
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    for key, cleaner in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')) :
        if options.get(key, 1) :
            self.cleans.append(getattr(self, cleaner))
1008 def make_expression(self,txt) :
1009 for expression in self.expressions:
1010 if expression in txt :
1011 txt = txt.replace(expression, self.expressions[expression][0])
1014 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace every run of characters matched by the class ``[self.rule]`` with one space.

    ``self.rule`` is the body of a regex character class.
    """
    # an earlier default rule also kept '-' (last character of the class)
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return `txt` with every apostrophe (') replaced by a space."""
    apostrophe, blank = u'\'', u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt) :
    """Return `txt` with every hyphen (-) replaced by a space."""
    hyphen, blank = u'-', u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt) :
    """Normalise typographic characters and put spaces around punctuation.

    The right single quotation mark becomes a plain apostrophe and the
    ligature 'œ' becomes 'oe'.  Three dots and the ellipsis character are
    both rewritten to ' £$£ '; '?', '.', '!', ',', ';' and ':' are each
    surrounded by spaces.  The three-dot sequence is handled before the
    single dot so that it is not split.

    Every literal is a unicode literal: the original mixed a byte string
    containing '£' with unicode text, which only works when the process
    default encoding can decode it.
    """
    txt = txt.replace(u'’', u"'")
    txt = txt.replace(u'œ', u'oe')
    replacements = (
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in replacements :
        txt = txt.replace(old, new)
    return txt
1033 def make_cleans(self, txt) :
1034 for clean in self.cleans :
def backup_uce(self) :
    """Flush ``self.corpus.idformesuces`` into the ``uces`` and ``eff`` tables.

    For every form id, the keys of its dict are written space separated
    into ``uces`` and the values into ``eff``, through the ``self.cf``
    cursor.  The in-memory dict is then reset.  Nothing is done when the
    dict is empty.
    """
    if self.corpus.idformesuces != {} :
        log.info('backup %i' % len(self.corpus.idformesuces))
        touce = []
        toeff = []
        for forme in self.corpus.idformesuces :
            counts = self.corpus.idformesuces[forme]
            touce.append((repr(forme), ' '.join([repr(val) for val in counts.keys()])))
        for forme in self.corpus.idformesuces :
            counts = self.corpus.idformesuces[forme]
            toeff.append((repr(forme), ' '.join([repr(val) for val in counts.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
1048 def backup_corpus(self) :
1049 log.info('start backup corpus')
1051 for uci in self.corpus.ucis :
1052 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1053 for uce in uci.uces :
1054 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1055 for forme in self.corpus.formes :
1056 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1057 log.info('%f' % (time() - t))
def dofinish(self) :
    """Record the summary of the finished build in ``self.corpus.parametres``.

    Keys written: ``'date'`` (current time as a ``ctime`` string),
    ``'time'`` (``self.time`` split into hours, minutes and seconds),
    ``'ucinb'``, ``'ucenb'``, ``'occurrences'``, ``'formesnb'`` and
    ``'hapax'`` (hapax count with its share of the forms and of the
    occurrences, as percentages).

    A share whose total is zero (no forms, or no occurrences) is reported
    as 0.00 instead of raising ZeroDivisionError.
    """
    corpus = self.corpus
    stats = corpus.parametres

    def share(part, total):
        # Percentage of ``part`` in ``total``; 0.0 when there is nothing to divide by.
        if not total:
            return 0.0
        return (float(part) / total) * 100

    stats['date'] = datetime.datetime.now().ctime()
    # self.time is read as a number of seconds (it is divided by 60 twice).
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    stats['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    stats['ucinb'] = corpus.getucinb()
    stats['ucenb'] = corpus.getucenb()
    stats['occurrences'] = corpus.gettotocc()
    stats['formesnb'] = len(corpus.formes)
    hapaxnb = corpus.gethapaxnb()
    pourhapaxf = share(hapaxnb, len(corpus.formes))
    pourhapaxocc = share(hapaxnb, stats['occurrences'])
    stats['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1074 class BuildFromAlceste(BuildCorpus) :
1075 def read_corpus(self, infile) :
1076 if self.dlg is not None :
1077 self.dlg.Pulse('textes : 0 - segments : 0')
1080 if self.corpus.parametres['ucimark'] == 0 :
1081 self.testuci = testetoile
1082 elif self.corpus.parametres['ucimark'] == 1 :
1083 self.testuci = testint
1089 with codecs.open(infile, 'r', self.encoding) as f :
1090 for linenb, line in enumerate(f) :
1091 line = line.rstrip('\n\r')
1092 if self.testuci(line) :
1095 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1097 self.corpus.ucis.append(Uci(iduci, line))
1100 if self.corpus.ucis[-1].uces == [] :
1101 log.info(u'Empty text : %i' % linenb)
1103 self.corpus.ucis.pop()
1104 #raise Exception("EmptyText %i" % linenb)
1105 self.corpus.ucis.append(Uci(iduci, line))
1106 if self.dlg is not None :
1107 if not (iduci + 1) % 10 :
1108 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1109 elif line.startswith(u'-*') :
1112 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1115 self.corpus.ucis[-1].paras.append(line.split()[0])
1117 raise Exception('paragrapheOT')
1118 elif line.strip() != '' and iduci != -1 :
1120 if txt != [] and iduci != -1 :
1121 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1124 raise Exception("EmptyText")
1125 if iduci != -1 and iduce != -1:
1128 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1129 raise Exception('TextBeforeTextMark')
1130 except UnicodeDecodeError :
1131 raise Exception("CorpusEncoding")
1133 def treattxt(self, txt, iduce, idpara, iduci) :
1134 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1135 txt = 'laphrasepoursplitter'.join(txt)
1136 txt = self.make_cleans(txt)
1137 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1138 ucetxt = txt.split('laphrasepoursplitter')
1141 txt = self.make_cleans(txt)
1142 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1143 if self.corpus.ucis[-1].paras == [] :
1147 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1148 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1149 if not self.tolist :
1155 self.corpus.add_word(word)
1156 #if self.dlg is not None :
1157 # if self.limitshow > self.count :
1158 # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1160 # self.limitshow = 0
1162 # self.limitshow = self.last / 100000
1163 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1164 if self.last > self.lim :
1167 return iduce, idpara
1169 def make_uces(self, txt, douce = True, keep_ponct = False) :
1170 txt = ' '.join(txt.split())
1173 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1181 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1184 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1192 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1194 #print 'RESTEE UUCEEEEEEEEEEEEE', uce
1198 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1200 #decouper (list_sep)
1201 #make_uces (decouper)
1202 #treat_txt (make_uces)
1206 def __init__(self, parent, dlg = None) :
1207 self.parent = parent
1209 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1210 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1211 dial = CorpusPref(parent, parametres)
1212 dial.CenterOnParent()
1213 dial.txtpath.SetLabel(parent.filename)
1214 #dial.repout_choices.SetValue(parametres['pathout'])
1215 self.res = dial.ShowModal()
1216 if self.res == 5100 :
1217 parametres = dial.doparametres()
1218 parametres['originalpath'] = parent.filename
1219 PathOut().createdir(parametres['pathout'])
1220 ReadLexique(self.parent, lang = parametres['lang'])
1221 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1222 self.parametres = parametres
def doanalyse(self) :
    """Build the corpus from the parent's input file and return it.

    A ``BuildFromAlceste`` is created from the parent's filename, the
    stored parameters and the parent's lexique and expressions, with
    ``self.dlg`` passed as the dialog; its ``corpus`` attribute is returned.
    """
    app = self.parent
    builder = BuildFromAlceste(app.filename, self.parametres, app.lexique, app.expressions, dlg = self.dlg)
    return builder.corpus
1229 if __name__ == '__main__' :
1231 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1232 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)