1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
53 def __init__(self, parent, parametres = {}, read = False) :
55 self.parametres = parametres
57 self.connformes = None
59 self.conncorpus = None
66 self.idformesuces = {}
71 self.pathout = PathOut(dirout = parametres['pathout'])
74 def add_word(self, word) :
75 if word in self.formes :
76 self.formes[word].freq += 1
77 if self.formes[word].ident in self.idformesuces :
78 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
79 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
81 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
85 if word in self.parent.lexique :
86 gramtype = self.parent.lexique[word][1]
87 lem = self.parent.lexique[word][0]
94 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
95 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
97 def add_word_from_forme(self, word, stident):
98 if word.forme in self.formes :
99 self.formes[word.forme].freq += 1
100 if self.formes[word.forme].ident in self.idformesuces :
101 if stident in self.idformesuces[self.formes[word.forme].ident] :
102 self.idformesuces[self.formes[word.forme].ident][stident] += 1
104 self.idformesuces[self.formes[word.forme].ident][stident] = 1
106 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
108 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
109 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
112 """connect corpus to db"""
113 if self.connformes is None :
114 log.info('connexion corpus')
115 self.connuces = sqlite3.connect(self.pathout['uces.db'])
116 self.cuces = self.connuces.cursor()
117 self.connformes = sqlite3.connect(self.pathout['formes.db'])
118 self.cformes = self.connformes.cursor()
119 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
120 self.ccorpus = self.conncorpus.cursor()
121 self.cformes.execute('PRAGMA temp_store=MEMORY;')
122 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
123 self.cformes.execute('PRAGMA synchronous = OFF;')
124 self.cuces.execute('PRAGMA temp_store=MEMORY;')
125 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
126 self.cuces.execute('PRAGMA synchronous = OFF;')
127 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
128 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
129 self.ccorpus.execute('PRAGMA synchronous = OFF;')
131 def read_corpus(self) :
132 log.info('read corpus')
133 self.parametres['syscoding'] = sys.getdefaultencoding()
134 if self.conncorpus is None :
136 res = self.ccorpus.execute('SELECT * FROM etoiles;')
138 self.ucis.append(Uci(row[0], row[1], row[2]))
139 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
141 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
142 res = self.ccorpus.execute('SELECT * FROM formes;')
143 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the ids of the uces in which a form occurs.

    wordid -- either the form itself (a string, resolved through
              self.formes) or its identifier.

    Each matching row of the 'uces' table stores, in its 'uces' column,
    either a single integer or a whitespace-separated string of integers;
    all of them are flattened into one list of ints.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # repr() is the portable spelling of the former backtick syntax.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend(int(val) for val in cell.split())
    return uces
def getworducis(self, wordid):
    """Return the distinct `uci` values of the uces in which a form occurs.

    wordid is passed unchanged to getworduces(). The result is built from
    a set, so its order is not meaningful.
    """
    ucis = {self.getucefromid(uceid).uci for uceid in self.getworduces(wordid)}
    return list(ucis)
156 def getformeuceseff(self, formeid) :
157 if isinstance(formeid, basestring) :
158 formeid = self.formes[formeid].ident
159 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
160 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
162 res = self.cformes.execute(query)
163 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
165 for i, uce in enumerate(uces) :
166 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct ids of the uces containing any form of a lemma.

    lem -- key of self.lems; the keys of self.lems[lem].formes are the
           form ids used to select rows of the 'uces' table.

    The 'uces' column holds either one integer or a whitespace-separated
    string of integers. The result has no duplicates and no meaningful
    order.
    """
    # repr() is the portable spelling of the former backtick syntax.
    formesid = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend(int(val) for val in cell.split())
    return list(set(uces))
def gettgenst(self, tgen):
    """Return the distinct ids of the uces containing any lemma of a tgen.

    tgen -- iterable of lemmas. Lemmas that are not keys of self.lems are
            ignored.

    The result has no duplicates and no meaningful order; it is empty when
    none of the lemmas is known.
    """
    # The membership test has to precede the lookup: when it was placed
    # after `self.lems[lem].formes`, an unknown lemma raised KeyError
    # before the filter could discard it.
    formesid = ', '.join([repr(ident)
                          for lem in tgen if lem in self.lems
                          for ident in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend(int(val) for val in cell.split())
    return list(set(uces))
def gettgentxt(self, tgen):
    """Return the distinct `uci` values of the uces matched by a tgen.

    The uce ids come from gettgenst(); the result is built from a set, so
    its order is not meaningful.
    """
    ucis = {self.getucefromid(uceid).uci for uceid in self.gettgenst(tgen)}
    return list(ucis)
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces containing a lemma.

    The uce ids come from getlemuces(); the result is built from a set, so
    its order is not meaningful.
    """
    ucis = {self.getucefromid(uceid).uci for uceid in self.getlemuces(lem)}
    return list(ucis)
189 def getlemuceseff(self, lem, luces = None) :
190 formesid = ', '.join([`val` for val in self.lems[lem].formes])
191 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
192 res = self.cformes.execute(query)
193 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
194 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
195 res = self.cformes.execute(query)
196 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
198 for i, uce in enumerate(uces) :
199 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the uces of self.lc[cluster] that contain the lemma."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the lemma stored under key lem."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of self.idformes stored under formeid.

    self.idformes is built through make_idformes() on first use, i.e.
    while it is still None.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attributes of all entries of self.formes."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
222 return self.ucis[-1].uces[-1].ident + 1
225 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The sizes come from getucesize() and are indexed by uce ident; each
    uci contributes the slice running from the ident of its first uce to
    the ident of its last uce, inclusive.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    Rows come from getalluces(); the text is read from the second column.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Fetch the rows of the 'uces' table for the given uce ids.

    uces -- iterable of ids; their repr() is interpolated into the
            IN (...) list of the query.

    Returns the result of self.cuces.execute(); rows are ordered by id.
    """
    # repr() is the portable spelling of the former backtick syntax.
    idlist = ', '.join([repr(uceid) for uceid in uces])
    return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % idlist)
238 def getuciconcorde(self, ucis) :
239 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
240 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces in which a form occurs."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces containing a lemma."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run 'SELECT * FROM uces' on self.cuces and return the result."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getallucis(self):
    """Return [uci ident, text] pairs, one per uci.

    The text of a uci is the newline-joined text of its uces; uce texts
    are taken from the second column of getalluces() and looked up by uce
    ident, used as a list index.
    """
    texts = [row[1] for row in self.getalluces()]
    result = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        result.append([uci.ident, '\n'.join(parts)])
    return result
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose etoiles contain etoile."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
259 def getetoileuces(self) :
260 log.info('get uces etoiles')
263 for uci in self.ucis :
264 etoiles = uci.etoiles[1:]
266 if et in etoileuces :
267 etoileuces[et] += [uce.ident for uce in uci.uces]
269 etoileuces[et] = [uce.ident for uce in uci.uces]
271 for et in uci.paras :
272 if et in etoileuces :
273 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
275 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
281 def getetoileucis(self):
283 for uci in self.ucis :
284 etoiles = uci.etoiles[1:]
286 if et in etoileuces :
287 etoileuces[et] += [uci.ident]
289 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of self.iduces stored under uceid.

    self.iduces is built through make_iduces() on first use, i.e. while
    it is still None.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of self.formes have a `freq` equal to 1."""
    hapax = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            hapax += 1
    return hapax
def getactivesnb(self, key):
    """Return how many entries of self.lems have an `act` equal to key."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
301 # def make_lems(self, lem = True) :
302 # log.info('make lems')
304 # for forme in self.formes :
305 # if self.formes[forme].lem in self.lems :
306 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
307 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
309 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the etoiles of the uci that owns the uce with ident uceid.

    The uce ident -> uci ident mapping (self.uceuci) is built on first
    use; the uci ident is then used as an index into self.ucis.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
315 def make_lems(self, lem = True) :
316 log.info('make lems')
319 for forme in self.formes :
320 if self.formes[forme].lem in self.lems :
321 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
322 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
324 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
326 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
328 def make_lems_from_dict(self, dictionnaire, dolem = True) :
329 log.info('make lems from dict')
331 for forme in self.formes :
332 if self.formes[forme].forme in dictionnaire :
333 lem = dictionnaire[forme][0]
334 gram = dictionnaire[forme][1]
335 elif forme.isdigit() :
341 self.formes[forme].lem = lem
342 self.formes[forme].gram = gram
344 if self.formes[forme].lem in self.lems :
345 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
346 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
348 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
350 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build self.idformes, mapping the ident of each entry of self.formes to that entry."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build self.iduces (uce ident -> uce) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
359 def make_lexitable(self, mineff, etoiles, gram = 0) :
364 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
365 etuces = [[] for et in etoiles]
366 for uci in self.ucis :
367 get = list(set(uci.etoiles).intersection(etoiles))
369 log.info('2 variables sur une ligne')
371 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
372 etuces = [set(val) for val in etuces]
375 deff = self.getlemuceseff(lem)
377 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
378 tab.insert(0, [''] + etoiles)
381 def make_tgen_table(self, tgen, etoiles, tot = None):
382 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
383 sets = [set(cl) for cl in lclasses]
384 totoccurrences = dict([[val, 0] for val in etoiles])
386 for forme in self.formes :
387 formeuceeff = self.getformeuceseff(forme)
388 for i, classe in enumerate(lclasses) :
389 concern = sets[i].intersection(formeuceeff.keys())
391 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
392 #tgenoccurrences = dict([[val, 0] for val in etoiles])
395 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
397 lemuceeff = self.getlemuceseff(lem)
398 for i, classe in enumerate(lclasses) :
399 concern = sets[i].intersection(lemuceeff.keys())
401 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
402 return tgenoccurrences, totoccurrences
404 def make_tgen_profile(self, tgen, ucecl, uci = False) :
405 log.info('tgen/classes')
407 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen]
409 tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen]
410 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
414 #while nam + `i` in tgen :
417 #last = [nam] + [`len(classe)` for classe in ucecl]
419 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
421 #with open(fileout, 'w') as f :
422 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
424 def make_efftype_from_etoiles(self, etoiles) :
426 etuces = [[] for et in etoiles]
427 for uci in self.ucis :
428 get = list(set(uci.etoiles).intersection(etoiles))
430 return '2 variables sur la meme ligne'
432 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
433 etuces = [set(val) for val in etuces]
434 for lem in self.lems :
435 deff = self.getlemuceseff(lem)
437 gram = self.lems[lem].gram
439 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
441 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
442 tabout = [[gram] + dtype[gram] for gram in dtype]
443 tabout.insert(0, [''] + etoiles)
446 def make_uceactsize(self, actives) :
447 res = self.getalluces()
450 deff = self.getlemuceseff(lem)
452 ucesize[uce] = ucesize.get(uce, 0) + 1
455 def make_uc(self, actives, lim1, lim2) :
456 uceactsize = self.make_uceactsize(actives)
462 for uce in [uce for uci in self.ucis for uce in uci.uces] :
463 if uce.para == lastpara :
465 last1 += uceactsize.get(uce.ident,0)
466 uc1[-1].append(uce.ident)
468 uc1.append([uce.ident])
471 last2 += uceactsize.get(uce.ident, 0)
472 uc2[-1].append(uce.ident)
474 uc2.append([uce.ident])
477 last1 = uceactsize.get(uce.ident, 0)
478 last2 = uceactsize.get(uce.ident, 0)
480 uc1.append([uce.ident])
481 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, then write their matrices and uce/uc tables.

    actives          -- passed to make_uc() and write_ucmatrix().
    sizeuc1, sizeuc2 -- the two size limits passed to make_uc().
    uc1out, uc2out   -- output paths handed to write_ucmatrix().
    listuce1out, listuce2out -- paths of the 'uce;uc' tables, which give,
                        for each uce, the index of the uc that holds it.

    Returns (number of uc in the first grouping, number in the second).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        # repr() is the portable spelling of the former backtick syntax.
        lines.extend(';'.join([repr(uce), repr(i)]) for i, ucl in enumerate(uc) for uce in ucl)
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
497 def write_ucmatrix(self, uc, actives, fileout) :
498 log.info('write uc matrix %s' % fileout)
499 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
502 with open(fileout + '~', 'w+') as f :
503 for i, lem in enumerate(actives) :
504 for uce in self.getlemuces(lem):
505 if (uces_uc[uce], i) not in deja_la :
507 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
508 deja_la[(uces_uc[uce], i)] = 0
510 with open(fileout, 'w') as ffin :
511 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
514 os.remove(fileout + '~')
517 def export_corpus(self, outf) :
518 #outf = 'export_corpus.txt'
520 res = self.getalluces()
524 with open(outf,'w') as f :
526 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
527 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
528 elif self.iduces[uce[0]].uci != actuci :
529 actuci = self.iduces[uce[0]].uci
530 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
531 actpara = self.iduces[uce[0]].para
532 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
535 actpara = self.iduces[uce[0]].para
536 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
537 elif self.iduces[uce[0]].para != actpara :
538 actpara = self.iduces[uce[0]].para
540 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
542 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
544 for i, lc in enumerate(self.lc) :
547 for uce in self.lc0 :
550 res = self.getalluces()
553 res = self.getallucis()
554 with open(outf, 'w') as f :
558 actuci = self.iduces[uce[0]].uci
562 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
564 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
566 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
567 f.write(etline.encode(self.parametres['syscoding']) + '\n')
568 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
570 def export_classe(self, outf, classe, lem = False, uci = False) :
571 sts = self.lc[classe - 1]
573 res = self.getconcorde(sts)
576 res = self.getuciconcorde(sts)
577 with open(outf, 'w') as f :
581 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
583 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
585 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
586 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
588 def export_owledge(self, rep, classe, lem = False, uci = False) :
589 sts = self.lc[classe - 1]
591 res = self.getconcorde(sts)
594 res = self.getuciconcorde(sts)
598 outf = '.'.join([`ident`, 'txt'])
599 outf = os.path.join(rep, outf)
601 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
602 with open(outf, 'w') as f :
603 f.write(guce.encode('cp1252', errors = 'replace'))
605 def export_tropes(self, fileout, classe, lem = False, uci = False) :
606 sts = self.lc[classe - 1]
608 res = self.getconcorde(sts)
611 res = self.getuciconcorde(sts)
612 with open(fileout, 'w') as f :
616 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
617 f.write(guce.encode('cp1252', errors = 'replace'))
620 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
621 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
623 with open(outfile + '~', 'w+') as f :
624 for i, lem in enumerate(actives) :
625 for uce in sorted(self.getlemuces(lem)) :
627 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
629 with open(outfile, 'w') as ffin :
630 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
633 os.remove(outfile + '~')
635 with open(listuce, 'w') as f :
636 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
638 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
639 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
641 with open(outfile + '~', 'w+') as f :
642 for i, lem in enumerate(actives) :
643 for uci in sorted(self.getlemucis(lem)) :
645 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
647 with open(outfile, 'w') as ffin :
648 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
651 os.remove(outfile + '~')
653 with open(listuci, 'w') as f :
654 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
656 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
657 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
659 duces = dict([[uce, i] for i, uce in enumerate(uces)])
660 with open(outfile + '~', 'w+') as f :
661 for i, lem in enumerate(actives) :
662 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
664 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
666 with open(outfile, 'w') as ffin :
667 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
670 os.remove(outfile + '~')
672 def make_table_with_classe(self, uces, list_act, uci = False) :
673 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
674 uces = dict([[uce, i] for i, uce in enumerate(uces)])
676 getlem = self.getlemucis
678 getlem = self.getlemuces
679 for i, lem in enumerate(list_act) :
680 lemuces = list(set(getlem(lem)).intersection(uces))
682 table_uce[uces[uce]][i] = 1
683 table_uce.insert(0, list_act)
686 def make_pondtable_with_classe(self, uces, list_act) :
687 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
688 uces = dict([[uce, i] for i, uce in enumerate(uces)])
689 for i, lem in enumerate(list_act) :
690 uceseff = self.getlemuceseff(lem)
691 lemuces = list(set(uceseff.keys()).intersection(uces))
693 table_uce[uces[uce]][i] = uceseff[uce]
694 table_uce.insert(0, list_act)
697 def parse_active(self, gramact, gramsup = None) :
698 log.info('parse actives')
699 for lem in self.lems :
700 if lem.startswith('_') and lem.endswith('_') :
701 self.lems[lem].act = 2
702 elif self.lems[lem].gram in gramact :
703 self.lems[lem].act = 1
704 elif gramsup is not None and self.lems[lem].gram not in gramact:
705 if self.lems[lem].gram in gramsup :
706 self.lems[lem].act = 2
708 self.lems[lem].act = 0
710 self.lems[lem].act = 2
712 def make_actives_limit(self, limit, key = 1) :
713 if self.idformes is None :
715 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
717 def make_actives_nb(self, nbmax, key) :
718 log.info('make_actives_nb : %i - %i' % (nbmax,key))
719 if self.idformes is None :
721 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
722 self.activenb = len(allactives)
723 allactives = sorted(allactives, reverse = True)
724 if self.activenb == 0 :
726 if len(allactives) <= nbmax :
727 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
728 return [val[1] for val in allactives], allactives[-1][0]
730 effs = [val[0] for val in allactives]
731 if effs.count(effs[nbmax - 1]) > 1 :
732 lim = effs[nbmax - 1] + 1
736 stop = effs.index(lim)
743 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
744 return [val[1] for val in allactives[0:stop + 1]], lim
746 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
747 log.info('formes/classes')
749 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
751 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
752 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
753 with open(fileout, 'w') as f :
754 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
756 def make_etoiles(self) :
758 for uci in self.ucis :
759 etoiles.update(uci.etoiles[1:])
762 def make_themes(self):
764 for uci in self.ucis :
765 themes.update(uci.paras)
768 def make_etoiles_dict(self) :
769 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
771 for etoile in etoiles :
772 et = etoile.split('_')
775 endet = '_'.join(et[1:])
776 if etoile in det[et[0]] :
777 det[et[0]][etoile] += 1
779 det[et[0]][etoile] = 1
784 endet = '_'.join(et[1:])
785 det[et[0]] = {etoile :1}
790 def make_theme_dict(self):
791 themes = [val for uci in self.ucis for val in uci.paras]
793 for theme in themes :
794 th = theme.split('_')
797 endth = '_'.join(th[1:])
798 if theme in det[th[0]] :
799 det[th[0]][theme] += 1
801 det[th[0]][theme] = 1
806 endth = '_'.join(th[1:])
807 det[th[0]] = {theme:1}
812 def make_etline(self, listet) :
813 etuces = [[] for et in listet]
814 for uci in self.ucis :
815 get = list(set(uci.etoiles).intersection(listet))
817 return '2 variables sur la meme ligne'
819 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
822 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
823 log.info('etoiles/classes')
825 etoileuces = self.getetoileuces()
827 etoileuces = self.getetoileucis()
828 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
829 with open(fileout, 'w') as f :
830 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
831 #etoiles = self.make_etoiles()
832 #with open(fileout, 'w') as f :
833 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
835 def make_colored_corpus(self, uci = False) :
837 for i, lc in enumerate(self.lc) :
840 for uce in self.lc0 :
842 color = ['black'] + colors[len(self.lc) - 1]
844 <meta http-equiv="content-Type" content="text/html; charset=%s" />
846 ''' % sys.getdefaultencoding()
848 res = self.getalluces()
853 if self.iduces[uce[0]].uci != actuci :
854 actuci = self.iduces[uce[0]].uci
855 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
856 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
858 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
860 res = self.getallucis()
863 if self.ucis[uce[0]].ident != actuci :
864 actuci = self.ucis[uce[0]].ident
865 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
866 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
868 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
869 return txt + '\n</body></html>'
871 def count_from_list(self, l, d) :
879 def count_from_list_cl(self, l, d, a, clnb) :
888 def find_segments(self, taille_segment, taille_limite) :
890 for uce in self.getalluces() :
892 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
893 l = [[d[val], val] for val in d if d[val] >= 3]
896 if len(l) > taille_limite :
897 l = l[-taille_limite:]
900 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
903 concorde = self.getconcorde
905 concorde = self.getuciconcorde
906 for uce in concorde(list_uce) :
908 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
909 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
912 if len(l) > taille_limite :
913 l = l[-taille_limite:]
916 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
918 for b, classe in enumerate(self.lc) :
919 for uce in self.getconcorde(classe) :
922 uce = [self.formes[forme].lem for forme in uce]
923 for taille_segment in range(lenmin,lenmax) :
924 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
925 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
926 with open(fileout, 'w') as f :
927 f.write('\n'.join([';'.join(line) for line in result]))
929 def make_proftype(self, outf) :
931 for lem in self.lems :
932 gram = self.lems[lem].gram
934 res[gram] = [0 for val in self.lc]
935 lemuceeff = self.getlemuceseff(lem)
936 for i, classe in enumerate(self.lc) :
937 concern = set(classe).intersection(lemuceeff.keys())
938 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
939 res = [[gram] + [`val` for val in res[gram]] for gram in res]
941 with open(outf, 'w') as f :
942 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
945 def make_ucecl_from_R(self, filein) :
946 with open(filein, 'rU') as f :
951 line = line.replace('\n', '').replace('"', '').split(';')
952 self.lc.append([int(line[0]) - 1, int(line[1])])
953 classesl = [val[1] for val in self.lc]
955 self.lc = sorted(self.lc, key=itemgetter(1))
956 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
957 self.lc0 = self.lc.pop(0)
960 def get_stat_by_cluster(self, outf, lclasses = None) :
961 log.info('get_stat_by_cluster')
962 if lclasses is None :
965 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
966 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
967 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
968 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
969 sets = [set(cl) for cl in lclasses]
970 for forme in self.formes :
971 formeuceeff = self.getformeuceseff(forme)
972 for i, classe in enumerate(lclasses) :
973 concern = sets[i].intersection(formeuceeff.keys())
975 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
977 if self.formes[forme].freq == 1 :
979 log.info('%f' % (time() - t1))
980 if outf is not None :
981 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
982 with open(outf, 'w') as f :
985 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
987 def get_stat_by_et(self, outf, etoiles) :
988 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
989 stats = self.get_stat_by_cluster(None, lclasses)
990 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
992 def gethapaxbyet(self, etoiles) :
993 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
995 for uce in hapaxuces :
996 if uce in hucesdict :
1000 etuces = [[] for et in etoiles]
1001 for uci in self.ucis :
1002 get = list(set(uci.etoiles).intersection(etoiles))
1004 return '2 variables sur la meme ligne'
1006 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1007 etuces = [set(val) for val in etuces]
1008 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
# Build an HTML report of the uces that contain the most hapax (lemmas whose
# frequency is 1), with the hapax highlighted in red.
# NOTE(review): several lines (initialisation of hucesdict, huces, txt, and
# the loops that define `uce`, `row` and `uceid`) are not visible here.
1010 def gethapaxuces(self) :
# Both comprehensions iterate self.lems with the same filter, so
# hapaxuces[i] is the uce taken for the lemma hapax[i].
1011 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1012 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
# hucesdict[uce] = [number of hapax in the uce, list of those hapax].
1014 for i,uce in enumerate(hapaxuces) :
1015 if uce in hucesdict :
1016 hucesdict[uce][0] += 1
1017 hucesdict[uce][1].append(hapax[i])
1019 hucesdict[uce] = [1,[hapax[i]]]
# huces groups the uces by their number of hapax.
1021 for uce in hucesdict :
1022 if hucesdict[uce][0] in huces :
1023 huces[hucesdict[uce][0]].append(uce)
1025 huces[hucesdict[uce][0]] = [uce]
# (count, uces) pairs, highest count first; calling .sort() on the result of
# zip() relies on zip() returning a list.
1026 huces = zip(huces, huces.values())
1027 huces.sort(reverse=True)
# Only the four highest hapax counts are reported.
1031 for nb in huces[0:4] :
1032 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1034 res = self.getconcorde([uce])
# Pad with spaces so that words at both ends can be matched as ' word '.
1036 ucetxt = ' ' + row[1] + ' '
1038 for hap in hucesdict[uce][1] :
# Surface form of the first form registered for this lemma.
1039 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1040 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1041 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1042 txt += '<p>'+ucetxt+'</p>\n'
# NOTE(review): the output path is hard-coded.
1046 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the dictionary of forms to *fileout*.

    One line per form, tab separated: form, lemma, grammatical type and
    frequency. Lines are ordered by decreasing frequency (ties follow the
    reverse ordering of the remaining fields) and the text is encoded with
    *syscoding* before being written.
    """
    records = []
    for forme, word in self.formes.items():
        records.append([word.freq, forme, word.lem, word.gram])
    records = sorted(records, reverse=True)
    # Move the frequency to the end of each row, as text.
    records = [record[1:] + [repr(record[0])] for record in records]
    with open(fileout, 'w') as out:
        lines = ['\t'.join(record) for record in records]
        out.write('\n'.join(lines).encode(syscoding))
# Write the lemmas to `fileout`, one line per lemma: the lemma followed by
# tab-separated (form, count) pairs, encoded with `syscoding`.
# NOTE(review): one statement between building listlem and writing it is not
# visible here.
1056 def export_lems(self, fileout, syscoding) :
# Presumably (re)builds self.idformes, which is indexed by form id below —
# TODO confirm.
1057 self.make_idformes()
# Each entry is [lemma, "form<TAB>count<TAB>form<TAB>count..."].
1058 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1060 with open(fileout, 'w') as f :
1061 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
# Constructor of a class whose header is not visible here. It collects basic
# figures from `corpus`; on the visible lines they are only kept in locals.
1066 def __init__(self, corpus) :
1067 ucinb = corpus.getucinb()
1068 ucisize = corpus.getucisize()
# Mean of the ucisize values per uci; raises ZeroDivisionError if ucinb is 0.
1069 ucimean = float(sum(ucisize))/float(ucinb)
1070 detoile = corpus.make_etoiles_dict()
# Constructor taking a uci id, its header line and an optional paragraph
# string (presumably class Uci, see Uci(iduci, line) calls — TODO confirm).
# Some assignments are not visible here.
1073 def __init__(self, iduci, line, paraset = None) :
# The whitespace-separated tokens of the header line are the etoiles.
1075 self.etoiles = line.split()
# When given, paraset is split on whitespace into the list of paragraph marks.
1077 if paraset is not None :
1078 self.paras = paraset.split()
# Constructor taking a uce id, a paragraph id and a uci id (presumably class
# Uce, see Uce(iduce, idpara, iduci) calls — TODO confirm). Body not visible.
1083 def __init__(self, iduce, idpara, iduci) :
# Constructor of a form (presumably class Word, see
# Word(word, gramtype, len(self.formes), lem) in add_word — TODO confirm).
# Most assignments are not visible here.
1089 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1092 self.gram = gramtype
# An explicit frequency is handled separately; the branch bodies are not visible.
1095 if freq is not None :
def __init__(self, parent, forme):
    """Create the lemma from its first form.

    The lemma starts with a single entry in ``self.formes`` (form id ->
    frequency) and takes its grammatical type, frequency and ``act`` flag
    from *forme*. *parent* is accepted but not used here.
    """
    first_id = forme.ident
    first_freq = forme.freq
    self.formes = dict([(first_id, first_freq)])
    self.gram = forme.gram
    self.freq = first_freq
    self.act = forme.act
def add_forme(self, forme):
    """Attach one more form to the lemma.

    Records the frequency of *forme* under its id in ``self.formes`` and
    adds that frequency to the lemma total.
    """
    occurrences = forme.freq
    self.formes[forme.ident] = occurrences
    self.freq += occurrences
# Look for the best place to cut `chaine` into a segment of about
# `longueurOptimale` items, never looking further than `longueur`.
# Returns (found, segment, rest); (False, chaine, '') when no cut was found.
# NOTE(review): the loop header and several branches are not visible here.
# English version of the original description below: start from the last
# character and walk back to the beginning of the string; if a '$' is found
# we are done; otherwise look for the best candidate, i.e. the separator
# with the highest weight/distance ratio.
1111 def decouperlist(chaine, longueur, longueurOptimale) :
1113 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1114 Si on trouve un '$', c'est fini.
1115 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
# Separator -> weight; a higher weight makes a better cut point.
1117 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1118 dsep = dict([[val[0],val[1]] for val in separateurs])
1119 trouve = False # whether a good separator was found
1120 iDecoupe = 0 # index of the item where the cut must happen
# Work on at most longueur + 1 leading items of the input.
1122 longueur = min(longueur, len(chaine) - 1)
1123 chaineTravail = chaine[:longueur + 1]
1125 meilleur = ['', 0, 0] # type, weight and position of the best separator
# A '$' inside the working window puts the cut just before it.
1128 indice = chaineTravail.index(u'$')
1130 iDecoupe = indice - 1
1135 caractere = chaineTravail[nbCar]
# Distances to the optimal length; the +1 keeps the divisions below defined.
1136 distance = abs(longueurOptimale - nbCar) + 1
1137 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1138 if caractere in dsep :
# Keep this separator when its weight/distance ratio beats the best so far.
1139 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1140 meilleur[0] = caractere
1141 meilleur[1] = dsep[caractere]
# Same comparison using the weight of a plain space.
1146 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1148 meilleur[1] = dsep[' ']
1155 #if meilleur[0] != ' ' :
1156 # fin = chaine[iDecoupe + 1:]
1157 # retour = chaineTravail[:iDecoupe]
# The item at iDecoupe stays in the returned segment; the rest starts after it.
1159 fin = chaine[iDecoupe + 1:]
1160 retour = chaineTravail[:iDecoupe + 1]
1161 return len(retour) > 0, retour, fin
1162 # if nothing was found
1163 return False, chaine, ''
1165 def testetoile(line) :
1166 return line.startswith(u'****')
# Body of a function whose def line is not visible here (presumably testint,
# which read_corpus can install as self.testuci — TODO confirm).
# True when the first (up to four) characters are all digits and the line
# contains a '*'.
1169 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split *txt* on whitespace and append the u'$' end marker."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
# Text preparation paired with decoupercharact in prep_makeuce, as
# prep_txtlist is paired with decouperlist. Body not visible here.
1174 def prep_txtcharact(txt) :
# Docstring text and constructor of the corpus builder (class header not
# visible here). The constructor stores its inputs, creates the Corpus and
# its output directory, and fills a few corpus parameters.
1179 Class for building a corpus
1181 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1182 log.info('begin building corpus...')
1183 self.lexique = lexique
1184 self.expressions = expressions
# The builder is passed as the Corpus parent; Corpus.add_word looks words up
# in self.parent.lexique.
1186 self.corpus = Corpus(self, parametres_corpus)
1187 self.infile = infile
# Threshold compared with self.last in treattxt.
1189 self.lim = parametres_corpus.get('lim', 1000000)
1190 self.encoding = parametres_corpus['encoding']
1191 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1192 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1193 self.corpus.parametres['uuid'] = str(uuid4())
1194 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1195 self.corpus.parametres['type'] = 'corpus'
# Tokens listed in ponctuation_espace are dropped when segments are built:
# only blanks when punctuation is kept, blanks and punctuation otherwise.
1196 if self.corpus.parametres['keep_ponct'] :
1197 self.ponctuation_espace = [' ', '']
1199 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1201 self.tolist = self.corpus.parametres.get('tolist', 0)
# Select the segmentation strategy from the 'ucemethod' parameter: a cutting
# function (self.decouper), the matching text preparation (self.prep_txt) and
# a default segment size. The conditions choosing between the two visible
# groups of assignments are not visible here.
1208 def prep_makeuce(self) :
1209 method = self.corpus.parametres.get('ucemethod', 0)
# List-based cutting: default size 40 unless 'ucesize' is set.
1211 self.decouper = decouperlist
1212 self.prep_txt = prep_txtlist
1213 self.ucesize = self.corpus.parametres.get('ucesize', 40)
# Character-based cutting: default size 240 unless 'ucesize' is set.
1215 self.decouper = decoupercharact
1216 self.prep_txt = prep_txtcharact
1217 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1218 log.info('method uce : %s' % method)
# Body of a method whose def line is not visible here (presumably the build
# entry point — TODO confirm). It reads the corpus, then records the path of
# the corpus file, the elapsed time and the saved options.
# NOTE(review): t1 is set on a line not visible here.
1223 self.read_corpus(self.infile)
# A Warning raised while reading is logged; what happens next is not visible.
1224 except Warning, args :
1225 log.info('pas kool %s' % args)
1229 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
# Elapsed build time in seconds, formatted later by dofinish.
1230 self.time = time() - t1
# Save the corpus parameters under the 'corpus' section of Corpus.cira.
1232 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1233 log.info('time : %f' % (time() - t1))
# Body of a method whose def line is not visible here. It opens the two
# SQLite databases used during the build and leaves a transaction open on
# each of them.
# formes.db: tables uces(id, uces) and eff(id, eff), filled by backup_uce.
1236 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1237 self.cf = self.conn_f.cursor()
1238 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1239 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1240 self.conn_f.commit()
1241 self.cf = self.conn_f.cursor()
# Settings trading durability for speed during the bulk inserts.
1242 self.cf.execute('PRAGMA temp_store=MEMORY;')
1243 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1244 self.cf.execute('PRAGMA synchronous = OFF;')
1245 self.cf.execute('begin')
# uces.db: table uces(id, uces) receiving the text of each segment.
1246 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1247 self.c = self.conn.cursor()
1248 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1250 self.c = self.conn.cursor()
1251 self.c.execute('PRAGMA temp_store=MEMORY;')
1252 self.c.execute('PRAGMA journal_mode=MEMORY;')
1253 self.c.execute('PRAGMA synchronous = OFF;')
1254 self.c.execute('begin')
# Body of a method whose def line is not visible here. It commits and indexes
# the forms database, then creates corpus.db, stores the corpus in it through
# backup_corpus(), indexes it and closes it. Some statements are not visible.
1257 #commit index and close db
1259 self.conn_f.commit()
1260 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1261 self.cf.execute('CREATE INDEX ideff ON eff (id);')
# corpus.db: etoiles (per text), luces (text/paragraph/segment ids), formes.
1265 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1266 self.ccorpus = self.conn_corpus.cursor()
1267 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1268 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1269 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1270 self.conn_corpus.commit()
1271 self.ccorpus = self.conn_corpus.cursor()
1272 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1273 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1274 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1275 self.ccorpus.execute('begin')
# All inserts happen inside the transaction opened above.
1276 self.backup_corpus()
# The index is created after the bulk insert.
1277 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1278 self.conn_corpus.commit()
1279 self.conn_corpus.close()
1280 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the cleaning steps enabled in the parameters.

    Steps are registered in a fixed order: lower-casing, first clean-up,
    character filtering, expressions, apostrophes, hyphens. Every option
    defaults to enabled except 'charact', which must be present in the
    parameters. When character filtering is on, ``self.rule`` is set from
    'keep_caract' (or its default).
    """
    options = self.corpus.parametres
    steps = self.cleans
    if options.get('lower', 1):
        steps.append(self.dolower)
    if options.get('firstclean', 1):
        steps.append(self.firstclean)
    if options['charact']:
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        steps.append(self.docharact)
    # The remaining steps share the same "enabled unless switched off" rule.
    for option, method_name in (('expressions', 'make_expression'),
                                ('apos', 'doapos'),
                                ('tiret', 'dotiret')):
        if options.get(option, 1):
            steps.append(getattr(self, method_name))
# Replace every known expression found in `txt` by the first element of its
# entry in self.expressions.
# NOTE(review): the statement following the loop (presumably the return) is
# not visible here.
1297 def make_expression(self,txt) :
1298 for expression in self.expressions:
# Cheap containment test before doing the replacement.
1299 if expression in txt :
1300 txt = txt.replace(expression, self.expressions[expression][0])
# Cleaning step registered by buildcleans when the 'lower' option is on.
# Body not visible here.
1303 def dolower(self, txt) :
def docharact(self, txt):
    """Replace each run of characters matched by ``self.rule`` with a space.

    ``self.rule`` is the inside of a regular-expression character class
    (buildcleans sets it, by default a negated class of kept characters).
    """
    #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = u''.join([u"[", self.rule, "]+"])
    return re.sub(pattern, ' ', txt)
def doapos(self, txt):
    """Return *txt* with every apostrophe replaced by a space."""
    apostrophe = u'\''
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return *txt* with every hyphen replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise typography and put spaces around punctuation marks.

    Curly apostrophes become straight ones, the 'œ' ligature becomes 'oe',
    and ellipses ('...' or '…') become the ' £$£ ' token. The substitutions
    are applied in order, so '...' is handled before the single '.'.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
# Run `txt` through the cleaning steps collected in self.cleans (filled by
# buildcleans). The loop body and the return are not visible here.
1322 def make_cleans(self, txt) :
1323 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory form/segment index to the forms database.

    For every form id in ``self.corpus.idformesuces`` one row goes to table
    ``uces`` (space-separated segment ids) and one to table ``eff``
    (space-separated counts, in the same order). The in-memory index is
    emptied afterwards. Nothing happens when the index is already empty.
    """
    pending = self.corpus.idformesuces
    if pending != {}:
        log.info('backup %i' % len(pending))
        touce = []
        toeff = []
        for forme in pending:
            segments = pending[forme]
            touce.append((repr(forme), ' '.join([repr(val) for val in segments.keys()])))
            toeff.append((repr(forme), ' '.join([repr(val) for val in segments.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
# Store the corpus structure in corpus.db through self.ccorpus: one `etoiles`
# row per text, one `luces` row per segment and one `formes` row per form.
# NOTE(review): `t` used by the last log call is set on a line not visible here.
1337 def backup_corpus(self) :
1338 log.info('start backup corpus')
1340 for uci in self.corpus.ucis :
# Etoiles and paragraph marks are stored as space-separated strings.
1341 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1342 for uce in uci.uces :
# Ids are passed as their repr() text.
1343 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1344 for forme in self.corpus.formes :
1345 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
# Elapsed time of the backup, in seconds.
1346 log.info('%f' % (time() - t))
def dofinish(self):
    """Record the end-of-build summary in the corpus parameters.

    Writes the keys 'date', 'time' (``self.time`` seconds formatted as
    hours/minutes/seconds), 'ucinb', 'ucenb', 'occurrences', 'formesnb' and
    'hapax' (count of hapax with its share of the forms and of the
    occurrences).

    A corpus without any form or occurrence no longer raises
    ZeroDivisionError: the corresponding percentage is reported as 0.
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    formesnb = len(corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = corpus.gethapaxnb()
    # Guard both ratios: an empty corpus has nothing to divide by.
    if formesnb:
        pourhapaxf = (float(hapaxnb) / formesnb) * 100
    else:
        pourhapaxf = 0.0
    if params['occurrences']:
        pourhapaxocc = (float(hapaxnb) / params['occurrences']) * 100
    else:
        pourhapaxocc = 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Builder that derives a sub-corpus from an existing corpus. The selection
# mode is given by the first true flag among 'frommeta', 'fromtheme',
# 'fromclusters' and 'fromuceids' in `parametres`.
# NOTE(review): several lines are not visible here, including the one that
# sets self.ori (presumably the source corpus — TODO confirm).
1362 class BuildSubCorpus(BuildCorpus):
1363 def __init__(self, corpus, parametres, dlg = None) :
1364 log.info('begin subcorpus...')
# The new corpus inherits the original path and encoding of the source corpus.
1368 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1370 self.parametres = parametres
1371 self.encoding = corpus.parametres['encoding']
1372 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1373 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1374 self.corpus.pathout.createdir(parametres['pathout'])
1375 self.corpus.parametres['pathout'] = parametres['pathout']
1376 self.corpus.parametres['meta'] = parametres.get('meta', False)
1377 self.corpus.parametres['uuid'] = str(uuid4())
# Selection by etoiles: keep the texts sharing at least one etoile with 'meta'.
1378 if parametres.get('frommeta', False) :
1379 print 'make subtexts'
1380 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
# Selection by paragraph marks: keep the segments of the selected paragraphs.
1381 elif parametres.get('fromtheme', False) :
1382 print 'make subtexts from theme'
1384 for uci in self.ori.ucis :
1385 if uci.paras != [] :
1388 for et in uci.paras :
1389 if et in parametres['meta'] :
# idpara is maintained on lines not visible here.
1390 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1396 nuci.paras = newpara
1397 self.corpus.ucis.append(nuci)
# Selection by clusters: flatten the segment ids of the chosen clusters
# (parametres['lc'][i]) into 'uceids'.
1400 elif parametres.get('fromclusters', False) :
1401 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
1403 elif parametres.get('fromuceids', False) :
# Fill the sub-corpus with copies of the segments whose ids are listed in
# self.parametres['uceids'], text by text. Several lines (creation of nuci,
# newpara and the paragraph bookkeeping) are not visible here.
1409 def fromuceids(self):
# Dict used only for fast membership tests on the ids to keep.
1411 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1413 for uci in self.ori.ucis :
# Texts without paragraph marks: filter the segments directly.
1414 if uci.paras == [] :
1415 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1418 nuci.uces = keepuces
1419 self.corpus.ucis.append(nuci)
# Texts with paragraph marks: handled mark by mark.
1424 for et in uci.paras :
1425 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1433 nuci.paras = newpara
1434 self.corpus.ucis.append(nuci)
# Renumber the texts, paragraphs and segments kept in the sub-corpus, then
# copy the segment texts and rebuild the forms from the source corpus.
# `infile` is not used on the visible lines. The counters (ident_uci,
# ident_para, ident_uce, lastpara) and newuceident are set on lines not
# visible here.
1436 def read_corpus(self, infile = None):
# Segment ids as they were in the source corpus, before renumbering.
1437 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1443 print 'redo text, para and st ident'
1444 for uci in self.corpus.ucis :
1445 uci.ident = ident_uci
1447 for uce in uci.uces :
# A change of paragraph id is detected against the previous segment.
1449 if uce.para != lastpara :
1452 uce.para = ident_para
1454 uce.para = ident_para
# Remember old id -> new id before overwriting the segment id.
1455 newuceident[uce.ident] = ident_uce
1456 uce.ident = ident_uce
1458 print 'backup st text and forms'
# row[0] is the old segment id, row[1] its text.
1459 for row in self.ori.getconcorde(self.olduceid) :
1460 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
# Re-register every word of the segment under its new segment id.
1461 for word in row[1].split() :
1462 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
# Builder reading a text file in which each text starts with a marker line
# (see self.testuci) and paragraphs may be introduced by lines starting with
# '-*'. Many lines (initialisation of txt, iduci, iduce, idpara, the
# try statement and several branches) are not visible here.
1466 class BuildFromAlceste(BuildCorpus) :
1467 def read_corpus(self, infile) :
1468 if self.dlg is not None :
1469 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' selects how a text header line is recognised.
1472 if self.corpus.parametres['ucimark'] == 0 :
1473 self.testuci = testetoile
1474 elif self.corpus.parametres['ucimark'] == 1 :
1475 self.testuci = testint
1481 with codecs.open(infile, 'r', self.encoding) as f :
1482 for linenb, line in enumerate(f) :
1483 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
# A header line starts a new text; pending lines belong to the previous one,
# hence iduci - 1 below.
1484 if self.testuci(line) :
1487 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1489 self.corpus.ucis.append(Uci(iduci, line))
# A text that produced no segment is logged and replaced by the new one.
1492 if self.corpus.ucis[-1].uces == [] :
1493 log.info(u'Empty text : %i' % linenb)
1495 self.corpus.ucis.pop()
1496 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog every 10 texts.
1497 if self.dlg is not None :
1498 if not (iduci + 1) % 10 :
1499 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph mark: flush the pending lines and record the mark on the text.
1500 elif line.startswith(u'-*') :
1503 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1506 self.corpus.ucis[-1].paras.append(line.split()[0])
1508 raise Exception('paragrapheOT %i' % linenb)
# Ordinary non-empty line, only meaningful once a text has been opened.
1509 elif line.strip() != '' and iduci != -1 :
# Flush what is still pending for the current text.
1511 if txt != [] and iduci != -1 :
1512 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1517 self.corpus.ucis.pop()
1518 log.info(Exception("Empty text %i" % linenb))
1520 raise Exception('EmptyText %i' % linenb)
1521 if iduci != -1 and iduce != -1:
1524 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1525 raise Exception('TextBeforeTextMark %i' % linenb)
# A decoding failure is reported as a generic 'CorpusEncoding' exception.
1526 except UnicodeDecodeError :
1527 raise Exception("CorpusEncoding")
# Clean the pending lines of a text, cut them into segments, register each
# segment and its words, and return the updated (iduce, idpara) counters.
# The loops over segments and words, and the counter updates, are on lines
# not visible here.
1529 def treattxt(self, txt, iduce, idpara, iduci) :
# Method 2 with segmentation enabled: one segment per input line. The lines
# are joined with a sentinel word, cleaned as a whole, then split back on it.
1530 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1531 txt = 'laphrasepoursplitter'.join(txt)
1532 txt = self.make_cleans(txt)
1533 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1534 ucetxt = txt.split('laphrasepoursplitter')
# Other cases: clean, then let make_uces do the cutting.
1537 txt = self.make_cleans(txt)
1538 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1539 if self.corpus.ucis[-1].paras == [] :
# Register the segment on the current text and store its text in uces.db.
1543 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1544 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1545 if not self.tolist :
1551 self.corpus.add_word(word)
1552 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# self.lim comes from the 'lim' parameter; what happens past it is not visible.
1553 if self.last > self.lim :
1556 return iduce, idpara
# Cut `txt` into segments with self.decouper and return them as strings with
# the tokens of self.ponctuation_espace removed. `keep_ponct` is not used on
# the visible lines; the loop and the branch on `douce` are not visible here.
1558 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1559 txt = ' '.join(txt.split())
# First cut: look up to 15 units past the target size self.ucesize.
1562 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1564 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Following cuts work on what the previous cut left over.
1567 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1568 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-segment result: the whole text, filtered the same way.
1573 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1575 #decouper (list_sep)
1576 #make_uces (decouper)
1577 #treat_txt (make_uces)
# Constructor of the interactive corpus builder (class header not visible
# here): loads the saved corpus options, shows the CorpusPref dialog and,
# when it is accepted, loads the lexicon and expressions for the chosen
# language. Some lines (e.g. the assignment of self.dlg and the default of
# `filein`) are not visible here.
1581 def __init__(self, parent, dlg = None) :
1582 self.parent = parent
# Options come from the 'corpus' section of the user's corpus.cfg.
1584 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1585 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
# Default corpus name: last component of the output directory.
1586 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1587 dial = CorpusPref(parent, parametres)
1588 dial.CenterOnParent()
1589 dial.txtpath.SetLabel(parent.filename)
1590 #dial.repout_choices.SetValue(parametres['pathout'])
1591 self.res = dial.ShowModal()
# 5100 is presumably the dialog's "OK" return code — TODO confirm.
1592 if self.res == 5100 :
1593 parametres = dial.doparametres()
1594 parametres['originalpath'] = parent.filename
1595 PathOut().createdir(parametres['pathout'])
# A user-supplied dictionary overrides the default lexicon file.
1596 if parametres.get('dictionary', False) :
1597 filein = parametres['dictionary']
1600 if dial.corpusname.GetValue() != '' :
1601 parametres['corpus_name'] = dial.corpusname.GetValue()
1603 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
# Expressions are loaded only when the language's "_exp" dictionary exists.
1604 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1605 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1607 self.parent.expressions = {}
1608 self.parametres = parametres
1611 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    The file name, lexicon and expressions are taken from ``self.parent``,
    the options from ``self.parametres``.
    """
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
# Constructor of the interactive sub-corpus builder (class header not visible
# here): prepares the list of selectable items, shows the selection dialog
# and stores the resulting parameters. Several lines (e.g. the assignments of
# self.ori and self.dlg, and the counter `i`) are not visible here.
# NOTE(review): `parametres` defaults to None but is indexed below; unless a
# line not visible here replaces it, callers must pass a dict.
1618 def __init__(self, parent, corpus, parametres = None, dlg = None):
1619 self.parent = parent
1622 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1623 if dlg is not None :
1624 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1626 parametres['corpus_name'] = corpus_name
# 'meta' holds the items offered for selection, depending on the mode.
1627 if parametres.get('frommeta', False) :
1628 parametres['meta'] = corpus.make_etoiles()
1629 elif parametres.get('fromtheme', False) :
1630 parametres['meta'] = corpus.make_themes()
# Clusters are labelled 'classe 1' .. 'classe <clnb>'.
1631 elif parametres.get('fromclusters', False) :
1632 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1634 parametres['meta'] = []
# Cluster labels keep their numeric order; other lists are sorted.
1635 if 'fromclusters' not in parametres :
1636 parametres['meta'].sort()
1637 if dlg is not None :
1639 dial = SubTextFromMetaDial(parent, parametres)
1640 self.res = dial.ShowModal()
# 5100 is presumably the dialog's "OK" return code — TODO confirm.
1641 if self.res == 5100 :
1642 if dial.subcorpusname.GetValue() != '' :
# Keep only alphanumeric characters and '_' in the user-supplied name.
1643 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1644 if corpus_name != '' :
1645 parametres['corpus_name'] = corpus_name
1647 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1648 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
# Find the first '<pathout>_<i>' that does not exist yet.
1650 while os.path.exists(pathout + '_%i' % i) :
1652 parametres['pathout'] = pathout + '_%i' % i
# Indices selected in the dialog's list box.
1653 meta = dial.m_listBox1.GetSelections()
# Outside cluster mode, indices are turned back into the selected labels;
# in cluster mode the indices themselves are kept.
1654 if not 'fromclusters' in parametres :
1655 parametres['meta'] = [parametres['meta'][val] for val in meta]
1657 parametres['meta'] = meta
1658 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus from ``self.ori`` with the stored parameters and return it."""
    subcorpus_builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return subcorpus_builder.corpus