1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
38 return Uce(uce.ident, uce.para, uce.uci)
42 nuci = Uci(uci.ident, '')
43 nuci.etoiles = copy(uci.etoiles)
44 nuci.uces = [CopyUce(uce) for uce in uci.uces]
53 def __init__(self, parent, parametres = {}, read = False) :
55 self.parametres = parametres
57 self.connformes = None
59 self.conncorpus = None
66 self.idformesuces = {}
71 self.pathout = PathOut(dirout = parametres['pathout'])
74 def add_word(self, word) :
75 if word in self.formes :
76 self.formes[word].freq += 1
77 if self.formes[word].ident in self.idformesuces :
78 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
79 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
81 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
85 if word in self.parent.lexique :
86 gramtype = self.parent.lexique[word][1]
87 lem = self.parent.lexique[word][0]
94 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
95 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
97 def add_word_from_forme(self, word, stident):
98 if word.forme in self.formes :
99 self.formes[word.forme].freq += 1
100 if self.formes[word.forme].ident in self.idformesuces :
101 if stident in self.idformesuces[self.formes[word.forme].ident] :
102 self.idformesuces[self.formes[word.forme].ident][stident] += 1
104 self.idformesuces[self.formes[word.forme].ident][stident] = 1
106 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
108 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
109 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
112 """connect corpus to db"""
113 if self.connformes is None :
114 log.info('connexion corpus')
115 self.connuces = sqlite3.connect(self.pathout['uces.db'])
116 self.cuces = self.connuces.cursor()
117 self.connformes = sqlite3.connect(self.pathout['formes.db'])
118 self.cformes = self.connformes.cursor()
119 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
120 self.ccorpus = self.conncorpus.cursor()
121 self.cformes.execute('PRAGMA temp_store=MEMORY;')
122 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
123 self.cformes.execute('PRAGMA synchronous = OFF;')
124 self.cuces.execute('PRAGMA temp_store=MEMORY;')
125 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
126 self.cuces.execute('PRAGMA synchronous = OFF;')
127 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
128 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
129 self.ccorpus.execute('PRAGMA synchronous = OFF;')
131 def read_corpus(self) :
132 log.info('read corpus')
133 self.parametres['syscoding'] = sys.getdefaultencoding()
134 if self.conncorpus is None :
136 res = self.ccorpus.execute('SELECT * FROM etoiles;')
138 self.ucis.append(Uci(row[0], row[1], row[2]))
139 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
141 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
142 res = self.ccorpus.execute('SELECT * FROM formes;')
143 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded in the formes database for one form.

    `wordid` is either a form identifier or the form itself as a string;
    a string is first resolved to its identifier through `self.formes`.
    Each matching row holds either a single integer or a
    whitespace-separated list of integers; all of them are returned in
    one flat list.
    """
    ident = self.formes[wordid].ident if isinstance(wordid, basestring) else wordid
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(ident),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(val) for val in cell.split()])
    return found
def getworducis(self, wordid):
    """Return the distinct uci values of the uces that contain a form.

    The uces come from `getworduces`; each one is resolved with
    `getucefromid` and its `uci` attribute is collected. The order of
    the returned list is that of a set, hence unspecified.
    """
    owners = []
    for uceid in self.getworduces(wordid):
        owners.append(self.getucefromid(uceid).uci)
    return list(set(owners))
156 def getformeuceseff(self, formeid) :
157 if isinstance(formeid, basestring) :
158 formeid = self.formes[formeid].ident
159 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
160 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
162 res = self.cformes.execute(query)
163 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
165 for i, uce in enumerate(uces) :
166 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids in which any form of `lem` occurs.

    The form identifiers are the keys of `self.lems[lem].formes`; their
    rows are read from the `uces` table of the formes database. A row
    holds either one integer or a whitespace-separated list of integers.
    Duplicates are removed through a set, so the order of the result is
    unspecified.
    """
    idlist = ', '.join(map(repr, self.lems[lem].formes))
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist)
    flat = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            flat.append(cell)
        else:
            flat.extend([int(val) for val in cell.split()])
    return list(set(flat))
def gettgenst(self, tgen):
    """Return the distinct uce ids in which any lemma of `tgen` occurs.

    `tgen` is an iterable of lemmas. Lemmas that are not keys of
    `self.lems` are ignored. The form identifiers of the remaining
    lemmas are looked up in the `uces` table of the formes database;
    each row holds either one integer or a whitespace-separated list of
    integers. Duplicates are removed through a set, so the order of the
    result is unspecified.
    """
    # The membership test must come before the inner loop: placed after
    # it, self.lems[lem] is evaluated first and an unknown lemma raises
    # KeyError instead of being skipped.
    ids = [repr(val) for lem in tgen if lem in self.lems for val in self.lems[lem].formes]
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % ', '.join(ids)
    flat = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            flat.append(cell)
        else:
            flat.extend([int(val) for val in cell.split()])
    return list(set(flat))
def getlemucis(self, lem):
    """Return the distinct uci values of the uces that contain `lem`.

    The uces come from `getlemuces`; each one is resolved with
    `getucefromid` and its `uci` attribute is collected. The order of
    the returned list is that of a set, hence unspecified.
    """
    owners = []
    for uceid in self.getlemuces(lem):
        owners.append(self.getucefromid(uceid).uci)
    return list(set(owners))
185 def getlemuceseff(self, lem, luces = None) :
186 formesid = ', '.join([`val` for val in self.lems[lem].formes])
187 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
188 res = self.cformes.execute(query)
189 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
190 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
191 res = self.cformes.execute(query)
192 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
194 for i, uce in enumerate(uces) :
195 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the uces of `self.lc[cluster]` that also contain `lem`."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored for `lem` in `self.lems`."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of `self.idformes` stored under `formeid`.

    `self.idformes` is built on first use by calling `make_idformes`.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attribute over every entry of `self.formes`."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return `gettotocc()` divided by `getucenb()` as a float."""
    occurrences = float(self.gettotocc())
    nbuces = self.getucenb()
    return occurrences / nbuces
218 return self.ucis[-1].uces[-1].ident + 1
221 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for every uci, the summed size of its uces.

    The per-uce sizes come from `getucesize` and are sliced from the
    ident of the uci's first uce to the ident of its last uce, so the
    idents are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    The rows come from `getalluces`; the text is read from column 1.
    """
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces):
    """Select the rows of the `uces` table whose id is listed in `uces`.

    Returns the result of `self.cuces.execute`, ordered by id.
    """
    idlist = ', '.join(map(repr, uces))
    query = 'select * from uces where id IN (%s) ORDER BY id;' % idlist
    return self.cuces.execute(query)
234 def getuciconcorde(self, ucis) :
235 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
236 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return `getconcorde` applied to the uces given by `getworduces(word)`."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return `getconcorde` applied to the uces given by `getlemuces(lem)`."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Select every row of the `uces` table, ordered by id.

    Returns the result of `self.cuces.execute`. The explicit ORDER BY
    matters because `getucisize` and `getallucis` use the uce ident as a
    position in the row sequence; without it SQL leaves the row order
    unspecified.
    """
    return self.cuces.execute('SELECT * FROM uces ORDER BY id')
def getallucis(self):
    """Return one `[uci ident, text]` pair per uci.

    The text is the newline-joined text of the uci's uces, taken from
    column 1 of the rows of `getalluces` at the position given by each
    uce ident.
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        merged = '\n'.join([texts[uce.ident] for uce in uci.uces])
        pairs.append([uci.ident, merged])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci whose `etoiles` contain `etoile`."""
    selected = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                selected.append(uce.ident)
    return selected
255 def getetoileuces(self) :
256 log.info('get uces etoiles')
259 for uci in self.ucis :
260 etoiles = uci.etoiles[1:]
262 if et in etoileuces :
263 etoileuces[et] += [uce.ident for uce in uci.uces]
265 etoileuces[et] = [uce.ident for uce in uci.uces]
267 for et in uci.paras :
268 if et in etoileuces :
269 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
271 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
277 def getetoileucis(self):
279 for uci in self.ucis :
280 etoiles = uci.etoiles[1:]
282 if et in etoileuces :
283 etoileuces[et] += [uci.ident]
285 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of `self.iduces` stored under `uceid`.

    `self.iduces` is built on first use by calling `make_iduces`.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of `self.formes` whose `freq` equals 1."""
    count = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of entries of `self.lems` whose `act` equals `key`."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
297 # def make_lems(self, lem = True) :
298 # log.info('make lems')
300 # for forme in self.formes :
301 # if self.formes[forme].lem in self.lems :
302 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
303 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
305 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the `etoiles` of the uci that owns the uce `uceid`.

    The uce ident -> uci ident mapping `self.uceuci` is built on first
    use; the uci ident is then used as an index into `self.ucis`.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
311 def make_lems(self, lem = True) :
312 log.info('make lems')
315 for forme in self.formes :
316 if self.formes[forme].lem in self.lems :
317 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
318 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
320 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
322 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
324 def make_lems_from_dict(self, dictionnaire, dolem = True) :
325 log.info('make lems from dict')
327 for forme in self.formes :
328 if self.formes[forme].forme in dictionnaire :
329 lem = dictionnaire[forme][0]
330 gram = dictionnaire[forme][1]
331 elif forme.isdigit() :
337 self.formes[forme].lem = lem
338 self.formes[forme].gram = gram
340 if self.formes[forme].lem in self.lems :
341 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
342 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
344 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
346 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build `self.idformes`, mapping each form's `ident` to its entry in `self.formes`."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build `self.iduces`, mapping each uce `ident` to its uce object.

    Nothing is done when `self.iduces` is already set.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
355 def make_lexitable(self, mineff, etoiles, gram = 0) :
360 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
361 etuces = [[] for et in etoiles]
362 for uci in self.ucis :
363 get = list(set(uci.etoiles).intersection(etoiles))
365 log.info('2 variables sur une ligne')
367 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
368 etuces = [set(val) for val in etuces]
371 deff = self.getlemuceseff(lem)
373 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
374 tab.insert(0, [''] + etoiles)
377 def make_tgen_table(self, tgen, etoiles, tot = None):
378 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
379 sets = [set(cl) for cl in lclasses]
380 totoccurrences = dict([[val, 0] for val in etoiles])
382 for forme in self.formes :
383 formeuceeff = self.getformeuceseff(forme)
384 for i, classe in enumerate(lclasses) :
385 concern = sets[i].intersection(formeuceeff.keys())
387 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
388 #tgenoccurrences = dict([[val, 0] for val in etoiles])
391 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
393 lemuceeff = self.getlemuceseff(lem)
394 for i, classe in enumerate(lclasses) :
395 concern = sets[i].intersection(lemuceeff.keys())
397 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
398 return tgenoccurrences, totoccurrences
400 def make_efftype_from_etoiles(self, etoiles) :
402 etuces = [[] for et in etoiles]
403 for uci in self.ucis :
404 get = list(set(uci.etoiles).intersection(etoiles))
406 return '2 variables sur la meme ligne'
408 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
409 etuces = [set(val) for val in etuces]
410 for lem in self.lems :
411 deff = self.getlemuceseff(lem)
413 gram = self.lems[lem].gram
415 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
417 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
418 tabout = [[gram] + dtype[gram] for gram in dtype]
419 tabout.insert(0, [''] + etoiles)
422 def make_uceactsize(self, actives) :
423 res = self.getalluces()
426 deff = self.getlemuceseff(lem)
428 ucesize[uce] = ucesize.get(uce, 0) + 1
431 def make_uc(self, actives, lim1, lim2) :
432 uceactsize = self.make_uceactsize(actives)
438 for uce in [uce for uci in self.ucis for uce in uci.uces] :
439 if uce.para == lastpara :
441 last1 += uceactsize.get(uce.ident,0)
442 uc1[-1].append(uce.ident)
444 uc1.append([uce.ident])
447 last2 += uceactsize.get(uce.ident, 0)
448 uc2[-1].append(uce.ident)
450 uc2.append([uce.ident])
453 last1 = uceactsize.get(uce.ident, 0)
454 last2 = uceactsize.get(uce.ident, 0)
456 uc1.append([uce.ident])
457 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc tables.

    The groupings come from `make_uc(actives, sizeuc1, sizeuc2)`; each
    is written with `write_ucmatrix` to `uc1out` / `uc2out`. A
    `uce;uc` table (header line, then one `uce;index` line per uce of
    each group) is written to `listuce1out` / `listuce2out`.

    Returns the number of groups in each grouping as a pair.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both tables are built before either file is opened.
    tables = []
    for groups in (uc1, uc2):
        table = [['uce', 'uc']]
        for ucindex, members in enumerate(groups):
            for uceid in members:
                table.append([repr(uceid), repr(ucindex)])
        tables.append(table)
    for path, table in zip((listuce1out, listuce2out), tables):
        with open(path, 'w') as f:
            f.write('\n'.join([';'.join(line) for line in table]))
    return len(uc1), len(uc2)
473 def write_ucmatrix(self, uc, actives, fileout) :
474 log.info('write uc matrix %s' % fileout)
475 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
478 with open(fileout + '~', 'w+') as f :
479 for i, lem in enumerate(actives) :
480 for uce in self.getlemuces(lem):
481 if (uces_uc[uce], i) not in deja_la :
483 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
484 deja_la[(uces_uc[uce], i)] = 0
486 with open(fileout, 'w') as ffin :
487 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
490 os.remove(fileout + '~')
493 def export_corpus(self, outf) :
494 #outf = 'export_corpus.txt'
496 res = self.getalluces()
500 with open(outf,'w') as f :
502 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
503 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
504 elif self.iduces[uce[0]].uci != actuci :
505 actuci = self.iduces[uce[0]].uci
506 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
507 actpara = self.iduces[uce[0]].para
508 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
511 actpara = self.iduces[uce[0]].para
512 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
513 elif self.iduces[uce[0]].para != actpara :
514 actpara = self.iduces[uce[0]].para
516 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
518 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
520 for i, lc in enumerate(self.lc) :
523 for uce in self.lc0 :
526 res = self.getalluces()
529 res = self.getallucis()
530 with open(outf, 'w') as f :
534 actuci = self.iduces[uce[0]].uci
538 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
540 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
542 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
543 f.write(etline.encode(self.parametres['syscoding']) + '\n')
544 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
546 def export_classe(self, outf, classe, lem = False, uci = False) :
547 sts = self.lc[classe - 1]
549 res = self.getconcorde(sts)
552 res = self.getuciconcorde(sts)
553 with open(outf, 'w') as f :
557 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
559 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
561 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
562 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
564 def export_owledge(self, rep, classe, lem = False, uci = False) :
565 sts = self.lc[classe - 1]
567 res = self.getconcorde(sts)
570 res = self.getuciconcorde(sts)
574 outf = '.'.join([`ident`, 'txt'])
575 outf = os.path.join(rep, outf)
577 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
578 with open(outf, 'w') as f :
579 f.write(guce.encode('cp1252', errors = 'replace'))
581 def export_tropes(self, fileout, classe, lem = False, uci = False) :
582 sts = self.lc[classe - 1]
584 res = self.getconcorde(sts)
587 res = self.getuciconcorde(sts)
588 with open(fileout, 'w') as f :
592 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
593 f.write(guce.encode('cp1252', errors = 'replace'))
596 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
597 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
599 with open(outfile + '~', 'w+') as f :
600 for i, lem in enumerate(actives) :
601 for uce in sorted(self.getlemuces(lem)) :
603 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
605 with open(outfile, 'w') as ffin :
606 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
609 os.remove(outfile + '~')
611 with open(listuce, 'w') as f :
612 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
614 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
615 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
617 with open(outfile + '~', 'w+') as f :
618 for i, lem in enumerate(actives) :
619 for uci in sorted(self.getlemucis(lem)) :
621 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
623 with open(outfile, 'w') as ffin :
624 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
627 os.remove(outfile + '~')
629 with open(listuci, 'w') as f :
630 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
632 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
633 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
635 duces = dict([[uce, i] for i, uce in enumerate(uces)])
636 with open(outfile + '~', 'w+') as f :
637 for i, lem in enumerate(actives) :
638 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
640 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
642 with open(outfile, 'w') as ffin :
643 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
646 os.remove(outfile + '~')
648 def make_table_with_classe(self, uces, list_act, uci = False) :
649 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
650 uces = dict([[uce, i] for i, uce in enumerate(uces)])
652 getlem = self.getlemucis
654 getlem = self.getlemuces
655 for i, lem in enumerate(list_act) :
656 lemuces = list(set(getlem(lem)).intersection(uces))
658 table_uce[uces[uce]][i] = 1
659 table_uce.insert(0, list_act)
662 def make_pondtable_with_classe(self, uces, list_act) :
663 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
664 uces = dict([[uce, i] for i, uce in enumerate(uces)])
665 for i, lem in enumerate(list_act) :
666 uceseff = self.getlemuceseff(lem)
667 lemuces = list(set(uceseff.keys()).intersection(uces))
669 table_uce[uces[uce]][i] = uceseff[uce]
670 table_uce.insert(0, list_act)
673 def parse_active(self, gramact, gramsup = None) :
674 log.info('parse actives')
675 for lem in self.lems :
676 if lem.startswith('_') and lem.endswith('_') :
677 self.lems[lem].act = 2
678 elif self.lems[lem].gram in gramact :
679 self.lems[lem].act = 1
680 elif gramsup is not None and self.lems[lem].gram not in gramact:
681 if self.lems[lem].gram in gramsup :
682 self.lems[lem].act = 2
684 self.lems[lem].act = 0
686 self.lems[lem].act = 2
688 def make_actives_limit(self, limit, key = 1) :
689 if self.idformes is None :
691 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
693 def make_actives_nb(self, nbmax, key) :
694 log.info('make_actives_nb : %i - %i' % (nbmax,key))
695 if self.idformes is None :
697 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
698 self.activenb = len(allactives)
699 allactives = sorted(allactives, reverse = True)
700 if self.activenb == 0 :
702 if len(allactives) <= nbmax :
703 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
704 return [val[1] for val in allactives], allactives[-1][0]
706 effs = [val[0] for val in allactives]
707 if effs.count(effs[nbmax - 1]) > 1 :
708 lim = effs[nbmax - 1] + 1
712 stop = effs.index(lim)
719 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
720 return [val[1] for val in allactives[0:stop + 1]], lim
722 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
723 log.info('formes/classes')
725 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
727 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
728 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
729 with open(fileout, 'w') as f :
730 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
732 def make_etoiles(self) :
734 for uci in self.ucis :
735 etoiles.update(uci.etoiles[1:])
738 def make_themes(self):
740 for uci in self.ucis :
741 themes.update(uci.paras)
744 def make_etoiles_dict(self) :
745 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
747 for etoile in etoiles :
748 et = etoile.split('_')
751 endet = '_'.join(et[1:])
752 if etoile in det[et[0]] :
753 det[et[0]][etoile] += 1
755 det[et[0]][etoile] = 1
760 endet = '_'.join(et[1:])
761 det[et[0]] = {etoile :1}
766 def make_theme_dict(self):
767 themes = [val for uci in self.ucis for val in uci.paras]
769 for theme in themes :
770 th = theme.split('_')
773 endth = '_'.join(th[1:])
774 if theme in det[th[0]] :
775 det[th[0]][theme] += 1
777 det[th[0]][theme] = 1
782 endth = '_'.join(th[1:])
783 det[th[0]] = {theme:1}
788 def make_etline(self, listet) :
789 etuces = [[] for et in listet]
790 for uci in self.ucis :
791 get = list(set(uci.etoiles).intersection(listet))
793 return '2 variables sur la meme ligne'
795 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
798 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
799 log.info('etoiles/classes')
801 etoileuces = self.getetoileuces()
803 etoileuces = self.getetoileucis()
804 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
805 with open(fileout, 'w') as f :
806 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
807 #etoiles = self.make_etoiles()
808 #with open(fileout, 'w') as f :
809 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
811 def make_colored_corpus(self, uci = False) :
813 for i, lc in enumerate(self.lc) :
816 for uce in self.lc0 :
818 color = ['black'] + colors[len(self.lc) - 1]
820 <meta http-equiv="content-Type" content="text/html; charset=%s" />
822 ''' % sys.getdefaultencoding()
824 res = self.getalluces()
829 if self.iduces[uce[0]].uci != actuci :
830 actuci = self.iduces[uce[0]].uci
831 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
832 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
834 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
836 res = self.getallucis()
839 if self.ucis[uce[0]].ident != actuci :
840 actuci = self.ucis[uce[0]].ident
841 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
842 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
844 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
845 return txt + '\n</body></html>'
847 def count_from_list(self, l, d) :
855 def count_from_list_cl(self, l, d, a, clnb) :
864 def find_segments(self, taille_segment, taille_limite) :
866 for uce in self.getalluces() :
868 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
869 l = [[d[val], val] for val in d if d[val] >= 3]
872 if len(l) > taille_limite :
873 l = l[-taille_limite:]
876 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
879 concorde = self.getconcorde
881 concorde = self.getuciconcorde
882 for uce in concorde(list_uce) :
884 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
885 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
888 if len(l) > taille_limite :
889 l = l[-taille_limite:]
892 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
894 for b, classe in enumerate(self.lc) :
895 for uce in self.getconcorde(classe) :
898 uce = [self.formes[forme].lem for forme in uce]
899 for taille_segment in range(lenmin,lenmax) :
900 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
901 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
902 with open(fileout, 'w') as f :
903 f.write('\n'.join([';'.join(line) for line in result]))
905 def make_proftype(self, outf) :
907 for lem in self.lems :
908 gram = self.lems[lem].gram
910 res[gram] = [0 for val in self.lc]
911 lemuceeff = self.getlemuceseff(lem)
912 for i, classe in enumerate(self.lc) :
913 concern = set(classe).intersection(lemuceeff.keys())
914 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
915 res = [[gram] + [`val` for val in res[gram]] for gram in res]
917 with open(outf, 'w') as f :
918 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
921 def make_ucecl_from_R(self, filein) :
922 with open(filein, 'rU') as f :
927 line = line.replace('\n', '').replace('"', '').split(';')
928 self.lc.append([int(line[0]) - 1, int(line[1])])
929 classesl = [val[1] for val in self.lc]
931 self.lc = sorted(self.lc, key=itemgetter(1))
932 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
933 self.lc0 = self.lc.pop(0)
936 def get_stat_by_cluster(self, outf, lclasses = None) :
937 log.info('get_stat_by_cluster')
938 if lclasses is None :
941 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
942 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
943 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
944 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
945 sets = [set(cl) for cl in lclasses]
946 for forme in self.formes :
947 formeuceeff = self.getformeuceseff(forme)
948 for i, classe in enumerate(lclasses) :
949 concern = sets[i].intersection(formeuceeff.keys())
951 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
953 if self.formes[forme].freq == 1 :
955 log.info('%f' % (time() - t1))
956 if outf is not None :
957 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
958 with open(outf, 'w') as f :
961 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Return the per-cluster statistics computed for each etoile.

    For every etoile the uces are collected with `getucesfrometoile`
    and handed to `get_stat_by_cluster` (called with no output file).
    Each resulting row is prefixed with its etoile.

    `outf` is accepted for interface compatibility but is not used by
    the code visible here.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    # The table used to be built and then dropped; hand it to the caller.
    return [[etoiles[i]] + val for i, val in enumerate(stats)]
968 def gethapaxbyet(self, etoiles) :
969 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
971 for uce in hapaxuces :
972 if uce in hucesdict :
976 etuces = [[] for et in etoiles]
977 for uci in self.ucis :
978 get = list(set(uci.etoiles).intersection(etoiles))
980 return '2 variables sur la meme ligne'
982 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
983 etuces = [set(val) for val in etuces]
984 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
986 def gethapaxuces(self) :
987 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
988 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
990 for i,uce in enumerate(hapaxuces) :
991 if uce in hucesdict :
992 hucesdict[uce][0] += 1
993 hucesdict[uce][1].append(hapax[i])
995 hucesdict[uce] = [1,[hapax[i]]]
997 for uce in hucesdict :
998 if hucesdict[uce][0] in huces :
999 huces[hucesdict[uce][0]].append(uce)
1001 huces[hucesdict[uce][0]] = [uce]
1002 huces = zip(huces, huces.values())
1003 huces.sort(reverse=True)
1007 for nb in huces[0:4] :
1008 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1010 res = self.getconcorde([uce])
1012 ucetxt = ' ' + row[1] + ' '
1014 for hap in hucesdict[uce][1] :
1015 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1016 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1017 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1018 txt += '<p>'+ucetxt+'</p>\n'
1022 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms of the corpus to `fileout`, most frequent first.

    One line per form, tab-separated: form, lem, gram, frequency. The
    text is encoded with `syscoding` before being written.
    """
    entries = []
    for forme in self.formes:
        word = self.formes[forme]
        entries.append([word.freq, forme, word.lem, word.gram])
    # Sorting the full [freq, forme, lem, gram] lists in reverse order.
    entries.sort(reverse = True)
    rows = [entry[1:] + [repr(entry[0])] for entry in entries]
    with open(fileout, 'w') as f:
        text = '\n'.join(['\t'.join(row) for row in rows])
        f.write(text.encode(syscoding))
1032 def export_lems(self, fileout, syscoding) :
1033 self.make_idformes()
1034 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
1036 with open(fileout, 'w') as f :
1037 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
1042 def __init__(self, corpus) :
1043 ucinb = corpus.getucinb()
1044 ucisize = corpus.getucisize()
1045 ucimean = float(sum(ucisize))/float(ucinb)
1046 detoile = corpus.make_etoiles_dict()
1049 def __init__(self, iduci, line, paraset = None) :
1051 self.etoiles = line.split()
1053 if paraset is not None :
1054 self.paras = paraset.split()
1059 def __init__(self, iduce, idpara, iduci) :
1065 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1068 self.gram = gramtype
1071 if freq is not None :
def __init__(self, parent, forme) :
    """Initialise the entry from its first form.

    parent -- accepted for the caller's convenience; not used here
    forme  -- object providing `ident`, `freq`, `gram` and `act`
    """
    # forms attached to this entry: form identifier -> form frequency
    self.formes = dict([(forme.ident, forme.freq)])
    self.gram, self.freq, self.act = forme.gram, forme.freq, forme.act
def add_forme(self, forme) :
    """Attach one more form to this entry.

    The form is recorded in self.formes under its identifier and its
    frequency is added to the running total self.freq.

    forme -- object providing `ident` and `freq`
    """
    count = forme.freq
    self.formes[forme.ident] = count
    self.freq += count
1087 def decouperlist(chaine, longueur, longueurOptimale) :
1089 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1090 Si on trouve un '$', c'est fini.
1091 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1093 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1094 dsep = dict([[val[0],val[1]] for val in separateurs])
1095 trouve = False # si on a trouvé un bon séparateur
1096 iDecoupe = 0 # indice du caractere ou il faut decouper
1098 longueur = min(longueur, len(chaine) - 1)
1099 chaineTravail = chaine[:longueur + 1]
1101 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1104 indice = chaineTravail.index(u'$')
1106 iDecoupe = indice - 1
1111 caractere = chaineTravail[nbCar]
1112 distance = abs(longueurOptimale - nbCar) + 1
1113 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1114 if caractere in dsep :
1115 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1116 meilleur[0] = caractere
1117 meilleur[1] = dsep[caractere]
1122 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1124 meilleur[1] = dsep[' ']
1131 #if meilleur[0] != ' ' :
1132 # fin = chaine[iDecoupe + 1:]
1133 # retour = chaineTravail[:iDecoupe]
1135 fin = chaine[iDecoupe + 1:]
1136 retour = chaineTravail[:iDecoupe + 1]
1137 return len(retour) > 0, retour, fin
1138 # si on a rien trouvé
1139 return False, chaine, ''
1141 def testetoile(line) :
1142 return line.startswith(u'****')
1145 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split `txt` on whitespace and append the u'$' end marker.

    Returns a new list of tokens whose last element is always u'$'.
    """
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1150 def prep_txtcharact(txt) :
1155 Class for building a corpus
1157 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1158 log.info('begin building corpus...')
1159 self.lexique = lexique
1160 self.expressions = expressions
1162 self.corpus = Corpus(self, parametres_corpus)
1163 self.infile = infile
1165 self.lim = parametres_corpus.get('lim', 1000000)
1166 self.encoding = parametres_corpus['encoding']
1167 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1168 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1169 self.corpus.parametres['uuid'] = str(uuid4())
1170 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1171 self.corpus.parametres['type'] = 'corpus'
1172 if self.corpus.parametres['keep_ponct'] :
1173 self.ponctuation_espace = [' ', '']
1175 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1177 self.tolist = self.corpus.parametres.get('tolist', 0)
1184 def prep_makeuce(self) :
1185 method = self.corpus.parametres.get('ucemethod', 0)
1187 self.decouper = decouperlist
1188 self.prep_txt = prep_txtlist
1189 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1191 self.decouper = decoupercharact
1192 self.prep_txt = prep_txtcharact
1193 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1194 log.info('method uce : %s' % method)
1199 self.read_corpus(self.infile)
1200 except Warning, args :
1201 log.info('pas kool %s' % args)
1205 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1206 self.time = time() - t1
1208 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1209 log.info('time : %f' % (time() - t1))
1212 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1213 self.cf = self.conn_f.cursor()
1214 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1215 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1216 self.conn_f.commit()
1217 self.cf = self.conn_f.cursor()
1218 self.cf.execute('PRAGMA temp_store=MEMORY;')
1219 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1220 self.cf.execute('PRAGMA synchronous = OFF;')
1221 self.cf.execute('begin')
1222 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1223 self.c = self.conn.cursor()
1224 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1226 self.c = self.conn.cursor()
1227 self.c.execute('PRAGMA temp_store=MEMORY;')
1228 self.c.execute('PRAGMA journal_mode=MEMORY;')
1229 self.c.execute('PRAGMA synchronous = OFF;')
1230 self.c.execute('begin')
1233 #commit index and close db
1235 self.conn_f.commit()
1236 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1237 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1241 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1242 self.ccorpus = self.conn_corpus.cursor()
1243 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1244 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1245 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1246 self.conn_corpus.commit()
1247 self.ccorpus = self.conn_corpus.cursor()
1248 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1249 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1250 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1251 self.ccorpus.execute('begin')
1252 self.backup_corpus()
1253 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1254 self.conn_corpus.commit()
1255 self.conn_corpus.close()
1256 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill self.cleans with the cleaning steps enabled in the parameters.

    Steps are appended in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, dashes.  Every option
    is read with a default of 1 (enabled) except 'charact', which must be
    present in the corpus parameters.  When character filtering is on,
    self.rule is set from 'keep_caract' (or the built-in default).
    """
    params = self.corpus.parametres
    steps = self.cleans
    if params.get('lower', 1) :
        steps.append(self.dolower)
    if params.get('firstclean', 1) :
        steps.append(self.firstclean)
    if params['charact'] :
        # content of the regex character class later used by docharact
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        steps.append(self.docharact)
    if params.get('expressions', 1) :
        steps.append(self.make_expression)
    if params.get('apos', 1) :
        steps.append(self.doapos)
    if params.get('tiret', 1) :
        steps.append(self.dotiret)
1273 def make_expression(self,txt) :
1274 for expression in self.expressions:
1275 if expression in txt :
1276 txt = txt.replace(expression, self.expressions[expression][0])
1279 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace each run of characters matched by self.rule with one space.

    self.rule is used as the content of a regex character class; when it
    starts with '^' the class is negated, so the characters listed after
    it are the ones that are kept.
    """
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return `txt` with every apostrophe (') replaced by a space."""
    apostrophe, blank = u'\'', u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt) :
    """Return `txt` with every hyphen (-) replaced by a space."""
    hyphen, blank = u'-', u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt) :
    """Normalise a few characters and isolate punctuation with spaces.

    The typographic apostrophe becomes a plain one, the oe ligature is
    expanded, an ellipsis (three dots or the single ellipsis character)
    becomes the ' £$£ ' token, and ? . ! , ; : are each surrounded by
    spaces.
    """
    # Applied strictly in this order: '...' has to be rewritten before the
    # single '.' rule sees it.
    substitutions = [
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    ]
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
1298 def make_cleans(self, txt) :
1299 for clean in self.cleans :
1303 def backup_uce(self) :
1304 if self.corpus.idformesuces != {} :
1305 log.info('backup %i' % len(self.corpus.idformesuces))
1306 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1307 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1308 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1309 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1310 self.corpus.idformesuces = {}
1313 def backup_corpus(self) :
1314 log.info('start backup corpus')
1316 for uci in self.corpus.ucis :
1317 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1318 for uce in uci.uces :
1319 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1320 for forme in self.corpus.formes :
1321 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1322 log.info('%f' % (time() - t))
def dofinish(self) :
    """Record the final build statistics in the corpus parameters.

    Stores, under self.corpus.parametres: the current date ('date'), the
    build duration taken from self.time ('time'), the counts returned by
    the corpus ('ucinb', 'ucenb', 'occurrences', 'formesnb') and a
    one-line hapax summary ('hapax').

    The duration is rounded to whole seconds before being split, so a
    value such as 119.7 is reported as '0h 2m 0s' rather than
    '0h 1m 60s'.  Hapax percentages fall back to 0 when the corpus has no
    forms or no occurrences instead of raising ZeroDivisionError.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()

    # round once, up front: formatting fractional seconds with '%.0f'
    # could otherwise display "60s"
    minutes, seconds = divmod(int(round(self.time)), 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    params['formesnb'] = formesnb

    hapaxnb = self.corpus.gethapaxnb()
    occurrences = params['occurrences']
    # guard both ratios against an empty corpus
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1338 class BuildSubCorpus(BuildCorpus):
1339 def __init__(self, corpus, parametres, dlg = None) :
1340 log.info('begin subcorpus...')
1344 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1346 self.parametres = parametres
1347 self.encoding = corpus.parametres['encoding']
1348 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1349 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1350 self.corpus.pathout.createdir(parametres['pathout'])
1351 self.corpus.parametres['pathout'] = parametres['pathout']
1352 self.corpus.parametres['meta'] = parametres.get('meta', False)
1353 self.corpus.parametres['uuid'] = str(uuid4())
1354 if parametres.get('frommeta', False) :
1355 print 'make subtexts'
1356 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1357 elif parametres.get('fromtheme', False) :
1358 print 'make subtexts from theme'
1360 for uci in self.ori.ucis :
1361 if uci.paras != [] :
1364 for et in uci.paras :
1365 if et in parametres['meta'] :
1366 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1372 nuci.paras = newpara
1373 self.corpus.ucis.append(nuci)
1376 elif parametres.get('fromclusters', False) :
1377 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
1379 elif parametres.get('fromuceids', False) :
1385 def fromuceids(self):
1387 dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
1389 for uci in self.ori.ucis :
1390 if uci.paras == [] :
1391 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1394 nuci.uces = keepuces
1395 self.corpus.ucis.append(nuci)
1400 for et in uci.paras :
1401 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1409 nuci.paras = newpara
1410 self.corpus.ucis.append(nuci)
1412 def read_corpus(self, infile = None):
1413 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1419 print 'redo text, para and st ident'
1420 for uci in self.corpus.ucis :
1421 uci.ident = ident_uci
1423 for uce in uci.uces :
1425 if uce.para != lastpara :
1428 uce.para = ident_para
1430 uce.para = ident_para
1431 newuceident[uce.ident] = ident_uce
1432 uce.ident = ident_uce
1434 print 'backup st text and forms'
1435 for row in self.ori.getconcorde(self.olduceid) :
1436 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1437 for word in row[1].split() :
1438 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1442 class BuildFromAlceste(BuildCorpus) :
1443 def read_corpus(self, infile) :
1444 if self.dlg is not None :
1445 self.dlg.Pulse('textes : 0 - segments : 0')
1448 if self.corpus.parametres['ucimark'] == 0 :
1449 self.testuci = testetoile
1450 elif self.corpus.parametres['ucimark'] == 1 :
1451 self.testuci = testint
1457 with codecs.open(infile, 'r', self.encoding) as f :
1458 for linenb, line in enumerate(f) :
1459 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1460 if self.testuci(line) :
1463 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1465 self.corpus.ucis.append(Uci(iduci, line))
1468 if self.corpus.ucis[-1].uces == [] :
1469 log.info(u'Empty text : %i' % linenb)
1471 self.corpus.ucis.pop()
1472 self.corpus.ucis.append(Uci(iduci, line))
1473 if self.dlg is not None :
1474 if not (iduci + 1) % 10 :
1475 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1476 elif line.startswith(u'-*') :
1479 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1482 self.corpus.ucis[-1].paras.append(line.split()[0])
1484 raise Exception('paragrapheOT %i' % linenb)
1485 elif line.strip() != '' and iduci != -1 :
1487 if txt != [] and iduci != -1 :
1488 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1493 self.corpus.ucis.pop()
1494 log.info(Exception("Empty text %i" % linenb))
1496 raise Exception('EmptyText %i' % linenb)
1497 if iduci != -1 and iduce != -1:
1500 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1501 raise Exception('TextBeforeTextMark %i' % linenb)
1502 except UnicodeDecodeError :
1503 raise Exception("CorpusEncoding")
1505 def treattxt(self, txt, iduce, idpara, iduci) :
1506 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1507 txt = 'laphrasepoursplitter'.join(txt)
1508 txt = self.make_cleans(txt)
1509 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1510 ucetxt = txt.split('laphrasepoursplitter')
1513 txt = self.make_cleans(txt)
1514 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1515 if self.corpus.ucis[-1].paras == [] :
1519 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1520 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1521 if not self.tolist :
1527 self.corpus.add_word(word)
1528 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1529 if self.last > self.lim :
1532 return iduce, idpara
1534 def make_uces(self, txt, douce = True, keep_ponct = False) :
1535 txt = ' '.join(txt.split())
1538 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1540 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1543 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1544 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1549 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1551 #decouper (list_sep)
1552 #make_uces (decouper)
1553 #treat_txt (make_uces)
1557 def __init__(self, parent, dlg = None) :
1558 self.parent = parent
1560 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1561 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1562 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1563 dial = CorpusPref(parent, parametres)
1564 dial.CenterOnParent()
1565 dial.txtpath.SetLabel(parent.filename)
1566 #dial.repout_choices.SetValue(parametres['pathout'])
1567 self.res = dial.ShowModal()
1568 if self.res == 5100 :
1569 parametres = dial.doparametres()
1570 parametres['originalpath'] = parent.filename
1571 PathOut().createdir(parametres['pathout'])
1572 if parametres.get('dictionary', False) :
1573 filein = parametres['dictionary']
1577 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
1578 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1579 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1581 self.parent.expressions = {}
1582 self.parametres = parametres
1585 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus from the parent's file and return it.

    A BuildFromAlceste is created from the parent's file name, the stored
    parameters, the parent's lexique and expressions, and the progress
    dialog; its `corpus` attribute is returned.
    """
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
1592 def __init__(self, parent, corpus, parametres = None, dlg = None):
1593 self.parent = parent
1596 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1597 if dlg is not None :
1598 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1600 parametres['corpus_name'] = corpus_name
1601 if parametres.get('frommeta', False) :
1602 parametres['meta'] = corpus.make_etoiles()
1603 elif parametres.get('fromtheme', False) :
1604 parametres['meta'] = corpus.make_themes()
1605 elif parametres.get('fromclusters', False) :
1606 parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
1608 parametres['meta'] = []
1609 if 'fromclusters' not in parametres :
1610 parametres['meta'].sort()
1611 if dlg is not None :
1613 dial = SubTextFromMetaDial(parent, parametres)
1614 self.res = dial.ShowModal()
1615 if self.res == 5100 :
1616 if dial.subcorpusname.GetValue() != '' :
1617 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1618 if corpus_name != '' :
1619 parametres['corpus_name'] = corpus_name
1621 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1622 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1624 while os.path.exists(pathout + '_%i' % i) :
1626 parametres['pathout'] = pathout + '_%i' % i
1627 meta = dial.m_listBox1.GetSelections()
1628 if not 'fromclusters' in parametres :
1629 parametres['meta'] = [parametres['meta'][val] for val in meta]
1631 parametres['meta'] = meta
1632 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus from the original corpus and return it.

    A BuildSubCorpus is created from self.ori with the stored parameters
    and the progress dialog; its `corpus` attribute is returned.
    """
    builder = BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg)
    return builder.corpus