1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref, SubTextFromMetaDial
20 from colors import colors
24 log = logging.getLogger('iramuteq.corpus')
27 def copycorpus(corpus) :
28 log.info('copy corpus')
29 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
30 copy_corpus.ucis = corpus.ucis
31 copy_corpus.formes = corpus.formes
32 copy_corpus.pathout = corpus.pathout
33 copy_corpus.conn_all()
42 def __init__(self, parent, parametres = {}, read = False) :
44 self.parametres = parametres
46 self.connformes = None
48 self.conncorpus = None
55 self.idformesuces = {}
60 self.pathout = PathOut(dirout = parametres['pathout'])
63 def add_word(self, word) :
64 if word in self.formes :
65 self.formes[word].freq += 1
66 if self.formes[word].ident in self.idformesuces :
67 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
68 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
72 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
74 if word in self.parent.lexique :
75 gramtype = self.parent.lexique[word][1]
76 lem = self.parent.lexique[word][0]
83 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
84 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
86 def add_word_from_forme(self, word, stident):
87 if word.forme in self.formes :
88 self.formes[word.forme].freq += 1
89 if self.formes[word.forme].ident in self.idformesuces :
90 if stident in self.idformesuces[self.formes[word.forme].ident] :
91 self.idformesuces[self.formes[word.forme].ident][stident] += 1
93 self.idformesuces[self.formes[word.forme].ident][stident] = 1
95 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
97 self.formes[word.forme] = word
98 self.formes[word.forme].ident = len(self.formes)
99 self.formes[word.forme].freq = 1
100 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
103 """connect corpus to db"""
104 if self.connformes is None :
105 log.info('connexion corpus')
106 self.connuces = sqlite3.connect(self.pathout['uces.db'])
107 self.cuces = self.connuces.cursor()
108 self.connformes = sqlite3.connect(self.pathout['formes.db'])
109 self.cformes = self.connformes.cursor()
110 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
111 self.ccorpus = self.conncorpus.cursor()
112 self.cformes.execute('PRAGMA temp_store=MEMORY;')
113 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
114 self.cformes.execute('PRAGMA synchronous = OFF;')
115 self.cuces.execute('PRAGMA temp_store=MEMORY;')
116 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
117 self.cuces.execute('PRAGMA synchronous = OFF;')
118 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
119 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
120 self.ccorpus.execute('PRAGMA synchronous = OFF;')
122 def read_corpus(self) :
123 log.info('read corpus')
124 self.parametres['syscoding'] = sys.getdefaultencoding()
125 if self.conncorpus is None :
127 res = self.ccorpus.execute('SELECT * FROM etoiles;')
129 self.ucis.append(Uci(row[0], row[1], row[2]))
130 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
132 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
133 res = self.ccorpus.execute('SELECT * FROM formes;')
134 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded for one form.

    ``wordid`` may be the numeric identifier of the form or the form
    itself as a string; a string is resolved to its identifier through
    ``self.formes``.  The first column of every matching row is either an
    integer or a string of whitespace-separated integers; both shapes are
    flattened into a single list of ints.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.append(cell)
        else:
            found.extend([int(token) for token in cell.split()])
    return found
def getworducis(self, wordid):
    """Return the distinct uci values of the uces that contain a form.

    The uces are obtained from ``getworduces``; each one is resolved with
    ``getucefromid`` and its ``uci`` attribute is collected.  The result
    is a list without duplicates, in no particular order.
    """
    uce_ids = self.getworduces(wordid)
    return list({self.getucefromid(uce_id).uci for uce_id in uce_ids})
147 def getformeuceseff(self, formeid) :
148 if isinstance(formeid, basestring) :
149 formeid = self.formes[formeid].ident
150 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
151 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
152 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
153 res = self.cformes.execute(query)
154 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
156 for i, uce in enumerate(uces) :
157 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids in which a lemma occurs.

    The identifiers of all forms attached to ``self.lems[lem]`` are
    interpolated into an ``IN (...)`` clause.  The first column of each
    row is either an integer or a string of whitespace-separated
    integers; everything is flattened and de-duplicated.  The order of
    the returned list is unspecified.
    """
    formesid = ', '.join([repr(form_id) for form_id in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    flat = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            flat.append(cell)
        else:
            flat.extend([int(token) for token in cell.split()])
    return list(set(flat))
def getlemucis(self, lem):
    """Return the distinct uci values of the uces in which a lemma occurs.

    Each uce id returned by ``getlemuces`` is resolved with
    ``getucefromid`` and its ``uci`` attribute is collected; duplicates
    are removed and the order of the result is unspecified.
    """
    uce_ids = self.getlemuces(lem)
    return list({self.getucefromid(uce_id).uci for uce_id in uce_ids})
170 def getlemuceseff(self, lem, luces = None) :
171 formesid = ', '.join([`val` for val in self.lems[lem].formes])
172 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
173 res = self.cformes.execute(query)
174 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
175 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
176 res = self.cformes.execute(query)
177 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
179 for i, uce in enumerate(uces) :
180 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many entries of ``self.lc[cluster]`` contain the lemma.

    The count is the size of the intersection between the ids stored in
    ``self.lc[cluster]`` and the ids returned by ``getlemuces(lem)``.
    """
    cluster_members = set(self.lc[cluster])
    shared = cluster_members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute stored for ``lem`` in ``self.lems``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` stored under ``formeid``.

    The ``self.idformes`` index is built on first use by calling
    ``make_idformes`` when it is still ``None``.
    """
    index_missing = self.idformes is None
    if index_missing:
        self.make_idformes()
    return self.idformes[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attribute over every entry of ``self.formes``."""
    return sum(word.freq for word in self.formes.values())
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float.

    Raises ``ZeroDivisionError`` when ``getucenb()`` returns 0.
    """
    total = float(self.gettotocc())
    return total / self.getucenb()
203 return self.ucis[-1].uces[-1].ident + 1
206 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one summed size per uci.

    The list from ``getucesize`` is indexed by uce ``ident``: for each uci
    the sizes from its first uce's ident up to and including its last
    uce's ident are added together.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return, for each row of ``getalluces``, the number of whitespace-separated tokens in its second column."""
    return [len(row[1].split()) for row in self.getalluces()]
def getconcorde(self, uces):
    """Run a query on ``self.cuces`` selecting the rows of table ``uces`` whose id is in ``uces``.

    The ids are rendered with ``repr`` and interpolated into the
    ``IN (...)`` clause; the value returned by ``execute`` is handed back
    unchanged.
    """
    id_list = ', '.join([repr(uce_id) for uce_id in uces])
    return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % id_list)
219 def getuciconcorde(self, ucis) :
220 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
221 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uce ids given by ``getworduces(word)``."""
    uce_ids = self.getworduces(word)
    return self.getconcorde(uce_ids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uce ids given by ``getlemuces(lem)``."""
    uce_ids = self.getlemuces(lem)
    return self.getconcorde(uce_ids)
def getalluces(self):
    """Execute ``SELECT * FROM uces`` on ``self.cuces`` and return the result of ``execute``."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getallucis(self):
    """Return ``[uci.ident, text]`` pairs, one per uci.

    The second column of every row of ``getalluces`` is collected into a
    list that is then indexed by uce ``ident``; the text of a uci is the
    newline-joined text of its uces.
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        joined = '\n'.join([texts[uce.ident] for uce in uci.uces])
        pairs.append([uci.ident, joined])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the idents of every uce belonging to a uci whose ``etoiles`` contain ``etoile``."""
    matching = []
    for uci in self.ucis:
        if etoile not in uci.etoiles:
            continue
        matching.extend([uce.ident for uce in uci.uces])
    return matching
240 def getetoileuces(self) :
241 log.info('get uces etoiles')
244 for uci in self.ucis :
245 etoiles = uci.etoiles[1:]
247 if et in etoileuces :
248 etoileuces[et] += [uce.ident for uce in uci.uces]
250 etoileuces[et] = [uce.ident for uce in uci.uces]
252 for et in uci.paras :
253 if et in etoileuces :
254 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
256 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
262 def getetoileucis(self):
264 for uci in self.ucis :
265 etoiles = uci.etoiles[1:]
267 if et in etoileuces :
268 etoileuces[et] += [uci.ident]
270 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` stored under ``uceid``.

    The ``self.iduces`` index is built on first use by calling
    ``make_iduces`` when it is still ``None``.
    """
    index_missing = self.iduces is None
    if index_missing:
        self.make_iduces()
    return self.iduces[uceid]
def gethapaxnb(self):
    """Return the number of entries of ``self.formes`` whose ``freq`` equals 1."""
    return sum(1 for word in self.formes.values() if word.freq == 1)
def getactivesnb(self, key):
    """Return the number of entries of ``self.lems`` whose ``act`` attribute equals ``key``."""
    return sum(1 for entry in self.lems.values() if entry.act == key)
282 # def make_lems(self, lem = True) :
283 # log.info('make lems')
285 # for forme in self.formes :
286 # if self.formes[forme].lem in self.lems :
287 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
288 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
290 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce ``uceid``.

    A uce-ident -> uci-ident mapping is cached in ``self.uceuci`` the
    first time it is needed.  The uci ident is then used as a position in
    ``self.ucis``.
    """
    if self.uceuci is None:
        self.uceuci = {uce.ident: uci.ident for uci in self.ucis for uce in uci.uces}
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
296 def make_lems(self, lem = True) :
297 log.info('make lems')
300 for forme in self.formes :
301 if self.formes[forme].lem in self.lems :
302 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
303 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
305 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
307 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
309 def make_lems_from_dict(self, dictionnaire, dolem = True) :
310 log.info('make lems from dict')
312 for forme in self.formes :
313 if self.formes[forme].forme in dictionnaire :
314 lem = dictionnaire[forme][0]
315 gram = dictionnaire[forme][1]
316 elif forme.isdigit() :
322 self.formes[forme].lem = lem
323 self.formes[forme].gram = gram
325 if self.formes[forme].lem in self.lems :
326 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
327 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
329 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
331 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to the form object held in ``self.formes``."""
    self.idformes = {word.ident: word for word in self.formes.values()}
def make_iduces(self):
    """Build ``self.iduces`` (uce ident -> uce object) unless it already exists."""
    if self.iduces is not None:
        return
    self.iduces = {uce.ident: uce for uci in self.ucis for uce in uci.uces}
340 def make_lexitable(self, mineff, etoiles, gram = 0) :
345 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
346 etuces = [[] for et in etoiles]
347 for uci in self.ucis :
348 get = list(set(uci.etoiles).intersection(etoiles))
350 log.info('2 variables sur une ligne')
352 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
353 etuces = [set(val) for val in etuces]
356 deff = self.getlemuceseff(lem)
358 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
359 tab.insert(0, [''] + etoiles)
362 def make_tgen_table(self, tgen, etoiles, tot = None):
363 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
364 sets = [set(cl) for cl in lclasses]
365 totoccurrences = dict([[val, 0] for val in etoiles])
367 for forme in self.formes :
368 formeuceeff = self.getformeuceseff(forme)
369 for i, classe in enumerate(lclasses) :
370 concern = sets[i].intersection(formeuceeff.keys())
372 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
373 #tgenoccurrences = dict([[val, 0] for val in etoiles])
376 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
378 lemuceeff = self.getlemuceseff(lem)
379 for i, classe in enumerate(lclasses) :
380 concern = sets[i].intersection(lemuceeff.keys())
382 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
383 return tgenoccurrences, totoccurrences
385 def make_efftype_from_etoiles(self, etoiles) :
387 etuces = [[] for et in etoiles]
388 for uci in self.ucis :
389 get = list(set(uci.etoiles).intersection(etoiles))
391 return '2 variables sur la meme ligne'
393 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
394 etuces = [set(val) for val in etuces]
395 for lem in self.lems :
396 deff = self.getlemuceseff(lem)
398 gram = self.lems[lem].gram
400 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
402 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
403 tabout = [[gram] + dtype[gram] for gram in dtype]
404 tabout.insert(0, [''] + etoiles)
407 def make_uceactsize(self, actives) :
408 res = self.getalluces()
411 deff = self.getlemuceseff(lem)
413 ucesize[uce] = ucesize.get(uce, 0) + 1
416 def make_uc(self, actives, lim1, lim2) :
417 uceactsize = self.make_uceactsize(actives)
423 for uce in [uce for uci in self.ucis for uce in uci.uces] :
424 if uce.para == lastpara :
426 last1 += uceactsize.get(uce.ident,0)
427 uc1[-1].append(uce.ident)
429 uc1.append([uce.ident])
432 last2 += uceactsize.get(uce.ident, 0)
433 uc2[-1].append(uce.ident)
435 uc2.append([uce.ident])
438 last1 = uceactsize.get(uce.ident, 0)
439 last2 = uceactsize.get(uce.ident, 0)
441 uc1.append([uce.ident])
442 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and membership lists.

    ``make_uc`` produces the two groupings (each a list of lists of uce
    ids).  Each grouping is written with ``write_ucmatrix`` to ``uc1out``
    / ``uc2out``.  A ``uce;uc`` table (header line, then one line per uce
    with the index of its group) is written to ``listuce1out`` /
    ``listuce2out``.

    Returns the number of groups in each grouping as a 2-tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    nb_uc1, nb_uc2 = len(uc1), len(uc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (nb_uc1, nb_uc2))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both membership tables are built before any list file is opened.
    tables = []
    for groups in (uc1, uc2):
        rows = [['uce', 'uc']]
        for group_index, members in enumerate(groups):
            for uce in members:
                rows.append([repr(uce), repr(group_index)])
        tables.append(rows)
    for rows, path in zip(tables, (listuce1out, listuce2out)):
        with open(path, 'w') as f:
            f.write('\n'.join([';'.join(row) for row in rows]))
    return nb_uc1, nb_uc2
458 def write_ucmatrix(self, uc, actives, fileout) :
459 log.info('write uc matrix %s' % fileout)
460 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
463 with open(fileout + '~', 'w+') as f :
464 for i, lem in enumerate(actives) :
465 for uce in self.getlemuces(lem):
466 if (uces_uc[uce], i) not in deja_la :
468 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
469 deja_la[(uces_uc[uce], i)] = 0
471 with open(fileout, 'w') as ffin :
472 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
475 os.remove(fileout + '~')
478 def export_corpus(self, outf) :
479 #outf = 'export_corpus.txt'
481 res = self.getalluces()
485 with open(outf,'w') as f :
487 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
488 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
489 elif self.iduces[uce[0]].uci != actuci :
490 actuci = self.iduces[uce[0]].uci
491 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
492 actpara = self.iduces[uce[0]].para
493 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
496 actpara = self.iduces[uce[0]].para
497 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
498 elif self.iduces[uce[0]].para != actpara :
499 actpara = self.iduces[uce[0]].para
501 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
503 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
505 for i, lc in enumerate(self.lc) :
508 for uce in self.lc0 :
511 res = self.getalluces()
514 res = self.getallucis()
515 with open(outf, 'w') as f :
519 actuci = self.iduces[uce[0]].uci
523 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
525 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
527 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
528 f.write(etline.encode(self.parametres['syscoding']) + '\n')
529 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
531 def export_classe(self, outf, classe, lem = False, uci = False) :
532 sts = self.lc[classe - 1]
534 res = self.getconcorde(sts)
537 res = self.getuciconcorde(sts)
538 with open(outf, 'w') as f :
542 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
544 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
546 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
547 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
549 def export_owledge(self, rep, classe, lem = False, uci = False) :
550 sts = self.lc[classe - 1]
552 res = self.getconcorde(sts)
555 res = self.getuciconcorde(sts)
559 outf = '.'.join([`ident`, 'txt'])
560 outf = os.path.join(rep, outf)
562 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
563 with open(outf, 'w') as f :
564 f.write(guce.encode('cp1252', errors = 'replace'))
566 def export_tropes(self, fileout, classe, lem = False, uci = False) :
567 sts = self.lc[classe - 1]
569 res = self.getconcorde(sts)
572 res = self.getuciconcorde(sts)
573 with open(fileout, 'w') as f :
577 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
578 f.write(guce.encode('cp1252', errors = 'replace'))
581 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
582 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
584 with open(outfile + '~', 'w+') as f :
585 for i, lem in enumerate(actives) :
586 for uce in sorted(self.getlemuces(lem)) :
588 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
590 with open(outfile, 'w') as ffin :
591 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
594 os.remove(outfile + '~')
596 with open(listuce, 'w') as f :
597 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
599 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
600 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
602 with open(outfile + '~', 'w+') as f :
603 for i, lem in enumerate(actives) :
604 for uci in sorted(self.getlemucis(lem)) :
606 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
608 with open(outfile, 'w') as ffin :
609 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
612 os.remove(outfile + '~')
614 with open(listuci, 'w') as f :
615 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
617 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
618 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
620 duces = dict([[uce, i] for i, uce in enumerate(uces)])
621 with open(outfile + '~', 'w+') as f :
622 for i, lem in enumerate(actives) :
623 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
625 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
627 with open(outfile, 'w') as ffin :
628 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
631 os.remove(outfile + '~')
633 def make_table_with_classe(self, uces, list_act, uci = False) :
634 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
635 uces = dict([[uce, i] for i, uce in enumerate(uces)])
637 getlem = self.getlemucis
639 getlem = self.getlemuces
640 for i, lem in enumerate(list_act) :
641 lemuces = list(set(getlem(lem)).intersection(uces))
643 table_uce[uces[uce]][i] = 1
644 table_uce.insert(0, list_act)
647 def make_pondtable_with_classe(self, uces, list_act) :
648 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
649 uces = dict([[uce, i] for i, uce in enumerate(uces)])
650 for i, lem in enumerate(list_act) :
651 uceseff = self.getlemuceseff(lem)
652 lemuces = list(set(uceseff.keys()).intersection(uces))
654 table_uce[uces[uce]][i] = uceseff[uce]
655 table_uce.insert(0, list_act)
658 def parse_active(self, gramact, gramsup = None) :
659 log.info('parse actives')
660 for lem in self.lems :
661 if lem.startswith('_') and lem.endswith('_') :
662 self.lems[lem].act = 2
663 elif self.lems[lem].gram in gramact :
664 self.lems[lem].act = 1
665 elif gramsup is not None and self.lems[lem].gram not in gramact:
666 if self.lems[lem].gram in gramsup :
667 self.lems[lem].act = 2
669 self.lems[lem].act = 0
671 self.lems[lem].act = 2
673 def make_actives_limit(self, limit, key = 1) :
674 if self.idformes is None :
676 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
678 def make_actives_nb(self, nbmax, key) :
679 log.info('make_actives_nb : %i - %i' % (nbmax,key))
680 if self.idformes is None :
682 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
683 self.activenb = len(allactives)
684 allactives = sorted(allactives, reverse = True)
685 if self.activenb == 0 :
687 if len(allactives) <= nbmax :
688 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
689 return [val[1] for val in allactives], allactives[-1][0]
691 effs = [val[0] for val in allactives]
692 if effs.count(effs[nbmax - 1]) > 1 :
693 lim = effs[nbmax - 1] + 1
697 stop = effs.index(lim)
704 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
705 return [val[1] for val in allactives[0:stop + 1]], lim
707 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
708 log.info('formes/classes')
710 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
712 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
713 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
714 with open(fileout, 'w') as f :
715 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
717 def make_etoiles(self) :
719 for uci in self.ucis :
720 etoiles.update(uci.etoiles[1:])
723 def make_themes(self):
725 for uci in self.ucis :
726 themes.update(uci.paras)
729 def make_etoiles_dict(self) :
730 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
732 for etoile in etoiles :
733 et = etoile.split('_')
736 endet = '_'.join(et[1:])
737 if etoile in det[et[0]] :
738 det[et[0]][etoile] += 1
740 det[et[0]][etoile] = 1
745 endet = '_'.join(et[1:])
746 det[et[0]] = {etoile :1}
751 def make_etline(self, listet) :
752 etuces = [[] for et in listet]
753 for uci in self.ucis :
754 get = list(set(uci.etoiles).intersection(listet))
756 return '2 variables sur la meme ligne'
758 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
761 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
762 log.info('etoiles/classes')
764 etoileuces = self.getetoileuces()
766 etoileuces = self.getetoileucis()
767 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
768 with open(fileout, 'w') as f :
769 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
770 #etoiles = self.make_etoiles()
771 #with open(fileout, 'w') as f :
772 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
774 def make_colored_corpus(self, uci = False) :
776 for i, lc in enumerate(self.lc) :
779 for uce in self.lc0 :
781 color = ['black'] + colors[len(self.lc) - 1]
783 <meta http-equiv="content-Type" content="text/html; charset=%s" />
785 ''' % sys.getdefaultencoding()
787 res = self.getalluces()
792 if self.iduces[uce[0]].uci != actuci :
793 actuci = self.iduces[uce[0]].uci
794 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
795 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
797 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
799 res = self.getallucis()
802 if self.ucis[uce[0]].ident != actuci :
803 actuci = self.ucis[uce[0]].ident
804 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
805 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
807 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
808 return txt + '\n</body></html>'
810 def count_from_list(self, l, d) :
818 def count_from_list_cl(self, l, d, a, clnb) :
827 def find_segments(self, taille_segment, taille_limite) :
829 for uce in self.getalluces() :
831 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
832 l = [[d[val], val] for val in d if d[val] >= 3]
835 if len(l) > taille_limite :
836 l = l[-taille_limite:]
839 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
842 concorde = self.getconcorde
844 concorde = self.getuciconcorde
845 for uce in concorde(list_uce) :
847 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
848 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
851 if len(l) > taille_limite :
852 l = l[-taille_limite:]
855 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
857 for b, classe in enumerate(self.lc) :
858 for uce in self.getconcorde(classe) :
861 uce = [self.formes[forme].lem for forme in uce]
862 for taille_segment in range(lenmin,lenmax) :
863 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
864 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
865 with open(fileout, 'w') as f :
866 f.write('\n'.join([';'.join(line) for line in result]))
868 def make_proftype(self, outf) :
870 for lem in self.lems :
871 gram = self.lems[lem].gram
873 res[gram] = [0 for val in self.lc]
874 lemuceeff = self.getlemuceseff(lem)
875 for i, classe in enumerate(self.lc) :
876 concern = set(classe).intersection(lemuceeff.keys())
877 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
878 res = [[gram] + [`val` for val in res[gram]] for gram in res]
880 with open(outf, 'w') as f :
881 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
884 def make_ucecl_from_R(self, filein) :
885 with open(filein, 'rU') as f :
890 line = line.replace('\n', '').replace('"', '').split(';')
891 self.lc.append([int(line[0]) - 1, int(line[1])])
892 classesl = [val[1] for val in self.lc]
894 self.lc = sorted(self.lc, key=itemgetter(1))
895 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
896 self.lc0 = self.lc.pop(0)
899 def get_stat_by_cluster(self, outf, lclasses = None) :
900 log.info('get_stat_by_cluster')
901 if lclasses is None :
904 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
905 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
906 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
907 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
908 sets = [set(cl) for cl in lclasses]
909 for forme in self.formes :
910 formeuceeff = self.getformeuceseff(forme)
911 for i, classe in enumerate(lclasses) :
912 concern = sets[i].intersection(formeuceeff.keys())
914 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
916 if self.formes[forme].freq == 1 :
918 log.info('%f' % (time() - t1))
919 if outf is not None :
920 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
921 with open(outf, 'w') as f :
924 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
926 def get_stat_by_et(self, outf, etoiles) :
927 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
928 stats = self.get_stat_by_cluster(None, lclasses)
929 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
931 def gethapaxbyet(self, etoiles) :
932 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
934 for uce in hapaxuces :
935 if uce in hucesdict :
939 etuces = [[] for et in etoiles]
940 for uci in self.ucis :
941 get = list(set(uci.etoiles).intersection(etoiles))
943 return '2 variables sur la meme ligne'
945 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
946 etuces = [set(val) for val in etuces]
947 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
949 def gethapaxuces(self) :
950 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
951 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
953 for i,uce in enumerate(hapaxuces) :
954 if uce in hucesdict :
955 hucesdict[uce][0] += 1
956 hucesdict[uce][1].append(hapax[i])
958 hucesdict[uce] = [1,[hapax[i]]]
960 for uce in hucesdict :
961 if hucesdict[uce][0] in huces :
962 huces[hucesdict[uce][0]].append(uce)
964 huces[hucesdict[uce][0]] = [uce]
965 huces = zip(huces, huces.values())
966 huces.sort(reverse=True)
970 for nb in huces[0:4] :
971 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
973 res = self.getconcorde([uce])
975 ucetxt = ' ' + row[1] + ' '
977 for hap in hucesdict[uce][1] :
978 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
979 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
980 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
981 txt += '<p>'+ucetxt+'</p>\n'
985 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms of the corpus to ``fileout`` as a tab-separated table.

    One line per form: form, lemma, gram value and frequency (rendered
    with ``repr``).  Lines are sorted in descending order on
    (frequency, form, lemma, gram).  The text is encoded with
    ``syscoding`` before being written.
    """
    ranked = sorted(
        [[word.freq, forme, word.lem, word.gram] for forme, word in self.formes.items()],
        reverse=True)
    # Move the frequency from the first to the last column.
    records = [entry[1:] + [repr(entry[0])] for entry in ranked]
    with open(fileout, 'w') as f:
        text = '\n'.join(['\t'.join(record) for record in records])
        f.write(text.encode(syscoding))
995 def export_lems(self, fileout, syscoding) :
997 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
999 with open(fileout, 'w') as f :
1000 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
1005 def __init__(self, corpus) :
1006 ucinb = corpus.getucinb()
1007 ucisize = corpus.getucisize()
1008 ucimean = float(sum(ucisize))/float(ucinb)
1009 detoile = corpus.make_etoiles_dict()
1012 def __init__(self, iduci, line, paraset = None) :
1014 self.etoiles = line.split()
1016 if paraset is not None :
1017 self.paras = paraset.split()
1022 def __init__(self, iduce, idpara, iduci) :
1028 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1031 self.gram = gramtype
1034 if freq is not None :
def __init__(self, parent, forme):
    """Initialise the object from a first form.

    ``self.formes`` maps form ident -> form frequency and starts with the
    given form; ``gram``, ``freq`` and ``act`` are copied from it.
    ``parent`` is accepted but not used here.
    NOTE(review): presumably the constructor of the lemma class (the
    class header is not visible in this excerpt) - confirm.
    """
    first_freq = forme.freq
    self.formes = {forme.ident: first_freq}
    self.gram = forme.gram
    self.freq = first_freq
    self.act = forme.act
def add_forme(self, forme):
    """Record *forme* under its identifier and add its frequency to the total."""
    added = forme.freq
    self.formes[forme.ident] = added
    self.freq = self.freq + added
1050 def decouperlist(chaine, longueur, longueurOptimale) :
1052 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1053 Si on trouve un '$', c'est fini.
1054 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1056 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
1057 dsep = dict([[val[0],val[1]] for val in separateurs])
1058 trouve = False # si on a trouvé un bon séparateur
1059 iDecoupe = 0 # indice du caractere ou il faut decouper
1061 longueur = min(longueur, len(chaine) - 1)
1062 chaineTravail = chaine[:longueur + 1]
1064 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1067 indice = chaineTravail.index(u'$')
1069 iDecoupe = indice - 1
1074 caractere = chaineTravail[nbCar]
1075 distance = abs(longueurOptimale - nbCar) + 1
1076 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1077 if caractere in dsep :
1078 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1079 meilleur[0] = caractere
1080 meilleur[1] = dsep[caractere]
1085 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1087 meilleur[1] = dsep[' ']
1094 #if meilleur[0] != ' ' :
1095 # fin = chaine[iDecoupe + 1:]
1096 # retour = chaineTravail[:iDecoupe]
1098 fin = chaine[iDecoupe + 1:]
1099 retour = chaineTravail[:iDecoupe + 1]
1100 return len(retour) > 0, retour, fin
1101 # si on a rien trouvé
1102 return False, chaine, ''
1104 def testetoile(line) :
1105 return line.startswith(u'****')
1108 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split *txt* on whitespace and append the ``$`` end-of-text sentinel."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1113 def prep_txtcharact(txt) :
1118 Class for building a corpus
1120 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1121 log.info('begin building corpus...')
1122 self.lexique = lexique
1123 self.expressions = expressions
1125 self.corpus = Corpus(self, parametres_corpus)
1126 self.infile = infile
1128 self.lim = parametres_corpus.get('lim', 1000000)
1129 self.encoding = parametres_corpus['encoding']
1130 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1131 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1132 self.corpus.parametres['uuid'] = str(uuid4())
1133 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
1134 self.corpus.parametres['type'] = 'corpus'
1135 if self.corpus.parametres['keep_ponct'] :
1136 self.ponctuation_espace = [' ', '']
1138 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1140 self.tolist = self.corpus.parametres.get('tolist', 0)
1147 def prep_makeuce(self) :
1148 method = self.corpus.parametres.get('ucemethod', 0)
1150 self.decouper = decouperlist
1151 self.prep_txt = prep_txtlist
1152 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1154 self.decouper = decoupercharact
1155 self.prep_txt = prep_txtcharact
1156 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1157 log.info('method uce : %s' % method)
1162 self.read_corpus(self.infile)
1163 except Warning, args :
1164 log.info('pas kool %s' % args)
1168 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1169 self.time = time() - t1
1171 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1172 log.info('time : %f' % (time() - t1))
1175 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1176 self.cf = self.conn_f.cursor()
1177 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1178 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1179 self.conn_f.commit()
1180 self.cf = self.conn_f.cursor()
1181 self.cf.execute('PRAGMA temp_store=MEMORY;')
1182 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1183 self.cf.execute('PRAGMA synchronous = OFF;')
1184 self.cf.execute('begin')
1185 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1186 self.c = self.conn.cursor()
1187 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1189 self.c = self.conn.cursor()
1190 self.c.execute('PRAGMA temp_store=MEMORY;')
1191 self.c.execute('PRAGMA journal_mode=MEMORY;')
1192 self.c.execute('PRAGMA synchronous = OFF;')
1193 self.c.execute('begin')
1196 #commit index and close db
1198 self.conn_f.commit()
1199 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1200 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1204 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1205 self.ccorpus = self.conn_corpus.cursor()
1206 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1207 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1208 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1209 self.conn_corpus.commit()
1210 self.ccorpus = self.conn_corpus.cursor()
1211 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1212 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1213 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1214 self.ccorpus.execute('begin')
1215 self.backup_corpus()
1216 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1217 self.conn_corpus.commit()
1218 self.conn_corpus.close()
1219 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the cleaning steps enabled in the corpus parameters.

    Steps are appended in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, hyphens. Every option
    defaults to enabled except ``'charact'``, which must be present in the
    parameters. When character filtering is enabled, ``self.rule`` is set
    from ``'keep_caract'`` (or the built-in default character class).
    """
    options = self.corpus.parametres
    # (option name, cleaning step) pairs applied before/after the
    # character filter; order is significant.
    leading_steps = (
        ('lower', self.dolower),
        ('firstclean', self.firstclean),
    )
    trailing_steps = (
        ('expressions', self.make_expression),
        ('apos', self.doapos),
        ('tiret', self.dotiret),
    )
    for option, step in leading_steps:
        if options.get(option, 1):
            self.cleans.append(step)
    if options['charact']:
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)
    for option, step in trailing_steps:
        if options.get(option, 1):
            self.cleans.append(step)
1236 def make_expression(self,txt) :
1237 for expression in self.expressions:
1238 if expression in txt :
1239 txt = txt.replace(expression, self.expressions[expression][0])
1242 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by ``[<self.rule>]+`` with one space.

    ``self.rule`` is the inside of a regex character class; the default
    set by ``buildcleans`` starts with ``^``, so it lists the characters
    to keep.
    """
    # An alternative rule that also kept '-' used to be noted here:
    # u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return *txt* with every straight apostrophe turned into a space."""
    pieces = txt.split(u"'")
    return u' '.join(pieces)
def dotiret(self, txt):
    """Return *txt* with every hyphen turned into a space."""
    hyphen, blank = u'-', u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation with spaces.

    The curly apostrophe becomes a straight one, ``œ`` becomes ``oe``,
    both ellipsis forms become the ``£$£`` token, and each of ``? . ! , ; :``
    is surrounded by spaces.
    """
    # Order matters: '...' must be rewritten before the single '.'.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1261 def make_cleans(self, txt) :
1262 for clean in self.cleans :
1266 def backup_uce(self) :
1267 if self.corpus.idformesuces != {} :
1268 log.info('backup %i' % len(self.corpus.idformesuces))
1269 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1270 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1271 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1272 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1273 self.corpus.idformesuces = {}
1276 def backup_corpus(self) :
1277 log.info('start backup corpus')
1279 for uci in self.corpus.ucis :
1280 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1281 for uce in uci.uces :
1282 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1283 for forme in self.corpus.formes :
1284 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1285 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the final summary statistics in the corpus parameters.

    Writes ``date``, ``time`` (elapsed ``self.time`` seconds rendered as
    ``Hh Mm Ss``), ``ucinb``, ``ucenb``, ``occurrences``, ``formesnb`` and
    the ``hapax`` summary string into ``self.corpus.parametres``.

    The hapax percentages are reported as 0.00 % when the corpus has no
    forms or no occurrences, instead of raising ``ZeroDivisionError``.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    occurrences = self.corpus.gettotocc()
    params['occurrences'] = occurrences
    formesnb = len(self.corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios: an empty corpus would otherwise divide by zero.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1301 class BuildSubCorpus(BuildCorpus):
1302 def __init__(self, corpus, parametres, dlg = None) :
1303 log.info('begin subcorpus...')
1307 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1309 self.encoding = corpus.parametres['encoding']
1310 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1311 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1312 self.corpus.pathout.createdir(parametres['pathout'])
1313 self.corpus.parametres['pathout'] = parametres['pathout']
1314 self.corpus.parametres['meta'] = parametres.get('meta', False)
1315 self.corpus.parametres['uuid'] = str(uuid4())
1316 if parametres.get('frommeta', False) :
1317 print 'make subtexts'
1318 self.corpus.ucis = [uci for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1319 elif parametres.get('fromtheme', False) :
1320 print 'make subtexts from theme'
1322 for uci in self.ori.ucis :
1323 if uci.paras != [] :
1326 for et in uci.paras :
1327 if et in parametres['meta'] :
1328 newuce += [uce for uce in uci.uces if uce.para == idpara]
1334 self.corpus.ucis.append(uci)
1337 elif parametres.get('fromcluster', False) :
1339 elif parametres.get('fromuceids', False) :
1341 dictucekeep = dict(zip(parametres['uceids'], parametres['uceids']))
1343 for uci in self.ori.ucis :
1344 if uci.paras == [] :
1345 keepuces = [uce for uce in uci.uces if uce.ident in dictucekeep]
1348 self.corpus.ucis.append(uci)
1353 for et in uci.paras :
1354 keepuces = [uce for uce in uci.uces if uce.para == idpara]
1362 self.corpus.ucis.append(uci)
1368 def read_corpus(self, infile = None):
1369 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1375 print 'redo text, para and st ident'
1376 for uci in self.corpus.ucis :
1377 uci.ident = ident_uci
1379 for uce in uci.uces :
1381 if uce.para != lastpara :
1384 uce.para = ident_para
1386 uce.para = ident_para
1387 newuceident[uce.ident] = ident_uce
1388 uce.ident = ident_uce
1390 print 'backup st text and forms'
1391 for row in self.ori.getconcorde(self.olduceid) :
1392 self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1]))
1393 for word in row[1].split() :
1394 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1398 class BuildFromAlceste(BuildCorpus) :
1399 def read_corpus(self, infile) :
1400 if self.dlg is not None :
1401 self.dlg.Pulse('textes : 0 - segments : 0')
1404 if self.corpus.parametres['ucimark'] == 0 :
1405 self.testuci = testetoile
1406 elif self.corpus.parametres['ucimark'] == 1 :
1407 self.testuci = testint
1413 with codecs.open(infile, 'r', self.encoding) as f :
1414 for linenb, line in enumerate(f) :
1415 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1416 if self.testuci(line) :
1419 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1421 self.corpus.ucis.append(Uci(iduci, line))
1424 if self.corpus.ucis[-1].uces == [] :
1425 log.info(u'Empty text : %i' % linenb)
1427 self.corpus.ucis.pop()
1428 self.corpus.ucis.append(Uci(iduci, line))
1429 if self.dlg is not None :
1430 if not (iduci + 1) % 10 :
1431 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1432 elif line.startswith(u'-*') :
1435 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1438 self.corpus.ucis[-1].paras.append(line.split()[0])
1440 raise Exception('paragrapheOT %i' % linenb)
1441 elif line.strip() != '' and iduci != -1 :
1443 if txt != [] and iduci != -1 :
1444 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1449 self.corpus.ucis.pop()
1450 log.info(Exception("Empty text %i" % linenb))
1452 raise Exception('EmptyText %i' % linenb)
1453 if iduci != -1 and iduce != -1:
1456 log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
1457 raise Exception('TextBeforeTextMark %i' % linenb)
1458 except UnicodeDecodeError :
1459 raise Exception("CorpusEncoding")
1461 def treattxt(self, txt, iduce, idpara, iduci) :
1462 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1463 txt = 'laphrasepoursplitter'.join(txt)
1464 txt = self.make_cleans(txt)
1465 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1466 ucetxt = txt.split('laphrasepoursplitter')
1469 txt = self.make_cleans(txt)
1470 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1471 if self.corpus.ucis[-1].paras == [] :
1475 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1476 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1477 if not self.tolist :
1483 self.corpus.add_word(word)
1484 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1485 if self.last > self.lim :
1488 return iduce, idpara
1490 def make_uces(self, txt, douce = True, keep_ponct = False) :
1491 txt = ' '.join(txt.split())
1494 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1496 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1499 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1500 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1505 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1507 #decouper (list_sep)
1508 #make_uces (decouper)
1509 #treat_txt (make_uces)
1513 def __init__(self, parent, dlg = None) :
1514 self.parent = parent
1516 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1517 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1518 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1519 dial = CorpusPref(parent, parametres)
1520 dial.CenterOnParent()
1521 dial.txtpath.SetLabel(parent.filename)
1522 #dial.repout_choices.SetValue(parametres['pathout'])
1523 self.res = dial.ShowModal()
1524 if self.res == 5100 :
1525 parametres = dial.doparametres()
1526 parametres['originalpath'] = parent.filename
1527 PathOut().createdir(parametres['pathout'])
1528 ReadLexique(self.parent, lang = parametres['lang'])
1529 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1530 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1532 self.parent.expressions = {}
1533 self.parametres = parametres
1535 if self.dlg is not None :
def doanalyse(self):
    """Run ``BuildFromAlceste`` on the parent's file and return the resulting corpus.

    The builder receives ``self.parent.filename``, ``self.parametres``,
    the parent's ``lexique`` and ``expressions``, and ``self.dlg``.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique,
                               owner.expressions, dlg=self.dlg)
    return builder.corpus
1543 def __init__(self, parent, corpus, parametres = None, dlg = None):
1544 self.parent = parent
1547 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1548 if dlg is not None :
1549 busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
1551 parametres['corpus_name'] = corpus_name
1552 if parametres.get('frommeta', False) :
1553 parametres['meta'] = corpus.make_etoiles()
1554 elif parametres.get('fromtheme', False) :
1555 parametres['meta'] = corpus.make_themes()
1557 parametres['meta'] = []
1558 parametres['meta'].sort()
1559 if dlg is not None :
1561 dial = SubTextFromMetaDial(parent, parametres)
1562 self.res = dial.ShowModal()
1563 if self.res == 5100 :
1564 if dial.subcorpusname.GetValue() != '' :
1565 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1566 if corpus_name != '' :
1567 parametres['corpus_name'] = corpus_name
1569 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1570 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1572 while os.path.exists(pathout + '_%i' % i) :
1574 parametres['pathout'] = pathout + '_%i' % i
1575 meta = dial.m_listBox1.GetSelections()
1576 parametres['meta'] = [parametres['meta'][val] for val in meta]
1577 self.parametres = parametres
def doanalyse(self):
    """Run ``BuildSubCorpus`` on ``self.ori`` and return the resulting corpus.

    ``self.parametres`` and ``self.dlg`` are forwarded to the builder.
    """
    builder = BuildSubCorpus(self.ori, parametres=self.parametres, dlg=self.dlg)
    return builder.corpus
1585 if __name__ == '__main__' :
1587 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
1588 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)