1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
8 from functions import decoupercharact, ReadDicoAsDico, DoConf
14 from operator import itemgetter
15 from uuid import uuid4
16 from chemins import PathOut
17 from dialog import CorpusPref
18 from functions import ReadLexique, ReadDicoAsDico
22 log = logging.getLogger('iramuteq.corpus')
25 def copycorpus(corpus) :
26 log.info('copy corpus')
27 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
28 copy_corpus.ucis = corpus.ucis
29 copy_corpus.formes = corpus.formes
30 copy_corpus.pathout = corpus.pathout
31 copy_corpus.conn_all()
41 def __init__(self, parent, parametres = {}, read = False) :
43 self.parametres = parametres
45 self.connformes = None
47 self.conncorpus = None
54 self.idformesuces = {}
59 self.pathout = PathOut(dirout = parametres['pathout'])
62 def add_word(self, word) :
63 if word in self.formes :
64 self.formes[word].freq += 1
65 if self.formes[word].ident in self.idformesuces :
66 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
67 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
69 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
71 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
73 if word in self.parent.lexique :
74 gramtype = self.parent.lexique[word][1]
75 lem = self.parent.lexique[word][0]
82 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
86 """connect corpus to db"""
87 if self.connformes is None :
88 log.info('connexion corpus')
89 self.connuces = sqlite3.connect(self.pathout['uces.db'])
90 self.cuces = self.connuces.cursor()
91 self.connformes = sqlite3.connect(self.pathout['formes.db'])
92 self.cformes = self.connformes.cursor()
93 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
94 self.ccorpus = self.conncorpus.cursor()
95 self.cformes.execute('PRAGMA temp_store=MEMORY;')
96 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
97 self.cformes.execute('PRAGMA synchronous = OFF;')
98 self.cuces.execute('PRAGMA temp_store=MEMORY;')
99 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
100 self.cuces.execute('PRAGMA synchronous = OFF;')
101 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
102 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
103 self.ccorpus.execute('PRAGMA synchronous = OFF;')
105 def read_corpus(self) :
106 log.info('read corpus')
107 self.parametres['syscoding'] = sys.getdefaultencoding()
108 if self.conncorpus is None :
110 res = self.ccorpus.execute('SELECT * FROM etoiles;')
112 self.ucis.append(Uci(row[0], row[1], row[2]))
113 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
115 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
116 res = self.ccorpus.execute('SELECT * FROM formes;')
117 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the ids of the uces in which a form occurs.

    `wordid` is either the numeric id of the form or the form itself; a
    string is first resolved to its id through self.formes. Each row of
    the `uces` table of the formes database holds either a single integer
    or a space-separated string of integers; both are flattened into one
    list of ints, duplicates included.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            found.append(cell)
        else :
            found.extend([int(val) for val in cell.split()])
    return found
def getlemuces(self, lem) :
    """Return the distinct ids of the uces containing any form of `lem`.

    The ids of the lemma's forms are read from self.lems[lem].formes and
    looked up in the `uces` table of the formes database. The order of the
    returned list is the iteration order of a set, i.e. unspecified.
    """
    idlist = ', '.join([repr(val) for val in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist)
    uces = []
    for row in rows :
        cell = row[0]
        # a cell is either one integer or a space-separated string of integers
        if isinstance(cell, int) :
            uces.append(cell)
        else :
            uces.extend([int(val) for val in cell.split()])
    return list(set(uces))
def getlemucis(self, lem) :
    """Return the distinct uci ids in which the lemma `lem` occurs.

    Every uce id returned by getlemuces is mapped to its Uce object and
    the `uci` attribute of that object is collected. Order is unspecified.
    """
    ucis = set()
    for uceid in self.getlemuces(lem) :
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
136 def getlemuceseff(self, lem) :
137 formesid = ', '.join([`val` for val in self.lems[lem].formes])
138 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
139 res = self.cformes.execute(query)
140 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
141 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
142 res = self.cformes.execute(query)
143 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 for i, uce in enumerate(uces) :
146 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem) :
    """Return the `freq` attribute stored for the lemma `lem` in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the form whose id is `formeid`.

    The id -> form index (self.idformes) is built through make_idformes
    the first time it is needed.
    """
    if self.idformes is None :
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self) :
    """Return the sum of the `freq` of every entry of self.formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
166 return self.ucis[-1].uces[-1].ident + 1
169 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return, for each uci, the summed size of its uces.

    The sizes come from getucesize() and are sliced from the ident of the
    uci's first uce to the ident of its last uce, both included.
    """
    sizes = self.getucesize()
    ucisizes = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        ucisizes.append(sum(sizes[first:(last + 1)]))
    return ucisizes
def getucesize(self) :
    """Return the number of whitespace-separated tokens of every uce.

    The second column of each row yielded by getalluces() is split and
    counted, in the order the rows are returned.
    """
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
179 # def getlemseff(self) :
180 # if self.idformes is None :
181 # self.make_idformes()
182 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
184 # def getlemsefftype(self) :
185 # if self.idformes is None :
186 # self.make_idformes()
187 # if self.lems is None :
189 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces) :
    """Run a select on the `uces` table for the given ids and return the cursor.

    `uces` is an iterable of ids; their repr is interpolated into the
    IN (...) clause of the query executed on self.cuces.
    """
    idlist = ', '.join([repr(i) for i in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word) :
    """Return getconcorde() applied to the uces of `word` (see getworduces)."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem) :
    """Return getconcorde() applied to the uces of the lemma `lem` (see getlemuces)."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self) :
    """Execute a select of every row of the `uces` table on self.cuces and return the result."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile) :
    """Return the ident of every uce belonging to a uci whose etoiles contain `etoile`."""
    idents = []
    for uci in self.ucis :
        if etoile in uci.etoiles :
            idents.extend([uce.ident for uce in uci.uces])
    return idents
def getucefromid(self, uceid) :
    """Return the uce whose ident is `uceid`.

    The ident -> uce index (self.iduces) is built through make_iduces the
    first time it is needed.
    """
    if self.iduces is None :
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self) :
    """Return the number of entries of self.formes whose `freq` equals 1."""
    nb = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            nb += 1
    return nb
def getactivesnb(self, key) :
    """Return the number of lemmas of self.lems whose `act` attribute equals `key`."""
    nb = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            nb += 1
    return nb
215 # def make_lems(self, lem = True) :
216 # log.info('make lems')
218 # for forme in self.formes :
219 # if self.formes[forme].lem in self.lems :
220 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
221 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
223 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the etoiles of the uci that contains the uce `uceid`.

    The uce ident -> uci ident mapping (self.uceuci) is built on first
    use; the uci ident is then used as an index into self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    uciid = self.uceuci[uceid]
    return self.ucis[uciid].etoiles
229 def make_lems(self, lem = True) :
230 log.info('make lems')
233 for forme in self.formes :
234 if self.formes[forme].lem in self.lems :
235 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
236 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
238 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
240 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes, mapping each form's ident to the form object."""
    index = {}
    for forme in self.formes :
        entry = self.formes[forme]
        index[entry.ident] = entry
    self.idformes = index
def make_iduces(self) :
    """Build self.iduces, mapping each uce's ident to the uce object.

    Nothing is done when the index already exists.
    """
    if self.iduces is not None :
        return
    index = {}
    for uci in self.ucis :
        for uce in uci.uces :
            index[uce.ident] = uce
    self.iduces = index
249 def make_lexitable(self, mineff, etoiles) :
250 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
251 etuces = [[] for et in etoiles]
252 for uci in self.ucis :
253 get = list(set(uci.etoiles).intersection(etoiles))
255 return '2 variables sur la meme ligne'
257 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
258 etuces = [set(val) for val in etuces]
261 deff = self.getlemuceseff(lem)
263 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
264 tab.insert(0, [''] + etoiles)
267 def make_efftype_from_etoiles(self, etoiles) :
269 etuces = [[] for et in etoiles]
270 for uci in self.ucis :
271 get = list(set(uci.etoiles).intersection(etoiles))
273 return '2 variables sur la meme ligne'
275 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
276 etuces = [set(val) for val in etuces]
277 for lem in self.lems :
278 deff = self.getlemuceseff(lem)
280 gram = self.lems[lem].gram
282 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
284 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
285 tabout = [[gram] + dtype[gram] for gram in dtype]
286 tabout.insert(0, [''] + etoiles)
289 def make_uceactsize(self, actives) :
290 res = self.getalluces()
293 deff = self.getlemuceseff(lem)
295 ucesize[uce] = ucesize.get(uce, 0) + 1
298 def make_uc(self, actives, lim1, lim2) :
299 uceactsize = self.make_uceactsize(actives)
305 for uce in [uce for uci in self.ucis for uce in uci.uces] :
306 if uce.para == lastpara :
308 last1 += uceactsize.get(uce.ident,0)
309 uc1[-1].append(uce.ident)
311 uc1.append([uce.ident])
314 last2 += uceactsize.get(uce.ident, 0)
315 uc2[-1].append(uce.ident)
317 uc2.append([uce.ident])
320 last1 = uceactsize.get(uce.ident, 0)
321 last2 = uceactsize.get(uce.ident, 0)
323 uc1.append([uce.ident])
324 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build the two uc groupings and write their matrices and uce/uc tables.

    make_uc(actives, sizeuc1, sizeuc2) returns two lists of uc, each uc
    being a list of uce ids. Each grouping is written with write_ucmatrix
    (to uc1out / uc2out), then a ';'-separated table with the header
    'uce;uc' and one 'uce id;uc index' line per uce is written to
    listuce1out / listuce2out.

    Returns the number of uc in each grouping as a tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # both tables are built before any list file is opened
    tables = []
    for uc in (uc1, uc2) :
        lines = ['uce;uc']
        for i, ucl in enumerate(uc) :
            for uce in ucl :
                lines.append(';'.join([repr(uce), repr(i)]))
        tables.append(lines)
    for listout, lines in zip((listuce1out, listuce2out), tables) :
        with open(listout, 'w') as f :
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
340 def write_ucmatrix(self, uc, actives, fileout) :
341 log.info('write uc matrix %s' % fileout)
342 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
345 with open(fileout + '~', 'w+') as f :
346 for i, lem in enumerate(actives) :
347 for uce in self.getlemuces(lem):
348 if (uces_uc[uce], i) not in deja_la :
350 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
351 deja_la[(uces_uc[uce], i)] = 0
353 with open(fileout, 'w') as ffin :
354 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
357 os.remove(fileout + '~')
360 def export_corpus(self, outf) :
361 #outf = 'export_corpus.txt'
363 res = self.getalluces()
367 with open(outf,'w') as f :
369 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
370 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
371 elif self.iduces[uce[0]].uci != actuci :
372 actuci = self.iduces[uce[0]].uci
373 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
374 actpara = self.iduces[uce[0]].para
375 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
378 actpara = self.iduces[uce[0]].para
379 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
380 elif self.iduces[uce[0]].para != actpara :
381 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
385 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
386 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
388 with open(outfile + '~', 'w+') as f :
389 for i, lem in enumerate(actives) :
390 for uce in sorted(self.getlemuces(lem)) :
392 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
394 with open(outfile, 'w') as ffin :
395 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
398 os.remove(outfile + '~')
400 with open(listuce, 'w') as f :
401 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
403 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
404 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
406 with open(outfile + '~', 'w+') as f :
407 for i, lem in enumerate(actives) :
408 for uci in sorted(self.getlemucis(lem)) :
410 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
412 with open(outfile, 'w') as ffin :
413 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
416 os.remove(outfile + '~')
418 with open(listuci, 'w') as f :
419 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
421 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
422 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
424 duces = dict([[uce, i] for i, uce in enumerate(uces)])
425 with open(outfile + '~', 'w+') as f :
426 for i, lem in enumerate(actives) :
427 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
429 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
431 with open(outfile, 'w') as ffin :
432 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
435 os.remove(outfile + '~')
437 def parse_active(self, gramact, gramsup = None) :
438 log.info('parse actives')
439 for lem in self.lems :
440 if self.lems[lem].gram in gramact :
441 self.lems[lem].act = 1
442 elif gramsup is not None :
443 if self.lems[lem].gram in gramsup :
444 self.lems[lem].act = 2
446 self.lems[lem].act = 0
448 self.lems[lem].act = 2
450 def make_actives_limit(self, limit) :
451 if self.idformes is None :
453 return [lem for lem in self.lems if self.getlemeff(lem) >= limit]
455 def make_actives_nb(self, nbmax, key) :
456 log.info('make_actives_nb : %i - %i' % (nbmax,key))
457 if self.idformes is None :
459 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
460 self.activenb = len(allactives)
461 allactives = sorted(allactives, reverse = True)
462 if len(allactives) <= nbmax :
463 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
464 return [val[1] for val in allactives], allactives[-1][0]
466 effs = [val[0] for val in allactives]
467 if effs.count(effs[nbmax - 1]) > 1 :
468 lim = effs[nbmax - 1] + 1
472 stop = effs.index(lim)
478 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
479 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma x class contingency table as a ';'-separated file.

    One line is produced per lemma of `actives`: the lemma followed by the
    number of its uces found in each class of `ucecl` (an iterable of
    collections of uce ids). Lemmas whose counts sum to less than 3 are
    left out. The text is encoded with self.parametres['syscoding'] before
    being written to `fileout`.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives :
        # The uces of a lemma do not depend on the class being counted:
        # query them once per lemma rather than once per (lemma, class).
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
488 def make_etoiles(self) :
490 for uci in self.ucis :
491 etoiles.update(uci.etoiles[1:] + uci.paras)
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile x class contingency table as a ';'-separated file.

    One line is produced per etoile returned by make_etoiles(): the etoile
    followed by the number of its uces found in each class of `ucecl`.
    The text is encoded with self.parametres['syscoding'] before being
    written to `fileout`.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f :
        lines = []
        for etoile in etoiles :
            # The uces of an etoile do not depend on the class: collect
            # them once per etoile instead of once per (etoile, class).
            etuces = set(self.getucesfrometoile(etoile))
            counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
500 def count_from_list(self, l, d) :
508 def find_segments(self, taille_segment, taille_limite) :
510 for uce in self.getalluces() :
512 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
513 l = [[d[val], val] for val in d if d[val] >= 3]
516 if len(l) > taille_limite :
517 l = l[-taille_limite:]
520 def make_ucecl_from_R(self, filein) :
521 with open(filein, 'rU') as f :
526 line = line.replace('\n', '').replace('"', '').split(';')
527 self.lc.append([int(line[0]) - 1, int(line[1])])
528 classesl = [val[1] for val in self.lc]
530 self.lc = sorted(self.lc, key=itemgetter(1))
531 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
532 self.lc0 = self.lc.pop(0)
535 def gethapaxbyet(self, etoiles) :
536 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
538 for uce in hapaxuces :
539 if uce in hucesdict :
543 etuces = [[] for et in etoiles]
544 for uci in self.ucis :
545 get = list(set(uci.etoiles).intersection(etoiles))
547 return '2 variables sur la meme ligne'
549 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
550 etuces = [set(val) for val in etuces]
551 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
553 def gethapaxuces(self) :
554 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
555 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
557 for i,uce in enumerate(hapaxuces) :
558 if uce in hucesdict :
559 hucesdict[uce][0] += 1
560 hucesdict[uce][1].append(hapax[i])
562 hucesdict[uce] = [1,[hapax[i]]]
564 for uce in hucesdict :
565 if hucesdict[uce][0] in huces :
566 huces[hucesdict[uce][0]].append(uce)
568 huces[hucesdict[uce][0]] = [uce]
569 huces = zip(huces, huces.values())
570 huces.sort(reverse=True)
574 for nb in huces[0:4] :
575 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
577 res = self.getconcorde([uce])
579 ucetxt = ' ' + row[1] + ' '
581 for hap in hucesdict[uce][1] :
582 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
583 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
584 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
585 txt += '<p>'+ucetxt+'</p>\n'
589 with open('/tmp/testhapxuce.html','w') as f :
594 def __init__(self, iduci, line, paraset = None) :
596 self.etoiles = line.split()
598 if paraset is not None :
599 self.paras = paraset.split()
604 def __init__(self, iduce, idpara, iduci) :
610 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
616 if freq is not None :
622 def __init__(self, parent, forme) :
623 self.formes = {forme.ident : forme.freq}
624 self.gram = forme.gram
625 self.freq = forme.freq
def add_forme(self, forme) :
    """Register `forme` under its ident and add its frequency to the lemma's total."""
    freq = forme.freq
    self.formes[forme.ident] = freq
    self.freq += freq
632 def decouperlist(chaine, longueur, longueurOptimale) :
634 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
635 Si on trouve un '$', c'est fini.
636 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
638 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
639 dsep = dict([[val[0],val[1]] for val in separateurs])
640 trouve = False # si on a trouvé un bon séparateur
641 iDecoupe = 0 # indice du caractere ou il faut decouper
643 longueur = min(longueur, len(chaine) - 1)
644 chaineTravail = chaine[:longueur + 1]
646 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
649 indice = chaineTravail.index(u'$')
656 caractere = chaineTravail[nbCar]
657 distance = abs(longueurOptimale - nbCar) + 1
658 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
659 if caractere in dsep :
660 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
661 meilleur[0] = caractere
662 meilleur[1] = dsep[caractere]
667 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
668 meilleur[0] = caractere
669 meilleur[1] = dsep[' ']
676 fin = chaine[iDecoupe + 1:]
677 retour = chaineTravail[:iDecoupe]
678 return len(retour) > 0, retour, fin
679 # si on a rien trouvé
680 return False, chaine, ''
682 def testetoile(line) :
683 return line.startswith(u'****')
686 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split `txt` on whitespace and append the u'$' end marker to the word list."""
    words = txt.split()
    words.append(u'$')
    return words
691 def prep_txtcharact(txt) :
696 Class for building a corpus
698 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
699 log.info('begin building corpus...')
700 self.lexique = lexique
701 self.expressions = expressions
703 self.corpus = Corpus(self, parametres_corpus)
706 self.lim = parametres_corpus.get('lim', 1000000)
707 self.encoding = parametres_corpus['encoding']
708 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
709 self.corpus.pathout.createdir(parametres_corpus['pathout'])
710 self.corpus.parametres['uuid'] = str(uuid4())
711 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
712 self.corpus.parametres['type'] = 'corpus'
713 if self.corpus.parametres['keep_ponct'] :
714 self.ponctuation_espace = [' ', '']
716 self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
718 self.tolist = self.corpus.parametres.get('tolist', 0)
725 def prep_makeuce(self) :
726 method = self.corpus.parametres.get('ucemethod', 0)
728 self.decouper = decouperlist
729 self.prep_txt = prep_txtlist
730 self.ucesize = self.corpus.parametres.get('ucesize', 40)
732 self.decouper = decoupercharact
733 self.prep_txt = prep_txtcharact
734 self.ucesize = self.corpus.parametres.get('ucesize', 240)
735 log.info('method uce : %s' % method)
739 self.read_corpus(self.infile)
741 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
742 self.time = time() - t1
744 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
745 log.info('time : %f' % (time() - t1))
748 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
749 self.cf = self.conn_f.cursor()
750 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
751 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
753 self.cf = self.conn_f.cursor()
754 self.cf.execute('PRAGMA temp_store=MEMORY;')
755 self.cf.execute('PRAGMA journal_mode=MEMORY;')
756 self.cf.execute('PRAGMA synchronous = OFF;')
757 self.cf.execute('begin')
758 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
759 self.c = self.conn.cursor()
760 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
762 self.c = self.conn.cursor()
763 self.c.execute('PRAGMA temp_store=MEMORY;')
764 self.c.execute('PRAGMA journal_mode=MEMORY;')
765 self.c.execute('PRAGMA synchronous = OFF;')
766 self.c.execute('begin')
769 #commit index and close db
772 self.cf.execute('CREATE INDEX iduces ON uces (id);')
773 self.cf.execute('CREATE INDEX ideff ON eff (id);')
777 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
778 self.ccorpus = self.conn_corpus.cursor()
779 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
780 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
781 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
782 self.conn_corpus.commit()
783 self.ccorpus = self.conn_corpus.cursor()
784 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
785 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
786 self.ccorpus.execute('PRAGMA synchronous = OFF;')
787 self.ccorpus.execute('begin')
789 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
790 self.conn_corpus.commit()
791 self.conn_corpus.close()
792 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Append to self.cleans the cleaning steps enabled in the corpus parameters.

    Every option defaults to enabled (1). Steps are appended in this order:
    dolower, firstclean, docharact (always), make_expression, doapos,
    dotiret. self.rule is set from the 'keep_caract' parameter, falling
    back to the built-in character rule.
    """
    params = self.corpus.parametres
    if params.get('lower', 1) :
        self.cleans.append(self.dolower)
    if params.get('firstclean', 1) :
        self.cleans.append(self.firstclean)
    self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-")
    self.cleans.append(self.docharact)
    # remaining optional steps, as (parameter name, method name) pairs
    optional = (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret'))
    for option, method in optional :
        if params.get(option, 1) :
            self.cleans.append(getattr(self, method))
808 def make_expression(self,txt) :
809 for expression in self.expressions:
810 if expression in txt :
811 txt = txt.replace(expression, self.expressions[expression][0])
814 def dolower(self, txt) :
def docharact(self, txt) :
    """Replace every run of characters matching the class built from self.rule by one space.

    self.rule is placed between square brackets; with a rule starting with
    '^' the class is negated, so the runs replaced are those of characters
    NOT listed in the rule.
    """
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt) :
    """Return `txt` with every apostrophe replaced by a space."""
    pieces = txt.split(u"'")
    return u' '.join(pieces)
def dotiret(self, txt) :
    """Return `txt` with every hyphen replaced by a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt) :
    """Normalise quotes and ligatures, then surround punctuation with spaces.

    The substitutions are applied one after the other, in the order listed:
    the ellipsis must be replaced before the single full stop.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
833 def make_cleans(self, txt) :
834 for clean in self.cleans :
838 def backup_uce(self) :
839 if self.corpus.idformesuces != {} :
840 log.info('backup %i' % len(self.corpus.idformesuces))
841 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
842 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
843 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
844 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
845 self.corpus.idformesuces = {}
848 def backup_corpus(self) :
849 log.info('start backup corpus')
851 for uci in self.corpus.ucis :
852 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
853 for uce in uci.uces :
854 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
855 for forme in self.corpus.formes :
856 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
857 log.info('%f' % (time() - t))
860 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
861 minutes, seconds = divmod(self.time, 60)
862 hours, minutes = divmod(minutes, 60)
863 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
864 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
865 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
866 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
867 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
868 hapaxnb = self.corpus.gethapaxnb()
869 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
870 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
871 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
874 class BuildFromAlceste(BuildCorpus) :
875 #def __init___(self, infile, parametres_corpus) :
876 # BuildCorpus.__init__(self, infile, parametres_corpus)
879 def read_corpus(self, infile) :
882 if self.corpus.parametres['ucimark'] == 0 :
883 self.testuci = testetoile
884 elif self.corpus.parametres['ucimark'] == 1 :
885 self.testuci = testint
890 with codecs.open(infile, 'rU', self.encoding) as f :
892 if self.testuci(line) :
895 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
897 self.corpus.ucis.append(Uci(iduci, line))
899 self.corpus.ucis.append(Uci(iduci, line))
900 elif line.startswith(u'-*') :
902 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
905 self.corpus.ucis[-1].paras.append(line.split()[0])
906 elif line.strip() != '' and iduci != -1 :
909 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
913 def treattxt(self, txt, iduce, idpara, iduci) :
915 #log.debug('ATTENTION CHINOIS -> charactères')
916 #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret]
917 #log.debug('ATTENTION CHINOIS -> list(text)')
918 #txt = ' '.join(list(txt))
919 txt = self.make_cleans(txt)#, clean_chinois)
920 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
921 if self.corpus.ucis[-1].paras == [] :
925 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
926 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
933 self.corpus.add_word(word)
934 if self.dlg is not None :
935 if self.limitshow > self.count :
936 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
940 self.limitshow = self.last / 100000
941 log.debug(`iduci`, `idpara`, `iduce`)
942 if self.last > self.lim :
947 def make_uces(self, txt, douce = True, keep_ponct = False) :
948 txt = ' '.join(txt.split())
951 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
959 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
962 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
970 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
972 print 'RESTEE UUCEEEEEEEEEEEEE', uce
976 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
979 #make_uces (decouper)
980 #treat_txt (make_uces)
984 def __init__(self, parent, dlg = None) :
987 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
988 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
989 dial = CorpusPref(parent, parametres)
990 dial.CenterOnParent()
991 dial.txtpath.SetLabel(parent.filename)
992 #dial.repout_choices.SetValue(parametres['pathout'])
993 self.res = dial.ShowModal()
994 if self.res == 5100 :
995 parametres = dial.doparametres()
996 parametres['originalpath'] = parent.filename
997 PathOut().createdir(parametres['pathout'])
998 ReadLexique(self.parent, lang = parametres['lang'])
999 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1000 self.parametres = parametres
def doanalyse(self) :
    """Build the corpus from the parent's file with BuildFromAlceste and return it.

    The builder receives the parent's filename, lexique and expressions,
    the parameters stored on this object and its dialog.
    """
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
1007 if __name__ == '__main__' :
1009 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1010 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)