1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
8 from functions import decoupercharact, ReadDicoAsDico, DoConf
14 from operator import itemgetter
15 from uuid import uuid4
16 from chemins import PathOut
17 from dialog import CorpusPref
18 from functions import ReadLexique, ReadDicoAsDico
22 log = logging.getLogger('iramuteq.corpus')
25 def copycorpus(corpus) :
26 log.info('copy corpus')
27 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
28 copy_corpus.ucis = corpus.ucis
29 copy_corpus.formes = corpus.formes
30 copy_corpus.pathout = corpus.pathout
31 copy_corpus.conn_all()
41 def __init__(self, parent, parametres = {}, read = False) :
43 self.parametres = parametres
45 self.connformes = None
47 self.conncorpus = None
54 self.idformesuces = {}
58 self.pathout = PathOut(dirout = parametres['pathout'])
61 def add_word(self, word) :
62 if word in self.formes :
63 self.formes[word].freq += 1
64 if self.formes[word].ident in self.idformesuces :
65 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
66 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
68 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
70 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
72 if word in self.parent.lexique :
73 gramtype = self.parent.lexique[word][1]
74 lem = self.parent.lexique[word][0]
81 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
82 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
85 """connect corpus to db"""
86 if self.connformes is None :
87 log.info('connexion corpus')
88 self.connuces = sqlite3.connect(self.pathout['uces.db'])
89 self.cuces = self.connuces.cursor()
90 self.connformes = sqlite3.connect(self.pathout['formes.db'])
91 self.cformes = self.connformes.cursor()
92 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
93 self.ccorpus = self.conncorpus.cursor()
94 self.cformes.execute('PRAGMA temp_store=MEMORY;')
95 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
96 self.cformes.execute('PRAGMA synchronous = OFF;')
97 self.cuces.execute('PRAGMA temp_store=MEMORY;')
98 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
99 self.cuces.execute('PRAGMA synchronous = OFF;')
100 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
101 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
102 self.ccorpus.execute('PRAGMA synchronous = OFF;')
104 def read_corpus(self) :
105 log.info('read corpus')
106 self.parametres['syscoding'] = sys.getdefaultencoding()
107 if self.conncorpus is None :
109 res = self.ccorpus.execute('SELECT * FROM etoiles;')
111 self.ucis.append(Uci(row[0], row[1], row[2]))
112 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
114 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
115 res = self.ccorpus.execute('SELECT * FROM formes;')
116 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the list of uce ids stored in the formes database for a form.

    wordid is either a form identifier or the form itself as a string; a
    string is first translated to its identifier through self.formes.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            # the cell already holds one integer id
            uces.append(cell)
        else:
            # otherwise the cell is a whitespace-separated string of ids
            uces.extend([int(val) for val in cell.split()])
    return uces
def getlemuces(self, lem):
    """Return the distinct uce ids recorded for every form of a lemma.

    The ids of the forms come from self.lems[lem].formes. The result goes
    through a set, so its order is not meaningful.
    """
    idlist = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist)
    found = set()
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            found.add(cell)
        else:
            # whitespace-separated string of ids
            found.update([int(val) for val in cell.split()])
    return list(found)
def getlemucis(self, lem):
    """Return the distinct `uci` values of the uces in which a lemma occurs.

    The result goes through a set, so its order is not meaningful.
    """
    ucis = set()
    for uceid in self.getlemuces(lem):
        ucis.add(self.getucefromid(uceid).uci)
    return list(ucis)
135 def getlemuceseff(self, lem) :
136 formesid = ', '.join([`val` for val in self.lems[lem].formes])
137 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
138 res = self.cformes.execute(query)
139 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
140 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
141 res = self.cformes.execute(query)
142 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
144 for i, uce in enumerate(uces) :
145 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the `freq` attribute of the entry stored for `lem` in self.lems."""
    lemme = self.lems[lem]
    return lemme.freq
def getforme(self, formeid):
    """Return the form registered under the identifier `formeid`."""
    if self.idformes is None:
        # the identifier -> form index is only built on first use
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the `freq` attributes of all entries of self.formes."""
    total = 0
    for forme in self.formes:
        total += self.formes[forme].freq
    return total
def getucemean(self):
    """Return the total number of occurrences divided by the number of uces, as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
165 return self.ucis[-1].uces[-1].ident + 1
168 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    Uce sizes come from getucesize() and are addressed by uce identifier:
    the slice runs from the first to the last uce of the uci, inclusive.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:(last + 1)]))
    return sizes
def getucesize(self):
    """Return the number of whitespace-separated tokens of each row yielded by getalluces().

    The text is read from the second column of each row.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
178 # def getlemseff(self) :
179 # if self.idformes is None :
180 # self.make_idformes()
181 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
183 # def getlemsefftype(self) :
184 # if self.idformes is None :
185 # self.make_idformes()
186 # if self.lems is None :
188 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Execute a query selecting the rows of the uces table whose id is in `uces`.

    Returns whatever self.cuces.execute() returns for that query.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces of `word` given by getworduces()."""
    uces = self.getworduces(word)
    return self.getconcorde(uces)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces of `lem` given by getlemuces()."""
    uces = self.getlemuces(lem)
    return self.getconcorde(uces)
def getalluces(self):
    """Execute a query selecting every row of the uces table on self.cuces and return the result."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile):
    """Return the identifiers of the uces of every uci whose `etoiles` contain `etoile`."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
def getucefromid(self, uceid):
    """Return the uce registered under the identifier `uceid`."""
    if self.iduces is None:
        # the identifier -> uce index is only built on first use
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of entries of self.formes whose `freq` equals 1."""
    hapax = 0
    for forme in self.formes:
        if self.formes[forme].freq == 1:
            hapax += 1
    return hapax
def getactivesnb(self, key):
    """Return the number of entries of self.lems whose `act` attribute equals `key`."""
    count = 0
    for lem in self.lems:
        if self.lems[lem].act == key:
            count += 1
    return count
214 # def make_lems(self, lem = True) :
215 # log.info('make lems')
217 # for forme in self.formes :
218 # if self.formes[forme].lem in self.lems :
219 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
220 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
222 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
224 def make_lems(self, lem = True) :
225 log.info('make lems')
228 for forme in self.formes :
229 if self.formes[forme].lem in self.lems :
230 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
231 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
233 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
235 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build self.idformes, mapping each form's `ident` to the form object of self.formes."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build self.iduces, mapping each uce `ident` to its uce object.

    Nothing is done when self.iduces is already set.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
244 def make_lexitable(self, mineff, etoiles) :
245 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
246 etuces = [[] for et in etoiles]
247 for uci in self.ucis :
248 get = list(set(uci.etoiles).intersection(etoiles))
250 return '2 variables sur la meme ligne'
252 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
253 etuces = [set(val) for val in etuces]
256 deff = self.getlemuceseff(lem)
258 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
259 tab.insert(0, [''] + etoiles)
262 def make_efftype_from_etoiles(self, etoiles) :
264 etuces = [[] for et in etoiles]
265 for uci in self.ucis :
266 get = list(set(uci.etoiles).intersection(etoiles))
268 return '2 variables sur la meme ligne'
270 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
271 etuces = [set(val) for val in etuces]
272 for lem in self.lems :
273 deff = self.getlemuceseff(lem)
275 gram = self.lems[lem].gram
277 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
279 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
280 tabout = [[gram] + dtype[gram] for gram in dtype]
281 tabout.insert(0, [''] + etoiles)
284 def make_uceactsize(self, actives) :
285 res = self.getalluces()
288 deff = self.getlemuceseff(lem)
290 ucesize[uce] = ucesize.get(uce, 0) + 1
293 def make_uc(self, actives, lim1, lim2) :
294 uceactsize = self.make_uceactsize(actives)
300 for uce in [uce for uci in self.ucis for uce in uci.uces] :
301 if uce.para == lastpara :
303 last1 += uceactsize.get(uce.ident,0)
304 uc1[-1].append(uce.ident)
306 uc1.append([uce.ident])
309 last2 += uceactsize.get(uce.ident, 0)
310 uc2[-1].append(uce.ident)
312 uc2.append([uce.ident])
315 last1 = uceactsize.get(uce.ident, 0)
316 last2 = uceactsize.get(uce.ident, 0)
318 uc1.append([uce.ident])
319 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc partitions, write their matrices and their uce/uc listings.

    The partitions come from make_uc(actives, sizeuc1, sizeuc2). Each one is
    written with write_ucmatrix(), then listed in a ';'-separated file with a
    'uce;uc' header and one line per uce giving the index of its uc.
    Returns the number of uc in each partition.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    for uc, matrixout in ((uc1, uc1out), (uc2, uc2out)):
        self.write_ucmatrix(uc, actives, matrixout)
    for uc, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        rows = [';'.join(['uce', 'uc'])]
        for i, ucl in enumerate(uc):
            for uce in ucl:
                rows.append(';'.join([repr(uce), repr(i)]))
        with open(listout, 'w') as f:
            f.write('\n'.join(rows))
    return len(uc1), len(uc2)
335 def write_ucmatrix(self, uc, actives, fileout) :
336 log.info('write uc matrix %s' % fileout)
337 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
340 with open(fileout + '~', 'w+') as f :
341 for i, lem in enumerate(actives) :
342 for uce in self.getlemuces(lem):
343 if (uces_uc[uce], i) not in deja_la :
345 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
346 deja_la[(uces_uc[uce], i)] = 0
348 with open(fileout, 'w') as ffin :
349 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
352 os.remove(fileout + '~')
355 def export_corpus(self, outf) :
356 #outf = 'export_corpus.txt'
358 res = self.getalluces()
362 with open(outf,'w') as f :
364 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
365 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
366 elif self.iduces[uce[0]].uci != actuci :
367 actuci = self.iduces[uce[0]].uci
368 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
369 actpara = self.iduces[uce[0]].para
370 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
373 actpara = self.iduces[uce[0]].para
374 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
375 elif self.iduces[uce[0]].para != actpara :
376 actpara = self.iduces[uce[0]].para
378 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
380 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
381 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
383 with open(outfile + '~', 'w+') as f :
384 for i, lem in enumerate(actives) :
385 for uce in sorted(self.getlemuces(lem)) :
387 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
389 with open(outfile, 'w') as ffin :
390 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
393 os.remove(outfile + '~')
395 with open(listuce, 'w') as f :
396 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
398 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
399 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
401 with open(outfile + '~', 'w+') as f :
402 for i, lem in enumerate(actives) :
403 for uci in sorted(self.getlemucis(lem)) :
405 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
407 with open(outfile, 'w') as ffin :
408 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
411 os.remove(outfile + '~')
413 with open(listuci, 'w') as f :
414 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
416 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
417 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
419 duces = dict([[uce, i] for i, uce in enumerate(uces)])
420 with open(outfile + '~', 'w+') as f :
421 for i, lem in enumerate(actives) :
422 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
424 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
426 with open(outfile, 'w') as ffin :
427 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
430 os.remove(outfile + '~')
432 def parse_active(self, gramact, gramsup = None) :
433 log.info('parse actives')
434 for lem in self.lems :
435 if self.lems[lem].gram in gramact :
436 self.lems[lem].act = 1
437 elif gramsup is not None :
438 if self.lems[lem].gram in gramsup :
439 self.lems[lem].act = 2
441 self.lems[lem].act = 0
443 self.lems[lem].act = 2
445 def make_actives_limit(self, limit) :
446 if self.idformes is None :
448 return [lem for lem in self.lems if self.getlemeff(lem) >= limit]
450 def make_actives_nb(self, nbmax, key) :
451 log.info('make_actives_nb : %i - %i' % (nbmax,key))
452 if self.idformes is None :
454 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
455 self.activenb = len(allactives)
456 allactives = sorted(allactives, reverse = True)
457 if len(allactives) <= nbmax :
458 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
459 return [val[1] for val in allactives], allactives[-1][0]
461 effs = [val[0] for val in allactives]
462 if effs.count(effs[nbmax - 1]) > 1 :
463 lim = effs[nbmax - 1] + 1
467 stop = effs.index(lim)
473 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
474 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class table of uce counts to `fileout`.

    actives -- iterable of lemmas
    ucecl   -- sequence of classes, each a collection of uce identifiers
    fileout -- path of the file to write

    Each kept line is 'lemma;n1;n2;...' where nK is the number of uces of
    class K containing the lemma. Lemmas whose counts sum to less than 3
    are dropped. The text is encoded with self.parametres['syscoding'].
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # getlemuces() queries the database: look the lemma up once and
        # reuse the resulting set for every class
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
483 def make_etoiles(self) :
485 for uci in self.ucis :
486 etoiles.update(uci.etoiles[1:] + uci.paras)
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class table of uce counts to `fileout`.

    ucecl   -- sequence of classes, each a collection of uce identifiers
    fileout -- path of the file to write

    One line 'etoile;n1;n2;...' is written per value returned by
    make_etoiles(), where nK is the number of uces of class K that belong
    to the etoile. The text is encoded with self.parametres['syscoding'].
    """
    log.info('etoiles/classes')
    lines = []
    for etoile in self.make_etoiles():
        # the uces of an etoile do not depend on the class: compute once
        etuces = set(self.getucesfrometoile(etoile))
        counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
        lines.append(';'.join([etoile] + counts))
    # the table is complete before the file is opened, so a failure above
    # does not leave a truncated output file behind
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
495 def count_from_list(self, l, d) :
503 def find_segments(self, taille_segment, taille_limite) :
505 for uce in self.getalluces() :
507 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
508 l = [[d[val], val] for val in d if d[val] >= 3]
511 if len(l) > taille_limite :
512 l = l[-taille_limite:]
515 def make_ucecl_from_R(self, filein) :
516 with open(filein, 'rU') as f :
521 line = line.replace('\n', '').replace('"', '').split(';')
522 self.lc.append([int(line[0]) - 1, int(line[1])])
523 classesl = [val[1] for val in self.lc]
525 self.lc = sorted(self.lc, key=itemgetter(1))
526 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
527 self.lc0 = self.lc.pop(0)
531 def __init__(self, iduci, line, paraset = None) :
533 self.etoiles = line.split()
535 if paraset is not None :
536 self.paras = paraset.split()
541 def __init__(self, iduce, idpara, iduci) :
547 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
553 if freq is not None :
559 def __init__(self, parent, forme) :
560 self.formes = {forme.ident : forme.freq}
561 self.gram = forme.gram
562 self.freq = forme.freq
def add_forme(self, forme):
    """Register `forme` under its `ident` in self.formes and add its `freq` to self.freq."""
    eff = forme.freq
    self.formes[forme.ident] = eff
    self.freq += forme.freq
569 def decouperlist(chaine, longueur, longueurOptimale) :
571 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
572 Si on trouve un '$', c'est fini.
573 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
575 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
576 dsep = dict([[val[0],val[1]] for val in separateurs])
577 trouve = False # si on a trouvé un bon séparateur
578 iDecoupe = 0 # indice du caractere ou il faut decouper
580 longueur = min(longueur, len(chaine) - 1)
581 chaineTravail = chaine[:longueur + 1]
583 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
586 indice = chaineTravail.index(u'$')
593 caractere = chaineTravail[nbCar]
594 distance = abs(longueurOptimale - nbCar) + 1
595 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
596 if caractere in dsep :
597 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
598 meilleur[0] = caractere
599 meilleur[1] = dsep[caractere]
604 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
605 meilleur[0] = caractere
606 meilleur[1] = dsep[' ']
613 fin = chaine[iDecoupe + 1:]
614 retour = chaineTravail[:iDecoupe]
615 return len(retour) > 0, retour, fin
616 # si on a rien trouvé
617 return False, chaine, ''
619 def testetoile(line) :
620 return line.startswith(u'****')
623 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split `txt` on whitespace and return the tokens followed by a u'$' item."""
    words = txt.split()
    words.append(u'$')
    return words
628 def prep_txtcharact(txt) :
633 Class for building a corpora
635 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
636 log.info('begin building corpus...')
637 self.lexique = lexique
638 self.expressions = expressions
640 self.corpus = Corpus(self, parametres_corpus)
643 self.lim = parametres_corpus.get('lim', 1000000)
644 self.encoding = parametres_corpus['encoding']
645 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
646 self.corpus.pathout.createdir(parametres_corpus['pathout'])
647 self.corpus.parametres['uuid'] = str(uuid4())
648 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
649 self.corpus.parametres['type'] = 'corpus'
650 if self.corpus.parametres['keep_ponct'] :
651 self.ponctuation_espace = [' ', '']
653 self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
655 self.tolist = self.corpus.parametres.get('tolist', 0)
662 def prep_makeuce(self) :
663 method = self.corpus.parametres.get('ucemethod', 0)
665 self.decouper = decouperlist
666 self.prep_txt = prep_txtlist
667 self.ucesize = self.corpus.parametres.get('ucesize', 40)
669 self.decouper = decoupercharact
670 self.prep_txt = prep_txtcharact
671 self.ucesize = self.corpus.parametres.get('ucesize', 240)
672 log.info('method uce : %s' % method)
676 self.read_corpus(self.infile)
678 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
679 self.time = time() - t1
681 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
682 log.info('time : %f' % (time() - t1))
685 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
686 self.cf = self.conn_f.cursor()
687 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
688 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
690 self.cf = self.conn_f.cursor()
691 self.cf.execute('PRAGMA temp_store=MEMORY;')
692 self.cf.execute('PRAGMA journal_mode=MEMORY;')
693 self.cf.execute('PRAGMA synchronous = OFF;')
694 self.cf.execute('begin')
695 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
696 self.c = self.conn.cursor()
697 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
699 self.c = self.conn.cursor()
700 self.c.execute('PRAGMA temp_store=MEMORY;')
701 self.c.execute('PRAGMA journal_mode=MEMORY;')
702 self.c.execute('PRAGMA synchronous = OFF;')
703 self.c.execute('begin')
706 #commit index and close db
709 self.cf.execute('CREATE INDEX iduces ON uces (id);')
710 self.cf.execute('CREATE INDEX ideff ON eff (id);')
714 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
715 self.ccorpus = self.conn_corpus.cursor()
716 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
717 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
718 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
719 self.conn_corpus.commit()
720 self.ccorpus = self.conn_corpus.cursor()
721 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
722 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
723 self.ccorpus.execute('PRAGMA synchronous = OFF;')
724 self.ccorpus.execute('begin')
726 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
727 self.conn_corpus.commit()
728 self.conn_corpus.close()
729 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill self.cleans with the cleaning callables enabled in the corpus parameters.

    Each optional step is appended when its parameter is true (every
    parameter defaults to 1). The character filter is always appended, after
    self.rule has been set from the 'keep_caract' parameter. The order is:
    lower, firstclean, characters, expressions, apos, tiret.
    """
    before = (('lower', 'dolower'), ('firstclean', 'firstclean'))
    after = (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret'))
    for option, method in before:
        if self.corpus.parametres.get(option, 1):
            self.cleans.append(getattr(self, method))
    self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-")
    self.cleans.append(self.docharact)
    for option, method in after:
        if self.corpus.parametres.get(option, 1):
            self.cleans.append(getattr(self, method))
745 def make_expression(self,txt) :
746 for expression in self.expressions:
747 if expression in txt :
748 txt = txt.replace(expression, self.expressions[expression][0])
751 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by the self.rule character class with one space.

    self.rule is used as the body of a regular-expression character class.
    """
    opening = u"["
    closing = "]+"
    pattern = opening + self.rule + closing
    return re.sub(pattern, ' ', txt)
def doapos(self, txt):
    """Return `txt` with every apostrophe replaced by a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt):
    """Return `txt` with every hyphen replaced by a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt):
    """Normalise a few characters and put spaces around punctuation marks.

    The replacements are applied in sequence; '...' is turned into ' £ '
    before single dots are spaced out, so an ellipsis is not split.
    """
    replacements = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    for old, new in replacements:
        txt = txt.replace(old, new)
    return txt
770 def make_cleans(self, txt) :
771 for clean in self.cleans :
775 def backup_uce(self) :
776 if self.corpus.idformesuces != {} :
777 log.info('backup %i' % len(self.corpus.idformesuces))
778 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
779 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
780 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
781 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
782 self.corpus.idformesuces = {}
785 def backup_corpus(self) :
786 log.info('start backup corpus')
788 for uci in self.corpus.ucis :
789 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
790 for uce in uci.uces :
791 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
792 for forme in self.corpus.formes :
793 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
794 log.info('%f' % (time() - t))
797 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
798 minutes, seconds = divmod(self.time, 60)
799 hours, minutes = divmod(minutes, 60)
800 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
801 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
802 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
803 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
804 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
805 hapaxnb = self.corpus.gethapaxnb()
806 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
807 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
808 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
811 class BuildFromAlceste(BuildCorpus) :
812 #def __init___(self, infile, parametres_corpus) :
813 # BuildCorpus.__init__(self, infile, parametres_corpus)
816 def read_corpus(self, infile) :
819 if self.corpus.parametres['ucimark'] == 0 :
820 self.testuci = testetoile
821 elif self.corpus.parametres['ucimark'] == 1 :
822 self.testuci = testint
827 with codecs.open(infile, 'rU', self.encoding) as f :
829 if self.testuci(line) :
832 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
834 self.corpus.ucis.append(Uci(iduci, line))
836 self.corpus.ucis.append(Uci(iduci, line))
837 elif line.startswith(u'-*') :
839 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
842 self.corpus.ucis[-1].paras.append(line.split()[0])
843 elif line.strip() != '' and iduci != -1 :
846 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
850 def treattxt(self, txt, iduce, idpara, iduci) :
852 #log.debug('ATTENTION CHINOIS -> charactères')
853 #clean_chinois = [self.firstclean, self.dolower, self.make_expression, self.doapos, self.dotiret]
854 #log.debug('ATTENTION CHINOIS -> list(text)')
855 #txt = ' '.join(list(txt))
856 txt = self.make_cleans(txt)#, clean_chinois)
857 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
858 if self.corpus.ucis[-1].paras == [] :
862 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
863 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
870 self.corpus.add_word(word)
871 if self.dlg is not None :
872 if self.limitshow > self.count :
873 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
877 self.limitshow = self.last / 100000
878 log.debug(`iduci`, `idpara`, `iduce`)
879 if self.last > self.lim :
884 def make_uces(self, txt, douce = True, keep_ponct = False) :
885 txt = ' '.join(txt.split())
888 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
896 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
899 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
907 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
909 print 'RESTEE UUCEEEEEEEEEEEEE', uce
913 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
916 #make_uces (decouper)
917 #treat_txt (make_uces)
921 def __init__(self, parent, dlg = None) :
924 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
925 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
926 dial = CorpusPref(parent, parametres)
927 dial.CenterOnParent()
928 dial.txtpath.SetLabel(parent.filename)
929 #dial.repout_choices.SetValue(parametres['pathout'])
930 self.res = dial.ShowModal()
931 if self.res == 5100 :
932 parametres = dial.doparametres()
933 parametres['originalpath'] = parent.filename
934 PathOut().createdir(parametres['pathout'])
935 ReadLexique(self.parent, lang = parametres['lang'])
936 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
937 self.parametres = parametres
def doanalyse(self):
    """Build the corpus from the parent's file and return the builder's `corpus` attribute.

    The builder receives the parent's filename, lexique and expressions,
    the parameters stored on self, and self.dlg as `dlg`.
    """
    parent = self.parent
    builder = BuildFromAlceste(parent.filename, self.parametres, parent.lexique, parent.expressions, dlg = self.dlg)
    return builder.corpus
944 if __name__ == '__main__' :
946 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
947 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)