1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
8 from functions import decoupercharact, ReadDicoAsDico, DoConf
14 from operator import itemgetter
15 from uuid import uuid4
16 from chemins import PathOut
17 from dialog import CorpusPref
18 from functions import ReadLexique, ReadDicoAsDico
22 log = logging.getLogger('iramuteq.corpus')
25 def copycorpus(corpus) :
26 log.info('copy corpus')
27 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
28 copy_corpus.ucis = corpus.ucis
29 copy_corpus.formes = corpus.formes
30 copy_corpus.pathout = corpus.pathout
31 copy_corpus.conn_all()
41 def __init__(self, parent, parametres = {}, read = False) :
43 self.parametres = parametres
45 self.connformes = None
47 self.conncorpus = None
54 self.idformesuces = {}
59 self.pathout = PathOut(dirout = parametres['pathout'])
62 def add_word(self, word) :
63 if word in self.formes :
64 self.formes[word].freq += 1
65 if self.formes[word].ident in self.idformesuces :
66 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
67 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
69 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
71 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
73 if word in self.parent.lexique :
74 gramtype = self.parent.lexique[word][1]
75 lem = self.parent.lexique[word][0]
82 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
86 """connect corpus to db"""
87 if self.connformes is None :
88 log.info('connexion corpus')
89 self.connuces = sqlite3.connect(self.pathout['uces.db'])
90 self.cuces = self.connuces.cursor()
91 self.connformes = sqlite3.connect(self.pathout['formes.db'])
92 self.cformes = self.connformes.cursor()
93 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
94 self.ccorpus = self.conncorpus.cursor()
95 self.cformes.execute('PRAGMA temp_store=MEMORY;')
96 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
97 self.cformes.execute('PRAGMA synchronous = OFF;')
98 self.cuces.execute('PRAGMA temp_store=MEMORY;')
99 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
100 self.cuces.execute('PRAGMA synchronous = OFF;')
101 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
102 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
103 self.ccorpus.execute('PRAGMA synchronous = OFF;')
105 def read_corpus(self) :
106 log.info('read corpus')
107 self.parametres['syscoding'] = sys.getdefaultencoding()
108 if self.conncorpus is None :
110 res = self.ccorpus.execute('SELECT * FROM etoiles;')
112 self.ucis.append(Uci(row[0], row[1], row[2]))
113 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
115 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
116 res = self.ccorpus.execute('SELECT * FROM formes;')
117 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids stored in the formes database for one form.

    wordid is either the identifier of a form or the form itself (a
    string); a string is first resolved to its identifier through
    self.formes.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            # the cell already holds a single id
            found.append(cell)
        else:
            # otherwise the cell is a whitespace-separated list of ids
            found.extend([int(val) for val in cell.split()])
    return found
def getlemuces(self, lem):
    """Return the ids of the uces holding at least one form of lem.

    The result has no duplicates; its order is that of a set.
    """
    idlist = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist
    collected = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            collected.append(cell)
        else:
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem):
    """Return the distinct uci values of the uces in which lem occurs."""
    owners = []
    for uceid in self.getlemuces(lem):
        owners.append(self.getucefromid(uceid).uci)
    return list(set(owners))
136 def getlemuceseff(self, lem) :
137 formesid = ', '.join([`val` for val in self.lems[lem].formes])
138 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
139 res = self.cformes.execute(query)
140 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
141 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
142 res = self.cformes.execute(query)
143 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 for i, uce in enumerate(uces) :
146 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the freq attribute of the entry registered under lem in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the form whose identifier is formeid.

    The identifier index (self.idformes) is built on first use.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the freq of every form in self.formes."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
166 return self.ucis[-1].uces[-1].ident + 1
169 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The sizes come from getucesize(); each uci takes the slice running
    from the ident of its first uce to the ident of its last uce, so
    uce idents are used as positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:(last + 1)]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of each row given by getalluces()."""
    sizes = []
    for row in self.getalluces():
        # row[1] is the text column of the row
        sizes.append(len(row[1].split()))
    return sizes
179 # def getlemseff(self) :
180 # if self.idformes is None :
181 # self.make_idformes()
182 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
184 # def getlemsefftype(self) :
185 # if self.idformes is None :
186 # self.make_idformes()
187 # if self.lems is None :
189 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Query the uces database for the rows whose id is listed in uces.

    Returns what the cursor's execute() returns.
    """
    idlist = ', '.join([repr(i) for i in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces given by getworduces(word)."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces given by getlemuces(lem)."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Select every row of the uces table and return what execute() returns."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile):
    """Return the ident of every uce of the ucis whose etoiles contain etoile."""
    selected = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                selected.append(uce.ident)
    return selected
def getucefromid(self, uceid):
    """Return the uce whose ident is uceid.

    The ident index (self.iduces) is built on first use.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of forms whose freq equals 1."""
    count = 0
    for word in self.formes.values():
        if word.freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of lems whose act attribute equals key."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
215 # def make_lems(self, lem = True) :
216 # log.info('make lems')
218 # for forme in self.formes :
219 # if self.formes[forme].lem in self.lems :
220 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
221 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
223 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the etoiles of the uci that owns the uce uceid.

    The uce ident -> uci ident mapping (self.uceuci) is built on first
    use; the uci ident is then used as a position in self.ucis.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
229 def make_lems(self, lem = True) :
230 log.info('make lems')
233 for forme in self.formes :
234 if self.formes[forme].lem in self.lems :
235 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
236 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
238 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
240 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build self.idformes, mapping each form's ident to the form object."""
    index = {}
    for word in self.formes.values():
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build self.iduces (uce ident -> uce object) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
249 def make_lexitable(self, mineff, etoiles) :
250 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
251 etuces = [[] for et in etoiles]
252 for uci in self.ucis :
253 get = list(set(uci.etoiles).intersection(etoiles))
255 return '2 variables sur la meme ligne'
257 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
258 etuces = [set(val) for val in etuces]
261 deff = self.getlemuceseff(lem)
263 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
264 tab.insert(0, [''] + etoiles)
267 def make_efftype_from_etoiles(self, etoiles) :
269 etuces = [[] for et in etoiles]
270 for uci in self.ucis :
271 get = list(set(uci.etoiles).intersection(etoiles))
273 return '2 variables sur la meme ligne'
275 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
276 etuces = [set(val) for val in etuces]
277 for lem in self.lems :
278 deff = self.getlemuceseff(lem)
280 gram = self.lems[lem].gram
282 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
284 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
285 tabout = [[gram] + dtype[gram] for gram in dtype]
286 tabout.insert(0, [''] + etoiles)
289 def make_uceactsize(self, actives) :
290 res = self.getalluces()
293 deff = self.getlemuceseff(lem)
295 ucesize[uce] = ucesize.get(uce, 0) + 1
298 def make_uc(self, actives, lim1, lim2) :
299 uceactsize = self.make_uceactsize(actives)
305 for uce in [uce for uci in self.ucis for uce in uci.uces] :
306 if uce.para == lastpara :
308 last1 += uceactsize.get(uce.ident,0)
309 uc1[-1].append(uce.ident)
311 uc1.append([uce.ident])
314 last2 += uceactsize.get(uce.ident, 0)
315 uc2[-1].append(uce.ident)
317 uc2.append([uce.ident])
320 last1 = uceactsize.get(uce.ident, 0)
321 last2 = uceactsize.get(uce.ident, 0)
323 uc1.append([uce.ident])
324 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings and write their matrices and uce/uc tables.

    make_uc() provides the two groupings, write_ucmatrix() writes each
    of them to uc1out / uc2out, and a 'uce;uc' table associating every
    uce with the position of its uc is written to listuce1out /
    listuce2out. Returns the number of uc in each grouping.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # both tables are prepared before any of the list files is opened
    tables = []
    for grouping in (uc1, uc2):
        rows = ['uce;uc']
        for position, members in enumerate(grouping):
            for uce in members:
                rows.append(';'.join([repr(uce), repr(position)]))
        tables.append('\n'.join(rows))
    for path, content in zip((listuce1out, listuce2out), tables):
        with open(path, 'w') as f:
            f.write(content)
    return len(uc1), len(uc2)
340 def write_ucmatrix(self, uc, actives, fileout) :
341 log.info('write uc matrix %s' % fileout)
342 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
345 with open(fileout + '~', 'w+') as f :
346 for i, lem in enumerate(actives) :
347 for uce in self.getlemuces(lem):
348 if (uces_uc[uce], i) not in deja_la :
350 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
351 deja_la[(uces_uc[uce], i)] = 0
353 with open(fileout, 'w') as ffin :
354 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
357 os.remove(fileout + '~')
360 def export_corpus(self, outf) :
361 #outf = 'export_corpus.txt'
363 res = self.getalluces()
367 with open(outf,'w') as f :
369 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
370 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
371 elif self.iduces[uce[0]].uci != actuci :
372 actuci = self.iduces[uce[0]].uci
373 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
374 actpara = self.iduces[uce[0]].para
375 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
378 actpara = self.iduces[uce[0]].para
379 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
380 elif self.iduces[uce[0]].para != actpara :
381 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
385 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
386 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
388 with open(outfile + '~', 'w+') as f :
389 for i, lem in enumerate(actives) :
390 for uce in sorted(self.getlemuces(lem)) :
392 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
394 with open(outfile, 'w') as ffin :
395 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
398 os.remove(outfile + '~')
400 with open(listuce, 'w') as f :
401 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
403 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
404 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
406 with open(outfile + '~', 'w+') as f :
407 for i, lem in enumerate(actives) :
408 for uci in sorted(self.getlemucis(lem)) :
410 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
412 with open(outfile, 'w') as ffin :
413 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
416 os.remove(outfile + '~')
418 with open(listuci, 'w') as f :
419 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
421 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
422 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
424 duces = dict([[uce, i] for i, uce in enumerate(uces)])
425 with open(outfile + '~', 'w+') as f :
426 for i, lem in enumerate(actives) :
427 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
429 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
431 with open(outfile, 'w') as ffin :
432 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
435 os.remove(outfile + '~')
437 def make_table_with_classe(self, uces, list_act) :
438 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
439 uces = dict([[uce, i] for i, uce in enumerate(uces)])
440 for i, lem in enumerate(list_act) :
441 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
443 table_uce[uces[uce]][i] = 1
444 table_uce.insert(0, list_act)
447 def parse_active(self, gramact, gramsup = None) :
448 log.info('parse actives')
449 for lem in self.lems :
450 if self.lems[lem].gram in gramact :
451 self.lems[lem].act = 1
452 elif gramsup is not None :
453 if self.lems[lem].gram in gramsup :
454 self.lems[lem].act = 2
456 self.lems[lem].act = 0
458 self.lems[lem].act = 2
460 def make_actives_limit(self, limit) :
461 if self.idformes is None :
463 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == 1]
465 def make_actives_nb(self, nbmax, key) :
466 log.info('make_actives_nb : %i - %i' % (nbmax,key))
467 if self.idformes is None :
469 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
470 self.activenb = len(allactives)
471 allactives = sorted(allactives, reverse = True)
472 if len(allactives) <= nbmax :
473 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
474 return [val[1] for val in allactives], allactives[-1][0]
476 effs = [val[0] for val in allactives]
477 if effs.count(effs[nbmax - 1]) > 1 :
478 lim = effs[nbmax - 1] + 1
482 stop = effs.index(lim)
488 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
489 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lem/class contingency table to fileout.

    One ';'-separated line is written per lem of actives: the lem
    followed by the number of its uces found in each class of ucecl.
    Lems whose counts add up to less than 3 are left out. The text is
    encoded with self.parametres['syscoding'].
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # getlemuces() queries the database: run it once per lem rather
        # than once per (lem, class) pair
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
498 def make_etoiles(self) :
500 for uci in self.ucis :
501 etoiles.update(uci.etoiles[1:] + uci.paras)
504 def make_etoiles_dict(self) :
505 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
507 for etoile in etoiles :
508 et = etoile.split('_')
511 if et[1] in det[et[0]] :
512 det[et[0]][et[1]] += 1
514 det[et[0]][et[1]] = 1
519 det[et[0]] = {et[1] :1}
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile/class contingency table to fileout.

    One ';'-separated line is written per etoile given by
    make_etoiles(): the etoile followed by the number of its uces found
    in each class of ucecl. The text is encoded with
    self.parametres['syscoding'].
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f:
        lines = []
        for etoile in etoiles:
            # getucesfrometoile() walks the whole corpus: run it once per
            # etoile rather than once per (etoile, class) pair
            etuces = set(self.getucesfrometoile(etoile))
            counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
531 def count_from_list(self, l, d) :
539 def find_segments(self, taille_segment, taille_limite) :
541 for uce in self.getalluces() :
543 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
544 l = [[d[val], val] for val in d if d[val] >= 3]
547 if len(l) > taille_limite :
548 l = l[-taille_limite:]
551 def make_ucecl_from_R(self, filein) :
552 with open(filein, 'rU') as f :
557 line = line.replace('\n', '').replace('"', '').split(';')
558 self.lc.append([int(line[0]) - 1, int(line[1])])
559 classesl = [val[1] for val in self.lc]
561 self.lc = sorted(self.lc, key=itemgetter(1))
562 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
563 self.lc0 = self.lc.pop(0)
566 def gethapaxbyet(self, etoiles) :
567 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
569 for uce in hapaxuces :
570 if uce in hucesdict :
574 etuces = [[] for et in etoiles]
575 for uci in self.ucis :
576 get = list(set(uci.etoiles).intersection(etoiles))
578 return '2 variables sur la meme ligne'
580 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
581 etuces = [set(val) for val in etuces]
582 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
584 def gethapaxuces(self) :
585 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
586 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
588 for i,uce in enumerate(hapaxuces) :
589 if uce in hucesdict :
590 hucesdict[uce][0] += 1
591 hucesdict[uce][1].append(hapax[i])
593 hucesdict[uce] = [1,[hapax[i]]]
595 for uce in hucesdict :
596 if hucesdict[uce][0] in huces :
597 huces[hucesdict[uce][0]].append(uce)
599 huces[hucesdict[uce][0]] = [uce]
600 huces = zip(huces, huces.values())
601 huces.sort(reverse=True)
605 for nb in huces[0:4] :
606 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
608 res = self.getconcorde([uce])
610 ucetxt = ' ' + row[1] + ' '
612 for hap in hucesdict[uce][1] :
613 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
614 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
615 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
616 txt += '<p>'+ucetxt+'</p>\n'
620 with open('/tmp/testhapxuce.html','w') as f :
625 def __init__(self, corpus) :
626 ucinb = corpus.getucinb()
627 ucisize = corpus.getucisize()
628 ucimean = float(sum(ucisize))/float(ucinb)
629 detoile = corpus.make_etoiles_dict()
633 def __init__(self, iduci, line, paraset = None) :
635 self.etoiles = line.split()
637 if paraset is not None :
638 self.paras = paraset.split()
643 def __init__(self, iduce, idpara, iduci) :
649 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
655 if freq is not None :
661 def __init__(self, parent, forme) :
662 self.formes = {forme.ident : forme.freq}
663 self.gram = forme.gram
664 self.freq = forme.freq
def add_forme(self, forme):
    """Register forme under its ident and add its freq to this lem's freq."""
    ident, freq = forme.ident, forme.freq
    self.formes[ident] = freq
    self.freq = self.freq + freq
671 def decouperlist(chaine, longueur, longueurOptimale) :
673 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
674 Si on trouve un '$', c'est fini.
675 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
677 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
678 dsep = dict([[val[0],val[1]] for val in separateurs])
679 trouve = False # si on a trouvé un bon séparateur
680 iDecoupe = 0 # indice du caractere ou il faut decouper
682 longueur = min(longueur, len(chaine) - 1)
683 chaineTravail = chaine[:longueur + 1]
685 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
688 indice = chaineTravail.index(u'$')
690 iDecoupe = indice - 1
695 caractere = chaineTravail[nbCar]
696 distance = abs(longueurOptimale - nbCar) + 1
697 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
698 if caractere in dsep :
699 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
700 meilleur[0] = caractere
701 meilleur[1] = dsep[caractere]
706 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
708 meilleur[1] = dsep[' ']
715 #if meilleur[0] != ' ' :
716 # fin = chaine[iDecoupe + 1:]
717 # retour = chaineTravail[:iDecoupe]
719 fin = chaine[iDecoupe + 1:]
720 retour = chaineTravail[:iDecoupe + 1]
721 return len(retour) > 0, retour, fin
722 # si on a rien trouvé
723 return False, chaine, ''
725 def testetoile(line) :
726 return line.startswith(u'****')
729 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split txt on whitespace and append the u'$' end marker to the resulting list."""
    words = txt.split()
    words.append(u'$')
    return words
734 def prep_txtcharact(txt) :
739 Class for building a corpus
741 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
742 log.info('begin building corpus...')
743 self.lexique = lexique
744 self.expressions = expressions
746 self.corpus = Corpus(self, parametres_corpus)
749 self.lim = parametres_corpus.get('lim', 1000000)
750 self.encoding = parametres_corpus['encoding']
751 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
752 self.corpus.pathout.createdir(parametres_corpus['pathout'])
753 self.corpus.parametres['uuid'] = str(uuid4())
754 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
755 self.corpus.parametres['type'] = 'corpus'
756 if self.corpus.parametres['keep_ponct'] :
757 self.ponctuation_espace = [' ', '']
759 self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
761 self.tolist = self.corpus.parametres.get('tolist', 0)
768 def prep_makeuce(self) :
769 method = self.corpus.parametres.get('ucemethod', 0)
771 self.decouper = decouperlist
772 self.prep_txt = prep_txtlist
773 self.ucesize = self.corpus.parametres.get('ucesize', 40)
775 self.decouper = decoupercharact
776 self.prep_txt = prep_txtcharact
777 self.ucesize = self.corpus.parametres.get('ucesize', 240)
778 log.info('method uce : %s' % method)
782 self.read_corpus(self.infile)
784 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
785 self.time = time() - t1
787 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
788 log.info('time : %f' % (time() - t1))
791 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
792 self.cf = self.conn_f.cursor()
793 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
794 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
796 self.cf = self.conn_f.cursor()
797 self.cf.execute('PRAGMA temp_store=MEMORY;')
798 self.cf.execute('PRAGMA journal_mode=MEMORY;')
799 self.cf.execute('PRAGMA synchronous = OFF;')
800 self.cf.execute('begin')
801 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
802 self.c = self.conn.cursor()
803 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
805 self.c = self.conn.cursor()
806 self.c.execute('PRAGMA temp_store=MEMORY;')
807 self.c.execute('PRAGMA journal_mode=MEMORY;')
808 self.c.execute('PRAGMA synchronous = OFF;')
809 self.c.execute('begin')
812 #commit index and close db
815 self.cf.execute('CREATE INDEX iduces ON uces (id);')
816 self.cf.execute('CREATE INDEX ideff ON eff (id);')
820 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
821 self.ccorpus = self.conn_corpus.cursor()
822 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
823 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
824 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
825 self.conn_corpus.commit()
826 self.ccorpus = self.conn_corpus.cursor()
827 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
828 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
829 self.ccorpus.execute('PRAGMA synchronous = OFF;')
830 self.ccorpus.execute('begin')
832 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
833 self.conn_corpus.commit()
834 self.conn_corpus.close()
835 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Append the enabled cleaning steps to self.cleans, in order.

    Every optional step is enabled unless its parameter is set to a
    false value; docharact is always appended, and self.rule receives
    the character class taken from 'keep_caract' (or its default).
    """
    params = self.corpus.parametres
    register = self.cleans.append
    if params.get('lower', 1):
        register(self.dolower)
    if params.get('firstclean', 1):
        register(self.firstclean)
    self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-")
    register(self.docharact)
    # the remaining optional steps, in the order they are appended
    optional = (
        ('expressions', self.make_expression),
        ('apos', self.doapos),
        ('tiret', self.dotiret),
    )
    for option, step in optional:
        if params.get(option, 1):
            register(step)
851 def make_expression(self,txt) :
852 for expression in self.expressions:
853 if expression in txt :
854 txt = txt.replace(expression, self.expressions[expression][0])
857 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matching the class built from self.rule by one space."""
    #rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return txt with every apostrophe turned into a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt):
    """Return txt with every hyphen turned into a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt):
    """Normalise the typographic apostrophe and the oe ligature, then space out punctuation.

    '...' becomes u' £ ' first, so that its dots are not treated as
    three separate full stops by the later '.' replacement.
    """
    txt = txt.replace(u'’',"'")
    txt = txt.replace(u'œ', u'oe')
    substitutions = (
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    for mark, spaced in substitutions:
        txt = txt.replace(mark, spaced)
    return txt
876 def make_cleans(self, txt) :
877 for clean in self.cleans :
881 def backup_uce(self) :
882 if self.corpus.idformesuces != {} :
883 log.info('backup %i' % len(self.corpus.idformesuces))
884 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
885 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
886 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
887 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
888 self.corpus.idformesuces = {}
891 def backup_corpus(self) :
892 log.info('start backup corpus')
894 for uci in self.corpus.ucis :
895 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
896 for uce in uci.uces :
897 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
898 for forme in self.corpus.formes :
899 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
900 log.info('%f' % (time() - t))
903 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
904 minutes, seconds = divmod(self.time, 60)
905 hours, minutes = divmod(minutes, 60)
906 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
907 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
908 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
909 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
910 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
911 hapaxnb = self.corpus.gethapaxnb()
912 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
913 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
914 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
917 class BuildFromAlceste(BuildCorpus) :
918 #def __init___(self, infile, parametres_corpus) :
919 # BuildCorpus.__init__(self, infile, parametres_corpus)
922 def read_corpus(self, infile) :
925 if self.corpus.parametres['ucimark'] == 0 :
926 self.testuci = testetoile
927 elif self.corpus.parametres['ucimark'] == 1 :
928 self.testuci = testint
933 with codecs.open(infile, 'rU', self.encoding) as f :
935 if self.testuci(line) :
938 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
940 self.corpus.ucis.append(Uci(iduci, line))
942 self.corpus.ucis.append(Uci(iduci, line))
943 elif line.startswith(u'-*') :
945 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
948 self.corpus.ucis[-1].paras.append(line.split()[0])
949 elif line.strip() != '' and iduci != -1 :
952 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
956 def treattxt(self, txt, iduce, idpara, iduci) :
957 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
958 txt = 'laphrasepoursplitter'.join(txt)
959 txt = self.make_cleans(txt)
960 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
961 ucetxt = txt.split('laphrasepoursplitter')
964 txt = self.make_cleans(txt)
965 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
966 if self.corpus.ucis[-1].paras == [] :
970 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
971 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
978 self.corpus.add_word(word)
979 if self.dlg is not None :
980 if self.limitshow > self.count :
981 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
985 self.limitshow = self.last / 100000
986 log.debug(`iduci`, `idpara`, `iduce`)
987 if self.last > self.lim :
992 def make_uces(self, txt, douce = True, keep_ponct = False) :
993 txt = ' '.join(txt.split())
996 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1004 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1007 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1015 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1017 print 'RESTEE UUCEEEEEEEEEEEEE', uce
1021 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1023 #decouper (list_sep)
1024 #make_uces (decouper)
1025 #treat_txt (make_uces)
1029 def __init__(self, parent, dlg = None) :
1030 self.parent = parent
1032 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1033 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1034 dial = CorpusPref(parent, parametres)
1035 dial.CenterOnParent()
1036 dial.txtpath.SetLabel(parent.filename)
1037 #dial.repout_choices.SetValue(parametres['pathout'])
1038 self.res = dial.ShowModal()
1039 if self.res == 5100 :
1040 parametres = dial.doparametres()
1041 parametres['originalpath'] = parent.filename
1042 PathOut().createdir(parametres['pathout'])
1043 ReadLexique(self.parent, lang = parametres['lang'])
1044 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1045 self.parametres = parametres
def doanalyse(self):
    """Build the corpus with BuildFromAlceste from the parent's file and return its corpus attribute."""
    builder = BuildFromAlceste(
        self.parent.filename,
        self.parametres,
        self.parent.lexique,
        self.parent.expressions,
        dlg = self.dlg,
    )
    return builder.corpus
1052 if __name__ == '__main__' :
1054 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1055 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)