1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
8 from functions import decoupercharact, ReadDicoAsDico, DoConf
14 from operator import itemgetter
15 from uuid import uuid4
16 from chemins import PathOut
17 from dialog import CorpusPref
18 from functions import ReadLexique, ReadDicoAsDico
22 log = logging.getLogger('iramuteq.corpus')
25 def copycorpus(corpus) :
26 log.info('copy corpus')
27 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
28 copy_corpus.ucis = corpus.ucis
29 copy_corpus.formes = corpus.formes
30 copy_corpus.pathout = corpus.pathout
31 copy_corpus.conn_all()
41 def __init__(self, parent, parametres = {}, read = False) :
43 self.parametres = parametres
45 self.connformes = None
47 self.conncorpus = None
54 self.idformesuces = {}
59 self.pathout = PathOut(dirout = parametres['pathout'])
62 def add_word(self, word) :
63 if word in self.formes :
64 self.formes[word].freq += 1
65 if self.formes[word].ident in self.idformesuces :
66 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
67 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
69 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
71 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
73 if word in self.parent.lexique :
74 gramtype = self.parent.lexique[word][1]
75 lem = self.parent.lexique[word][0]
82 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
83 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
86 """connect corpus to db"""
87 if self.connformes is None :
88 log.info('connexion corpus')
89 self.connuces = sqlite3.connect(self.pathout['uces.db'])
90 self.cuces = self.connuces.cursor()
91 self.connformes = sqlite3.connect(self.pathout['formes.db'])
92 self.cformes = self.connformes.cursor()
93 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
94 self.ccorpus = self.conncorpus.cursor()
95 self.cformes.execute('PRAGMA temp_store=MEMORY;')
96 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
97 self.cformes.execute('PRAGMA synchronous = OFF;')
98 self.cuces.execute('PRAGMA temp_store=MEMORY;')
99 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
100 self.cuces.execute('PRAGMA synchronous = OFF;')
101 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
102 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
103 self.ccorpus.execute('PRAGMA synchronous = OFF;')
105 def read_corpus(self) :
106 log.info('read corpus')
107 self.parametres['syscoding'] = sys.getdefaultencoding()
108 if self.conncorpus is None :
110 res = self.ccorpus.execute('SELECT * FROM etoiles;')
112 self.ucis.append(Uci(row[0], row[1], row[2]))
113 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
115 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
116 res = self.ccorpus.execute('SELECT * FROM formes;')
117 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the ids of the uces in which a form occurs.

    wordid is either the form itself (a string, resolved to its identifier
    through self.formes) or the form identifier directly.

    Each matching row of the 'uces' table holds, in its first column,
    either a whitespace-separated list of uce ids or a single integer.
    The ids are returned as a flat list of ints, in row order.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # repr() is the exact equivalent of the deprecated backtick syntax, so
    # the value bound to the query is unchanged.
    rows = self.cformes.execute(
        'SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uce_ids = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uce_ids.append(cell)
        else:
            uce_ids.extend(int(val) for val in cell.split())
    return uce_ids
def getlemuces(self, lem):
    """Return the distinct ids of the uces containing any form of a lemma.

    The identifiers of the lemma's forms (the keys of
    self.lems[lem].formes) are looked up in the 'uces' table.  Each row's
    first column is either a whitespace-separated list of uce ids or a
    single integer.  The result is a list without duplicates; its order is
    not meaningful.
    """
    # The ids are written into the SQL text, not bound as parameters.
    # NOTE(review): assumes form ids are internal integers - confirm.
    # repr() is the exact equivalent of the deprecated backtick syntax.
    formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uce_ids = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            uce_ids.add(cell)
        else:
            uce_ids.update(int(val) for val in cell.split())
    return list(uce_ids)
def getlemucis(self, lem):
    """Return the distinct uci values of the uces containing the lemma.

    Every uce id returned by getlemuces() is resolved with getucefromid()
    and the 'uci' attribute of the resulting uce is collected.  The order
    of the returned list is not meaningful.
    """
    uci_ids = set()
    for uce_id in self.getlemuces(lem):
        uci_ids.add(self.getucefromid(uce_id).uci)
    return list(uci_ids)
136 def getlemuceseff(self, lem) :
137 formesid = ', '.join([`val` for val in self.lems[lem].formes])
138 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
139 res = self.cformes.execute(query)
140 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
141 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
142 res = self.cformes.execute(query)
143 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
145 for i, uce in enumerate(uces) :
146 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemeff(self, lem):
    """Return the 'freq' attribute stored for the lemma lem in self.lems."""
    lemme = self.lems[lem]
    return lemme.freq
def getforme(self, formeid):
    """Return the form object registered under the identifier formeid.

    The identifier -> form index (self.idformes) is built on first use by
    calling make_idformes().
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the 'freq' attributes of all forms in self.formes."""
    return sum(word.freq for word in self.formes.values())
def getucemean(self):
    """Return gettotocc() divided by getucenb(), as a float.

    Raises ZeroDivisionError when getucenb() returns 0.
    """
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
166 return self.ucis[-1].uces[-1].ident + 1
169 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return, for each uci, the summed size of its uces.

    The per-uce sizes come from getucesize().  For each uci the slice going
    from the ident of its first uce to the ident of its last uce
    (inclusive) is summed, so uce idents are used as positions in that
    list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return the number of whitespace-separated tokens of every uce.

    Rows are taken from getalluces(); the second column of each row is the
    text that gets split.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
179 # def getlemseff(self) :
180 # if self.idformes is None :
181 # self.make_idformes()
182 # return dict([[lem, sum([self.idformes[forme].freq for forme in self.lems[lem]])] for lem in self.lems])
184 # def getlemsefftype(self) :
185 # if self.idformes is None :
186 # self.make_idformes()
187 # if self.lems is None :
189 # return dict([[lem, [sum([self.idformes[forme].freq for forme in self.lems[lem]]), '', self.idformes[self.lems[lem].keys()[0]].gram]] for lem in self.lems])
def getconcorde(self, uces):
    """Select the rows of the 'uces' table whose id is in uces.

    Returns the result of self.cuces.execute(), to be iterated by the
    caller.
    """
    # The ids are written into the SQL text, not bound as parameters.
    # NOTE(review): assumes uce ids are internal integers - confirm.
    # repr() is the exact equivalent of the deprecated backtick syntax.
    id_list = ', '.join([repr(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % id_list)
def getwordconcorde(self, word):
    """Return getconcorde() applied to the uces given by getworduces(word)."""
    uce_ids = self.getworduces(word)
    return self.getconcorde(uce_ids)
def getlemconcorde(self, lem):
    """Return getconcorde() applied to the uces given by getlemuces(lem)."""
    uce_ids = self.getlemuces(lem)
    return self.getconcorde(uce_ids)
def getalluces(self):
    """Select every row of the 'uces' table through self.cuces.

    Returns the result of self.cuces.execute(), to be iterated by the
    caller.
    """
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the idents of the uces of every uci that carries etoile.

    A uci is selected when etoile is a member of its 'etoiles' attribute.
    The idents are returned in uci order, then uce order.
    """
    selected = []
    for uci in self.ucis:
        # The membership test depends only on the uci, so it is done once
        # per uci rather than once per uce.
        if etoile in uci.etoiles:
            selected.extend(uce.ident for uce in uci.uces)
    return selected
def getucefromid(self, uceid):
    """Return the uce object registered under the identifier uceid.

    The identifier -> uce index (self.iduces) is built on first use by
    calling make_iduces().
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of forms in self.formes whose 'freq' equals 1."""
    return sum(1 for word in self.formes.values() if word.freq == 1)
def getactivesnb(self, key):
    """Return the number of lemmas in self.lems whose 'act' equals key."""
    return sum(1 for lemme in self.lems.values() if lemme.act == key)
215 # def make_lems(self, lem = True) :
216 # log.info('make lems')
218 # for forme in self.formes :
219 # if self.formes[forme].lem in self.lems :
220 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
221 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
223 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the 'etoiles' of the uci that owns the uce uceid.

    A uce ident -> uci ident mapping is built on first use and kept in
    self.uceuci.  The uci ident is then used as a position in self.ucis.
    """
    if self.uceuci is None:
        owners = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owners[uce.ident] = uci.ident
        self.uceuci = owners
    uci_position = self.uceuci[uceid]
    return self.ucis[uci_position].etoiles
229 def make_lems(self, lem = True) :
230 log.info('make lems')
233 for forme in self.formes :
234 if self.formes[forme].lem in self.lems :
235 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
236 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
238 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
240 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build self.idformes, mapping each form's ident to the form object."""
    by_ident = {}
    for word in self.formes.values():
        by_ident[word.ident] = word
    self.idformes = by_ident
def make_iduces(self):
    """Build self.iduces (uce ident -> uce object) unless it already exists."""
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
249 def make_lexitable(self, mineff, etoiles) :
250 tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
251 etuces = [[] for et in etoiles]
252 for uci in self.ucis :
253 get = list(set(uci.etoiles).intersection(etoiles))
255 return '2 variables sur la meme ligne'
257 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
258 etuces = [set(val) for val in etuces]
261 deff = self.getlemuceseff(lem)
263 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
264 tab.insert(0, [''] + etoiles)
267 def make_efftype_from_etoiles(self, etoiles) :
269 etuces = [[] for et in etoiles]
270 for uci in self.ucis :
271 get = list(set(uci.etoiles).intersection(etoiles))
273 return '2 variables sur la meme ligne'
275 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
276 etuces = [set(val) for val in etuces]
277 for lem in self.lems :
278 deff = self.getlemuceseff(lem)
280 gram = self.lems[lem].gram
282 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
284 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
285 tabout = [[gram] + dtype[gram] for gram in dtype]
286 tabout.insert(0, [''] + etoiles)
289 def make_uceactsize(self, actives) :
290 res = self.getalluces()
293 deff = self.getlemuceseff(lem)
295 ucesize[uce] = ucesize.get(uce, 0) + 1
298 def make_uc(self, actives, lim1, lim2) :
299 uceactsize = self.make_uceactsize(actives)
305 for uce in [uce for uci in self.ucis for uce in uci.uces] :
306 if uce.para == lastpara :
308 last1 += uceactsize.get(uce.ident,0)
309 uc1[-1].append(uce.ident)
311 uc1.append([uce.ident])
314 last2 += uceactsize.get(uce.ident, 0)
315 uc2[-1].append(uce.ident)
317 uc2.append([uce.ident])
320 last1 = uceactsize.get(uce.ident, 0)
321 last2 = uceactsize.get(uce.ident, 0)
323 uc1.append([uce.ident])
324 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings and write their matrices and uce/uc tables.

    make_uc() returns two lists of uc, each uc being a list of uce ids.
    For each grouping, write_ucmatrix() writes the matrix (to uc1out and
    uc2out) and a ';'-separated table with the header 'uce;uc' is written
    (to listuce1out and listuce2out), giving for every uce the index of
    the uc it belongs to.

    Returns the tuple (number of uc in the first grouping, number of uc in
    the second grouping).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both tables are produced the same way, so a single loop handles them.
    # repr() is the exact equivalent of the deprecated backtick syntax.
    for ucs, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        for ucindex, ucl in enumerate(ucs):
            lines.extend(';'.join([repr(uce), repr(ucindex)]) for uce in ucl)
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
340 def write_ucmatrix(self, uc, actives, fileout) :
341 log.info('write uc matrix %s' % fileout)
342 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
345 with open(fileout + '~', 'w+') as f :
346 for i, lem in enumerate(actives) :
347 for uce in self.getlemuces(lem):
348 if (uces_uc[uce], i) not in deja_la :
350 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
351 deja_la[(uces_uc[uce], i)] = 0
353 with open(fileout, 'w') as ffin :
354 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
357 os.remove(fileout + '~')
360 def export_corpus(self, outf) :
361 #outf = 'export_corpus.txt'
363 res = self.getalluces()
367 with open(outf,'w') as f :
369 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
370 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
371 elif self.iduces[uce[0]].uci != actuci :
372 actuci = self.iduces[uce[0]].uci
373 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
374 actpara = self.iduces[uce[0]].para
375 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
378 actpara = self.iduces[uce[0]].para
379 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
380 elif self.iduces[uce[0]].para != actpara :
381 actpara = self.iduces[uce[0]].para
383 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
385 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
386 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
388 with open(outfile + '~', 'w+') as f :
389 for i, lem in enumerate(actives) :
390 for uce in sorted(self.getlemuces(lem)) :
392 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
394 with open(outfile, 'w') as ffin :
395 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
398 os.remove(outfile + '~')
400 with open(listuce, 'w') as f :
401 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
403 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
404 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
406 with open(outfile + '~', 'w+') as f :
407 for i, lem in enumerate(actives) :
408 for uci in sorted(self.getlemucis(lem)) :
410 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
412 with open(outfile, 'w') as ffin :
413 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
416 os.remove(outfile + '~')
418 with open(listuci, 'w') as f :
419 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
421 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
422 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
424 duces = dict([[uce, i] for i, uce in enumerate(uces)])
425 with open(outfile + '~', 'w+') as f :
426 for i, lem in enumerate(actives) :
427 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
429 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
431 with open(outfile, 'w') as ffin :
432 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
435 os.remove(outfile + '~')
437 def make_table_with_classe(self, uces, list_act) :
438 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
439 uces = dict([[uce, i] for i, uce in enumerate(uces)])
440 for i, lem in enumerate(list_act) :
441 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
443 table_uce[uces[uce]][i] = 1
444 table_uce.insert(0, list_act)
447 def parse_active(self, gramact, gramsup = None) :
448 log.info('parse actives')
449 for lem in self.lems :
450 if self.lems[lem].gram in gramact :
451 self.lems[lem].act = 1
452 elif gramsup is not None :
453 if self.lems[lem].gram in gramsup :
454 self.lems[lem].act = 2
456 self.lems[lem].act = 0
458 self.lems[lem].act = 2
460 def make_actives_limit(self, limit) :
461 if self.idformes is None :
463 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == 1]
465 def make_actives_nb(self, nbmax, key) :
466 log.info('make_actives_nb : %i - %i' % (nbmax,key))
467 if self.idformes is None :
469 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
470 self.activenb = len(allactives)
471 allactives = sorted(allactives, reverse = True)
472 if len(allactives) <= nbmax :
473 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
474 return [val[1] for val in allactives], allactives[-1][0]
476 effs = [val[0] for val in allactives]
477 if effs.count(effs[nbmax - 1]) > 1 :
478 lim = effs[nbmax - 1] + 1
482 stop = effs.index(lim)
488 log.info('nb actives = %i - eff min = %i ' % (stop, lim))
489 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma x class contingency table to fileout.

    For every lemma of actives, the number of its uces falling in each
    class of ucecl is counted.  Lemmas whose counts add up to less than 3
    are left out.  Each kept lemma gives one ';'-separated line (lemma,
    then one count per class); the text is encoded with
    self.parametres['syscoding'] before being written.
    """
    log.info('formes/classes')
    lines = []
    for lem in actives:
        # getlemuces() queries the database: fetch once per lemma rather
        # than once per class.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # repr() is the exact equivalent of the deprecated backticks.
            lines.append(';'.join([lem] + [repr(val) for val in counts]))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
498 def make_etoiles(self) :
500 for uci in self.ucis :
501 etoiles.update(uci.etoiles[1:] + uci.paras)
504 def make_etoiles_dict(self) :
505 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
507 for etoile in etoiles :
508 et = etoile.split('_')
511 if et[1] in det[et[0]] :
512 det[et[0]][et[1]] += 1
514 det[et[0]][et[1]] = 1
519 det[et[0]] = {et[1] :1}
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile x class contingency table to fileout.

    For every etoile returned by make_etoiles(), the number of its uces
    falling in each class of ucecl is counted.  Each etoile gives one
    ';'-separated line (etoile, then one count per class); the text is
    encoded with self.parametres['syscoding'] before being written.
    """
    log.info('etoiles/classes')
    etoiles = self.make_etoiles()
    with open(fileout, 'w') as f:
        lines = []
        for etoile in etoiles:
            # The uces of an etoile do not depend on the class: compute
            # them once per etoile rather than once per class.
            etuces = set(self.getucesfrometoile(etoile))
            # repr() is the exact equivalent of the deprecated backticks.
            counts = [repr(len(etuces.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([etoile] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
531 def count_from_list(self, l, d) :
539 def find_segments(self, taille_segment, taille_limite) :
541 for uce in self.getalluces() :
543 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
544 l = [[d[val], val] for val in d if d[val] >= 3]
547 if len(l) > taille_limite :
548 l = l[-taille_limite:]
551 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
553 for uce in self.getconcorde(list_uce) :
555 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
556 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
559 if len(l) > taille_limite :
560 l = l[-taille_limite:]
566 def make_ucecl_from_R(self, filein) :
567 with open(filein, 'rU') as f :
572 line = line.replace('\n', '').replace('"', '').split(';')
573 self.lc.append([int(line[0]) - 1, int(line[1])])
574 classesl = [val[1] for val in self.lc]
576 self.lc = sorted(self.lc, key=itemgetter(1))
577 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
578 self.lc0 = self.lc.pop(0)
581 def gethapaxbyet(self, etoiles) :
582 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
584 for uce in hapaxuces :
585 if uce in hucesdict :
589 etuces = [[] for et in etoiles]
590 for uci in self.ucis :
591 get = list(set(uci.etoiles).intersection(etoiles))
593 return '2 variables sur la meme ligne'
595 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
596 etuces = [set(val) for val in etuces]
597 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
599 def gethapaxuces(self) :
600 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
601 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
603 for i,uce in enumerate(hapaxuces) :
604 if uce in hucesdict :
605 hucesdict[uce][0] += 1
606 hucesdict[uce][1].append(hapax[i])
608 hucesdict[uce] = [1,[hapax[i]]]
610 for uce in hucesdict :
611 if hucesdict[uce][0] in huces :
612 huces[hucesdict[uce][0]].append(uce)
614 huces[hucesdict[uce][0]] = [uce]
615 huces = zip(huces, huces.values())
616 huces.sort(reverse=True)
620 for nb in huces[0:4] :
621 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
623 res = self.getconcorde([uce])
625 ucetxt = ' ' + row[1] + ' '
627 for hap in hucesdict[uce][1] :
628 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
629 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
630 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
631 txt += '<p>'+ucetxt+'</p>\n'
635 with open('/tmp/testhapxuce.html','w') as f :
640 def __init__(self, corpus) :
641 ucinb = corpus.getucinb()
642 ucisize = corpus.getucisize()
643 ucimean = float(sum(ucisize))/float(ucinb)
644 detoile = corpus.make_etoiles_dict()
648 def __init__(self, iduci, line, paraset = None) :
650 self.etoiles = line.split()
652 if paraset is not None :
653 self.paras = paraset.split()
658 def __init__(self, iduce, idpara, iduci) :
664 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
670 if freq is not None :
676 def __init__(self, parent, forme) :
677 self.formes = {forme.ident : forme.freq}
678 self.gram = forme.gram
679 self.freq = forme.freq
def add_forme(self, forme):
    """Register forme under its ident and add its frequency to self.freq."""
    effectif = forme.freq
    self.formes[forme.ident] = effectif
    self.freq += effectif
686 def decouperlist(chaine, longueur, longueurOptimale) :
688 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
689 Si on trouve un '$', c'est fini.
690 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
692 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
693 dsep = dict([[val[0],val[1]] for val in separateurs])
694 trouve = False # si on a trouvé un bon séparateur
695 iDecoupe = 0 # indice du caractere ou il faut decouper
697 longueur = min(longueur, len(chaine) - 1)
698 chaineTravail = chaine[:longueur + 1]
700 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
703 indice = chaineTravail.index(u'$')
705 iDecoupe = indice - 1
710 caractere = chaineTravail[nbCar]
711 distance = abs(longueurOptimale - nbCar) + 1
712 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
713 if caractere in dsep :
714 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
715 meilleur[0] = caractere
716 meilleur[1] = dsep[caractere]
721 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
723 meilleur[1] = dsep[' ']
730 #if meilleur[0] != ' ' :
731 # fin = chaine[iDecoupe + 1:]
732 # retour = chaineTravail[:iDecoupe]
734 fin = chaine[iDecoupe + 1:]
735 retour = chaineTravail[:iDecoupe + 1]
736 return len(retour) > 0, retour, fin
737 # si on a rien trouvé
738 return False, chaine, ''
740 def testetoile(line) :
741 return line.startswith(u'****')
744 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split txt on whitespace and append a u'$' token at the end."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
749 def prep_txtcharact(txt) :
754 Class for building a corpus
756 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
757 log.info('begin building corpus...')
758 self.lexique = lexique
759 self.expressions = expressions
761 self.corpus = Corpus(self, parametres_corpus)
764 self.lim = parametres_corpus.get('lim', 1000000)
765 self.encoding = parametres_corpus['encoding']
766 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
767 self.corpus.pathout.createdir(parametres_corpus['pathout'])
768 self.corpus.parametres['uuid'] = str(uuid4())
769 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
770 self.corpus.parametres['type'] = 'corpus'
771 if self.corpus.parametres['keep_ponct'] :
772 self.ponctuation_espace = [' ', '']
774 self.ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':','']
776 self.tolist = self.corpus.parametres.get('tolist', 0)
783 def prep_makeuce(self) :
784 method = self.corpus.parametres.get('ucemethod', 0)
786 self.decouper = decouperlist
787 self.prep_txt = prep_txtlist
788 self.ucesize = self.corpus.parametres.get('ucesize', 40)
790 self.decouper = decoupercharact
791 self.prep_txt = prep_txtcharact
792 self.ucesize = self.corpus.parametres.get('ucesize', 240)
793 log.info('method uce : %s' % method)
797 self.read_corpus(self.infile)
799 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
800 self.time = time() - t1
802 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
803 log.info('time : %f' % (time() - t1))
806 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
807 self.cf = self.conn_f.cursor()
808 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
809 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
811 self.cf = self.conn_f.cursor()
812 self.cf.execute('PRAGMA temp_store=MEMORY;')
813 self.cf.execute('PRAGMA journal_mode=MEMORY;')
814 self.cf.execute('PRAGMA synchronous = OFF;')
815 self.cf.execute('begin')
816 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
817 self.c = self.conn.cursor()
818 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
820 self.c = self.conn.cursor()
821 self.c.execute('PRAGMA temp_store=MEMORY;')
822 self.c.execute('PRAGMA journal_mode=MEMORY;')
823 self.c.execute('PRAGMA synchronous = OFF;')
824 self.c.execute('begin')
827 #commit index and close db
830 self.cf.execute('CREATE INDEX iduces ON uces (id);')
831 self.cf.execute('CREATE INDEX ideff ON eff (id);')
835 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
836 self.ccorpus = self.conn_corpus.cursor()
837 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
838 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
839 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
840 self.conn_corpus.commit()
841 self.ccorpus = self.conn_corpus.cursor()
842 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
843 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
844 self.ccorpus.execute('PRAGMA synchronous = OFF;')
845 self.ccorpus.execute('begin')
847 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
848 self.conn_corpus.commit()
849 self.conn_corpus.close()
850 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
852 def buildcleans(self) :
# Fill self.cleans with the text-cleaning callables, in the order in which
# they are appended below. Every option is read from self.corpus.parametres
# with a default of 1, so a step is only left out when its key is present
# and falsy.
# Optional step: self.dolower (key 'lower').
853 if self.corpus.parametres.get('lower', 1) :
854 self.cleans.append(self.dolower)
# Optional step: self.firstclean (key 'firstclean').
855 if self.corpus.parametres.get('firstclean', 1) :
856 self.cleans.append(self.firstclean)
# self.rule is the body of the character class used by docharact(); it comes
# from the 'keep_caract' parameter, with the literal below as default.
# NOTE(review): the indentation of this listing is lost; these two lines are
# presumably unconditional (not part of the 'firstclean' branch) - confirm.
857 self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-")
858 self.cleans.append(self.docharact)
# Optional step: self.make_expression (key 'expressions').
859 if self.corpus.parametres.get('expressions', 1) :
860 self.cleans.append(self.make_expression)
# Optional step: self.doapos (key 'apos').
861 if self.corpus.parametres.get('apos', 1) :
862 self.cleans.append(self.doapos)
# Optional step: self.dotiret (key 'tiret').
863 if self.corpus.parametres.get('tiret', 1):
864 self.cleans.append(self.dotiret)
866 def make_expression(self,txt) :
867 for expression in self.expressions:
868 if expression in txt :
869 txt = txt.replace(expression, self.expressions[expression][0])
872 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by the rule with one space.

    self.rule is the body of a regular-expression character class; it is
    wrapped as u"[" + rule + "]+" and each match in txt is replaced by a
    single space.
    """
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return txt with every apostrophe (') replaced by a space."""
    apostrophe = u'\''
    return txt.replace(apostrophe, u' ')
def dotiret(self, txt):
    """Return txt with every hyphen (-) replaced by a space."""
    tiret = u'-'
    return txt.replace(tiret, u' ')
def firstclean(self, txt):
    """Normalise a few characters and put spaces around punctuation.

    The replacements are applied one after the other, in the order of the
    table below; '...' must be handled before the single '.'.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
    )
    for ancien, nouveau in substitutions:
        txt = txt.replace(ancien, nouveau)
    return txt
891 def make_cleans(self, txt) :
892 for clean in self.cleans :
896 def backup_uce(self) :
897 if self.corpus.idformesuces != {} :
898 log.info('backup %i' % len(self.corpus.idformesuces))
899 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
900 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
901 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
902 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
903 self.corpus.idformesuces = {}
906 def backup_corpus(self) :
907 log.info('start backup corpus')
909 for uci in self.corpus.ucis :
910 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
911 for uce in uci.uces :
912 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
913 for forme in self.corpus.formes :
914 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
915 log.info('%f' % (time() - t))
918 self.corpus.parametres['date'] = datetime.datetime.now().ctime()
919 minutes, seconds = divmod(self.time, 60)
920 hours, minutes = divmod(minutes, 60)
921 self.corpus.parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
922 self.corpus.parametres['ucinb'] = self.corpus.getucinb()
923 self.corpus.parametres['ucenb'] = self.corpus.getucenb()
924 self.corpus.parametres['occurrences'] = self.corpus.gettotocc()
925 self.corpus.parametres['formesnb'] = len(self.corpus.formes)
926 hapaxnb = self.corpus.gethapaxnb()
927 pourhapaxf = (float(hapaxnb) / len(self.corpus.formes)) * 100
928 pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
929 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
932 class BuildFromAlceste(BuildCorpus) :
933 #def __init___(self, infile, parametres_corpus) :
934 # BuildCorpus.__init__(self, infile, parametres_corpus)
937 def read_corpus(self, infile) :
940 if self.corpus.parametres['ucimark'] == 0 :
941 self.testuci = testetoile
942 elif self.corpus.parametres['ucimark'] == 1 :
943 self.testuci = testint
948 with codecs.open(infile, 'rU', self.encoding) as f :
950 if self.testuci(line) :
953 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
955 self.corpus.ucis.append(Uci(iduci, line))
957 self.corpus.ucis.append(Uci(iduci, line))
958 elif line.startswith(u'-*') :
960 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
963 self.corpus.ucis[-1].paras.append(line.split()[0])
964 elif line.strip() != '' and iduci != -1 :
967 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
971 def treattxt(self, txt, iduce, idpara, iduci) :
972 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
973 txt = 'laphrasepoursplitter'.join(txt)
974 txt = self.make_cleans(txt)
975 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
976 ucetxt = txt.split('laphrasepoursplitter')
979 txt = self.make_cleans(txt)
980 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
981 if self.corpus.ucis[-1].paras == [] :
985 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
986 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
993 self.corpus.add_word(word)
994 if self.dlg is not None :
995 if self.limitshow > self.count :
996 self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
1000 self.limitshow = self.last / 100000
1001 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1002 if self.last > self.lim :
1005 return iduce, idpara
1007 def make_uces(self, txt, douce = True, keep_ponct = False) :
1008 txt = ' '.join(txt.split())
1011 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1019 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1022 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1030 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1032 print 'RESTEE UUCEEEEEEEEEEEEE', uce
1036 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1038 #decouper (list_sep)
1039 #make_uces (decouper)
1040 #treat_txt (make_uces)
1044 def __init__(self, parent, dlg = None) :
1045 self.parent = parent
1047 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1048 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1049 dial = CorpusPref(parent, parametres)
1050 dial.CenterOnParent()
1051 dial.txtpath.SetLabel(parent.filename)
1052 #dial.repout_choices.SetValue(parametres['pathout'])
1053 self.res = dial.ShowModal()
1054 if self.res == 5100 :
1055 parametres = dial.doparametres()
1056 parametres['originalpath'] = parent.filename
1057 PathOut().createdir(parametres['pathout'])
1058 ReadLexique(self.parent, lang = parametres['lang'])
1059 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1060 self.parametres = parametres
def doanalyse(self):
    """Build the corpus with BuildFromAlceste and return it.

    The builder receives the parent's filename, lexique and expressions,
    the parameters stored in self.parametres and the dialog self.dlg; its
    'corpus' attribute is returned.
    """
    parent = self.parent
    builder = BuildFromAlceste(
        parent.filename,
        self.parametres,
        parent.lexique,
        parent.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
1067 if __name__ == '__main__' :
1069 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1070 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)