1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
43 def __init__(self, parent, parametres = {}, read = False) :
45 self.parametres = parametres
47 self.connformes = None
49 self.conncorpus = None
56 self.idformesuces = {}
61 self.pathout = PathOut(dirout = parametres['pathout'])
64 def add_word(self, word) :
65 if word in self.formes :
66 self.formes[word].freq += 1
67 if self.formes[word].ident in self.idformesuces :
68 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
69 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
71 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
73 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
75 if word in self.parent.lexique :
76 gramtype = self.parent.lexique[word][1]
77 lem = self.parent.lexique[word][0]
84 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
85 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
88 """connect corpus to db"""
89 if self.connformes is None :
90 log.info('connexion corpus')
91 self.connuces = sqlite3.connect(self.pathout['uces.db'])
92 self.cuces = self.connuces.cursor()
93 self.connformes = sqlite3.connect(self.pathout['formes.db'])
94 self.cformes = self.connformes.cursor()
95 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
96 self.ccorpus = self.conncorpus.cursor()
97 self.cformes.execute('PRAGMA temp_store=MEMORY;')
98 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
99 self.cformes.execute('PRAGMA synchronous = OFF;')
100 self.cuces.execute('PRAGMA temp_store=MEMORY;')
101 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
102 self.cuces.execute('PRAGMA synchronous = OFF;')
103 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
104 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
105 self.ccorpus.execute('PRAGMA synchronous = OFF;')
107 def read_corpus(self) :
108 log.info('read corpus')
109 self.parametres['syscoding'] = sys.getdefaultencoding()
110 if self.conncorpus is None :
112 res = self.ccorpus.execute('SELECT * FROM etoiles;')
114 self.ucis.append(Uci(row[0], row[1], row[2]))
115 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
117 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
118 res = self.ccorpus.execute('SELECT * FROM formes;')
119 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids stored for one form in the formes database.

    ``wordid`` is either a form (a string, resolved to its identifier
    through ``self.formes``) or the identifier itself.  Every matching row
    of the ``uces`` table holds either a single integer or a
    whitespace-separated list of integers; all of them are flattened into
    one list.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    # Bind the identifier itself.  The former backtick repr is a syntax
    # error on Python 3 and bound the text '12L' for a Python 2 long.
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (wordid,))
    return list(itertools.chain.from_iterable(
        [row[0]] if isinstance(row[0], int) else [int(val) for val in row[0].split()]
        for row in rows))
128 def getformeuceseff(self, formeid) :
129 if isinstance(formeid, basestring) :
130 formeid = self.formes[formeid].ident
131 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
132 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
133 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
134 res = self.cformes.execute(query)
135 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
137 for i, uce in enumerate(uces) :
138 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids stored for any form of ``lem``.

    The keys of ``self.lems[lem].formes`` are used as identifiers in the
    ``uces`` table of the formes database.  Every matching row holds either
    a single integer or a whitespace-separated list of integers.  The order
    of the returned list is unspecified (it comes from a set).
    """
    # str() instead of the backtick repr: backticks are a syntax error on
    # Python 3 and render a Python 2 long as '12L', which is invalid SQL.
    formesid = ', '.join([str(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    rows = self.cformes.execute(query)
    return list(set(itertools.chain.from_iterable(
        [row[0]] if isinstance(row[0], int) else [int(val) for val in row[0].split()]
        for row in rows)))
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces found for ``lem``.

    Each uce id returned by ``getlemuces`` is resolved with
    ``getucefromid`` and its ``uci`` attribute collected.  The order of the
    returned list is unspecified.
    """
    found = set()
    for uceid in self.getlemuces(lem):
        found.add(self.getucefromid(uceid).uci)
    return list(found)
151 def getlemuceseff(self, lem, luces = None) :
152 formesid = ', '.join([`val` for val in self.lems[lem].formes])
153 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
154 res = self.cformes.execute(query)
155 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
156 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
157 res = self.cformes.execute(query)
158 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
160 for i, uce in enumerate(uces) :
161 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Return how many uces of ``self.lc[cluster]`` are also uces of ``lem``."""
    shared = set(self.lc[cluster])
    shared.intersection_update(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of the entry of ``self.lems`` for ``lem``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` for ``formeid``.

    The index is built on first use by calling ``make_idformes``.
    """
    index = self.idformes
    if index is None:
        self.make_idformes()
        index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attributes of all values of ``self.formes``."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
184 return self.ucis[-1].uces[-1].ident + 1
187 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one summed size per element of ``self.ucis``.

    The identifiers of the first and last uce of each uci are used directly
    as slice bounds into the list returned by ``getucesize``.
    """
    ucesize = self.getucesize()
    sizes = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        sizes.append(sum(ucesize[first:last + 1]))
    return sizes
def getucesize(self):
    """Return the number of whitespace-separated tokens of each uce.

    The text is taken from the second column of every row yielded by
    ``getalluces``.
    """
    sizes = []
    for row in self.getalluces():
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces):
    """Return the result of selecting the rows of ``uces`` whose id is in ``uces``.

    The query runs on ``self.cuces``; the value returned is whatever its
    ``execute`` returns (with sqlite3, the cursor itself, to be iterated by
    the caller).
    """
    # str() instead of the backtick repr: backticks are a syntax error on
    # Python 3 and render a Python 2 long as '12L', which is invalid SQL.
    idlist = ', '.join([str(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uce ids of ``word``."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uce ids of ``lem``."""
    uceids = self.getlemuces(lem)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run ``SELECT * FROM uces`` on ``self.cuces`` and return the result."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getucesfrometoile(self, etoile):
    """Return the ids of all uces of the ucis whose ``etoiles`` contain ``etoile``."""
    # The membership test depends only on the uci, so it is evaluated once
    # per uci instead of once per uce.
    return [uce.ident
            for uci in self.ucis if etoile in uci.etoiles
            for uce in uci.uces]
212 def getetoileuces(self) :
213 log.info('get uces etoiles')
216 for uci in self.ucis :
217 etoiles = uci.etoiles[1:]
219 if et in etoileuces :
220 etoileuces[et] += [uce.ident for uce in uci.uces]
222 etoileuces[et] = [uce.ident for uce in uci.uces]
224 for et in uci.paras :
225 if et in etoileuces :
226 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
228 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` for ``uceid``.

    The index is built on first use by calling ``make_iduces``.
    """
    index = self.iduces
    if index is None:
        self.make_iduces()
        index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return the number of values of ``self.formes`` whose ``freq`` equals 1."""
    count = 0
    for word in self.formes.values():
        if word.freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return the number of values of ``self.lems`` whose ``act`` equals ``key``."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
243 # def make_lems(self, lem = True) :
244 # log.info('make lems')
246 # for forme in self.formes :
247 # if self.formes[forme].lem in self.lems :
248 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
249 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
251 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that owns the uce ``uceid``.

    A uce-id to uci-id map is built once and cached in ``self.uceuci``; the
    uci id is then used as an index into ``self.ucis``.
    """
    if self.uceuci is None:
        owners = {}
        for uci in self.ucis:
            for uce in uci.uces:
                owners[uce.ident] = uci.ident
        self.uceuci = owners
    uci_index = self.uceuci[uceid]
    return self.ucis[uci_index].etoiles
257 def make_lems(self, lem = True) :
258 log.info('make lems')
261 for forme in self.formes :
262 if self.formes[forme].lem in self.lems :
263 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
264 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
266 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
268 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each ``ident`` to its value of ``self.formes``."""
    index = {}
    for word in self.formes.values():
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces`` (uce ``ident`` -> uce) unless it already exists."""
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
277 def make_lexitable(self, mineff, etoiles, gram = 0) :
282 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
283 etuces = [[] for et in etoiles]
284 for uci in self.ucis :
285 get = list(set(uci.etoiles).intersection(etoiles))
287 log.info('2 variables sur une ligne')
289 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
290 etuces = [set(val) for val in etuces]
293 deff = self.getlemuceseff(lem)
295 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
296 tab.insert(0, [''] + etoiles)
299 def make_efftype_from_etoiles(self, etoiles) :
301 etuces = [[] for et in etoiles]
302 for uci in self.ucis :
303 get = list(set(uci.etoiles).intersection(etoiles))
305 return '2 variables sur la meme ligne'
307 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
308 etuces = [set(val) for val in etuces]
309 for lem in self.lems :
310 deff = self.getlemuceseff(lem)
312 gram = self.lems[lem].gram
314 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
316 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
317 tabout = [[gram] + dtype[gram] for gram in dtype]
318 tabout.insert(0, [''] + etoiles)
321 def make_uceactsize(self, actives) :
322 res = self.getalluces()
325 deff = self.getlemuceseff(lem)
327 ucesize[uce] = ucesize.get(uce, 0) + 1
330 def make_uc(self, actives, lim1, lim2) :
331 uceactsize = self.make_uceactsize(actives)
337 for uce in [uce for uci in self.ucis for uce in uci.uces] :
338 if uce.para == lastpara :
340 last1 += uceactsize.get(uce.ident,0)
341 uc1[-1].append(uce.ident)
343 uc1.append([uce.ident])
346 last2 += uceactsize.get(uce.ident, 0)
347 uc2[-1].append(uce.ident)
349 uc2.append([uce.ident])
352 last1 = uceactsize.get(uce.ident, 0)
353 last2 = uceactsize.get(uce.ident, 0)
355 uc1.append([uce.ident])
356 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build two uc groupings, write their matrices and their uce/uc lists.

    ``make_uc`` returns the two groupings (lists of lists of uce ids).  Each
    one is passed to ``write_ucmatrix`` together with ``actives`` and its
    output path.  Then one ``uce;uc`` table per grouping is written to
    ``listuce1out`` / ``listuce2out``: a header line followed by one
    ``<uce id>;<group index>`` line per uce.

    Returns the tuple ``(len(uc1), len(uc2))``.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    for groups, listout in ((uc1, listuce1out), (uc2, listuce2out)):
        lines = ['uce;uc']
        # '%s' replaces the backtick repr (a syntax error on Python 3, and
        # it appends 'L' to a Python 2 long).
        lines.extend('%s;%s' % (uce, i) for i, ucl in enumerate(groups) for uce in ucl)
        with open(listout, 'w') as f:
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
372 def write_ucmatrix(self, uc, actives, fileout) :
373 log.info('write uc matrix %s' % fileout)
374 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
377 with open(fileout + '~', 'w+') as f :
378 for i, lem in enumerate(actives) :
379 for uce in self.getlemuces(lem):
380 if (uces_uc[uce], i) not in deja_la :
382 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
383 deja_la[(uces_uc[uce], i)] = 0
385 with open(fileout, 'w') as ffin :
386 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
389 os.remove(fileout + '~')
392 def export_corpus(self, outf) :
393 #outf = 'export_corpus.txt'
395 res = self.getalluces()
399 with open(outf,'w') as f :
401 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
402 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
403 elif self.iduces[uce[0]].uci != actuci :
404 actuci = self.iduces[uce[0]].uci
405 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
406 actpara = self.iduces[uce[0]].para
407 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
410 actpara = self.iduces[uce[0]].para
411 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
412 elif self.iduces[uce[0]].para != actpara :
413 actpara = self.iduces[uce[0]].para
415 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
417 def export_corpus_classes(self, outf, alc = True, lem = False) :
419 for i, lc in enumerate(self.lc) :
422 for uce in self.lc0 :
424 res = self.getalluces()
426 with open(outf, 'w') as f :
429 actuci = self.iduces[uce[0]].uci
431 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
433 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
435 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
436 f.write(etline.encode(self.parametres['syscoding']) + '\n')
437 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
439 def export_classe(self, outf, classe, lem = False) :
440 sts = self.lc[classe - 1]
441 res = self.getconcorde(sts)
443 with open(outf, 'w') as f :
446 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
448 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
449 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
451 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
452 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
454 with open(outfile + '~', 'w+') as f :
455 for i, lem in enumerate(actives) :
456 for uce in sorted(self.getlemuces(lem)) :
458 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
460 with open(outfile, 'w') as ffin :
461 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
464 os.remove(outfile + '~')
466 with open(listuce, 'w') as f :
467 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
469 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
470 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
472 with open(outfile + '~', 'w+') as f :
473 for i, lem in enumerate(actives) :
474 for uci in sorted(self.getlemucis(lem)) :
476 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
478 with open(outfile, 'w') as ffin :
479 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
482 os.remove(outfile + '~')
484 with open(listuci, 'w') as f :
485 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
487 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
488 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
490 duces = dict([[uce, i] for i, uce in enumerate(uces)])
491 with open(outfile + '~', 'w+') as f :
492 for i, lem in enumerate(actives) :
493 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
495 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
497 with open(outfile, 'w') as ffin :
498 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
501 os.remove(outfile + '~')
503 def make_table_with_classe(self, uces, list_act) :
504 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
505 uces = dict([[uce, i] for i, uce in enumerate(uces)])
506 for i, lem in enumerate(list_act) :
507 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
509 table_uce[uces[uce]][i] = 1
510 table_uce.insert(0, list_act)
513 def make_pondtable_with_classe(self, uces, list_act) :
514 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
515 uces = dict([[uce, i] for i, uce in enumerate(uces)])
516 for i, lem in enumerate(list_act) :
517 uceseff = self.getlemuceseff(lem)
518 lemuces = list(set(uceseff.keys()).intersection(uces))
520 table_uce[uces[uce]][i] = uceseff[uce]
521 table_uce.insert(0, list_act)
524 def parse_active(self, gramact, gramsup = None) :
525 log.info('parse actives')
526 for lem in self.lems :
527 if lem.startswith('_') and lem.endswith('_') :
528 self.lems[lem].act = 2
529 elif self.lems[lem].gram in gramact :
530 self.lems[lem].act = 1
531 elif gramsup is not None and self.lems[lem].gram not in gramact:
532 if self.lems[lem].gram in gramsup :
533 self.lems[lem].act = 2
535 self.lems[lem].act = 0
537 self.lems[lem].act = 2
539 def make_actives_limit(self, limit, key = 1) :
540 if self.idformes is None :
542 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
544 def make_actives_nb(self, nbmax, key) :
545 log.info('make_actives_nb : %i - %i' % (nbmax,key))
546 if self.idformes is None :
548 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
549 self.activenb = len(allactives)
550 allactives = sorted(allactives, reverse = True)
551 if self.activenb == 0 :
553 if len(allactives) <= nbmax :
554 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
555 return [val[1] for val in allactives], allactives[-1][0]
557 effs = [val[0] for val in allactives]
558 if effs.count(effs[nbmax - 1]) > 1 :
559 lim = effs[nbmax - 1] + 1
563 stop = effs.index(lim)
570 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
571 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout):
    """Write the lemma-by-class table of uce counts to ``fileout``.

    For each lemma of ``actives`` and each class of ``ucecl`` the number of
    uces of the class that are also uces of the lemma is computed.  Lemmas
    whose counts sum to less than 3 are dropped.  Each remaining lemma gives
    one ``;``-separated line: the lemma followed by its counts.  The text is
    encoded with ``self.parametres['syscoding']`` before being written.
    """
    log.info('formes/classes')
    tab = []
    for lem in actives:
        # One lookup per lemma: getlemuces was previously called again for
        # every class.
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            # str() replaces the backtick repr (removed in Python 3).
            tab.append([lem] + [str(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
580 def make_etoiles(self) :
582 for uci in self.ucis :
583 etoiles.update(uci.etoiles[1:])
586 def make_etoiles_dict(self) :
587 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
589 for etoile in etoiles :
590 et = etoile.split('_')
593 endet = '_'.join(et[1:])
594 if etoile in det[et[0]] :
595 det[et[0]][etoile] += 1
597 det[et[0]][etoile] = 1
602 endet = '_'.join(et[1:])
603 det[et[0]] = {etoile :1}
608 def make_etline(self, listet) :
609 etuces = [[] for et in listet]
610 for uci in self.ucis :
611 get = list(set(uci.etoiles).intersection(listet))
613 return '2 variables sur la meme ligne'
615 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout):
    """Write the etoile-by-class table of uce counts to ``fileout``.

    ``getetoileuces`` gives, per etoile, a list of uce ids; etoiles with a
    single uce id or none are dropped.  Each remaining etoile gives one
    ``;``-separated line: the etoile followed, for each class of ``ucecl``,
    by the number of its uces found in that class.  The text is encoded with
    ``self.parametres['syscoding']`` before being written.
    """
    log.info('etoiles/classes')
    etoileuces = self.getetoileuces()
    # Keep the intermediate dict (same row order as before), but store each
    # uce list as a set once instead of rebuilding it for every class.
    etoileuces = dict([[et, set(etoileuces[et])] for et in etoileuces if len(etoileuces[et]) > 1])
    lines = []
    for et in etoileuces:
        # str() replaces the backtick repr (removed in Python 3).
        counts = [str(len(etoileuces[et].intersection(classe))) for classe in ucecl]
        lines.append(';'.join([et] + counts))
    with open(fileout, 'w') as f:
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
624 #etoiles = self.make_etoiles()
625 #with open(fileout, 'w') as f :
626 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
628 def make_colored_corpus(self) :
630 for i, lc in enumerate(self.lc) :
633 for uce in self.lc0 :
635 color = ['black'] + colors[len(self.lc) - 1]
637 <meta http-equiv="content-Type" content="text/html; charset=%s" />
639 ''' % sys.getdefaultencoding()
640 res = self.getalluces()
645 if self.iduces[uce[0]].uci != actuci :
646 actuci = self.iduces[uce[0]].uci
647 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
648 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
650 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
651 return txt + '\n</body></html>'
653 def count_from_list(self, l, d) :
661 def count_from_list_cl(self, l, d, a, clnb) :
670 def find_segments(self, taille_segment, taille_limite) :
672 for uce in self.getalluces() :
674 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
675 l = [[d[val], val] for val in d if d[val] >= 3]
678 if len(l) > taille_limite :
679 l = l[-taille_limite:]
682 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
684 for uce in self.getconcorde(list_uce) :
686 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
687 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
690 if len(l) > taille_limite :
691 l = l[-taille_limite:]
694 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
696 for b, classe in enumerate(self.lc) :
697 for uce in self.getconcorde(classe) :
700 uce = [self.formes[forme].lem for forme in uce]
701 for taille_segment in range(lenmin,lenmax) :
702 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
703 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
704 with open(fileout, 'w') as f :
705 f.write('\n'.join([';'.join(line) for line in result]))
707 def make_proftype(self, outf) :
709 for lem in self.lems :
710 gram = self.lems[lem].gram
712 res[gram] = [0 for val in self.lc]
713 lemuceeff = self.getlemuceseff(lem)
714 for i, classe in enumerate(self.lc) :
715 concern = set(classe).intersection(lemuceeff.keys())
716 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
717 res = [[gram] + [`val` for val in res[gram]] for gram in res]
719 with open(outf, 'w') as f :
720 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
723 def make_ucecl_from_R(self, filein) :
724 with open(filein, 'rU') as f :
729 line = line.replace('\n', '').replace('"', '').split(';')
730 self.lc.append([int(line[0]) - 1, int(line[1])])
731 classesl = [val[1] for val in self.lc]
733 self.lc = sorted(self.lc, key=itemgetter(1))
734 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
735 self.lc0 = self.lc.pop(0)
738 def get_stat_by_cluster(self, outf, lclasses = None) :
739 log.info('get_stat_by_cluster')
740 if lclasses is None :
743 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
744 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
745 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
746 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
747 sets = [set(cl) for cl in lclasses]
748 for forme in self.formes :
749 formeuceeff = self.getformeuceseff(forme)
750 for i, classe in enumerate(lclasses) :
751 concern = sets[i].intersection(formeuceeff.keys())
753 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
755 if self.formes[forme].freq == 1 :
757 log.info('%f' % (time() - t1))
758 if outf is not None :
759 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
760 with open(outf, 'w') as f :
763 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Return the per-etoile statistics table.

    The uce ids of every etoile are handed to ``get_stat_by_cluster`` as if
    they were classes, and each resulting row is prefixed with its etoile.

    NOTE(review): ``outf`` is accepted but never used, and the table used to
    be computed and then discarded.  It is now returned so callers can use
    it; confirm whether it should also be written to ``outf``.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    return [[etoile] + val for etoile, val in zip(etoiles, stats)]
770 def gethapaxbyet(self, etoiles) :
771 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
773 for uce in hapaxuces :
774 if uce in hucesdict :
778 etuces = [[] for et in etoiles]
779 for uci in self.ucis :
780 get = list(set(uci.etoiles).intersection(etoiles))
782 return '2 variables sur la meme ligne'
784 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
785 etuces = [set(val) for val in etuces]
786 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
788 def gethapaxuces(self) :
789 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
790 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
792 for i,uce in enumerate(hapaxuces) :
793 if uce in hucesdict :
794 hucesdict[uce][0] += 1
795 hucesdict[uce][1].append(hapax[i])
797 hucesdict[uce] = [1,[hapax[i]]]
799 for uce in hucesdict :
800 if hucesdict[uce][0] in huces :
801 huces[hucesdict[uce][0]].append(uce)
803 huces[hucesdict[uce][0]] = [uce]
804 huces = zip(huces, huces.values())
805 huces.sort(reverse=True)
809 for nb in huces[0:4] :
810 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
812 res = self.getconcorde([uce])
814 ucetxt = ' ' + row[1] + ' '
816 for hap in hucesdict[uce][1] :
817 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
818 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
819 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
820 txt += '<p>'+ucetxt+'</p>\n'
824 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms of the corpus to ``fileout``, one per line.

    Each line is tab-separated: form, lemma, ``gram`` and frequency.  Lines
    are sorted in descending order of frequency (ties broken by the other
    fields).  The text is encoded with ``syscoding`` before being written.
    """
    listformes = [[word.freq, forme, word.lem, word.gram] for forme, word in self.formes.items()]
    listformes.sort(reverse = True)
    # str() replaces the backtick repr (removed in Python 3).
    listformes = [forme[1:] + [str(forme[0])] for forme in listformes]
    with open(fileout, 'w') as f:
        f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding))
834 def export_lems(self, fileout, syscoding) :
836 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
838 with open(fileout, 'w') as f :
839 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
845 def __init__(self, corpus) :
846 ucinb = corpus.getucinb()
847 ucisize = corpus.getucisize()
848 ucimean = float(sum(ucisize))/float(ucinb)
849 detoile = corpus.make_etoiles_dict()
852 def __init__(self, iduci, line, paraset = None) :
854 self.etoiles = line.split()
856 if paraset is not None :
857 self.paras = paraset.split()
862 def __init__(self, iduce, idpara, iduci) :
868 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
874 if freq is not None :
880 def __init__(self, parent, forme) :
881 self.formes = {forme.ident : forme.freq}
882 self.gram = forme.gram
883 self.freq = forme.freq
def add_forme(self, forme):
    """Record ``forme`` in ``self.formes`` and add its frequency to ``self.freq``."""
    count = forme.freq
    self.formes[forme.ident] = count
    self.freq += count
890 def decouperlist(chaine, longueur, longueurOptimale) :
892 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
893 Si on trouve un '$', c'est fini.
894 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
896 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
897 dsep = dict([[val[0],val[1]] for val in separateurs])
898 trouve = False # si on a trouvé un bon séparateur
899 iDecoupe = 0 # indice du caractere ou il faut decouper
901 longueur = min(longueur, len(chaine) - 1)
902 chaineTravail = chaine[:longueur + 1]
904 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
907 indice = chaineTravail.index(u'$')
909 iDecoupe = indice - 1
914 caractere = chaineTravail[nbCar]
915 distance = abs(longueurOptimale - nbCar) + 1
916 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
917 if caractere in dsep :
918 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
919 meilleur[0] = caractere
920 meilleur[1] = dsep[caractere]
925 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
927 meilleur[1] = dsep[' ']
934 #if meilleur[0] != ' ' :
935 # fin = chaine[iDecoupe + 1:]
936 # retour = chaineTravail[:iDecoupe]
938 fin = chaine[iDecoupe + 1:]
939 retour = chaineTravail[:iDecoupe + 1]
940 return len(retour) > 0, retour, fin
941 # si on a rien trouvé
942 return False, chaine, ''
944 def testetoile(line) :
945 return line.startswith(u'****')
948 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append a ``$`` token to the result."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
953 def prep_txtcharact(txt) :
958 Class for building a corpus
960 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
961 log.info('begin building corpus...')
962 self.lexique = lexique
963 self.expressions = expressions
965 self.corpus = Corpus(self, parametres_corpus)
968 self.lim = parametres_corpus.get('lim', 1000000)
969 self.encoding = parametres_corpus['encoding']
970 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
971 self.corpus.pathout.createdir(parametres_corpus['pathout'])
972 self.corpus.parametres['uuid'] = str(uuid4())
973 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
974 self.corpus.parametres['type'] = 'corpus'
975 if self.corpus.parametres['keep_ponct'] :
976 self.ponctuation_espace = [' ', '']
978 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
980 self.tolist = self.corpus.parametres.get('tolist', 0)
987 def prep_makeuce(self) :
988 method = self.corpus.parametres.get('ucemethod', 0)
990 self.decouper = decouperlist
991 self.prep_txt = prep_txtlist
992 self.ucesize = self.corpus.parametres.get('ucesize', 40)
994 self.decouper = decoupercharact
995 self.prep_txt = prep_txtcharact
996 self.ucesize = self.corpus.parametres.get('ucesize', 240)
997 log.info('method uce : %s' % method)
1002 self.read_corpus(self.infile)
1003 except Warning, args :
1004 log.info('pas kool %s' % args)
1008 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1009 self.time = time() - t1
1011 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1012 log.info('time : %f' % (time() - t1))
1015 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1016 self.cf = self.conn_f.cursor()
1017 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1018 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1019 self.conn_f.commit()
1020 self.cf = self.conn_f.cursor()
1021 self.cf.execute('PRAGMA temp_store=MEMORY;')
1022 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1023 self.cf.execute('PRAGMA synchronous = OFF;')
1024 self.cf.execute('begin')
1025 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1026 self.c = self.conn.cursor()
1027 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1029 self.c = self.conn.cursor()
1030 self.c.execute('PRAGMA temp_store=MEMORY;')
1031 self.c.execute('PRAGMA journal_mode=MEMORY;')
1032 self.c.execute('PRAGMA synchronous = OFF;')
1033 self.c.execute('begin')
# NOTE(review): the `def` line(s) for this code and several statements
# (original lines 1037, 1041-1043) are not visible in this listing.
1036 #commit index and close db
# Commit the pending inserts into formes.db, then index both tables by id.
1038 self.conn_f.commit()
1039 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1040 self.cf.execute('CREATE INDEX ideff ON eff (id);')
# --- corpus.db: text metadata, text/paragraph/segment links, and forms ---
1044 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1045 self.ccorpus = self.conn_corpus.cursor()
1046 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1047 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1048 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1049 self.conn_corpus.commit()
# Same speed-over-durability PRAGMAs as used for the other databases.
1050 self.ccorpus = self.conn_corpus.cursor()
1051 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1052 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1053 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1054 self.ccorpus.execute('begin')
# Dump the in-memory corpus into the tables created above.
1055 self.backup_corpus()
# Index the link table by text id once all rows are inserted, then finish.
1056 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1057 self.conn_corpus.commit()
1058 self.conn_corpus.close()
1059 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Register the text-cleaning callables selected by the corpus parameters.

    Cleaners are appended to ``self.cleans`` in a fixed order: lower-casing,
    first clean, character filtering, expression replacement, apostrophe
    removal, hyphen removal. Every option except 'charact' defaults to
    enabled; 'charact' must be present in the parameters. When 'charact' is
    set, ``self.rule`` is also initialised from 'keep_caract'.
    """
    options = self.corpus.parametres
    before_charact = (('lower', 'dolower'), ('firstclean', 'firstclean'))
    after_charact = (
        ('expressions', 'make_expression'),
        ('apos', 'doapos'),
        ('tiret', 'dotiret'),
    )

    for option, cleaner in before_charact:
        if options.get(option, 1):
            self.cleans.append(getattr(self, cleaner))

    if options['charact']:
        # Body of a negated character class: everything NOT listed is replaced.
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)

    for option, cleaner in after_charact:
        if options.get(option, 1):
            self.cleans.append(getattr(self, cleaner))
# Cleaning step: replace each known expression found in `txt` by the first
# element of its entry in self.expressions.
# NOTE(review): the line returning the result (original line 1080) is not
# visible in this listing.
1076 def make_expression(self,txt) :
1077 for expression in self.expressions:
# Membership test first so str.replace only runs for expressions present.
1078 if expression in txt :
1079 txt = txt.replace(expression, self.expressions[expression][0])
# Cleaning step registered by buildcleans() when the 'lower' option is set.
# NOTE(review): the body (original line 1083) is not visible in this listing.
1082 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by ``[<self.rule>]+`` with one space.

    ``self.rule`` is the body of a regex character class (set by
    buildcleans()); with its default leading ``^`` the class is negated, so
    runs of characters outside the kept set collapse to a single space.
    """
    pattern = u"".join((u"[", self.rule, u"]+"))
    return re.compile(pattern).sub(' ', txt)
def doapos(self, txt):
    """Return `txt` with every straight apostrophe replaced by a space."""
    apostrophe = u"'"
    return u' '.join(txt.split(apostrophe))
def dotiret(self, txt):
    """Return `txt` with every hyphen replaced by a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
def firstclean(self, txt):
    """Normalise typography and isolate punctuation marks with spaces.

    Substitutions are applied in order. The three-dot ellipsis must be
    handled before the single dot so it becomes the ' £$£ ' marker instead
    of three separate dots.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
# Run `txt` through the cleaning callables registered in self.cleans.
# NOTE(review): the loop body and the return (original lines 1103-1104) are
# not visible in this listing; presumably each cleaner is applied in turn
# and the cleaned text is returned - confirm against the full file.
1101 def make_cleans(self, txt) :
1102 for clean in self.cleans :
def backup_uce(self):
    """Flush the in-memory form/segment counts to the 'uces' and 'eff' tables.

    ``self.corpus.idformesuces`` maps a form id to ``{segment id: count}``.
    Each form yields one row per table: the space-separated segment ids go
    to 'uces' and the matching counts, in the same order, go to 'eff'. The
    mapping is reset afterwards. Nothing is written when it is empty.
    """
    pending = self.corpus.idformesuces
    if pending != {}:
        log.info('backup %i' % len(pending))
        touce = []
        toeff = []
        for forme, counts in pending.items():
            ident = repr(forme)
            # keys() and values() of an unmodified dict iterate in the same
            # order, so both rows stay aligned.
            touce.append((ident, ' '.join(map(repr, counts.keys()))))
            toeff.append((ident, ' '.join(map(repr, counts.values()))))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
        self.corpus.idformesuces = {}
# Write the in-memory corpus (texts, segment links and forms) into the
# tables of corpus.db through the self.ccorpus cursor.
# NOTE(review): original line 1118 is not visible in this listing; it
# presumably sets the start time `t` used by the final log call - confirm.
1116 def backup_corpus(self) :
1117 log.info('start backup corpus')
1119 for uci in self.corpus.ucis :
# One 'etoiles' row per text: id, space-joined etoiles, space-joined paras.
1120 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1121 for uce in uci.uces :
# One 'luces' row per segment; backticks are Python 2 repr(), so the ids are
# passed as strings.
1122 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1123 for forme in self.corpus.formes :
# One 'formes' row per form: id, form text, lemma, grammatical type, frequency.
1124 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
# Elapsed time of the backup, in seconds.
1125 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the final summary statistics in the corpus parameters.

    Fills 'date', 'time' (build duration as 'Hh Mm Ss', taken from
    ``self.time`` in seconds), 'ucinb', 'ucenb', 'occurrences', 'formesnb'
    and 'hapax' (hapax count with its share of forms and of occurrences).

    The hapax percentages are reported as 0 when the corpus has no forms or
    no occurrences, instead of raising ZeroDivisionError.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()

    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    params['formesnb'] = formesnb

    hapaxnb = self.corpus.gethapaxnb()
    occurrences = params['occurrences']
    # Guard both ratios: an empty corpus must not crash the summary.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
# Corpus builder for Alceste-style input files: lines recognised by
# self.testuci start a new text, lines starting with '-*' are handled as
# paragraph marks, and other non-blank lines are text content.
# NOTE(review): many original lines of this class are not visible in this
# listing (the embedded line numbers show the gaps); comments below describe
# only the visible statements and flag assumptions.
1142 class BuildFromAlceste(BuildCorpus) :
# Read `infile` line by line and populate self.corpus.ucis.
# Raises Exception with the messages 'paragrapheOT <n>', 'EmptyText <n>',
# 'TextBeforeTextMark <n>' or 'CorpusEncoding' (conditions partly not visible).
1143 def read_corpus(self, infile) :
1144 if self.dlg is not None :
1145 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' selects the predicate recognising a text-header line.
1148 if self.corpus.parametres['ucimark'] == 0 :
1149 self.testuci = testetoile
1150 elif self.corpus.parametres['ucimark'] == 1 :
1151 self.testuci = testint
1157 with codecs.open(infile, 'r', self.encoding) as f :
1158 for linenb, line in enumerate(f) :
1159 line = line.rstrip('\n\r')
# New text header: the text accumulated so far is flushed for the previous
# text (hence iduci - 1) before the new Uci is appended.
1160 if self.testuci(line) :
1163 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1165 self.corpus.ucis.append(Uci(iduci, line))
# A previous text without any segment is logged and replaced by the new one.
1168 if self.corpus.ucis[-1].uces == [] :
1169 log.info(u'Empty text : %i' % linenb)
1171 self.corpus.ucis.pop()
1172 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog every 10 texts.
1173 if self.dlg is not None :
1174 if not (iduci + 1) % 10 :
1175 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Paragraph mark: pending text is flushed and the first token of the line is
# recorded as a paragraph of the current text.
1176 elif line.startswith(u'-*') :
1179 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1182 self.corpus.ucis[-1].paras.append(line.split()[0])
1184 raise Exception('paragrapheOT %i' % linenb)
# Ordinary content line inside a text (handling line not visible; presumably
# appended to `txt` - confirm).
1185 elif line.strip() != '' and iduci != -1 :
# Flush whatever text remains for the last text.
1187 if txt != [] and iduci != -1 :
1188 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1193 self.corpus.ucis.pop()
1194 log.info(Exception("Empty text %i" % linenb))
1196 raise Exception('EmptyText %i' % linenb)
1197 if iduci != -1 and iduce != -1:
1200 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1201 raise Exception('TextBeforeTextMark %i' % linenb)
# A decoding failure is re-raised as a generic 'CorpusEncoding' exception.
1202 except UnicodeDecodeError :
1203 raise Exception("CorpusEncoding")
# Clean the accumulated lines `txt`, cut them into segments, register each
# segment and its words, and return the updated (iduce, idpara) counters.
1205 def treattxt(self, txt, iduce, idpara, iduci) :
# Method 2 with segmentation enabled: the lines are joined with a sentinel so
# they can be split back into segments after cleaning.
1206 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1207 txt = 'laphrasepoursplitter'.join(txt)
1208 txt = self.make_cleans(txt)
1209 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1210 ucetxt = txt.split('laphrasepoursplitter')
# Other case (branch line not visible): clean, then let make_uces() cut.
1213 txt = self.make_cleans(txt)
1214 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1215 if self.corpus.ucis[-1].paras == [] :
# Register the segment on the current text and store its text in uces.db.
1219 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1220 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1221 if not self.tolist :
1227 self.corpus.add_word(word)
1228 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): the action taken when self.last exceeds self.lim is not
# visible here (presumably a flush through backup_uce - confirm).
1229 if self.last > self.lim :
1232 return iduce, idpara
# Cut `txt` into segments with self.decouper, dropping the tokens listed in
# self.ponctuation_espace; the last visible line returns the whole text as a
# single-element list.
# NOTE(review): the loop and branch lines are not visible in this listing;
# `keep_ponct` is not used in any visible line.
1234 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1235 txt = ' '.join(txt.split())
# Arguments passed to decouper: prepared text, self.ucesize + 15, self.ucesize.
1238 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1240 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1243 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1244 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1249 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1251 #decouper (list_sep)
1252 #make_uces (decouper)
1253 #treat_txt (make_uces)
# Build the corpus parameters interactively: load the saved 'corpus' options,
# show the CorpusPref dialog and, when it is accepted, load the lexique and
# the expression dictionary for the chosen language.
# NOTE(review): the class header and several lines (original 1259, 1274,
# 1277, 1279-1281) are not visible in this listing.
1257 def __init__(self, parent, dlg = None) :
1258 self.parent = parent
# Saved defaults from the user's corpus.cfg.
1260 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1261 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1262 dial = CorpusPref(parent, parametres)
1263 dial.CenterOnParent()
1264 dial.txtpath.SetLabel(parent.filename)
1265 #dial.repout_choices.SetValue(parametres['pathout'])
1266 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the dialog's OK return code (wx.ID_OK) -
# confirm.
1267 if self.res == 5100 :
1268 parametres = dial.doparametres()
1269 parametres['originalpath'] = parent.filename
1270 PathOut().createdir(parametres['pathout'])
1271 ReadLexique(self.parent, lang = parametres['lang'])
# Load the expression dictionary when one exists for the language; the
# DictPath lookup falls back to the literal 'french_exp' when the key is
# missing.
1272 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1273 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
# Otherwise (branch line not visible) no expressions are used.
1275 self.parent.expressions = {}
1276 self.parametres = parametres
1278 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return the Corpus object."""
    builder = BuildFromAlceste(
        self.parent.filename,
        self.parametres,
        self.parent.lexique,
        self.parent.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
# Script entry point for a manual run.
# NOTE(review): `infile` and `encoding` are defined on a line not visible in
# this listing (original line 1287).
1286 if __name__ == '__main__' :
# Minimal parameter set: database file names and encodings.
1288 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1289 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)