1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from functions import ReadLexique, ReadDicoAsDico
21 from colors import colors
25 log = logging.getLogger('iramuteq.corpus')
28 def copycorpus(corpus) :
29 log.info('copy corpus')
30 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
31 copy_corpus.ucis = corpus.ucis
32 copy_corpus.formes = corpus.formes
33 copy_corpus.pathout = corpus.pathout
34 copy_corpus.conn_all()
44 def __init__(self, parent, parametres = {}, read = False) :
46 self.parametres = parametres
48 self.connformes = None
50 self.conncorpus = None
57 self.idformesuces = {}
62 self.pathout = PathOut(dirout = parametres['pathout'])
65 def add_word(self, word) :
66 if word in self.formes :
67 self.formes[word].freq += 1
68 if self.formes[word].ident in self.idformesuces :
69 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
72 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
74 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
76 if word in self.parent.lexique :
77 gramtype = self.parent.lexique[word][1]
78 lem = self.parent.lexique[word][0]
85 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
86 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
89 """connect corpus to db"""
90 if self.connformes is None :
91 log.info('connexion corpus')
92 self.connuces = sqlite3.connect(self.pathout['uces.db'])
93 self.cuces = self.connuces.cursor()
94 self.connformes = sqlite3.connect(self.pathout['formes.db'])
95 self.cformes = self.connformes.cursor()
96 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
97 self.ccorpus = self.conncorpus.cursor()
98 self.cformes.execute('PRAGMA temp_store=MEMORY;')
99 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
100 self.cformes.execute('PRAGMA synchronous = OFF;')
101 self.cuces.execute('PRAGMA temp_store=MEMORY;')
102 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
103 self.cuces.execute('PRAGMA synchronous = OFF;')
104 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
105 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
106 self.ccorpus.execute('PRAGMA synchronous = OFF;')
108 def read_corpus(self) :
109 log.info('read corpus')
110 self.parametres['syscoding'] = sys.getdefaultencoding()
111 if self.conncorpus is None :
113 res = self.ccorpus.execute('SELECT * FROM etoiles;')
115 self.ucis.append(Uci(row[0], row[1], row[2]))
116 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
118 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
119 res = self.ccorpus.execute('SELECT * FROM formes;')
120 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the uce ids recorded in the formes database for one forme.

    wordid is either a forme identifier or the forme itself as a string; a
    string is first resolved to its identifier through self.formes.
    The ids of every matching row of the 'uces' table are concatenated.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    found = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            # the cell is already a single integer id
            found.append(cell)
        else :
            # otherwise the cell is a whitespace separated list of ids
            found.extend([int(val) for val in cell.split()])
    return found
129 def getformeuceseff(self, formeid) :
130 if isinstance(formeid, basestring) :
131 formeid = self.formes[formeid].ident
132 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
133 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
134 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
135 res = self.cformes.execute(query)
136 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
138 for i, uce in enumerate(uces) :
139 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem) :
    """Return the distinct uce ids recorded for any forme of a lemma.

    The forme identifiers obtained by iterating self.lems[lem].formes are
    looked up in the 'uces' table of the formes database. Duplicates are
    removed through a set, so the order of the returned list is not defined.
    """
    ids = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    rows = self.cformes.execute('SELECT uces FROM uces where id IN (%s) ORDER BY id' % ids)
    collected = []
    for row in rows :
        cell = row[0]
        if isinstance(cell, int) :
            collected.append(cell)
        else :
            # whitespace separated list of ids
            collected.extend([int(val) for val in cell.split()])
    return list(set(collected))
def getlemucis(self, lem) :
    """Return the distinct uci values of the uces in which a lemma occurs.

    Each uce id returned by getlemuces() is resolved with getucefromid() and
    its 'uci' attribute is kept; the order of the result is not defined.
    """
    owners = []
    for uceid in self.getlemuces(lem) :
        owners.append(self.getucefromid(uceid).uci)
    return list(set(owners))
152 def getlemuceseff(self, lem, luces = None) :
153 formesid = ', '.join([`val` for val in self.lems[lem].formes])
154 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
155 res = self.cformes.execute(query)
156 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
157 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
158 res = self.cformes.execute(query)
159 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
161 for i, uce in enumerate(uces) :
162 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster) :
    """Return how many distinct uces of self.lc[cluster] contain the lemma."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem) :
    """Return the frequency stored on the lemma entry (self.lems[lem].freq)."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the entry of self.idformes stored under formeid.

    The index is built on first use by calling make_idformes().
    """
    if self.idformes is None :
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self) :
    """Return the total number of occurrences: the sum of freq over all formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return the mean number of occurrences per uce, as a float."""
    occurrences = float(self.gettotocc())
    ucenb = self.getucenb()
    return occurrences / ucenb
185 return self.ucis[-1].uces[-1].ident + 1
188 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return, for every uci, the summed size of its uces.

    The per-uce sizes come from getucesize(); for each uci they are sliced
    from the ident of its first uce to the ident of its last uce (inclusive).
    NOTE(review): this relies on uce idents being positions in that list.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self) :
    """Return the number of whitespace separated tokens of each row of getalluces().

    The text is read from the second field (index 1) of every row.
    """
    sizes = []
    for row in self.getalluces() :
        sizes.append(len(row[1].split()))
    return sizes
def getconcorde(self, uces) :
    """Query the rows of the 'uces' table whose id is listed in uces.

    The ids are written into the SQL text through repr(); the value returned
    is whatever self.cuces.execute() returns for that query.
    """
    idlist = ', '.join([repr(uce) for uce in uces])
    query = 'select * from uces where id IN (%s);' % idlist
    return self.cuces.execute(query)
def getwordconcorde(self, word) :
    """Return the result of getconcorde() applied to the uces of a word."""
    worduces = self.getworduces(word)
    return self.getconcorde(worduces)
def getlemconcorde(self, lem) :
    """Return the result of getconcorde() applied to the uces of a lemma."""
    lemuces = self.getlemuces(lem)
    return self.getconcorde(lemuces)
def getalluces(self) :
    """Run a query selecting every row of the 'uces' table and return its result."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getucesfrometoile(self, etoile) :
    """Return the idents of the uces of every uci whose etoiles contain etoile."""
    idents = []
    for uci in self.ucis :
        for uce in uci.uces :
            if etoile in uci.etoiles :
                idents.append(uce.ident)
    return idents
213 def getetoileuces(self) :
214 log.info('get uces etoiles')
217 for uci in self.ucis :
218 etoiles = uci.etoiles[1:]
220 if et in etoileuces :
221 etoileuces[et] += [uce.ident for uce in uci.uces]
223 etoileuces[et] = [uce.ident for uce in uci.uces]
225 for et in uci.paras :
226 if et in etoileuces :
227 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
229 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
def getucefromid(self, uceid) :
    """Return the entry of self.iduces stored under uceid.

    The index is built on first use by calling make_iduces().
    """
    if self.iduces is None :
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self) :
    """Return the number of formes whose freq is exactly 1."""
    count = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            count += 1
    return count
def getactivesnb(self, key) :
    """Return the number of lemmas whose 'act' attribute equals key."""
    count = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            count += 1
    return count
244 # def make_lems(self, lem = True) :
245 # log.info('make lems')
247 # for forme in self.formes :
248 # if self.formes[forme].lem in self.lems :
249 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
250 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
252 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the etoiles of the uci that owns the uce uceid.

    self.uceuci (uce ident -> uci ident) is built on first use; the uci ident
    is then used as a position in self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
258 def make_lems(self, lem = True) :
259 log.info('make lems')
262 for forme in self.formes :
263 if self.formes[forme].lem in self.lems :
264 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
265 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
267 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
269 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes, mapping the ident of each forme to its forme object."""
    byid = {}
    for forme in self.formes :
        entry = self.formes[forme]
        byid[entry.ident] = entry
    self.idformes = byid
def make_iduces(self) :
    """Build self.iduces (uce ident -> uce object) unless it is already set."""
    if self.iduces is not None :
        return
    byid = {}
    for uci in self.ucis :
        for uce in uci.uces :
            byid[uce.ident] = uce
    self.iduces = byid
278 def make_lexitable(self, mineff, etoiles, gram = 0) :
283 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
284 etuces = [[] for et in etoiles]
285 for uci in self.ucis :
286 get = list(set(uci.etoiles).intersection(etoiles))
288 log.info('2 variables sur une ligne')
290 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
291 etuces = [set(val) for val in etuces]
294 deff = self.getlemuceseff(lem)
296 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
297 tab.insert(0, [''] + etoiles)
300 def make_efftype_from_etoiles(self, etoiles) :
302 etuces = [[] for et in etoiles]
303 for uci in self.ucis :
304 get = list(set(uci.etoiles).intersection(etoiles))
306 return '2 variables sur la meme ligne'
308 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
309 etuces = [set(val) for val in etuces]
310 for lem in self.lems :
311 deff = self.getlemuceseff(lem)
313 gram = self.lems[lem].gram
315 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
317 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
318 tabout = [[gram] + dtype[gram] for gram in dtype]
319 tabout.insert(0, [''] + etoiles)
322 def make_uceactsize(self, actives) :
323 res = self.getalluces()
326 deff = self.getlemuceseff(lem)
328 ucesize[uce] = ucesize.get(uce, 0) + 1
331 def make_uc(self, actives, lim1, lim2) :
332 uceactsize = self.make_uceactsize(actives)
338 for uce in [uce for uci in self.ucis for uce in uci.uces] :
339 if uce.para == lastpara :
341 last1 += uceactsize.get(uce.ident,0)
342 uc1[-1].append(uce.ident)
344 uc1.append([uce.ident])
347 last2 += uceactsize.get(uce.ident, 0)
348 uc2[-1].append(uce.ident)
350 uc2.append([uce.ident])
353 last1 = uceactsize.get(uce.ident, 0)
354 last2 = uceactsize.get(uce.ident, 0)
356 uc1.append([uce.ident])
357 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build the two uc groupings, write their matrices and their uce/uc listings.

    make_uc() produces two lists of uc (each uc being a list of uce ids).
    Each grouping is written with write_ucmatrix() to uc1out / uc2out, and a
    'uce;uc' listing (one line per uce with the index of its uc) is written to
    listuce1out / listuce2out. Returns the number of uc in each grouping.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    for ucs, matrixout in ((uc1, uc1out), (uc2, uc2out)) :
        self.write_ucmatrix(ucs, actives, matrixout)
    # both listings are prepared before any listing file is opened
    listings = []
    for ucs in (uc1, uc2) :
        lines = ['uce;uc']
        for i, ucl in enumerate(ucs) :
            for uce in ucl :
                lines.append(';'.join([repr(uce), repr(i)]))
        listings.append(lines)
    for listout, lines in zip((listuce1out, listuce2out), listings) :
        with open(listout, 'w') as f :
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
373 def write_ucmatrix(self, uc, actives, fileout) :
374 log.info('write uc matrix %s' % fileout)
375 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
378 with open(fileout + '~', 'w+') as f :
379 for i, lem in enumerate(actives) :
380 for uce in self.getlemuces(lem):
381 if (uces_uc[uce], i) not in deja_la :
383 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
384 deja_la[(uces_uc[uce], i)] = 0
386 with open(fileout, 'w') as ffin :
387 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
390 os.remove(fileout + '~')
393 def export_corpus(self, outf) :
394 #outf = 'export_corpus.txt'
396 res = self.getalluces()
400 with open(outf,'w') as f :
402 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
403 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
404 elif self.iduces[uce[0]].uci != actuci :
405 actuci = self.iduces[uce[0]].uci
406 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
407 actpara = self.iduces[uce[0]].para
408 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
411 actpara = self.iduces[uce[0]].para
412 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
413 elif self.iduces[uce[0]].para != actpara :
414 actpara = self.iduces[uce[0]].para
416 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
418 def export_corpus_classes(self, outf, alc = True, lem = False) :
420 for i, lc in enumerate(self.lc) :
423 for uce in self.lc0 :
425 res = self.getalluces()
427 with open(outf, 'w') as f :
430 actuci = self.iduces[uce[0]].uci
432 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
434 etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
436 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]])
437 f.write(etline.encode(self.parametres['syscoding']) + '\n')
438 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
440 def export_classe(self, outf, classe, lem = False) :
441 sts = self.lc[classe - 1]
442 res = self.getconcorde(sts)
444 with open(outf, 'w') as f :
447 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
449 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
450 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
452 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
453 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
455 with open(outfile + '~', 'w+') as f :
456 for i, lem in enumerate(actives) :
457 for uce in sorted(self.getlemuces(lem)) :
459 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
461 with open(outfile, 'w') as ffin :
462 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
465 os.remove(outfile + '~')
467 with open(listuce, 'w') as f :
468 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
470 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
471 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
473 with open(outfile + '~', 'w+') as f :
474 for i, lem in enumerate(actives) :
475 for uci in sorted(self.getlemucis(lem)) :
477 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
479 with open(outfile, 'w') as ffin :
480 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
483 os.remove(outfile + '~')
485 with open(listuci, 'w') as f :
486 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
488 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
489 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
491 duces = dict([[uce, i] for i, uce in enumerate(uces)])
492 with open(outfile + '~', 'w+') as f :
493 for i, lem in enumerate(actives) :
494 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
496 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
498 with open(outfile, 'w') as ffin :
499 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
502 os.remove(outfile + '~')
504 def make_table_with_classe(self, uces, list_act) :
505 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
506 uces = dict([[uce, i] for i, uce in enumerate(uces)])
507 for i, lem in enumerate(list_act) :
508 lemuces = list(set(self.getlemuces(lem)).intersection(uces))
510 table_uce[uces[uce]][i] = 1
511 table_uce.insert(0, list_act)
514 def make_pondtable_with_classe(self, uces, list_act) :
515 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
516 uces = dict([[uce, i] for i, uce in enumerate(uces)])
517 for i, lem in enumerate(list_act) :
518 uceseff = self.getlemuceseff(lem)
519 lemuces = list(set(uceseff.keys()).intersection(uces))
521 table_uce[uces[uce]][i] = uceseff[uce]
522 table_uce.insert(0, list_act)
525 def parse_active(self, gramact, gramsup = None) :
526 log.info('parse actives')
527 for lem in self.lems :
528 if lem.startswith('_') and lem.endswith('_') :
529 self.lems[lem].act = 2
530 elif self.lems[lem].gram in gramact :
531 self.lems[lem].act = 1
532 elif gramsup is not None and self.lems[lem].gram not in gramact:
533 if self.lems[lem].gram in gramsup :
534 self.lems[lem].act = 2
536 self.lems[lem].act = 0
538 self.lems[lem].act = 2
540 def make_actives_limit(self, limit, key = 1) :
541 if self.idformes is None :
543 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
545 def make_actives_nb(self, nbmax, key) :
546 log.info('make_actives_nb : %i - %i' % (nbmax,key))
547 if self.idformes is None :
549 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
550 self.activenb = len(allactives)
551 allactives = sorted(allactives, reverse = True)
552 if len(allactives) <= nbmax :
553 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
554 return [val[1] for val in allactives], allactives[-1][0]
556 effs = [val[0] for val in allactives]
557 if effs.count(effs[nbmax - 1]) > 1 :
558 lim = effs[nbmax - 1] + 1
562 stop = effs.index(lim)
569 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
570 return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
    """Write the lemma x class profile table to fileout.

    For each lemma of actives, counts in how many uces of every class of
    ucecl the lemma occurs. Lemmas whose counts sum to less than 3 are
    dropped. Each kept lemma gives one 'lem;n1;n2;...' line; the text is
    encoded with self.parametres['syscoding'] before being written.
    """
    log.info('formes/classes')
    tab = []
    for lem in actives :
        # the uces of a lemma do not depend on the class: query them once per
        # lemma instead of once per (lemma, class) pair
        lemuces = set(self.getlemuces(lem))
        counts = [len(lemuces.intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3 :
            tab.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f :
        f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
579 def make_etoiles(self) :
581 for uci in self.ucis :
582 etoiles.update(uci.etoiles[1:])
585 def make_etoiles_dict(self) :
586 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
588 for etoile in etoiles :
589 et = etoile.split('_')
592 endet = '_'.join(et[1:])
593 if etoile in det[et[0]] :
594 det[et[0]][etoile] += 1
596 det[et[0]][etoile] = 1
601 endet = '_'.join(et[1:])
602 det[et[0]] = {etoile :1}
607 def make_etline(self, listet) :
608 etuces = [[] for et in listet]
609 for uci in self.ucis :
610 get = list(set(uci.etoiles).intersection(listet))
612 return '2 variables sur la meme ligne'
614 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
def make_and_write_profile_et(self, ucecl, fileout) :
    """Write the etoile x class profile table to fileout.

    Etoiles come from getetoileuces(); those with a single uce or none are
    dropped. Each kept etoile gives one 'et;n1;n2;...' line where n is the
    number of its uces found in the matching class of ucecl. The text is
    encoded with self.parametres['syscoding'] before being written.
    """
    log.info('etoiles/classes')
    alluces = self.getetoileuces()
    etoileuces = dict([[et, alluces[et]] for et in alluces if len(alluces[et]) > 1])
    with open(fileout, 'w') as f :
        lines = []
        for et in etoileuces :
            etset = set(etoileuces[et])
            counts = [repr(len(etset.intersection(classe))) for classe in ucecl]
            lines.append(';'.join([et] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
623 #etoiles = self.make_etoiles()
624 #with open(fileout, 'w') as f :
625 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
627 def make_colored_corpus(self) :
629 for i, lc in enumerate(self.lc) :
632 for uce in self.lc0 :
634 color = ['black'] + colors[len(self.lc) - 1]
636 <meta http-equiv="content-Type" content="text/html; charset=%s" />
638 ''' % sys.getdefaultencoding()
639 res = self.getalluces()
644 if self.iduces[uce[0]].uci != actuci :
645 actuci = self.iduces[uce[0]].uci
646 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
647 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
649 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
650 return txt + '\n</body></html>'
652 def count_from_list(self, l, d) :
660 def count_from_list_cl(self, l, d, a, clnb) :
669 def find_segments(self, taille_segment, taille_limite) :
671 for uce in self.getalluces() :
673 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
674 l = [[d[val], val] for val in d if d[val] >= 3]
677 if len(l) > taille_limite :
678 l = l[-taille_limite:]
681 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
683 for uce in self.getconcorde(list_uce) :
685 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
686 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
689 if len(l) > taille_limite :
690 l = l[-taille_limite:]
693 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
695 for b, classe in enumerate(self.lc) :
696 for uce in self.getconcorde(classe) :
699 uce = [self.formes[forme].lem for forme in uce]
700 for taille_segment in range(lenmin,lenmax) :
701 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
702 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
703 with open(fileout, 'w') as f :
704 f.write('\n'.join([';'.join(line) for line in result]))
706 def make_proftype(self, outf) :
708 for lem in self.lems :
709 gram = self.lems[lem].gram
711 res[gram] = [0 for val in self.lc]
712 lemuceeff = self.getlemuceseff(lem)
713 for i, classe in enumerate(self.lc) :
714 concern = set(classe).intersection(lemuceeff.keys())
715 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
716 res = [[gram] + [`val` for val in res[gram]] for gram in res]
718 with open(outf, 'w') as f :
719 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
722 def make_ucecl_from_R(self, filein) :
723 with open(filein, 'rU') as f :
728 line = line.replace('\n', '').replace('"', '').split(';')
729 self.lc.append([int(line[0]) - 1, int(line[1])])
730 classesl = [val[1] for val in self.lc]
732 self.lc = sorted(self.lc, key=itemgetter(1))
733 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
734 self.lc0 = self.lc.pop(0)
737 def get_stat_by_cluster(self, outf, lclasses = None) :
738 log.info('get_stat_by_cluster')
739 if lclasses is None :
742 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
743 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
744 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
745 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
746 sets = [set(cl) for cl in lclasses]
747 for forme in self.formes :
748 formeuceeff = self.getformeuceseff(forme)
749 for i, classe in enumerate(lclasses) :
750 concern = sets[i].intersection(formeuceeff.keys())
752 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
754 if self.formes[forme].freq == 1 :
756 log.info('%f' % (time() - t1))
757 if outf is not None :
758 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
759 with open(outf, 'w') as f :
762 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
def get_stat_by_et(self, outf, etoiles) :
    """Return the statistics of get_stat_by_cluster() computed per etoile.

    The uces of each etoile (getucesfrometoile) are used as one class; every
    row returned by get_stat_by_cluster() is prefixed with its etoile.

    outf is kept for interface compatibility: nothing is written to it here.
    The table used to be computed and then discarded; it is now returned so
    that callers can actually use it.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    return [[etoiles[i]] + val for i, val in enumerate(stats)]
769 def gethapaxbyet(self, etoiles) :
770 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
772 for uce in hapaxuces :
773 if uce in hucesdict :
777 etuces = [[] for et in etoiles]
778 for uci in self.ucis :
779 get = list(set(uci.etoiles).intersection(etoiles))
781 return '2 variables sur la meme ligne'
783 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
784 etuces = [set(val) for val in etuces]
785 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
787 def gethapaxuces(self) :
788 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
789 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
791 for i,uce in enumerate(hapaxuces) :
792 if uce in hucesdict :
793 hucesdict[uce][0] += 1
794 hucesdict[uce][1].append(hapax[i])
796 hucesdict[uce] = [1,[hapax[i]]]
798 for uce in hucesdict :
799 if hucesdict[uce][0] in huces :
800 huces[hucesdict[uce][0]].append(uce)
802 huces[hucesdict[uce][0]] = [uce]
803 huces = zip(huces, huces.values())
804 huces.sort(reverse=True)
808 for nb in huces[0:4] :
809 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
811 res = self.getconcorde([uce])
813 ucetxt = ' ' + row[1] + ' '
815 for hap in hucesdict[uce][1] :
816 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
817 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
818 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
819 txt += '<p>'+ucetxt+'</p>\n'
823 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding) :
    """Write the formes to fileout, one tab separated line per forme.

    Lines hold forme, lem, gram and freq; they are sorted in descending order
    of (freq, forme, lem, gram). The text is encoded with syscoding before
    being written.
    """
    rows = []
    for forme in self.formes :
        word = self.formes[forme]
        rows.append([word.freq, forme, word.lem, word.gram])
    rows.sort(reverse = True)
    # the frequency moves from the first to the last column, as text
    rows = [row[1:] + [repr(row[0])] for row in rows]
    with open(fileout, 'w') as f :
        lines = ['\t'.join(row) for row in rows]
        f.write('\n'.join(lines).encode(syscoding))
833 def export_lems(self, fileout, syscoding) :
835 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
837 with open(fileout, 'w') as f :
838 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
844 def __init__(self, corpus) :
845 ucinb = corpus.getucinb()
846 ucisize = corpus.getucisize()
847 ucimean = float(sum(ucisize))/float(ucinb)
848 detoile = corpus.make_etoiles_dict()
852 def __init__(self, iduci, line, paraset = None) :
854 self.etoiles = line.split()
856 if paraset is not None :
857 self.paras = paraset.split()
862 def __init__(self, iduce, idpara, iduci) :
868 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
874 if freq is not None :
880 def __init__(self, parent, forme) :
881 self.formes = {forme.ident : forme.freq}
882 self.gram = forme.gram
883 self.freq = forme.freq
def add_forme(self, forme) :
    """Attach one more forme to this lemma.

    Stores forme.freq under forme.ident in self.formes and adds it to the
    frequency of the lemma (self.freq).
    """
    eff = forme.freq
    self.formes[forme.ident] = eff
    self.freq += eff
890 def decouperlist(chaine, longueur, longueurOptimale) :
892 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
893 Si on trouve un '$', c'est fini.
894 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
896 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
897 dsep = dict([[val[0],val[1]] for val in separateurs])
898 trouve = False # si on a trouvé un bon séparateur
899 iDecoupe = 0 # indice du caractere ou il faut decouper
901 longueur = min(longueur, len(chaine) - 1)
902 chaineTravail = chaine[:longueur + 1]
904 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
907 indice = chaineTravail.index(u'$')
909 iDecoupe = indice - 1
914 caractere = chaineTravail[nbCar]
915 distance = abs(longueurOptimale - nbCar) + 1
916 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
917 if caractere in dsep :
918 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
919 meilleur[0] = caractere
920 meilleur[1] = dsep[caractere]
925 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
927 meilleur[1] = dsep[' ']
934 #if meilleur[0] != ' ' :
935 # fin = chaine[iDecoupe + 1:]
936 # retour = chaineTravail[:iDecoupe]
938 fin = chaine[iDecoupe + 1:]
939 retour = chaineTravail[:iDecoupe + 1]
940 return len(retour) > 0, retour, fin
941 # si on a rien trouvé
942 return False, chaine, ''
944 def testetoile(line) :
945 return line.startswith(u'****')
948 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split txt on whitespace and append the u'$' marker as last element."""
    words = txt.split()
    words.append(u'$')
    return words
953 def prep_txtcharact(txt) :
958 Class for building a corpus
960 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
961 log.info('begin building corpus...')
962 self.lexique = lexique
963 self.expressions = expressions
965 self.corpus = Corpus(self, parametres_corpus)
968 self.lim = parametres_corpus.get('lim', 1000000)
969 self.encoding = parametres_corpus['encoding']
970 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
971 self.corpus.pathout.createdir(parametres_corpus['pathout'])
972 self.corpus.parametres['uuid'] = str(uuid4())
973 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
974 self.corpus.parametres['type'] = 'corpus'
975 if self.corpus.parametres['keep_ponct'] :
976 self.ponctuation_espace = [' ', '']
978 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
980 self.tolist = self.corpus.parametres.get('tolist', 0)
987 def prep_makeuce(self) :
988 method = self.corpus.parametres.get('ucemethod', 0)
990 self.decouper = decouperlist
991 self.prep_txt = prep_txtlist
992 self.ucesize = self.corpus.parametres.get('ucesize', 40)
994 self.decouper = decoupercharact
995 self.prep_txt = prep_txtcharact
996 self.ucesize = self.corpus.parametres.get('ucesize', 240)
997 log.info('method uce : %s' % method)
1002 self.read_corpus(self.infile)
1003 except Warning, args :
1004 log.info('pas kool %s' % args)
1008 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1009 self.time = time() - t1
1011 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1012 log.info('time : %f' % (time() - t1))
1015 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1016 self.cf = self.conn_f.cursor()
1017 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1018 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1019 self.conn_f.commit()
1020 self.cf = self.conn_f.cursor()
1021 self.cf.execute('PRAGMA temp_store=MEMORY;')
1022 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1023 self.cf.execute('PRAGMA synchronous = OFF;')
1024 self.cf.execute('begin')
1025 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1026 self.c = self.conn.cursor()
1027 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1029 self.c = self.conn.cursor()
1030 self.c.execute('PRAGMA temp_store=MEMORY;')
1031 self.c.execute('PRAGMA journal_mode=MEMORY;')
1032 self.c.execute('PRAGMA synchronous = OFF;')
1033 self.c.execute('begin')
1036 #commit index and close db
1038 self.conn_f.commit()
1039 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1040 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1044 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1045 self.ccorpus = self.conn_corpus.cursor()
1046 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1047 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1048 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1049 self.conn_corpus.commit()
1050 self.ccorpus = self.conn_corpus.cursor()
1051 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1052 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1053 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1054 self.ccorpus.execute('begin')
1055 self.backup_corpus()
1056 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1057 self.conn_corpus.commit()
1058 self.conn_corpus.close()
1059 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self) :
    """Fill self.cleans with the cleaning steps enabled in the corpus parameters.

    Steps are registered in a fixed order: lower-casing, first clean,
    character filtering, expressions, apostrophes, hyphens. Every flag except
    'charact' is read with a default of 1; 'charact' must be present in the
    parameters (a missing key raises KeyError). When character filtering is
    enabled, self.rule is set from 'keep_caract' or from the built-in rule.
    """
    params = self.corpus.parametres
    pipeline = self.cleans
    # Steps that come before the character filter.
    for flag, step in (('lower', 'dolower'), ('firstclean', 'firstclean')) :
        if params.get(flag, 1) :
            pipeline.append(getattr(self, step))
    if params['charact'] :
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        pipeline.append(self.docharact)
    # Steps that come after the character filter.
    for flag, step in (('expressions', 'make_expression'), ('apos', 'doapos'), ('tiret', 'dotiret')) :
        if params.get(flag, 1) :
            pipeline.append(getattr(self, step))
# Replace every key of self.expressions that occurs in txt by the first
# element of the entry stored for that key.
# NOTE(review): the statement returning the result is not visible in this
# excerpt.
1076 def make_expression(self,txt) :
1077 for expression in self.expressions:
# Substring test first, so txt is only rebuilt when the expression occurs.
1078 if expression in txt :
1079 txt = txt.replace(expression, self.expressions[expression][0])
# Cleaning step registered by buildcleans when the 'lower' parameter is set.
# NOTE(review): the body is not visible in this excerpt; presumably it
# lower-cases txt — confirm.
1082 def dolower(self, txt) :
def docharact(self, txt) :
    """Return txt with each run of characters matched by self.rule replaced by one space.

    self.rule is the inside of a regular-expression character class; the
    default rule set by buildcleans starts with '^', so it lists the
    characters to keep.
    """
    pattern = u"".join((u"[", self.rule, "]+"))
    return re.compile(pattern).sub(' ', txt)
def doapos(self, txt) :
    """Return txt with every apostrophe (') replaced by a space."""
    apostrophe = u'\''
    blank = u' '
    return blank.join(txt.split(apostrophe))
def dotiret(self, txt) :
    """Return txt with every hyphen (-) replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return blank.join(txt.split(hyphen))
def firstclean(self, txt) :
    """Normalise typographic characters and put spaces around punctuation.

    The substitutions run in the listed order. The three-dot sequence must be
    handled before the single dot, otherwise it would be split into three
    isolated dots instead of becoming the ' £$£ ' marker.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions :
        txt = txt.replace(old, new)
    return txt
# Walk through the cleaning callables collected in self.cleans (filled by
# buildcleans), in registration order.
# NOTE(review): the loop body and the return statement are not visible in
# this excerpt; presumably each callable is applied to txt in turn — confirm.
1101 def make_cleans(self, txt) :
1102 for clean in self.cleans :
# Flush the in-memory index self.corpus.idformesuces (form id -> {segment id:
# count}) to the 'uces' and 'eff' tables through the cursor self.cf, then
# reset the index.
1106 def backup_uce(self) :
# Nothing is written when the index is empty.
1107 if self.corpus.idformesuces != {} :
1108 log.info('backup %i' % len(self.corpus.idformesuces))
# One row per form: the form id and a space-separated list of the inner
# dict's keys (touce) or values (toeff). Backticks are the Python 2 repr
# operator. keys() and values() of an unmodified dict iterate in the same
# order, so position i of both strings refers to the same segment.
1109 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1110 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1111 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1112 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
# Start a fresh index for the next batch.
1113 self.corpus.idformesuces = {}
# Write the corpus structure held in memory to the database behind the
# cursor self.ccorpus: one 'etoiles' row per text, one 'luces' row per
# segment and one 'formes' row per form.
# NOTE(review): the statement that sets the timer 't' used in the last line
# is not visible in this excerpt.
1116 def backup_corpus(self) :
1117 log.info('start backup corpus')
1119 for uci in self.corpus.ucis :
# Text id, its 'etoiles' and its 'paras', each list joined with spaces.
1120 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1121 for uce in uci.uces :
# Ids are passed as their repr strings (Python 2 backticks).
1122 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1123 for forme in self.corpus.formes :
# Column order: ident, forme, lem, gram, freq.
1124 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
# Log the elapsed time in seconds.
1125 log.info('%f' % (time() - t))
def dofinish(self) :
    """Store the build summary in self.corpus.parametres.

    Sets the keys 'date', 'time', 'ucinb', 'ucenb', 'occurrences',
    'formesnb' and 'hapax'. self.time is read as the elapsed build time in
    seconds. The counts come from the corpus getters getucinb(), getucenb(),
    gettotocc() and gethapaxnb().

    The hapax percentages are reported as 0.00 when the corpus has no forms
    or no occurrences, instead of raising ZeroDivisionError.
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    # Break the elapsed seconds down into hours, minutes and seconds.
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    occurrences = corpus.gettotocc()
    params['occurrences'] = occurrences
    formesnb = len(corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = corpus.gethapaxnb()
    # Guard both ratios: an empty corpus must not abort the summary.
    if formesnb :
        pourhapaxf = (float(hapaxnb) / formesnb) * 100
    else :
        pourhapaxf = 0.0
    if occurrences :
        pourhapaxocc = (float(hapaxnb) / occurrences) * 100
    else :
        pourhapaxocc = 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1142 class BuildFromAlceste(BuildCorpus) :
# Read the corpus file line by line with the configured encoding and feed
# the texts, paragraph marks and text lines it contains into self.corpus.
# Raises Exception('CorpusEncoding') when the file cannot be decoded, and
# Exception with a 'paragrapheOT', 'EmptyText' or 'TextBeforeTextMark'
# message for the formatting problems detected below.
# NOTE(review): several statements (initialisations, try/else lines) are not
# visible in this excerpt, so the exact nesting cannot be confirmed here.
1143 def read_corpus(self, infile) :
# Initial progress message when a dialog is attached.
1144 if self.dlg is not None :
1145 self.dlg.Pulse('textes : 0 - segments : 0')
# 'ucimark' selects the predicate that recognises a line starting a new text.
1148 if self.corpus.parametres['ucimark'] == 0 :
1149 self.testuci = testetoile
1150 elif self.corpus.parametres['ucimark'] == 1 :
1151 self.testuci = testint
1157 with codecs.open(infile, 'r', self.encoding) as f :
1158 for linenb, line in enumerate(f) :
# Strip line endings only; other whitespace is kept.
1159 line = line.rstrip('\n\r')
# Case 1: the line opens a new text.
1160 if self.testuci(line) :
# Pending lines are processed under the previous text id (iduci - 1).
1163 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1165 self.corpus.ucis.append(Uci(iduci, line))
# A text that ended up with no segment is logged and replaced by the new one.
1168 if self.corpus.ucis[-1].uces == [] :
1169 log.info(u'Empty text : %i' % linenb)
1171 self.corpus.ucis.pop()
1172 self.corpus.ucis.append(Uci(iduci, line))
# Refresh the progress dialog every 10 texts.
1173 if self.dlg is not None :
1174 if not (iduci + 1) % 10 :
1175 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
# Case 2: the line starts with '-*'; its first token is stored in the
# current text's 'paras' list.
1176 elif line.startswith(u'-*') :
1179 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1182 self.corpus.ucis[-1].paras.append(line.split()[0])
1184 raise Exception('paragrapheOT %i' % linenb)
# Case 3: a non-blank line seen after at least one text has been opened.
1185 elif line.strip() != '' and iduci != -1 :
# After the loop: process whatever text lines are still pending.
1187 if txt != [] and iduci != -1 :
1188 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1193 self.corpus.ucis.pop()
1194 log.info(Exception("Empty text %i" % linenb))
1196 raise Exception('EmptyText %i' % linenb)
# iduci / iduce still at -1 means no text or no segment was found.
1197 if iduci != -1 and iduce != -1:
1200 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1201 raise Exception('TextBeforeTextMark %i' % linenb)
# A decoding failure is reported as a generic 'CorpusEncoding' exception.
1202 except UnicodeDecodeError :
1203 raise Exception("CorpusEncoding")
# Clean the pending text, cut it into segments, register each segment on the
# current text (self.corpus.ucis[-1]) and in the 'uces' table, and count its
# words. Returns the updated (iduce, idpara) pair.
# NOTE(review): several statements (else lines, loops, counter increments)
# are not visible in this excerpt.
1205 def treattxt(self, txt, iduce, idpara, iduci) :
# With 'ucemethod' 2 and 'douce' set, the items of txt are glued together
# with a sentinel string, cleaned, stripped of the tokens listed in
# self.ponctuation_espace, then split again on the sentinel.
1206 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1207 txt = 'laphrasepoursplitter'.join(txt)
1208 txt = self.make_cleans(txt)
1209 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1210 ucetxt = txt.split('laphrasepoursplitter')
# Otherwise the cleaned text is cut by make_uces.
1213 txt = self.make_cleans(txt)
1214 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1215 if self.corpus.ucis[-1].paras == [] :
# Register the segment in memory and store its text in the 'uces' table
# (the id is passed as its repr string, Python 2 backticks).
1219 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1220 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1221 if not self.tolist :
1227 self.corpus.add_word(word)
1228 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
# NOTE(review): self.last and self.lim are set outside this excerpt;
# presumably a threshold that triggers a flush — confirm.
1229 if self.last > self.lim :
1232 return iduce, idpara
# Cut txt into segments with self.decouper / self.prep_txt, dropping the
# tokens listed in self.ponctuation_espace from each segment.
# NOTE(review): the conditions and the loop around the statements below are
# not visible in this excerpt; keep_ponct is not used in the visible lines.
1234 def make_uces(self, txt, douce = True, keep_ponct = False) :
# Collapse every run of whitespace into a single space.
1235 txt = ' '.join(txt.split())
# First cut: the splitter receives the prepared text, an upper bound of
# ucesize + 15 and the target size ucesize.
1238 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1240 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Following cuts work on what the previous call returned as 'suite'.
1243 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1244 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
# Single-segment result: the whole text, filtered, in a one-item list.
1249 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1251 #decouper (list_sep)
1252 #make_uces (decouper)
1253 #treat_txt (make_uces)
# Load the corpus options from the user's corpus.cfg, show the CorpusPref
# dialog and, when it is validated, keep the chosen parameters in
# self.parametres and load the lexicon and expression dictionary for the
# selected language onto the parent.
# NOTE(review): some statements (e.g. where self.dlg is assigned, the else
# line) are not visible in this excerpt.
1257 def __init__(self, parent, dlg = None) :
1258 self.parent = parent
1260 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
# Output directory derived from the input file name.
1261 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1262 dial = CorpusPref(parent, parametres)
1263 dial.CenterOnParent()
1264 dial.txtpath.SetLabel(parent.filename)
1265 #dial.repout_choices.SetValue(parametres['pathout'])
1266 self.res = dial.ShowModal()
# NOTE(review): 5100 is presumably the dialog's OK return code (wx.ID_OK) — confirm.
1267 if self.res == 5100 :
# Replace the loaded options by the values entered in the dialog.
1268 parametres = dial.doparametres()
1269 parametres['originalpath'] = parent.filename
1270 PathOut().createdir(parametres['pathout'])
1271 ReadLexique(self.parent, lang = parametres['lang'])
# Expressions are read from the '<lang>_exp' dictionary path when that file
# exists; the lookup falls back to the literal 'french_exp' when the key is
# missing from DictPath.
1272 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1273 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
# Otherwise no expression dictionary is used.
1275 self.parent.expressions = {}
1276 self.parametres = parametres
1278 if self.dlg is not None :
def doanalyse(self) :
    """Build the corpus from the parent's file with BuildFromAlceste and return it.

    The builder receives the parent's file name, the parameters kept in
    self.parametres, the parent's lexicon and expressions, and self.dlg.
    """
    owner = self.parent
    builder = BuildFromAlceste(owner.filename, self.parametres, owner.lexique, owner.expressions, dlg = self.dlg)
    return builder.corpus
1286 if __name__ == '__main__' :
1288 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
1289 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)