1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from colors import colors
24 log = logging.getLogger('iramuteq.corpus')
27 def copycorpus(corpus) :
28 log.info('copy corpus')
29 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
30 copy_corpus.ucis = corpus.ucis
31 copy_corpus.formes = corpus.formes
32 copy_corpus.pathout = corpus.pathout
33 copy_corpus.conn_all()
42 def __init__(self, parent, parametres = {}, read = False) :
44 self.parametres = parametres
46 self.connformes = None
48 self.conncorpus = None
55 self.idformesuces = {}
60 self.pathout = PathOut(dirout = parametres['pathout'])
63 def add_word(self, word) :
64 if word in self.formes :
65 self.formes[word].freq += 1
66 if self.formes[word].ident in self.idformesuces :
67 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
68 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
72 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
74 if word in self.parent.lexique :
75 gramtype = self.parent.lexique[word][1]
76 lem = self.parent.lexique[word][0]
83 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
84 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
87 """connect corpus to db"""
88 if self.connformes is None :
89 log.info('connexion corpus')
90 self.connuces = sqlite3.connect(self.pathout['uces.db'])
91 self.cuces = self.connuces.cursor()
92 self.connformes = sqlite3.connect(self.pathout['formes.db'])
93 self.cformes = self.connformes.cursor()
94 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
95 self.ccorpus = self.conncorpus.cursor()
96 self.cformes.execute('PRAGMA temp_store=MEMORY;')
97 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
98 self.cformes.execute('PRAGMA synchronous = OFF;')
99 self.cuces.execute('PRAGMA temp_store=MEMORY;')
100 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
101 self.cuces.execute('PRAGMA synchronous = OFF;')
102 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
103 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
104 self.ccorpus.execute('PRAGMA synchronous = OFF;')
106 def read_corpus(self) :
107 log.info('read corpus')
108 self.parametres['syscoding'] = sys.getdefaultencoding()
109 if self.conncorpus is None :
111 res = self.ccorpus.execute('SELECT * FROM etoiles;')
113 self.ucis.append(Uci(row[0], row[1], row[2]))
114 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
116 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
117 res = self.ccorpus.execute('SELECT * FROM formes;')
118 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid) :
    """Return the ids of the uces that contain a given form.

    wordid is either the form itself (a string, resolved through
    self.formes to its ident) or the form identifier.  Every row of the
    'uces' table of the formes database whose id matches is read; a row
    holds either a single integer or a whitespace-separated string of
    integers.  All of them are returned as one flat list of ints, in row
    order.
    """
    if isinstance(wordid, basestring) :
        wordid = self.formes[wordid].ident
    # Bind the identifier itself rather than its repr(): the repr of a
    # Python 2 long is '5L', which can never match the id column.
    res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (wordid,))
    uces = []
    for row in res :
        if isinstance(row[0], int) :
            uces.append(row[0])
        else :
            uces.extend(int(val) for val in row[0].split())
    return uces
def getworducis(self, wordid) :
    """Return the distinct uci values of the uces that contain a form.

    wordid is handed unchanged to getworduces(); each uce id it returns is
    resolved with getucefromid() and the uci attribute of the result is
    collected.  The returned list holds no duplicates and has no
    guaranteed order.
    """
    uce_ids = self.getworduces(wordid)
    distinct = set(self.getucefromid(uce_id).uci for uce_id in uce_ids)
    return list(distinct)
131 def getformeuceseff(self, formeid) :
132 if isinstance(formeid, basestring) :
133 formeid = self.formes[formeid].ident
134 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
135 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
136 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
137 res = self.cformes.execute(query)
138 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
140 for i, uce in enumerate(uces) :
141 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem) :
    """Return the ids of the uces that contain any form of a lemma.

    The form identifiers are the keys of self.lems[lem].formes.  Rows of
    the 'uces' table of the formes database whose id is one of them are
    read; each row holds either one integer or a whitespace-separated
    string of integers.  The result is a list without duplicates, in no
    guaranteed order.
    """
    # Format every identifier as an integer: unlike repr(), this never
    # emits the Python 2 long suffix 'L' and keeps non-numeric text out of
    # the SQL statement.
    formesid = ', '.join(['%d' % int(val) for val in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
    uces = set()
    for row in self.cformes.execute(query) :
        if isinstance(row[0], int) :
            uces.add(row[0])
        else :
            uces.update(int(val) for val in row[0].split())
    return list(uces)
def getlemucis(self, lem) :
    """Return the distinct uci values of the uces that contain a lemma.

    The uce ids come from getlemuces(lem); each one is resolved with
    getucefromid() and its uci attribute is collected.  The returned list
    holds no duplicates and has no guaranteed order.
    """
    distinct = set()
    for uce_id in self.getlemuces(lem) :
        distinct.add(self.getucefromid(uce_id).uci)
    return list(distinct)
154 def getlemuceseff(self, lem, luces = None) :
155 formesid = ', '.join([`val` for val in self.lems[lem].formes])
156 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
157 res = self.cformes.execute(query)
158 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
159 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
160 res = self.cformes.execute(query)
161 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
163 for i, uce in enumerate(uces) :
164 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster) :
    """Count the uces of self.lc[cluster] that also appear in getlemuces(lem).

    Each uce id is counted once, however often it occurs in either
    collection.
    """
    in_cluster = set(self.lc[cluster])
    shared = in_cluster.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem) :
    """Return the freq attribute stored for lemma lem in self.lems."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid) :
    """Return the entry of self.idformes stored under formeid.

    self.idformes is built on first use by calling make_idformes().
    """
    index_missing = self.idformes is None
    if index_missing :
        self.make_idformes()
    return self.idformes[formeid]
def gettotocc(self) :
    """Return the sum of the freq attribute of every entry of self.formes."""
    total = 0
    for forme in self.formes :
        total += self.formes[forme].freq
    return total
def getucemean(self) :
    """Return gettotocc() divided by getucenb(), as a float."""
    occurrences = float(self.gettotocc())
    return occurrences / self.getucenb()
187 return self.ucis[-1].uces[-1].ident + 1
190 return self.ucis[-1].ident + 1
def getucisize(self) :
    """Return, for each uci of self.ucis, the summed size of its uces.

    The per-uce sizes come from getucesize() and are indexed by uce ident;
    for each uci the slice running from the ident of its first uce to the
    ident of its last uce (inclusive) is summed.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis :
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self) :
    """Return the number of whitespace-separated tokens of every uce.

    Rows are read from getalluces(); the text is taken from the second
    column of each row.  Sizes are returned in row order.
    """
    sizes = []
    for row in self.getalluces() :
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces) :
    """Return the result of selecting rows of the 'uces' table by id.

    uces is an iterable of uce ids.  Each id is converted with int() and
    formatted with '%d' before it is placed in the IN (...) clause, so only
    integer text can reach the SQL statement (and a Python 2 long is not
    written with its 'L' suffix, as repr() would do).  The value returned
    by self.cuces.execute() is handed back unchanged; with a sqlite3 cursor
    it can be iterated to obtain the rows.
    """
    idlist = ', '.join(['%d' % int(i) for i in uces])
    return self.cuces.execute('select * from uces where id IN (%s);' % idlist)
203 def getuciconcorde(self, ucis) :
204 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
205 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word) :
    """Return getconcorde() applied to the uce ids given by getworduces(word)."""
    uce_ids = self.getworduces(word)
    return self.getconcorde(uce_ids)
def getlemconcorde(self, lem) :
    """Return getconcorde() applied to the uce ids given by getlemuces(lem)."""
    uce_ids = self.getlemuces(lem)
    return self.getconcorde(uce_ids)
def getalluces(self) :
    """Run 'SELECT * FROM uces' on self.cuces and return what execute() returns."""
    cursor = self.cuces
    return cursor.execute('SELECT * FROM uces')
def getallucis(self):
    """Return [ident, text] for every uci of self.ucis.

    The text of a uci is the newline-joined text of its uces.  The uce
    texts are the second column of the rows returned by getalluces() and
    are looked up by position using each uce's ident.
    """
    texts = []
    for row in self.getalluces() :
        texts.append(row[1])
    result = []
    for uci in self.ucis :
        parts = [texts[uce.ident] for uce in uci.uces]
        result.append([uci.ident, '\n'.join(parts)])
    return result
def getucesfrometoile(self, etoile) :
    """Return the ident of every uce belonging to a uci whose etoiles contain etoile.

    Idents are returned in the order of self.ucis, then of each uci's uces.
    """
    idents = []
    for uci in self.ucis :
        for uce in uci.uces :
            if etoile in uci.etoiles :
                idents.append(uce.ident)
    return idents
224 def getetoileuces(self) :
225 log.info('get uces etoiles')
228 for uci in self.ucis :
229 etoiles = uci.etoiles[1:]
231 if et in etoileuces :
232 etoileuces[et] += [uce.ident for uce in uci.uces]
234 etoileuces[et] = [uce.ident for uce in uci.uces]
236 for et in uci.paras :
237 if et in etoileuces :
238 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
240 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
246 def getetoileucis(self):
248 for uci in self.ucis :
249 etoiles = uci.etoiles[1:]
251 if et in etoileuces :
252 etoileuces[et] += [uci.ident]
254 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid) :
    """Return the entry of self.iduces stored under uceid.

    self.iduces is built on first use by calling make_iduces().
    """
    index_missing = self.iduces is None
    if index_missing :
        self.make_iduces()
    return self.iduces[uceid]
def gethapaxnb(self) :
    """Return how many entries of self.formes have a freq equal to 1."""
    count = 0
    for forme in self.formes :
        if self.formes[forme].freq == 1 :
            count += 1
    return count
def getactivesnb(self, key) :
    """Return how many entries of self.lems have an act attribute equal to key."""
    matching = 0
    for lem in self.lems :
        if self.lems[lem].act == key :
            matching += 1
    return matching
266 # def make_lems(self, lem = True) :
267 # log.info('make lems')
269 # for forme in self.formes :
270 # if self.formes[forme].lem in self.lems :
271 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
272 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
274 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid) :
    """Return the etoiles of the uci that owns the uce identified by uceid.

    A mapping from uce ident to uci ident is built from self.ucis on first
    use and kept in self.uceuci.  The uci ident found there is then used as
    an index into self.ucis.
    """
    if self.uceuci is None :
        mapping = {}
        for uci in self.ucis :
            for uce in uci.uces :
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    uci_index = self.uceuci[uceid]
    return self.ucis[uci_index].etoiles
280 def make_lems(self, lem = True) :
281 log.info('make lems')
284 for forme in self.formes :
285 if self.formes[forme].lem in self.lems :
286 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
287 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
289 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
291 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self) :
    """Build self.idformes, mapping the ident of each entry of self.formes to that entry."""
    by_ident = {}
    for forme in self.formes :
        entry = self.formes[forme]
        by_ident[entry.ident] = entry
    self.idformes = by_ident
def make_iduces(self) :
    """Build self.iduces, mapping each uce ident to its uce, unless it already exists.

    The uces are gathered from every uci of self.ucis.  Nothing is done
    when self.iduces is not None.
    """
    if self.iduces is not None :
        return
    by_ident = {}
    for uci in self.ucis :
        for uce in uci.uces :
            by_ident[uce.ident] = uce
    self.iduces = by_ident
300 def make_lexitable(self, mineff, etoiles, gram = 0) :
305 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
306 etuces = [[] for et in etoiles]
307 for uci in self.ucis :
308 get = list(set(uci.etoiles).intersection(etoiles))
310 log.info('2 variables sur une ligne')
312 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
313 etuces = [set(val) for val in etuces]
316 deff = self.getlemuceseff(lem)
318 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
319 tab.insert(0, [''] + etoiles)
322 def make_efftype_from_etoiles(self, etoiles) :
324 etuces = [[] for et in etoiles]
325 for uci in self.ucis :
326 get = list(set(uci.etoiles).intersection(etoiles))
328 return '2 variables sur la meme ligne'
330 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
331 etuces = [set(val) for val in etuces]
332 for lem in self.lems :
333 deff = self.getlemuceseff(lem)
335 gram = self.lems[lem].gram
337 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
339 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
340 tabout = [[gram] + dtype[gram] for gram in dtype]
341 tabout.insert(0, [''] + etoiles)
344 def make_uceactsize(self, actives) :
345 res = self.getalluces()
348 deff = self.getlemuceseff(lem)
350 ucesize[uce] = ucesize.get(uce, 0) + 1
353 def make_uc(self, actives, lim1, lim2) :
354 uceactsize = self.make_uceactsize(actives)
360 for uce in [uce for uci in self.ucis for uce in uci.uces] :
361 if uce.para == lastpara :
363 last1 += uceactsize.get(uce.ident,0)
364 uc1[-1].append(uce.ident)
366 uc1.append([uce.ident])
369 last2 += uceactsize.get(uce.ident, 0)
370 uc2[-1].append(uce.ident)
372 uc2.append([uce.ident])
375 last1 = uceactsize.get(uce.ident, 0)
376 last2 = uceactsize.get(uce.ident, 0)
378 uc1.append([uce.ident])
379 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) :
    """Build two uc groupings, write their matrices and their uce listings.

    make_uc(actives, sizeuc1, sizeuc2) supplies two lists of uce-id groups.
    Each grouping is written with write_ucmatrix() to uc1out / uc2out, and
    a listing file (listuce1out / listuce2out) is written for each: a
    'uce;uc' header followed by one 'uce;index' line per uce, where index
    is the position of the group that contains it.

    Returns a tuple (number of groups in the first grouping, number of
    groups in the second grouping).
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # repr() replaces the deprecated backtick syntax and yields the same text.
    for groups, listout in ((uc1, listuce1out), (uc2, listuce2out)) :
        lines = ['uce;uc']
        for i, ucl in enumerate(groups) :
            for uce in ucl :
                lines.append(';'.join([repr(uce), repr(i)]))
        with open(listout, 'w') as f :
            f.write('\n'.join(lines))
    return len(uc1), len(uc2)
395 def write_ucmatrix(self, uc, actives, fileout) :
396 log.info('write uc matrix %s' % fileout)
397 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
400 with open(fileout + '~', 'w+') as f :
401 for i, lem in enumerate(actives) :
402 for uce in self.getlemuces(lem):
403 if (uces_uc[uce], i) not in deja_la :
405 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
406 deja_la[(uces_uc[uce], i)] = 0
408 with open(fileout, 'w') as ffin :
409 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
412 os.remove(fileout + '~')
415 def export_corpus(self, outf) :
416 #outf = 'export_corpus.txt'
418 res = self.getalluces()
422 with open(outf,'w') as f :
424 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
425 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
426 elif self.iduces[uce[0]].uci != actuci :
427 actuci = self.iduces[uce[0]].uci
428 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
429 actpara = self.iduces[uce[0]].para
430 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
433 actpara = self.iduces[uce[0]].para
434 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
435 elif self.iduces[uce[0]].para != actpara :
436 actpara = self.iduces[uce[0]].para
438 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
440 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
442 for i, lc in enumerate(self.lc) :
445 for uce in self.lc0 :
448 res = self.getalluces()
451 res = self.getallucis()
452 with open(outf, 'w') as f :
456 actuci = self.iduces[uce[0]].uci
460 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
462 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
464 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
465 f.write(etline.encode(self.parametres['syscoding']) + '\n')
466 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
468 def export_classe(self, outf, classe, lem = False, uci = False) :
469 sts = self.lc[classe - 1]
471 res = self.getconcorde(sts)
474 res = self.getuciconcorde(sts)
475 with open(outf, 'w') as f :
479 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
481 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
483 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
484 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
486 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
487 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
489 with open(outfile + '~', 'w+') as f :
490 for i, lem in enumerate(actives) :
491 for uce in sorted(self.getlemuces(lem)) :
493 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
495 with open(outfile, 'w') as ffin :
496 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
499 os.remove(outfile + '~')
501 with open(listuce, 'w') as f :
502 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
504 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
505 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
507 with open(outfile + '~', 'w+') as f :
508 for i, lem in enumerate(actives) :
509 for uci in sorted(self.getlemucis(lem)) :
511 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
513 with open(outfile, 'w') as ffin :
514 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
517 os.remove(outfile + '~')
519 with open(listuci, 'w') as f :
520 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
522 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
523 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
525 duces = dict([[uce, i] for i, uce in enumerate(uces)])
526 with open(outfile + '~', 'w+') as f :
527 for i, lem in enumerate(actives) :
528 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
530 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
532 with open(outfile, 'w') as ffin :
533 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
536 os.remove(outfile + '~')
538 def make_table_with_classe(self, uces, list_act, uci = False) :
539 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
540 uces = dict([[uce, i] for i, uce in enumerate(uces)])
542 getlem = self.getlemucis
544 getlem = self.getlemuces
545 for i, lem in enumerate(list_act) :
546 lemuces = list(set(getlem(lem)).intersection(uces))
548 table_uce[uces[uce]][i] = 1
549 table_uce.insert(0, list_act)
552 def make_pondtable_with_classe(self, uces, list_act) :
553 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
554 uces = dict([[uce, i] for i, uce in enumerate(uces)])
555 for i, lem in enumerate(list_act) :
556 uceseff = self.getlemuceseff(lem)
557 lemuces = list(set(uceseff.keys()).intersection(uces))
559 table_uce[uces[uce]][i] = uceseff[uce]
560 table_uce.insert(0, list_act)
563 def parse_active(self, gramact, gramsup = None) :
564 log.info('parse actives')
565 for lem in self.lems :
566 if lem.startswith('_') and lem.endswith('_') :
567 self.lems[lem].act = 2
568 elif self.lems[lem].gram in gramact :
569 self.lems[lem].act = 1
570 elif gramsup is not None and self.lems[lem].gram not in gramact:
571 if self.lems[lem].gram in gramsup :
572 self.lems[lem].act = 2
574 self.lems[lem].act = 0
576 self.lems[lem].act = 2
578 def make_actives_limit(self, limit, key = 1) :
579 if self.idformes is None :
581 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
583 def make_actives_nb(self, nbmax, key) :
584 log.info('make_actives_nb : %i - %i' % (nbmax,key))
585 if self.idformes is None :
587 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
588 self.activenb = len(allactives)
589 allactives = sorted(allactives, reverse = True)
590 if self.activenb == 0 :
592 if len(allactives) <= nbmax :
593 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
594 return [val[1] for val in allactives], allactives[-1][0]
596 effs = [val[0] for val in allactives]
597 if effs.count(effs[nbmax - 1]) > 1 :
598 lim = effs[nbmax - 1] + 1
602 stop = effs.index(lim)
609 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
610 return [val[1] for val in allactives[0:stop + 1]], lim
612 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
613 log.info('formes/classes')
615 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
617 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
618 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
619 with open(fileout, 'w') as f :
620 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
622 def make_etoiles(self) :
624 for uci in self.ucis :
625 etoiles.update(uci.etoiles[1:])
628 def make_etoiles_dict(self) :
629 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
631 for etoile in etoiles :
632 et = etoile.split('_')
635 endet = '_'.join(et[1:])
636 if etoile in det[et[0]] :
637 det[et[0]][etoile] += 1
639 det[et[0]][etoile] = 1
644 endet = '_'.join(et[1:])
645 det[et[0]] = {etoile :1}
650 def make_etline(self, listet) :
651 etuces = [[] for et in listet]
652 for uci in self.ucis :
653 get = list(set(uci.etoiles).intersection(listet))
655 return '2 variables sur la meme ligne'
657 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
660 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
661 log.info('etoiles/classes')
663 etoileuces = self.getetoileuces()
665 etoileuces = self.getetoileucis()
666 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
667 with open(fileout, 'w') as f :
668 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
669 #etoiles = self.make_etoiles()
670 #with open(fileout, 'w') as f :
671 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
673 def make_colored_corpus(self, uci = False) :
675 for i, lc in enumerate(self.lc) :
678 for uce in self.lc0 :
680 color = ['black'] + colors[len(self.lc) - 1]
682 <meta http-equiv="content-Type" content="text/html; charset=%s" />
684 ''' % sys.getdefaultencoding()
686 res = self.getalluces()
691 if self.iduces[uce[0]].uci != actuci :
692 actuci = self.iduces[uce[0]].uci
693 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
694 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
696 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
698 res = self.getallucis()
701 if self.ucis[uce[0]].ident != actuci :
702 actuci = self.ucis[uce[0]].ident
703 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
704 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
706 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
707 return txt + '\n</body></html>'
709 def count_from_list(self, l, d) :
717 def count_from_list_cl(self, l, d, a, clnb) :
726 def find_segments(self, taille_segment, taille_limite) :
728 for uce in self.getalluces() :
730 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
731 l = [[d[val], val] for val in d if d[val] >= 3]
734 if len(l) > taille_limite :
735 l = l[-taille_limite:]
738 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
741 concorde = self.getconcorde
743 concorde = self.getuciconcorde
744 for uce in concorde(list_uce) :
746 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
747 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
750 if len(l) > taille_limite :
751 l = l[-taille_limite:]
754 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
756 for b, classe in enumerate(self.lc) :
757 for uce in self.getconcorde(classe) :
760 uce = [self.formes[forme].lem for forme in uce]
761 for taille_segment in range(lenmin,lenmax) :
762 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
763 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
764 with open(fileout, 'w') as f :
765 f.write('\n'.join([';'.join(line) for line in result]))
767 def make_proftype(self, outf) :
769 for lem in self.lems :
770 gram = self.lems[lem].gram
772 res[gram] = [0 for val in self.lc]
773 lemuceeff = self.getlemuceseff(lem)
774 for i, classe in enumerate(self.lc) :
775 concern = set(classe).intersection(lemuceeff.keys())
776 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
777 res = [[gram] + [`val` for val in res[gram]] for gram in res]
779 with open(outf, 'w') as f :
780 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
783 def make_ucecl_from_R(self, filein) :
784 with open(filein, 'rU') as f :
789 line = line.replace('\n', '').replace('"', '').split(';')
790 self.lc.append([int(line[0]) - 1, int(line[1])])
791 classesl = [val[1] for val in self.lc]
793 self.lc = sorted(self.lc, key=itemgetter(1))
794 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
795 self.lc0 = self.lc.pop(0)
798 def get_stat_by_cluster(self, outf, lclasses = None) :
799 log.info('get_stat_by_cluster')
800 if lclasses is None :
803 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
804 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
805 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
806 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
807 sets = [set(cl) for cl in lclasses]
808 for forme in self.formes :
809 formeuceeff = self.getformeuceseff(forme)
810 for i, classe in enumerate(lclasses) :
811 concern = sets[i].intersection(formeuceeff.keys())
813 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
815 if self.formes[forme].freq == 1 :
817 log.info('%f' % (time() - t1))
818 if outf is not None :
819 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
820 with open(outf, 'w') as f :
823 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
825 def get_stat_by_et(self, outf, etoiles) :
826 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
827 stats = self.get_stat_by_cluster(None, lclasses)
828 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
830 def gethapaxbyet(self, etoiles) :
831 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
833 for uce in hapaxuces :
834 if uce in hucesdict :
838 etuces = [[] for et in etoiles]
839 for uci in self.ucis :
840 get = list(set(uci.etoiles).intersection(etoiles))
842 return '2 variables sur la meme ligne'
844 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
845 etuces = [set(val) for val in etuces]
846 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
848 def gethapaxuces(self) :
849 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
850 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
852 for i,uce in enumerate(hapaxuces) :
853 if uce in hucesdict :
854 hucesdict[uce][0] += 1
855 hucesdict[uce][1].append(hapax[i])
857 hucesdict[uce] = [1,[hapax[i]]]
859 for uce in hucesdict :
860 if hucesdict[uce][0] in huces :
861 huces[hucesdict[uce][0]].append(uce)
863 huces[hucesdict[uce][0]] = [uce]
864 huces = zip(huces, huces.values())
865 huces.sort(reverse=True)
869 for nb in huces[0:4] :
870 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
872 res = self.getconcorde([uce])
874 ucetxt = ' ' + row[1] + ' '
876 for hap in hucesdict[uce][1] :
877 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
878 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
879 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
880 txt += '<p>'+ucetxt+'</p>\n'
884 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding) :
    """Write the forms of the corpus to fileout, one per line.

    Each line holds the form, its lem, its gram and its freq separated by
    tabs.  Lines are ordered by decreasing [freq, form, lem, gram].  The
    whole text is encoded with syscoding before being written.
    """
    rows = []
    for forme in self.formes :
        entry = self.formes[forme]
        rows.append([entry.freq, forme, entry.lem, entry.gram])
    rows.sort(reverse = True)
    # repr() replaces the deprecated backtick syntax and yields the same text.
    lines = ['\t'.join(row[1:] + [repr(row[0])]) for row in rows]
    with open(fileout, 'w') as f :
        f.write('\n'.join(lines).encode(syscoding))
894 def export_lems(self, fileout, syscoding) :
896 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
898 with open(fileout, 'w') as f :
899 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
905 def __init__(self, corpus) :
906 ucinb = corpus.getucinb()
907 ucisize = corpus.getucisize()
908 ucimean = float(sum(ucisize))/float(ucinb)
909 detoile = corpus.make_etoiles_dict()
912 def __init__(self, iduci, line, paraset = None) :
914 self.etoiles = line.split()
916 if paraset is not None :
917 self.paras = paraset.split()
922 def __init__(self, iduce, idpara, iduci) :
928 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
934 if freq is not None :
940 def __init__(self, parent, forme) :
941 self.formes = {forme.ident : forme.freq}
942 self.gram = forme.gram
943 self.freq = forme.freq
def add_forme(self, forme) :
    """Record forme in self.formes under its ident and add its freq to self.freq."""
    count = forme.freq
    self.formes[forme.ident] = count
    self.freq = self.freq + count
950 def decouperlist(chaine, longueur, longueurOptimale) :
952 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
953 Si on trouve un '$', c'est fini.
954 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
956 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
957 dsep = dict([[val[0],val[1]] for val in separateurs])
958 trouve = False # si on a trouvé un bon séparateur
959 iDecoupe = 0 # indice du caractere ou il faut decouper
961 longueur = min(longueur, len(chaine) - 1)
962 chaineTravail = chaine[:longueur + 1]
964 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
967 indice = chaineTravail.index(u'$')
969 iDecoupe = indice - 1
974 caractere = chaineTravail[nbCar]
975 distance = abs(longueurOptimale - nbCar) + 1
976 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
977 if caractere in dsep :
978 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
979 meilleur[0] = caractere
980 meilleur[1] = dsep[caractere]
985 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
987 meilleur[1] = dsep[' ']
994 #if meilleur[0] != ' ' :
995 # fin = chaine[iDecoupe + 1:]
996 # retour = chaineTravail[:iDecoupe]
998 fin = chaine[iDecoupe + 1:]
999 retour = chaineTravail[:iDecoupe + 1]
1000 return len(retour) > 0, retour, fin
1001 # si on a rien trouvé
1002 return False, chaine, ''
1004 def testetoile(line) :
1005 return line.startswith(u'****')
1008 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt) :
    """Split txt on whitespace and return the tokens followed by a final '$'."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1013 def prep_txtcharact(txt) :
1018 Class for building a corpus
1020 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1021 log.info('begin building corpus...')
1022 self.lexique = lexique
1023 self.expressions = expressions
1025 self.corpus = Corpus(self, parametres_corpus)
1026 self.infile = infile
1028 self.lim = parametres_corpus.get('lim', 1000000)
1029 self.encoding = parametres_corpus['encoding']
1030 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1031 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1032 self.corpus.parametres['uuid'] = str(uuid4())
1033 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
1034 self.corpus.parametres['type'] = 'corpus'
1035 if self.corpus.parametres['keep_ponct'] :
1036 self.ponctuation_espace = [' ', '']
1038 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1040 self.tolist = self.corpus.parametres.get('tolist', 0)
1047 def prep_makeuce(self) :
1048 method = self.corpus.parametres.get('ucemethod', 0)
1050 self.decouper = decouperlist
1051 self.prep_txt = prep_txtlist
1052 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1054 self.decouper = decoupercharact
1055 self.prep_txt = prep_txtcharact
1056 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1057 log.info('method uce : %s' % method)
1062 self.read_corpus(self.infile)
1063 except Warning, args :
1064 log.info('pas kool %s' % args)
1068 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1069 self.time = time() - t1
1071 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1072 log.info('time : %f' % (time() - t1))
1075 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1076 self.cf = self.conn_f.cursor()
1077 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1078 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1079 self.conn_f.commit()
1080 self.cf = self.conn_f.cursor()
1081 self.cf.execute('PRAGMA temp_store=MEMORY;')
1082 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1083 self.cf.execute('PRAGMA synchronous = OFF;')
1084 self.cf.execute('begin')
1085 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1086 self.c = self.conn.cursor()
1087 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1089 self.c = self.conn.cursor()
1090 self.c.execute('PRAGMA temp_store=MEMORY;')
1091 self.c.execute('PRAGMA journal_mode=MEMORY;')
1092 self.c.execute('PRAGMA synchronous = OFF;')
1093 self.c.execute('begin')
1096 #commit index and close db
1098 self.conn_f.commit()
1099 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1100 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1104 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1105 self.ccorpus = self.conn_corpus.cursor()
1106 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1107 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1108 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1109 self.conn_corpus.commit()
1110 self.ccorpus = self.conn_corpus.cursor()
1111 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1112 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1113 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1114 self.ccorpus.execute('begin')
1115 self.backup_corpus()
1116 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1117 self.conn_corpus.commit()
1118 self.conn_corpus.close()
1119 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the cleaning steps enabled in the parameters.

    Steps are appended in a fixed order: dolower, firstclean, docharact,
    make_expression, doapos, dotiret.  Every option except ``'charact'`` is
    read with a default of 1 (enabled); ``'charact'`` is read by direct
    indexing, so a missing key raises ``KeyError``.  When ``'charact'`` is
    set, ``self.rule`` is also initialised from ``'keep_caract'`` (or the
    built-in default character list).
    """
    params = self.corpus.parametres
    pipeline = self.cleans

    # Options applied before the character filter.
    for flag, step in (('lower', 'dolower'), ('firstclean', 'firstclean')):
        if params.get(flag, 1):
            pipeline.append(getattr(self, step))

    if params['charact']:
        self.rule = params.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        pipeline.append(self.docharact)

    # Options applied after the character filter.
    for flag, step in (('expressions', 'make_expression'),
                       ('apos', 'doapos'),
                       ('tiret', 'dotiret')):
        if params.get(flag, 1):
            pipeline.append(getattr(self, step))
1136 def make_expression(self,txt) :
1137 for expression in self.expressions:
1138 if expression in txt :
1139 txt = txt.replace(expression, self.expressions[expression][0])
1142 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by ``self.rule`` with a space.

    ``self.rule`` is inserted verbatim between ``[`` and ``]+`` to form the
    regular expression, so a rule beginning with ``^`` lists the characters
    that are kept.  Each matching run collapses to one space.
    """
    pattern = re.compile(u"[" + self.rule + "]+")
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return *txt* with every straight apostrophe (') turned into a space."""
    pieces = txt.split(u'\'')
    return u' '.join(pieces)
def dotiret(self, txt):
    """Return *txt* with every hyphen (-) turned into a space."""
    hyphen, blank = u'-', u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation with spaces.

    The substitutions run strictly in the order listed below.  Three
    consecutive dots and the ellipsis character both become the placeholder
    `` £$£ ``; the remaining punctuation marks are padded with one space on
    each side.
    """
    # Order matters: '...' has to be consumed before the single '.' rule.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1161 def make_cleans(self, txt) :
1162 for clean in self.cleans :
1166 def backup_uce(self) :
1167 if self.corpus.idformesuces != {} :
1168 log.info('backup %i' % len(self.corpus.idformesuces))
1169 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1170 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1171 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1172 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1173 self.corpus.idformesuces = {}
1176 def backup_corpus(self) :
1177 log.info('start backup corpus')
1179 for uci in self.corpus.ucis :
1180 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1181 for uce in uci.uces :
1182 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1183 for forme in self.corpus.formes :
1184 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1185 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the final summary statistics in ``self.corpus.parametres``.

    Keys written: ``date`` (current time via ``ctime()``), ``time`` (the
    elapsed ``self.time`` in seconds, formatted as hours/minutes/seconds),
    ``ucinb``, ``ucenb``, ``occurrences``, ``formesnb`` and ``hapax`` (hapax
    count with its percentage of forms and of occurrences).

    When the corpus has no forms or no occurrences the matching percentage
    is reported as 0 instead of raising ``ZeroDivisionError``.
    """
    corpus = self.corpus
    params = corpus.parametres

    params['date'] = datetime.datetime.now().ctime()

    # Break the elapsed seconds down into h / m / s.
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    params['occurrences'] = corpus.gettotocc()
    formesnb = len(corpus.formes)
    params['formesnb'] = formesnb

    hapaxnb = corpus.gethapaxnb()
    occurrences = params['occurrences']
    # Guard both ratios: an empty corpus would otherwise divide by zero.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1202 class BuildFromAlceste(BuildCorpus) :
1203 def read_corpus(self, infile) :
1204 if self.dlg is not None :
1205 self.dlg.Pulse('textes : 0 - segments : 0')
1208 if self.corpus.parametres['ucimark'] == 0 :
1209 self.testuci = testetoile
1210 elif self.corpus.parametres['ucimark'] == 1 :
1211 self.testuci = testint
1217 with codecs.open(infile, 'r', self.encoding) as f :
1218 for linenb, line in enumerate(f) :
1219 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1220 if self.testuci(line) :
1223 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1225 self.corpus.ucis.append(Uci(iduci, line))
1228 if self.corpus.ucis[-1].uces == [] :
1229 log.info(u'Empty text : %i' % linenb)
1231 self.corpus.ucis.pop()
1232 self.corpus.ucis.append(Uci(iduci, line))
1233 if self.dlg is not None :
1234 if not (iduci + 1) % 10 :
1235 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1236 elif line.startswith(u'-*') :
1239 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1242 self.corpus.ucis[-1].paras.append(line.split()[0])
1244 raise Exception('paragrapheOT %i' % linenb)
1245 elif line.strip() != '' and iduci != -1 :
1247 if txt != [] and iduci != -1 :
1248 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1253 self.corpus.ucis.pop()
1254 log.info(Exception("Empty text %i" % linenb))
1256 raise Exception('EmptyText %i' % linenb)
1257 if iduci != -1 and iduce != -1:
1260 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1261 raise Exception('TextBeforeTextMark %i' % linenb)
1262 except UnicodeDecodeError :
1263 raise Exception("CorpusEncoding")
1265 def treattxt(self, txt, iduce, idpara, iduci) :
1266 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1267 txt = 'laphrasepoursplitter'.join(txt)
1268 txt = self.make_cleans(txt)
1269 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1270 ucetxt = txt.split('laphrasepoursplitter')
1273 txt = self.make_cleans(txt)
1274 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1275 if self.corpus.ucis[-1].paras == [] :
1279 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1280 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1281 if not self.tolist :
1287 self.corpus.add_word(word)
1288 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1289 if self.last > self.lim :
1292 return iduce, idpara
1294 def make_uces(self, txt, douce = True, keep_ponct = False) :
1295 txt = ' '.join(txt.split())
1298 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1300 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1303 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1304 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1309 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
# Dependency chain (each function relies on the name in parentheses):
#   decouper  (list_sep)
#   make_uces (decouper)
#   treattxt  (make_uces)
1317 def __init__(self, parent, dlg = None) :
1318 self.parent = parent
1320 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1321 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1322 dial = CorpusPref(parent, parametres)
1323 dial.CenterOnParent()
1324 dial.txtpath.SetLabel(parent.filename)
1325 #dial.repout_choices.SetValue(parametres['pathout'])
1326 self.res = dial.ShowModal()
1327 if self.res == 5100 :
1328 parametres = dial.doparametres()
1329 parametres['originalpath'] = parent.filename
1330 PathOut().createdir(parametres['pathout'])
1331 ReadLexique(self.parent, lang = parametres['lang'])
1332 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1333 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1335 self.parent.expressions = {}
1336 self.parametres = parametres
1338 if self.dlg is not None :
def doanalyse(self):
    """Run the Alceste-format builder and return its ``corpus`` attribute.

    The builder receives the parent's file name, the parameters stored in
    ``self.parametres``, the parent's lexique and expressions, and the
    progress dialog ``self.dlg``.
    """
    parent = self.parent
    builder = BuildFromAlceste(
        parent.filename,
        self.parametres,
        parent.lexique,
        parent.expressions,
        dlg = self.dlg,
    )
    return builder.corpus
1346 if __name__ == '__main__' :
1348 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
1349 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)