1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
11 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
16 from operator import itemgetter
17 from uuid import uuid4
18 from chemins import PathOut
19 from dialog import CorpusPref
20 from colors import colors
24 log = logging.getLogger('iramuteq.corpus')
27 def copycorpus(corpus) :
28 log.info('copy corpus')
29 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
30 copy_corpus.ucis = corpus.ucis
31 copy_corpus.formes = corpus.formes
32 copy_corpus.pathout = corpus.pathout
33 copy_corpus.conn_all()
42 def __init__(self, parent, parametres = {}, read = False) :
44 self.parametres = parametres
46 self.connformes = None
48 self.conncorpus = None
55 self.idformesuces = {}
60 self.pathout = PathOut(dirout = parametres['pathout'])
63 def add_word(self, word) :
64 if word in self.formes :
65 self.formes[word].freq += 1
66 if self.formes[word].ident in self.idformesuces :
67 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
68 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
70 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
72 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
74 if word in self.parent.lexique :
75 gramtype = self.parent.lexique[word][1]
76 lem = self.parent.lexique[word][0]
83 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
84 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
87 """connect corpus to db"""
88 if self.connformes is None :
89 log.info('connexion corpus')
90 self.connuces = sqlite3.connect(self.pathout['uces.db'])
91 self.cuces = self.connuces.cursor()
92 self.connformes = sqlite3.connect(self.pathout['formes.db'])
93 self.cformes = self.connformes.cursor()
94 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
95 self.ccorpus = self.conncorpus.cursor()
96 self.cformes.execute('PRAGMA temp_store=MEMORY;')
97 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
98 self.cformes.execute('PRAGMA synchronous = OFF;')
99 self.cuces.execute('PRAGMA temp_store=MEMORY;')
100 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
101 self.cuces.execute('PRAGMA synchronous = OFF;')
102 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
103 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
104 self.ccorpus.execute('PRAGMA synchronous = OFF;')
106 def read_corpus(self) :
107 log.info('read corpus')
108 self.parametres['syscoding'] = sys.getdefaultencoding()
109 if self.conncorpus is None :
111 res = self.ccorpus.execute('SELECT * FROM etoiles;')
113 self.ucis.append(Uci(row[0], row[1], row[2]))
114 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,))
116 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
117 res = self.ccorpus.execute('SELECT * FROM formes;')
118 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the uce ids recorded in the ``uces`` table for one form.

    ``wordid`` is either a form identifier or the form itself (a string);
    a string is first resolved through ``self.formes[wordid].ident``.
    Each row's first column is either a single integer or a
    whitespace-separated string of integers; all values are flattened
    into one list, in row order.
    """
    if isinstance(wordid, basestring):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    uces = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            uces.append(cell)
        else:
            uces.extend(int(val) for val in cell.split())
    return uces
def getworducis(self, wordid):
    """Return the distinct ``uci`` values of the uces returned by ``getworduces(wordid)``.

    The result is a list built from a set, so its order is not significant.
    """
    uce_ids = self.getworduces(wordid)
    distinct = set(self.getucefromid(uce_id).uci for uce_id in uce_ids)
    return list(distinct)
131 def getformeuceseff(self, formeid) :
132 if isinstance(formeid, basestring) :
133 formeid = self.formes[formeid].ident
134 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,))
135 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
136 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
137 res = self.cformes.execute(query)
138 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
140 for i, uce in enumerate(uces) :
141 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct uce ids stored for every form of ``lem``.

    The form identifiers are the keys of ``self.lems[lem].formes``; they
    are inlined into an ``IN (...)`` clause on the ``uces`` table. Each
    row's first column is either an integer or a whitespace-separated
    string of integers. Duplicates are removed through a set, so the
    order of the returned list is not significant.
    """
    id_list = ', '.join([repr(ident) for ident in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % id_list
    found = set()
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            found.add(cell)
        else:
            found.update(int(val) for val in cell.split())
    return list(found)
def getlemucis(self, lem):
    """Return the distinct ``uci`` values of the uces returned by ``getlemuces(lem)``.

    The result is a list built from a set, so its order is not significant.
    """
    distinct = set(self.getucefromid(uce_id).uci for uce_id in self.getlemuces(lem))
    return list(distinct)
154 def getlemuceseff(self, lem, luces = None) :
155 formesid = ', '.join([`val` for val in self.lems[lem].formes])
156 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
157 res = self.cformes.execute(query)
158 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
159 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
160 res = self.cformes.execute(query)
161 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
163 for i, uce in enumerate(uces) :
164 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count how many entries of ``self.lc[cluster]`` also appear in ``getlemuces(lem)``."""
    cluster_uces = set(self.lc[cluster])
    shared = cluster_uces.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute of ``self.lems[lem]``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return ``self.idformes[formeid]``.

    When ``self.idformes`` is still ``None``, ``make_idformes()`` is
    called first to build it.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of ``freq`` over every entry of ``self.formes``."""
    total = 0
    for word in self.formes.values():
        total += word.freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    uce_count = self.getucenb()
    return occurrences / uce_count
187 return self.ucis[-1].uces[-1].ident + 1
190 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one total per uci, summed from ``getucesize()``.

    For each uci, the sizes are summed over the slice running from the
    ``ident`` of its first uce to the ``ident`` of its last uce, inclusive.
    """
    sizes = self.getucesize()
    totals = []
    for uci in self.ucis:
        first = uci.uces[0].ident
        last = uci.uces[-1].ident
        totals.append(sum(sizes[first:last + 1]))
    return totals
def getucesize(self):
    """Return, for each row of ``getalluces()``, the number of whitespace-separated tokens in its second column."""
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces):
    """Run a ``select *`` on the ``uces`` table for the given ids.

    The ids are inlined into an ``IN (...)`` clause using their ``repr``.
    The value returned is whatever ``self.cuces.execute`` returns.
    """
    id_list = ', '.join([repr(uce) for uce in uces])
    query = 'select * from uces where id IN (%s);' % id_list
    return self.cuces.execute(query)
203 def getuciconcorde(self, ucis) :
204 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
205 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the uces returned by ``getworduces(word)``."""
    uce_ids = self.getworduces(word)
    return self.getconcorde(uce_ids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the uces returned by ``getlemuces(lem)``."""
    uce_ids = self.getlemuces(lem)
    return self.getconcorde(uce_ids)
def getalluces(self):
    """Execute ``SELECT * FROM uces`` on ``self.cuces`` and return the result of that call."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getallucis(self):
    """Return ``[uci.ident, text]`` pairs, one per uci.

    ``text`` joins with newlines the second column of the ``getalluces()``
    rows, picked by the ``ident`` of each uce of the uci (used as a list
    index into the rows).
    """
    texts = [row[1] for row in self.getalluces()]
    pairs = []
    for uci in self.ucis:
        parts = [texts[uce.ident] for uce in uci.uces]
        pairs.append([uci.ident, '\n'.join(parts)])
    return pairs
def getucesfrometoile(self, etoile):
    """Return the ``ident`` of every uce whose uci lists ``etoile`` in its ``etoiles``."""
    idents = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                idents.append(uce.ident)
    return idents
224 def getetoileuces(self) :
225 log.info('get uces etoiles')
228 for uci in self.ucis :
229 etoiles = uci.etoiles[1:]
231 if et in etoileuces :
232 etoileuces[et] += [uce.ident for uce in uci.uces]
234 etoileuces[et] = [uce.ident for uce in uci.uces]
236 for et in uci.paras :
237 if et in etoileuces :
238 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
240 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
246 def getetoileucis(self):
248 for uci in self.ucis :
249 etoiles = uci.etoiles[1:]
251 if et in etoileuces :
252 etoileuces[et] += [uci.ident]
254 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return ``self.iduces[uceid]``.

    When ``self.iduces`` is still ``None``, ``make_iduces()`` is called
    first to build it.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    count = 0
    for word in self.formes.values():
        if word.freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return how many entries of ``self.lems`` have an ``act`` attribute equal to ``key``."""
    count = 0
    for entry in self.lems.values():
        if entry.act == key:
            count += 1
    return count
266 # def make_lems(self, lem = True) :
267 # log.info('make lems')
269 # for forme in self.formes :
270 # if self.formes[forme].lem in self.lems :
271 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
272 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
274 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the uci that contains the uce ``uceid``.

    The uce-ident to uci-ident mapping is built on first use and kept in
    ``self.uceuci``; the uci ident is then used as an index into
    ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
280 def make_lems(self, lem = True) :
281 log.info('make lems')
284 for forme in self.formes :
285 if self.formes[forme].lem in self.lems :
286 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
287 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
289 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
291 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to its entry in ``self.formes``."""
    index = {}
    for word in self.formes.values():
        index[word.ident] = word
    self.idformes = index
def make_iduces(self):
    """Build ``self.iduces``, mapping each uce's ``ident`` to the uce itself.

    Nothing is done when ``self.iduces`` is already set.
    """
    if self.iduces is not None:
        return
    index = {}
    for uci in self.ucis:
        for uce in uci.uces:
            index[uce.ident] = uce
    self.iduces = index
300 def make_lexitable(self, mineff, etoiles, gram = 0) :
305 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
306 etuces = [[] for et in etoiles]
307 for uci in self.ucis :
308 get = list(set(uci.etoiles).intersection(etoiles))
310 log.info('2 variables sur une ligne')
312 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
313 etuces = [set(val) for val in etuces]
316 deff = self.getlemuceseff(lem)
318 tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
319 tab.insert(0, [''] + etoiles)
322 def make_efftype_from_etoiles(self, etoiles) :
324 etuces = [[] for et in etoiles]
325 for uci in self.ucis :
326 get = list(set(uci.etoiles).intersection(etoiles))
328 return '2 variables sur la meme ligne'
330 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
331 etuces = [set(val) for val in etuces]
332 for lem in self.lems :
333 deff = self.getlemuceseff(lem)
335 gram = self.lems[lem].gram
337 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
339 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
340 tabout = [[gram] + dtype[gram] for gram in dtype]
341 tabout.insert(0, [''] + etoiles)
344 def make_uceactsize(self, actives) :
345 res = self.getalluces()
348 deff = self.getlemuceseff(lem)
350 ucesize[uce] = ucesize.get(uce, 0) + 1
353 def make_uc(self, actives, lim1, lim2) :
354 uceactsize = self.make_uceactsize(actives)
360 for uce in [uce for uci in self.ucis for uce in uci.uces] :
361 if uce.para == lastpara :
363 last1 += uceactsize.get(uce.ident,0)
364 uc1[-1].append(uce.ident)
366 uc1.append([uce.ident])
369 last2 += uceactsize.get(uce.ident, 0)
370 uc2[-1].append(uce.ident)
372 uc2.append([uce.ident])
375 last1 = uceactsize.get(uce.ident, 0)
376 last2 = uceactsize.get(uce.ident, 0)
378 uc1.append([uce.ident])
379 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings, write their matrices and their uce/uc listings.

    ``make_uc(actives, sizeuc1, sizeuc2)`` supplies the two groupings.
    Each one is written with ``write_ucmatrix`` (to ``uc1out`` and
    ``uc2out``), then listed as ``uce;uc`` lines in ``listuce1out`` and
    ``listuce2out``, where ``uc`` is the position of the group holding the
    uce. Returns the number of groups in each grouping as a 2-tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)

    def listing(groups):
        # One header row, then one [uce, group index] row per uce.
        rows = [['uce', 'uc']]
        for index, members in enumerate(groups):
            for uce in members:
                rows.append([repr(uce), repr(index)])
        return rows

    # Both listings are built before any listing file is opened.
    rows1 = listing(uc1)
    rows2 = listing(uc2)
    with open(listuce1out, 'w') as out:
        out.write('\n'.join([';'.join(row) for row in rows1]))
    with open(listuce2out, 'w') as out:
        out.write('\n'.join([';'.join(row) for row in rows2]))
    return len(uc1), len(uc2)
395 def write_ucmatrix(self, uc, actives, fileout) :
396 log.info('write uc matrix %s' % fileout)
397 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
400 with open(fileout + '~', 'w+') as f :
401 for i, lem in enumerate(actives) :
402 for uce in self.getlemuces(lem):
403 if (uces_uc[uce], i) not in deja_la :
405 f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n']))
406 deja_la[(uces_uc[uce], i)] = 0
408 with open(fileout, 'w') as ffin :
409 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
412 os.remove(fileout + '~')
415 def export_corpus(self, outf) :
416 #outf = 'export_corpus.txt'
418 res = self.getalluces()
422 with open(outf,'w') as f :
424 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
425 f.write(uce[1].encode(self.parametres['syscoding']) + '\n')
426 elif self.iduces[uce[0]].uci != actuci :
427 actuci = self.iduces[uce[0]].uci
428 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
429 actpara = self.iduces[uce[0]].para
430 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n')
433 actpara = self.iduces[uce[0]].para
434 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
435 elif self.iduces[uce[0]].para != actpara :
436 actpara = self.iduces[uce[0]].para
438 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
440 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
442 for i, lc in enumerate(self.lc) :
445 for uce in self.lc0 :
448 res = self.getalluces()
451 res = self.getallucis()
452 with open(outf, 'w') as f :
456 actuci = self.iduces[uce[0]].uci
460 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
462 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
464 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
465 f.write(etline.encode(self.parametres['syscoding']) + '\n')
466 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
468 def export_classe(self, outf, classe, lem = False, uci = False) :
469 sts = self.lc[classe - 1]
471 res = self.getconcorde(sts)
474 res = self.getuciconcorde(sts)
475 with open(outf, 'w') as f :
479 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
481 f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n')
483 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
484 f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
486 def export_owledge(self, rep, classe, lem = False, uci = False) :
487 sts = self.lc[classe - 1]
489 res = self.getconcorde(sts)
492 res = self.getuciconcorde(sts)
496 outf = '.'.join([`ident`, 'txt'])
497 outf = os.path.join(rep, outf)
499 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
500 with open(outf, 'w') as f :
501 f.write(guce.encode('cp1252', errors = 'replace'))
503 def export_tropes(self, fileout, classe, lem = False, uci = False) :
504 sts = self.lc[classe - 1]
506 res = self.getconcorde(sts)
509 res = self.getuciconcorde(sts)
510 with open(fileout, 'w') as f :
514 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
515 f.write(guce.encode('cp1252', errors = 'replace'))
518 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
519 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
521 with open(outfile + '~', 'w+') as f :
522 for i, lem in enumerate(actives) :
523 for uce in sorted(self.getlemuces(lem)) :
525 f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
527 with open(outfile, 'w') as ffin :
528 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
531 os.remove(outfile + '~')
533 with open(listuce, 'w') as f :
534 f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())]))
536 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
537 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
539 with open(outfile + '~', 'w+') as f :
540 for i, lem in enumerate(actives) :
541 for uci in sorted(self.getlemucis(lem)) :
543 f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
545 with open(outfile, 'w') as ffin :
546 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
549 os.remove(outfile + '~')
551 with open(listuci, 'w') as f :
552 f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())]))
554 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
555 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
557 duces = dict([[uce, i] for i, uce in enumerate(uces)])
558 with open(outfile + '~', 'w+') as f :
559 for i, lem in enumerate(actives) :
560 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
562 f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
564 with open(outfile, 'w') as ffin :
565 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
568 os.remove(outfile + '~')
570 def make_table_with_classe(self, uces, list_act, uci = False) :
571 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
572 uces = dict([[uce, i] for i, uce in enumerate(uces)])
574 getlem = self.getlemucis
576 getlem = self.getlemuces
577 for i, lem in enumerate(list_act) :
578 lemuces = list(set(getlem(lem)).intersection(uces))
580 table_uce[uces[uce]][i] = 1
581 table_uce.insert(0, list_act)
584 def make_pondtable_with_classe(self, uces, list_act) :
585 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
586 uces = dict([[uce, i] for i, uce in enumerate(uces)])
587 for i, lem in enumerate(list_act) :
588 uceseff = self.getlemuceseff(lem)
589 lemuces = list(set(uceseff.keys()).intersection(uces))
591 table_uce[uces[uce]][i] = uceseff[uce]
592 table_uce.insert(0, list_act)
595 def parse_active(self, gramact, gramsup = None) :
596 log.info('parse actives')
597 for lem in self.lems :
598 if lem.startswith('_') and lem.endswith('_') :
599 self.lems[lem].act = 2
600 elif self.lems[lem].gram in gramact :
601 self.lems[lem].act = 1
602 elif gramsup is not None and self.lems[lem].gram not in gramact:
603 if self.lems[lem].gram in gramsup :
604 self.lems[lem].act = 2
606 self.lems[lem].act = 0
608 self.lems[lem].act = 2
610 def make_actives_limit(self, limit, key = 1) :
611 if self.idformes is None :
613 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
615 def make_actives_nb(self, nbmax, key) :
616 log.info('make_actives_nb : %i - %i' % (nbmax,key))
617 if self.idformes is None :
619 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
620 self.activenb = len(allactives)
621 allactives = sorted(allactives, reverse = True)
622 if self.activenb == 0 :
624 if len(allactives) <= nbmax :
625 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
626 return [val[1] for val in allactives], allactives[-1][0]
628 effs = [val[0] for val in allactives]
629 if effs.count(effs[nbmax - 1]) > 1 :
630 lim = effs[nbmax - 1] + 1
634 stop = effs.index(lim)
641 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
642 return [val[1] for val in allactives[0:stop + 1]], lim
644 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
645 log.info('formes/classes')
647 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
649 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
650 tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
651 with open(fileout, 'w') as f :
652 f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
654 def make_etoiles(self) :
656 for uci in self.ucis :
657 etoiles.update(uci.etoiles[1:])
660 def make_etoiles_dict(self) :
661 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
663 for etoile in etoiles :
664 et = etoile.split('_')
667 endet = '_'.join(et[1:])
668 if etoile in det[et[0]] :
669 det[et[0]][etoile] += 1
671 det[et[0]][etoile] = 1
676 endet = '_'.join(et[1:])
677 det[et[0]] = {etoile :1}
682 def make_etline(self, listet) :
683 etuces = [[] for et in listet]
684 for uci in self.ucis :
685 get = list(set(uci.etoiles).intersection(listet))
687 return '2 variables sur la meme ligne'
689 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
692 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
693 log.info('etoiles/classes')
695 etoileuces = self.getetoileuces()
697 etoileuces = self.getetoileucis()
698 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
699 with open(fileout, 'w') as f :
700 f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
701 #etoiles = self.make_etoiles()
702 #with open(fileout, 'w') as f :
703 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
705 def make_colored_corpus(self, uci = False) :
707 for i, lc in enumerate(self.lc) :
710 for uce in self.lc0 :
712 color = ['black'] + colors[len(self.lc) - 1]
714 <meta http-equiv="content-Type" content="text/html; charset=%s" />
716 ''' % sys.getdefaultencoding()
718 res = self.getalluces()
723 if self.iduces[uce[0]].uci != actuci :
724 actuci = self.iduces[uce[0]].uci
725 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
726 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
728 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
730 res = self.getallucis()
733 if self.ucis[uce[0]].ident != actuci :
734 actuci = self.ucis[uce[0]].ident
735 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
736 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
738 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
739 return txt + '\n</body></html>'
741 def count_from_list(self, l, d) :
749 def count_from_list_cl(self, l, d, a, clnb) :
758 def find_segments(self, taille_segment, taille_limite) :
760 for uce in self.getalluces() :
762 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
763 l = [[d[val], val] for val in d if d[val] >= 3]
766 if len(l) > taille_limite :
767 l = l[-taille_limite:]
770 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
773 concorde = self.getconcorde
775 concorde = self.getuciconcorde
776 for uce in concorde(list_uce) :
778 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
779 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
782 if len(l) > taille_limite :
783 l = l[-taille_limite:]
786 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
788 for b, classe in enumerate(self.lc) :
789 for uce in self.getconcorde(classe) :
792 uce = [self.formes[forme].lem for forme in uce]
793 for taille_segment in range(lenmin,lenmax) :
794 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
795 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
796 with open(fileout, 'w') as f :
797 f.write('\n'.join([';'.join(line) for line in result]))
799 def make_proftype(self, outf) :
801 for lem in self.lems :
802 gram = self.lems[lem].gram
804 res[gram] = [0 for val in self.lc]
805 lemuceeff = self.getlemuceseff(lem)
806 for i, classe in enumerate(self.lc) :
807 concern = set(classe).intersection(lemuceeff.keys())
808 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
809 res = [[gram] + [`val` for val in res[gram]] for gram in res]
811 with open(outf, 'w') as f :
812 f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding']))
815 def make_ucecl_from_R(self, filein) :
816 with open(filein, 'rU') as f :
821 line = line.replace('\n', '').replace('"', '').split(';')
822 self.lc.append([int(line[0]) - 1, int(line[1])])
823 classesl = [val[1] for val in self.lc]
825 self.lc = sorted(self.lc, key=itemgetter(1))
826 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
827 self.lc0 = self.lc.pop(0)
830 def get_stat_by_cluster(self, outf, lclasses = None) :
831 log.info('get_stat_by_cluster')
832 if lclasses is None :
835 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
836 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
837 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
838 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
839 sets = [set(cl) for cl in lclasses]
840 for forme in self.formes :
841 formeuceeff = self.getformeuceseff(forme)
842 for i, classe in enumerate(lclasses) :
843 concern = sets[i].intersection(formeuceeff.keys())
845 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
847 if self.formes[forme].freq == 1 :
849 log.info('%f' % (time() - t1))
850 if outf is not None :
851 toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
852 with open(outf, 'w') as f :
855 return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
857 def get_stat_by_et(self, outf, etoiles) :
858 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
859 stats = self.get_stat_by_cluster(None, lclasses)
860 stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
862 def gethapaxbyet(self, etoiles) :
863 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
865 for uce in hapaxuces :
866 if uce in hucesdict :
870 etuces = [[] for et in etoiles]
871 for uci in self.ucis :
872 get = list(set(uci.etoiles).intersection(etoiles))
874 return '2 variables sur la meme ligne'
876 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
877 etuces = [set(val) for val in etuces]
878 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
880 def gethapaxuces(self) :
881 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
882 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
884 for i,uce in enumerate(hapaxuces) :
885 if uce in hucesdict :
886 hucesdict[uce][0] += 1
887 hucesdict[uce][1].append(hapax[i])
889 hucesdict[uce] = [1,[hapax[i]]]
891 for uce in hucesdict :
892 if hucesdict[uce][0] in huces :
893 huces[hucesdict[uce][0]].append(uce)
895 huces[hucesdict[uce][0]] = [uce]
896 huces = zip(huces, huces.values())
897 huces.sort(reverse=True)
901 for nb in huces[0:4] :
902 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
904 res = self.getconcorde([uce])
906 ucetxt = ' ' + row[1] + ' '
908 for hap in hucesdict[uce][1] :
909 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
910 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
911 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
912 txt += '<p>'+ucetxt+'</p>\n'
916 with open('/tmp/testhapxuce.html','w') as f :
def export_dictionary(self, fileout, syscoding):
    """Write the forms to ``fileout``, one tab-separated line per form.

    Each line holds the form, its ``lem``, its ``gram`` and the ``repr``
    of its ``freq``. Lines are sorted in descending order of
    ``[freq, form, lem, gram]``. The text is encoded with ``syscoding``
    before being written.
    """
    entries = []
    for forme in self.formes:
        word = self.formes[forme]
        entries.append([word.freq, forme, word.lem, word.gram])
    entries.sort(reverse=True)
    # Move the frequency to the end of each row, as a string.
    rows = [entry[1:] + [repr(entry[0])] for entry in entries]
    with open(fileout, 'w') as out:
        text = '\n'.join(['\t'.join(row) for row in rows])
        out.write(text.encode(syscoding))
926 def export_lems(self, fileout, syscoding) :
928 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
930 with open(fileout, 'w') as f :
931 f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
937 def __init__(self, corpus) :
938 ucinb = corpus.getucinb()
939 ucisize = corpus.getucisize()
940 ucimean = float(sum(ucisize))/float(ucinb)
941 detoile = corpus.make_etoiles_dict()
944 def __init__(self, iduci, line, paraset = None) :
946 self.etoiles = line.split()
948 if paraset is not None :
949 self.paras = paraset.split()
954 def __init__(self, iduce, idpara, iduci) :
960 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
966 if freq is not None :
972 def __init__(self, parent, forme) :
973 self.formes = {forme.ident : forme.freq}
974 self.gram = forme.gram
975 self.freq = forme.freq
def add_forme(self, forme):
    """Record ``forme`` under its ``ident`` in ``self.formes`` and add its ``freq`` to ``self.freq``."""
    count = forme.freq
    self.formes[forme.ident] = count
    self.freq += count
982 def decouperlist(chaine, longueur, longueurOptimale) :
984 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
985 Si on trouve un '$', c'est fini.
986 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
988 separateurs = [[u'.', 6.0], [u'?', 6.0], [u'!', 6.0], [u'£$£', 6.0], [u':', 5.0], [u';', 4.0], [u',', 1.0], [u' ', 0.01]]
989 dsep = dict([[val[0],val[1]] for val in separateurs])
990 trouve = False # si on a trouvé un bon séparateur
991 iDecoupe = 0 # indice du caractere ou il faut decouper
993 longueur = min(longueur, len(chaine) - 1)
994 chaineTravail = chaine[:longueur + 1]
996 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
999 indice = chaineTravail.index(u'$')
1001 iDecoupe = indice - 1
1006 caractere = chaineTravail[nbCar]
1007 distance = abs(longueurOptimale - nbCar) + 1
1008 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1009 if caractere in dsep :
1010 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1011 meilleur[0] = caractere
1012 meilleur[1] = dsep[caractere]
1017 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1019 meilleur[1] = dsep[' ']
1026 #if meilleur[0] != ' ' :
1027 # fin = chaine[iDecoupe + 1:]
1028 # retour = chaineTravail[:iDecoupe]
1030 fin = chaine[iDecoupe + 1:]
1031 retour = chaineTravail[:iDecoupe + 1]
1032 return len(retour) > 0, retour, fin
1033 # si on a rien trouvé
1034 return False, chaine, ''
1036 def testetoile(line) :
1037 return line.startswith(u'****')
1040 return line[0:4].isdigit() and u'*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append a final ``u'$'`` token."""
    tokens = txt.split()
    tokens.append(u'$')
    return tokens
1045 def prep_txtcharact(txt) :
1050 Class for building a corpus
1052 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1053 log.info('begin building corpus...')
1054 self.lexique = lexique
1055 self.expressions = expressions
1057 self.corpus = Corpus(self, parametres_corpus)
1058 self.infile = infile
1060 self.lim = parametres_corpus.get('lim', 1000000)
1061 self.encoding = parametres_corpus['encoding']
1062 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1063 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1064 self.corpus.parametres['uuid'] = str(uuid4())
1065 self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
1066 self.corpus.parametres['type'] = 'corpus'
1067 if self.corpus.parametres['keep_ponct'] :
1068 self.ponctuation_espace = [' ', '']
1070 self.ponctuation_espace = [' ','.', u'£$£', ';', '?', '!', ',', ':','']
1072 self.tolist = self.corpus.parametres.get('tolist', 0)
1079 def prep_makeuce(self) :
1080 method = self.corpus.parametres.get('ucemethod', 0)
1082 self.decouper = decouperlist
1083 self.prep_txt = prep_txtlist
1084 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1086 self.decouper = decoupercharact
1087 self.prep_txt = prep_txtcharact
1088 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1089 log.info('method uce : %s' % method)
1094 self.read_corpus(self.infile)
1095 except Warning, args :
1096 log.info('pas kool %s' % args)
1100 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1101 self.time = time() - t1
1103 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1104 log.info('time : %f' % (time() - t1))
1107 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1108 self.cf = self.conn_f.cursor()
1109 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1110 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1111 self.conn_f.commit()
1112 self.cf = self.conn_f.cursor()
1113 self.cf.execute('PRAGMA temp_store=MEMORY;')
1114 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1115 self.cf.execute('PRAGMA synchronous = OFF;')
1116 self.cf.execute('begin')
1117 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1118 self.c = self.conn.cursor()
1119 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1121 self.c = self.conn.cursor()
1122 self.c.execute('PRAGMA temp_store=MEMORY;')
1123 self.c.execute('PRAGMA journal_mode=MEMORY;')
1124 self.c.execute('PRAGMA synchronous = OFF;')
1125 self.c.execute('begin')
1128 #commit index and close db
1130 self.conn_f.commit()
1131 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1132 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1136 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1137 self.ccorpus = self.conn_corpus.cursor()
1138 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1139 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1140 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1141 self.conn_corpus.commit()
1142 self.ccorpus = self.conn_corpus.cursor()
1143 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1144 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1145 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1146 self.ccorpus.execute('begin')
1147 self.backup_corpus()
1148 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1149 self.conn_corpus.commit()
1150 self.conn_corpus.close()
1151 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Fill ``self.cleans`` with the enabled cleaning callables, in order.

    The order is: lower-casing, first clean, character filtering,
    expressions, apostrophes, hyphens.  Every step except character
    filtering is read with ``parametres.get(option, 1)`` and is therefore
    on unless explicitly disabled; ``parametres['charact']`` must be
    present.  When character filtering is on, ``self.rule`` is set from
    ``parametres['keep_caract']`` (or the built-in default class body).
    """
    options = self.corpus.parametres
    # (option name, bound-method name) pairs placed before / after the
    # character filter, which needs its own handling.
    leading = (('lower', 'dolower'), ('firstclean', 'firstclean'))
    trailing = (
        ('expressions', 'make_expression'),
        ('apos', 'doapos'),
        ('tiret', 'dotiret'),
    )

    for option, method_name in leading:
        if options.get(option, 1):
            self.cleans.append(getattr(self, method_name))

    if options['charact']:
        self.rule = options.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        self.cleans.append(self.docharact)

    for option, method_name in trailing:
        if options.get(option, 1):
            self.cleans.append(getattr(self, method_name))
1168 def make_expression(self,txt) :
1169 for expression in self.expressions:
1170 if expression in txt :
1171 txt = txt.replace(expression, self.expressions[expression][0])
1174 def dolower(self, txt) :
def docharact(self, txt):
    """Blank out every run of characters matched by ``self.rule``.

    ``self.rule`` is used as the body of a regular-expression character
    class; each maximal run of matching characters in *txt* is replaced
    by one space.  Returns the substituted text.
    """
    opening = u"["
    closing = "]+"
    pattern = opening + self.rule + closing
    return re.sub(pattern, ' ', txt)
def doapos(self, txt):
    """Return *txt* with each straight apostrophe turned into a space."""
    apostrophe = u'\''
    blank = u' '
    return txt.replace(apostrophe, blank)
def dotiret(self, txt):
    """Return *txt* with each hyphen-minus turned into a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
def firstclean(self, txt):
    """Normalise quotes/ligatures and isolate punctuation with spaces.

    Substitutions are applied one after another, in the listed order:
    the typographic apostrophe becomes a straight one, the oe ligature
    is expanded, three dots (and, last, the ellipsis character) become
    the `` £$£ `` token, and each of ``? . ! , ; :`` is surrounded by
    spaces.  The three-dot rule must run before the single-dot rule.
    """
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1193 def make_cleans(self, txt) :
1194 for clean in self.cleans :
1198 def backup_uce(self) :
1199 if self.corpus.idformesuces != {} :
1200 log.info('backup %i' % len(self.corpus.idformesuces))
1201 touce = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].keys()])) for forme in self.corpus.idformesuces]
1202 toeff = [(`forme`, ' '.join([`val` for val in self.corpus.idformesuces[forme].values()])) for forme in self.corpus.idformesuces]
1203 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1204 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1205 self.corpus.idformesuces = {}
1208 def backup_corpus(self) :
1209 log.info('start backup corpus')
1211 for uci in self.corpus.ucis :
1212 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1213 for uce in uci.uces :
1214 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(`uci.ident`,`uce.para`,`uce.ident`,))
1215 for forme in self.corpus.formes :
1216 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (`self.corpus.formes[forme].ident`, forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, `self.corpus.formes[forme].freq`,))
1217 log.info('%f' % (time() - t))
def dofinish(self):
    """Record the final corpus statistics in ``self.corpus.parametres``.

    Stores the completion date, the elapsed build time (``self.time``,
    in seconds, formatted as hours/minutes/seconds), the text, segment,
    occurrence and form counts, and a hapax summary line.

    The hapax percentages are reported as 0.00 when the corpus has no
    forms or no occurrences, instead of raising ZeroDivisionError.
    """
    parametres = self.corpus.parametres
    parametres['date'] = datetime.datetime.now().ctime()

    # Break the elapsed seconds down into h / m / s.
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)

    parametres['ucinb'] = self.corpus.getucinb()
    parametres['ucenb'] = self.corpus.getucenb()
    occurrences = self.corpus.gettotocc()
    parametres['occurrences'] = occurrences
    formesnb = len(self.corpus.formes)
    parametres['formesnb'] = formesnb

    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios: an empty corpus must not crash the final report.
    if formesnb:
        pourhapaxf = (float(hapaxnb) / formesnb) * 100
    else:
        pourhapaxf = 0.0
    if occurrences:
        pourhapaxocc = (float(hapaxnb) / occurrences) * 100
    else:
        pourhapaxocc = 0.0
    parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1234 class BuildFromAlceste(BuildCorpus) :
1235 def read_corpus(self, infile) :
1236 if self.dlg is not None :
1237 self.dlg.Pulse('textes : 0 - segments : 0')
1240 if self.corpus.parametres['ucimark'] == 0 :
1241 self.testuci = testetoile
1242 elif self.corpus.parametres['ucimark'] == 1 :
1243 self.testuci = testint
1249 with codecs.open(infile, 'r', self.encoding) as f :
1250 for linenb, line in enumerate(f) :
1251 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1252 if self.testuci(line) :
1255 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1257 self.corpus.ucis.append(Uci(iduci, line))
1260 if self.corpus.ucis[-1].uces == [] :
1261 log.info(u'Empty text : %i' % linenb)
1263 self.corpus.ucis.pop()
1264 self.corpus.ucis.append(Uci(iduci, line))
1265 if self.dlg is not None :
1266 if not (iduci + 1) % 10 :
1267 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1268 elif line.startswith(u'-*') :
1271 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1274 self.corpus.ucis[-1].paras.append(line.split()[0])
1276 raise Exception('paragrapheOT %i' % linenb)
1277 elif line.strip() != '' and iduci != -1 :
1279 if txt != [] and iduci != -1 :
1280 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1285 self.corpus.ucis.pop()
1286 log.info(Exception("Empty text %i" % linenb))
1288 raise Exception('EmptyText %i' % linenb)
1289 if iduci != -1 and iduce != -1:
1292 log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
1293 raise Exception('TextBeforeTextMark %i' % linenb)
1294 except UnicodeDecodeError :
1295 raise Exception("CorpusEncoding")
1297 def treattxt(self, txt, iduce, idpara, iduci) :
1298 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1299 txt = 'laphrasepoursplitter'.join(txt)
1300 txt = self.make_cleans(txt)
1301 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1302 ucetxt = txt.split('laphrasepoursplitter')
1305 txt = self.make_cleans(txt)
1306 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1307 if self.corpus.ucis[-1].paras == [] :
1311 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1312 self.c.execute('INSERT INTO uces VALUES(?,?);', (`iduce`,uce))
1313 if not self.tolist :
1319 self.corpus.add_word(word)
1320 log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
1321 if self.last > self.lim :
1324 return iduce, idpara
1326 def make_uces(self, txt, douce = True, keep_ponct = False) :
1327 txt = ' '.join(txt.split())
1330 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1332 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1335 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1336 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1341 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1343 #decouper (list_sep)
1344 #make_uces (decouper)
1345 #treat_txt (make_uces)
1349 def __init__(self, parent, dlg = None) :
1350 self.parent = parent
1352 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1353 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1354 dial = CorpusPref(parent, parametres)
1355 dial.CenterOnParent()
1356 dial.txtpath.SetLabel(parent.filename)
1357 #dial.repout_choices.SetValue(parametres['pathout'])
1358 self.res = dial.ShowModal()
1359 if self.res == 5100 :
1360 parametres = dial.doparametres()
1361 parametres['originalpath'] = parent.filename
1362 PathOut().createdir(parametres['pathout'])
1363 ReadLexique(self.parent, lang = parametres['lang'])
1364 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1365 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1367 self.parent.expressions = {}
1368 self.parametres = parametres
1370 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    Instantiates ``BuildFromAlceste`` with the parent's filename, the
    stored parameters, the parent's lexicon and expressions, and this
    object's progress dialog, then returns the builder's ``corpus``.
    """
    owner = self.parent
    builder = BuildFromAlceste(
        owner.filename,
        self.parametres,
        owner.lexique,
        owner.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
1378 if __name__ == '__main__' :
1380 parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
1381 intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)