1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
19 from operator import itemgetter
20 from uuid import uuid4
23 #------test spacy------------
25 #nlp = spacy.load("fr_core_news_lg")
27 #------------------------------------
28 # import des fichiers du projet
29 #------------------------------------
30 from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
31 from chemins import PathOut
32 from dialog import CorpusPref, SubTextFromMetaDial, MergeClusterFrame
33 from colors import colors
39 log = logging.getLogger('iramuteq.corpus')
42 def copycorpus(corpus) :
43 log.info('copy corpus')
44 copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres)
45 copy_corpus.ucis = corpus.ucis
46 copy_corpus.formes = corpus.formes
47 copy_corpus.pathout = corpus.pathout
48 copy_corpus.conn_all()
52 return Uce(uce.ident, uce.para, uce.uci)
55 nuci = Uci(uci.ident, '')
56 nuci.etoiles = copy(uci.etoiles)
57 nuci.uces = [CopyUce(uce) for uce in uci.uces]
58 nuci.paras = copy(uci.paras)
67 def __init__(self, parent, parametres = {}, read = False) :
69 self.parametres = parametres
71 self.connformes = None
73 self.conncorpus = None
80 self.idformesuces = {}
85 self.pathout = PathOut(dirout = parametres['pathout'])
88 def add_word(self, word) :
89 if word in self.formes :
90 self.formes[word].freq += 1
91 if self.formes[word].ident in self.idformesuces :
92 if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] :
93 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1
95 self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1
97 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1}
99 if word in self.parent.lexique :
100 gramtype = self.parent.lexique[word][1]
101 lem = self.parent.lexique[word][0]
102 elif word.isdigit() :
108 self.formes[word] = Word(word, gramtype, len(self.formes), lem)
109 self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
111 def add_word_from_forme(self, word, stident):
112 if word.forme in self.formes :
113 self.formes[word.forme].freq += 1
114 if self.formes[word.forme].ident in self.idformesuces :
115 if stident in self.idformesuces[self.formes[word.forme].ident] :
116 self.idformesuces[self.formes[word.forme].ident][stident] += 1
118 self.idformesuces[self.formes[word.forme].ident][stident] = 1
120 self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
122 self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
123 self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
126 """connect corpus to db"""
127 if self.connformes is None :
128 log.info('connexion corpus')
129 self.connuces = sqlite3.connect(self.pathout['uces.db'])
130 self.cuces = self.connuces.cursor()
131 self.connformes = sqlite3.connect(self.pathout['formes.db'])
132 self.cformes = self.connformes.cursor()
133 self.conncorpus = sqlite3.connect(self.pathout['corpus.db'])
134 self.ccorpus = self.conncorpus.cursor()
135 self.cformes.execute('PRAGMA temp_store=MEMORY;')
136 self.cformes.execute('PRAGMA journal_mode=MEMORY;')
137 self.cformes.execute('PRAGMA synchronous = OFF;')
138 self.cuces.execute('PRAGMA temp_store=MEMORY;')
139 self.cuces.execute('PRAGMA journal_mode=MEMORY;')
140 self.cuces.execute('PRAGMA synchronous = OFF;')
141 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
142 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
143 self.ccorpus.execute('PRAGMA synchronous = OFF;')
145 def read_corpus(self) :
146 log.info('read corpus')
147 self.parametres['syscoding'] = 'utf8'
148 if self.conncorpus is None :
150 res = self.ccorpus.execute('SELECT * FROM etoiles;')
152 self.ucis.append(Uci(row[0], row[1], row[2]))
153 uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(repr(self.ucis[-1].ident),))
155 self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0]))
156 res = self.ccorpus.execute('SELECT * FROM formes;')
157 self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res])
def getworduces(self, wordid):
    """Return the segment (uce) ids recorded for one form.

    `wordid` is either the numeric id of the form or the form itself as a
    string; a string is resolved to its id through ``self.formes``.
    Each selected ``uces`` cell is either a single int or a
    whitespace-separated string of ints; both shapes are flattened into
    one list of ints.
    """
    if isinstance(wordid, str):
        wordid = self.formes[wordid].ident
    rows = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(wordid),))
    segment_ids = []
    for row in rows:
        cell = row[0]
        if isinstance(cell, int):
            segment_ids.append(cell)
        else:
            segment_ids.extend([int(token) for token in cell.split()])
    return segment_ids
def getworducis(self, wordid):
    """Return the distinct `uci` values of the segments that contain a form.

    The segments come from ``getworduces``; each one is looked up with
    ``getucefromid`` and its ``uci`` attribute is collected. The result is
    built from a set, so its order is not meaningful.
    """
    segment_ids = self.getworduces(wordid)
    text_ids = {self.getucefromid(segment).uci for segment in segment_ids}
    return list(text_ids)
170 def getformeuceseff(self, formeid) :
171 if isinstance(formeid, str) :
172 formeid = self.formes[formeid].ident
173 res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(formeid),))
174 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
175 query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid
176 res = self.cformes.execute(query)
177 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
179 for i, uce in enumerate(uces) :
180 formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
def getlemuces(self, lem):
    """Return the distinct segment (uce) ids in which a lemma occurs.

    The ids of the lemma's forms are read from ``self.lems[lem].formes``
    and inlined into the ``IN (...)`` clause of the query. Each selected
    cell is either an int or a whitespace-separated string of ints.
    Duplicates are removed through a set, so the order of the returned
    list is not meaningful.
    """
    id_list = ', '.join([repr(form_id) for form_id in self.lems[lem].formes])
    query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % id_list
    flat = []
    for row in self.cformes.execute(query):
        cell = row[0]
        if isinstance(cell, int):
            flat.append(cell)
        else:
            flat.extend([int(token) for token in cell.split()])
    return list(set(flat))
189 def gettgenst(self, tgen):
192 if lem in self.lems :
193 formesid += self.lems[lem].formes
195 print('abscent : %s' % lem)
196 query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid))
197 res = self.cformes.execute(query)
198 return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
200 def gettgenstprof(self, tgen, classe, i, clnb):
203 if lem in self.lems :
204 lemst = self.getlemuces(lem)
206 if not lem in self.tgenlem :
207 self.tgenlem[lem] = [0] * clnb
208 self.tgenlem[lem][i] = len(set(lemst).intersection(classe))
210 print('abscent: ',lem)
211 return list(set(tgenst))
def gettgentxt(self, tgen):
    """Return the distinct `uci` values of the segments matched by a tgen.

    Segment ids are obtained from ``gettgenst(tgen)``; each is resolved
    with ``getucefromid`` and its ``uci`` attribute is kept once.
    """
    segment_ids = self.gettgenst(tgen)
    text_ids = {self.getucefromid(segment).uci for segment in segment_ids}
    return list(text_ids)
def getlemucis(self, lem):
    """Return the distinct `uci` values of the segments containing a lemma.

    Segment ids come from ``getlemuces(lem)``; each is resolved with
    ``getucefromid`` and its ``uci`` attribute is kept once.
    """
    segment_ids = self.getlemuces(lem)
    text_ids = {self.getucefromid(segment).uci for segment in segment_ids}
    return list(text_ids)
221 def getlemuceseff(self, lem, luces = None) :
222 formesid = ', '.join([repr(val) for val in self.lems[lem].formes])
223 query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
224 res = self.cformes.execute(query)
225 uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
226 query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid
227 res = self.cformes.execute(query)
228 eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
230 for i, uce in enumerate(uces) :
231 lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
def getlemclustereff(self, lem, cluster):
    """Count the segments of ``self.lc[cluster]`` that contain the lemma.

    The count is the size of the intersection between the cluster's
    segment ids and the ids returned by ``getlemuces(lem)``.
    """
    cluster_segments = set(self.lc[cluster])
    shared = cluster_segments.intersection(self.getlemuces(lem))
    return len(shared)
def getlemeff(self, lem):
    """Return the ``freq`` attribute stored for the lemma in ``self.lems``."""
    entry = self.lems[lem]
    return entry.freq
def getforme(self, formeid):
    """Return the entry of ``self.idformes`` for a form id.

    The id-to-form index is built on first use by calling
    ``make_idformes`` when ``self.idformes`` is still None.
    """
    if self.idformes is None:
        self.make_idformes()
    index = self.idformes
    return index[formeid]
def gettotocc(self):
    """Return the sum of the ``freq`` attribute over every entry of ``self.formes``."""
    total = 0
    for key in self.formes:
        total += self.formes[key].freq
    return total
def getucemean(self):
    """Return ``gettotocc()`` divided by ``getucenb()`` as a float."""
    occurrences = float(self.gettotocc())
    segment_count = self.getucenb()
    return occurrences / segment_count
254 return self.ucis[-1].uces[-1].ident + 1
257 return self.ucis[-1].ident + 1
def getucisize(self):
    """Return one size per text (uci) that has at least one segment.

    The per-segment sizes come from ``getucesize``; for each text the
    sizes are summed over the slice running from the ident of its first
    segment to the ident of its last segment (inclusive). Texts without
    segments are left out of the result.
    """
    segment_sizes = self.getucesize()
    text_sizes = []
    for uci in self.ucis:
        if len(uci.uces) == 0:
            continue
        start = uci.uces[0].ident
        stop = uci.uces[-1].ident + 1
        text_sizes.append(sum(segment_sizes[start:stop]))
    return text_sizes
def getucesize(self):
    """Return the number of whitespace-separated tokens of every segment.

    Rows are taken from ``getalluces``; the text is read from column 1 of
    each row.
    """
    sizes = []
    for row in self.getalluces():
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces):
    """Run the segment query for the given ids and return its result.

    The ids are inlined (via ``repr``) into the ``IN (...)`` clause and
    the rows are ordered by id. The value returned is whatever
    ``self.cuces.execute`` returns, not a materialised list.
    """
    id_list = ', '.join([repr(uce_id) for uce_id in uces])
    query = 'select * from uces where id IN (%s) ORDER BY id;' % id_list
    return self.cuces.execute(query)
270 def getuciconcorde(self, ucis) :
271 uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
272 uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
275 def getuciconcorde_uces(self, uciid, uceid) :
276 uces = [uce.ident for uce in self.ucis[uciid].uces]
277 uces = [row for row in self.getconcorde(uces)]
def getwordconcorde(self, word):
    """Return ``getconcorde`` applied to the segment ids of ``getworduces(word)``."""
    segment_ids = self.getworduces(word)
    return self.getconcorde(segment_ids)
def getlemconcorde(self, lem):
    """Return ``getconcorde`` applied to the segment ids of ``getlemuces(lem)``."""
    segment_ids = self.getlemuces(lem)
    return self.getconcorde(segment_ids)
def getalluces(self):
    """Run ``SELECT * FROM uces`` on ``self.cuces`` and return the result of ``execute``."""
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
def getallucis(self):
    """Return ``[ident, text]`` for every text (uci) of the corpus.

    The text of a uci is the newline-joined text of its segments. Segment
    texts are taken from column 1 of the rows of ``getalluces`` and are
    looked up by using each segment's ``ident`` as a list index.
    """
    segment_texts = [row[1] for row in self.getalluces()]
    texts = []
    for uci in self.ucis:
        pieces = [segment_texts[uce.ident] for uce in uci.uces]
        texts.append([uci.ident, '\n'.join(pieces)])
    return texts
def getucesfrometoile(self, etoile):
    """Return the idents of all segments whose text carries the given etoile.

    A text (uci) is selected when ``etoile`` is found in its ``etoiles``;
    the idents of its segments are then appended in corpus order.
    """
    selected = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                selected.append(uce.ident)
    return selected
def getucisfrometoile(self, etoile):
    """Return the distinct `uci` values of the segments carrying an etoile.

    Segment idents are first collected from every text whose ``etoiles``
    contains ``etoile``; each ident is then resolved with
    ``getucefromid`` and its ``uci`` attribute is kept once.
    """
    segment_ids = []
    for uci in self.ucis:
        for uce in uci.uces:
            if etoile in uci.etoiles:
                segment_ids.append(uce.ident)
    text_ids = {self.getucefromid(segment).uci for segment in segment_ids}
    return list(text_ids)
301 def getetoileuces(self) :
302 log.info('get uces etoiles')
305 for uci in self.ucis :
306 etoiles = uci.etoiles[1:]
308 if et in etoileuces :
309 etoileuces[et] += [uce.ident for uce in uci.uces]
311 etoileuces[et] = [uce.ident for uce in uci.uces]
313 for et in uci.paras :
314 if et in etoileuces :
315 etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara]
317 etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara]
323 def getetoileucis(self):
325 for uci in self.ucis :
326 etoiles = uci.etoiles[1:]
328 if et in etoileuces :
329 etoileuces[et] += [uci.ident]
331 etoileuces[et] = [uci.ident]
def getucefromid(self, uceid):
    """Return the entry of ``self.iduces`` for a segment id.

    The id-to-segment index is built on first use by calling
    ``make_iduces`` when ``self.iduces`` is still None.
    """
    if self.iduces is None:
        self.make_iduces()
    index = self.iduces
    return index[uceid]
def gethapaxnb(self):
    """Return how many entries of ``self.formes`` have a ``freq`` equal to 1."""
    count = 0
    for key in self.formes:
        if self.formes[key].freq == 1:
            count += 1
    return count
def getactivesnb(self, key):
    """Return how many entries of ``self.lems`` have an ``act`` equal to ``key``."""
    count = 0
    for name in self.lems:
        if self.lems[name].act == key:
            count += 1
    return count
344 # fonction inactive mais avec une incertitude concernant l'indentation sur le dernier else
345 # def make_lems(self, lem = True) :
346 # log.info('make lems')
348 # for forme in self.formes :
349 # if self.formes[forme].lem in self.lems :
350 # if self.formes[forme].ident not in self.lems[self.formes[forme].lem] :
351 # self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0
353 # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0}
def getetbyuceid(self, uceid):
    """Return the ``etoiles`` of the text that owns a segment.

    A segment-ident -> text-ident map is cached in ``self.uceuci`` the
    first time it is needed; the text ident is then used as an index into
    ``self.ucis``.
    """
    if self.uceuci is None:
        mapping = {}
        for uci in self.ucis:
            for uce in uci.uces:
                mapping[uce.ident] = uci.ident
        self.uceuci = mapping
    owner = self.uceuci[uceid]
    return self.ucis[owner].etoiles
359 def make_lems(self, lem = True) :
360 log.info('make lems')
363 for forme in self.formes :
364 if self.formes[forme].lem in self.lems :
365 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
366 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
368 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
370 self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
372 def make_lems_from_dict(self, dictionnaire, dolem = True) :
373 log.info('make lems from dict')
375 for forme in self.formes :
376 if self.formes[forme].forme in dictionnaire :
377 lem = dictionnaire[forme][0]
378 gram = dictionnaire[forme][1]
379 elif forme.isdigit() :
385 self.formes[forme].lem = lem
386 self.formes[forme].gram = gram
388 if self.formes[forme].lem in self.lems :
389 if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes :
390 self.lems[self.formes[forme].lem].add_forme(self.formes[forme])
392 self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
394 self.lems[forme] = Lem(self, self.formes[forme])
def make_idformes(self):
    """Build ``self.idformes``, mapping each form's ``ident`` to its entry in ``self.formes``."""
    by_ident = {}
    for key in self.formes:
        entry = self.formes[key]
        by_ident[entry.ident] = entry
    self.idformes = by_ident
def make_iduces(self):
    """Build ``self.iduces`` (segment ident -> segment) if it is still None.

    An already populated index is left untouched.
    """
    if self.iduces is not None:
        return
    by_ident = {}
    for uci in self.ucis:
        for uce in uci.uces:
            by_ident[uce.ident] = uce
    self.iduces = by_ident
403 def make_lexitable(self, mineff, etoiles, gram = 0) :
408 tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
409 etuces = [[] for et in etoiles]
410 for uci in self.ucis :
411 get = list(set(uci.etoiles).intersection(etoiles))
413 log.info('2 variables sur une ligne')
415 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
416 etuces = [set(val) for val in etuces]
419 deff = self.getlemuceseff(lem)
420 ucesk = list(deff.keys())
421 line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
422 if sum(line[1:]) >= mineff :
424 tab.insert(0, [''] + etoiles)
427 def make_tgen_table(self, tgen, etoiles, tot = None):
428 lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
429 sets = [set(cl) for cl in lclasses]
430 totoccurrences = dict([[val, 0] for val in etoiles])
432 for forme in self.formes :
433 formeuceeff = self.getformeuceseff(forme)
434 for i, classe in enumerate(lclasses) :
435 concern = sets[i].intersection(list(formeuceeff.keys()))
437 totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern])
438 #tgenoccurrences = dict([[val, 0] for val in etoiles])
441 tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
443 lemuceeff = self.getlemuceseff(lem)
444 for i, classe in enumerate(lclasses) :
445 concern = sets[i].intersection(list(lemuceeff.keys()))
447 tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
448 return tgenoccurrences, totoccurrences
450 def make_tgen_profile(self, tgen, ucecl, uci = False) :
451 log.info('tgen/classes')
455 #FIXME : NE MARCHE PLUS CHANGER CA
456 tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
458 tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
459 tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
464 #while nam + `i` in tgen :
467 #last = [nam] + [`len(classe)` for classe in ucecl]
469 #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
471 #with open(fileout, 'w') as f :
472 # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
474 def make_efftype_from_etoiles(self, etoiles) :
476 etuces = [[] for et in etoiles]
477 for uci in self.ucis :
478 get = list(set(uci.etoiles).intersection(etoiles))
480 return '2 variables sur la meme ligne'
482 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
483 etuces = [set(val) for val in etuces]
484 for lem in self.lems :
485 deff = self.getlemuceseff(lem)
486 ucesk = list(deff.keys())
487 gram = self.lems[lem].gram
489 dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])]
491 dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
492 tabout = [[gram] + dtype[gram] for gram in dtype]
493 tabout.insert(0, [''] + etoiles)
496 def make_uceactsize(self, actives) :
497 res = self.getalluces()
500 deff = self.getlemuceseff(lem)
502 ucesize[uce] = ucesize.get(uce, 0) + 1
505 def make_uc(self, actives, lim1, lim2) :
506 uceactsize = self.make_uceactsize(actives)
512 for uce in [uce for uci in self.ucis for uce in uci.uces] :
513 if uce.para == lastpara :
515 last1 += uceactsize.get(uce.ident,0)
516 uc1[-1].append(uce.ident)
518 uc1.append([uce.ident])
521 last2 += uceactsize.get(uce.ident, 0)
522 uc2[-1].append(uce.ident)
524 uc2.append([uce.ident])
527 last1 = uceactsize.get(uce.ident, 0)
528 last2 = uceactsize.get(uce.ident, 0)
530 uc1.append([uce.ident])
531 uc2.append([uce.ident])
def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out):
    """Build the two uc groupings and write their matrices and uce/uc tables.

    ``make_uc`` produces two lists of segment groups from ``actives`` and
    the two size limits. Each grouping is written with ``write_ucmatrix``
    (to ``uc1out`` / ``uc2out``), then a ``;``-separated table with a
    ``uce;uc`` header and one ``segment;group index`` line per segment is
    written to ``listuce1out`` / ``listuce2out``.

    Returns the number of groups in each grouping as a 2-tuple.
    """
    uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2)
    log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2)))
    self.write_ucmatrix(uc1, actives, uc1out)
    self.write_ucmatrix(uc2, actives, uc2out)
    # Both tables are built before any list file is opened.
    tables = []
    for grouping in (uc1, uc2):
        rows = [['uce', 'uc']]
        for group_index, members in enumerate(grouping):
            for uce in members:
                rows.append([repr(uce), repr(group_index)])
        tables.append('\n'.join([';'.join(row) for row in rows]))
    for target, content in zip((listuce1out, listuce2out), tables):
        with open(target, 'w') as f:
            f.write(content)
    return len(uc1), len(uc2)
547 def write_ucmatrix(self, uc, actives, fileout) :
548 log.info('write uc matrix %s' % fileout)
549 uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl])
552 with open(fileout + '~', 'w+') as f :
553 for i, lem in enumerate(actives) :
554 for uce in self.getlemuces(lem):
555 if (uces_uc[uce], i) not in deja_la :
557 f.write(''.join([' '.join([repr(uces_uc[uce]+1),repr(i+1),repr(1)]),'\n']))
558 deja_la[(uces_uc[uce], i)] = 0
560 with open(fileout, 'w') as ffin :
561 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl))
564 os.remove(fileout + '~')
567 def export_corpus(self, outf) :
568 #outf = 'export_corpus.txt'
570 res = self.getalluces()
574 with open(outf,'w', encoding='utf8') as f :
576 if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
577 f.write(uce[1] + '\n')
578 elif self.iduces[uce[0]].uci != actuci :
579 actuci = self.iduces[uce[0]].uci
580 if self.ucis[self.iduces[uce[0]].uci].paras == [] :
581 actpara = self.iduces[uce[0]].para
582 f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '\n' + uce[1] + '\n')
585 actpara = self.iduces[uce[0]].para
586 f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles), self.ucis[self.iduces[uce[0]].uci].paras[ident], uce[1]] + '\n'))
587 elif self.iduces[uce[0]].para != actpara :
588 actpara = self.iduces[uce[0]].para
590 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident], uce[1]]) + '\n')
def export_meta_table(self, outf):
    """Write the metadata of every text as a tab-separated table.

    Each data row holds the position of the text in ``self.ucis``
    followed by its ``etoiles`` without the first element. A header row
    ``column_0 ... column_{n-1}`` is written first, where ``n`` is the
    length of the longest data row. Shorter rows are not padded.

    An empty corpus (no entry in ``self.ucis``) now yields an empty file
    instead of raising ``ValueError`` from ``max()`` on an empty list.

    outf -- path of the file to write (UTF-8).
    """
    rows = [[repr(position)] + text.etoiles[1:] for position, text in enumerate(self.ucis)]
    # default=0 keeps max() from failing when there is no text at all
    widest = max((len(row) for row in rows), default=0)
    header = ['column_%i' % col for col in range(widest)]
    rows.insert(0, header)
    with open(outf, 'w', encoding='utf8') as f:
        f.write('\n'.join(['\t'.join(row) for row in rows]))
600 def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
602 for i, lc in enumerate(self.lc) :
605 for uce in self.lc0 :
608 res = self.getalluces()
611 res = self.getallucis()
612 with open(outf, 'w', encoding='utf8') as f :
616 actuci = self.iduces[uce[0]].uci
620 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
622 etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
624 etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
625 f.write(etline + '\n')
626 f.write(guce + '\n\n')
628 def export_classe(self, outf, classe, lem = False, uci = False) :
629 sts = self.lc[classe - 1]
631 res = self.getconcorde(sts)
634 res = self.getuciconcorde(sts)
635 with open(outf, 'w', encoding='utf8') as f :
639 f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '\n')
641 f.write(' '.join(self.ucis[uce[0]].etoiles) + '\n')
643 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
644 f.write(guce + '\n\n')
646 def export_owledge(self, rep, classe, lem = False, uci = False) :
647 sts = self.lc[classe - 1]
649 res = self.getconcorde(sts)
652 res = self.getuciconcorde(sts)
656 outf = '.'.join([repr(ident), 'txt'])
657 outf = os.path.join(rep, outf)
659 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
660 with open(outf, 'w', encoding='utf8') as f :
661 f.write(guce) #.encode('cp1252', errors = 'replace'))
663 def export_tropes(self, fileout, classe, lem = False, uci = False) :
664 sts = self.lc[classe - 1]
666 res = self.getconcorde(sts)
669 res = self.getuciconcorde(sts)
670 with open(fileout, 'w', encoding='utf8') as f :
674 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
675 f.write(guce) #.encode('cp1252', errors = 'replace'))
678 def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
679 log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
681 with open(outfile + '~', 'w+') as f :
682 for i, lem in enumerate(actives) :
683 for uce in sorted(self.getlemuces(lem)) :
685 f.write(''.join([' '.join([repr(uce+1), repr(i+1),repr(1)]),'\n']))
687 with open(outfile, 'w') as ffin :
688 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
691 os.remove(outfile + '~')
693 with open(listuce, 'w') as f :
694 f.write('\n'.join(['uce;uc'] + [';'.join([repr(i),repr(i)]) for i in range(0, self.getucenb())]))
696 def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) :
697 log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile)
699 with open(outfile + '~', 'w+') as f :
700 for i, lem in enumerate(actives) :
701 for uci in sorted(self.getlemucis(lem)) :
703 f.write(''.join([' '.join([repr(uci+1), repr(i+1),repr(1)]),'\n']))
705 with open(outfile, 'w') as ffin :
706 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
709 os.remove(outfile + '~')
711 with open(listuci, 'w') as f :
712 f.write('\n'.join(['uci;uc'] + [';'.join([repr(i),repr(i)]) for i in range(0, self.getucinb())]))
714 def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) :
715 log.info('make_and_write_sparse_matrix_from_classe %s' % outfile)
717 duces = dict([[uce, i] for i, uce in enumerate(uces)])
718 with open(outfile + '~', 'w+') as f :
719 for i, lem in enumerate(actives) :
720 uces_ok = list(set(self.getlemuces(lem)).intersection(uces))
722 f.write(''.join([' '.join([repr(duces[uce]+1),repr(i+1),repr(1)]),'\n']))
724 with open(outfile, 'w') as ffin :
725 ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
728 os.remove(outfile + '~')
730 def make_table_with_classe(self, uces, list_act, uci = False) :
731 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
732 uces = dict([[uce, i] for i, uce in enumerate(uces)])
734 getlem = self.getlemucis
736 getlem = self.getlemuces
737 for i, lem in enumerate(list_act) :
738 lemuces = list(set(getlem(lem)).intersection(uces))
740 table_uce[uces[uce]][i] = 1
741 table_uce.insert(0, list_act)
744 def make_pondtable_with_classe(self, uces, list_act) :
745 table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
746 uces = dict([[uce, i] for i, uce in enumerate(uces)])
747 for i, lem in enumerate(list_act) :
748 uceseff = self.getlemuceseff(lem)
749 lemuces = list(set(uceseff.keys()).intersection(uces))
751 table_uce[uces[uce]][i] = uceseff[uce]
752 table_uce.insert(0, list_act)
755 def parse_active(self, gramact, gramsup = None) :
756 log.info('parse actives')
757 for lem in self.lems :
758 if lem.startswith('_') and lem.endswith('_') :
759 self.lems[lem].act = 2
760 elif self.lems[lem].gram in gramact :
761 self.lems[lem].act = 1
762 elif gramsup is not None and self.lems[lem].gram not in gramact:
763 if self.lems[lem].gram in gramsup :
764 self.lems[lem].act = 2
766 self.lems[lem].act = 0
768 self.lems[lem].act = 2
770 def make_actives_limit(self, limit, key = 1) :
771 if self.idformes is None :
773 return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key]
775 def make_actives_nb(self, nbmax, key) :
776 log.info('make_actives_nb : %i - %i' % (nbmax,key))
777 if self.idformes is None :
779 allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
780 self.activenb = len(allactives)
781 allactives = sorted(allactives, reverse = True)
782 if self.activenb == 0 :
784 if len(allactives) <= nbmax :
785 log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
786 return [val[1] for val in allactives], allactives[-1][0]
788 effs = [val[0] for val in allactives]
789 if effs.count(effs[nbmax - 1]) > 1 :
790 lim = effs[nbmax - 1] + 1
794 stop = effs.index(lim)
801 log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
802 return [val[1] for val in allactives[0:stop]], lim
804 def make_and_write_profile(self, actives, ucecl, fileout, uci = False) :
805 log.info('formes/classes')
807 tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
809 tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
810 tab = [[line[0]] + [repr(val) for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
811 with open(fileout, 'w', encoding='utf8') as f :
812 f.write('\n'.join([';'.join(line) for line in tab]))
814 def make_etoiles(self) :
816 for uci in self.ucis :
817 etoiles.update(uci.etoiles[1:])
820 def make_themes(self):
822 for uci in self.ucis :
823 themes.update(uci.paras)
826 def make_etoiles_dict(self) :
827 etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
829 for etoile in etoiles :
830 et = etoile.split('_')
833 endet = '_'.join(et[1:])
834 if etoile in det[et[0]] :
835 det[et[0]][etoile] += 1
837 det[et[0]][etoile] = 1
842 endet = '_'.join(et[1:])
843 det[et[0]] = {etoile :1}
848 def make_theme_dict(self):
849 themes = [val for uci in self.ucis for val in uci.paras]
851 for theme in themes :
852 th = theme.split('_')
855 endth = '_'.join(th[1:])
856 if theme in det[th[0]] :
857 det[th[0]][theme] += 1
859 det[th[0]][theme] = 1
864 endth = '_'.join(th[1:])
865 det[th[0]] = {theme:1}
870 def make_etline(self, listet) :
871 etuces = [[] for et in listet]
872 for uci in self.ucis :
873 get = list(set(uci.etoiles).intersection(listet))
875 return '2 variables sur la meme ligne'
877 etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
880 def make_and_write_profile_et(self, ucecl, fileout, uci = False) :
881 log.info('etoiles/classes')
883 etoileuces = self.getetoileuces()
885 etoileuces = self.getetoileucis()
886 etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 0])
887 with open(fileout, 'w', encoding='utf8') as f :
888 f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding'])
889 #etoiles = self.make_etoiles()
890 #with open(fileout, 'w') as f :
891 # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
893 def make_colored_corpus(self, uci = False) :
895 for i, lc in enumerate(self.lc) :
898 for uce in self.lc0 :
900 color = ['black'] + colors[len(self.lc) - 1]
902 <meta http-equiv="content-Type" content="text/html; charset=utf8" />
906 res = self.getalluces()
911 if self.iduces[uce[0]].uci != actuci :
912 actuci = self.iduces[uce[0]].uci
913 txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
914 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
916 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
918 res = self.getallucis()
921 if self.ucis[uce[0]].ident != actuci :
922 actuci = self.ucis[uce[0]].ident
923 txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
924 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
926 txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
927 return txt + '\n</body></html>'
929 def make_cut_corpus(self, uci = False) :
932 res = self.getalluces()
937 if self.iduces[uce[0]].uci != actuci :
938 actuci = self.iduces[uce[0]].uci
939 txt += '\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '\n'
940 txt += ''.join(['\n',uce[1],'\n'])
942 txt += ''.join(['\n',uce[1],'\n'])
944 res = self.getallucis()
947 if self.ucis[uce[0]].ident != actuci :
948 actuci = self.ucis[uce[0]].ident
949 txt += '\n' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '\n'
950 txt += ''.join(['\n',uce[1],'\n'])
952 txt += ''.join(['\n',uce[1],'\n'])
955 def count_from_list(self, l, d) :
963 def count_from_list_cl(self, l, d, a, clnb) :
972 def find_segments(self, taille_segment, taille_limite) :
974 for uce in self.getalluces() :
976 d = self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
977 l = [[d[val], val] for val in d if d[val] >= 3]
980 if len(l) > taille_limite :
981 l = l[-taille_limite:]
984 def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
987 concorde = self.getconcorde
989 concorde = self.getuciconcorde
990 for uce in concorde(list_uce) :
992 d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
993 l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
996 if len(l) > taille_limite :
997 l = l[-taille_limite:]
1000 def make_segments_profile(self, fileout, lenmin = 3, lenmax = 10, effmin = 50, lem = False) :
1002 for b, classe in enumerate(self.lc) :
1003 for uce in self.getconcorde(classe) :
1004 uce = uce[1].split()
1006 uce = [self.formes[forme].lem for forme in uce]
1007 for taille_segment in range(lenmin,lenmax) :
1008 d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
1009 result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
1010 with open(fileout, 'w', encoding='utf8') as f :
1011 f.write('\n'.join([';'.join(line) for line in result]))
1013 def make_proftype(self, outf) :
1015 for lem in self.lems :
1016 gram = self.lems[lem].gram
1017 if not gram in res :
1018 res[gram] = [0 for val in self.lc]
1019 lemuceeff = self.getlemuceseff(lem)
1020 for i, classe in enumerate(self.lc) :
1021 concern = set(classe).intersection(list(lemuceeff.keys()))
1022 res[gram][i] += sum([lemuceeff[uce] for uce in concern])
1023 res = [[gram] + [repr(val) for val in res[gram]] for gram in res]
1025 with open(outf, 'w', encoding='utf8') as f :
1026 f.write('\n'.join([';'.join(line) for line in res]))
1028 def make_ucecl_from_R(self, filein) :
1029 with open(filein, 'r') as f :
1034 line = line.replace('\n', '').replace('"', '').split(';')
1035 self.lc.append([int(line[0]) - 1, int(line[1])])
1036 classesl = [val[1] for val in self.lc]
1037 clnb = max(classesl)
1038 self.lc = sorted(self.lc, key=itemgetter(1))
1039 self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
1040 self.lc0 = self.lc.pop(0)
1043 def get_stat_by_cluster(self, outf, lclasses = None) :
1044 log.info('get_stat_by_cluster')
1045 if lclasses is None :
1048 occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
1049 formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
1050 hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
1051 lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
1052 sets = [set(cl) for cl in lclasses]
1053 for forme in self.formes :
1054 formeuceeff = self.getformeuceseff(forme)
1055 for i, classe in enumerate(lclasses) :
1056 concern = sets[i].intersection(list(formeuceeff.keys()))
1058 occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
1060 if self.formes[forme].freq == 1 :
1062 log.info('%f' % (time() - t1))
1063 if outf is not None :
1064 toprint = '\n'.join([';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))]) for i in occurrences])
1065 with open(outf, 'w', encoding='utf8') as f :
1068 return [[repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))] for i in occurrences]
def get_stat_by_et(self, outf, etoiles):
    """Build the cluster statistics table for a list of variables.

    For each entry of ``etoiles`` the matching segment ids are obtained from
    ``getucesfrometoile``; the resulting groups are passed to
    ``get_stat_by_cluster`` with no output file, and every returned row is
    prefixed with the variable it belongs to.

    ``outf`` is part of the signature but nothing is written to it here.

    Returns:
        A list of rows ``[etoile] + stats_row``, in the order of ``etoiles``.
        The table was previously computed and then dropped, so the method
        always returned ``None``; returning it is backward compatible for
        callers that ignored the result.
    """
    lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
    stats = self.get_stat_by_cluster(None, lclasses)
    return [[etoiles[i]] + val for i, val in enumerate(stats)]
1075 def gethapaxbyet(self, etoiles) :
1076 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1078 for uce in hapaxuces :
1079 if uce in hucesdict :
1083 etuces = [[] for et in etoiles]
1084 for uci in self.ucis :
1085 get = list(set(uci.etoiles).intersection(etoiles))
1087 return '2 variables sur la meme ligne'
1089 etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
1090 etuces = [set(val) for val in etuces]
1091 return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces]
1093 def gethapaxuces(self) :
1094 hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
1095 hapax = [forme for forme in self.lems if self.lems[forme].freq == 1]
1097 for i,uce in enumerate(hapaxuces) :
1098 if uce in hucesdict :
1099 hucesdict[uce][0] += 1
1100 hucesdict[uce][1].append(hapax[i])
1102 hucesdict[uce] = [1,[hapax[i]]]
1104 for uce in hucesdict :
1105 if hucesdict[uce][0] in huces :
1106 huces[hucesdict[uce][0]].append(uce)
1108 huces[hucesdict[uce][0]] = [uce]
1109 huces = list(zip(huces, list(huces.values())))
1110 huces.sort(reverse=True)
1114 for nb in huces[0:4] :
1115 txt += "<p><h2>%i hapax par uce</h2><p>\n" % nb[0]
1117 res = self.getconcorde([uce])
1119 ucetxt = ' ' + row[1] + ' '
1121 for hap in hucesdict[uce][1] :
1122 laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme
1123 ucetxt = ucetxt.replace(' '+laforme+' ', ' <font color=red>'+laforme+'</font> ')
1124 txt += '<p><b>' + ' '.join(self.getetbyuceid(uceid)) + '</b></p>'
1125 txt += '<p>'+ucetxt+'</p>\n'
1129 with open('/tmp/testhapxuce.html','w', encoding='utf8') as f :
def export_dictionary(self, fileout, syscoding):
    """Dump the forms of the corpus to ``fileout`` as tab-separated rows.

    Each row holds: form, lemma, grammatical type, frequency (``repr`` of the
    count). Rows are ordered by decreasing frequency; forms with the same
    frequency are ordered by decreasing form string. Rows are joined with
    newlines, without a trailing newline.

    ``syscoding`` is accepted but unused: the file is always written as UTF-8.
    """
    formes = self.formes
    # Form strings are dictionary keys, hence unique: (freq, form) fully
    # determines the order.
    ranking = sorted(((formes[name].freq, name) for name in formes), reverse=True)
    rows = []
    for freq, name in ranking:
        entry = formes[name]
        rows.append('\t'.join([name, entry.lem, entry.gram, repr(freq)]))
    with open(fileout, 'w', encoding='utf8') as f:
        f.write('\n'.join(rows))
1139 def export_lems(self, fileout, syscoding) :
1140 self.make_idformes()
1141 listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, repr(self.lems[lem].formes[forme])]) for forme in self.lems[lem].formes])] for lem in self.lems]
1143 with open(fileout, 'w', encoding='utf8') as f :
1144 f.write('\n'.join(['\t'.join(lem) for lem in listlem]))
1148 def __init__(self, corpus) :
1149 ucinb = corpus.getucinb()
1150 ucisize = corpus.getucisize()
1151 ucimean = float(sum(ucisize))/float(ucinb)
1152 detoile = corpus.make_etoiles_dict()
1155 def __init__(self, iduci, line, paraset = None) :
1157 self.etoiles = line.split()
1159 if paraset is not None :
1160 self.paras = paraset.split()
1165 def __init__(self, iduce, idpara, iduci) :
1171 def __init__(self, word, gramtype, idword, lem = None, freq = None) :
1174 self.gram = gramtype
1177 if freq is not None :
def __init__(self, parent, forme):
    """Create a lemma entry seeded with a single form.

    The new entry copies the grammatical type, frequency and ``act`` value of
    ``forme`` and starts its ``formes`` mapping with that form's identifier
    mapped to its frequency.

    ``parent`` is accepted but not stored.
    """
    count = forme.freq
    self.formes = {forme.ident: count}
    self.gram, self.freq, self.act = forme.gram, count, forme.act
def add_forme(self, forme):
    """Attach another form to this lemma.

    Records the form's frequency under its identifier in ``self.formes`` and
    adds that frequency to the lemma's total ``self.freq``.
    """
    count = forme.freq
    self.formes.update({forme.ident: count})
    self.freq += count
1194 def decouperlist(chaine, longueur, longueurOptimale) :
1196 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
1197 Si on trouve un '$', c'est fini.
1198 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
1200 separateurs = [['.', 6.0], ['?', 6.0], ['!', 6.0], ['£$£', 6.0], [':', 5.0], [';', 4.0], [',', 1.0], [' ', 0.01]]
1201 dsep = dict([[val[0],val[1]] for val in separateurs])
1202 trouve = False # si on a trouvé un bon séparateur
1203 iDecoupe = 0 # indice du caractere ou il faut decouper
1204 longueur = min(longueur, len(chaine) - 1)
1205 chaineTravail = chaine[:longueur + 1]
1207 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
1209 indice = chaineTravail.index('$')
1211 iDecoupe = indice - 1
1216 caractere = chaineTravail[nbCar]
1217 distance = abs(longueurOptimale - nbCar) + 1
1218 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
1219 if caractere in dsep :
1220 if (float(dsep[caractere]) / distance) > (float(meilleur[1]) / meilleureDistance) :
1221 meilleur[0] = caractere
1222 meilleur[1] = dsep[caractere]
1227 if (float(dsep[' ']) / distance) > (float(meilleur[1]) / meilleureDistance) :
1229 meilleur[1] = dsep[' ']
1237 #if meilleur[0] != ' ' :
1238 # fin = chaine[iDecoupe + 1:]
1239 # retour = chaineTravail[:iDecoupe]
1242 fin = chaine[iDecoupe + 1:]
1243 retour = chaineTravail[:iDecoupe + 1]
1244 return len(retour) > 0, retour, fin
1245 # si on a rien trouvé
1246 return False, chaine, ''
1248 def testetoile(line) :
1249 return line.startswith('****')
1252 return line[0:4].isdigit() and '*' in line
def prep_txtlist(txt):
    """Split ``txt`` on whitespace and append the ``'$'`` end marker.

    Returns a new list of tokens whose last element is always ``'$'``.
    """
    tokens = txt.split()
    tokens.append('$')
    return tokens
1257 def prep_txtcharact(txt) :
1263 Class for building a corpus
1266 def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) :
1267 log.info('begin building corpus...')
1268 self.lexique = lexique
1269 self.expressions = expressions
1271 self.corpus = Corpus(self, parametres_corpus)
1272 self.infile = infile
1274 self.lim = parametres_corpus.get('lim', 1000000)
1275 self.encoding = parametres_corpus['encoding']
1276 self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
1277 self.corpus.pathout.createdir(parametres_corpus['pathout'])
1278 self.corpus.parametres['uuid'] = str(uuid4())
1279 self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name'] #os.path.split(self.corpus.parametres['pathout'])[1]
1280 self.corpus.parametres['type'] = 'corpus'
1281 if self.corpus.parametres['keep_ponct'] :
1282 self.ponctuation_espace = [' ', '']
1284 self.ponctuation_espace = [' ','.', '£$£', ';', '?', '!', ',', ':','']
1286 self.tolist = self.corpus.parametres.get('tolist', 0)
1293 def prep_makeuce(self) :
1294 method = self.corpus.parametres.get('ucemethod', 0)
1296 self.decouper = decouperlist
1297 self.prep_txt = prep_txtlist
1298 self.ucesize = self.corpus.parametres.get('ucesize', 40)
1300 self.decouper = decoupercharact
1301 self.prep_txt = prep_txtcharact
1302 self.ucesize = self.corpus.parametres.get('ucesize', 240)
1303 log.info('method uce : %s' % method)
1308 self.read_corpus(self.infile)
1309 except Warning as args :
1310 log.info('pas kool %s' % args)
1314 self.corpus.parametres['ira'] = self.corpus.pathout['Corpus.cira']
1315 self.time = time() - t1
1317 DoConf().makeoptions(['corpus'],[self.corpus.parametres], self.corpus.pathout['Corpus.cira'])
1318 log.info('time : %f' % (time() - t1))
1321 self.conn_f = sqlite3.connect(self.corpus.pathout['formes.db'])
1322 self.cf = self.conn_f.cursor()
1323 self.cf.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1324 self.cf.execute('CREATE TABLE IF NOT EXISTS eff (id INTEGER, eff TEXT);')
1325 self.conn_f.commit()
1326 self.cf = self.conn_f.cursor()
1327 self.cf.execute('PRAGMA temp_store=MEMORY;')
1328 self.cf.execute('PRAGMA journal_mode=MEMORY;')
1329 self.cf.execute('PRAGMA synchronous = OFF;')
1330 self.cf.execute('begin')
1331 self.conn = sqlite3.connect(self.corpus.pathout['uces.db'])
1332 self.c = self.conn.cursor()
1333 self.c.execute('CREATE TABLE IF NOT EXISTS uces (id INTEGER, uces TEXT);')
1335 self.c = self.conn.cursor()
1336 self.c.execute('PRAGMA temp_store=MEMORY;')
1337 self.c.execute('PRAGMA journal_mode=MEMORY;')
1338 self.c.execute('PRAGMA synchronous = OFF;')
1339 self.c.execute('begin')
1342 #commit index and close db
1344 self.conn_f.commit()
1345 self.cf.execute('CREATE INDEX iduces ON uces (id);')
1346 self.cf.execute('CREATE INDEX ideff ON eff (id);')
1350 self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
1351 self.ccorpus = self.conn_corpus.cursor()
1352 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
1353 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS luces (uci INTEGER, para INTEGER, uce INTEGER);')
1354 self.ccorpus.execute('CREATE TABLE IF NOT EXISTS formes (ident INTEGER, forme TEXT, lem TEXT, gram TEXT, freq INTEGER);')
1355 self.conn_corpus.commit()
1356 self.ccorpus = self.conn_corpus.cursor()
1357 self.ccorpus.execute('PRAGMA temp_store=MEMORY;')
1358 self.ccorpus.execute('PRAGMA journal_mode=MEMORY;')
1359 self.ccorpus.execute('PRAGMA synchronous = OFF;')
1360 self.ccorpus.execute('begin')
1361 self.backup_corpus()
1362 self.ccorpus.execute('CREATE INDEX iduci ON luces (uci);')
1363 self.conn_corpus.commit()
1364 self.conn_corpus.close()
1365 #self.corpus.parametres['corpus_ira'] = self.corpus.pathout['corpus.cira']
def buildcleans(self):
    """Queue in ``self.cleans`` the cleaning steps enabled by the corpus parameters.

    Steps are appended in a fixed order: lower-casing, first clean, character
    filtering, expressions, apostrophes, hyphens. Every option except
    ``'charact'`` defaults to enabled when absent from the parameters;
    ``'charact'`` must be present (a missing key raises ``KeyError``). When
    character filtering is enabled, ``self.rule`` is set from ``'keep_caract'``
    or from the built-in character class.
    """
    options = self.corpus.parametres
    steps = self.cleans
    for key, step in (('lower', self.dolower), ('firstclean', self.firstclean)):
        if options.get(key, 1):
            steps.append(step)
    if options['charact']:
        self.rule = options.get('keep_caract', "^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
        steps.append(self.docharact)
    trailing = (
        ('expressions', self.make_expression),
        ('apos', self.doapos),
        ('tiret', self.dotiret),
    )
    for key, step in trailing:
        if options.get(key, 1):
            steps.append(step)
1382 def make_expression(self,txt) :
1383 exp = list(self.expressions.keys())
1384 exp.sort(reverse=True)
1385 for expression in exp :
1386 if expression in txt :
1387 txt = txt.replace(expression, self.expressions[expression][0])
1390 def dolower(self, txt) :
def docharact(self, txt):
    """Replace every run of characters matched by ``self.rule`` with one space.

    ``self.rule`` is the inside of a regular-expression character class; it is
    wrapped in ``[...]+`` before substitution, so a rule starting with ``^``
    removes everything that is *not* listed.
    """
    pattern = re.compile(''.join(("[", self.rule, "]+")))
    return pattern.sub(' ', txt)
def doapos(self, txt):
    """Return ``txt`` with every straight apostrophe turned into a space."""
    return ' '.join(txt.split("'"))
def dotiret(self, txt):
    """Return ``txt`` with every hyphen turned into a space."""
    return ' '.join(txt.split('-'))
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation marks.

    The substitutions are applied one after the other, in this order:
    the curly apostrophe becomes a straight one, ``œ`` becomes ``oe``, three
    consecutive dots become the `` £$£ `` token, then ``? . ! , ; :`` are each
    surrounded by spaces, and finally the single-character ellipsis also
    becomes `` £$£ ``. Three-dot sequences are handled before single dots so
    that they are not split.
    """
    substitutions = (
        ('’', "'"),
        ('œ', 'oe'),
        ('...', ' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        ('…', ' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
1409 def make_cleans(self, txt) :
1410 for clean in self.cleans :
1414 def backup_uce(self) :
1415 if self.corpus.idformesuces != {} :
1416 log.info('backup %i' % len(self.corpus.idformesuces))
1417 touce = [(repr(forme), ' '.join([repr(val) for val in list(self.corpus.idformesuces[forme].keys())])) for forme in self.corpus.idformesuces]
1418 toeff = [(repr(forme), ' '.join([repr(val) for val in list(self.corpus.idformesuces[forme].values())])) for forme in self.corpus.idformesuces]
1419 self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
1420 self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
1421 self.corpus.idformesuces = {}
1424 def backup_corpus(self) :
1425 log.info('start backup corpus')
1427 for uci in self.corpus.ucis :
1428 self.ccorpus.execute('INSERT INTO etoiles VALUES (?,?,?);' ,(uci.ident,' '.join(uci.etoiles), ' '.join(uci.paras,)))
1429 for uce in uci.uces :
1430 self.ccorpus.execute('INSERT INTO luces VALUES (?,?,?);',(repr(uci.ident),repr(uce.para),repr(uce.ident),))
1431 for forme in self.corpus.formes :
1432 self.ccorpus.execute('INSERT INTO formes VALUES (?,?,?,?,?);', (repr(self.corpus.formes[forme].ident), forme, self.corpus.formes[forme].lem, self.corpus.formes[forme].gram, repr(self.corpus.formes[forme].freq),))
1433 log.info('%f' % (time() - t))
def dofinish(self):
    """Store the closing figures of the build in ``self.corpus.parametres``.

    Keys written: ``'date'`` (current date and time, ``ctime`` format),
    ``'time'`` (``self.time``, a duration in seconds, rendered as hours,
    minutes and seconds), ``'ucinb'``, ``'ucenb'`` and ``'occurrences'`` (the
    values returned by the corpus ``getucinb``, ``getucenb`` and
    ``gettotocc`` methods), ``'formesnb'`` (number of entries in
    ``corpus.formes``) and ``'hapax'`` (count returned by ``gethapaxnb`` with
    its share of the forms and of the occurrences).

    A corpus with no form or no occurrence yields a share of 0 % instead of
    raising ``ZeroDivisionError``.
    """
    corpus = self.corpus
    parametres = corpus.parametres
    parametres['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    parametres['ucinb'] = corpus.getucinb()
    parametres['ucenb'] = corpus.getucenb()
    parametres['occurrences'] = corpus.gettotocc()
    formesnb = len(corpus.formes)
    parametres['formesnb'] = formesnb
    hapaxnb = corpus.gethapaxnb()
    occurrences = parametres['occurrences']
    # Guard both ratios: an empty corpus has nothing to divide by.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
1449 class BuildSubCorpus(BuildCorpus):
1451 def __init__(self, corpus, parametres, dlg = None) :
1452 log.info('begin subcorpus...')
1456 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
1458 self.parametres = parametres
1459 self.encoding = corpus.parametres['encoding']
1460 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1461 self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
1462 self.corpus.pathout.createdir(parametres['pathout'])
1463 self.corpus.parametres['pathout'] = parametres['pathout']
1464 self.corpus.parametres['meta'] = parametres.get('meta', False)
1465 self.corpus.parametres['uuid'] = str(uuid4())
1466 if parametres.get('frommeta', False) :
1467 print('make subtexts')
1468 self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
1469 elif parametres.get('fromtheme', False) :
1470 print('make subtexts from theme')
1472 for uci in self.ori.ucis :
1473 if uci.paras != [] :
1476 for et in uci.paras :
1477 if et in parametres['meta'] :
1478 newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
1484 nuci.paras = newpara
1485 self.corpus.ucis.append(nuci)
1488 elif parametres.get('fromclusters', False) :
1489 self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
1491 elif parametres.get('fromuceids', False) :
1497 def fromuceids(self):
1499 dictucekeep = dict(list(zip(self.parametres['uceids'], self.parametres['uceids'])))
1501 for uci in self.ori.ucis :
1502 if uci.paras == [] :
1503 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1506 nuci.uces = keepuces
1507 self.corpus.ucis.append(nuci)
1512 for et in uci.paras :
1513 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeepand and uce.para == idpara]
1521 nuci.paras = newpara
1522 self.corpus.ucis.append(nuci)
1524 def read_corpus(self, infile = None):
1525 self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1531 print('redo text, para and st ident')
1532 for uci in self.corpus.ucis :
1533 uci.ident = ident_uci
1535 for uce in uci.uces :
1537 if uce.para != lastpara :
1540 uce.para = ident_para
1542 uce.para = ident_para
1543 newuceident[uce.ident] = ident_uce
1544 uce.ident = ident_uce
1546 print('backup st text and forms')
1547 for row in self.ori.getconcorde(self.olduceid) :
1548 self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(newuceident[row[0]]), row[1]))
1549 for word in row[1].split() :
1550 self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
1554 class BuildFromAlceste(BuildCorpus) :
1556 def read_corpus(self, infile) :
1558 if self.dlg is not None :
1559 self.dlg.Pulse('textes : 0 - segments : 0')
1562 if self.corpus.parametres['ucimark'] == 0 :
1563 self.testuci = testetoile
1564 elif self.corpus.parametres['ucimark'] == 1 :
1565 self.testuci = testint
1571 with codecs.open(infile, 'r', self.encoding) as f :
1572 for linenb, line in enumerate(f) :
1573 line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
1574 if self.testuci(line) :
1577 #doc = nlp(' '.join(txt))
1578 #print([[word, word.pos_, word.lemma_] for word in doc])
1579 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
1581 self.corpus.ucis.append(Uci(iduci, line))
1584 if self.corpus.ucis[-1].uces == [] :
1585 log.info('Empty text : %i' % linenb)
1587 self.corpus.ucis.pop()
1588 self.corpus.ucis.append(Uci(iduci, line))
1589 if self.dlg is not None :
1590 if not (iduci + 1) % 10 :
1591 self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
1592 elif line.startswith('-*') :
1595 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1598 self.corpus.ucis[-1].paras.append(line.split()[0])
1600 raise Exception('paragrapheOT %i' % linenb)
1601 elif line.strip() != '' and iduci != -1 :
1603 if txt != [] and iduci != -1 :
1604 iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
1609 self.corpus.ucis.pop()
1610 log.info(Exception("Empty text %i" % linenb))
1612 raise Exception('EmptyText %i' % linenb)
1613 if iduci != -1 and iduce != -1:
1616 log.info(_("No Text in corpus. Are you sure of the formatting ?"))
1617 raise Exception('TextBeforeTextMark %i' % linenb)
1618 except UnicodeDecodeError :
1619 raise Exception("CorpusEncoding")
1621 def treattxt(self, txt, iduce, idpara, iduci) :
1622 if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
1623 txt = 'laphrasepoursplitter'.join(txt)
1624 txt = self.make_cleans(txt)
1625 txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
1626 ucetxt = txt.split('laphrasepoursplitter')
1629 txt = self.make_cleans(txt)
1630 ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
1631 if self.corpus.ucis[-1].paras == [] :
1635 self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
1636 self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(iduce),uce))
1637 if not self.tolist :
1643 self.corpus.add_word(word)
1644 log.debug(' '.join([repr(iduci),repr(idpara),repr(iduce)]))
1645 if self.last > self.lim :
1648 return iduce, idpara
1650 def make_uces(self, txt, douce = True, keep_ponct = False) :
1651 txt = ' '.join(txt.split())
1654 reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
1656 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1659 reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
1660 uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
1665 return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
1667 #decouper (list_sep)
1668 #make_uces (decouper)
1669 #treat_txt (make_uces)
1674 def __init__(self, parent, dlg = None) :
1675 self.parent = parent
1677 parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
1678 parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
1679 parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
1680 dial = CorpusPref(parent, parametres)
1681 dial.CenterOnParent()
1682 dial.txtpath.SetLabel(parent.filename)
1683 #dial.repout_choices.SetValue(parametres['pathout'])
1684 self.res = dial.ShowModal()
1685 if self.dlg is not None :
1686 self.dlg = progressbar(self.parent, self.dlg)
1687 if self.res == 5100 :
1688 parametres = dial.doparametres()
1689 parametres['originalpath'] = parent.filename
1690 PathOut().createdir(parametres['pathout'])
1691 if parametres.get('dictionary', False) :
1692 filein = parametres['dictionary']
1695 if dial.corpusname.GetValue() != '' :
1696 parametres['corpus_name'] = dial.corpusname.GetValue()
1698 ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
1699 if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
1700 self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
1702 self.parent.expressions = {}
1703 self.parametres = parametres
1706 if self.dlg is not None :
def doanalyse(self):
    """Build the corpus from the parent's file and return it.

    Instantiates ``BuildFromAlceste`` with the parent's file name, the
    parameters stored on this object, the parent's lexicon and expressions
    and the progress dialog, then returns the builder's ``corpus`` attribute.
    """
    parent = self.parent
    builder = BuildFromAlceste(
        parent.filename,
        self.parametres,
        parent.lexique,
        parent.expressions,
        dlg=self.dlg,
    )
    return builder.corpus
1714 def __init__(self, parent, corpus, parametres = None, dlg = None):
1715 self.parent = parent
1718 corpus_name = 'Sub' + corpus.parametres['corpus_name']
1719 if dlg is not None :
1720 busy = wx.BusyInfo(_("Please wait..."), self)
1722 parametres['corpus_name'] = corpus_name
1723 if parametres.get('frommeta', False) :
1724 parametres['meta'] = corpus.make_etoiles()
1725 elif parametres.get('fromtheme', False) :
1726 parametres['meta'] = corpus.make_themes()
1727 elif parametres.get('fromclusters', False) :
1728 parametres['meta'] = [' '.join(['classe', repr(i)]) for i in range(1,parametres['clnb'] + 1)]
1730 parametres['meta'] = []
1731 if 'fromclusters' not in parametres :
1732 parametres['meta'].sort()
1733 if dlg is not None :
1735 dial = SubTextFromMetaDial(parent, parametres)
1736 self.res = dial.ShowModal()
1737 if self.res == 5100 :
1738 if dial.subcorpusname.GetValue() != '' :
1739 corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']])
1740 if corpus_name != '' :
1741 parametres['corpus_name'] = corpus_name
1743 parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name']
1744 pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
1746 while os.path.exists(pathout + '_%i' % i) :
1748 parametres['pathout'] = pathout + '_%i' % i
1749 meta = dial.m_listBox1.GetSelections()
1750 if not 'fromclusters' in parametres :
1751 parametres['meta'] = [parametres['meta'][val] for val in meta]
1753 parametres['meta'] = meta
1754 self.parametres = parametres
def doanalyse(self):
    """Build the sub-corpus and return it.

    Instantiates ``BuildSubCorpus`` from the original corpus (``self.ori``),
    the parameters stored on this object and the progress dialog, then
    returns the builder's ``corpus`` attribute.
    """
    builder = BuildSubCorpus(self.ori, parametres=self.parametres, dlg=self.dlg)
    return builder.corpus
1762 class BuildMergeFromClusters(BuildCorpus):
1764 def __init__(self, analyses, parametres, dlg = None) :
1765 log.info('begin subcorpus...')
1768 self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : 'MergeFromClusters', 'encoding' : 'merge'})
1770 self.analyses = analyses
1772 self.parametres = parametres
1773 #self.encoding = corpus.parametres['encoding']
1774 self.corpus.parametres['corpus_name'] = parametres['corpus_name']
1775 self.corpus.pathout = PathOut(filename = 'MFC', dirout = parametres['pathout'])
1776 self.corpus.pathout.createdir(parametres['pathout'])
1777 self.corpus.parametres['pathout'] = parametres['pathout']
1778 self.corpus.parametres['meta'] = parametres.get('meta', False)
1779 self.corpus.parametres['uuid'] = str(uuid4())
1780 for i, analyse in enumerate(analyses) :
1783 corpus_uuid = analyse['corpus']
1784 #if corpus_uuid not in self.parent.history.openedcorpus :
1785 irapath = parametres['corpusira'][i]
1786 corpus = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1787 ucepath = os.path.join(analyse['pathout'], 'uce.csv')
1788 corpus.make_ucecl_from_R(ucepath)
1790 for j, cl in enumerate(parametres['clusters'][i]) :
1791 #print cl, self.ori.lc[cl-1]
1792 self.parametres['uceids'] = self.ori.lc[cl-1]#[st for st in self.ori['lc'][cl-1]]
1793 self.lcl[i] += self.ori.lc[cl-1]
1794 self.et = parametres['newet'][i][j]
1800 def fromuceids(self):
1802 dictucekeep = dict(list(zip(self.parametres['uceids'], self.parametres['uceids'])))
1804 for uci in self.ori.ucis :
1805 if uci.paras == [] :
1806 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1809 nuci.uces = keepuces
1810 nuci.etoiles.append(self.et)
1811 nuci.analyseid = self.analyseid
1812 self.corpus.ucis.append(nuci)
1817 for et in uci.paras :
1818 keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
1826 nuci.paras = newpara
1827 nuci.etoiles.append(self.et)
1828 nuci.analyseid = self.analyseid
1829 self.corpus.ucis.append(nuci)
1830 #print nuci.etoiles, nuci.ident, nuci.uces
1832 def read_corpus(self, infile = None):
1833 #self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
1839 print('redo text, para and st ident')
1840 for uci in self.corpus.ucis :
1841 #print uci.ident, ident_uci, [uce.ident for uce in uci.uces], uci.etoiles
1842 uci.ident = ident_uci
1844 for uce in uci.uces :
1846 if uce.para != lastpara :
1849 uce.para = ident_para
1851 uce.para = ident_para
1852 newuceident['%i-%i' %(uci.analyseid, uce.ident)] = ident_uce
1853 uce.ident = ident_uce
1856 print('backup st text and forms')
1858 for i, analyse in enumerate(self.analyses) :
1859 #print analyse, self.parametres['corpusira']
1860 irapath = self.parametres['corpusira'][i]
1861 old = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1862 for row in old.getconcorde(self.lcl[i]) :
1863 self.c.execute('INSERT INTO uces VALUES(?,?);', (newuceident['%i-%i' % (i,row[0])], row[1]))
1864 for word in row[1].split() :
1865 self.corpus.add_word_from_forme(old.formes[word], newuceident['%i-%i' % (i,row[0])])
1871 class MergeClusters :
1873 def __init__(self, parent, parametres = None, dlg = None):
1874 self.parent = parent
1877 corpus_name = 'MergeFromClusters'
1878 if dlg is not None :
1879 busy = wx.BusyInfo(_("Please wait..."), self)
1881 parametres['corpus_name'] = corpus_name
1882 if dlg is not None :
1884 dial = MergeClusterFrame(parent)
1885 dial.m_textCtrl4.SetValue(corpus_name)
1886 self.res = dial.ShowModal()
1887 if self.res == 5100 :
1892 if dial.m_textCtrl4.GetValue() != '' :
1893 corpus_name = ''.join([l for l in dial.m_textCtrl4.GetValue() if l.isalnum() or l in ['_']])
1894 if corpus_name != '' :
1895 parametres['corpus_name'] = corpus_name
1897 parametres['corpus_name'] = 'MergeFromClusters'
1898 for cl in dial.selected :
1900 #if corpus_uuid not in self.parent.history.openedcorpus :
1901 irapath = self.parent.history.corpus[corpus_uuid]['ira']
1902 #corpus = Corpus(self.parent, parametres = DoConf(irapath).getoptions('corpus'), read = True)
1903 #self.parent.history.openedcorpus[corpus_uuid] = corpus
1904 if cl[0] not in self.analyses :
1905 analyse = DoConf(dial.irapath[cl[0]]).getoptions()
1906 #ucepath = os.path.join(os.path.dirname(dial.irapath[cl[0]]), 'uce.csv')
1907 #corpus = copycorpus(self.parent.history.openedcorpus[corpus_uuid])
1908 #corpus.make_ucecl_from_R(ucepath)
1909 self.analyses[cl[0]] = analyse
1910 self.clusters[cl[0]] = [cl[2]]
1911 self.newet[cl[0]] = [dial.selected[cl]]
1912 self.corpusira[cl[0]] = irapath
1914 self.clusters[cl[0]].append(cl[2])
1915 self.newet[cl[0]].append(dial.selected[cl])
1916 analyses = [val for val in self.clusters]
1917 clusters = [self.clusters[val] for val in analyses]
1918 self.newet = [self.newet[val] for val in analyses]
1919 corpusira = [self.corpusira[val] for val in analyses]
1920 analyses = [self.analyses[val] for val in analyses]
1921 pathout = os.path.dirname(os.path.dirname(analyses[0]['pathout']))
1922 self.analyses = analyses
1923 pathout = os.path.join(pathout, parametres['corpus_name'])
1925 while os.path.exists(pathout + '_%i' % i) :
1927 parametres['pathout'] = pathout + '_%i' % i
1928 self.parametres = parametres
1929 self.parametres['clusters'] = clusters
1930 self.parametres['newet'] = self.newet
1931 self.parametres['corpusira'] = corpusira
def doanalyse(self):
    """Build the merged corpus and return it.

    Instantiates ``BuildMergeFromClusters`` from the selected analyses, the
    parameters stored on this object and the progress dialog, then returns
    the builder's ``corpus`` attribute.
    """
    builder = BuildMergeFromClusters(self.analyses, parametres=self.parametres, dlg=self.dlg)
    return builder.corpus