X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=a2790f04d1253ba46592bf889b45a7f212c84b15;hp=3ced4824a7e546fc14ebd4472d10e5b0e50b40e4;hb=287f9e72c3e3d666b016dff0fa3dc39419adfcc2;hpb=aae91ab48172a83c49bb502ac737bcc1b3a6685c diff --git a/corpus.py b/corpus.py index 3ced482..a2790f0 100644 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar import re import sqlite3 import itertools @@ -171,12 +171,30 @@ class Corpus : query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) - + def gettgenst(self, tgen): - formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) - query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + formesid = [] + for lem in tgen : + if lem in self.lems : + formesid += self.lems[lem].formes + else : + print 'abscent : %s' % lem + query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid)) res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgenstprof(self, tgen, classe, i, clnb): + tgenst = [] + for lem in tgen : + if lem in self.lems : + lemst = self.getlemuces(lem) + tgenst += lemst + if not lem in self.tgenlem : + self.tgenlem[lem] = [0] * clnb + self.tgenlem[lem][i] = len(set(lemst).intersection(classe)) + else : + print 'abscent: ',lem + return list(set(tgenst)) def gettgentxt(self, tgen): sts = self.gettgenst(tgen) @@ -374,7 +392,9 @@ class Corpus : for lem in tokeep : deff = self.getlemuceseff(lem) ucesk = deff.keys() - tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]) + line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + if sum(line[1:]) >= mineff : + tab.append(line) tab.insert(0, [''] + etoiles) return tab @@ -403,10 +423,13 @@ class Corpus : def make_tgen_profile(self, tgen, ucecl, uci = False) : log.info('tgen/classes') + self.tgenlem = {} + clnb = len(ucecl) if uci : - tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + #FIXME : NE MARCHE PLUS CHANGER CA + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] else : - tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] return tab #i = 0 @@ -672,7 +695,7 @@ class Corpus : f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n'])) f.seek(0) with open(outfile, 'w') as ffin : - ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl)) + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl)) for line in f : ffin.write(line) os.remove(outfile + '~') @@ -1303,7 +1326,9 @@ class BuildCorpus : self.cleans.append(self.dotiret) def make_expression(self,txt) : - for expression in self.expressions: + exp = self.expressions.keys() + exp.sort(reverse=True) + for expression in exp : if expression in txt : txt = txt.replace(expression, self.expressions[expression][0]) return txt @@ -1589,6 +1614,7 @@ class Builder : def __init__(self, parent, dlg = None) : self.parent = parent self.dlg = dlg + parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] @@ -1597,6 +1623,8 @@ class Builder : dial.txtpath.SetLabel(parent.filename) #dial.repout_choices.SetValue(parametres['pathout']) self.res = dial.ShowModal() + if self.dlg is not None : + self.dlg = progressbar(self.parent, self.dlg) if self.res == 5100 : parametres = dial.doparametres() parametres['originalpath'] = parent.filename