X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=a2790f04d1253ba46592bf889b45a7f212c84b15;hp=011206fcc5553f7c6150e410917c0d475ce5c910;hb=287f9e72c3e3d666b016dff0fa3dc39419adfcc2;hpb=83802e662acde01994fe0bd2bf6978fef90b14f8 diff --git a/corpus.py b/corpus.py index 011206f..a2790f0 100644 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar import re import sqlite3 import itertools @@ -171,12 +171,30 @@ class Corpus : query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) - + def gettgenst(self, tgen): - formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) - query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + formesid = [] + for lem in tgen : + if lem in self.lems : + formesid += self.lems[lem].formes + else : + print 'abscent : %s' % lem + query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid)) res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgenstprof(self, tgen, classe, i, clnb): + tgenst = [] + for lem in tgen : + if lem in self.lems : + lemst = self.getlemuces(lem) + tgenst += lemst + if not lem in self.tgenlem : + self.tgenlem[lem] = [0] * clnb + self.tgenlem[lem][i] = len(set(lemst).intersection(classe)) + else : + print 'abscent: ',lem + return list(set(tgenst)) def gettgentxt(self, tgen): sts = self.gettgenst(tgen) @@ -374,7 +392,9 @@ class Corpus : for lem in tokeep : deff = self.getlemuceseff(lem) ucesk = deff.keys() - tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]) + line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + if sum(line[1:]) >= mineff : + tab.append(line) tab.insert(0, [''] + etoiles) return tab @@ -403,10 +423,13 @@ class Corpus : def make_tgen_profile(self, tgen, ucecl, uci = False) : log.info('tgen/classes') + self.tgenlem = {} + clnb = len(ucecl) if uci : - tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + #FIXME : NE MARCHE PLUS CHANGER CA + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] else : - tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] return tab #i = 0 @@ -539,6 +562,14 @@ class Corpus : ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') + def export_meta_table(self, outf) : + metas = [[`i`] + text.etoiles[1:] for i, text in enumerate(self.ucis)] + longueur_max = max([len(val) for val in metas]) + first = ['column_%i' % i for i in range(longueur_max)] + metas.insert(0, first) + with open(outf, 'w') as f : + f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding'])) + def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : @@ -664,7 +695,7 @@ class Corpus : f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n'])) f.seek(0) with open(outfile, 'w') as ffin : - ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl)) + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl)) for line in f : ffin.write(line) os.remove(outfile + '~') @@ -1295,7 +1326,9 @@ class BuildCorpus : self.cleans.append(self.dotiret) def make_expression(self,txt) : - for expression in self.expressions: + exp = self.expressions.keys() + exp.sort(reverse=True) + for expression in exp : if expression in txt : txt = txt.replace(expression, self.expressions[expression][0]) return txt @@ -1581,6 +1614,7 @@ class Builder : def __init__(self, parent, dlg = None) : self.parent = parent self.dlg = dlg + parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] @@ -1589,6 +1623,8 @@ class Builder : dial.txtpath.SetLabel(parent.filename) #dial.repout_choices.SetValue(parametres['pathout']) self.res = dial.ShowModal() + if self.dlg is not None : + self.dlg = progressbar(self.parent, self.dlg) if self.res == 5100 : parametres = dial.doparametres() parametres['originalpath'] = parent.filename @@ -1597,6 +1633,8 @@ class Builder : filein = parametres['dictionary'] else : filein = None + if dial.corpusname.GetValue() != '' : + parametres['corpus_name'] = dial.corpusname.GetValue() dial.Destroy() ReadLexique(self.parent, lang = parametres['lang'], filein = filein) if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):