X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=849f8302e159ec25a8a0a04e373b68b23754f280;hp=6e667b3f173f272964ffb4b01ff0037a65e36419;hb=417216edd059c3d7a93a2ccba36ccc690121edee;hpb=773ca4248f17f17098e34b3c21004713e09e3109 diff --git a/corpus.py b/corpus.py index 6e667b3..849f830 100644 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar import re import sqlite3 import itertools @@ -173,7 +173,13 @@ class Corpus : return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) def gettgenst(self, tgen): - formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) + formesid = '' + for lem in tgen : + if lem in self.lems : + formesid += ', '.join([`val` for val in self.lems[lem].formes]) + else : + print 'abscent: ',lem + #formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) @@ -374,7 +380,9 @@ class Corpus : for lem in tokeep : deff = self.getlemuceseff(lem) ucesk = deff.keys() - tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]) + line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + if sum(line[1:]) >= mineff : + tab.append(line) tab.insert(0, [''] + etoiles) return tab @@ -539,6 +547,14 @@ class Corpus : ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') + def export_meta_table(self, outf) : + metas = [[`i`] + text.etoiles[1:] for i, text in enumerate(self.ucis)] + longueur_max = max([len(val) for val in metas]) + first = ['column_%i' % i for i in range(longueur_max)] + metas.insert(0, first) + with open(outf, 'w') as f : + f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding'])) + def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : @@ -1295,7 +1311,9 @@ class BuildCorpus : self.cleans.append(self.dotiret) def make_expression(self,txt) : - for expression in self.expressions: + exp = self.expressions.keys() + exp.sort(reverse=True) + for expression in exp : if expression in txt : txt = txt.replace(expression, self.expressions[expression][0]) return txt @@ -1581,6 +1599,7 @@ class Builder : def __init__(self, parent, dlg = None) : self.parent = parent self.dlg = dlg + parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] @@ -1589,6 +1608,8 @@ class Builder : dial.txtpath.SetLabel(parent.filename) #dial.repout_choices.SetValue(parametres['pathout']) self.res = dial.ShowModal() + if self.dlg is not None : + self.dlg = progressbar(self.parent, self.dlg) if self.res == 5100 : parametres = dial.doparametres() parametres['originalpath'] = parent.filename