X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=e6836308acc16d00490c3a392f831c3016a91f99;hp=cd6c36467b587157ab7bbc3b36a3c2efd5edf608;hb=441459f18ef9957b592f26743eea037bca431e55;hpb=e531d59ce2d8f72dfc3138446db913af1f20b134 diff --git a/corpus.py b/corpus.py index cd6c364..e683630 100644 --- a/corpus.py +++ b/corpus.py @@ -38,8 +38,7 @@ def copycorpus(corpus) : class Corpus : """Corpus class - list of uci - + list of text """ def __init__(self, parent, parametres = {}, read = False) : self.parent = parent @@ -275,8 +274,12 @@ class Corpus : if self.iduces is None : self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) - def make_lexitable(self, mineff, etoiles) : - tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff] + def make_lexitable(self, mineff, etoiles, gram = 0) : + if gram == 0 : + grams = {1:'', 2:''} + else : + grams = {gram :''} + tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams] etuces = [[] for et in etoiles] for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) @@ -434,7 +437,7 @@ class Corpus : f.write(guce.encode(self.parametres['syscoding']) + '\n\n') def export_classe(self, outf, classe, lem = False) : - sts = self.lc[classe] + sts = self.lc[classe - 1] res = self.getconcorde(sts) self.make_iduces() with open(outf, 'w') as f : @@ -506,6 +509,17 @@ class Corpus : table_uce[uces[uce]][i] = 1 table_uce.insert(0, list_act) return table_uce + + def make_pondtable_with_classe(self, uces, list_act) : + table_uce = [[0 for val in list_act] for line in range(0,len(uces))] + uces = dict([[uce, i] for i, uce in enumerate(uces)]) + for i, lem in enumerate(list_act) : + uceseff = self.getlemuceseff(lem) + lemuces = list(set(uceseff.keys()).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = uceseff[uce] + table_uce.insert(0, list_act) + return table_uce def parse_active(self, gramact, gramsup = None) : log.info('parse actives') @@ -534,6 +548,8 @@ class Corpus : allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3] self.activenb = len(allactives) allactives = sorted(allactives, reverse = True) + if self.activenb == 0 : + return [], 0 if len(allactives) <= nbmax : log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0])) return [val[1] for val in allactives], allactives[-1][0] @@ -830,8 +846,7 @@ class MakeUciStat : ucinb = corpus.getucinb() ucisize = corpus.getucisize() ucimean = float(sum(ucisize))/float(ucinb) - detoile = corpus.make_etoiles_dict() - + detoile = corpus.make_etoiles_dict() class Uci : def __init__(self, iduci, line, paraset = None) : @@ -1081,7 +1096,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'’',"'") txt = txt.replace(u'œ', u'oe') - return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ') + return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ') def make_cleans(self, txt) : for clean in self.cleans :