X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=e222940b42281882adcfc9828b4b2c1c012325fc;hp=76ebb660911223bf31496b8d0a23aea56c4d2290;hb=dbc9195121789297412a7d46eb9c1f089b77490e;hpb=432118f2ac3d2f8234c388e77d0fb9e14234750f diff --git a/corpus.py b/corpus.py index 76ebb66..e222940 100644 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique import re import sqlite3 import itertools @@ -17,7 +17,6 @@ from operator import itemgetter from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref -from functions import ReadLexique, ReadDicoAsDico from colors import colors import datetime @@ -38,8 +37,7 @@ def copycorpus(corpus) : class Corpus : """Corpus class - list of uci - + list of text """ def __init__(self, parent, parametres = {}, read = False) : self.parent = parent @@ -77,10 +75,10 @@ class Corpus : gramtype = self.parent.lexique[word][1] lem = self.parent.lexique[word][0] elif word.isdigit() : - gramtype = 'num' + gramtype = u'num' lem = word else : - gramtype = 'nr' + gramtype = u'nr' lem = word self.formes[word] = Word(word, gramtype, len(self.formes), lem) self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} @@ -275,8 +273,12 @@ class Corpus : if self.iduces is None : self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) - def make_lexitable(self, mineff, etoiles) : - tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff] + def make_lexitable(self, mineff, etoiles, gram = 0) : + if gram == 0 : + grams = {1:'', 2:''} + else : + grams = {gram :''} + tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams] etuces = [[] for et in etoiles] for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) @@ -434,7 +436,7 @@ class Corpus : f.write(guce.encode(self.parametres['syscoding']) + '\n\n') def export_classe(self, outf, classe, lem = False) : - sts = self.lc[classe] + sts = self.lc[classe - 1] res = self.getconcorde(sts) self.make_iduces() with open(outf, 'w') as f : @@ -506,6 +508,17 @@ class Corpus : table_uce[uces[uce]][i] = 1 table_uce.insert(0, list_act) return table_uce + + def make_pondtable_with_classe(self, uces, list_act) : + table_uce = [[0 for val in list_act] for line in range(0,len(uces))] + uces = dict([[uce, i] for i, uce in enumerate(uces)]) + for i, lem in enumerate(list_act) : + uceseff = self.getlemuceseff(lem) + lemuces = list(set(uceseff.keys()).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = uceseff[uce] + table_uce.insert(0, list_act) + return table_uce def parse_active(self, gramact, gramsup = None) : log.info('parse actives') @@ -514,7 +527,7 @@ class Corpus : self.lems[lem].act = 2 elif self.lems[lem].gram in gramact : self.lems[lem].act = 1 - elif gramsup is not None : + elif gramsup is not None and self.lems[lem].gram not in gramact: if self.lems[lem].gram in gramsup : self.lems[lem].act = 2 else : @@ -534,6 +547,8 @@ class Corpus : allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3] self.activenb = len(allactives) allactives = sorted(allactives, reverse = True) + if self.activenb == 0 : + return [], 0 if len(allactives) <= nbmax : log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0])) return [val[1] for val in allactives], allactives[-1][0] @@ -830,8 +845,7 @@ class MakeUciStat : ucinb = corpus.getucinb() ucisize = corpus.getucisize() ucimean = float(sum(ucisize))/float(ucinb) - detoile = corpus.make_etoiles_dict() - + detoile = corpus.make_etoiles_dict() class Uci : def __init__(self, iduci, line, paraset = None) : @@ -1081,7 +1095,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'’',"'") txt = txt.replace(u'œ', u'oe') - return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ') + return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ') def make_cleans(self, txt) : for clean in self.cleans :