X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;h=eb55b0857be095bbfca46a7ebf13f6fd029851b6;hp=6dc48806e7fd8a43fa527507c7823ed6ba6bbf75;hb=b0333175cc68917ceb33589b0b354bf931fec245;hpb=8fa853a25a9d62b1446e1bc543e5a3a4d0e03dcf diff --git a/corpusNG.py b/corpusNG.py index 6dc4880..eb55b08 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -21,23 +21,6 @@ import datetime log = logging.getLogger('iramuteq.corpus') -#expressions = ReadDicoAsDico('dictionnaires/expression_fr.txt') -#lexique = ReadDicoAsDico('dictionnaires/lexique_fr.txt') -#infile = '/home/pierre/workspace/iramuteq/corpus/lru2.txt' -#infile = '/home/pierre/workspace/iramuteq/corpus/corpussab_cor.txt' -#encoding = 'utf8' -#infile = '/home/pierre/fac/identite/identite_sans_doublons_ok.txt' -#encoding = 'cp1252' -#infile = '/home/pierre/workspace/iramuteq/corpus/Natacha.txt' -#infile = '/home/pierre/fac/cablegate/allcables-all.txt' -#infile = '/home/pierre/fac/cablegate/allcables-08290338.txt' -#tar_in = '/home/pierre/fac/identite/uce.tar.gz -#tar_in = '/home/pierre/fac/cablegate/uce-cable-test.tar.gz' -#tar_infouce = '/home/pierre/fac/identite/info_uce.tar.gz' -#tar_infouce = '/home/pierre/fac/cablegate/info_uce.tar.gz' -#tar_formes = '/home/pierre/fac/identite/tar_formes.tar.gz' -#tar_formes = '/home/pierre/fac/cablegate/tar_formes.tar.gz' - def copycorpus(corpus) : log.info('copy corpus') @@ -71,6 +54,7 @@ class Corpus : self.idformesuces = {} self.iduces = None self.idformes = None + self.uceuci = None if read : self.pathout = PathOut(dirout = parametres['pathout']) self.read_corpus() @@ -238,6 +222,10 @@ class Corpus : # else : # self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0} + def getetbyuceid(self, uceid) : + if self.uceuci is None : self.uceuci = dict([[uce.ident,uci.ident] for uci in self.ucis for uce in uci.uces]) + return self.ucis[self.uceuci[uceid]].etoiles + def make_lems(self, lem = True) : log.info('make lems') self.lems = {} @@ -544,6 +532,64 @@ class Corpus : self.lc0 = self.lc.pop(0) #return ucecl + def gethapaxbyet(self, etoiles) : + hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] + hucesdict = {} + for uce in hapaxuces : + if uce in hucesdict : + hucesdict[uce] += 1 + else : + hucesdict[uce] = 1 + etuces = [[] for et in etoiles] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(etoiles)) + if len(get) > 1 : + return '2 variables sur la meme ligne' + elif get != [] : + etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] + etuces = [set(val) for val in etuces] + return [sum([hucesdict[uce] for uce in list(etuce.intersection(hapaxuces))]) for etuce in etuces] + + def gethapaxuces(self) : + hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] + hapax = [forme for forme in self.lems if self.lems[forme].freq == 1] + hucesdict = {} + for i,uce in enumerate(hapaxuces) : + if uce in hucesdict : + hucesdict[uce][0] += 1 + hucesdict[uce][1].append(hapax[i]) + else : + hucesdict[uce] = [1,[hapax[i]]] + huces = {} + for uce in hucesdict : + if hucesdict[uce][0] in huces : + huces[hucesdict[uce][0]].append(uce) + else : + huces[hucesdict[uce][0]] = [uce] + huces = zip(huces, huces.values()) + huces.sort(reverse=True) + txt = """ + + """ + for nb in huces[0:4] : + txt += "

%i hapax par uce

\n" % nb[0] + for uce in nb[1] : + res = self.getconcorde([uce]) + for row in res : + ucetxt = ' ' + row[1] + ' ' + uceid = row[0] + for hap in hucesdict[uce][1] : + laforme = self.getforme([forme for forme in self.lems[hap].formes][0]).forme + ucetxt = ucetxt.replace(' '+laforme+' ', ' '+laforme+' ') + txt += '

' + ' '.join(self.getetbyuceid(uceid)) + '

' + txt += '

'+ucetxt+'

\n' + txt += """ + + """ + with open('/tmp/testhapxuce.html','w') as f : + f.write(txt) + + class Uci : def __init__(self, iduci, line, paraset = None) : self.ident = iduci @@ -647,7 +693,7 @@ def prep_txtcharact(txt) : class BuildCorpus : """ - Class for building a corpora + Class for building a corpus """ def __init__(self, infile, parametres_corpus, lexique = None, expressions = None, dlg = None) : log.info('begin building corpus...')