X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=5a18f628b0fcd0a5118ffbdbedd2d52362322719;hp=e0437077ff0fc5642a68f89a82ca2b5777da7ea3;hb=278fceaa7db7b84d7c6f3bbd3f86e5ddb0ebda09;hpb=d1d24d86422c9e9805516190ea17a379201f9300 diff --git a/corpus.py b/corpus.py index e043707..5a18f62 100644 --- a/corpus.py +++ b/corpus.py @@ -11,7 +11,6 @@ from time import time from functions import decoupercharact, ReadDicoAsDico, DoConf import re import sqlite3 -import numpy import itertools import logging from operator import itemgetter @@ -78,10 +77,10 @@ class Corpus : gramtype = self.parent.lexique[word][1] lem = self.parent.lexique[word][0] elif word.isdigit() : - gramtype = 'num' + gramtype = u'num' lem = word else : - gramtype = 'nr' + gramtype = u'nr' lem = word self.formes[word] = Word(word, gramtype, len(self.formes), lem) self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} @@ -211,6 +210,28 @@ class Corpus : def getucesfrometoile(self, etoile) : return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + def getetoileuces(self) : + log.info('get uces etoiles') + etoileuces = {} + idpara = 0 + for uci in self.ucis : + etoiles = uci.etoiles[1:] + for et in etoiles : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces] + else : + etoileuces[et] = [uce.ident for uce in uci.uces] + if uci.paras != [] : + for et in uci.paras : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara] + else : + etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara] + idpara += 1 + else : + idpara += 1 + return etoileuces + def getucefromid(self, uceid) : if self.iduces is None : self.make_iduces() return self.iduces[uceid] @@ -260,8 +281,8 @@ class Corpus : for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) if len(get) > 1 : - return '2 variables sur la meme ligne' - elif get != [] : + log.info('2 variables sur une ligne') + if get != [] : etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] etuces = [set(val) for val in etuces] tab = [] @@ -412,6 +433,18 @@ class Corpus : f.write(etline.encode(self.parametres['syscoding']) + '\n') f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + def export_classe(self, outf, classe, lem = False) : + sts = self.lc[classe] + res = self.getconcorde(sts) + self.make_iduces() + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n') + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) nbl = 0 @@ -481,7 +514,7 @@ class Corpus : self.lems[lem].act = 2 elif self.lems[lem].gram in gramact : self.lems[lem].act = 1 - elif gramsup is not None : + elif gramsup is not None and self.lems[lem].gram not in gramact: if self.lems[lem].gram in gramsup : self.lems[lem].act = 2 else : @@ -531,7 +564,7 @@ class Corpus : def make_etoiles(self) : etoiles = set([]) for uci in self.ucis : - etoiles.update(uci.etoiles[1:] + uci.paras) + etoiles.update(uci.etoiles[1:]) return list(etoiles) def make_etoiles_dict(self) : @@ -542,16 +575,16 @@ class Corpus : if et[0] in det : try : endet = '_'.join(et[1:]) - if endet in det[et[0]] : - det[et[0]][endet] += 1 + if etoile in det[et[0]] : + det[et[0]][etoile] += 1 else : - det[et[0]][endet] = 1 + det[et[0]][etoile] = 1 except IndexError : det[et[0]] += 1 else : try : endet = '_'.join(et[1:]) - det[et[0]] = {endet :1} + det[et[0]] = {etoile :1} except IndexError : det[et[0]] = 1 return det @@ -565,13 +598,16 @@ class Corpus : elif get != [] : etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] return etuces - def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') - etoiles = self.make_etoiles() + etoileuces = self.getetoileuces() + etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1]) with open(fileout, 'w') as f : - f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) + f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding'])) + #etoiles = self.make_etoiles() + #with open(fileout, 'w') as f : + # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) def make_colored_corpus(self) : ucecl = {} @@ -683,27 +719,37 @@ class Corpus : self.lc0 = self.lc.pop(0) #return ucecl - def get_stat_by_cluster(self, outf) : + def get_stat_by_cluster(self, outf, lclasses = None) : log.info('get_stat_by_cluster') + if lclasses is None : + lclasses = self.lc t1 = time() - occurrences = dict([[i + 1, 0] for i in range(len(self.lc))]) - formescl = dict([[i + 1, 0] for i in range(len(self.lc))]) - hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))]) - lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)]) - sets = [set(cl) for cl in self.lc] + occurrences = dict([[i + 1, 0] for i in range(len(lclasses))]) + formescl = dict([[i + 1, 0] for i in range(len(lclasses))]) + hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))]) + lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)]) + sets = [set(cl) for cl in lclasses] for forme in self.formes : formeuceeff = self.getformeuceseff(forme) - for i, classe in enumerate(self.lc) : + for i, classe in enumerate(lclasses) : concern = sets[i].intersection(formeuceeff.keys()) if len(concern) : occurrences[i+1] += sum([formeuceeff[uce] for uce in concern]) formescl[i+1] += 1 if self.formes[forme].freq == 1 : hapaxcl[i+1] += 1 - toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) - with open(outf, 'w') as f : - f.write(toprint) log.info('%f' % (time() - t1)) + if outf is not None : + toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) + with open(outf, 'w') as f : + f.write(toprint) + else : + return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences] + + def get_stat_by_et(self, outf, etoiles) : + lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles] + stats = self.get_stat_by_cluster(None, lclasses) + stats = [[etoiles[i]] + val for i, val in enumerate(stats)] def gethapaxbyet(self, etoiles) : hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] @@ -762,6 +808,22 @@ class Corpus : with open('/tmp/testhapxuce.html','w') as f : f.write(txt) + def export_dictionary(self, fileout, syscoding) : + listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes] + listformes.sort(reverse = True) + listformes = [forme[1:] + [`forme[0]`] for forme in listformes] + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding)) + + def export_lems(self, fileout, syscoding) : + self.make_idformes() + listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems] + listlem.sort() + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding)) + + + class MakeUciStat : def __init__(self, corpus) : @@ -1019,7 +1081,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'’',"'") txt = txt.replace(u'œ', u'oe') - return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ') + return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ') def make_cleans(self, txt) : for clean in self.cleans : @@ -1092,7 +1154,6 @@ class BuildFromAlceste(BuildCorpus) : log.info(u'Empty text : %i' % linenb) iduci -= 1 self.corpus.ucis.pop() - #raise Exception("EmptyText %i" % linenb) self.corpus.ucis.append(Uci(iduci, line)) if self.dlg is not None : if not (iduci + 1) % 10 : @@ -1105,19 +1166,24 @@ class BuildFromAlceste(BuildCorpus) : idpara += 1 self.corpus.ucis[-1].paras.append(line.split()[0]) else : - raise Exception('paragrapheOT') + raise Exception('paragrapheOT %i' % linenb) elif line.strip() != '' and iduci != -1 : txt.append(line) if txt != [] and iduci != -1 : iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) del(txt) else : - raise Exception("EmptyText") + if iduci != -1 : + iduci -= 1 + self.corpus.ucis.pop() + log.info(Exception("Empty text %i" % linenb)) + else : + raise Exception('EmptyText %i' % linenb) if iduci != -1 and iduce != -1: self.backup_uce() else : log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) - raise Exception('TextBeforeTextMark') + raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1188,7 +1254,10 @@ class Builder : parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) ReadLexique(self.parent, lang = parametres['lang']) - self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): + self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + else : + self.parent.expressions = {} self.parametres = parametres else : if self.dlg is not None :