X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=76ebb660911223bf31496b8d0a23aea56c4d2290;hp=ec277641227a55cfca94ac90a6535599d4f12aed;hb=432118f2ac3d2f8234c388e77d0fb9e14234750f;hpb=42a67a41b64a6e0cc3fd2a63a0749e9aa4b9374c diff --git a/corpus.py b/corpus.py index ec27764..76ebb66 100644 --- a/corpus.py +++ b/corpus.py @@ -433,6 +433,18 @@ class Corpus : f.write(etline.encode(self.parametres['syscoding']) + '\n') f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + def export_classe(self, outf, classe, lem = False) : + sts = self.lc[classe] + res = self.getconcorde(sts) + self.make_iduces() + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n') + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) nbl = 0 @@ -707,27 +719,37 @@ class Corpus : self.lc0 = self.lc.pop(0) #return ucecl - def get_stat_by_cluster(self, outf) : + def get_stat_by_cluster(self, outf, lclasses = None) : log.info('get_stat_by_cluster') + if lclasses is None : + lclasses = self.lc t1 = time() - occurrences = dict([[i + 1, 0] for i in range(len(self.lc))]) - formescl = dict([[i + 1, 0] for i in range(len(self.lc))]) - hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))]) - lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)]) - sets = [set(cl) for cl in self.lc] + occurrences = dict([[i + 1, 0] for i in range(len(lclasses))]) + formescl = dict([[i + 1, 0] for i in range(len(lclasses))]) + hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))]) + lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)]) + sets = [set(cl) for cl in lclasses] for forme in self.formes : formeuceeff = self.getformeuceseff(forme) - for i, classe in enumerate(self.lc) : + for i, classe in enumerate(lclasses) : concern = sets[i].intersection(formeuceeff.keys()) if len(concern) : occurrences[i+1] += sum([formeuceeff[uce] for uce in concern]) formescl[i+1] += 1 if self.formes[forme].freq == 1 : hapaxcl[i+1] += 1 - toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) - with open(outf, 'w') as f : - f.write(toprint) log.info('%f' % (time() - t1)) + if outf is not None : + toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) + with open(outf, 'w') as f : + f.write(toprint) + else : + return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences] + + def get_stat_by_et(self, outf, etoiles) : + lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles] + stats = self.get_stat_by_cluster(None, lclasses) + stats = [[etoiles[i]] + val for i, val in enumerate(stats)] def gethapaxbyet(self, etoiles) : hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] @@ -786,6 +808,22 @@ class Corpus : with open('/tmp/testhapxuce.html','w') as f : f.write(txt) + def export_dictionary(self, fileout, syscoding) : + listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes] + listformes.sort(reverse = True) + listformes = [forme[1:] + [`forme[0]`] for forme in listformes] + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding)) + + def export_lems(self, fileout, syscoding) : + self.make_idformes() + listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems] + listlem.sort() + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding)) + + + class MakeUciStat : def __init__(self, corpus) : @@ -1216,7 +1254,10 @@ class Builder : parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) ReadLexique(self.parent, lang = parametres['lang']) - self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): + self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + else : + self.parent.expressions = {} self.parametres = parametres else : if self.dlg is not None :