X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;h=fa13a8b916b3762c806c9096fe0ff21722c76337;hp=5e7ba26ae751ae92b4e42970dbbdc4b80b58648f;hb=1fb687c23b19ae4cc88146acf393041356c1df3a;hpb=1a995a6ca4e8dbb09c8b9ab1276dabf17e065f0d;ds=sidebyside diff --git a/corpusNG.py b/corpusNG.py index 5e7ba26..fa13a8b 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -127,6 +127,19 @@ class Corpus : res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,)) return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + def getformeuceseff(self, formeid) : + if isinstance(formeid, basestring) : + formeid = self.formes[formeid].ident + res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,)) + uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid + res = self.cformes.execute(query) + eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + formeuceeff = {} + for i, uce in enumerate(uces) : + formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i] + return formeuceeff + def getlemuces(self, lem) : formesid = ', '.join([`val` for val in self.lems[lem].formes]) query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid @@ -137,7 +150,7 @@ class Corpus : uces = self.getlemuces(lem) return list(set([self.getucefromid(val).uci for val in uces])) - def getlemuceseff(self, lem) : + def getlemuceseff(self, lem, luces = None) : formesid = ', '.join([`val` for val in self.lems[lem].formes]) query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) @@ -150,6 +163,9 @@ class Corpus : lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i] return lemuceeff + def getlemclustereff(self, lem, cluster) : + return len(list(set(self.lc[cluster]).intersection(self.getlemuces(lem)))) + def getlemeff(self, lem) : return self.lems[lem].freq @@ -251,7 +267,7 @@ class Corpus : self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) def make_lexitable(self, mineff, etoiles) : - tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff] + tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff] etuces = [[] for et in etoiles] for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) @@ -385,6 +401,28 @@ class Corpus : actpara = self.iduces[uce[0]].para ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') + + def export_corpus_classes(self, outf, alc = True, lem = False) : + ucecl = {} + for i, lc in enumerate(self.lc) : + for uce in lc : + ucecl[uce] = i + 1 + for uce in self.lc0 : + ucecl[uce] = 0 + res = self.getalluces() + self.make_iduces() + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + actuci = self.iduces[uce[0]].uci + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + if alc : + etline = ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) + else : + etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[self.iduces[uce[0]].uci].etoiles[1:]]) + f.write(etline.encode(self.parametres['syscoding']) + '\n') + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) @@ -451,7 +489,9 @@ class Corpus : def parse_active(self, gramact, gramsup = None) : log.info('parse actives') for lem in self.lems : - if self.lems[lem].gram in gramact : + if lem.startswith('_') and lem.endswith('_') : + self.lems[lem].act = 2 + elif self.lems[lem].gram in gramact : self.lems[lem].act = 1 elif gramsup is not None : if self.lems[lem].gram in gramsup : @@ -489,7 +529,8 @@ class Corpus : lim -= 1 else : stop = nbmax - 1 - log.info('nb actives = %i - eff min = %i ' % (stop, lim)) + lim = effs[stop] + log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim)) return [val[1] for val in allactives[0:stop + 1]], lim def make_and_write_profile(self, actives, ucecl, fileout) : @@ -622,7 +663,23 @@ class Corpus : result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin] with open(fileout, 'w') as f : f.write('\n'.join([';'.join(line) for line in result])) - + + def make_proftype(self, outf) : + res = {} + for lem in self.lems : + gram = self.lems[lem].gram + if not gram in res : + res[gram] = [0 for val in self.lc] + lemuceeff = self.getlemuceseff(lem) + for i, classe in enumerate(self.lc) : + concern = set(classe).intersection(lemuceeff.keys()) + res[gram][i] += sum([lemuceeff[uce] for uce in concern]) + res = [[gram] + [`val` for val in res[gram]] for gram in res] + res.sort() + with open(outf, 'w') as f : + f.write('\n'.join([';'.join(line) for line in res]).encode(self.parametres['syscoding'])) + + def make_ucecl_from_R(self, filein) : with open(filein, 'rU') as f : c = f.readlines() @@ -637,6 +694,28 @@ class Corpus : self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)] self.lc0 = self.lc.pop(0) #return ucecl + + def get_stat_by_cluster(self, outf) : + log.info('get_stat_by_cluster') + t1 = time() + occurrences = dict([[i + 1, 0] for i in range(len(self.lc))]) + formescl = dict([[i + 1, 0] for i in range(len(self.lc))]) + hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))]) + lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)]) + sets = [set(cl) for cl in self.lc] + for forme in self.formes : + formeuceeff = self.getformeuceseff(forme) + for i, classe in enumerate(self.lc) : + concern = sets[i].intersection(formeuceeff.keys()) + if len(concern) : + occurrences[i+1] += sum([formeuceeff[uce] for uce in concern]) + formescl[i+1] += 1 + if self.formes[forme].freq == 1 : + hapaxcl[i+1] += 1 + toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences]) + with open(outf, 'w') as f : + f.write(toprint) + log.info('%f' % (time() - t1)) def gethapaxbyet(self, etoiles) : hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1] @@ -1077,13 +1156,6 @@ class BuildFromAlceste(BuildCorpus) : for word in uce : self.last += 1 self.corpus.add_word(word) - #if self.dlg is not None : - # if self.limitshow > self.count : - # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1)) - # self.count += 1 - # self.limitshow = 0 - # else : - # self.limitshow = self.last / 100000 log.debug(' '.join([`iduci`,`idpara`,`iduce`])) if self.last > self.lim : self.backup_uce() @@ -1095,27 +1167,13 @@ class BuildFromAlceste(BuildCorpus) : if douce : out = [] reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize) -# print 'reste' -# print reste -# print 'texte_uce' -# print texte_uce -# print 'suite' -# print suite while reste : uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : out.append(uce) reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize) -# print 'reste' -# print reste -# print 'texte_uce' -# print texte_uce -# print 'suite' -# print suite - uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace]) if uce != '' : - #print 'RESTEE UUCEEEEEEEEEEEEE', uce out.append(uce) return out else : @@ -1144,6 +1202,9 @@ class Builder : ReadLexique(self.parent, lang = parametres['lang']) self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) self.parametres = parametres + else : + if self.dlg is not None : + self.dlg.Destroy() dial.Destroy() def doanalyse(self) :