X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=6e667b3f173f272964ffb4b01ff0037a65e36419;hp=d13b2f5b4a18933818aa0c7ea7313790204ada82;hb=bf6f3e6f469565e48fdc03216b1d95d72738e77a;hpb=357e5403d3c539083a1052248628f0aab1af1eaa diff --git a/corpus.py b/corpus.py index d13b2f5..6e667b3 100644 --- a/corpus.py +++ b/corpus.py @@ -172,6 +172,16 @@ class Corpus : res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + def gettgenst(self, tgen): + formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) + query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgentxt(self, tgen): + sts = self.gettgenst(tgen) + return list(set([self.getucefromid(val).uci for val in sts])) + def getlemucis(self, lem) : uces = self.getlemuces(lem) return list(set([self.getucefromid(val).uci for val in uces])) @@ -391,6 +401,26 @@ class Corpus : tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern]) return tgenoccurrences, totoccurrences + def make_tgen_profile(self, tgen, ucecl, uci = False) : + log.info('tgen/classes') + if uci : + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + else : + tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] + return tab + #i = 0 + #nam = 'total' + #while nam + `i` in tgen : + # i += 1 + #nam = nam + `i` + #last = [nam] + [`len(classe)` for classe in ucecl] + #tab += [last] + #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))] + #tab = [line0] + tab + #with open(fileout, 'w') as f : + # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + def make_efftype_from_etoiles(self, etoiles) : dtype = {} etuces = [[] for et in etoiles] @@ -756,6 +786,28 @@ class Corpus : except IndexError : det[et[0]] = 1 return det + + def make_theme_dict(self): + themes = [val for uci in self.ucis for val in uci.paras] + det = {} + for theme in themes : + th = theme.split('_') + if th[0] in det : + try : + endth = '_'.join(th[1:]) + if theme in det[th[0]] : + det[th[0]][theme] += 1 + else : + det[th[0]][theme] = 1 + except IndexError : + det[th[0]] += 1 + else : + try : + endth = '_'.join(th[1:]) + det[th[0]] = {theme:1} + except IndexError : + det[th[0]] = 1 + return det def make_etline(self, listet) : etuces = [[] for et in listet] @@ -1541,16 +1593,23 @@ class Builder : parametres = dial.doparametres() parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) - ReadLexique(self.parent, lang = parametres['lang']) + if parametres.get('dictionary', False) : + filein = parametres['dictionary'] + else : + filein = None + if dial.corpusname.GetValue() != '' : + parametres['corpus_name'] = dial.corpusname.GetValue() + dial.Destroy() + ReadLexique(self.parent, lang = parametres['lang'], filein = filein) if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) else : self.parent.expressions = {} self.parametres = parametres else : + dial.Destroy() if self.dlg is not None : self.dlg.Destroy() - dial.Destroy() def doanalyse(self) : return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus