# Excerpt of ``corpus.py`` (class ``Corpus``) rebuilt from a whitespace-collapsed
# unified diff (index 3e4ae30..011206f).  Only the methods that are completely
# visible in that diff are reproduced here; truncated context belonging to
# neighbouring methods is left out.
class Corpus:

    def gettgenst(self, tgen):
        """Return the distinct ids stored in the ``uces`` column for *tgen*.

        *tgen* is an iterable of lemmas.  Every lemma present in ``self.lems``
        contributes the ids listed in its ``formes`` attribute; lemmas that are
        not in ``self.lems`` are skipped.  The rows of table ``uces`` whose
        ``id`` is one of those form ids are then read through ``self.cformes``.

        Returns a list of unique ints, in no particular order.  The list is
        empty when no lemma of *tgen* is known.
        """
        # The membership test has to come *before* the inner ``for``: placed
        # after it, ``self.lems[lem]`` is evaluated first and an unknown lemma
        # raises KeyError instead of being ignored.
        formes = [val
                  for lem in tgen if lem in self.lems
                  for val in self.lems[lem].formes]
        if not formes:
            # Nothing to look up; also avoids sending ``IN ()``, which not
            # every SQL engine accepts.
            return []
        # repr() is the portable spelling of the Python 2 backtick operator.
        formesid = ', '.join([repr(val) for val in formes])
        query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
        res = self.cformes.execute(query)
        found = set()
        for row in res:
            cell = row[0]
            if isinstance(cell, int):
                # A single id stored as a bare integer.
                found.add(cell)
            else:
                # Several ids stored as one whitespace-separated string.
                found.update(int(val) for val in cell.split())
        return list(found)

    def gettgentxt(self, tgen):
        """Return the distinct ``uci`` values of the ids found by ``gettgenst``.

        Each id returned by ``self.gettgenst(tgen)`` is resolved with
        ``self.getucefromid`` and its ``uci`` attribute is collected.
        The result is a de-duplicated list in no particular order.
        """
        return list(set(self.getucefromid(val).uci
                        for val in self.gettgenst(tgen)))

    def getlemucis(self, lem):
        """Return the distinct ``uci`` values of the ids given by ``getlemuces``.

        The result is a de-duplicated list in no particular order.
        """
        uces = self.getlemuces(lem)
        return list(set(self.getucefromid(val).uci for val in uces))

    def make_tgen_profile(self, tgen, ucecl, uci=False):
        """Count, for every tgen, how many of its ids fall in each class.

        Args:
            tgen: mapping from a tgen name to an iterable of lemmas.
            ucecl: sequence of classes, each an iterable of ids.
            uci: when true, ids are the ``uci`` values from ``gettgentxt``;
                otherwise they are the ids from ``gettgenst``.

        Returns:
            A list of rows ``[name, n_1, ..., n_k]`` where ``n_i`` is the size
            of the intersection between the ids of ``tgen[name]`` and the
            i-th class.  Rows whose counts add up to less than 3 are dropped.
        """
        log.info('tgen/classes')
        lookup = self.gettgentxt if uci else self.gettgenst
        tab = []
        for lem in tgen:
            # Resolve the ids once per tgen: the lookup queries the database
            # and does not depend on the class being counted.
            members = set(lookup(tgen[lem]))
            counts = [len(members.intersection(classe)) for classe in ucecl]
            if sum(counts) >= 3:
                tab.append([lem] + counts)
        return tab

    def make_theme_dict(self):
        """Count the ``paras`` entries of all ``self.ucis``, grouped by prefix.

        The prefix of an entry is the text before its first ``'_'`` (the whole
        entry when it contains none).

        Returns:
            A dict mapping each prefix to a dict ``{entry: occurrences}``,
            keyed by the complete entry.
        """
        det = {}
        for uci in self.ucis:
            for theme in uci.paras:
                prefix = theme.split('_')[0]
                counts = det.setdefault(prefix, {})
                counts[theme] = counts.get(theme, 0) + 1
        return det