X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=a2790f04d1253ba46592bf889b45a7f212c84b15;hp=cbf8794292c9d7d93fb103c60f85200c5a659bd7;hb=7761609ba8d78519a5ac90ec86a57c63cfc16e39;hpb=e84160b7f61eb5b05cc12339e44a61d67b499e15 diff --git a/corpus.py b/corpus.py index cbf8794..a2790f0 100644 --- a/corpus.py +++ b/corpus.py @@ -8,7 +8,7 @@ _ = gettext.gettext import locale import sys from time import time -from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar import re import sqlite3 import itertools @@ -17,6 +17,7 @@ from operator import itemgetter from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref, SubTextFromMetaDial +from copy import copy from colors import colors import datetime @@ -33,6 +34,16 @@ def copycorpus(corpus) : copy_corpus.conn_all() return copy_corpus +def CopyUce(uce) : + return Uce(uce.ident, uce.para, uce.uci) + + +def CopyUci(uci): + nuci = Uci(uci.ident, '') + nuci.etoiles = copy(uci.etoiles) + nuci.uces = [CopyUce(uce) for uce in uci.uces] + return nuci + class Corpus : @@ -94,9 +105,7 @@ class Corpus : else : self.idformesuces[self.formes[word.forme].ident] = {stident: 1} else : - self.formes[word.forme] = word - self.formes[word.forme].ident = len(self.formes) - self.formes[word.forme].freq = 1 + self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem) self.idformesuces[self.formes[word.forme].ident] = {stident : 1} def conn_all(self): @@ -162,6 +171,34 @@ class Corpus : query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgenst(self, tgen): + formesid = [] + for lem in tgen : + if lem in self.lems : + formesid += self.lems[lem].formes + else : + print 'abscent : %s' % lem + query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid)) + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgenstprof(self, tgen, classe, i, clnb): + tgenst = [] + for lem in tgen : + if lem in self.lems : + lemst = self.getlemuces(lem) + tgenst += lemst + if not lem in self.tgenlem : + self.tgenlem[lem] = [0] * clnb + self.tgenlem[lem][i] = len(set(lemst).intersection(classe)) + else : + print 'abscent: ',lem + return list(set(tgenst)) + + def gettgentxt(self, tgen): + sts = self.gettgenst(tgen) + return list(set([self.getucefromid(val).uci for val in sts])) def getlemucis(self, lem) : uces = self.getlemuces(lem) @@ -355,7 +392,9 @@ class Corpus : for lem in tokeep : deff = self.getlemuceseff(lem) ucesk = deff.keys() - tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]) + line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + if sum(line[1:]) >= mineff : + tab.append(line) tab.insert(0, [''] + etoiles) return tab @@ -382,6 +421,29 @@ class Corpus : tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern]) return tgenoccurrences, totoccurrences + def make_tgen_profile(self, tgen, ucecl, uci = False) : + log.info('tgen/classes') + self.tgenlem = {} + clnb = len(ucecl) + if uci : + #FIXME : NE MARCHE PLUS CHANGER CA + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] + else : + tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] + tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] + return tab + #i = 0 + #nam = 'total' + #while nam + `i` in tgen : + # i += 1 + #nam = nam + `i` + #last = [nam] + [`len(classe)` for classe in ucecl] + #tab += [last] + #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))] + #tab = [line0] + tab + #with open(fileout, 'w') as f : + # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + def make_efftype_from_etoiles(self, etoiles) : dtype = {} etuces = [[] for et in etoiles] @@ -500,6 +562,14 @@ class Corpus : ident += 1 f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') + def export_meta_table(self, outf) : + metas = [[`i`] + text.etoiles[1:] for i, text in enumerate(self.ucis)] + longueur_max = max([len(val) for val in metas]) + first = ['column_%i' % i for i in range(longueur_max)] + metas.insert(0, first) + with open(outf, 'w') as f : + f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding'])) + def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : @@ -625,7 +695,7 @@ class Corpus : f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n'])) f.seek(0) with open(outfile, 'w') as ffin : - ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl)) + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl)) for line in f : ffin.write(line) os.remove(outfile + '~') @@ -747,6 +817,28 @@ class Corpus : except IndexError : det[et[0]] = 1 return det + + def make_theme_dict(self): + themes = [val for uci in self.ucis for val in uci.paras] + det = {} + for theme in themes : + th = theme.split('_') + if th[0] in det : + try : + endth = '_'.join(th[1:]) + if theme in det[th[0]] : + det[th[0]][theme] += 1 + else : + det[th[0]][theme] = 1 + except IndexError : + det[th[0]] += 1 + else : + try : + endth = '_'.join(th[1:]) + det[th[0]] = {theme:1} + except IndexError : + det[th[0]] = 1 + return det def make_etline(self, listet) : etuces = [[] for et in listet] @@ -1234,7 +1326,9 @@ class BuildCorpus : self.cleans.append(self.dotiret) def make_expression(self,txt) : - for expression in self.expressions: + exp = self.expressions.keys() + exp.sort(reverse=True) + for expression in exp : if expression in txt : txt = txt.replace(expression, self.expressions[expression][0]) return txt @@ -1304,8 +1398,9 @@ class BuildSubCorpus(BuildCorpus): self.dlg = dlg self.ori = corpus self.infile = None - self.corpus = Corpus(self, corpus.parametres) + self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']}) self.last = 0 + self.parametres = parametres self.encoding = corpus.parametres['encoding'] self.corpus.parametres['corpus_name'] = parametres['corpus_name'] self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout']) @@ -1315,7 +1410,7 @@ class BuildSubCorpus(BuildCorpus): self.corpus.parametres['uuid'] = str(uuid4()) if parametres.get('frommeta', False) : print 'make subtexts' - self.corpus.ucis = [uci for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] + self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] elif parametres.get('fromtheme', False) : print 'make subtexts from theme' idpara = 0 @@ -1325,20 +1420,51 @@ class BuildSubCorpus(BuildCorpus): newpara = [] for et in uci.paras : if et in parametres['meta'] : - newuce += [uce for uce in uci.uces if uce.para == idpara] + newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara] newpara.append(et) idpara += 1 if newuce != [] : - uci.uces = newuce - uci.paras = newpara - self.corpus.ucis.append(uci) + nuci = CopyUci(uci) + nuci.uces = newuce + nuci.paras = newpara + self.corpus.ucis.append(nuci) else : idpara += 1 - elif parametres.get('fromcluster', False) : - pass + elif parametres.get('fromclusters', False) : + self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]] + self.fromuceids() + elif parametres.get('fromuceids', False) : + self.fromuceids() #create database self.connect() self.dobuild() + + def fromuceids(self): + print 'fromuceids' + dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + self.corpus.ucis.append(nuci) def read_corpus(self, infile = None): self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces] @@ -1488,6 +1614,7 @@ class Builder : def __init__(self, parent, dlg = None) : self.parent = parent self.dlg = dlg + parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] @@ -1496,20 +1623,29 @@ class Builder : dial.txtpath.SetLabel(parent.filename) #dial.repout_choices.SetValue(parametres['pathout']) self.res = dial.ShowModal() + if self.dlg is not None : + self.dlg = progressbar(self.parent, self.dlg) if self.res == 5100 : parametres = dial.doparametres() parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) - ReadLexique(self.parent, lang = parametres['lang']) + if parametres.get('dictionary', False) : + filein = parametres['dictionary'] + else : + filein = None + if dial.corpusname.GetValue() != '' : + parametres['corpus_name'] = dial.corpusname.GetValue() + dial.Destroy() + ReadLexique(self.parent, lang = parametres['lang'], filein = filein) if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) else : self.parent.expressions = {} self.parametres = parametres else : + dial.Destroy() if self.dlg is not None : self.dlg.Destroy() - dial.Destroy() def doanalyse(self) : return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus @@ -1520,12 +1656,22 @@ class SubBuilder : self.ori = corpus self.dlg = dlg corpus_name = 'Sub' + corpus.parametres['corpus_name'] + if dlg is not None : + busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self) + wx.SafeYield() parametres['corpus_name'] = corpus_name if parametres.get('frommeta', False) : parametres['meta'] = corpus.make_etoiles() elif parametres.get('fromtheme', False) : parametres['meta'] = corpus.make_themes() - parametres['meta'].sort() + elif parametres.get('fromclusters', False) : + parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)] + else : + parametres['meta'] = [] + if 'fromclusters' not in parametres : + parametres['meta'].sort() + if dlg is not None : + del busy dial = SubTextFromMetaDial(parent, parametres) self.res = dial.ShowModal() if self.res == 5100 : @@ -1541,7 +1687,10 @@ class SubBuilder : i += 1 parametres['pathout'] = pathout + '_%i' % i meta = dial.m_listBox1.GetSelections() - parametres['meta'] = [parametres['meta'][val] for val in meta] + if not 'fromclusters' in parametres : + parametres['meta'] = [parametres['meta'][val] for val in meta] + else : + parametres['meta'] = meta self.parametres = parametres dial.Destroy() else : @@ -1549,9 +1698,3 @@ class SubBuilder : def doanalyse(self): return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus - -if __name__ == '__main__' : - t1 = time() - parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'} - intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes) - print time() - t1