X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=d13b2f5b4a18933818aa0c7ea7313790204ada82;hp=ec4855b268abefe866b22d6218ebb4e335f3e74a;hb=357e5403d3c539083a1052248628f0aab1af1eaa;hpb=eb7ef63636ee16b64d621650b6db474852321652 diff --git a/corpus.py b/corpus.py index ec4855b..d13b2f5 100644 --- a/corpus.py +++ b/corpus.py @@ -17,6 +17,7 @@ from operator import itemgetter from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref, SubTextFromMetaDial +from copy import copy from colors import colors import datetime @@ -33,6 +34,16 @@ def copycorpus(corpus) : copy_corpus.conn_all() return copy_corpus +def CopyUce(uce) : + return Uce(uce.ident, uce.para, uce.uci) + + +def CopyUci(uci): + nuci = Uci(uci.ident, '') + nuci.etoiles = copy(uci.etoiles) + nuci.uces = [CopyUce(uce) for uce in uci.uces] + return nuci + class Corpus : @@ -94,9 +105,7 @@ class Corpus : else : self.idformesuces[self.formes[word.forme].ident] = {stident: 1} else : - self.formes[word.forme] = word - self.formes[word.forme].ident = len(self.formes) - self.formes[word.forme].freq = 1 + self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem) self.idformesuces[self.formes[word.forme].ident] = {stident : 1} def conn_all(self): @@ -1306,6 +1315,7 @@ class BuildSubCorpus(BuildCorpus): self.infile = None self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']}) self.last = 0 + self.parametres = parametres self.encoding = corpus.parametres['encoding'] self.corpus.parametres['corpus_name'] = parametres['corpus_name'] self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout']) @@ -1315,7 +1325,7 @@ class BuildSubCorpus(BuildCorpus): self.corpus.parametres['uuid'] = str(uuid4()) if parametres.get('frommeta', False) : print 'make subtexts' - self.corpus.ucis = [uci for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] + self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] elif parametres.get('fromtheme', False) : print 'make subtexts from theme' idpara = 0 @@ -1325,45 +1335,51 @@ class BuildSubCorpus(BuildCorpus): newpara = [] for et in uci.paras : if et in parametres['meta'] : - newuce += [uce for uce in uci.uces if uce.para == idpara] + newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara] newpara.append(et) idpara += 1 if newuce != [] : - uci.uces = newuce - uci.paras = newpara - self.corpus.ucis.append(uci) + nuci = CopyUci(uci) + nuci.uces = newuce + nuci.paras = newpara + self.corpus.ucis.append(nuci) else : idpara += 1 - elif parametres.get('fromcluster', False) : - pass + elif parametres.get('fromclusters', False) : + self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]] + self.fromuceids() elif parametres.get('fromuceids', False) : - print 'fromuceids' - dictucekeep = dict(zip(parametres['uceids'], parametres['uceids'])) - idpara = 0 - for uci in self.ori.ucis : - if uci.paras == [] : - keepuces = [uce for uce in uci.uces if uce.ident in dictucekeep] - if keepuces != [] : - uci.uces = keepuces - self.corpus.ucis.append(uci) - idpara += 1 - else : - newuces = [] - newpara = [] - for et in uci.paras : - keepuces = [uce for uce in uci.uces if uce.para == idpara] - idpara += 1 - if keepuces != [] : - newuces += keepuces - newpara.append(et) - if newuces != [] : - uci.uces = newuces - uci.paras = newpara - self.corpus.ucis.append(uci) - + self.fromuceids() #create database self.connect() self.dobuild() + + def fromuceids(self): + print 'fromuceids' + dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + self.corpus.ucis.append(nuci) def read_corpus(self, infile = None): self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces] @@ -1553,9 +1569,12 @@ class SubBuilder : parametres['meta'] = corpus.make_etoiles() elif parametres.get('fromtheme', False) : parametres['meta'] = corpus.make_themes() + elif parametres.get('fromclusters', False) : + parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)] else : parametres['meta'] = [] - parametres['meta'].sort() + if 'fromclusters' not in parametres : + parametres['meta'].sort() if dlg is not None : del busy dial = SubTextFromMetaDial(parent, parametres) @@ -1573,7 +1592,10 @@ class SubBuilder : i += 1 parametres['pathout'] = pathout + '_%i' % i meta = dial.m_listBox1.GetSelections() - parametres['meta'] = [parametres['meta'][val] for val in meta] + if not 'fromclusters' in parametres : + parametres['meta'] = [parametres['meta'][val] for val in meta] + else : + parametres['meta'] = meta self.parametres = parametres dial.Destroy() else : @@ -1581,9 +1603,3 @@ class SubBuilder : def doanalyse(self): return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus - -if __name__ == '__main__' : - t1 = time() - parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'} - intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes) - print time() - t1