X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=69539d3b8c53e7dfbcf4a434b50913b9b2c6174f;hp=4a2140d47725f608a693426e3b548a3758ed9714;hb=4efa1d6083096de61aed08a9b64bae41f48e13aa;hpb=54fef96ad151ba25920f3e589b39a83c3f62ae2c diff --git a/corpus.py b/corpus.py index 4a2140d..69539d3 100644 --- a/corpus.py +++ b/corpus.py @@ -17,6 +17,7 @@ from operator import itemgetter from uuid import uuid4 from chemins import PathOut from dialog import CorpusPref, SubTextFromMetaDial +from copy import copy from colors import colors import datetime @@ -33,6 +34,16 @@ def copycorpus(corpus) : copy_corpus.conn_all() return copy_corpus +def CopyUce(uce) : + return Uce(uce.ident, uce.para, uce.uci) + + +def CopyUci(uci): + nuci = Uci(uci.ident, '') + nuci.etoiles = copy(uci.etoiles) + nuci.uces = [CopyUce(uce) for uce in uci.uces] + return nuci + class Corpus : @@ -94,9 +105,7 @@ class Corpus : else : self.idformesuces[self.formes[word.forme].ident] = {stident: 1} else : - self.formes[word.forme] = word - self.formes[word.forme].ident = len(self.formes) - self.formes[word.forme].freq = 1 + self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem) self.idformesuces[self.formes[word.forme].ident] = {stident : 1} def conn_all(self): @@ -214,7 +223,7 @@ class Corpus : return [len(uce[1].split()) for uce in res] def getconcorde(self, uces) : - return self.cuces.execute('select * from uces where id IN (%s);' % ', '.join([`i` for i in uces])) + return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([`i` for i in uces])) def getuciconcorde(self, ucis) : uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] @@ -1130,7 +1139,7 @@ class BuildCorpus : self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout']) self.corpus.pathout.createdir(parametres_corpus['pathout']) self.corpus.parametres['uuid'] = str(uuid4()) - self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1] + self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1] self.corpus.parametres['type'] = 'corpus' if self.corpus.parametres['keep_ponct'] : self.ponctuation_espace = [' ', ''] @@ -1200,7 +1209,7 @@ class BuildCorpus : self.cf.execute('CREATE INDEX ideff ON eff (id);') self.c.close() self.cf.close() - #backup corpora + #backup corpus self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db']) self.ccorpus = self.conn_corpus.cursor() self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);') @@ -1300,11 +1309,12 @@ class BuildCorpus : class BuildSubCorpus(BuildCorpus): def __init__(self, corpus, parametres, dlg = None) : + print parametres log.info('begin subcorpus...') self.dlg = dlg self.ori = corpus self.infile = None - self.corpus = Corpus(self, corpus.parametres) + self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']}) self.last = 0 self.encoding = corpus.parametres['encoding'] self.corpus.parametres['corpus_name'] = parametres['corpus_name'] @@ -1315,7 +1325,7 @@ class BuildSubCorpus(BuildCorpus): self.corpus.parametres['uuid'] = str(uuid4()) if parametres.get('frommeta', False) : print 'make subtexts' - self.corpus.ucis = [uci for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] + self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] elif parametres.get('fromtheme', False) : print 'make subtexts from theme' idpara = 0 @@ -1325,17 +1335,45 @@ class BuildSubCorpus(BuildCorpus): newpara = [] for et in uci.paras : if et in parametres['meta'] : - newuce += [uce for uce in uci.uces if uce.para == idpara] + newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara] newpara.append(et) idpara += 1 if newuce != [] : - uci.uces = newuce - uci.paras = newpara - self.corpus.ucis.append(uci) + nuci = CopyUci(uci) + nuci.uces = newuce + nuci.paras = newpara + self.corpus.ucis.append(nuci) else : idpara += 1 elif parametres.get('fromcluster', False) : pass + elif parametres.get('fromuceids', False) : + print 'fromuceids' + dictucekeep = dict(zip(parametres['uceids'], parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + self.corpus.ucis.append(nuci) + #create database self.connect() self.dobuild() @@ -1428,7 +1466,7 @@ class BuildFromAlceste(BuildCorpus) : if iduci != -1 and iduce != -1: self.backup_uce() else : - log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) + log.info(_(u"No Text in corpus. Are you sure of the formatting ?")) raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1490,6 +1528,7 @@ class Builder : self.dlg = dlg parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() + parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] dial = CorpusPref(parent, parametres) dial.CenterOnParent() dial.txtpath.SetLabel(parent.filename) @@ -1519,12 +1558,19 @@ class SubBuilder : self.ori = corpus self.dlg = dlg corpus_name = 'Sub' + corpus.parametres['corpus_name'] + if dlg is not None : + busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self) + wx.SafeYield() parametres['corpus_name'] = corpus_name if parametres.get('frommeta', False) : parametres['meta'] = corpus.make_etoiles() elif parametres.get('fromtheme', False) : parametres['meta'] = corpus.make_themes() + else : + parametres['meta'] = [] parametres['meta'].sort() + if dlg is not None : + del busy dial = SubTextFromMetaDial(parent, parametres) self.res = dial.ShowModal() if self.res == 5100 :