class BuildSubCorpus(BuildCorpus):
    """Build a new corpus from a selection of an existing corpus.

    The selection mode is the first truthy flag found in ``parametres``,
    tested in this order: ``frommeta``, ``fromtheme``, ``fromclusters``,
    ``fromuceids``.  The selected texts / segments are copied (``CopyUci`` /
    ``CopyUce``) into ``self.corpus`` and the build is then run through
    ``self.connect()`` and ``self.dobuild()``.
    """

    def __init__(self, corpus, parametres, dlg=None):
        """Select part of ``corpus`` and build the resulting sub-corpus.

        Parameters:
            corpus: the source corpus; its ``parametres`` dict must provide
                ``'originalpath'`` and ``'encoding'``, and its ``ucis`` list
                is the material being filtered.
            parametres: dict driving the build.  ``'corpus_name'`` and
                ``'pathout'`` are always read; ``'meta'`` is read by the
                ``frommeta``, ``fromtheme`` and ``fromclusters`` modes,
                ``'lc'`` by ``fromclusters`` and ``'uceids'`` by
                ``fromuceids``.
            dlg: stored as ``self.dlg``; not used in this constructor.
        """
        log.info('begin subcorpus...')
        self.dlg = dlg
        self.ori = corpus
        self.infile = None
        self.corpus = Corpus(self, {
            'type': 'corpus',
            'originalpath': corpus.parametres['originalpath'],
            'encoding': corpus.parametres['encoding'],
        })
        self.last = 0
        self.parametres = parametres
        self.encoding = corpus.parametres['encoding']
        self.corpus.parametres['corpus_name'] = parametres['corpus_name']
        self.corpus.pathout = PathOut(filename=corpus.parametres['originalpath'],
                                      dirout=parametres['pathout'])
        self.corpus.pathout.createdir(parametres['pathout'])
        self.corpus.parametres['pathout'] = parametres['pathout']
        self.corpus.parametres['meta'] = parametres.get('meta', False)
        self.corpus.parametres['uuid'] = str(uuid4())
        # NOTE: print('...') with one string argument prints the same text
        # whether ``print`` is the Python 2 statement or a function.
        if parametres.get('frommeta', False):
            print('make subtexts')
            # Keep every text sharing at least one value with the selection.
            wanted = set(parametres['meta'])
            self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis
                                if wanted.intersection(uci.etoiles)]
        elif parametres.get('fromtheme', False):
            print('make subtexts from theme')
            # ``idpara`` is a running counter over the whole source corpus;
            # it is compared with ``uce.para`` to find the segments of the
            # thematic currently being examined.
            idpara = 0
            for uci in self.ori.ucis:
                if uci.paras != []:
                    newuce = []
                    newpara = []
                    for et in uci.paras:
                        if et in parametres['meta']:
                            newuce += [CopyUce(uce) for uce in uci.uces
                                       if uce.para == idpara]
                            newpara.append(et)
                        idpara += 1
                    if newuce:
                        nuci = CopyUci(uci)
                        nuci.uces = newuce
                        nuci.paras = newpara
                        self.corpus.ucis.append(nuci)
                else:
                    # A text without thematic still consumes one index.
                    idpara += 1
        elif parametres.get('fromclusters', False):
            # Flatten the segment ids of the selected clusters, then reuse
            # the id-based selection.
            self.parametres['uceids'] = [st
                                         for i in self.parametres['meta']
                                         for st in self.parametres['lc'][i]]
            self.fromuceids()
        elif parametres.get('fromuceids', False):
            self.fromuceids()
        # create database
        self.connect()
        self.dobuild()

    def fromuceids(self):
        """Copy the segments listed in ``self.parametres['uceids']``.

        Each source text that contains at least one selected segment is
        copied into ``self.corpus.ucis`` with only the selected segments.
        For a text that has thematics (``uci.paras``), only the thematics
        that still own a selected segment are kept.
        """
        print('fromuceids')
        # Only membership is needed, so a set is enough.
        keep = set(self.parametres['uceids'])
        # Running thematic index over the whole source corpus, advanced the
        # same way as in the ``fromtheme`` selection of ``__init__``.
        idpara = 0
        for uci in self.ori.ucis:
            if uci.paras == []:
                kept = [CopyUce(uce) for uce in uci.uces if uce.ident in keep]
                if kept:
                    nuci = CopyUci(uci)
                    nuci.uces = kept
                    self.corpus.ucis.append(nuci)
                idpara += 1
            else:
                newuces = []
                newpara = []
                for et in uci.paras:
                    # Restrict to the segments of THIS thematic.  Without the
                    # ``uce.para == idpara`` test every selected segment of
                    # the text was copied once per thematic, and every
                    # thematic was kept as soon as one segment was selected.
                    kept = [CopyUce(uce) for uce in uci.uces
                            if uce.ident in keep and uce.para == idpara]
                    idpara += 1
                    if kept:
                        newuces += kept
                        newpara.append(et)
                if newuces:
                    nuci = CopyUci(uci)
                    nuci.uces = newuces
                    nuci.paras = newpara
                    self.corpus.ucis.append(nuci)

    def read_corpus(self, infile=None):
        """Renumber the selected texts / segments and store their content.

        Texts, thematics and segments of ``self.corpus`` receive new
        consecutive identifiers starting at 0.  The text of each kept
        segment is then fetched from the source corpus with
        ``self.ori.getconcorde``, inserted in the ``uces`` table and its
        forms are registered under the new segment identifier.

        ``infile`` is accepted but not used here.
        """
        # Remember the source identifiers before they are overwritten.
        self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
        ident_uci = 0
        ident_uce = 0
        ident_para = -1
        lastpara = -1
        newuceident = {}
        print('redo text, para and st ident')
        for uci in self.corpus.ucis:
            uci.ident = ident_uci
            ident_uci += 1
            for uce in uci.uces:
                uce.uci = uci.ident
                # A change in the source thematic index starts a new one.
                if uce.para != lastpara:
                    ident_para += 1
                    lastpara = uce.para
                uce.para = ident_para
                newuceident[uce.ident] = ident_uce
                uce.ident = ident_uce
                ident_uce += 1
        print('backup st text and forms')
        # Each row is used as (source segment id, segment text).
        for row in self.ori.getconcorde(self.olduceid):
            new_id = newuceident[row[0]]
            # The identifier is stored as its repr() string, as before.
            self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(new_id), row[1]))
            for word in row[1].split():
                self.corpus.add_word_from_forme(self.ori.formes[word], new_id)
        self.backup_uce()
        print('done')