+ newuces = []
+ newpara = []
+ for et in uci.paras :
+ keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
+ idpara += 1
+ if keepuces != [] :
+ newuces += keepuces
+ newpara.append(et)
+ if newuces != [] :
+ nuci = CopyUci(uci)
+ nuci.uces = newuces
+ nuci.paras = newpara
+ self.corpus.ucis.append(nuci)
+
def read_corpus(self, infile = None):
    """Renumber the kept texts, paragraphs and segments, then store them.

    Every text (uci) in ``self.corpus.ucis`` receives a new consecutive
    ident, every segment (uce) a new consecutive ident, and paragraph ids
    are re-issued each time the old paragraph id changes from one segment
    to the next.  The segment texts are then fetched from the source
    corpus ``self.ori`` under their old ids, inserted into the ``uces``
    table under the new ids, and each word is registered in the new
    corpus.

    ``infile`` is accepted for signature compatibility and is not used.
    """
    corpus = self.corpus
    # Remember the old segment ids before they are overwritten below.
    self.olduceid = [uce.ident for uci in corpus.ucis for uce in uci.uces]
    renumbered = {}
    next_uce = 0
    current_para = -1
    previous_para = -1
    print('redo text, para and st ident')
    for position, uci in enumerate(corpus.ucis) :
        uci.ident = position
        for uce in uci.uces :
            uce.uci = uci.ident
            # A change of the old paragraph id opens a new paragraph.
            if uce.para != previous_para :
                current_para += 1
                previous_para = uce.para
            uce.para = current_para
            renumbered[uce.ident] = next_uce
            uce.ident = next_uce
            next_uce += 1
    print('backup st text and forms')
    for row in self.ori.getconcorde(self.olduceid) :
        old_id = row[0]
        text = row[1]
        self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(renumbered[old_id]), text))
        for word in text.split() :
            self.corpus.add_word_from_forme(self.ori.formes[word], renumbered[old_id])
    self.backup_uce()
    print('done')
+
class BuildFromAlceste(BuildCorpus) :
    """Corpus builder reading a text file made of marked texts.

    A line accepted by ``self.testuci`` starts a new text (``Uci``), a line
    starting with ``-*`` starts a new paragraph inside the current text, and
    every other non-blank line is text to be cut into segments (``Uce``).
    """

    def read_corpus(self, infile) :
        """Parse *infile* and fill ``self.corpus`` with texts and segments.

        Args:
            infile: path of the file to read; it is decoded with
                ``self.encoding``.

        Raises:
            Exception: ``'paragrapheOT <line>'`` when a ``-*`` line appears
                before any text marker; ``'EmptyText <line>'`` when the file
                holds no text marker at all; ``'TextBeforeTextMark <line>'``
                when no text or no segment remains at the end;
                ``'CorpusEncoding'`` when the file cannot be decoded.
        """
        if self.dlg is not None :
            self.dlg.Pulse('textes : 0 - segments : 0')
        self.limitshow = 0
        self.count = 1
        # Select the predicate that recognises a text-marker line.
        ucimark = self.corpus.parametres['ucimark']
        if ucimark == 0 :
            self.testuci = testetoile
        elif ucimark == 1 :
            self.testuci = testint
        txt = []
        iduci = -1
        idpara = -1
        iduce = -1
        # Pre-bind the line counter: with an empty file the loop body never
        # runs, and the error messages below would otherwise fail with an
        # UnboundLocalError instead of raising the intended exception.
        linenb = 0
        try :
            with codecs.open(infile, 'r', self.encoding) as f :
                for linenb, line in enumerate(f) :
                    line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
                    if self.testuci(line) :
                        iduci += 1
                        if txt :
                            # Flush the pending lines into the previous text.
                            iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
                            txt = []
                        elif iduci > 0 and self.corpus.ucis[-1].uces == [] :
                            # The previous text produced no segment: drop it
                            # and reuse its ident for the new text.
                            log.info('Empty text : %i' % linenb)
                            iduci -= 1
                            self.corpus.ucis.pop()
                        self.corpus.ucis.append(Uci(iduci, line))
                        if self.dlg is not None :
                            if not (iduci + 1) % 10 :
                                self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
                    elif line.startswith('-*') :
                        if iduci == -1 :
                            raise Exception('paragrapheOT %i' % linenb)
                        if txt :
                            iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
                            txt = []
                        idpara += 1
                        self.corpus.ucis[-1].paras.append(line.split()[0])
                    elif line.strip() != '' and iduci != -1 :
                        txt.append(line)
            if txt and iduci != -1 :
                iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
            elif iduci != -1 :
                # NOTE(review): this also drops the last text when the file
                # ends on a '-*' line after its segments were already stored;
                # confirm whether that is intended.
                iduci -= 1
                self.corpus.ucis.pop()
                log.info("Empty text %i" % linenb)
            else :
                raise Exception('EmptyText %i' % linenb)
            if iduci != -1 and iduce != -1:
                self.backup_uce()
            else :
                log.info(_("No Text in corpus. Are you sure of the formatting ?"))
                raise Exception('TextBeforeTextMark %i' % linenb)
        except UnicodeDecodeError as err :
            # Keep the decoding error attached as the cause for debugging.
            raise Exception("CorpusEncoding") from err

    def treattxt(self, txt, iduce, idpara, iduci) :
        """Cut buffered lines into segments and store them in the last text.

        Args:
            txt: list of raw lines belonging to the current text/paragraph.
            iduce: ident of the last segment created so far.
            idpara: ident of the current paragraph.
            iduci: ident of the text the segments belong to.

        Returns:
            The updated ``(iduce, idpara)`` pair.
        """
        if self.corpus.parametres.get('ucemethod', 0) == 2 and self.corpus.parametres['douce']:
            # One segment per input line: join the lines with a sentinel
            # word, clean the whole text, then split on the sentinel again.
            txt = 'laphrasepoursplitter'.join(txt)
            txt = self.make_cleans(txt)
            txt = ' '.join([val for val in txt.split() if val not in self.ponctuation_espace])
            ucetxt = txt.split('laphrasepoursplitter')
        else :
            txt = ' '.join(txt)
            txt = self.make_cleans(txt)
            ucetxt = self.make_uces(txt, self.corpus.parametres['douce'])
        # A text without explicit paragraph marker gets a paragraph id here.
        if self.corpus.ucis[-1].paras == [] :
            idpara += 1
        for uce in ucetxt :
            iduce += 1
            self.corpus.ucis[-1].uces.append(Uce(iduce, idpara, iduci))
            self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(iduce),uce))
            # ``tolist`` switches from word tokens to single characters.
            if not self.tolist :
                uce = uce.split()
            else :
                uce = list(uce)
            for word in uce :
                self.last += 1
                self.corpus.add_word(word)
        log.debug(' '.join([repr(iduci),repr(idpara),repr(iduce)]))
        # Flush to the backing store once enough words have accumulated.
        if self.last > self.lim :
            self.backup_uce()
            self.last = 0
        return iduce, idpara

    def make_uces(self, txt, douce = True, keep_ponct = False) :
        """Return the list of segment strings built from *txt*.

        Args:
            txt: cleaned text.
            douce: when true, cut the text with ``self.decouper`` using
                ``self.ucesize``; otherwise return the whole text as a
                single segment.
            keep_ponct: accepted for compatibility; not used here.

        Tokens listed in ``self.ponctuation_espace`` are removed from every
        segment and empty segments are skipped (only in the ``douce`` case).
        """
        txt = ' '.join(txt.split())
        if not douce :
            return [' '.join([val for val in txt.split() if val not in self.ponctuation_espace])]
        out = []
        reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
        while reste :
            uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
            if uce != '' :
                out.append(uce)
            reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
        # The last piece returned by decouper still has to be emitted.
        uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
        if uce != '' :
            out.append(uce)
        return out
+
# Call chain, written as caller (callee):
# decouper (list_sep)
# make_uces (decouper)
# treattxt (make_uces)
# read_corpus (treattxt)
+
class Builder :
    """Collect corpus-import settings from the user and build the corpus.

    The constructor shows the ``CorpusPref`` dialog; its result code is kept
    in ``self.res``.  When the dialog is confirmed (code 5100) the chosen
    settings are stored in ``self.parametres`` and the lexicon and
    expression dictionary are loaded on ``parent``.
    """

    def __init__(self, parent, dlg = None) :
        """Show the preference dialog and prepare the import settings.

        Args:
            parent: application object providing ``UserConfigPath``,
                ``filename`` and ``DictPath``.
            dlg: optional value; when not None it is replaced by a
                ``progressbar`` built from it.
        """
        self.parent = parent
        self.dlg = dlg
        conf_file = os.path.join(self.parent.UserConfigPath,'corpus.cfg')
        parametres = DoConf(conf_file).getoptions('corpus')
        parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
        parametres['corpus_name'] = os.path.basename(parametres['pathout'])
        dial = CorpusPref(parent, parametres)
        dial.CenterOnParent()
        dial.txtpath.SetLabel(parent.filename)
        self.res = dial.ShowModal()
        if self.dlg is not None :
            self.dlg = progressbar(self.parent, self.dlg)
        if self.res != 5100 :
            # Dialog dismissed: release the dialog and the progress bar.
            dial.Destroy()
            if self.dlg is not None :
                self.dlg.Destroy()
            return
        parametres = dial.doparametres()
        parametres['originalpath'] = parent.filename
        PathOut().createdir(parametres['pathout'])
        # An empty or missing dictionary entry means "no custom dictionary".
        filein = parametres.get('dictionary', False) or None
        typed_name = dial.corpusname.GetValue()
        if typed_name != '' :
            parametres['corpus_name'] = typed_name
        dial.Destroy()
        ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
        lang = parametres['lang']
        expressions = {}
        if lang != 'other' :
            exp_path = self.parent.DictPath.get(lang + '_exp', 'french_exp')
            if os.path.exists(exp_path) :
                expressions = ReadDicoAsDico(exp_path)
        self.parent.expressions = expressions
        self.parametres = parametres

    def doanalyse(self) :
        """Build the corpus from ``parent.filename`` and return it."""
        builder = BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg)
        return builder.corpus
+
class SubBuilder :
    """Ask the user which part of a corpus to extract as a sub-corpus.

    The constructor shows ``SubTextFromMetaDial``; its result code is kept
    in ``self.res``.  When the dialog is confirmed (code 5100) the settings
    needed by ``BuildSubCorpus`` are stored in ``self.parametres``.
    """

    def __init__(self, parent, corpus, parametres = None, dlg = None):
        """Prepare the selection list, show the dialog and record the choice.

        Args:
            parent: parent window, used for the dialog and the busy notice.
            corpus: source corpus; provides ``parametres`` and the
                ``make_etoiles`` / ``make_themes`` lists.
            parametres: settings dict, updated in place.  The keys
                ``frommeta``, ``fromtheme`` and ``fromclusters`` (with
                ``clnb``) select what is listed.  ``None`` is treated as an
                empty dict.
            dlg: when not None, a busy notice is shown while the list is
                being prepared.
        """
        # The default used to be subscripted directly, which failed on None.
        if parametres is None :
            parametres = {}
        self.parent = parent
        self.ori = corpus
        self.dlg = dlg
        default_name = 'Sub' + corpus.parametres['corpus_name']
        busy = None
        if dlg is not None :
            # The busy notice needs a window as parent; this object is not
            # one, so the parent window is passed instead of ``self``.
            busy = wx.BusyInfo(_("Please wait..."), parent)
            wx.SafeYield()
        parametres['corpus_name'] = default_name
        if parametres.get('frommeta', False) :
            parametres['meta'] = corpus.make_etoiles()
        elif parametres.get('fromtheme', False) :
            parametres['meta'] = corpus.make_themes()
        elif parametres.get('fromclusters', False) :
            parametres['meta'] = [' '.join(['classe', repr(i)]) for i in range(1,parametres['clnb'] + 1)]
        else :
            parametres['meta'] = []
        if 'fromclusters' not in parametres :
            parametres['meta'].sort()
        # Dropping the reference removes the busy notice (if any).
        del busy
        dial = SubTextFromMetaDial(parent, parametres)
        self.res = dial.ShowModal()
        if self.res != 5100 :
            dial.Destroy()
            return
        typed_name = dial.subcorpusname.GetValue()
        corpus_name = default_name
        if typed_name != '' :
            # Keep only characters that are alphanumeric or an underscore.
            corpus_name = ''.join([l for l in typed_name if l.isalnum() or l == '_'])
        if corpus_name != '' :
            parametres['corpus_name'] = corpus_name
        else :
            parametres['corpus_name'] = default_name
        # Pick the first numbered output directory that does not exist yet.
        pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
        i = 1
        while os.path.exists(pathout + '_%i' % i) :
            i += 1
        parametres['pathout'] = pathout + '_%i' % i
        meta = dial.m_listBox1.GetSelections()
        if 'fromclusters' not in parametres :
            # Translate selected positions into the listed values.
            parametres['meta'] = [parametres['meta'][val] for val in meta]
        else :
            parametres['meta'] = meta
        self.parametres = parametres
        dial.Destroy()

    def doanalyse(self):
        """Build the sub-corpus from the recorded settings and return it."""
        return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus
+
class BuildMergeFromClusters(BuildCorpus):
    """Build one corpus out of clusters selected from several analyses.

    For each analysis the source corpus is re-read, the segments of every
    selected cluster are copied into the new corpus, and each copied text is
    tagged with the label chosen for that cluster.
    """

    def __init__(self, analyses, parametres, dlg = None) :
        """Collect the selected segments and build the merged corpus.

        Args:
            analyses: list of analysis settings; each provides ``'corpus'``
                and ``'pathout'``.
            parametres: settings providing ``corpus_name``, ``pathout``,
                ``corpusira``, ``clusters`` and ``newet`` (the last three
                indexed like ``analyses``).  The key ``uceids`` is
                overwritten while clusters are processed.
            dlg: optional progress dialog, stored on the instance.
        """
        log.info('begin subcorpus...')
        self.dlg = dlg
        self.infile = None
        self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : 'MergeFromClusters', 'encoding' : 'merge'})
        self.last = 0
        self.analyses = analyses
        # One list of kept (old) segment ids per analysis.
        self.lcl = []
        self.parametres = parametres
        self.corpus.parametres['corpus_name'] = parametres['corpus_name']
        self.corpus.pathout = PathOut(filename = 'MFC', dirout = parametres['pathout'])
        self.corpus.pathout.createdir(parametres['pathout'])
        self.corpus.parametres['pathout'] = parametres['pathout']
        self.corpus.parametres['meta'] = parametres.get('meta', False)
        self.corpus.parametres['uuid'] = str(uuid4())
        for i, analyse in enumerate(analyses) :
            self.lcl.append([])
            self.analyseid = i
            corpus_uuid = analyse['corpus']
            irapath = parametres['corpusira'][i]
            corpus = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
            ucepath = os.path.join(analyse['pathout'], 'uce.csv')
            corpus.make_ucecl_from_R(ucepath)
            self.ori = corpus
            for j, cl in enumerate(parametres['clusters'][i]) :
                # Cluster numbers are 1-based, ``lc`` is 0-based.
                self.parametres['uceids'] = self.ori.lc[cl-1]
                self.lcl[i] += self.ori.lc[cl-1]
                self.et = parametres['newet'][i][j]
                self.fromuceids()
        #create database
        self.connect()
        self.dobuild()

    def fromuceids(self):
        """Copy the texts of ``self.ori`` that hold segments to keep.

        The ids to keep are read from ``self.parametres['uceids']``.  Each
        copied text keeps only the matching segments, gets ``self.et``
        appended to its ``etoiles`` and records ``self.analyseid``.

        The kept segments are built once per text.  They used to be rebuilt
        and appended once per paragraph label, which duplicated every kept
        segment in texts holding several paragraphs.
        """
        print('fromuceids')
        keep = set(self.parametres['uceids'])
        for uci in self.ori.ucis :
            keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in keep]
            if keepuces == [] :
                continue
            nuci = CopyUci(uci)
            nuci.uces = keepuces
            if uci.paras != [] :
                # All paragraph labels of the text are carried over.
                nuci.paras = list(uci.paras)
            nuci.etoiles.append(self.et)
            nuci.analyseid = self.analyseid
            self.corpus.ucis.append(nuci)

    def read_corpus(self, infile = None):
        """Renumber texts, paragraphs and segments, then store the segments.

        Old segment ids are only unique inside one analysis, so the mapping
        to new ids is keyed by ``'<analyse index>-<old id>'``.  Segment
        texts are fetched again from each source corpus and inserted under
        their new id; every word is registered in the merged corpus.

        ``infile`` is accepted for signature compatibility and is not used.
        """
        ident_uce = 0
        ident_para = -1
        lastpara = -1
        newuceident = {}
        print('redo text, para and st ident')
        for ident_uci, uci in enumerate(self.corpus.ucis) :
            uci.ident = ident_uci
            for uce in uci.uces :
                uce.uci = uci.ident
                # A change of the old paragraph id opens a new paragraph.
                if uce.para != lastpara :
                    ident_para += 1
                    lastpara = uce.para
                uce.para = ident_para
                newuceident['%i-%i' %(uci.analyseid, uce.ident)] = ident_uce
                uce.ident = ident_uce
                ident_uce += 1
        print('backup st text and forms')
        for i, analyse in enumerate(self.analyses) :
            irapath = self.parametres['corpusira'][i]
            old = Corpus(self, parametres = DoConf(irapath).getoptions('corpus'), read = True)
            for row in old.getconcorde(self.lcl[i]) :
                new_id = newuceident['%i-%i' % (i,row[0])]
                self.c.execute('INSERT INTO uces VALUES(?,?);', (new_id, row[1]))
                for word in row[1].split() :
                    self.corpus.add_word_from_forme(old.formes[word], new_id)
        self.backup_uce()
        print('done')
+
+
class MergeClusters :
    """Ask the user which clusters to merge into a new corpus.

    The constructor shows ``MergeClusterFrame``; its result code is kept in
    ``self.res``.  When the dialog is confirmed (code 5100) the settings
    needed by ``BuildMergeFromClusters`` are stored in ``self.analyses`` and
    ``self.parametres``.
    """

    def __init__(self, parent, parametres = None, dlg = None):
        """Show the dialog and group the selected clusters by analysis.

        Args:
            parent: parent window; also provides ``history.corpus``.
            parametres: settings dict, updated in place with
                ``corpus_name``, ``pathout``, ``clusters``, ``newet`` and
                ``corpusira``.  ``None`` is treated as an empty dict.
            dlg: when not None, a busy notice is shown briefly.
        """
        # The default used to be subscripted directly, which failed on None.
        if parametres is None :
            parametres = {}
        self.parent = parent
        self.dlg = dlg
        default_name = 'MergeFromClusters'
        busy = None
        if dlg is not None :
            # The busy notice needs a window as parent; this object is not
            # one, so the parent window is passed instead of ``self``.
            busy = wx.BusyInfo(_("Please wait..."), parent)
            wx.SafeYield()
        parametres['corpus_name'] = default_name
        # Dropping the reference removes the busy notice (if any).
        del busy
        dial = MergeClusterFrame(parent)
        dial.m_textCtrl4.SetValue(default_name)
        self.res = dial.ShowModal()
        if self.res != 5100 :
            dial.Destroy()
            return
        self.analyses = {}
        self.clusters = {}
        self.newet = {}
        self.corpusira = {}
        typed_name = dial.m_textCtrl4.GetValue()
        corpus_name = default_name
        if typed_name != '' :
            # Keep only characters that are alphanumeric or an underscore.
            corpus_name = ''.join([l for l in typed_name if l.isalnum() or l == '_'])
        if corpus_name != '' :
            parametres['corpus_name'] = corpus_name
        else :
            parametres['corpus_name'] = default_name
        # Each selected entry is indexed as (analysis id, corpus uuid,
        # cluster number); the value stored for it is the new label.
        for cl in dial.selected :
            analyse_id = cl[0]
            corpus_uuid = cl[1]
            irapath = self.parent.history.corpus[corpus_uuid]['ira']
            if analyse_id not in self.analyses :
                analyse = DoConf(dial.irapath[analyse_id]).getoptions()
                self.analyses[analyse_id] = analyse
                self.clusters[analyse_id] = [cl[2]]
                self.newet[analyse_id] = [dial.selected[cl]]
                self.corpusira[analyse_id] = irapath
            else :
                self.clusters[analyse_id].append(cl[2])
                self.newet[analyse_id].append(dial.selected[cl])
        # Turn the per-analysis dicts into parallel lists sharing one order.
        order = list(self.clusters)
        clusters = [self.clusters[val] for val in order]
        self.newet = [self.newet[val] for val in order]
        corpusira = [self.corpusira[val] for val in order]
        analyses = [self.analyses[val] for val in order]
        # NOTE(review): an empty selection makes analyses[0] fail with an
        # IndexError; confirm whether the dialog prevents that case.
        pathout = os.path.dirname(os.path.dirname(analyses[0]['pathout']))
        self.analyses = analyses
        pathout = os.path.join(pathout, parametres['corpus_name'])
        # Pick the first numbered output directory that does not exist yet.
        i = 1
        while os.path.exists(pathout + '_%i' % i) :
            i += 1
        parametres['pathout'] = pathout + '_%i' % i
        self.parametres = parametres
        self.parametres['clusters'] = clusters
        self.parametres['newet'] = self.newet
        self.parametres['corpusira'] = corpusira
        dial.Destroy()

    def doanalyse(self):
        """Build the merged corpus from the recorded settings and return it."""
        return BuildMergeFromClusters(self.analyses, parametres = self.parametres, dlg = self.dlg).corpus