import locale
import sys
from time import time
-from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
+from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
import re
import sqlite3
import itertools
from uuid import uuid4
from chemins import PathOut
from dialog import CorpusPref, SubTextFromMetaDial
+from copy import copy
from colors import colors
import datetime
copy_corpus.conn_all()
return copy_corpus
+def CopyUce(uce) :
+    """Return a new Uce built from the ident, para and uci of *uce*.
+
+    Only these three attributes are read from the source object.
+    """
+    ident, para, uci = uce.ident, uce.para, uce.uci
+    return Uce(ident, para, uci)
+
+
+def CopyUci(uci):
+    """Return a new Uci with the same ident as *uci*.
+
+    The new object is created with an empty line, then receives a shallow
+    copy of uci.etoiles and a CopyUce() duplicate of every element of
+    uci.uces. No other attribute of *uci* is transferred here.
+    """
+    duplicate = Uci(uci.ident, '')
+    duplicate.etoiles = copy(uci.etoiles)
+    copied = []
+    for uce in uci.uces :
+        copied.append(CopyUce(uce))
+    duplicate.uces = copied
+    return duplicate
+
class Corpus :
else :
self.idformesuces[self.formes[word.forme].ident] = {stident: 1}
else :
- self.formes[word.forme] = word
- self.formes[word.forme].ident = len(self.formes)
- self.formes[word.forme].freq = 1
+ self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem)
self.idformesuces[self.formes[word.forme].ident] = {stident : 1}
def conn_all(self):
query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
res = self.cformes.execute(query)
return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
+
+    def gettgenst(self, tgen):
+        """Return the uce ids stored for the forms of every lemma in *tgen*.
+
+        tgen -- iterable of lemmas. A lemma missing from self.lems is
+                reported on stdout and ignored.
+
+        Returns a list of unique ids (order not guaranteed); an empty list
+        when no lemma of tgen is known.
+        """
+        formesid = []
+        for lem in tgen :
+            if lem in self.lems :
+                formesid += self.lems[lem].formes
+            else :
+                print 'abscent : %s' % lem
+        if not formesid :
+            # nothing to look up: skip the query
+            return []
+        # Join the ids explicitly: str(tuple(...)) renders a single id as
+        # "(5,)" and a long integer as "5L", neither of which is valid SQL.
+        # NOTE(review): assumes form ids are integers - confirm.
+        idlist = ', '.join([str(val) for val in formesid])
+        query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % idlist
+        res = self.cformes.execute(query)
+        uceids = set()
+        for row in res :
+            if isinstance(row[0], int) :
+                # a single id comes back as an integer
+                uceids.add(row[0])
+            else :
+                # several ids come back as one space-separated string
+                uceids.update([int(val) for val in row[0].split()])
+        return list(uceids)
+
+    def gettgenstprof(self, tgen, classe, i, clnb):
+        """Gather the uces of a tgen and record per-lemma counts for class *i*.
+
+        tgen   -- iterable of lemmas
+        classe -- collection intersected with the uces of each lemma
+        i      -- position updated in each lemma's row of self.tgenlem
+        clnb   -- length of a newly created row in self.tgenlem
+
+        Side effect: self.tgenlem[lem][i] receives the number of uces of
+        lem that are also in classe. Lemmas missing from self.lems are
+        printed and skipped.
+
+        Returns the unique uces of all known lemmas (order not guaranteed).
+        """
+        tgenst = []
+        for lem in tgen :
+            if lem not in self.lems :
+                print 'abscent: ',lem
+                continue
+            lemst = self.getlemuces(lem)
+            tgenst.extend(lemst)
+            # create the row of zeros the first time this lemma is seen
+            row = self.tgenlem.setdefault(lem, [0] * clnb)
+            row[i] = len(set(lemst).intersection(classe))
+        return list(set(tgenst))
+
+    def gettgentxt(self, tgen):
+        """Return the distinct `uci` values of the uces matched by *tgen*.
+
+        The uce ids come from gettgenst(); each one is resolved with
+        getucefromid() and its `uci` attribute is collected. The order of
+        the returned list is not guaranteed.
+        """
+        textes = set()
+        for uceid in self.gettgenst(tgen) :
+            textes.add(self.getucefromid(uceid).uci)
+        return list(textes)
def getlemucis(self, lem) :
uces = self.getlemuces(lem)
for lem in tokeep :
deff = self.getlemuceseff(lem)
ucesk = deff.keys()
- tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
+ line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
+ if sum(line[1:]) >= mineff :
+ tab.append(line)
tab.insert(0, [''] + etoiles)
return tab
tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
return tgenoccurrences, totoccurrences
+    def make_tgen_profile(self, tgen, ucecl, uci = False) :
+        """Build the tgen/class count table.
+
+        tgen  -- mapping: tgen name -> iterable of lemmas
+        ucecl -- sequence of classes, each a collection of ids
+        uci   -- when true, count through gettgentxt() instead of
+                 gettgenstprof()
+
+        self.tgenlem is reset, then filled by gettgenstprof() when uci is
+        false. Returns a list of rows [name, count_1, ..., count_n],
+        keeping only the rows whose counts sum to at least 3.
+        """
+        log.info('tgen/classes')
+        self.tgenlem = {}
+        clnb = len(ucecl)
+        tab = []
+        for lem in tgen :
+            line = [lem]
+            for i, classe in enumerate(ucecl) :
+                if uci :
+                    #FIXME : no longer works, this has to be changed
+                    found = self.gettgentxt(tgen[lem])
+                else :
+                    found = self.gettgenstprof(tgen[lem], classe, i, clnb)
+                line.append(len(set(found).intersection(classe)))
+            # drop the tgens whose total over all classes is below 3
+            if sum(line[1:]) >= 3 :
+                tab.append(line)
+        return tab
+ #i = 0
+ #nam = 'total'
+ #while nam + `i` in tgen :
+ # i += 1
+ #nam = nam + `i`
+ #last = [nam] + [`len(classe)` for classe in ucecl]
+ #tab += [last]
+ #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
+ #tab = [line0] + tab
+ #with open(fileout, 'w') as f :
+ # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
+
def make_efftype_from_etoiles(self, etoiles) :
dtype = {}
etuces = [[] for et in etoiles]
ident += 1
f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
+    def export_meta_table(self, outf) :
+        """Write a tab-separated table of the texts' etoiles to *outf*.
+
+        Each text of self.ucis gives one line: its index followed by
+        etoiles[1:]. The first line is a header 'column_0', 'column_1', ...
+        as wide as the longest line. The output is encoded with
+        self.parametres['syscoding'].
+        """
+        metas = []
+        for i, text in enumerate(self.ucis) :
+            metas.append([repr(i)] + text.etoiles[1:])
+        longueur_max = max(map(len, metas))
+        first = []
+        for i in range(longueur_max) :
+            first.append('column_%i' % i)
+        table = [first] + metas
+        with open(outf, 'w') as f :
+            lines = ['\t'.join(line) for line in table]
+            f.write('\n'.join(lines).encode(self.parametres['syscoding']))
+
def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
ucecl = {}
for i, lc in enumerate(self.lc) :
f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
f.seek(0)
with open(outfile, 'w') as ffin :
- ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
+ ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
for line in f :
ffin.write(line)
os.remove(outfile + '~')
except IndexError :
det[et[0]] = 1
return det
+
+    def make_theme_dict(self):
+        """Count the themes (uci.paras entries) grouped by their prefix.
+
+        The prefix of a theme is the text before its first underscore (the
+        whole theme when it has none).
+
+        Returns a dict: prefix -> {theme: number of occurrences}.
+        """
+        det = {}
+        for uci in self.ucis :
+            for theme in uci.paras :
+                # The former try/except IndexError wrappers were dead code:
+                # joining a slice never raises, so this is the only path.
+                counts = det.setdefault(theme.split('_')[0], {})
+                counts[theme] = counts.get(theme, 0) + 1
+        return det
def make_etline(self, listet) :
etuces = [[] for et in listet]
self.cleans.append(self.dotiret)
def make_expression(self,txt) :
- for expression in self.expressions:
+ # Walk the expressions in reverse lexicographic order: an expression that
+ # starts with another, shorter expression is then tried before it.
+ exp = self.expressions.keys()
+ exp.sort(reverse=True)
+ for expression in exp :
if expression in txt :
+ # every occurrence is replaced by the first element of the entry
txt = txt.replace(expression, self.expressions[expression][0])
return txt
self.dlg = dlg
self.ori = corpus
self.infile = None
- self.corpus = Corpus(self, corpus.parametres)
+ self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
self.last = 0
+ self.parametres = parametres
self.encoding = corpus.parametres['encoding']
self.corpus.parametres['corpus_name'] = parametres['corpus_name']
self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
self.corpus.parametres['uuid'] = str(uuid4())
if parametres.get('frommeta', False) :
print 'make subtexts'
- self.corpus.ucis = [uci for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
+ self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
elif parametres.get('fromtheme', False) :
print 'make subtexts from theme'
idpara = 0
newpara = []
for et in uci.paras :
if et in parametres['meta'] :
- newuce += [uce for uce in uci.uces if uce.para == idpara]
+ newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
newpara.append(et)
idpara += 1
if newuce != [] :
- uci.uces = newuce
- uci.paras = newpara
- self.corpus.ucis.append(uci)
+ nuci = CopyUci(uci)
+ nuci.uces = newuce
+ nuci.paras = newpara
+ self.corpus.ucis.append(nuci)
else :
idpara += 1
- elif parametres.get('fromcluster', False) :
- pass
+ elif parametres.get('fromclusters', False) :
+ self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
+ self.fromuceids()
+ elif parametres.get('fromuceids', False) :
+ self.fromuceids()
#create database
self.connect()
self.dobuild()
+
+    def fromuceids(self):
+        """Fill self.corpus.ucis from the uces listed in self.parametres['uceids'].
+
+        Each text of self.ori.ucis that owns at least one listed uce is
+        copied with CopyUci() and restricted to copies of those uces. For a
+        text with paragraphs, only the paragraphs that still own a kept uce
+        are retained.
+
+        idpara numbers the paragraphs over the whole corpus, a text without
+        paragraphs counting for one, exactly as the "fromtheme" selection
+        does.
+        """
+        print 'fromuceids'
+        # only membership is tested, so a set replaces the former id -> id dict
+        ucekeep = set(self.parametres['uceids'])
+        idpara = 0
+        for uci in self.ori.ucis :
+            if uci.paras == [] :
+                keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in ucekeep]
+                if keepuces != [] :
+                    nuci = CopyUci(uci)
+                    nuci.uces = keepuces
+                    self.corpus.ucis.append(nuci)
+                idpara += 1
+            else :
+                newuces = []
+                newpara = []
+                for et in uci.paras :
+                    # Restrict to the uces of the current paragraph. Without
+                    # this filter every kept uce was appended once per
+                    # paragraph and every paragraph was retained.
+                    # NOTE(review): relies on uce.para holding the same
+                    # corpus-wide index as in the "fromtheme" branch - confirm.
+                    keepuces = [CopyUce(uce) for uce in uci.uces
+                                if uce.para == idpara and uce.ident in ucekeep]
+                    idpara += 1
+                    if keepuces != [] :
+                        newuces += keepuces
+                        newpara.append(et)
+                if newuces != [] :
+                    nuci = CopyUci(uci)
+                    nuci.uces = newuces
+                    nuci.paras = newpara
+                    self.corpus.ucis.append(nuci)
def read_corpus(self, infile = None):
self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
def __init__(self, parent, dlg = None) :
self.parent = parent
self.dlg = dlg
+
parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
dial.txtpath.SetLabel(parent.filename)
#dial.repout_choices.SetValue(parametres['pathout'])
self.res = dial.ShowModal()
+ if self.dlg is not None :
+ self.dlg = progressbar(self.parent, self.dlg)
if self.res == 5100 :
parametres = dial.doparametres()
parametres['originalpath'] = parent.filename
PathOut().createdir(parametres['pathout'])
- ReadLexique(self.parent, lang = parametres['lang'])
+ if parametres.get('dictionary', False) :
+ filein = parametres['dictionary']
+ else :
+ filein = None
+ if dial.corpusname.GetValue() != '' :
+ parametres['corpus_name'] = dial.corpusname.GetValue()
+ dial.Destroy()
+ ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
else :
self.parent.expressions = {}
self.parametres = parametres
else :
+ dial.Destroy()
if self.dlg is not None :
self.dlg.Destroy()
- dial.Destroy()
def doanalyse(self) :
return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus
self.ori = corpus
self.dlg = dlg
corpus_name = 'Sub' + corpus.parametres['corpus_name']
+ if dlg is not None :
+ busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self)
+ wx.SafeYield()
parametres['corpus_name'] = corpus_name
if parametres.get('frommeta', False) :
parametres['meta'] = corpus.make_etoiles()
elif parametres.get('fromtheme', False) :
parametres['meta'] = corpus.make_themes()
- parametres['meta'].sort()
+ elif parametres.get('fromclusters', False) :
+ parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
+ else :
+ parametres['meta'] = []
+ if 'fromclusters' not in parametres :
+ parametres['meta'].sort()
+ if dlg is not None :
+ del busy
dial = SubTextFromMetaDial(parent, parametres)
self.res = dial.ShowModal()
if self.res == 5100 :
i += 1
parametres['pathout'] = pathout + '_%i' % i
meta = dial.m_listBox1.GetSelections()
- parametres['meta'] = [parametres['meta'][val] for val in meta]
+ if not 'fromclusters' in parametres :
+ parametres['meta'] = [parametres['meta'][val] for val in meta]
+ else :
+ parametres['meta'] = meta
self.parametres = parametres
dial.Destroy()
else :
def doanalyse(self):
return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus
-
-if __name__ == '__main__' :
- t1 = time()
- parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
- intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)
- print time() - t1