import locale
import sys
from time import time
-from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
+from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar
import re
import sqlite3
import itertools
query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
res = self.cformes.execute(query)
return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
+
+ def gettgenst(self, tgen):
+ formesid = []
+ for lem in tgen :
+ if lem in self.lems :
+ formesid += self.lems[lem].formes
+ else :
+ print 'abscent : %s' % lem
+ query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid))
+ res = self.cformes.execute(query)
+ return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))))
+
+ def gettgenstprof(self, tgen, classe, i, clnb):  # gather what self.getlemuces() returns for each known lemma of tgen; also fills self.tgenlem[lem][i]
+ tgenst = []
+ for lem in tgen :
+ if lem in self.lems :
+ lemst = self.getlemuces(lem)
+ tgenst += lemst
+ if not lem in self.tgenlem :
+ self.tgenlem[lem] = [0] * clnb  # first time this lemma is seen: clnb zeroed slots
+ self.tgenlem[lem][i] = len(set(lemst).intersection(classe))  # how many items of lemst are also in classe, stored in slot i
+ else :  # NOTE(review): presumably pairs with `if lem in self.lems` - indentation is not visible here, confirm
+ print 'abscent: ',lem
+ return list(set(tgenst))  # duplicates removed; resulting order is not guaranteed
+
+ def gettgentxt(self, tgen):
+ sts = self.gettgenst(tgen)
+ return list(set([self.getucefromid(val).uci for val in sts]))
def getlemucis(self, lem) :
uces = self.getlemuces(lem)
for lem in tokeep :
deff = self.getlemuceseff(lem)
ucesk = deff.keys()
- tab.append([lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])
+ line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces]
+ if sum(line[1:]) >= mineff :
+ tab.append(line)
tab.insert(0, [''] + etoiles)
return tab
tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern])
return tgenoccurrences, totoccurrences
+ def make_tgen_profile(self, tgen, ucecl, uci = False) :  # build one row per key of tgen: [key, count for each element of ucecl]
+ log.info('tgen/classes')
+ self.tgenlem = {}  # reset here; gettgenstprof() fills it per lemma
+ clnb = len(ucecl)  # number of classes, used to size the per-lemma counters
+ if uci :
+ #FIXME : no longer works, change this
+ tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
+ else :
+ tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen]
+ tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3]  # keep only the rows whose counts sum to at least 3
+ return tab  # list of rows, no header line
+ #i = 0
+ #nam = 'total'
+ #while nam + `i` in tgen :
+ # i += 1
+ #nam = nam + `i`
+ #last = [nam] + [`len(classe)` for classe in ucecl]
+ #tab += [last]
+ #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))]
+ #tab = [line0] + tab
+ #with open(fileout, 'w') as f :
+ # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding']))
+
def make_efftype_from_etoiles(self, etoiles) :
dtype = {}
etuces = [[] for et in etoiles]
ident += 1
f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
+ def export_meta_table(self, outf) :
+ metas = [[`i`] + text.etoiles[1:] for i, text in enumerate(self.ucis)]
+ longueur_max = max([len(val) for val in metas])
+ first = ['column_%i' % i for i in range(longueur_max)]
+ metas.insert(0, first)
+ with open(outf, 'w') as f :
+ f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding']))
+
def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
ucecl = {}
for i, lc in enumerate(self.lc) :
f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
f.seek(0)
with open(outfile, 'w') as ffin :
- ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
+ ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
for line in f :
ffin.write(line)
os.remove(outfile + '~')
except IndexError :
det[et[0]] = 1
return det
+
+ def make_theme_dict(self):
+ themes = [val for uci in self.ucis for val in uci.paras]
+ det = {}
+ for theme in themes :
+ th = theme.split('_')
+ if th[0] in det :
+ try :
+ endth = '_'.join(th[1:])
+ if theme in det[th[0]] :
+ det[th[0]][theme] += 1
+ else :
+ det[th[0]][theme] = 1
+ except IndexError :
+ det[th[0]] += 1
+ else :
+ try :
+ endth = '_'.join(th[1:])
+ det[th[0]] = {theme:1}
+ except IndexError :
+ det[th[0]] = 1
+ return det
def make_etline(self, listet) :
etuces = [[] for et in listet]
self.cleans.append(self.dotiret)
def make_expression(self,txt) :
- for expression in self.expressions:
+ exp = self.expressions.keys()  # Python 2: keys() returns a list, sorted in place on the next line
+ exp.sort(reverse=True)  # reverse order: an expression comes before any other expression that is a prefix of it
+ for expression in exp :  # every expression found in txt is replaced by the first item of its entry
if expression in txt :
txt = txt.replace(expression, self.expressions[expression][0])
return txt
self.infile = None
self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
self.last = 0
+ self.parametres = parametres
self.encoding = corpus.parametres['encoding']
self.corpus.parametres['corpus_name'] = parametres['corpus_name']
self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
self.corpus.ucis.append(nuci)
else :
idpara += 1
- elif parametres.get('fromcluster', False) :
- pass
+ elif parametres.get('fromclusters', False) :
+ self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]]
+ self.fromuceids()
elif parametres.get('fromuceids', False) :
- print 'fromuceids'
- dictucekeep = dict(zip(parametres['uceids'], parametres['uceids']))
- idpara = 0
- for uci in self.ori.ucis :
- if uci.paras == [] :
- keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
- if keepuces != [] :
- nuci = CopyUci(uci)
- nuci.uces = keepuces
- self.corpus.ucis.append(nuci)
- idpara += 1
- else :
- newuces = []
- newpara = []
- for et in uci.paras :
- keepuces = [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
- idpara += 1
- if keepuces != [] :
- newuces += keepuces
- newpara.append(et)
- if newuces != [] :
- nuci = CopyUci(uci)
- nuci.uces = newuces
- nuci.paras = newpara
- self.corpus.ucis.append(nuci)
-
+ self.fromuceids()
#create database
self.connect()
self.dobuild()
+
+ def fromuceids(self):
+ print 'fromuceids'
+ dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids']))
+ idpara = 0
+ for uci in self.ori.ucis :
+ if uci.paras == [] :
+ keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
+ if keepuces != [] :
+ nuci = CopyUci(uci)
+ nuci.uces = keepuces
+ self.corpus.ucis.append(nuci)
+ idpara += 1
+ else :
+ newuces = []
+ newpara = []
+ for et in uci.paras :
+ keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
+ idpara += 1
+ if keepuces != [] :
+ newuces += keepuces
+ newpara.append(et)
+ if newuces != [] :
+ nuci = CopyUci(uci)
+ nuci.uces = newuces
+ nuci.paras = newpara
+ self.corpus.ucis.append(nuci)
def read_corpus(self, infile = None):
self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
def __init__(self, parent, dlg = None) :
self.parent = parent
self.dlg = dlg
+
parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
dial.txtpath.SetLabel(parent.filename)
#dial.repout_choices.SetValue(parametres['pathout'])
self.res = dial.ShowModal()
+ if self.dlg is not None :
+ self.dlg = progressbar(self.parent, self.dlg)
if self.res == 5100 :
parametres = dial.doparametres()
parametres['originalpath'] = parent.filename
PathOut().createdir(parametres['pathout'])
- ReadLexique(self.parent, lang = parametres['lang'])
+ if parametres.get('dictionary', False) :
+ filein = parametres['dictionary']
+ else :
+ filein = None
+ if dial.corpusname.GetValue() != '' :
+ parametres['corpus_name'] = dial.corpusname.GetValue()
+ dial.Destroy()
+ ReadLexique(self.parent, lang = parametres['lang'], filein = filein)
if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
else :
self.parent.expressions = {}
self.parametres = parametres
else :
+ dial.Destroy()
if self.dlg is not None :
self.dlg.Destroy()
- dial.Destroy()
def doanalyse(self) :
return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus
parametres['meta'] = corpus.make_etoiles()
elif parametres.get('fromtheme', False) :
parametres['meta'] = corpus.make_themes()
+ elif parametres.get('fromclusters', False) :
+ parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)]
else :
parametres['meta'] = []
- parametres['meta'].sort()
+ if 'fromclusters' not in parametres :
+ parametres['meta'].sort()
if dlg is not None :
del busy
dial = SubTextFromMetaDial(parent, parametres)
i += 1
parametres['pathout'] = pathout + '_%i' % i
meta = dial.m_listBox1.GetSelections()
- parametres['meta'] = [parametres['meta'][val] for val in meta]
+ if not 'fromclusters' in parametres :
+ parametres['meta'] = [parametres['meta'][val] for val in meta]
+ else :
+ parametres['meta'] = meta
self.parametres = parametres
dial.Destroy()
else :
def doanalyse(self):
return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus
-
-if __name__ == '__main__' :
- t1 = time()
- parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
- intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)
- print time() - t1