import locale
import sys
from time import time
-from functions import decoupercharact, ReadDicoAsDico, DoConf
+from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique
import re
import sqlite3
import itertools
from operator import itemgetter
from uuid import uuid4
from chemins import PathOut
-from dialog import CorpusPref
-from functions import ReadLexique, ReadDicoAsDico
+from dialog import CorpusPref, SubTextFromMetaDial
from colors import colors
import datetime
class Corpus :
"""Corpus class
- list of uci
-
+ list of text
"""
def __init__(self, parent, parametres = {}, read = False) :
self.parent = parent
lem = word
self.formes[word] = Word(word, gramtype, len(self.formes), lem)
self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
+
def add_word_from_forme(self, word, stident):
    """Count one occurrence of an existing Word object in segment `stident`.

    word: object exposing `forme`, `ident` and `freq` attributes.
    stident: identifier of the text segment in which the form occurs.

    A form not yet in `self.formes` is stored as-is (the very same object),
    then renumbered with the new size of `self.formes` and its frequency
    reset to 1. A known form has its frequency and its per-segment counter
    in `self.idformesuces` incremented.
    """
    key = word.forme
    if key not in self.formes:
        # First sighting: adopt the object and give it an ident local to
        # this corpus (size of the table once the form has been inserted).
        self.formes[key] = word
        word.ident = len(self.formes)
        word.freq = 1
        self.idformesuces[word.ident] = {stident: 1}
        return
    known = self.formes[key]
    known.freq += 1
    per_segment = self.idformesuces.setdefault(known.ident, {})
    per_segment[stident] = per_segment.get(stident, 0) + 1
def conn_all(self):
"""connect corpus to db"""
wordid = self.formes[wordid].ident
res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,))
return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
+
def getworducis(self, wordid):
    """Return the distinct text (uci) identifiers of the segments holding `wordid`.

    The order of the returned list is not defined (it comes from a set).
    """
    textids = set()
    for uceid in self.getworduces(wordid):
        textids.add(self.getucefromid(uceid).uci)
    return list(textids)
def getformeuceseff(self, formeid) :
if isinstance(formeid, basestring) :
lemuceeff = {}
for i, uce in enumerate(uces) :
lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
- return lemuceeff
+ return lemuceeff
def getlemclustereff(self, lem, cluster):
    """Return how many distinct segments of `self.lc[cluster]` contain `lem`."""
    members = set(self.lc[cluster])
    shared = members.intersection(self.getlemuces(lem))
    return len(shared)
return [len(uce[1].split()) for uce in res]
def getconcorde(self, uces):
    """Run a query on `self.cuces` for the rows of table `uces` whose id is listed.

    uces: iterable of segment identifiers; each one is inlined in the SQL
    text through `repr`, so they are expected to be plain integers.
    Returns whatever `self.cuces.execute` returns, rows ordered by id.
    """
    idlist = ', '.join([repr(uceid) for uceid in uces])
    query = 'select * from uces where id IN (%s) ORDER BY id;' % idlist
    return self.cuces.execute(query)
+
def getuciconcorde(self, ucis):
    """Return `[uci, text]` pairs for the given texts.

    ucis: iterable of indexes into `self.ucis`.
    The text of each entry is the second column of every row returned by
    `getconcorde` for the segments of that text, joined with newlines.
    """
    result = []
    for uci_pos in ucis:
        uceids = [uce.ident for uce in self.ucis[uci_pos].uces]
        rows = self.getconcorde(uceids)
        result.append([uci_pos, '\n'.join([row[1] for row in rows])])
    return result
def getwordconcorde(self, word):
    """Return the `getconcorde` result for every segment containing `word`."""
    uceids = self.getworduces(word)
    return self.getconcorde(uceids)
def getalluces(self):
    """Run an unfiltered query on table `uces` and return the cursor result.

    No ORDER BY is applied: callers must not rely on the row order.
    """
    query = 'SELECT * FROM uces'
    return self.cuces.execute(query)
-
+
def getallucis(self):
    """Return one `[uci ident, text]` pair per text of the corpus.

    The text is the content of every segment of the uci joined with newlines.

    Segment texts are looked up by their stored id (first column of the
    rows) rather than by their position in the result set: `getalluces`
    issues a SELECT without ORDER BY, so row position is not guaranteed to
    match `uce.ident`.
    """
    texts = dict((row[0], row[1]) for row in self.getalluces())
    return [[uci.ident, '\n'.join([texts[uce.ident] for uce in uci.uces])] for uci in self.ucis]
+
def getucesfrometoile(self, etoile):
    """Return the idents of all segments belonging to texts tagged with `etoile`."""
    found = []
    for uci in self.ucis:
        if etoile in uci.etoiles:
            found.extend([uce.ident for uce in uci.uces])
    return found
else :
idpara += 1
return etoileuces
+
def getetoileucis(self):
    """Map every metadata tag to the list of text idents carrying it.

    The first entry of `uci.etoiles` is skipped for every text.
    """
    by_etoile = {}
    for uci in self.ucis:
        for et in uci.etoiles[1:]:
            by_etoile.setdefault(et, []).append(uci.ident)
    return by_etoile
def getucefromid(self, uceid) :
if self.iduces is None : self.make_iduces()
self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
else :
self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
+
def make_lems_from_dict(self, dictionnaire, dolem = True):
    """Rebuild `self.lems` using the (lemma, grammatical type) pairs of `dictionnaire`.

    dictionnaire: mapping form -> sequence whose item 0 is the lemma and
        item 1 the grammatical type.
    dolem: when False, every form becomes its own lemma entry in `self.lems`
        (the `lem`/`gram` attributes of the form are still updated).

    Forms absent from the dictionary are tagged u'num' when made of digits
    only, u'nr' otherwise, and are used as their own lemma.
    """
    log.info('make lems from dict')
    self.lems = {}
    for forme in self.formes:
        word = self.formes[forme]
        if word.forme in dictionnaire:
            entry = dictionnaire[forme]
            lem, gram = entry[0], entry[1]
        elif forme.isdigit():
            lem, gram = forme, u'num'
        else:
            lem, gram = forme, u'nr'
        word.lem = lem
        word.gram = gram
        if not dolem:
            self.lems[forme] = Lem(self, word)
            continue
        if word.lem not in self.lems:
            self.lems[word.lem] = Lem(self, word)
        elif word.ident not in self.lems[word.lem].formes:
            self.lems[word.lem].add_forme(word)
def make_idformes(self):
    """Build `self.idformes`, the index of the entries of `self.formes` by their ident."""
    index = {}
    for forme in self.formes:
        word = self.formes[forme]
        index[word.ident] = word
    self.idformes = index
if self.iduces is None :
self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces])
- def make_lexitable(self, mineff, etoiles) :
- tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
+ def make_lexitable(self, mineff, etoiles, gram = 0) :
+ if gram == 0 :
+ grams = {1:'', 2:''}
+ else :
+ grams = {gram :''}
+ tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
etuces = [[] for et in etoiles]
for uci in self.ucis :
get = list(set(uci.etoiles).intersection(etoiles))
tab.insert(0, [''] + etoiles)
return tab
def make_tgen_table(self, tgen, etoiles, tot = None):
    """Count the occurrences of each tgen group and of all forms per metadata tag.

    tgen: object whose `tgen` attribute iterates over group names and which
        returns the lemmas of a group through `tgen[name]`.
    etoiles: list of metadata tags defining the columns.
    tot: when not None, the total occurrences are not computed and the
        second returned dict holds only zeros.

    Returns `(tgenoccurrences, totoccurrences)` where `tgenoccurrences` maps
    group -> {tag: count} and `totoccurrences` maps tag -> count.
    """
    class_sets = [set(self.getucesfrometoile(etoile)) for etoile in etoiles]

    def accumulate(uceeff, counts):
        # add the effectives found in the segments of each tag to `counts`
        for etoile, members in zip(etoiles, class_sets):
            hits = members.intersection(uceeff.keys())
            if hits:
                counts[etoile] += sum([uceeff[uce] for uce in hits])

    totoccurrences = dict([[val, 0] for val in etoiles])
    if tot is None:
        for forme in self.formes:
            accumulate(self.getformeuceseff(forme), totoccurrences)
    tgenoccurrences = {}
    for t in tgen.tgen:
        tgenoccurrences[t] = dict([[val, 0] for val in etoiles])
        for lem in tgen[t]:
            accumulate(self.getlemuceseff(lem), tgenoccurrences[t])
    return tgenoccurrences, totoccurrences
+
def make_efftype_from_etoiles(self, etoiles) :
dtype = {}
etuces = [[] for et in etoiles]
ident += 1
f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
def export_corpus_classes(self, outf, alc = True, lem = False, uci = False):
    """Write the whole corpus to `outf`, each unit preceded by its metadata line.

    alc: when True the metadata line is the list of tags of the text plus a
        `*classe_N` tag; otherwise tags (first one skipped) are written as
        `<name=value>` markers.
    lem: when True every form is replaced by its lemma.
    uci: when True the exported units are whole texts (`getallucis`),
        otherwise text segments (`getalluces`).

    Units listed in `self.lc0` get class 0; units of `self.lc[i]` get i + 1.
    """
    ucecl = {}
    for rank, members in enumerate(self.lc):
        ucecl.update(dict.fromkeys(members, rank + 1))
    ucecl.update(dict.fromkeys(self.lc0, 0))
    if uci:
        res = self.getallucis()
    else:
        res = self.getalluces()
        self.make_iduces()
    with open(outf, 'w') as f:
        for uce in res:
            # in text mode the row id already is the text index
            actuci = uce[0] if uci else self.iduces[uce[0]].uci
            guce = uce[1]
            if lem:
                guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
            if alc:
                etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]])
            else:
                etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]])
            f.write(etline.encode(self.parametres['syscoding']) + '\n')
            f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
def export_classe(self, outf, classe, lem = False, uci = False):
    """Write the units of class `classe` (1-based) to `outf`.

    Each unit is written as the tag line of its text followed by its
    content and an empty line.
    lem: when True every form is replaced by its lemma.
    uci: when True the class members are texts, otherwise text segments.
    """
    sts = self.lc[classe - 1]
    if uci:
        res = self.getuciconcorde(sts)
    else:
        res = self.getconcorde(sts)
        self.make_iduces()
    with open(outf, 'w') as f:
        for uce in res:
            textid = uce[0] if uci else self.iduces[uce[0]].uci
            f.write(' '.join(self.ucis[textid].etoiles).encode(self.parametres['syscoding']) + '\n')
            guce = uce[1]
            if lem:
                guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
            f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
+
def export_owledge(self, rep, classe, lem = False, uci = False):
    """Write every unit of class `classe` (1-based) to its own file in `rep`.

    Files are named `<unit id>.txt` and encoded in cp1252, characters that
    cannot be encoded being replaced.
    lem: when True every form is replaced by its lemma.
    uci: when True the class members are texts, otherwise text segments.
    """
    sts = self.lc[classe - 1]
    if uci:
        res = self.getuciconcorde(sts)
    else:
        res = self.getconcorde(sts)
        self.make_iduces()
    for uce in res:
        outf = os.path.join(rep, repr(uce[0]) + '.txt')
        guce = uce[1]
        if lem:
            guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
        with open(outf, 'w') as f:
            f.write(guce.encode('cp1252', errors = 'replace'))
+
def export_tropes(self, fileout, classe, lem = False, uci = False):
    """Write the units of class `classe` (1-based) to `fileout`, one per line.

    Output is encoded in cp1252, characters that cannot be encoded being
    replaced.
    lem: when True every form is replaced by its lemma.
    uci: when True the class members are texts, otherwise text segments.
    """
    sts = self.lc[classe - 1]
    if uci:
        res = self.getuciconcorde(sts)
    else:
        res = self.getconcorde(sts)
        self.make_iduces()
    with open(fileout, 'w') as f:
        for uce in res:
            guce = uce[1]
            if lem:
                guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
            f.write(guce.encode('cp1252', errors = 'replace'))
            f.write('\n')
def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
ffin.write(line)
os.remove(outfile + '~')
def make_table_with_classe(self, uces, list_act, uci = False):
    """Build the presence/absence table units x active lemmas.

    uces: list of unit identifiers (rows, in that order).
    list_act: list of lemmas (columns, in that order).
    uci: when True units are texts (`getlemucis`), otherwise segments
        (`getlemuces`).

    Returns a list of rows of 0/1 preceded by `list_act` as header row.
    """
    row_of = dict([[uce, pos] for pos, uce in enumerate(uces)])
    table_uce = [[0 for _ in list_act] for _ in range(0, len(uces))]
    getlem = self.getlemucis if uci else self.getlemuces
    for col, lem in enumerate(list_act):
        for uce in set(getlem(lem)).intersection(row_of):
            table_uce[row_of[uce]][col] = 1
    table_uce.insert(0, list_act)
    return table_uce
+
def make_pondtable_with_classe(self, uces, list_act):
    """Build the weighted table segments x active lemmas.

    Same layout as the presence/absence table, but each cell holds the
    number of occurrences of the lemma in the segment as reported by
    `getlemuceseff`. `list_act` is inserted as header row.
    """
    row_of = dict([[uce, pos] for pos, uce in enumerate(uces)])
    table_uce = [[0 for _ in list_act] for _ in range(0, len(uces))]
    for col, lem in enumerate(list_act):
        uceseff = self.getlemuceseff(lem)
        for uce in set(uceseff.keys()).intersection(row_of):
            table_uce[row_of[uce]][col] = uceseff[uce]
    table_uce.insert(0, list_act)
    return table_uce
def parse_active(self, gramact, gramsup = None) :
log.info('parse actives')
allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3]
self.activenb = len(allactives)
allactives = sorted(allactives, reverse = True)
+ if self.activenb == 0 :
+ return [], 0
if len(allactives) <= nbmax :
log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0]))
return [val[1] for val in allactives], allactives[-1][0]
log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout, uci = False):
    """Write the lemma x class contingency table to `fileout`.

    actives: lemmas to count.
    ucecl: list of classes, each one a collection of unit identifiers.
    uci: when True units are texts (`getlemucis`), otherwise segments.

    One `;`-separated line per lemma; lemmas whose total over all classes
    is below 3 are left out.
    """
    log.info('formes/classes')
    rows = []
    for lem in actives:
        if uci:
            counts = [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl]
        else:
            counts = [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl]
        if sum(counts) >= 3:
            rows.append([lem] + [repr(val) for val in counts])
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in rows]).encode(self.parametres['syscoding']))
for uci in self.ucis :
etoiles.update(uci.etoiles[1:])
return list(etoiles)
+
def make_themes(self):
    """Return the distinct entries found in the `paras` of all texts (unordered)."""
    return list(set([para for uci in self.ucis for para in uci.paras]))
def make_etoiles_dict(self) :
etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]]
etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
return etuces
def make_and_write_profile_et(self, ucecl, fileout, uci = False):
    """Write the metadata tag x class contingency table to `fileout`.

    ucecl: list of classes, each one a collection of unit identifiers.
    uci: when True units are texts (`getetoileucis`), otherwise segments
        (`getetoileuces`).

    Tags attached to a single unit are left out. One `;`-separated line is
    written per remaining tag.
    """
    log.info('etoiles/classes')
    if uci:
        etoileuces = self.getetoileucis()
    else:
        etoileuces = self.getetoileuces()
    etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
    with open(fileout, 'w') as f:
        lines = []
        for et in etoileuces:
            counts = [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]
            lines.append(';'.join([et] + counts))
        f.write('\n'.join(lines).encode(self.parametres['syscoding']))
- def make_colored_corpus(self) :
+ def make_colored_corpus(self, uci = False) :
ucecl = {}
for i, lc in enumerate(self.lc) :
for uce in lc :
<meta http-equiv="content-Type" content="text/html; charset=%s" />
<body>
''' % sys.getdefaultencoding()
- res = self.getalluces()
- self.make_iduces()
- actuci = ''
- actpara = False
- for uce in res :
- if self.iduces[uce[0]].uci != actuci :
- actuci = self.iduces[uce[0]].uci
- txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
- txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
- else :
- txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
+ if not uci :
+ res = self.getalluces()
+ self.make_iduces()
+ actuci = ''
+ actpara = False
+ for uce in res :
+ if self.iduces[uce[0]].uci != actuci :
+ actuci = self.iduces[uce[0]].uci
+ txt += '<br><hr>' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles) + '<br><br>'
+ txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
+ else :
+ txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
+ else :
+ res = self.getallucis()
+ actuci = ''
+ for uce in res :
+ if self.ucis[uce[0]].ident != actuci :
+ actuci = self.ucis[uce[0]].ident
+ txt += '<br><hr>' + ' '.join(self.ucis[self.ucis[uce[0]].ident].etoiles) + '<br><br>'
+ txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
+ else :
+ txt += '<font color="%s">' % (color[ucecl[uce[0]]]) + uce[1] + '</font><br><br>'
return txt + '\n</body></html>'
def count_from_list(self, l, d) :
l = l[-taille_limite:]
return l
- def find_segments_in_classe(self, list_uce, taille_segment, taille_limite):
+ def find_segments_in_classe(self, list_uce, taille_segment, taille_limite, uci = False):
d={}
- for uce in self.getconcorde(list_uce) :
+ if not uci :
+ concorde = self.getconcorde
+ else :
+ concorde = self.getuciconcorde
+ for uce in concorde(list_uce) :
uce = uce[1].split()
d =self.count_from_list([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d)
l = [[d[val], val, taille_segment] for val in d if d[val] >= 3]
listlem.sort()
with open(fileout, 'w') as f :
f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
-
-
+
class MakeUciStat :
ucinb = corpus.getucinb()
ucisize = corpus.getucisize()
ucimean = float(sum(ucisize))/float(ucinb)
- detoile = corpus.make_etoiles_dict()
-
+ detoile = corpus.make_etoiles_dict()
class Uci :
def __init__(self, iduci, line, paraset = None) :
self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout'])
self.corpus.pathout.createdir(parametres_corpus['pathout'])
self.corpus.parametres['uuid'] = str(uuid4())
- self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1]
+ self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1]
self.corpus.parametres['type'] = 'corpus'
if self.corpus.parametres['keep_ponct'] :
self.ponctuation_espace = [' ', '']
self.cf.execute('CREATE INDEX ideff ON eff (id);')
self.c.close()
self.cf.close()
- #backup corpora
+ #backup corpus
self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db'])
self.ccorpus = self.conn_corpus.cursor()
self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);')
self.cleans.append(self.dotiret)
def make_expression(self, txt):
    """Replace every known expression found in `txt` by its substitute.

    `self.expressions` maps an expression to a sequence whose first item is
    the replacement string. Returns the rewritten text.
    """
    for expression, substitute in self.expressions.items():
        if expression in txt:
            txt = txt.replace(expression, substitute[0])
    return txt
def dolower(self, txt):
    """Return `txt` converted to lower case."""
    lowered = txt.lower()
    return lowered
pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100
self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
class BuildSubCorpus(BuildCorpus):
    """Build a new corpus out of a selection of texts/segments of an existing one.

    The selection mode is given by a flag of `parametres`:
    - 'frommeta'    : keep the texts carrying at least one tag of parametres['meta'];
    - 'fromtheme'   : keep the segments belonging to the themes of parametres['meta'];
    - 'fromcluster' : not implemented (nothing is selected);
    - 'fromuceids'  : keep the segments whose ident is in parametres['uceids'].

    NOTE(review): the selected text/segment/form objects are the ones of the
    source corpus and are modified in place (segment lists, themes, idents,
    frequencies); the source corpus should be considered consumed - confirm
    that callers pass a disposable copy.
    """
    def __init__(self, corpus, parametres, dlg = None) :
        """Select the content, then create the databases and build the corpus.

        corpus: source corpus.
        parametres: dict with at least 'corpus_name' and 'pathout', one of
            the selection flags and the matching 'meta' / 'uceids' entry.
        dlg: optional progress dialog, stored on the instance.
        """
        log.info('begin subcorpus...')
        self.dlg = dlg
        self.ori = corpus
        self.infile = None
        self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
        self.last = 0
        self.encoding = corpus.parametres['encoding']
        self.corpus.parametres['corpus_name'] = parametres['corpus_name']
        self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
        self.corpus.pathout.createdir(parametres['pathout'])
        self.corpus.parametres['pathout'] = parametres['pathout']
        self.corpus.parametres['meta'] = parametres.get('meta', False)
        self.corpus.parametres['uuid'] = str(uuid4())
        if parametres.get('frommeta', False) :
            print('make subtexts')
            self._select_from_meta(parametres['meta'])
        elif parametres.get('fromtheme', False) :
            print('make subtexts from theme')
            self._select_from_theme(parametres['meta'])
        elif parametres.get('fromcluster', False) :
            pass
        elif parametres.get('fromuceids', False) :
            print('fromuceids')
            self._select_from_uceids(parametres['uceids'])

        #create database
        self.connect()
        self.dobuild()

    def _select_from_meta(self, meta) :
        """Keep the texts carrying at least one of the tags in `meta`."""
        wanted = set(meta)
        self.corpus.ucis = [uci for uci in self.ori.ucis if wanted.intersection(uci.etoiles)]

    def _select_from_theme(self, meta) :
        """Keep, text by text, the segments of the themes listed in `meta`."""
        # idpara follows the corpus-wide numbering of themes: one number per
        # theme, and one for each text without any theme.
        idpara = 0
        for uci in self.ori.ucis :
            if uci.paras != [] :
                newuce = []
                newpara = []
                for et in uci.paras :
                    if et in meta :
                        newuce += [uce for uce in uci.uces if uce.para == idpara]
                        newpara.append(et)
                    idpara += 1
                if newuce != [] :
                    uci.uces = newuce
                    uci.paras = newpara
                    self.corpus.ucis.append(uci)
            else :
                idpara += 1

    def _select_from_uceids(self, uceids) :
        """Keep the segments whose ident is in `uceids`; drop emptied themes and texts."""
        keep = set(uceids)
        idpara = 0
        for uci in self.ori.ucis :
            if uci.paras == [] :
                keepuces = [uce for uce in uci.uces if uce.ident in keep]
                if keepuces != [] :
                    uci.uces = keepuces
                    self.corpus.ucis.append(uci)
                idpara += 1
            else :
                newuces = []
                newpara = []
                for et in uci.paras :
                    # The ident filter must also apply here, otherwise every
                    # segment of a text with themes is kept whatever `uceids` is.
                    keepuces = [uce for uce in uci.uces if uce.para == idpara and uce.ident in keep]
                    idpara += 1
                    if keepuces != [] :
                        newuces += keepuces
                        newpara.append(et)
                if newuces != [] :
                    uci.uces = newuces
                    uci.paras = newpara
                    self.corpus.ucis.append(uci)

    def read_corpus(self, infile = None):
        """Renumber the selected content and copy segment texts and forms.

        `infile` is ignored: the content comes from the source corpus.
        Texts, themes and segments get new consecutive idents starting at 0;
        the text of every kept segment is read from the source corpus with
        its old ident and stored under the new one.
        """
        # old idents must be captured before renumbering: they are the keys
        # of the source database
        self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
        ident_uci = 0
        ident_uce = 0
        ident_para = -1
        lastpara = -1
        newuceident = {}
        print('redo text, para and st ident')
        for uci in self.corpus.ucis :
            uci.ident = ident_uci
            ident_uci += 1
            for uce in uci.uces :
                uce.uci = uci.ident
                if uce.para != lastpara :
                    ident_para += 1
                    lastpara = uce.para
                uce.para = ident_para
                newuceident[uce.ident] = ident_uce
                uce.ident = ident_uce
                ident_uce += 1
        print('backup st text and forms')
        for row in self.ori.getconcorde(self.olduceid) :
            self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(newuceident[row[0]]), row[1]))
            for word in row[1].split() :
                self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]])
        self.backup_uce()
        print('done')
class BuildFromAlceste(BuildCorpus) :
def read_corpus(self, infile) :
try :
with codecs.open(infile, 'r', self.encoding) as f :
for linenb, line in enumerate(f) :
- line = line.rstrip('\n\r')
+ line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
if self.testuci(line) :
iduci += 1
if txt != [] :
if iduci != -1 and iduce != -1:
self.backup_uce()
else :
- log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
+ log.info(_(u"No Text in corpus. Are you sure of the formatting ?"))
raise Exception('TextBeforeTextMark %i' % linenb)
except UnicodeDecodeError :
raise Exception("CorpusEncoding")
self.dlg = dlg
parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus')
parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout()
+ parametres['corpus_name'] = os.path.split(parametres['pathout'])[1]
dial = CorpusPref(parent, parametres)
dial.CenterOnParent()
dial.txtpath.SetLabel(parent.filename)
def doanalyse(self):
    """Build the corpus from the parent's file and return it."""
    builder = BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg)
    return builder.corpus
class SubBuilder :
    """Ask the user which part of a corpus to extract and build the sub-corpus.

    After construction `self.res` holds the value returned by the dialog.
    `self.parametres` holds the build parameters when the dialog was
    validated, None otherwise.
    """
    def __init__(self, parent, corpus, parametres = None, dlg = None):
        """Show the selection dialog and prepare the build parameters.

        parent: parent window handed to the dialog.
        corpus: source corpus.
        parametres: dict of options, updated in place; the flags 'frommeta'
            and 'fromtheme' select what the dialog lists. None is accepted
            and treated as an empty dict.
        dlg: optional progress dialog forwarded to the builder.
        """
        if parametres is None :
            # the default used to crash on the first item assignment below
            parametres = {}
        self.parent = parent
        self.ori = corpus
        self.dlg = dlg
        self.parametres = None
        default_name = 'Sub' + corpus.parametres['corpus_name']
        parametres['corpus_name'] = default_name
        if parametres.get('frommeta', False) :
            parametres['meta'] = corpus.make_etoiles()
        elif parametres.get('fromtheme', False) :
            parametres['meta'] = corpus.make_themes()
        else :
            parametres['meta'] = []
        parametres['meta'].sort()
        dial = SubTextFromMetaDial(parent, parametres)
        try :
            self.res = dial.ShowModal()
            # NOTE(review): 5100 is presumably wx.ID_OK - confirm
            if self.res == 5100 :
                # only alphanumeric characters and '_' are kept in the name;
                # an empty result falls back to the default name
                typed = dial.subcorpusname.GetValue()
                cleaned = ''.join([l for l in typed if l.isalnum() or l in ['_']])
                if cleaned != '' :
                    parametres['corpus_name'] = cleaned
                else :
                    parametres['corpus_name'] = default_name
                pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name'])
                # first free '<name>_<i>' directory
                i = 1
                while os.path.exists(pathout + '_%i' % i) :
                    i += 1
                parametres['pathout'] = pathout + '_%i' % i
                meta = dial.m_listBox1.GetSelections()
                parametres['meta'] = [parametres['meta'][val] for val in meta]
                self.parametres = parametres
        finally :
            dial.Destroy()

    def doanalyse(self):
        """Build and return the sub-corpus.

        Raises RuntimeError when the dialog was not validated.
        """
        if self.parametres is None :
            raise RuntimeError('sub-corpus dialog was not validated')
        return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus
if __name__ == '__main__' :
t1 = time()
- parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : encoding}
+ parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'}
intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes)
print time() - t1