res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,))
return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res]))
+ def getformeuceseff(self, formeid) :
+     """Return a dict mapping each uce id of a form to its summed 'eff' value.
+
+     formeid -- the form's integer ident, or a string key of self.formes
+                (it is then replaced by self.formes[formeid].ident).
+
+     The ids are read from the 'uces' table and the values from the 'eff'
+     table for the same form id; both lists are paired by position, and
+     values of a uce id appearing several times are added together.
+     """
+     if isinstance(formeid, basestring) :
+         formeid = self.formes[formeid].ident
+
+     def flatten(rows) :
+         # A cell is either a single int or a space-separated string of ints.
+         values = []
+         for row in rows :
+             if isinstance(row[0], int) :
+                 values.append(row[0])
+             else :
+                 values.extend([int(val) for val in row[0].split()])
+         return values
+
+     # Each result set is consumed before the cursor is reused for the next query.
+     uces = flatten(self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (repr(formeid),)))
+     # Bind the id as a parameter instead of formatting it into the SQL text.
+     eff = flatten(self.cformes.execute('SELECT eff FROM eff where id=? ORDER BY id', (formeid,)))
+     formeuceeff = {}
+     for i, uce in enumerate(uces) :
+         formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i]
+     return formeuceeff
+
def getlemuces(self, lem) :
formesid = ', '.join([`val` for val in self.lems[lem].formes])
query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
uces = self.getlemuces(lem)
return list(set([self.getucefromid(val).uci for val in uces]))
- def getlemuceseff(self, lem) :
+ def getlemuceseff(self, lem, luces = None) :
formesid = ', '.join([`val` for val in self.lems[lem].formes])
query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid
res = self.cformes.execute(query)
lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i]
return lemuceeff
+ def getlemclustereff(self, lem, cluster) :
+     """Return how many distinct members of self.lc[cluster] are in getlemuces(lem)."""
+     clusteruces = set(self.lc[cluster])
+     shared = clusteruces.intersection(self.getlemuces(lem))
+     return len(shared)
+
def getlemeff(self, lem) :
return self.lems[lem].freq
self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces])
def make_lexitable(self, mineff, etoiles) :
- tokeep = [lem for lem in self.lems if self.lems[lem].freq > mineff]
+ tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
etuces = [[] for et in etoiles]
for uci in self.ucis :
get = list(set(uci.etoiles).intersection(etoiles))
actpara = self.iduces[uce[0]].para
ident += 1
f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n')
+
+ def export_corpus_classes(self, outf, alc = True, lem = False) :
+     """Write every uce returned by getalluces() to `outf`, each after a header line.
+
+     outf -- path of the output file (opened in 'w' mode).
+     alc  -- if true, the header is the etoiles of the uce's uci followed by
+             '*classe_N', where N is the 1-based index of the class in
+             self.lc holding the uce, or 0 for uces listed in self.lc0.
+             Otherwise the header is built from the etoiles after the
+             first one, each rewritten as '<name=value>'.
+     lem  -- if true, every form of the uce text is replaced by
+             self.formes[forme].lem.
+
+     Both lines are encoded with self.parametres['syscoding']; each uce
+     text is followed by an empty line.
+     """
+     ucecl = {}
+     for i, lc in enumerate(self.lc) :
+         for uce in lc :
+             ucecl[uce] = i + 1
+     for uce in self.lc0 :
+         ucecl[uce] = 0
+     res = self.getalluces()
+     self.make_iduces()
+     with open(outf, 'w') as f :
+         for uce in res :
+             guce = uce[1]
+             etoiles = self.ucis[self.iduces[uce[0]].uci].etoiles
+             if lem :
+                 guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
+             if alc :
+                 etline = ' '.join(etoiles + ['*classe_%i' % ucecl[uce[0]]])
+             else :
+                 # Split on the first underscore only, so that a value which
+                 # itself contains '_' is kept whole instead of getting
+                 # several '=' signs.
+                 etline = ' '.join(['<' + '='.join(et.split('_', 1)) + '>' for et in etoiles[1:]])
+             f.write(etline.encode(self.parametres['syscoding']) + '\n')
+             f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
def parse_active(self, gramact, gramsup = None) :
log.info('parse actives')
for lem in self.lems :
- if self.lems[lem].gram in gramact :
+ if lem.startswith('_') and lem.endswith('_') :
+ self.lems[lem].act = 2
+ elif self.lems[lem].gram in gramact :
self.lems[lem].act = 1
elif gramsup is not None :
if self.lems[lem].gram in gramsup :
lim -= 1
else :
stop = nbmax - 1
- log.info('nb actives = %i - eff min = %i ' % (stop, lim))
+ lim = effs[stop]
+ log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim))
return [val[1] for val in allactives[0:stop + 1]], lim
def make_and_write_profile(self, actives, ucecl, fileout) :
et = etoile.split('_')
if et[0] in det :
try :
- if et[1] in det[et[0]] :
- det[et[0]][et[1]] += 1
+ endet = '_'.join(et[1:])
+ if endet in det[et[0]] :
+ det[et[0]][endet] += 1
else :
- det[et[0]][et[1]] = 1
+ det[et[0]][endet] = 1
except IndexError :
det[et[0]] += 1
else :
try :
- det[et[0]] = {et[1] :1}
+ endet = '_'.join(et[1:])
+ det[et[0]] = {endet :1}
except IndexError :
det[et[0]] = 1
- print det
+ return det
+
+ def make_etline(self, listet) :
+     """Group uce idents according to the entry of `listet` carried by their uci.
+
+     Returns one list per entry of `listet`, in the same order, holding the
+     idents of the uces of every uci whose etoiles contain that entry.
+     Ucis matching no entry are ignored. As soon as a uci matches more
+     than one entry, the string '2 variables sur la meme ligne' is
+     returned instead of the lists.
+     """
+     groups = [[] for _ in listet]
+     for uci in self.ucis :
+         found = list(set(uci.etoiles).intersection(listet))
+         if len(found) > 1 :
+             return '2 variables sur la meme ligne'
+         if found :
+             position = listet.index(found[0])
+             groups[position].extend([uce.ident for uce in uci.uces])
+     return groups
+
def make_and_write_profile_et(self, ucecl, fileout) :
log.info('etoiles/classes')
result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
with open(fileout, 'w') as f :
f.write('\n'.join([';'.join(line) for line in result]))
-
+
+ def make_proftype(self, outf) :
+     """Write to `outf` one ';'-separated line per `gram` value of the lemmas.
+
+     Each line holds the gram followed by one total per class of self.lc:
+     the sum, over the lemmas with that gram, of the getlemuceseff(lem)
+     values of the uces belonging to the class. Lines are sorted and the
+     text is encoded with self.parametres['syscoding'].
+     """
+     totals = {}
+     for lem in self.lems :
+         gram = self.lems[lem].gram
+         if gram not in totals :
+             totals[gram] = [0 for _ in self.lc]
+         lemuceeff = self.getlemuceseff(lem)
+         for i, classe in enumerate(self.lc) :
+             concern = set(classe).intersection(lemuceeff.keys())
+             totals[gram][i] += sum([lemuceeff[uce] for uce in concern])
+     rows = sorted([[gram] + [repr(val) for val in totals[gram]] for gram in totals])
+     content = '\n'.join([';'.join(line) for line in rows])
+     with open(outf, 'w') as f :
+         f.write(content.encode(self.parametres['syscoding']))
+
+
def make_ucecl_from_R(self, filein) :
with open(filein, 'rU') as f :
c = f.readlines()
self.lc = [[uce[0] for uce in self.lc if uce[1] == i] for i in range(clnb+1)]
self.lc0 = self.lc.pop(0)
#return ucecl
+
+ def get_stat_by_cluster(self, outf) :
+     """Write one ';'-separated statistics line per class of self.lc to `outf`.
+
+     Fields, in order: class number (1-based), sum of the
+     getformeuceseff values falling in the class, number of forms present
+     in the class, number of those forms whose freq is 1, number of uces
+     of the class, and the ratio of the two form counts (0.0 when the
+     class contains no form). The elapsed time is logged at the end.
+     """
+     log.info('get_stat_by_cluster')
+     t1 = time()
+     occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
+     formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
+     hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
+     lenclasses = dict([[i + 1, len(cl)] for i, cl in enumerate(self.lc)])
+     sets = [set(cl) for cl in self.lc]
+     for forme in self.formes :
+         formeuceeff = self.getformeuceseff(forme)
+         for i, classuces in enumerate(sets) :
+             concern = classuces.intersection(formeuceeff)
+             if concern :
+                 occurrences[i + 1] += sum([formeuceeff[uce] for uce in concern])
+                 formescl[i + 1] += 1
+                 if self.formes[forme].freq == 1 :
+                     hapaxcl[i + 1] += 1
+     lines = []
+     # Sorted so that rows come out in class order whatever the dict ordering.
+     for i in sorted(occurrences) :
+         # A class without any form would otherwise divide by zero.
+         if formescl[i] :
+             ratio = float(hapaxcl[i]) / float(formescl[i])
+         else :
+             ratio = 0.0
+         lines.append(';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(ratio)]))
+     toprint = '\n'.join(lines)
+     with open(outf, 'w') as f :
+         f.write(toprint)
+     log.info('%f' % (time() - t1))
def gethapaxbyet(self, etoiles) :
hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
self.cleans.append(self.dolower)
if self.corpus.parametres.get('firstclean', 1) :
self.cleans.append(self.firstclean)
- self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
- self.cleans.append(self.docharact)
+ if self.corpus.parametres['charact'] :
+ self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_")
+ self.cleans.append(self.docharact)
if self.corpus.parametres.get('expressions', 1) :
self.cleans.append(self.make_expression)
if self.corpus.parametres.get('apos', 1) :
class BuildFromAlceste(BuildCorpus) :
- #def __init___(self, infile, parametres_corpus) :
- # BuildCorpus.__init__(self, infile, parametres_corpus)
-
-
def read_corpus(self, infile) :
+ if self.dlg is not None :
+ self.dlg.Pulse('textes : 0 - segments : 0')
self.limitshow = 0
self.count = 1
if self.corpus.parametres['ucimark'] == 0 :
iduci = -1
idpara = -1
iduce = -1
- linenb = 0
try :
with codecs.open(infile, 'r', self.encoding) as f :
- for line in f :
- linenb += 1
+ for linenb, line in enumerate(f) :
line = line.rstrip('\n\r')
if self.testuci(line) :
iduci += 1
else :
if iduci > 0 :
if self.corpus.ucis[-1].uces == [] :
- log.info('linenb : %i' % linenb)
- raise Exception("EmptyText %i" % linenb)
+ log.info(u'Empty text : %i' % linenb)
+ iduci -= 1
+ self.corpus.ucis.pop()
+ #raise Exception("EmptyText %i" % linenb)
self.corpus.ucis.append(Uci(iduci, line))
+ if self.dlg is not None :
+ if not (iduci + 1) % 10 :
+ self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
elif line.startswith(u'-*') :
if iduci != -1 :
if txt != [] :
if iduci != -1 and iduce != -1:
self.backup_uce()
else :
- log.info(_(u"No Texte in corpora. Are you sure of the formatting ?"))
+ log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
raise Exception('TextBeforeTextMark')
except UnicodeDecodeError :
raise Exception("CorpusEncoding")
for word in uce :
self.last += 1
self.corpus.add_word(word)
- if self.dlg is not None :
- if self.limitshow > self.count :
- self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1))
- self.count += 1
- self.limitshow = 0
- else :
- self.limitshow = self.last / 100000
log.debug(' '.join([`iduci`,`idpara`,`iduce`]))
if self.last > self.lim :
self.backup_uce()
if douce :
out = []
reste, texte_uce, suite = self.decouper(self.prep_txt(txt), self.ucesize + 15, self.ucesize)
-# print 'reste'
-# print reste
-# print 'texte_uce'
-# print texte_uce
-# print 'suite'
-# print suite
while reste :
uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
if uce != '' :
out.append(uce)
reste, texte_uce, suite = self.decouper(suite, self.ucesize + 15, self.ucesize)
-# print 'reste'
-# print reste
-# print 'texte_uce'
-# print texte_uce
-# print 'suite'
-# print suite
-
uce = ' '.join([val for val in texte_uce if val not in self.ponctuation_espace])
if uce != '' :
- #print 'RESTEE UUCEEEEEEEEEEEEE', uce
out.append(uce)
return out
else :
ReadLexique(self.parent, lang = parametres['lang'])
self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
self.parametres = parametres
+ else :
+ if self.dlg is not None :
+ self.dlg.Destroy()
dial.Destroy()
def doanalyse(self) :