gramtype = self.parent.lexique[word][1]
lem = self.parent.lexique[word][0]
elif word.isdigit() :
- gramtype = 'num'
+ gramtype = u'num'
lem = word
else :
- gramtype = 'nr'
+ gramtype = u'nr'
lem = word
self.formes[word] = Word(word, gramtype, len(self.formes), lem)
self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
def getucesfrometoile(self, etoile) :
return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
+    def getetoileuces(self) :
+        """Build a dict mapping each etoile / para label to a list of uce idents.
+
+        For every uci in self.ucis:
+          - each entry of uci.etoiles after the first one receives the idents
+            of all the uces of that uci;
+          - each entry of uci.paras receives only the idents of the uces whose
+            `para` attribute equals a running counter. The counter starts at 0
+            and advances once per entry of uci.paras, or once for a uci whose
+            paras list is empty.
+
+        Returns the dict {label : [uce ident, ...]}.
+        """
+        log.info('get uces etoiles')
+        etoileuces = {}
+        idpara = 0
+        for uci in self.ucis :
+            for et in uci.etoiles[1:] :
+                # setdefault hands out a fresh list per label, so labels never share a list
+                etoileuces.setdefault(et, []).extend([uce.ident for uce in uci.uces])
+            if uci.paras == [] :
+                # nothing to credit, but the counter still moves forward once
+                idpara += 1
+                continue
+            for para in uci.paras :
+                members = [uce.ident for uce in uci.uces if uce.para == idpara]
+                etoileuces.setdefault(para, []).extend(members)
+                idpara += 1
+        return etoileuces
+
def getucefromid(self, uceid) :
if self.iduces is None : self.make_iduces()
return self.iduces[uceid]
if self.iduces is None :
self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces])
- def make_lexitable(self, mineff, etoiles) :
- tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff]
+ def make_lexitable(self, mineff, etoiles, gram = 0) :
+ if gram == 0 :
+ grams = {1:'', 2:''}
+ else :
+ grams = {gram :''}
+ tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams]
etuces = [[] for et in etoiles]
for uci in self.ucis :
get = list(set(uci.etoiles).intersection(etoiles))
if len(get) > 1 :
- return '2 variables sur la meme ligne'
- elif get != [] :
+ log.info('2 variables sur une ligne')
+ if get != [] :
etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces]
etuces = [set(val) for val in etuces]
tab = []
f.write(etline.encode(self.parametres['syscoding']) + '\n')
f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
+    def export_classe(self, outf, classe, lem = False) :
+        """Write the segments of one class to the file `outf`.
+
+        The segments are obtained from self.getconcorde(self.lc[classe - 1]).
+        For each of them two chunks are written, both encoded with
+        self.parametres['syscoding']:
+          - the etoiles of the uci that owns the segment, space separated,
+            followed by a newline;
+          - the segment text (element 1 of the row), followed by an empty line.
+        When `lem` is true, every whitespace-separated token of the text is
+        replaced by self.formes[token].lem before writing.
+        """
+        concorde = self.getconcorde(self.lc[classe - 1])
+        self.make_iduces()
+        with open(outf, 'w') as out :
+            for row in concorde :
+                text = row[1]
+                coding = self.parametres['syscoding']
+                # row[0] is the uce ident; go through iduces to reach the owning uci
+                owner = self.ucis[self.iduces[row[0]].uci]
+                out.write(' '.join(owner.etoiles).encode(coding) + '\n')
+                if lem :
+                    text = ' '.join([self.formes[forme].lem for forme in text.split()])
+                out.write(text.encode(coding) + '\n\n')
+
def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
nbl = 0
table_uce[uces[uce]][i] = 1
table_uce.insert(0, list_act)
return table_uce
+
+    def make_pondtable_with_classe(self, uces, list_act) :
+        """Build a weighted uce x lem table.
+
+        uces     : sequence of uce idents, one table row per element
+        list_act : sequence of lems, one table column per element
+
+        Cell (row of uce, column of lem) holds the value that
+        self.getlemuceseff(lem) stores for that uce, and 0 when the uce is
+        not a key of that mapping. list_act itself is inserted as the first
+        row of the returned list.
+        """
+        table = [[0] * len(list_act) for _ in range(len(uces))]
+        rowof = dict((uce, pos) for pos, uce in enumerate(uces))
+        for col, lem in enumerate(list_act) :
+            eff = self.getlemuceseff(lem)
+            for uce in eff :
+                # only the uces that were asked for have a row
+                if uce in rowof :
+                    table[rowof[uce]][col] = eff[uce]
+        table.insert(0, list_act)
+        return table
def parse_active(self, gramact, gramsup = None) :
log.info('parse actives')
self.lems[lem].act = 2
elif self.lems[lem].gram in gramact :
self.lems[lem].act = 1
- elif gramsup is not None :
+ elif gramsup is not None and self.lems[lem].gram not in gramact:
if self.lems[lem].gram in gramsup :
self.lems[lem].act = 2
else :
def make_etoiles(self) :
etoiles = set([])
for uci in self.ucis :
- etoiles.update(uci.etoiles[1:] + uci.paras)
+ etoiles.update(uci.etoiles[1:])
return list(etoiles)
def make_etoiles_dict(self) :
if et[0] in det :
try :
endet = '_'.join(et[1:])
- if endet in det[et[0]] :
- det[et[0]][endet] += 1
+ if etoile in det[et[0]] :
+ det[et[0]][etoile] += 1
else :
- det[et[0]][endet] = 1
+ det[et[0]][etoile] = 1
except IndexError :
det[et[0]] += 1
else :
try :
endet = '_'.join(et[1:])
- det[et[0]] = {endet :1}
+ det[et[0]] = {etoile :1}
except IndexError :
det[et[0]] = 1
return det
elif get != [] :
etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces]
return etuces
-
def make_and_write_profile_et(self, ucecl, fileout) :
log.info('etoiles/classes')
- etoiles = self.make_etoiles()
+ etoileuces = self.getetoileuces()
+ etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
with open(fileout, 'w') as f :
- f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
+ f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
+ #etoiles = self.make_etoiles()
+ #with open(fileout, 'w') as f :
+ # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding']))
def make_colored_corpus(self) :
ucecl = {}
self.lc0 = self.lc.pop(0)
#return ucecl
- def get_stat_by_cluster(self, outf) :
+ def get_stat_by_cluster(self, outf, lclasses = None) :
log.info('get_stat_by_cluster')
+ if lclasses is None :
+ lclasses = self.lc
t1 = time()
- occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
- formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
- hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
- lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
- sets = [set(cl) for cl in self.lc]
+ occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
+ formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
+ hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
+ lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
+ sets = [set(cl) for cl in lclasses]
for forme in self.formes :
formeuceeff = self.getformeuceseff(forme)
- for i, classe in enumerate(self.lc) :
+ for i, classe in enumerate(lclasses) :
concern = sets[i].intersection(formeuceeff.keys())
if len(concern) :
occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
formescl[i+1] += 1
if self.formes[forme].freq == 1 :
hapaxcl[i+1] += 1
- toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
- with open(outf, 'w') as f :
- f.write(toprint)
log.info('%f' % (time() - t1))
+ if outf is not None :
+ toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
+ with open(outf, 'w') as f :
+ f.write(toprint)
+ else :
+ return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
+
+    def get_stat_by_et(self, outf, etoiles) :
+        """Compute the cluster statistics with one "class" per etoile.
+
+        Each etoile is turned into the list of uce idents returned by
+        self.getucesfrometoile(etoile), and these lists are passed to
+        self.get_stat_by_cluster(None, ...), which returns one row of
+        repr() strings per class: occurrences, formes, hapax, number of
+        uces, hapax / formes.
+
+        Returns a list of rows [etoile, occurrences, formes, hapax,
+        number of uces, hapax / formes].
+
+        NOTE(review): `outf` is accepted but nothing is written to it here;
+        confirm whether a file export is expected.
+        NOTE(review): pairing row i with etoiles[i] assumes that
+        get_stat_by_cluster yields its rows in class order - verify.
+        """
+        lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
+        stats = self.get_stat_by_cluster(None, lclasses)
+        # the labelled rows used to be built and then dropped: hand them back
+        return [[etoiles[i]] + val for i, val in enumerate(stats)]
def gethapaxbyet(self, etoiles) :
hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
with open('/tmp/testhapxuce.html','w') as f :
f.write(txt)
+    def export_dictionary(self, fileout, syscoding) :
+        """Write the formes of the corpus to `fileout`, one per line.
+
+        Each line holds, tab separated: forme, lem, gram, freq. Lines are
+        in descending order of (freq, forme, lem, gram). The text is
+        encoded with `syscoding` before being written.
+        """
+        entries = []
+        for forme in self.formes :
+            word = self.formes[forme]
+            # freq goes first so that it drives the sort
+            entries.append([word.freq, forme, word.lem, word.gram])
+        ordered = sorted(entries, reverse = True)
+        # move freq to the last column, as text
+        rows = [entry[1:] + [repr(entry[0])] for entry in ordered]
+        with open(fileout, 'w') as f :
+            f.write('\n'.join(['\t'.join(row) for row in rows]).encode(syscoding))
+
+    def export_lems(self, fileout, syscoding) :
+        """Write the lems of the corpus to `fileout`, one per line, sorted by lem.
+
+        Each line holds the lem followed, tab separated, by one
+        (forme, value) pair per key of self.lems[lem].formes: the forme text
+        is looked up in self.idformes and the value is the repr() of what is
+        stored under that key (presumably a count - verify). The text is
+        encoded with `syscoding` before being written.
+        """
+        self.make_idformes()
+        listlem = []
+        for lem in self.lems :
+            counts = self.lems[lem].formes
+            cells = []
+            for forme in counts :
+                cells.append('\t'.join([self.idformes[forme].forme, repr(counts[forme])]))
+            # two-level join: a lem without formes still gets its trailing tab
+            listlem.append([lem, '\t'.join(cells)])
+        listlem.sort()
+        with open(fileout, 'w') as f :
+            f.write('\n'.join(['\t'.join(pair) for pair in listlem]).encode(syscoding))
+
+
+
class MakeUciStat :
def __init__(self, corpus) :
def firstclean(self, txt) :
txt = txt.replace(u'’',"'")
txt = txt.replace(u'œ', u'oe')
- return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ')
+ return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ')
def make_cleans(self, txt) :
for clean in self.cleans :
log.info(u'Empty text : %i' % linenb)
iduci -= 1
self.corpus.ucis.pop()
- #raise Exception("EmptyText %i" % linenb)
self.corpus.ucis.append(Uci(iduci, line))
if self.dlg is not None :
if not (iduci + 1) % 10 :
idpara += 1
self.corpus.ucis[-1].paras.append(line.split()[0])
else :
- raise Exception('paragrapheOT')
+ raise Exception('paragrapheOT %i' % linenb)
elif line.strip() != '' and iduci != -1 :
txt.append(line)
if txt != [] and iduci != -1 :
iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
del(txt)
else :
- raise Exception("EmptyText")
+ if iduci != -1 :
+ iduci -= 1
+ self.corpus.ucis.pop()
+ log.info(Exception("Empty text %i" % linenb))
+ else :
+ raise Exception('EmptyText %i' % linenb)
if iduci != -1 and iduce != -1:
self.backup_uce()
else :
log.info(_(u"No Text in corpora. Are you sure of the formatting ?"))
- raise Exception('TextBeforeTextMark')
+ raise Exception('TextBeforeTextMark %i' % linenb)
except UnicodeDecodeError :
raise Exception("CorpusEncoding")
parametres['originalpath'] = parent.filename
PathOut().createdir(parametres['pathout'])
ReadLexique(self.parent, lang = parametres['lang'])
- self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
+ if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
+ self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
+ else :
+ self.parent.expressions = {}
self.parametres = parametres
else :
if self.dlg is not None :