def read_corpus(self) :
log.info('read corpus')
- self.parametres['syscoding'] = sys.getdefaultencoding()
+ self.parametres['syscoding'] = 'utf8'
if self.conncorpus is None :
self.conn_all()
res = self.ccorpus.execute('SELECT * FROM etoiles;')
def getucisize(self) :
    """Return the size of every non-empty uci.

    For each uci in ``self.ucis`` that has at least one uce, sum the entries
    of ``self.getucesize()`` from the ident of its first uce to the ident of
    its last uce (inclusive).

    Ucis without any uce are skipped, because indexing ``uces[0]`` on them
    would raise IndexError. The result can therefore be shorter than
    ``self.ucis``.

    NOTE(review): this assumes uce idents are contiguous positions in the
    list returned by getucesize() -- confirm against make_iduces/getalluces.
    """
    ucesize = self.getucesize()
    return [
        sum(ucesize[uci.uces[0].ident:uci.uces[-1].ident + 1])
        for uci in self.ucis
        if uci.uces
    ]
def getucesize(self) :
    """Return the token count of every uce.

    Each row yielded by ``self.getalluces()`` contributes one integer: the
    number of whitespace-separated tokens in its second field (index 1).
    """
    sizes = []
    for row in self.getalluces() :
        tokens = row[1].split()
        sizes.append(len(tokens))
    return sizes
def getconcorde(self, uces) :
    """Run the query selecting the rows of table ``uces`` whose id is in *uces*.

    The ids are rendered with ``repr`` and inlined into the ``IN (...)``
    list, and the rows are ordered by id. Whatever ``self.cuces.execute``
    returns is returned unchanged.

    NOTE(review): the ids are formatted into the SQL text rather than bound
    as parameters. That is only safe while callers pass internally generated
    integers; binding one placeholder per id would hit SQLite's
    bound-variable limit on large selections -- confirm before changing.
    """
    id_list = ', '.join(repr(i) for i in uces)
    return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % id_list)
def getuciconcorde(self, ucis) :
uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis]
def getucesfrometoile(self, etoile) :
    """Return the idents of all uces owned by a uci tagged with *etoile*.

    A uci matches when *etoile* is found in its ``etoiles``; idents are
    returned in uci order, then in uce order within each uci.
    """
    idents = []
    for uci in self.ucis :
        if etoile not in uci.etoiles :
            continue
        idents.extend(uce.ident for uce in uci.uces)
    return idents
def getucisfrometoile(self, etoile):
    """Return the distinct ``uci`` values of the uces tagged with *etoile*.

    The idents of every uce belonging to a uci whose ``etoiles`` contains
    *etoile* are collected, each ident is resolved through
    ``self.getucefromid`` and the ``.uci`` attribute of the result is kept.
    Duplicates are removed with a set, so the order of the returned list is
    not specified.
    """
    uces = [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
    return list({self.getucefromid(val).uci for val in uces})
+
+
def getetoileuces(self) :
log.info('get uces etoiles')
etoileuces = {}
self.make_iduces()
actuci = ''
actpara = False
- with open(outf,'w') as f :
+ with open(outf,'w', encoding='utf8') as f :
for uce in res :
if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara :
f.write(uce[1] + '\n')
longueur_max = max([len(val) for val in metas])
first = ['column_%i' % i for i in range(longueur_max)]
metas.insert(0, first)
- with open(outf, 'w') as f :
+ with open(outf, 'w', encoding='utf8') as f :
f.write('\n'.join(['\t'.join(line) for line in metas]))
def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) :
self.make_iduces()
else :
res = self.getallucis()
- with open(outf, 'w') as f :
+ with open(outf, 'w', encoding='utf8') as f :
for uce in res :
guce = uce[1]
if not uci :
self.make_iduces()
else :
res = self.getuciconcorde(sts)
- with open(outf, 'w') as f :
+ with open(outf, 'w', encoding='utf8') as f :
for uce in res :
guce = uce[1]
if not uci :
outf = os.path.join(rep, outf)
if lem :
guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
- with open(outf, 'w') as f :
+ with open(outf, 'w', encoding='utf8') as f :
f.write(guce) #.encode('cp1252', errors = 'replace'))
def export_tropes(self, fileout, classe, lem = False, uci = False) :
self.make_iduces()
else :
res = self.getuciconcorde(sts)
- with open(fileout, 'w') as f :
+ with open(fileout, 'w', encoding='utf8') as f :
for uce in res :
guce = uce[1]
if lem :
else :
tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives]
tab = [[line[0]] + [repr(val) for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
- with open(fileout, 'w') as f :
+ with open(fileout, 'w', encoding='utf8') as f :
f.write('\n'.join([';'.join(line) for line in tab]))
def make_etoiles(self) :
etoileuces = self.getetoileuces()
else :
etoileuces = self.getetoileucis()
- etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
- with open(fileout, 'w') as f :
+ etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 0])
+ with open(fileout, 'w', encoding='utf8') as f :
f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding'])
#etoiles = self.make_etoiles()
#with open(fileout, 'w') as f :
ucecl[uce] = 0
color = ['black'] + colors[len(self.lc) - 1]
txt = '''<html>
- <meta http-equiv="content-Type" content="text/html; charset=%s" />
+ <meta http-equiv="content-Type" content="text/html; charset=utf8" />
<body>
-''' % sys.getdefaultencoding()
+'''
if not uci :
res = self.getalluces()
self.make_iduces()
for taille_segment in range(lenmin,lenmax) :
d =self.count_from_list_cl([' '.join(uce[i:i+taille_segment]) for i in range(len(uce)-(taille_segment - 1))], d, b, len(self.lc))
result = [[seg] + [str(val) for val in d[seg]] for seg in d if sum(d[seg]) >= effmin]
- with open(fileout, 'w') as f :
+ with open(fileout, 'w', encoding='utf8') as f :
f.write('\n'.join([';'.join(line) for line in result]))
def make_proftype(self, outf) :
res[gram][i] += sum([lemuceeff[uce] for uce in concern])
res = [[gram] + [repr(val) for val in res[gram]] for gram in res]
res.sort()
- with open(outf, 'w') as f :
+ with open(outf, 'w', encoding='utf8') as f :
f.write('\n'.join([';'.join(line) for line in res]))
def make_ucecl_from_R(self, filein) :
log.info('%f' % (time() - t1))
if outf is not None :
toprint = '\n'.join([';'.join([repr(i), repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))]) for i in occurrences])
- with open(outf, 'w') as f :
+ with open(outf, 'w', encoding='utf8') as f :
f.write(toprint)
else :
return [[repr(occurrences[i]), repr(formescl[i]), repr(hapaxcl[i]), repr(lenclasses[i]), repr(float(hapaxcl[i])/float(formescl[i]))] for i in occurrences]
txt += """
</body></html>
"""
- with open('/tmp/testhapxuce.html','w') as f :
+ with open('/tmp/testhapxuce.html','w', encoding='utf8') as f :
f.write(txt)
def export_dictionary(self, fileout, syscoding) :
    """Write the form dictionary to *fileout* as tab-separated text.

    One line is written per entry of ``self.formes`` in the layout
    ``forme<TAB>lem<TAB>gram<TAB>freq``. Lines are sorted in descending
    order of ``[freq, forme, lem, gram]`` and joined with newlines, with no
    trailing newline.

    The file is always written as UTF-8 so the output does not depend on the
    platform's locale encoding. *syscoding* is no longer used and is kept
    only so existing callers keep working.
    """
    rows = []
    for forme in self.formes :
        entry = self.formes[forme]
        rows.append([entry.freq, forme, entry.lem, entry.gram])
    # Frequency is the first field so the reverse sort puts frequent forms first.
    rows.sort(reverse = True)
    lines = ['\t'.join(row[1:] + [repr(row[0])]) for row in rows]
    with open(fileout, 'w', encoding='utf8') as f :
        f.write('\n'.join(lines))
def export_lems(self, fileout, syscoding) :
    """Write every lemma and its forms to *fileout* as tab-separated text.

    ``self.make_idformes()`` is called first. One line is then written per
    key of ``self.lems``: the lemma followed by a ``forme<TAB>count`` pair
    for each key of that lemma's ``formes`` mapping. The form text is read
    from ``self.idformes[key].forme`` and the count is the ``repr`` of the
    mapped value. Lines are sorted and joined with newlines, with no
    trailing newline.

    The file is always written as UTF-8 so the output does not depend on the
    platform's locale encoding. *syscoding* is no longer used and is kept
    only so existing callers keep working.
    """
    self.make_idformes()
    listlem = []
    for lem in self.lems :
        formes = self.lems[lem].formes
        cells = []
        for forme in formes :
            cells.append(self.idformes[forme].forme)
            cells.append(repr(formes[forme]))
        listlem.append([lem, '\t'.join(cells)])
    listlem.sort()
    with open(fileout, 'w', encoding='utf8') as f :
        f.write('\n'.join('\t'.join(lem) for lem in listlem))
newuces = []
newpara = []
for et in uci.paras :
- keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep]
+ keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep and uce.para == idpara]
idpara += 1
if keepuces != [] :
newuces += keepuces