# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2020 Pierre Ratinaud
#License: GNU/GPL
# usage ?
# the encoding setting is used - but utf-8 is the default in PY3
#
# Provenance: reconstructed from the gitweb blobdiff of autres/cable.py
# (iramuteq repository, blob bbfa37f). The diff had been collapsed onto two
# lines, so the indentation below is restored from the code's syntax.

#------------------------------------
# python module imports
#------------------------------------
import codecs
import os
import sys


filein = '/home/pierre/fac/cablegate/allcables-all.txt'
enc = 'utf-8'

# NOTE(review): opened at import time and never read or closed in this file;
# kept because it is a module-level name other code may rely on.
infile = codecs.open(filein, 'r', enc)
content = []


class BigCorpus:
    """Work-in-progress reader that streams a large corpus file line by line.

    The instance stores its settings in ``self.parametre``. ``open_corpus``
    and ``buildcorpus`` read the ``'filename'`` and ``'encodage'`` keys, which
    ``__init__`` does not set: the caller has to fill them in beforehand.
    """

    def __init__(self, parent):
        """Store *parent* and initialise every corpus attribute to empty."""
        self.parent = parent
        self.parametre = {'syscoding': sys.getdefaultencoding()}
        self.content = None
        self.ucis = None
        self.formes = {}
        self.lems = {}
        self.ucenb = None
        self.etoiles = None
        self.etintxt = {}
        self.ucis_paras_uces = None
        self.lc = None
        self.lc0 = None
        self.actives = None
        self.supp = None
        #self.supplementaires = []
        self.lenuc1 = None
        self.lenuc2 = None
        self.lexique = None

    def open_corpus(self):
        """Return the corpus file opened for reading with its configured encoding.

        Raises ``KeyError`` if ``'filename'`` or ``'encodage'`` is missing from
        ``self.parametre``. The caller is responsible for closing the file.
        """
        return codecs.open(self.parametre['filename'], "r", self.parametre['encodage'])

    def buildcorpus(self):
        """Scan the corpus once, copying every ``****`` line to ``ucis.txt``.

        ``ucis.txt`` is created in the directory of the corpus file. Lines that
        do not start with ``****`` are handed to ``prepare``/``addlinetouci``,
        then lower-cased, have their punctuation surrounded by spaces, are
        split on whitespace and appended to the last entry of the module-level
        ``content`` list.

        NOTE(review): ``parse_uci``, ``write_uci``, ``addlinetouci`` and
        ``prepare`` are not defined in this file, so this method cannot run
        until they are provided.
        """
        ucifile = os.path.join(os.path.dirname(self.parametre['filename']), 'ucis.txt')
        i = 0  # index of the current line in the corpus file
        ucinb = 0
        # ucis.txt is written through codecs with the corpus encoding so the
        # decoded lines read from the corpus can be written back unchanged;
        # both files are closed even if an error interrupts the scan.
        with codecs.open(ucifile, 'w', self.parametre['encodage']) as uci, \
                self.open_corpus() as corpus:
            for line in corpus:
                if line.startswith(u'****'):
                    uci.write(line)
                    if i != 0:
                        # A '****' line that is not the first line of the file.
                        parse_uci()

                        write_uci()
                        # NOTE(review): `uci` is the output file object and
                        # cannot be subscripted; this was presumably meant to
                        # record `i` in a separate mapping -- confirm intent.
                        uci[ucinb] = i
                        ucinb += 1
                else:
                    addlinetouci(uci, prepare(line))
                    # NOTE(review): the original indentation was lost; these
                    # statements are placed in this branch because that is the
                    # only layout where `i` is incremented once per line.
                    # The replacement order matters: '...' must become ' £ '
                    # before single dots are spaced out.
                    line = (line.lower()
                            .replace(u'\'', '\' ')
                            .replace(u'’', '\' ')
                            .replace('...', u' £ ')
                            .replace('?', ' ? ')
                            .replace('.', ' . ')
                            .replace('!', ' ! ')
                            .replace(',', ' , ')
                            .replace(';', ' ; ')
                            .replace(':', ' : ')
                            .strip())
                    line = line.replace('\n', ' ').replace('\r', ' ')
                    line = line.split()
                    # NOTE(review): nothing in this file appends to `content`,
                    # so content[-1] raises IndexError while the list is empty.
                    content[-1].append(line)
                i += 1


print(len(content))