X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=ee80d9e8fc60f2d7b05e26dd807e3417a15ed3ea;hp=2e85ed58c413b031f7c4a3bb142de0da4a134755;hb=191e64482209e897e7bf1853646c0d2ca84db1f2;hpb=87842df83ba95117fcda5575bc60067a6d3654b0 diff --git a/corpus.py b/corpus.py index 2e85ed5..ee80d9e 100644 --- a/corpus.py +++ b/corpus.py @@ -77,10 +77,10 @@ class Corpus : gramtype = self.parent.lexique[word][1] lem = self.parent.lexique[word][0] elif word.isdigit() : - gramtype = 'num' + gramtype = u'num' lem = word else : - gramtype = 'nr' + gramtype = u'nr' lem = word self.formes[word] = Word(word, gramtype, len(self.formes), lem) self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} @@ -434,7 +434,7 @@ class Corpus : f.write(guce.encode(self.parametres['syscoding']) + '\n\n') def export_classe(self, outf, classe, lem = False) : - sts = self.lc[classe] + sts = self.lc[classe - 1] res = self.getconcorde(sts) self.make_iduces() with open(outf, 'w') as f : @@ -1081,7 +1081,7 @@ class BuildCorpus : def firstclean(self, txt) : txt = txt.replace(u'’',"'") txt = txt.replace(u'œ', u'oe') - return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ') + return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ') def make_cleans(self, txt) : for clean in self.cleans :