X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;fp=corpusNG.py;h=7354463e4f2e0293a5f4ef15a7793781d6d14242;hp=6a027dc1509bbbf6f532538e2d2330d7d182d53b;hb=ab23968410d4e2eff482fd16a639801b457d5063;hpb=781cbc21fec22eccd64bf7706aac10a0e2757814 diff --git a/corpusNG.py b/corpusNG.py index 6a027dc..7354463 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -512,18 +512,21 @@ class Corpus : et = etoile.split('_') if et[0] in det : try : - if et[1] in det[et[0]] : - det[et[0]][et[1]] += 1 + endet = '_'.join(et[1:]) + if endet in det[et[0]] : + det[et[0]][endet] += 1 else : - det[et[0]][et[1]] = 1 + det[et[0]][endet] = 1 except IndexError : det[et[0]] += 1 else : try : - det[et[0]] = {et[1] :1} + endet = '_'.join(et[1:]) + det[et[0]] = {endet :1} except IndexError : det[et[0]] = 1 print det + return det def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') @@ -906,8 +909,9 @@ class BuildCorpus : self.cleans.append(self.dolower) if self.corpus.parametres.get('firstclean', 1) : self.cleans.append(self.firstclean) - self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_") - self.cleans.append(self.docharact) + if self.corpus.parametres['charact'] : + self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_") + self.cleans.append(self.docharact) if self.corpus.parametres.get('expressions', 1) : self.cleans.append(self.make_expression) if self.corpus.parametres.get('apos', 1) : @@ -1047,7 +1051,11 @@ class BuildFromAlceste(BuildCorpus) : ucetxt = txt.split('laphrasepoursplitter') else : txt = ' '.join(txt) + + print txt txt = self.make_cleans(txt) + + print txt ucetxt = self.make_uces(txt, self.corpus.parametres['douce']) if self.corpus.ucis[-1].paras == [] : idpara += 1