X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpusNG.py;h=5e7ba26ae751ae92b4e42970dbbdc4b80b58648f;hp=6a027dc1509bbbf6f532538e2d2330d7d182d53b;hb=a4932706fa82bf5c3c821ce48913db0231a2b671;hpb=a503f041dc4947ee21c1d353ddd05ddb13a5e322 diff --git a/corpusNG.py b/corpusNG.py index 6a027dc..5e7ba26 100644 --- a/corpusNG.py +++ b/corpusNG.py @@ -512,18 +512,31 @@ class Corpus : et = etoile.split('_') if et[0] in det : try : - if et[1] in det[et[0]] : - det[et[0]][et[1]] += 1 + endet = '_'.join(et[1:]) + if endet in det[et[0]] : + det[et[0]][endet] += 1 else : - det[et[0]][et[1]] = 1 + det[et[0]][endet] = 1 except IndexError : det[et[0]] += 1 else : try : - det[et[0]] = {et[1] :1} + endet = '_'.join(et[1:]) + det[et[0]] = {endet :1} except IndexError : det[et[0]] = 1 - print det + return det + + def make_etline(self, listet) : + etuces = [[] for et in listet] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(listet)) + if len(get) > 1 : + return '2 variables sur la meme ligne' + elif get != [] : + etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] + return etuces + def make_and_write_profile_et(self, ucecl, fileout) : log.info('etoiles/classes') @@ -906,8 +919,9 @@ class BuildCorpus : self.cleans.append(self.dolower) if self.corpus.parametres.get('firstclean', 1) : self.cleans.append(self.firstclean) - self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_") - self.cleans.append(self.docharact) + if self.corpus.parametres['charact'] : + self.rule = self.corpus.parametres.get('keep_caract', u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_") + self.cleans.append(self.docharact) if self.corpus.parametres.get('expressions', 1) : self.cleans.append(self.make_expression) if self.corpus.parametres.get('apos', 1) : @@ -982,11 +996,9 @@ class BuildCorpus : class BuildFromAlceste(BuildCorpus) : - #def __init___(self, infile, parametres_corpus) : - # BuildCorpus.__init__(self, infile, parametres_corpus) - - def read_corpus(self, infile) : + if self.dlg is not None : + self.dlg.Pulse('textes : 0 - segments : 0') self.limitshow = 0 self.count = 1 if self.corpus.parametres['ucimark'] == 0 : @@ -997,11 +1009,9 @@ class BuildFromAlceste(BuildCorpus) : iduci = -1 idpara = -1 iduce = -1 - linenb = 0 try : with codecs.open(infile, 'r', self.encoding) as f : - for line in f : - linenb += 1 + for linenb, line in enumerate(f) : line = line.rstrip('\n\r') if self.testuci(line) : iduci += 1 @@ -1012,9 +1022,14 @@ class BuildFromAlceste(BuildCorpus) : else : if iduci > 0 : if self.corpus.ucis[-1].uces == [] : - log.info('linenb : %i' % linenb) - raise Exception("EmptyText %i" % linenb) + log.info(u'Empty text : %i' % linenb) + iduci -= 1 + self.corpus.ucis.pop() + #raise Exception("EmptyText %i" % linenb) self.corpus.ucis.append(Uci(iduci, line)) + if self.dlg is not None : + if not (iduci + 1) % 10 : + self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1)) elif line.startswith(u'-*') : if iduci != -1 : if txt != [] : @@ -1034,7 +1049,7 @@ class BuildFromAlceste(BuildCorpus) : if iduci != -1 and iduce != -1: self.backup_uce() else : - log.info(_(u"No Texte in corpora. Are you sure of the formatting ?")) + log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) raise Exception('TextBeforeTextMark') except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1062,13 +1077,13 @@ class BuildFromAlceste(BuildCorpus) : for word in uce : self.last += 1 self.corpus.add_word(word) - if self.dlg is not None : - if self.limitshow > self.count : - self.dlg.Pulse('uci: %i - uce : %i' % (iduci + 1, iduce +1)) - self.count += 1 - self.limitshow = 0 - else : - self.limitshow = self.last / 100000 + #if self.dlg is not None : + # if self.limitshow > self.count : + # self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1)) + # self.count += 1 + # self.limitshow = 0 + # else : + # self.limitshow = self.last / 100000 log.debug(' '.join([`iduci`,`idpara`,`iduce`])) if self.last > self.lim : self.backup_uce()