X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=51061fbb2006bb58385173f235a35d4cbdc5f147;hp=d4357cf97559ce4c4b4cdcd8886494e469d06603;hb=b19770356272772c8c8ba75f351520eca186bd19;hpb=ef45aa7e5e55a37956ce86dc4ce86471f11b018d diff --git a/corpus.py b/corpus.py index d4357cf..51061fb 100644 --- a/corpus.py +++ b/corpus.py @@ -213,13 +213,23 @@ class Corpus : def getetoileuces(self) : log.info('get uces etoiles') etoileuces = {} + idpara = 0 for uci in self.ucis : - etoiles = uci.etoiles[1:] + uci.paras + etoiles = uci.etoiles[1:] for et in etoiles : if et in etoileuces : etoileuces[et] += [uce.ident for uce in uci.uces] else : etoileuces[et] = [uce.ident for uce in uci.uces] + if uci.paras != [] : + for et in uci.paras : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara] + else : + etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara] + idpara += 1 + else : + idpara += 1 return etoileuces def getucefromid(self, uceid) : @@ -271,8 +281,8 @@ class Corpus : for uci in self.ucis : get = list(set(uci.etoiles).intersection(etoiles)) if len(get) > 1 : - return '2 variables sur la meme ligne' - elif get != [] : + log.info('2 variables sur une ligne') + if get != [] : etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] etuces = [set(val) for val in etuces] tab = [] @@ -542,7 +552,7 @@ class Corpus : def make_etoiles(self) : etoiles = set([]) for uci in self.ucis : - etoiles.update(uci.etoiles[1:] + uci.paras) + etoiles.update(uci.etoiles[1:]) return list(etoiles) def make_etoiles_dict(self) : @@ -553,16 +563,16 @@ class Corpus : if et[0] in det : try : endet = '_'.join(et[1:]) - if endet in det[et[0]] : - det[et[0]][endet] += 1 + if etoile in det[et[0]] : + det[et[0]][etoile] += 1 else : - det[et[0]][endet] = 1 + det[et[0]][etoile] = 1 except IndexError : det[et[0]] += 1 else : try : endet = '_'.join(et[1:]) - det[et[0]] = {endet :1} + det[et[0]] = {etoile :1} except IndexError : det[et[0]] = 1 return det @@ -776,6 +786,22 @@ class Corpus : with open('/tmp/testhapxuce.html','w') as f : f.write(txt) + def export_dictionary(self, fileout, syscoding) : + listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes] + listformes.sort(reverse = True) + listformes = [forme[1:] + [`forme[0]`] for forme in listformes] + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding)) + + def export_lems(self, fileout, syscoding) : + self.make_idformes() + listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems] + listlem.sort() + with open(fileout, 'w') as f : + f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding)) + + + class MakeUciStat : def __init__(self, corpus) : @@ -1106,7 +1132,6 @@ class BuildFromAlceste(BuildCorpus) : log.info(u'Empty text : %i' % linenb) iduci -= 1 self.corpus.ucis.pop() - #raise Exception("EmptyText %i" % linenb) self.corpus.ucis.append(Uci(iduci, line)) if self.dlg is not None : if not (iduci + 1) % 10 : @@ -1119,19 +1144,24 @@ class BuildFromAlceste(BuildCorpus) : idpara += 1 self.corpus.ucis[-1].paras.append(line.split()[0]) else : - raise Exception('paragrapheOT') + raise Exception('paragrapheOT %i' % linenb) elif line.strip() != '' and iduci != -1 : txt.append(line) if txt != [] and iduci != -1 : iduce, idpara = self.treattxt(txt, iduce, idpara, iduci) del(txt) else : - raise Exception("EmptyText") + if iduci != -1 : + iduci -= 1 + self.corpus.ucis.pop() + log.info(Exception("Empty text %i" % linenb)) + else : + raise Exception('EmptyText %i' % linenb) if iduci != -1 and iduce != -1: self.backup_uce() else : log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) - raise Exception('TextBeforeTextMark') + raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1202,7 +1232,10 @@ class Builder : parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) ReadLexique(self.parent, lang = parametres['lang']) - self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): + self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) + else : + self.parent.expressions = {} self.parametres = parametres else : if self.dlg is not None :