X-Git-Url: http://iramuteq.org/git?a=blobdiff_plain;f=corpus.py;h=555a0340f09d7a826ae11a9bf96d8c672bebde26;hb=3532cafef6a7926cb7d07b223668a7946a86708f;hp=5fe448aee4570fcfbcd69d9efabf7c79223d8aff;hpb=45774df05e8f709fec28d87dd33cb17ef388c1b2;p=iramuteq diff --git a/corpus.py b/corpus.py index 5fe448a..555a034 100644 --- a/corpus.py +++ b/corpus.py @@ -20,6 +20,9 @@ from operator import itemgetter from uuid import uuid4 import datetime from copy import copy +#------test spacy------------ +#import spacy +#nlp = spacy.load("fr_core_news_lg") #------------------------------------ # import des fichiers du projet @@ -255,14 +258,14 @@ class Corpus : def getucisize(self) : ucesize = self.getucesize() - return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis] + return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis if len(uci.uces) != 0] def getucesize(self) : res = self.getalluces() return [len(uce[1].split()) for uce in res] def getconcorde(self, uces) : - return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces])) + return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([repr(i) for i in uces])) def getuciconcorde(self, ucis) : uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] @@ -290,6 +293,11 @@ class Corpus : def getucesfrometoile(self, etoile) : return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + def getucisfrometoile(self, etoile): + uces = [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + return list(set([self.getucefromid(val).uci for val in uces])) + + def getetoileuces(self) : log.info('get uces etoiles') etoileuces = {} @@ -875,7 +883,7 @@ class Corpus : etoileuces = self.getetoileuces() else : etoileuces = self.getetoileucis() - etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1]) + etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 0]) with open(fileout, 'w', encoding='utf8') as f : f.write('\n'.join([';'.join([et] + [repr(len(set(etoileuces[et]).intersection(classe))) for classe in ucecl]) for et in etoileuces])) #.encode(self.parametres['syscoding']) #etoiles = self.make_etoiles() @@ -1502,7 +1510,7 @@ class BuildSubCorpus(BuildCorpus): newuces = [] newpara = [] for et in uci.paras : - keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeepand and uce.para == idpara] idpara += 1 if keepuces != [] : newuces += keepuces @@ -1546,6 +1554,7 @@ class BuildSubCorpus(BuildCorpus): class BuildFromAlceste(BuildCorpus) : def read_corpus(self, infile) : + if self.dlg is not None : self.dlg.Pulse('textes : 0 - segments : 0') self.limitshow = 0 @@ -1565,6 +1574,8 @@ class BuildFromAlceste(BuildCorpus) : if self.testuci(line) : iduci += 1 if txt != [] : + #doc = nlp(' '.join(txt)) + #print([[word, word.pos_, word.lemma_] for word in doc]) iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1) txt = [] self.corpus.ucis.append(Uci(iduci, line))