X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=4ba60d1d4b51bb64d039cb4956eb83bebb8c6b86;hp=0d661d6d7ad2c6a8e6185dae4300778a95b3b4fb;hb=246487236ad321045561a260ca393b8394aff653;hpb=065a856b87fe4fe3258b1406d6ace509e9da6e69 diff --git a/corpus.py b/corpus.py index 0d661d6..4ba60d1 100644 --- a/corpus.py +++ b/corpus.py @@ -172,6 +172,12 @@ class Corpus : res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + def gettgenst(self, tgen): + formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) + query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + def getlemucis(self, lem) : uces = self.getlemuces(lem) return list(set([self.getucefromid(val).uci for val in uces])) @@ -756,6 +762,28 @@ class Corpus : except IndexError : det[et[0]] = 1 return det + + def make_theme_dict(self): + themes = [val for uci in self.ucis for val in uci.paras] + det = {} + for theme in themes : + th = theme.split('_') + if th[0] in det : + try : + endth = '_'.join(th[1:]) + if theme in det[th[0]] : + det[th[0]][theme] += 1 + else : + det[th[0]][theme] = 1 + except IndexError : + det[th[0]] += 1 + else : + try : + endth = '_'.join(th[1:]) + det[th[0]] = {theme:1} + except IndexError : + det[th[0]] = 1 + return det def make_etline(self, listet) : etuces = [[] for et in listet] @@ -1315,6 +1343,7 @@ class BuildSubCorpus(BuildCorpus): self.infile = None self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']}) self.last = 0 + self.parametres = parametres self.encoding = corpus.parametres['encoding'] self.corpus.parametres['corpus_name'] = parametres['corpus_name'] self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout']) @@ -1344,38 +1373,41 @@ class BuildSubCorpus(BuildCorpus): self.corpus.ucis.append(nuci) else : idpara += 1 - elif parametres.get('fromcluster', False) : - pass + elif parametres.get('fromclusters', False) : + self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]] + self.fromuceids() elif parametres.get('fromuceids', False) : - print 'fromuceids' - dictucekeep = dict(zip(parametres['uceids'], parametres['uceids'])) - idpara = 0 - for uci in self.ori.ucis : - if uci.paras == [] : - keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] - if keepuces != [] : - nuci = CopyUci(uci) - nuci.uces = keepuces - self.corpus.ucis.append(nuci) - idpara += 1 - else : - newuces = [] - newpara = [] - for et in uci.paras : - keepuces = [CopyUce(uce) for uce in uci.uces if uce.para == idpara] - idpara += 1 - if keepuces != [] : - newuces += keepuces - newpara.append(et) - if newuces != [] : - nuci = CopyUci(uci) - nuci.uces = newuces - nuci.paras = newpara - self.corpus.ucis.append(nuci) - + self.fromuceids() #create database self.connect() self.dobuild() + + def fromuceids(self): + print 'fromuceids' + dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + self.corpus.ucis.append(nuci) def read_corpus(self, infile = None): self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces] @@ -1537,16 +1569,21 @@ class Builder : parametres = dial.doparametres() parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) - ReadLexique(self.parent, lang = parametres['lang']) + if parametres.get('dictionary', False) : + filein = parametres['dictionary'] + else : + filein = None + dial.Destroy() + ReadLexique(self.parent, lang = parametres['lang'], filein = filein) if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) else : self.parent.expressions = {} self.parametres = parametres else : + dial.Destroy() if self.dlg is not None : self.dlg.Destroy() - dial.Destroy() def doanalyse(self) : return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus @@ -1565,9 +1602,12 @@ class SubBuilder : parametres['meta'] = corpus.make_etoiles() elif parametres.get('fromtheme', False) : parametres['meta'] = corpus.make_themes() + elif parametres.get('fromclusters', False) : + parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)] else : parametres['meta'] = [] - parametres['meta'].sort() + if 'fromclusters' not in parametres : + parametres['meta'].sort() if dlg is not None : del busy dial = SubTextFromMetaDial(parent, parametres) @@ -1585,7 +1625,10 @@ class SubBuilder : i += 1 parametres['pathout'] = pathout + '_%i' % i meta = dial.m_listBox1.GetSelections() - parametres['meta'] = [parametres['meta'][val] for val in meta] + if not 'fromclusters' in parametres : + parametres['meta'] = [parametres['meta'][val] for val in meta] + else : + parametres['meta'] = meta self.parametres = parametres dial.Destroy() else : @@ -1593,9 +1636,3 @@ class SubBuilder : def doanalyse(self): return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus - -if __name__ == '__main__' : - t1 = time() - parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'} - intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes) - print time() - t1