X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=corpus.py;h=011206fcc5553f7c6150e410917c0d475ce5c910;hp=d26a8a376dd96e3fa4d7be12a9c47bce6584a9ef;hb=763d90785a9de548c3a5ffd9b718e3e5fea8332d;hpb=bd8d0a889d1d393e64a6d768dc14e9c639a0df8c diff --git a/corpus.py b/corpus.py index d26a8a3..011206f 100644 --- a/corpus.py +++ b/corpus.py @@ -16,7 +16,8 @@ import logging from operator import itemgetter from uuid import uuid4 from chemins import PathOut -from dialog import CorpusPref +from dialog import CorpusPref, SubTextFromMetaDial +from copy import copy from colors import colors import datetime @@ -33,6 +34,16 @@ def copycorpus(corpus) : copy_corpus.conn_all() return copy_corpus +def CopyUce(uce) : + return Uce(uce.ident, uce.para, uce.uci) + + +def CopyUci(uci): + nuci = Uci(uci.ident, '') + nuci.etoiles = copy(uci.etoiles) + nuci.uces = [CopyUce(uce) for uce in uci.uces] + return nuci + class Corpus : @@ -82,6 +93,20 @@ class Corpus : lem = word self.formes[word] = Word(word, gramtype, len(self.formes), lem) self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} + + def add_word_from_forme(self, word, stident): + if word.forme in self.formes : + self.formes[word.forme].freq += 1 + if self.formes[word.forme].ident in self.idformesuces : + if stident in self.idformesuces[self.formes[word.forme].ident] : + self.idformesuces[self.formes[word.forme].ident][stident] += 1 + else : + self.idformesuces[self.formes[word.forme].ident][stident] = 1 + else : + self.idformesuces[self.formes[word.forme].ident] = {stident: 1} + else : + self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem) + self.idformesuces[self.formes[word.forme].ident] = {stident : 1} def conn_all(self): """connect corpus to db""" @@ -147,6 +172,16 @@ class Corpus : res = self.cformes.execute(query) return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + def gettgenst(self, tgen): + formesid = ', '.join([`val` for lem in tgen for val in self.lems[lem].formes if lem in self.lems]) + query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgentxt(self, tgen): + sts = self.gettgenst(tgen) + return list(set([self.getucefromid(val).uci for val in sts])) + def getlemucis(self, lem) : uces = self.getlemuces(lem) return list(set([self.getucefromid(val).uci for val in uces])) @@ -198,7 +233,7 @@ class Corpus : return [len(uce[1].split()) for uce in res] def getconcorde(self, uces) : - return self.cuces.execute('select * from uces where id IN (%s);' % ', '.join([`i` for i in uces])) + return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([`i` for i in uces])) def getuciconcorde(self, ucis) : uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] @@ -289,6 +324,30 @@ class Corpus : self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme]) else : self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes]) + + def make_lems_from_dict(self, dictionnaire, dolem = True) : + log.info('make lems from dict') + self.lems = {} + for forme in self.formes : + if self.formes[forme].forme in dictionnaire : + lem = dictionnaire[forme][0] + gram = dictionnaire[forme][1] + elif forme.isdigit() : + gram = u'num' + lem = forme + else : + gram = u'nr' + lem = forme + self.formes[forme].lem = lem + self.formes[forme].gram = gram + if dolem : + if self.formes[forme].lem in self.lems : + if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes : + self.lems[self.formes[forme].lem].add_forme(self.formes[forme]) + else : + self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme]) + else : + self.lems[forme] = Lem(self, self.formes[forme]) def make_idformes(self) : self.idformes = dict([[self.formes[forme].ident, self.formes[forme]] for forme in self.formes]) @@ -319,6 +378,49 @@ class Corpus : tab.insert(0, [''] + etoiles) return tab + def make_tgen_table(self, tgen, etoiles, tot = None): + lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles] + sets = [set(cl) for cl in lclasses] + totoccurrences = dict([[val, 0] for val in etoiles]) + if tot is None : + for forme in self.formes : + formeuceeff = self.getformeuceseff(forme) + for i, classe in enumerate(lclasses) : + concern = sets[i].intersection(formeuceeff.keys()) + if len(concern) : + totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern]) + #tgenoccurrences = dict([[val, 0] for val in etoiles]) + tgenoccurrences = {} + for t in tgen.tgen : + tgenoccurrences[t] = dict([[val, 0] for val in etoiles]) + for lem in tgen[t] : + lemuceeff = self.getlemuceseff(lem) + for i, classe in enumerate(lclasses) : + concern = sets[i].intersection(lemuceeff.keys()) + if len(concern) : + tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern]) + return tgenoccurrences, totoccurrences + + def make_tgen_profile(self, tgen, ucecl, uci = False) : + log.info('tgen/classes') + if uci : + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + else : + tab = [[lem] + [len(set(self.gettgenst(tgen[lem])).intersection(classe)) for classe in ucecl] for lem in tgen] + tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] + return tab + #i = 0 + #nam = 'total' + #while nam + `i` in tgen : + # i += 1 + #nam = nam + `i` + #last = [nam] + [`len(classe)` for classe in ucecl] + #tab += [last] + #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))] + #tab = [line0] + tab + #with open(fileout, 'w') as f : + # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + def make_efftype_from_etoiles(self, etoiles) : dtype = {} etuces = [[] for et in etoiles] @@ -656,6 +758,12 @@ class Corpus : for uci in self.ucis : etoiles.update(uci.etoiles[1:]) return list(etoiles) + + def make_themes(self): + themes = set([]) + for uci in self.ucis : + themes.update(uci.paras) + return list(themes) def make_etoiles_dict(self) : etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]] @@ -678,6 +786,28 @@ class Corpus : except IndexError : det[et[0]] = 1 return det + + def make_theme_dict(self): + themes = [val for uci in self.ucis for val in uci.paras] + det = {} + for theme in themes : + th = theme.split('_') + if th[0] in det : + try : + endth = '_'.join(th[1:]) + if theme in det[th[0]] : + det[th[0]][theme] += 1 + else : + det[th[0]][theme] = 1 + except IndexError : + det[th[0]] += 1 + else : + try : + endth = '_'.join(th[1:]) + det[th[0]] = {theme:1} + except IndexError : + det[th[0]] = 1 + return det def make_etline(self, listet) : etuces = [[] for et in listet] @@ -929,8 +1059,7 @@ class Corpus : listlem.sort() with open(fileout, 'w') as f : f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding)) - - + class MakeUciStat : @@ -1062,7 +1191,7 @@ class BuildCorpus : self.corpus.pathout = PathOut(filename = parametres_corpus['originalpath'], dirout = parametres_corpus['pathout']) self.corpus.pathout.createdir(parametres_corpus['pathout']) self.corpus.parametres['uuid'] = str(uuid4()) - self.corpus.parametres['corpus_name'] = os.path.split(self.corpus.parametres['pathout'])[1] + self.corpus.parametres['corpus_name'] = parametres_corpus['corpus_name']#os.path.split(self.corpus.parametres['pathout'])[1] self.corpus.parametres['type'] = 'corpus' if self.corpus.parametres['keep_ponct'] : self.ponctuation_espace = [' ', ''] @@ -1132,7 +1261,7 @@ class BuildCorpus : self.cf.execute('CREATE INDEX ideff ON eff (id);') self.c.close() self.cf.close() - #backup corpora + #backup corpus self.conn_corpus = sqlite3.connect(self.corpus.pathout['corpus.db']) self.ccorpus = self.conn_corpus.cursor() self.ccorpus.execute('CREATE TABLE IF NOT EXISTS etoiles (uci INTEGER, et TEXT, paras TEXT);') @@ -1230,6 +1359,109 @@ class BuildCorpus : pourhapaxocc = (float(hapaxnb) / self.corpus.parametres['occurrences']) * 100 self.corpus.parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc) +class BuildSubCorpus(BuildCorpus): + def __init__(self, corpus, parametres, dlg = None) : + log.info('begin subcorpus...') + self.dlg = dlg + self.ori = corpus + self.infile = None + self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']}) + self.last = 0 + self.parametres = parametres + self.encoding = corpus.parametres['encoding'] + self.corpus.parametres['corpus_name'] = parametres['corpus_name'] + self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout']) + self.corpus.pathout.createdir(parametres['pathout']) + self.corpus.parametres['pathout'] = parametres['pathout'] + self.corpus.parametres['meta'] = parametres.get('meta', False) + self.corpus.parametres['uuid'] = str(uuid4()) + if parametres.get('frommeta', False) : + print 'make subtexts' + self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()] + elif parametres.get('fromtheme', False) : + print 'make subtexts from theme' + idpara = 0 + for uci in self.ori.ucis : + if uci.paras != [] : + newuce = [] + newpara = [] + for et in uci.paras : + if et in parametres['meta'] : + newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara] + newpara.append(et) + idpara += 1 + if newuce != [] : + nuci = CopyUci(uci) + nuci.uces = newuce + nuci.paras = newpara + self.corpus.ucis.append(nuci) + else : + idpara += 1 + elif parametres.get('fromclusters', False) : + self.parametres['uceids'] = [st for i in self.parametres['meta'] for st in self.parametres['lc'][i]] + self.fromuceids() + elif parametres.get('fromuceids', False) : + self.fromuceids() + #create database + self.connect() + self.dobuild() + + def fromuceids(self): + print 'fromuceids' + dictucekeep = dict(zip(self.parametres['uceids'], self.parametres['uceids'])) + idpara = 0 + for uci in self.ori.ucis : + if uci.paras == [] : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + if keepuces != [] : + nuci = CopyUci(uci) + nuci.uces = keepuces + self.corpus.ucis.append(nuci) + idpara += 1 + else : + newuces = [] + newpara = [] + for et in uci.paras : + keepuces = [CopyUce(uce) for uce in uci.uces if uce.ident in dictucekeep] + idpara += 1 + if keepuces != [] : + newuces += keepuces + newpara.append(et) + if newuces != [] : + nuci = CopyUci(uci) + nuci.uces = newuces + nuci.paras = newpara + self.corpus.ucis.append(nuci) + + def read_corpus(self, infile = None): + self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces] + ident_uci = 0 + ident_uce = 0 + ident_para = -1 + lastpara = -1 + newuceident = {} + print 'redo text, para and st ident' + for uci in self.corpus.ucis : + uci.ident = ident_uci + ident_uci += 1 + for uce in uci.uces : + uce.uci = uci.ident + if uce.para != lastpara : + ident_para += 1 + lastpara = uce.para + uce.para = ident_para + else : + uce.para = ident_para + newuceident[uce.ident] = ident_uce + uce.ident = ident_uce + ident_uce += 1 + print 'backup st text and forms' + for row in self.ori.getconcorde(self.olduceid) : + self.c.execute('INSERT INTO uces VALUES(?,?);', (`newuceident[row[0]]`, row[1])) + for word in row[1].split() : + self.corpus.add_word_from_forme(self.ori.formes[word], newuceident[row[0]]) + self.backup_uce() + print 'done' class BuildFromAlceste(BuildCorpus) : def read_corpus(self, infile) : @@ -1289,7 +1521,7 @@ class BuildFromAlceste(BuildCorpus) : if iduci != -1 and iduce != -1: self.backup_uce() else : - log.info(_(u"No Text in corpora. Are you sure of the formatting ?")) + log.info(_(u"No Text in corpus. Are you sure of the formatting ?")) raise Exception('TextBeforeTextMark %i' % linenb) except UnicodeDecodeError : raise Exception("CorpusEncoding") @@ -1351,6 +1583,7 @@ class Builder : self.dlg = dlg parametres = DoConf(os.path.join(self.parent.UserConfigPath,'corpus.cfg')).getoptions('corpus') parametres['pathout'] = PathOut(parent.filename, 'corpus').mkdirout() + parametres['corpus_name'] = os.path.split(parametres['pathout'])[1] dial = CorpusPref(parent, parametres) dial.CenterOnParent() dial.txtpath.SetLabel(parent.filename) @@ -1360,23 +1593,70 @@ class Builder : parametres = dial.doparametres() parametres['originalpath'] = parent.filename PathOut().createdir(parametres['pathout']) - ReadLexique(self.parent, lang = parametres['lang']) + if parametres.get('dictionary', False) : + filein = parametres['dictionary'] + else : + filein = None + dial.Destroy() + ReadLexique(self.parent, lang = parametres['lang'], filein = filein) if parametres['lang'] != 'other' and os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')): self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')) else : self.parent.expressions = {} self.parametres = parametres else : + dial.Destroy() if self.dlg is not None : self.dlg.Destroy() - dial.Destroy() def doanalyse(self) : return BuildFromAlceste(self.parent.filename, self.parametres, self.parent.lexique, self.parent.expressions, dlg = self.dlg).corpus - -if __name__ == '__main__' : - t1 = time() - parametres = {'formesdb':'formes.db', 'ucesdb': 'uces.db', 'corpusdb' : 'corpus.db', 'syscoding' : 'utf-8', 'encoding' : 'utf8'} - intro = BuildCorpus(infile, parametres)#, tar_in, tar_infouce)#, tar_formes) - print time() - t1 +class SubBuilder : + def __init__(self, parent, corpus, parametres = None, dlg = None): + self.parent = parent + self.ori = corpus + self.dlg = dlg + corpus_name = 'Sub' + corpus.parametres['corpus_name'] + if dlg is not None : + busy = wx.BusyInfo(_("Please wait...").decode('utf8'), self) + wx.SafeYield() + parametres['corpus_name'] = corpus_name + if parametres.get('frommeta', False) : + parametres['meta'] = corpus.make_etoiles() + elif parametres.get('fromtheme', False) : + parametres['meta'] = corpus.make_themes() + elif parametres.get('fromclusters', False) : + parametres['meta'] = [' '.join(['classe', `i`]) for i in range(1,parametres['clnb'] + 1)] + else : + parametres['meta'] = [] + if 'fromclusters' not in parametres : + parametres['meta'].sort() + if dlg is not None : + del busy + dial = SubTextFromMetaDial(parent, parametres) + self.res = dial.ShowModal() + if self.res == 5100 : + if dial.subcorpusname.GetValue() != '' : + corpus_name = ''.join([l for l in dial.subcorpusname.GetValue() if l.isalnum() or l in ['_']]) + if corpus_name != '' : + parametres['corpus_name'] = corpus_name + else : + parametres['corpus_name'] = 'Sub' + corpus.parametres['corpus_name'] + pathout = os.path.join(corpus.parametres['pathout'], parametres['corpus_name']) + i = 1 + while os.path.exists(pathout + '_%i' % i) : + i += 1 + parametres['pathout'] = pathout + '_%i' % i + meta = dial.m_listBox1.GetSelections() + if not 'fromclusters' in parametres : + parametres['meta'] = [parametres['meta'][val] for val in meta] + else : + parametres['meta'] = meta + self.parametres = parametres + dial.Destroy() + else : + dial.Destroy() + + def doanalyse(self): + return BuildSubCorpus(self.ori, parametres = self.parametres, dlg = self.dlg).corpus