X-Git-Url: http://iramuteq.org/git?a=blobdiff_plain;ds=inline;f=corpus.py;h=a2790f04d1253ba46592bf889b45a7f212c84b15;hb=7b070bf957be567de345b12e63cf5abbc7bafa79;hp=2f81aaa3a84cb783eab211312a63aab55b4d0b09;hpb=8fa853a25a9d62b1446e1bc543e5a3a4d0e03dcf;p=iramuteq diff --git a/corpus.py b/corpus.py index 2f81aaa..a2790f0 100644 --- a/corpus.py +++ b/corpus.py @@ -1,850 +1,904 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud -#Copyright (c) 2010, Pierre Ratinaud -#Lisense: GNU/GPL import codecs -import shelve -import csv -import re import os +import gettext +_ = gettext.gettext +import locale import sys -from colors import colors -from functions import decoupercharact, ReadDicoAsDico, sortedby -from ttparser import get_ucis_from_tt -#from ConfigParser import RawConfigParser -import json from time import time -#import nltk +from functions import decoupercharact, ReadDicoAsDico, DoConf, ReadLexique, progressbar +import re +import sqlite3 +import itertools +import logging +from operator import itemgetter +from uuid import uuid4 +from chemins import PathOut +from dialog import CorpusPref, SubTextFromMetaDial +from copy import copy +from colors import colors +import datetime + + +log = logging.getLogger('iramuteq.corpus') + + +def copycorpus(corpus) : + log.info('copy corpus') + copy_corpus = Corpus(corpus.parent, parametres = corpus.parametres) + copy_corpus.ucis = corpus.ucis + copy_corpus.formes = corpus.formes + copy_corpus.pathout = corpus.pathout + copy_corpus.conn_all() + return copy_corpus + +def CopyUce(uce) : + return Uce(uce.ident, uce.para, uce.uci) + + +def CopyUci(uci): + nuci = Uci(uci.ident, '') + nuci.etoiles = copy(uci.etoiles) + nuci.uces = [CopyUce(uce) for uce in uci.uces] + return nuci + -def chunks(l, n): - """ Yield successive n-sized chunks from l. - """ - for i in xrange(0, len(l), n): - yield l[i:i+n] class Corpus : - def __init__(self, parent) : + """Corpus class + list of text + """ + def __init__(self, parent, parametres = {}, read = False) : self.parent = parent - self.parametre = {'syscoding': sys.getdefaultencoding()} - self.content = None - self.ucis = None - self.formes = {} - self.lems = {} - self.ucenb = None - self.etoiles = None - self.etintxt = {} - self.ucis_paras_uces = None - self.lc = None - self.lc0 = None - self.actives = None - self.supp = None - #self.supplementaires = [] - self.lenuc1 = None - self.lenuc2 = None - self.lexique = None - - def open_corpus(self) : - with codecs.open(self.parametre['filename'], "r", self.parametre['encodage']) as f : - self.content = f.read() - - def make_big(self) : - import sqlite3 - ucifile = os.path.join(os.path.dirname(self.parametre['filename']), 'ucis.txt') - uci = open(ucifile, 'w') - #db = os.path.join(os.path.dirname(self.parametre['filename']), 'corpus.db') - #conn = sqlite3.connect(db) - #c = conn.cursor() - #conn.text_factory = str - #c = conn.cursor() - #c.execute('''CREATE TABLE corpus (id integer, varet TEXT)''') - #c = conn.cursor() - ucinb = 0 + self.parametres = parametres + self.cformes = None + self.connformes = None + self.connuces = None + self.conncorpus = None + self.islem = False + self.cuces = None self.ucis = [] - txt = [] - with codecs.open(self.parametre['filename'], "r", self.parametre['encodage']) as open_corpus : - for line in open_corpus : - if line.startswith(u'****') : - print ucinb - uci.write(line.replace('/n', ' ')) - #self.ucis.append([line.rstrip(), `ucinb`]) - if ucinb != 0 : - for word in txt : - if word not in [' ','.', u'£', ';', '?', '!', ',', ':',''] : - id = len(self.formes) - self.feed_dict_big(word, ucinb) - txt = [] - #c = conn.cursor() - #c.execute('INSERT INTO uci values (?,?)', (ucinb, line.rstrip())) - #conn.commit() - #print ucinb - ucinb += 1 + self.formes = {} + self.flems = {} + self.lems = None + self.idformesuces = {} + self.iduces = None + self.idformes = None + self.uceuci = None + if read : + self.pathout = PathOut(dirout = parametres['pathout']) + self.read_corpus() + + def add_word(self, word) : + if word in self.formes : + self.formes[word].freq += 1 + if self.formes[word].ident in self.idformesuces : + if self.ucis[-1].uces[-1].ident in self.idformesuces[self.formes[word].ident] : + self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] += 1 else : - line = line.lower().replace('-', ' ').replace(u'\'',' ').replace(u'â',' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').rstrip().split() - txt += line - uci.close() - print len(self.formes) - print sum([self.formes[forme][0] for forme in self.formes]) - formes_out2 = os.path.join(os.path.dirname(self.parametre['filename']), 'formes_formes.csv') - formes_uces = os.path.join(os.path.dirname(self.parametre['filename']), 'formes_uces.csv') - with open(formes_out2, 'w') as f : - f.write('\n'.join([';'.join([forme, `self.formes[forme][0]`, self.formes[forme][2]]) for forme in self.formes])) - with open(formes_uces, 'w') as f: - f.write('\n'.join([' '.join([' '.join([`uce`, `self.formes[forme][1][uce]`]) for uce in self.formes[forme][1]]) for forme in self.formes])) - #uciout = os.path.join(os.path.dirname(self.parametre['filename']), 'uciout.csv') - #with open(uciout,'w') as f : - # f.write('\n'.join(['\t'.join(line) for line in self.ucis])) - - - - - def read_corpus_out(self, corpus_out) : - #print 'test encodage' - #self.parametre['syscoding'] = 'cp1252' - with codecs.open(corpus_out ,'r', self.parametre['syscoding']) as f: - content = f.read() - if sys.platform == 'win32' : - sep = '\r\n\r\n' + self.idformesuces[self.formes[word].ident][self.ucis[-1].uces[-1].ident] = 1 else : - sep = '\n\n' - self.ucis_paras_uces = [[[uce.split() for uce in para.splitlines()] for para in uci.split(u'$$$')] for uci in content.split(sep)] - #print self.ucis_paras_uces - - def read_formes_out(self, forme_out) : - print 'read formes' - print 'test encodage' - #t1 = time() - if os.path.exists(forme_out) : - with codecs.open(forme_out, 'r', self.parametre['syscoding']) as f : - content = f.read() - cc = [forme.split(u'$') for forme in content.splitlines()] - self.formes = dict([[forme[0], [int(forme[1]), dict([[eval(uce.split(':')[0]), int(uce.split(':')[1])] for uce in forme[2].split(';')]), forme[3], int(forme[4])]] for forme in cc]) - else : - formes_out2 = os.path.join(os.path.dirname(forme_out), 'formes_formes.csv') - formes_uces = os.path.join(os.path.dirname(forme_out), 'formes_uces.csv') - with codecs.open(formes_uces, 'r', self.parametre['syscoding']) as f: - uces = f.read() - uces = [list(chunks(line.split(),4)) for line in uces.splitlines()] - with codecs.open(formes_out2, 'r', self.parametre['syscoding']) as f : - self.formes = f.read() - self.formes = [[line.split(';'), dict([[(int(uce[0]),int(uce[1]), int(uce[2])), int(uce[3])] for uce in uces[i]])] for i, line in enumerate(self.formes.splitlines())] - self.formes = dict([[line[0][0], [int(line[0][1]), line[1], line[0][2], int(line[0][3])]] for line in self.formes]) - - def read_corpus_from_shelves(self, db) : - d = shelve.open(db) - self.parametre = d['parametre'] - if not 'syscoding' in self.parametre : - self.parametre['syscoding'] = sys.getdefaultencoding() - self.lems = d['lems'] - if 'ucis_paras_uces' in d : - self.ucis_paras_uces = d['ucis_paras_uces'] + self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident: 1} else : - corpus_out = os.path.join(os.path.dirname(db), 'corpus.txt') - self.read_corpus_out(corpus_out) - if 'formes' in d : - self.formes = d['formes'] - else : - formes_out = os.path.join(os.path.dirname(db), 'formes.txt') - self.read_formes_out(formes_out) -# print 'deb sql' -# import sqlite3 -# db_out = os.path.join(os.path.dirname(db), 'formes.db') -# conn = sqlite3.connect(db_out) -# c = conn.cursor() -# c.execute('''SELECT * FROM formes''') -# self.formes = dict([[forme[0], [int(forme[1]), dict([[eval(uce.split(':')[0]), int(uce.split(':')[1])] for uce in forme[2].split(';')]), forme[3], int(forme[4])]] for forme in c]) -# print 'fin sql' - self.etoiles = d['etoiles'] - self.actives = d['actives'] - self.ucis = d['ucis'] - self.lc = d['lc'] - self.lc0 = d['lc0'] - d.close() - - - def save_corpus(self, db) : - d= shelve.open(db) - d['parametre'] = self.parametre - #d['formes'] = self.formes - d['lems'] = self.lems - #d['ucis_paras_uces'] = self.ucis_paras_uces - d['etoiles'] = self.etoiles - d['actives'] = self.actives - d['ucis'] = self.ucis - d['lc'] = self.lc - d['lc0'] = self.lc0 - d.close() - corpus_out = os.path.join(os.path.dirname(db), 'corpus.txt') - with open(corpus_out, 'w') as f : - f.write('\n\n'.join([u'$$$'.join(['\n'.join([' '.join(uce) for uce in para]) for para in uci]) for uci in self.ucis_paras_uces])) - #t1 = time() - formes_out2 = os.path.join(os.path.dirname(db), 'formes_formes.csv') - formes_uces = os.path.join(os.path.dirname(db), 'formes_uces.csv') - - with open(formes_out2, 'w') as f : - f.write('\n'.join([';'.join([forme, `self.formes[forme][0]`, self.formes[forme][2], `self.formes[forme][3]`]) for forme in self.formes])) - with open(formes_uces, 'w') as f: - f.write('\n'.join([' '.join([' '.join([`uce[0]`,`uce[1]`, `uce[2]`, `self.formes[forme][1][uce]`]) for uce in self.formes[forme][1]]) for forme in self.formes])) - #print time() - t1 - #t1 = time() - #toprint = json.dumps(self.formes) - #with open(os.path.join(os.path.dirname(db), 'json.db'), 'w') as f: - # f.write(toprint) - #print time() - t2 - -# import sqlite3 -# db_out = os.path.join(os.path.dirname(db), 'formes.db') -# conn = sqlite3.connect(db_out) -# c = conn.cursor() -# conn.text_factory = str -# c = conn.cursor() -# c.execute('''CREATE TABLE formes (formes TEXT, freq integer, uces TEXT, type TEXT, identifiant integer)''') -# c = conn.cursor() -# for formes in self.formes : -# c.execute('INSERT INTO formes values (?,?,?,?,?)', (formes, self.formes[formes][0], ';'.join([':'.join([str(uce), str(self.formes[formes][1][uce])]) for uce in self.formes[formes][1]]), self.formes[formes][2], self.formes[forme][3])) -# conn.commit() -# print 'fin sql' - - def make_len_uce(self, nbtotoc): - if self.parametre['nbforme_uce'] == None or self.parametre['nbforme_uce'] == 0 : - #FIXME - if len(self.ucis) == 1: - self.parametre['eff_min_uce'] = 30 - elif 200000 <= nbtotoc < 400000: - self.parametre['eff_min_uce'] = (0.0016 * float(nbtotoc) / float(len(self.ucis))) + 20 - elif nbtotoc < 200000: - self.parametre['eff_min_uce'] = (0.0016 * float(nbtotoc) / float(len(self.ucis))) + 30 - else: - self.parametre['eff_min_uce'] = (float(nbtotoc) / float(len(self.ucis))) / float(15) - else : - self.parametre['eff_min_uce'] = self.parametre['nbforme_uce'] - # print 'ATTENTION ASSIGNATION DE LA TAILLE DES UCE' - # self.lenuce = 44 - - - def quick_clean1(self) : - print 'quick clean' - self.content = self.content.lower() - keep_caract = u"a-zA-Z0-9à ÃâÃäÃáÃéÃèÃêÃëÃìÃîÃïÃòÃôÃöÃùÃûÃüÃçÃÃâñ.:,;!?\n*'_-" - list_keep = u"[^" + keep_caract + "]+" -# print 'NETTOYAGE CABLE PLUS SUB' - #print ('#########ATTENTION CHINOIS plus keep_caract#################') - #list_keep = u"[;]+" - self.content = re.sub(list_keep, ' ', self.content) - #self.content = re.sub(list_keep, ' ', self.content) - - #self.content = self.content.replace(u'[â]+', '\'') - self.content = re.sub(u'[â]+', '\'', self.content) - self.content = re.sub(u'[\r\n]+', '\n', self.content) - self.content = self.content.replace(u'-*',u'#*') - - def find_expression(self,expressions) : - print 'find expression' - for expression in expressions: - if expression in self.content : - print expression, expressions[expression][0] - #self.content = self.content.replace(' '+expression+' ', ' '+expressions[expression][0]+' ') - self.content = self.content.replace(expression, expressions[expression][0]) + if word in self.parent.lexique : + gramtype = self.parent.lexique[word][1] + lem = self.parent.lexique[word][0] + elif word.isdigit() : + gramtype = u'num' + lem = word + else : + gramtype = u'nr' + lem = word + self.formes[word] = Word(word, gramtype, len(self.formes), lem) + self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1} - def quick_clean2(self): - print 'quick clean 2' - self.content = self.content.replace('\'',' ') - self.content = re.sub(u'[-]+', ' ', self.content) - self.content = re.sub(u'[ ]+', ' ', self.content) - self.content = self.content.splitlines() + def add_word_from_forme(self, word, stident): + if word.forme in self.formes : + self.formes[word.forme].freq += 1 + if self.formes[word.forme].ident in self.idformesuces : + if stident in self.idformesuces[self.formes[word.forme].ident] : + self.idformesuces[self.formes[word.forme].ident][stident] += 1 + else : + self.idformesuces[self.formes[word.forme].ident][stident] = 1 + else : + self.idformesuces[self.formes[word.forme].ident] = {stident: 1} + else : + self.formes[word.forme] = Word(word.forme, word.gram, len(self.formes), word.lem) + self.idformesuces[self.formes[word.forme].ident] = {stident : 1} + + def conn_all(self): + """connect corpus to db""" + if self.connformes is None : + log.info('connexion corpus') + self.connuces = sqlite3.connect(self.pathout['uces.db']) + self.cuces = self.connuces.cursor() + self.connformes = sqlite3.connect(self.pathout['formes.db']) + self.cformes = self.connformes.cursor() + self.conncorpus = sqlite3.connect(self.pathout['corpus.db']) + self.ccorpus = self.conncorpus.cursor() + self.cformes.execute('PRAGMA temp_store=MEMORY;') + self.cformes.execute('PRAGMA journal_mode=MEMORY;') + self.cformes.execute('PRAGMA synchronous = OFF;') + self.cuces.execute('PRAGMA temp_store=MEMORY;') + self.cuces.execute('PRAGMA journal_mode=MEMORY;') + self.cuces.execute('PRAGMA synchronous = OFF;') + self.ccorpus.execute('PRAGMA temp_store=MEMORY;') + self.ccorpus.execute('PRAGMA journal_mode=MEMORY;') + self.ccorpus.execute('PRAGMA synchronous = OFF;') + + def read_corpus(self) : + log.info('read corpus') + self.parametres['syscoding'] = sys.getdefaultencoding() + if self.conncorpus is None : + self.conn_all() + res = self.ccorpus.execute('SELECT * FROM etoiles;') + for row in res : + self.ucis.append(Uci(row[0], row[1], row[2])) + uces = self.conncorpus.cursor().execute('SELECT * FROM luces where uci=?;',(`self.ucis[-1].ident`,)) + for uce in uces: + self.ucis[-1].uces.append(Uce(uce[2], uce[1], uce[0])) + res = self.ccorpus.execute('SELECT * FROM formes;') + self.formes = dict([[forme[1], Word(forme[1], forme[3], forme[0], lem = forme[2], freq = forme[4])] for forme in res]) + self.ccorpus.close() - def make_ucis(self) : - print 'make_ucis' - self.ucis = [[self.content[i].strip().split(),i] for i in range(0,len(self.content)) if self.content[i].startswith(u'****')] - return [a[1] for a in self.ucis] + def getworduces(self, wordid) : + if isinstance(wordid, basestring) : + wordid = self.formes[wordid].ident + res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`wordid`,)) + return list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) - def find_uci_with_digit(self, line) : - if line[0:4].isdigit() and u'*' in line : - return True - else : - return False + def getworducis(self, wordid) : + res = self.getworduces(wordid) + return list(set([self.getucefromid(uce).uci for uce in res])) + + def getformeuceseff(self, formeid) : + if isinstance(formeid, basestring) : + formeid = self.formes[formeid].ident + res = self.cformes.execute('SELECT uces FROM uces where id=? ORDER BY id;', (`formeid`,)) + uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + query = 'SELECT eff FROM eff where id=%i ORDER BY id' % formeid + res = self.cformes.execute(query) + eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + formeuceeff = {} + for i, uce in enumerate(uces) : + formeuceeff[uce] = formeuceeff.get(uce, 0) + eff[i] + return formeuceeff + + def getlemuces(self, lem) : + formesid = ', '.join([`val` for val in self.lems[lem].formes]) + query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) - def make_ucis_with_digit(self) : - self.ucis = [[self.content[i].replace('\n',' ').strip().split(),i] for i in range(0,len(self.content)) if self.find_uci_with_digit(self.content[i])] - return [a[1] for a in self.ucis] + def gettgenst(self, tgen): + formesid = [] + for lem in tgen : + if lem in self.lems : + formesid += self.lems[lem].formes + else : + print 'abscent : %s' % lem + query = 'SELECT uces FROM uces where id IN %s ORDER BY id' % str(tuple(formesid)) + res = self.cformes.execute(query) + return list(set(list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])))) + + def gettgenstprof(self, tgen, classe, i, clnb): + tgenst = [] + for lem in tgen : + if lem in self.lems : + lemst = self.getlemuces(lem) + tgenst += lemst + if not lem in self.tgenlem : + self.tgenlem[lem] = [0] * clnb + self.tgenlem[lem][i] = len(set(lemst).intersection(classe)) + else : + print 'abscent: ',lem + return list(set(tgenst)) - def make_lines(self, ucinb) : - print 'make_lines' - return [[ucinb[i]+1,ucinb[i+1]] for i in range(0,len(ucinb)-1)] + [[ucinb[len(ucinb)-1] + 1,len(self.content)]] + def gettgentxt(self, tgen): + sts = self.gettgenst(tgen) + return list(set([self.getucefromid(val).uci for val in sts])) + + def getlemucis(self, lem) : + uces = self.getlemuces(lem) + return list(set([self.getucefromid(val).uci for val in uces])) + + def getlemuceseff(self, lem, luces = None) : + formesid = ', '.join([`val` for val in self.lems[lem].formes]) + query = 'SELECT uces FROM uces where id IN (%s) ORDER BY id' % formesid + res = self.cformes.execute(query) + uces = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + query = 'SELECT eff FROM eff where id IN (%s) ORDER BY id' % formesid + res = self.cformes.execute(query) + eff = list(itertools.chain(*[[int(val) for val in row[0].split()] if not isinstance(row[0], int) else [row[0]] for row in res])) + lemuceeff = {} + for i, uce in enumerate(uces) : + lemuceeff[uce] = lemuceeff.get(uce, 0) + eff[i] + return lemuceeff + + def getlemclustereff(self, lem, cluster) : + return len(list(set(self.lc[cluster]).intersection(self.getlemuces(lem)))) + + def getlemeff(self, lem) : + return self.lems[lem].freq + + def getlems(self) : + return self.lems + + def getforme(self, formeid) : + if self.idformes is None : self.make_idformes() + return self.idformes[formeid] + + def gettotocc(self) : + return sum([self.formes[forme].freq for forme in self.formes]) + + def getucemean(self) : + return float(self.gettotocc())/self.getucenb() + + def getucenb(self) : + return self.ucis[-1].uces[-1].ident + 1 + + def getucinb(self) : + return self.ucis[-1].ident + 1 + + def getucisize(self) : + ucesize = self.getucesize() + return [sum(ucesize[uci.uces[0].ident:(uci.uces[-1].ident + 1)]) for uci in self.ucis] - def make_ucis_words(self, lines): - print 'make ucis_words' - return [' '.join(self.content[l[0]:l[1]]).lower().replace(u'\'','\' ').replace(u'â','\' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').strip().split() for l in lines] + def getucesize(self) : + res = self.getalluces() + return [len(uce[1].split()) for uce in res] + + def getconcorde(self, uces) : + return self.cuces.execute('select * from uces where id IN (%s) ORDER BY id;' % ', '.join([`i` for i in uces])) - def make_ucis_txt(self, lines): - print 'make ucis_txt' - return [' '.join(self.content[l[0]:l[1]]).lower().replace(u'\'','\' ').replace(u'â','\' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':', ' : ').strip() for l in lines] + def getuciconcorde(self, ucis) : + uces = [[val,[uce.ident for uce in self.ucis[val].uces]] for val in ucis] + uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces] + return uces + + def getwordconcorde(self, word) : + return self.getconcorde(self.getworduces(word)) + + def getlemconcorde(self, lem) : + return self.getconcorde(self.getlemuces(lem)) + + def getalluces(self) : + return self.cuces.execute('SELECT * FROM uces') - def make_ucis_lines(self, lines) : - print 'make ucis lines' - return [self.content[l[0]:l[1]] for l in lines] + def getallucis(self): + uces = [row[1] for row in self.getalluces()] + return [[uci.ident, '\n'.join([uces[uce.ident] for uce in uci.uces])] for uci in self.ucis] - def make_para_coords(self, ucis_lines): - print 'make para coords' - return [[[uci[i].split()[0], i] for i in range(0,len(uci)) if uci[i].startswith(u'#*')] for uci in ucis_lines] - - def make_ucis_paras_txt(self, para_coords, ucis_lines, ucis_txt) : - print 'make_ucis_paras_txt' - if para_coords != [[] for val in para_coords] : - paranb = [[para[1] for para in uci] for uci in para_coords] - paras = [] - #print 'len paranb', len(paranb) - #print len(self.ucis) - for i, uci in enumerate(paranb) : - uciline = ucis_lines[i] - #print uci - #print i - #print uciline - #print uci[i] - para = [[uci[i]+1, uci[i+1]] for i in range(0,len(uci)-1)] - para.append([uci[len(uci)-1]+1, len(uciline) ]) - paras.append(para) - self.parametre['para'] = True - return [[' '.join(ucis_lines[nb][l[0]:l[1]]).lower().replace(u'\'','\' ').replace(u'â','\' ').replace('...',u' £ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').strip() for l in paras[nb]] for nb in range(0,len(paras))] - else : - print '############pas de para####################' - self.parametre['para'] = False - return [[val] for val in ucis_txt] - - def make_ucis_paras_txt_phrases(self, para_coords, ucis_lines, ucis_txt) : - print 'make_ucis_paras_txt' - if para_coords != [[] for val in para_coords] : - paranb = [[para[1] for para in uci] for uci in para_coords] - paras = [] - for i, uci in enumerate(paranb) : - uciline = ucis_lines[i] - para = [[uci[i]+1, uci[i+1]] for i in range(0,len(uci)-1)] - para.append([uci[len(uci)-1]+1, len(uciline) ]) - paras.append(para) - self.parametre['para'] = True - return [[' '.join(ucis_lines[nb][l[0]:l[1]]).lower().replace(u'\'','\' ').replace(u'â','\' ').strip() for l in paras[nb]] for nb in range(0,len(paras))] - else : - print '############pas de para####################' - self.parametre['para'] = False - return [[val] for val in ucis_txt] - - def make_ucis_paras_uces_sentences(self, ucis_paras_txt, make_uce = True) : - print 'make_ucis_paras_sentences' - ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':',''] - tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() - self.ucis_paras_uces = [] - for i, uci in enumerate(ucis_paras_txt) : - self.ucis_paras_uces.append([]) - for j, para in enumerate(uci) : - sentences = tokenizer.tokenize(para) - sentences = [[val.strip() for val in sent.strip().replace('...',u'£').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').split() if val.strip() not in ponctuation_espace] for sent in sentences] - self.ucis_paras_uces[i].append(sentences) - - def get_tot_occ_from_ucis_txt(self, ucis_txt): - print 'get_occ' - ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':',''] - return sum([len([val for val in uci.split() if val.strip() not in ponctuation_espace]) for uci in ucis_txt]) - - def decouper_para(self, txt, listeSeparateurs, ls) : - i = 0 - meilleur = ['', 0, 0] - if len(txt) <= self.parametre['eff_min_uce'] : - return False, txt, [] - else : - while i <= self.parametre['eff_min_uce'] : - rapport = abs(self.parametre['eff_min_uce'] - i) + 1 - forme = txt[i] - if forme in ls and i != 0 : - poids = float(listeSeparateurs[ls.index(forme)][1]) / float(rapport) - elif i!=0 : - poids = 0.1/float(rapport) + def getucesfrometoile(self, etoile) : + return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles] + + def getetoileuces(self) : + log.info('get uces etoiles') + etoileuces = {} + idpara = 0 + for uci in self.ucis : + etoiles = uci.etoiles[1:] + for et in etoiles : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces] else : - poids = 0 - if poids >= meilleur[1] : - meilleur[0] = forme - meilleur[1] = poids - meilleur[2] = i - i += 1 - if meilleur[0] in ls : - return True, txt[:meilleur[2]],txt[meilleur[2] + 1:] + etoileuces[et] = [uce.ident for uce in uci.uces] + if uci.paras != [] : + for et in uci.paras : + if et in etoileuces : + etoileuces[et] += [uce.ident for uce in uci.uces if uce.para == idpara] + else : + etoileuces[et] = [uce.ident for uce in uci.uces if uce.para == idpara] + idpara += 1 else : - return True, txt[:meilleur[2]],txt[meilleur[2]:] - - def make_ucis_paras_uces(self, ucis_paras_txt, make_uce = True) : - print 'make_ucis_paras_uces' - ponctuation_espace = [' ','.', u'£', ';', '?', '!', ',', ':',''] - listeSeparateurs = [[u'.', 60.0], [u'?', 60.0], [u'!', 60.0], [u'£', 60], [u':', 50.0], [u';', 40.0], [u',', 10.0], [u' ', 0.1]] - if make_uce : - print 'decoupage uce' - taille_uce = self.parametre['eff_min_uce'] -# print 'plus de recomptage UCE' - self.ucis_paras_uces = [] - for i, uci in enumerate(ucis_paras_txt) : - self.ucis_paras_uces.append([]) - for j, para in enumerate(uci) : - #print '###########ATTENTION CHINOIS para to list################' - #para = ' '.join(list(para)) - self.ucis_paras_uces[i].append([]) - reste, texte_uce, suite = decouper(para+u'$', 250, 240, listeSeparateurs) - while reste : - uce = [val.strip() for val in texte_uce.strip().split() if val.strip() not in ponctuation_espace] - self.ucis_paras_uces[i][j].append(uce) - reste, texte_uce, suite = decouper(suite, 250, 240, listeSeparateurs) - newpara = [] - nuce = [] - for uce in self.ucis_paras_uces[i][j] : - nuce += uce - if len(nuce)>=taille_uce: - newpara.append(nuce) - nuce = [] - if nuce != [] : - #FIXME ??? - if len(nuce) >= 5 : - newpara.append(nuce) - else : - if newpara != [] : - newpara[-1] += nuce - else : - newpara.append(nuce) - self.ucis_paras_uces[i][j] = newpara - else : - self.ucis_paras_uces = [[[[val.strip() for val in para.strip().split() if val not in ponctuation_espace]] for para in uci] for uci in ucis_paras_txt] - -# def feed_dict(self, val, i, j, k, id) : -# if val in self.formes : -# self.formes[val][0] +=1 -# self.formes[val][1].append([i,j,k]) -# else : -# if val in self.parent.lexique : -# type_forme = self.parent.lexique[val][1] + idpara += 1 + return etoileuces + + def getetoileucis(self): + etoileuces = {} + for uci in self.ucis : + etoiles = uci.etoiles[1:] + for et in etoiles : + if et in etoileuces : + etoileuces[et] += [uci.ident] + else : + etoileuces[et] = [uci.ident] + return etoileuces + + def getucefromid(self, uceid) : + if self.iduces is None : self.make_iduces() + return self.iduces[uceid] + + def gethapaxnb(self) : + return len([None for forme in self.formes if self.formes[forme].freq == 1]) + + def getactivesnb(self, key) : + return len([lem for lem in self.lems if self.lems[lem].act == key]) +# def make_lems(self, lem = True) : +# log.info('make lems') +# self.lems = {} +# for forme in self.formes : +# if self.formes[forme].lem in self.lems : +# if self.formes[forme].ident not in self.lems[self.formes[forme].lem] : +# self.lems[self.formes[forme].lem][self.formes[forme].ident] = 0 # else : -# if val.isdigit(): -# type_forme = 'num' -# else : -# type_forme = 'nr' -# self.formes[val] = [1, [[i,j,k]], type_forme, id] - def feed_dict_big(self, val, ucinb) : - if val in self.formes : - self.formes[val][0] +=1 - if ucinb in self.formes[val][1] : - self.formes[val][1][ucinb] += 1 - else : - self.formes[val][1][ucinb] = 1 - #self.formes[val][1].append([i,j,k]) - else : - if val in self.parent.lexique : - type_forme = self.parent.lexique[val][1] - else : - if val.isdigit(): - type_forme = 'num' +# self.lems[self.formes[forme].lem] = {self.formes[forme].ident : 0} + + def getetbyuceid(self, uceid) : + if self.uceuci is None : self.uceuci = dict([[uce.ident,uci.ident] for uci in self.ucis for uce in uci.uces]) + return self.ucis[self.uceuci[uceid]].etoiles + + def make_lems(self, lem = True) : + log.info('make lems') + self.lems = {} + if lem : + for forme in self.formes : + if self.formes[forme].lem in self.lems : + if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes : + self.lems[self.formes[forme].lem].add_forme(self.formes[forme]) else : - type_forme = 'nr' - self.formes[val] = [1, {ucinb: 1}, type_forme] - - def feed_dict(self, val, i, j, k, id) : - if val in self.formes : - self.formes[val][0] +=1 - if (i,j,k) in self.formes[val][1] : - self.formes[val][1][(i,j,k)] += 1 - else : - self.formes[val][1][(i,j,k)] = 1 - #self.formes[val][1].append([i,j,k]) + self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme]) else : - if val in self.parent.lexique : - type_forme = self.parent.lexique[val][1] + self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes]) + + def make_lems_from_dict(self, dictionnaire, dolem = True) : + log.info('make lems from dict') + self.lems = {} + for forme in self.formes : + if self.formes[forme].forme in dictionnaire : + lem = dictionnaire[forme][0] + gram = dictionnaire[forme][1] + elif forme.isdigit() : + gram = u'num' + lem = forme else : - if val.isdigit(): - type_forme = 'num' + gram = u'nr' + lem = forme + self.formes[forme].lem = lem + self.formes[forme].gram = gram + if dolem : + if self.formes[forme].lem in self.lems : + if self.formes[forme].ident not in self.lems[self.formes[forme].lem].formes : + self.lems[self.formes[forme].lem].add_forme(self.formes[forme]) else : - type_forme = 'nr' - self.formes[val] = [1, {(i,j,k): 1}, type_forme, id] - - def check_uce_et(self) : - return [[forme, self.formes[forme][1]] for forme in self.formes if forme.startswith('_') and forme.endswith('_')] - - def make_forms_and_uces(self) : - print 'make forms and uces' - uces = {} - orderuces = {} - compt = 0 - for i, uci in enumerate(self.ucis_paras_uces) : - for j, para in enumerate(uci) : - for k, uce in enumerate(para) : - ijk = (i,j,k)#'.'.join([`i`,`j`,`k`]) - orderuces[ijk] = compt - compt += 1 - if uce != [] : - for word in uce : - id = len(self.formes) - self.feed_dict(word, i, j, k, id) - #FIXME pas la bonne facon de compter la taille des uces - #passer par self.formes et self.lems - if ijk in uces and self.formes[word][2] in self.typeactive : - uces[ijk] += 1 - elif ijk not in uces and self.formes[word][2] in self.typeactive : - uces[ijk] = 1 - elif ijk not in uces : - uces[ijk] = 0 - else : - uces[ijk] = 0 - self.etintxt = self.check_uce_et() - for forme in self.etintxt : - del(self.formes[forme[0]]) - return uces, orderuces - - def min_eff_formes(self) : - if not self.parametre['lem'] : - lformes = [self.formes[forme][0] for forme in self.formes if self.formes[forme][2] in self.typeactive] - if len(lformes) <= self.parametre['max_actives'] : - self.parametre['eff_min_forme'] = 3 + self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme]) else : - lformes.sort(reverse = True) - self.parametre['eff_min_forme'] = lformes[self.parametre['max_actives']] - print self.parametre['eff_min_forme'] + self.lems[forme] = Lem(self, self.formes[forme]) + + def make_idformes(self) : + self.idformes = dict([[self.formes[forme].ident, self.formes[forme]] for forme in self.formes]) + + def make_iduces(self) : + if self.iduces is None : + self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces]) + + def make_lexitable(self, mineff, etoiles, gram = 0) : + if gram == 0 : + grams = {1:'', 2:''} else : - lems = self.make_lem_eff() - llems = [lems[lem][0] for lem in lems if lems[lem][2] in self.typeactive] - if len(llems) <= self.parametre['max_actives'] : - self.parametre['eff_min_forme'] = 3 - else : - llems.sort(reverse = True) - self.parametre['eff_min_forme'] = llems[self.parametre['max_actives']] - print self.parametre['eff_min_forme'] + grams = {gram :''} + tokeep = [lem for lem in self.lems if self.lems[lem].freq >= mineff and self.lems[lem].act in grams] + etuces = [[] for et in etoiles] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(etoiles)) + if len(get) > 1 : + log.info('2 variables sur une ligne') + if get != [] : + etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] + etuces = [set(val) for val in etuces] + tab = [] + for lem in tokeep : + deff = self.getlemuceseff(lem) + ucesk = deff.keys() + line = [lem] + [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + if sum(line[1:]) >= mineff : + tab.append(line) + tab.insert(0, [''] + etoiles) + return tab - def make_lems(self, lexique) : - if self.parametre['lem'] : - print 'lemmatsation' - for word in self.formes : - if word in lexique : - if lexique[word][0] in self.lems : - self.lems[lexique[word][0]].append(word) - else : - self.lems[lexique[word][0]] = [word] - else : - if word in self.lems : - self.lems[word].append(word) - else : - self.lems[word] = [word] + def make_tgen_table(self, tgen, etoiles, tot = None): + lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles] + sets = [set(cl) for cl in lclasses] + totoccurrences = dict([[val, 0] for val in etoiles]) + if tot is None : + for forme in self.formes : + formeuceeff = self.getformeuceseff(forme) + for i, classe in enumerate(lclasses) : + concern = sets[i].intersection(formeuceeff.keys()) + if len(concern) : + totoccurrences[etoiles[i]] += sum([formeuceeff[uce] for uce in concern]) + #tgenoccurrences = dict([[val, 0] for val in etoiles]) + tgenoccurrences = {} + for t in tgen.tgen : + tgenoccurrences[t] = dict([[val, 0] for val in etoiles]) + for lem in tgen[t] : + lemuceeff = self.getlemuceseff(lem) + for i, classe in enumerate(lclasses) : + concern = sets[i].intersection(lemuceeff.keys()) + if len(concern) : + tgenoccurrences[t][etoiles[i]] += sum([lemuceeff[uce] for uce in concern]) + return tgenoccurrences, totoccurrences + + def make_tgen_profile(self, tgen, ucecl, uci = False) : + log.info('tgen/classes') + self.tgenlem = {} + clnb = len(ucecl) + if uci : + #FIXME : NE MARCHE PLUS CHANGER CA + tab = [[lem] + [len(set(self.gettgentxt(tgen[lem])).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] else : - print 'pas de lemmatisation : lems = formes' - for word in self.formes : - self.lems[word] = [word] - - def make_lem_eff(self) : - print 'make lem eff' - lems = {} + tab = [[lem] + [len(set(self.gettgenstprof(tgen[lem], classe, i, clnb)).intersection(classe)) for i, classe in enumerate(ucecl)] for lem in tgen] + tab = [[line[0]] + [val for val in line[1:]] for line in tab if sum(line[1:]) >= 3] + return tab + #i = 0 + #nam = 'total' + #while nam + `i` in tgen : + # i += 1 + #nam = nam + `i` + #last = [nam] + [`len(classe)` for classe in ucecl] + #tab += [last] + #line0 = ['tgen'] + ['_'.join(['cluster', `i+1`]) for i in range(len(ucecl))] + #tab = [line0] + tab + #with open(fileout, 'w') as f : + # f.write('\n'.join(['\t'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + + def make_efftype_from_etoiles(self, etoiles) : + dtype = {} + etuces = [[] for et in etoiles] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(etoiles)) + if len(get) > 1 : + return '2 variables sur la meme ligne' + elif get != [] : + etuces[etoiles.index(get[0])] += [uce.ident for uce in uci.uces] + etuces = [set(val) for val in etuces] for lem in self.lems : - lems[lem] = [sum([self.formes[word][0] for word in self.lems[lem]]), self.lems[lem], self.formes[self.lems[lem][0]][2]] - return lems + deff = self.getlemuceseff(lem) + ucesk = deff.keys() + gram = self.lems[lem].gram + if gram in dtype : + dtype[gram] = [i + j for i, j in zip(dtype[gram], [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces])] + else : + dtype[gram] = [sum([deff[uce] for uce in et.intersection(ucesk)]) for et in etuces] + tabout = [[gram] + dtype[gram] for gram in dtype] + tabout.insert(0, [''] + etoiles) + return tabout - def make_lexique(self) : - print 'make lexique' - self.lexique = {} - for lem in self.lems : - for forme in self.lems[lem] : - self.lexique[forme] = lem + def make_uceactsize(self, actives) : + res = self.getalluces() + ucesize = {} + for lem in actives: + deff = self.getlemuceseff(lem) + for uce in deff : + ucesize[uce] = ucesize.get(uce, 0) + 1 + return ucesize + + def make_uc(self, actives, lim1, lim2) : + uceactsize = self.make_uceactsize(actives) + last1 = 0 + last2 = 0 + uc1 = [[]] + uc2 = [[]] + lastpara = 0 + for uce in [uce for uci in self.ucis for uce in uci.uces] : + if uce.para == lastpara : + if last1 <= lim1 : + last1 += uceactsize.get(uce.ident,0) + uc1[-1].append(uce.ident) + else : + uc1.append([uce.ident]) + last1 = 0 + if last2 <= lim2 : + last2 += uceactsize.get(uce.ident, 0) + uc2[-1].append(uce.ident) + else : + uc2.append([uce.ident]) + last2 = 0 + else : + last1 = uceactsize.get(uce.ident, 0) + last2 = uceactsize.get(uce.ident, 0) + lastpara = uce.para + uc1.append([uce.ident]) + uc2.append([uce.ident]) + return uc1, uc2 + + def make_and_write_sparse_matrix_from_uc(self, actives, sizeuc1, sizeuc2, uc1out, uc2out, listuce1out, listuce2out) : + uc1, uc2 = self.make_uc(actives, sizeuc1, sizeuc2) + log.info('taille uc1 : %i - taille uc2 : %i' % (len(uc1), len(uc2))) + self.write_ucmatrix(uc1, actives, uc1out) + self.write_ucmatrix(uc2, actives, uc2out) + listuce1 = [['uce', 'uc']] + [[`uce`, `i`] for i, ucl in enumerate(uc1) for uce in ucl] + listuce2 = [['uce', 'uc']] + [[`uce`, `i`] for i, ucl in enumerate(uc2) for uce in ucl] + with open(listuce1out, 'w') as f : + f.write('\n'.join([';'.join(line) for line in listuce1])) + with open(listuce2out, 'w') as f : + f.write('\n'.join([';'.join(line) for line in listuce2])) + return len(uc1), len(uc2) + + def write_ucmatrix(self, uc, actives, fileout) : + log.info('write uc matrix %s' % fileout) + uces_uc = dict([[uce, i] for i, ucl in enumerate(uc) for uce in ucl]) + deja_la = {} + nbl = 0 + with open(fileout + '~', 'w+') as f : + for i, lem in enumerate(actives) : + for uce in self.getlemuces(lem): + if (uces_uc[uce], i) not in deja_la : + nbl += 1 + f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n'])) + deja_la[(uces_uc[uce], i)] = 0 + f.seek(0) + with open(fileout, 'w') as ffin : + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uc), len(actives), nbl)) + for line in f : + ffin.write(line) + os.remove(fileout + '~') + del(deja_la) + + def export_corpus(self, outf) : + #outf = 'export_corpus.txt' + self.make_iduces() + res = self.getalluces() + self.make_iduces() + actuci = '' + actpara = False + with open(outf,'w') as f : + for uce in res : + if self.iduces[uce[0]].uci == actuci and self.iduces[uce[0]].para == actpara : + f.write(uce[1].encode(self.parametres['syscoding']) + '\n') + elif self.iduces[uce[0]].uci != actuci : + actuci = self.iduces[uce[0]].uci + if self.ucis[self.iduces[uce[0]].uci].paras == [] : + actpara = self.iduces[uce[0]].para + f.write('\n' + ' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n' + uce[1].encode(self.parametres['syscoding']) + '\n') + else : + ident = 0 + actpara = self.iduces[uce[0]].para + f.write('\n'.join([' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']), self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') + elif self.iduces[uce[0]].para != actpara : + actpara = self.iduces[uce[0]].para + ident += 1 + f.write('\n'.join([self.ucis[self.iduces[uce[0]].uci].paras[ident].encode(self.parametres['syscoding']), uce[1].encode(self.parametres['syscoding'])]) + '\n') -# def return_lem(self, word) : -# if word in self.lexique : -# return self.lexique[word] -# else : -# return word + def export_meta_table(self, outf) : + metas = [[`i`] + text.etoiles[1:] for i, text in enumerate(self.ucis)] + longueur_max = max([len(val) for val in metas]) + first = ['column_%i' % i for i in range(longueur_max)] + metas.insert(0, first) + with open(outf, 'w') as f : + f.write('\n'.join(['\t'.join(line) for line in metas]).encode(self.parametres['syscoding'])) - def make_ucis_paras_uces_lems(self): - print 'make_ucis_paras_uces_lems' - if self.lexique is None : - self.make_lexique() - return [[[[self.lexique.get(word, word) for word in uce] for uce in para] for para in uci] for uci in self.ucis_paras_uces] + def export_corpus_classes(self, outf, alc = True, lem = False, uci = False) : + ucecl = {} + for i, lc in enumerate(self.lc) : + for uce in lc : + ucecl[uce] = i + 1 + for uce in self.lc0 : + ucecl[uce] = 0 + if not uci : + res = self.getalluces() + self.make_iduces() + else : + res = self.getallucis() + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + if not uci : + actuci = self.iduces[uce[0]].uci + else : + actuci = uce[0] + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + if alc : + etline = ' '.join(self.ucis[actuci].etoiles + ['*classe_%i' % ucecl[uce[0]]]) + else : + etline = ' '.join(['<' + '='.join(et.split('_')) + '>' for et in self.ucis[actuci].etoiles[1:]]) + f.write(etline.encode(self.parametres['syscoding']) + '\n') + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') + + def export_classe(self, outf, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + with open(outf, 'w') as f : + for uce in res : + guce = uce[1] + if not uci : + f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n') + else : + f.write(' '.join(self.ucis[uce[0]].etoiles).encode(self.parametres['syscoding']) + '\n') + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode(self.parametres['syscoding']) + '\n\n') - def make_var_actives(self) : - print 'creation liste act' - self.actives = [word for word in self.lems if self.formes[self.lems[word][0]][2] in self.typeactive and sum([self.formes[mot][0] for mot in self.lems[word]]) > self.parametre['eff_min_forme']] - - def make_var_supp(self) : - print 'creation var supp' - self.supp = [word for word in self.lems if self.formes[self.lems[word][0]][2] in self.supplementaires and sum([self.formes[mot][0] for mot in self.lems[word]]) > self.parametre['eff_min_forme']] - - def make_and_write_sparse_matrix_from_uci(self, fileout) : - print 'make_and_write_sparse_martrix_from_uci' - with open(fileout+'~', 'w') as f : - for i, lem in enumerate(self.actives) : - ucis = list(set([uce[0] for form in self.lems[lem] for uce in self.formes[form][1]])) - ucis.sort() - for uci in ucis : - f.write(''.join([' '.join([`uci+1`,`i+1`,`1`]),'\n'])) - with open(fileout+'~', 'r') as f : - old = f.read() - f.seek(0) - for i, line in enumerate(f) : - pass - nrow = i + 1 - with open(fileout, 'w') as f : - txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(self.ucis), len(self.actives), nrow) - f.write(txt + old) - os.remove(fileout+'~') - - - def make_pondtable_with_uci(self, lformes, fileout) : - table_uci = [[0 for val in lformes] for line in range(0,len(self.ucis))] - for i, lem in enumerate(lformes) : - for form in self.lems[lem] : - ucit = [val for val in self.formes[form][1]] - for uci in ucit : - table_uci[uci[0]][i] += self.formes[form][1][uci] - table_uci = [[str(val) for val in line] for line in table_uci] - table_uci.insert(0,lformes) - with open(fileout, 'w') as f : - f.write('\n'.join([';'.join(line) for line in table_uci])) - del table_uci - - def make_tableet_with_uci(self, fileout) : - et = self.get_unique_etoiles() - table_out = [[0 for val in et] for line in range(0,len(self.ucis))] - for i, uci in enumerate(self.etoiles) : - for valet in uci[0][0] : - table_out[i][et.index(valet)] = 1 - table_out = [[str(val) for val in line] for line in table_out] - table_out.insert(0,et) + def export_owledge(self, rep, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) + for uce in res : + ident = uce[0] + guce = uce[1] + outf = '.'.join([`ident`, 'txt']) + outf = os.path.join(rep, outf) + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + with open(outf, 'w') as f : + f.write(guce.encode('cp1252', errors = 'replace')) + + def export_tropes(self, fileout, classe, lem = False, uci = False) : + sts = self.lc[classe - 1] + if not uci : + res = self.getconcorde(sts) + self.make_iduces() + else : + res = self.getuciconcorde(sts) with open(fileout, 'w') as f : - f.write('\n'.join([';'.join(line) for line in table_out])) - del table_out - - def make_table_with_uce(self, orderuces) : - print 'make_table_with_uce' - #print self.ucenb - table_uce = [[0 for val in self.actives] for line in range(0, len(orderuces))] - for i, lem in enumerate(self.actives) : - for form in self.lems[lem] : - for uce in self.formes[form][1] : - #ijk = '.'.join([str(val) for val in uce]) - table_uce[orderuces[uce]][i] = 1 - return table_uce - -# def make_sparse_matrix_with_uce(self, orderuces) : -# print 'make_sparse_matrix_with_uce' -# smat = [] -# for i, lem in enumerate(self.actives) : -# for form in self.lems[lem] : -# for uce in self.formes[form][1] : -# #ijk = '.'.join([str(val) for val in uce]) -# smat.append((`orderuces[uce]+1`,`i+1`,`1`)) -# smat = list(set(smat)) -# smat.sort() -# return smat -# -# def write_sparse_matrix(self, fileout, smat, nrow, ncol) : -# print 'write_sparse_matrix' -# txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( nrow, ncol, len(smat)) -# with open(fileout, 'w') as f : -# f.write(txt+'\n'.join([' '.join(line) for line in smat])) - - def make_and_write_sparse_matrix_from_uce(self, orderuces, fileout) : - print 'make_and_write_sparse_martrix_from_uce' - with open(fileout+'~', 'w') as f : - for i, lem in enumerate(self.actives) : - uces = list(set([uce for form in self.lems[lem] for uce in self.formes[form][1]])) - for uce in uces : - f.write(''.join([' '.join([`orderuces[uce]+1`,`i+1`,`1`]),'\n'])) - - with open(fileout+'~', 'r') as f : - old = f.read() + for uce in res : + guce = uce[1] + if lem : + guce = ' '.join([self.formes[forme].lem for forme in guce.split()]) + f.write(guce.encode('cp1252', errors = 'replace')) + f.write('\n') + + def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) : + log.info('make_and_write_sparse_matrix_from_uces %s' % outfile) + nbl = 0 + with open(outfile + '~', 'w+') as f : + for i, lem in enumerate(actives) : + for uce in sorted(self.getlemuces(lem)) : + nbl += 1 + f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n'])) f.seek(0) - for i, line in enumerate(f) : - pass - nrow = i + 1 - with open(fileout, 'w') as f : - txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(orderuces), len(self.actives), nrow) - f.write(txt + old) - os.remove(fileout+'~') - - def make_and_write_sparse_matrix_from_uce_list(self, listin, fileout) : - print 'make_and_write_sparse_martrix_from_uce' - orderuces = [(i,j,k) for i, uci in enumerate(self.ucis_paras_uces) for j, para in enumerate(uci) for k, uce in enumerate(para)] - orderuces = dict([[uce,i] for i, uce in enumerate(orderuces)]) - with open(fileout+'~', 'w') as f : - for i, forme in enumerate(listin) : - uces = [uce for uce in self.formes[forme][1]] - for uce in uces : - f.write(''.join([' '.join([`orderuces[uce]+1`,`i+1`,`1`]),'\n'])) - - with open(fileout+'~', 'r') as f : - old = f.read() + with open(outfile, 'w') as ffin : + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl)) + for line in f : + ffin.write(line) + os.remove(outfile + '~') + if listuce : + with open(listuce, 'w') as f : + f.write('\n'.join(['uce;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucenb())])) + + def make_and_write_sparse_matrix_from_uci(self, actives, outfile, listuci = False) : + log.info('make_and_write_sparse_matrix_from_ucis %s' % outfile) + nbl = 0 + with open(outfile + '~', 'w+') as f : + for i, lem in enumerate(actives) : + for uci in sorted(self.getlemucis(lem)) : + nbl += 1 + f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n'])) f.seek(0) - for i, line in enumerate(f) : - pass - nrow = i + 1 - with open(fileout, 'w') as f : - txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(orderuces), len(listin), nrow) - f.write(txt + old) - os.remove(fileout+'~') - - - def make_table_with_classe(self, uces, list_act) : + with open(outfile, 'w') as ffin : + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl)) + for line in f : + ffin.write(line) + os.remove(outfile + '~') + if listuci : + with open(listuci, 'w') as f : + f.write('\n'.join(['uci;uc'] + [';'.join([`i`,`i`]) for i in range(0, self.getucinb())])) + + def make_and_write_sparse_matrix_from_classe(self, actives, uces, outfile) : + log.info('make_and_write_sparse_matrix_from_classe %s' % outfile) + nbl = 0 + duces = dict([[uce, i] for i, uce in enumerate(uces)]) + with open(outfile + '~', 'w+') as f : + for i, lem in enumerate(actives) : + uces_ok = list(set(self.getlemuces(lem)).intersection(uces)) + for uce in uces_ok : + f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n'])) + f.seek(0) + with open(outfile, 'w') as ffin : + ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl)) + for line in f : + ffin.write(line) + os.remove(outfile + '~') + + def make_table_with_classe(self, uces, list_act, uci = False) : table_uce = [[0 for val in list_act] for line in range(0,len(uces))] uces = dict([[uce, i] for i, uce in enumerate(uces)]) + if uci : + getlem = self.getlemucis + else : + getlem = self.getlemuces for i, lem in enumerate(list_act) : - for form in self.lems[lem] : - for uce in self.formes[form][1] : - if uce in uces : - table_uce[uces[uce]][i] = 1 + lemuces = list(set(getlem(lem)).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = 1 table_uce.insert(0, list_act) - return table_uce + return table_uce - def make_and_write_sparse_matrix_from_classe(self, uces, list_act, fileout) : - print 'make_and_write_sparse_martrix_from_classe' - duces = dict([[uce, i] for i, uce in enumerate(uces)]) - with open(fileout+'~', 'w') as f : - for i, lem in enumerate(list_act) : - uces_ok = list(set([uce for form in self.lems[lem] for uce in self.formes[form][1]]).intersection(uces)) - for uce in uces_ok : - f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n'])) + def make_pondtable_with_classe(self, uces, list_act) : + table_uce = [[0 for val in list_act] for line in range(0,len(uces))] + uces = dict([[uce, i] for i, uce in enumerate(uces)]) + for i, lem in enumerate(list_act) : + uceseff = self.getlemuceseff(lem) + lemuces = list(set(uceseff.keys()).intersection(uces)) + for uce in lemuces : + table_uce[uces[uce]][i] = uceseff[uce] + table_uce.insert(0, list_act) + return table_uce - with open(fileout+'~', 'r') as f : - old = f.read() - f.seek(0) - for i, line in enumerate(f) : - pass - nrow = i + 1 + def parse_active(self, gramact, gramsup = None) : + log.info('parse actives') + for lem in self.lems : + if lem.startswith('_') and lem.endswith('_') : + self.lems[lem].act = 2 + elif self.lems[lem].gram in gramact : + self.lems[lem].act = 1 + elif gramsup is not None and self.lems[lem].gram not in gramact: + if self.lems[lem].gram in gramsup : + self.lems[lem].act = 2 + else : + self.lems[lem].act = 0 + else : + self.lems[lem].act = 2 + + def make_actives_limit(self, limit, key = 1) : + if self.idformes is None : + self.make_idformes() + return [lem for lem in self.lems if self.getlemeff(lem) >= limit and self.lems[lem].act == key] + + def make_actives_nb(self, nbmax, key) : + log.info('make_actives_nb : %i - %i' % (nbmax,key)) + if self.idformes is None : + self.make_idformes() + allactives = [[self.lems[lem].freq, lem] for lem in self.lems if self.lems[lem].act == key and self.lems[lem].freq >= 3] + self.activenb = len(allactives) + allactives = sorted(allactives, reverse = True) + if self.activenb == 0 : + return [], 0 + if len(allactives) <= nbmax : + log.info('nb = %i - eff min = %i ' % (len(allactives), allactives[-1][0])) + return [val[1] for val in allactives], allactives[-1][0] + else : + effs = [val[0] for val in allactives] + if effs.count(effs[nbmax - 1]) > 1 : + lim = effs[nbmax - 1] + 1 + nok = True + while nok : + try : + stop = effs.index(lim) + nok = False + except ValueError: + lim -= 1 + else : + stop = nbmax - 1 + lim = effs[stop] + log.info('nb actives = %i - eff min = %i ' % (stop + 1, lim)) + return [val[1] for val in allactives[0:stop + 1]], lim + + def make_and_write_profile(self, actives, ucecl, fileout, uci = False) : + log.info('formes/classes') + if uci : + tab = [[lem] + [len(set(self.getlemucis(lem)).intersection(classe)) for classe in ucecl] for lem in actives] + else : + tab = [[lem] + [len(set(self.getlemuces(lem)).intersection(classe)) for classe in ucecl] for lem in actives] + tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3] with open(fileout, 'w') as f : - txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % ( len(uces), len(list_act), nrow) - f.write(txt + old) - os.remove(fileout+'~') + f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding'])) + + def make_etoiles(self) : + etoiles = set([]) + for uci in self.ucis : + etoiles.update(uci.etoiles[1:]) + return list(etoiles) - def make_uc(self, uces, orderuce, min_word_by_uc): - print 'start make uc' - ucenb= [uces[val] for val in orderuce] - uc = [] - uces_uc = {} - for i, uci in enumerate(self.ucis_paras_uces) : - for j, para in enumerate(uci) : - uc.append(0) - for k, uce in enumerate(para) : - uce_id = (i,j,k) - if uc[-1] >= min_word_by_uc : - uc.append(uces[uce_id]) + def make_themes(self): + themes = set([]) + for uci in self.ucis : + themes.update(uci.paras) + return list(themes) + + def make_etoiles_dict(self) : + etoiles = [et for uci in self.ucis for et in uci.etoiles[1:]] + det = {} + for etoile in etoiles : + et = etoile.split('_') + if et[0] in det : + try : + endet = '_'.join(et[1:]) + if etoile in det[et[0]] : + det[et[0]][etoile] += 1 else : - uc[-1] += uces[uce_id] - uces_uc[uce_id] = len(uc)-1 - lenuc = len(uc) - del uc - return lenuc, uces_uc + det[et[0]][etoile] = 1 + except IndexError : + det[et[0]] += 1 + else : + try : + endet = '_'.join(et[1:]) + det[et[0]] = {etoile :1} + except IndexError : + det[et[0]] = 1 + return det - def make_and_write_sparse_matrix_from_uc(self, uces_uc, fileout) : - print 'make_and_write_sparse_martrix_from_uc' - deja_la = {} - with open(fileout+'~', 'w') as f : - for i, lem in enumerate(self.actives) : - uces = list(set([uce for form in self.lems[lem] for uce in self.formes[form][1]])) - for uce in uces : - if (uces_uc[uce],i) not in deja_la : - f.write(''.join([' '.join([`uces_uc[uce]+1`,`i+1`,`1`]),'\n'])) - deja_la[(uces_uc[uce],i)]='' - del(deja_la) - with open(fileout+'~', 'r') as f : - old = f.read() - f.seek(0) - for i, line in enumerate(f) : - pass - nrow = i + 1 + def make_theme_dict(self): + themes = [val for uci in self.ucis for val in uci.paras] + det = {} + for theme in themes : + th = theme.split('_') + if th[0] in det : + try : + endth = '_'.join(th[1:]) + if theme in det[th[0]] : + det[th[0]][theme] += 1 + else : + det[th[0]][theme] = 1 + except IndexError : + det[th[0]] += 1 + else : + try : + endth = '_'.join(th[1:]) + det[th[0]] = {theme:1} + except IndexError : + det[th[0]] = 1 + return det + + def make_etline(self, listet) : + etuces = [[] for et in listet] + for uci in self.ucis : + get = list(set(uci.etoiles).intersection(listet)) + if len(get) > 1 : + return '2 variables sur la meme ligne' + elif get != [] : + etuces[listet.index(get[0])] += [uce.ident for uce in uci.uces] + return etuces + + def make_and_write_profile_et(self, ucecl, fileout, uci = False) : + log.info('etoiles/classes') + if not uci : + etoileuces = self.getetoileuces() + else : + etoileuces = self.getetoileucis() + etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1]) with open(fileout, 'w') as f : - txt = "%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (max(uces_uc.values()) + 1, len(self.actives), nrow) - f.write(txt + old) - os.remove(fileout+'~') - - -# def make_tab_uc(self, uces_uc, uc) : -# print 'make_tab_uc' -# tabuc = [[0 for val in self.actives] for line in uc] -# for i, word in enumerate(self.actives) : -# for forme in self.lems[word] : -# valforme = self.formes[forme] -# for j, uce in enumerate(valforme[1]): -# #uce = '.'.join([str(val) for val in uci]) -# ligne = uces_uc[uce] -# tabuc[ligne][i] = 1 -# return tabuc - - def write_tab(self, tab, fileout) : - print 'commence ecrire' - #print len(tab) - #print len(tab[0]) - writer = csv.writer(open(fileout, 'wb'), delimiter=';', quoting = csv.QUOTE_NONNUMERIC) - writer.writerows(tab) - - def make_concord(self, words, txt, color) : - txt = ' '+ txt +' ' - for word in words : - for forme in self.lems[word] : - txt = txt.replace(' '+forme+' ', ' ' % color +forme+' ') - return txt.strip() - - def make_colored_corpus(self) : - #colors = ['black', 'red', 'blue', 'green', 'orange', 'yellow', 'brown', 'pink', 'grey'] + f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding'])) + #etoiles = self.make_etoiles() + #with open(fileout, 'w') as f : + # f.write('\n'.join([';'.join([etoile] + [`len(set(self.getucesfrometoile(etoile)).intersection(classe))` for classe in ucecl]) for etoile in etoiles]).encode(self.parametres['syscoding'])) + + def make_colored_corpus(self, uci = False) : ucecl = {} for i, lc in enumerate(self.lc) : - for uce in lc : + for uce in lc : ucecl[uce] = i + 1 for uce in self.lc0 : ucecl[uce] = 0 - color = ['black'] + colors[len(self.lc) - 1] + color = ['black'] + colors[len(self.lc) - 1] txt = '''
''' % sys.getdefaultencoding() - res = [[' '.join(self.ucis[i][0]), '