- formes_out2 = os.path.join(os.path.dirname(forme_out), 'formes_formes.csv')
- formes_uces = os.path.join(os.path.dirname(forme_out), 'formes_uces.csv')
- with codecs.open(formes_uces, 'r', self.parametre['syscoding']) as f:
- uces = f.read()
- uces = [list(chunks(line.split(),4)) for line in uces.splitlines()]
- with codecs.open(formes_out2, 'r', self.parametre['syscoding']) as f :
- self.formes = f.read()
- self.formes = [[line.split(';'), dict([[(int(uce[0]),int(uce[1]), int(uce[2])), int(uce[3])] for uce in uces[i]])] for i, line in enumerate(self.formes.splitlines())]
- self.formes = dict([[line[0][0], [int(line[0][1]), line[1], line[0][2], int(line[0][3])]] for line in self.formes])
-
def read_corpus_from_shelves(self, db):
    """Restore the corpus attributes stored in the shelf file `db`.

    Loads `parametre`, `lems`, `etoiles`, `actives`, `ucis`, `lc` and
    `lc0` from the shelf.  `ucis_paras_uces` and `formes` are taken from
    the shelf when present; otherwise they are rebuilt by
    `self.read_corpus_out` / `self.read_formes_out` from 'corpus.txt' and
    'formes.txt' paths located in the same directory as `db`.

    If the stored `parametre` has no 'syscoding' entry, it is set to
    `sys.getdefaultencoding()` before the fallback readers run.

    Raises:
        KeyError: if a mandatory key is missing from the shelf.
    """
    d = shelve.open(db)
    # try/finally (not `with`) so the shelf is always closed, including on
    # Python 2 where Shelf objects are not context managers.
    try:
        self.parametre = d['parametre']
        if 'syscoding' not in self.parametre:
            self.parametre['syscoding'] = sys.getdefaultencoding()
        self.lems = d['lems']

        base_dir = os.path.dirname(db)
        if 'ucis_paras_uces' in d:
            self.ucis_paras_uces = d['ucis_paras_uces']
        else:
            self.read_corpus_out(os.path.join(base_dir, 'corpus.txt'))
        if 'formes' in d:
            self.formes = d['formes']
        else:
            self.read_formes_out(os.path.join(base_dir, 'formes.txt'))

        self.etoiles = d['etoiles']
        self.actives = d['actives']
        self.ucis = d['ucis']
        self.lc = d['lc']
        self.lc0 = d['lc0']
    finally:
        d.close()
-
-
def save_corpus(self, db):
    """Persist the corpus to the shelf `db` and to side files next to it.

    The shelf receives `parametre`, `lems`, `etoiles`, `actives`, `ucis`,
    `lc` and `lc0`.  `ucis_paras_uces` and `formes` are deliberately not
    shelved; they are written as text files in the directory of `db`:

    * corpus.txt        -- one block per uci, blocks separated by a blank
                           line, paragraphs separated by '$$$', one uce
                           per line, words separated by spaces.
    * formes_formes.csv -- one line per forme:
                           ``forme;freq;type;identifiant``
                           (entries 0, 2 and 3 of ``self.formes[forme]``).
    * formes_uces.csv   -- one line per forme, in the same order as
                           formes_formes.csv: space-separated groups of
                           four integers, the three components of each
                           uce key followed by its count (entry 1 of
                           ``self.formes[forme]``).

    NOTE(review): the files are written with plain ``open(..., 'w')``
    while the reader in this file opens formes_formes.csv and
    formes_uces.csv through ``codecs.open`` with
    ``self.parametre['syscoding']``; confirm that the implicit encoding
    used here matches.
    """
    d = shelve.open(db)
    # try/finally so the shelf is closed even if one of the stores fails.
    try:
        d['parametre'] = self.parametre
        d['lems'] = self.lems
        d['etoiles'] = self.etoiles
        d['actives'] = self.actives
        d['ucis'] = self.ucis
        d['lc'] = self.lc
        d['lc0'] = self.lc0
    finally:
        d.close()

    base_dir = os.path.dirname(db)

    corpus_out = os.path.join(base_dir, 'corpus.txt')
    with open(corpus_out, 'w') as f:
        f.write('\n\n'.join([
            u'$$$'.join(['\n'.join([' '.join(uce) for uce in para])
                         for para in uci])
            for uci in self.ucis_paras_uces]))

    # Build both files in a single pass: line i of formes_uces.csv must
    # describe the forme on line i of formes_formes.csv.
    forme_lines = []
    uce_lines = []
    for forme in self.formes:
        entry = self.formes[forme]
        forme_lines.append(
            ';'.join([forme, repr(entry[0]), entry[2], repr(entry[3])]))
        uces = entry[1]
        uce_lines.append(' '.join([
            ' '.join([repr(uce[0]), repr(uce[1]), repr(uce[2]),
                      repr(uces[uce])])
            for uce in uces]))

    formes_out2 = os.path.join(base_dir, 'formes_formes.csv')
    with open(formes_out2, 'w') as f:
        f.write('\n'.join(forme_lines))

    formes_uces = os.path.join(base_dir, 'formes_uces.csv')
    with open(formes_uces, 'w') as f:
        f.write('\n'.join(uce_lines))
-
def make_len_uce(self, nbtotoc):
    """Set ``self.parametre['eff_min_uce']``.

    When ``self.parametre['nbforme_uce']`` is set (neither None nor 0) it
    is used as is.  Otherwise the value is derived from `nbtotoc` and the
    number of entries in ``self.ucis``:

    * exactly one uci            -> 30
    * 200000 <= nbtotoc < 400000 -> 0.0016 * nbtotoc / nb_ucis + 20
    * nbtotoc < 200000           -> 0.0016 * nbtotoc / nb_ucis + 30
    * otherwise                  -> (nbtotoc / nb_ucis) / 15

    Args:
        nbtotoc: numeric value compared against the 200000 / 400000
            thresholds (presumably the total number of occurrences in
            the corpus -- confirm with the caller).
    """
    requested = self.parametre['nbforme_uce']
    if requested is not None and requested != 0:
        self.parametre['eff_min_uce'] = requested
        return

    # FIXME (carried over from the original code): heuristic thresholds.
    # NOTE(review): every branch except the single-uci one divides by
    # len(self.ucis), so an empty self.ucis raises ZeroDivisionError.
    nb_ucis = len(self.ucis)
    if nb_ucis == 1:
        eff_min_uce = 30
    elif 200000 <= nbtotoc < 400000:
        eff_min_uce = (0.0016 * float(nbtotoc) / float(nb_ucis)) + 20
    elif nbtotoc < 200000:
        eff_min_uce = (0.0016 * float(nbtotoc) / float(nb_ucis)) + 30
    else:
        eff_min_uce = (float(nbtotoc) / float(nb_ucis)) / float(15)
    self.parametre['eff_min_uce'] = eff_min_uce
-
-
def quick_clean1(self):
    """First cleaning pass over ``self.content`` (a single text string).

    In order: lower-cases the text, replaces every run of characters that
    are not in the allowed set with one space, turns runs of the
    typographic apostrophe into a plain apostrophe, collapses runs of
    CR/LF into a single newline, and rewrites '-*' as '#*'.
    """
    print('quick clean')
    allowed_chars = u"a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇß’ñ.:,;!?\n*'_-"
    # Negated character class: matches runs of anything NOT allowed.
    # Note that the space itself is not in the allowed set, so runs of
    # spaces are collapsed to one space by this substitution as well.
    disallowed_run = u"[^" + allowed_chars + "]+"

    text = self.content.lower()
    text = re.sub(disallowed_run, ' ', text)
    text = re.sub(u'[’]+', '\'', text)
    text = re.sub(u'[\r\n]+', '\n', text)
    # presumably shields '-*' markers from the later dash removal in
    # quick_clean2 -- confirm
    self.content = text.replace(u'-*', u'#*')
-
def find_expression(self, expressions):
    """Replace known expressions found in ``self.content``.

    Args:
        expressions: mapping whose keys are the substrings to look for
            and whose values are sequences; element 0 of the value is the
            replacement text.

    Each key present in the current content is printed together with its
    replacement, then every occurrence is substituted.  Keys are applied
    one after another, so a later key is matched against the text as
    already modified by the earlier ones.
    """
    print('find expression')
    for expression in expressions:
        if expression not in self.content:
            continue
        replacement = expressions[expression][0]
        print('%s %s' % (expression, replacement))
        self.content = self.content.replace(expression, replacement)
-
def quick_clean2(self):
    """Second cleaning pass; leaves ``self.content`` as a list of lines.

    Apostrophes and runs of dashes become a space, runs of spaces are
    collapsed to a single space, and the resulting text is split into
    lines.
    """
    print('quick clean 2')
    text = self.content.replace('\'', ' ')
    text = re.sub(u'[-]+', ' ', text)
    text = re.sub(u'[ ]+', ' ', text)
    self.content = text.splitlines()
-
def make_ucis(self):
    """Collect the lines of ``self.content`` that start with '****'.

    ``self.ucis`` is set to a list of ``[tokens, index]`` pairs, where
    `tokens` is the stripped line split on whitespace and `index` is the
    position of that line in ``self.content``.

    Returns:
        The list of those line indices, in order.
    """
    print('make_ucis')
    marked_lines = []
    for index, line in enumerate(self.content):
        if line.startswith(u'****'):
            marked_lines.append([line.strip().split(), index])
    self.ucis = marked_lines
    return [entry[1] for entry in marked_lines]