def docharact(self, txt):
    """Replace each run of characters matched by ``self.rule`` with a space.

    ``self.rule`` is placed verbatim between ``[`` and ``]+``, so it has to
    be the inside of a regular-expression character class.

    :param txt: text to filter
    :return: ``txt`` where every matching run became a single ``' '``
    """
    # Sample rule that was left (commented out) in the original source; as it
    # starts with "^" it lists the characters that are kept:
    # rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = u"".join([u"[", self.rule, u"]+"])
    return re.sub(pattern, ' ', txt)
+
def doapos(self, txt):
    """Return ``txt`` with every apostrophe (``'``) replaced by a space."""
    apostrophe = u'\''
    blank = u' '
    return txt.replace(apostrophe, blank)
+
def dotiret(self, txt):
    """Return ``txt`` with every hyphen (``-``) replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
+
def firstclean(self, txt):
    """Normalise a few characters and surround punctuation with spaces.

    The typographic apostrophe becomes ``'`` and the ligature ``œ`` becomes
    ``oe``.  Three consecutive dots and the ellipsis character are both
    replaced by the placeholder ``£$£``; ``? . ! , ; :`` are each padded
    with one space on both sides.

    :param txt: text to clean
    :return: the cleaned text
    """
    # Applied strictly in this order: '...' has to be handled before the
    # single '.', otherwise the three dots would be split apart first.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
+
def make_cleans(self, txt):
    """Run ``txt`` through every callable of ``self.cleans``, in order.

    Each callable receives the output of the previous one.

    :param txt: text to clean
    :return: the result of the last callable (``txt`` itself if
        ``self.cleans`` is empty)
    """
    cleaned = txt
    for step in self.cleans:
        cleaned = step(cleaned)
    return cleaned
+
def backup_uce(self):
    """Flush ``self.corpus.idformesuces`` to the database and reset it.

    ``idformesuces`` maps each form to an inner mapping.  For every form one
    row ``(repr(form), inner keys)`` goes to table ``uces`` and one row
    ``(repr(form), inner values)`` goes to table ``eff``; keys and values are
    written as space-separated ``repr()`` strings.  Nothing is written when
    the mapping is empty.  Afterwards the mapping is replaced by an empty
    dict and ``self.count`` is set back to 1.
    """
    # NOTE(review): the indentation of the source was lost; the two resets
    # at the end are assumed to run unconditionally (outside the ``if``) --
    # confirm against the upstream file.
    # presumably the inner keys are UCE ids and the inner values their
    # frequencies, judging by the table names -- verify against the caller.
    pending = self.corpus.idformesuces
    if pending != {}:
        log.info('backup %i' % len(pending))
        uce_rows = []
        eff_rows = []
        for forme in pending:
            key = repr(forme)
            inner = pending[forme]
            uce_rows.append((key, ' '.join([repr(val) for val in inner.keys()])))
            eff_rows.append((key, ' '.join([repr(val) for val in inner.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', uce_rows)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', eff_rows)
    self.corpus.idformesuces = {}
    self.count = 1
+
def backup_corpus(self):
    """Write the content of ``self.corpus`` through ``self.ccorpus``.

    * one ``etoiles`` row per item of ``self.corpus.ucis``:
      ``(ident, space-joined etoiles, space-joined paras)``;
    * one ``luces`` row per element of each ``uci.uces``:
      ``(repr(uci.ident), repr(uce.para), repr(uce.ident))``;
    * one ``formes`` row per key of ``self.corpus.formes``:
      ``(repr(ident), forme, lem, gram, repr(freq))``.

    The elapsed time, in seconds, is logged at the end.
    """
    # NOTE(review): block nesting was rebuilt from the statements themselves
    # (source indentation lost): the ``luces`` loop sits inside the ``ucis``
    # loop, the ``formes`` loop runs once after it.
    log.info('start backup corpus')
    started = time()
    cursor = self.ccorpus
    for uci in self.corpus.ucis:
        # uci.ident is stored as-is here, but as repr() in ``luces`` below.
        etoiles_row = (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras))
        cursor.execute('INSERT INTO etoiles VALUES (?,?,?);', etoiles_row)
        for uce in uci.uces:
            luces_row = (repr(uci.ident), repr(uce.para), repr(uce.ident))
            cursor.execute('INSERT INTO luces VALUES (?,?,?);', luces_row)
    formes = self.corpus.formes
    for forme in formes:
        entry = formes[forme]
        formes_row = (repr(entry.ident), forme, entry.lem, entry.gram, repr(entry.freq))
        cursor.execute('INSERT INTO formes VALUES (?,?,?,?,?);', formes_row)
    log.info('%f' % (time() - started))
+
def dofinish(self):
    """Store the final statistics of the build in ``self.corpus.parametres``.

    Keys written:

    * ``'date'``        -- ``ctime()`` string of the current local time;
    * ``'time'``        -- ``self.time`` (seconds) formatted as ``'Hh Mm Ss'``;
    * ``'ucinb'``       -- ``self.corpus.getucinb()``;
    * ``'ucenb'``       -- ``self.corpus.getucenb()``;
    * ``'occurrences'`` -- ``self.corpus.gettotocc()``;
    * ``'formesnb'``    -- ``len(self.corpus.formes)``;
    * ``'hapax'``       -- hapax count with its share of the forms and of
      the occurrences, as a French summary string.

    An empty corpus (no forms and/or no occurrences) no longer raises
    ``ZeroDivisionError``: the corresponding percentage is reported as 0.00.
    """
    corpus = self.corpus
    params = corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = corpus.getucinb()
    params['ucenb'] = corpus.getucenb()
    occurrences = corpus.gettotocc()
    params['occurrences'] = occurrences
    formesnb = len(corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = corpus.gethapaxnb()
    # Guard both ratios: dividing by an empty form list or by zero
    # occurrences would abort the whole build at its very last step.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
+
+class BuildSubCorpus(BuildCorpus):
+ def __init__(self, corpus, parametres, dlg = None) :
+ log.info('begin subcorpus...')
+ self.dlg = dlg
+ self.ori = corpus
+ self.infile = None
+ self.corpus = Corpus(self, {'type' : 'corpus', 'originalpath' : corpus.parametres['originalpath'], 'encoding' : corpus.parametres['encoding']})
+ self.last = 0
+ self.parametres = parametres
+ self.encoding = corpus.parametres['encoding']
+ self.corpus.parametres['corpus_name'] = parametres['corpus_name']
+ self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
+ self.corpus.pathout.createdir(parametres['pathout'])
+ self.corpus.parametres['pathout'] = parametres['pathout']
+ self.corpus.parametres['meta'] = parametres.get('meta', False)
+ self.corpus.parametres['uuid'] = str(uuid4())
+ if parametres.get('frommeta', False) :
+ print 'make subtexts'
+ self.corpus.ucis = [CopyUci(uci) for uci in self.ori.ucis if set(parametres['meta']).intersection(uci.etoiles) != set()]
+ elif parametres.get('fromtheme', False) :
+ print 'make subtexts from theme'
+ idpara = 0
+ for uci in self.ori.ucis :
+ if uci.paras != [] :
+ newuce = []
+ newpara = []
+ for et in uci.paras :
+ if et in parametres['meta'] :
+ newuce += [CopyUce(uce) for uce in uci.uces if uce.para == idpara]
+ newpara.append(et)
+ idpara += 1
+ if newuce != [] :
+ nuci = CopyUci(uci)
+ nuci.uces = newuce
+ nuci.paras = newpara
+ self.corpus.ucis.append(nuci)