def docharact(self, txt):
    """Replace each run of characters matched by ``self.rule`` with one space.

    ``self.rule`` is used as the inside of a regex character class; the
    brackets and the ``+`` quantifier are added here.

    :param txt: text to filter.
    :returns: ``txt`` with every matched run collapsed to a single space.
    """
    # Example of a rule, presumably a negated set of the characters to keep
    # (verify against where self.rule is assigned):
    # u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    char_class = u"[" + self.rule + "]+"
    matcher = re.compile(char_class)
    return matcher.sub(' ', txt)
+
def doapos(self, txt):
    """Return ``txt`` with every ASCII apostrophe turned into a space."""
    pieces = txt.split(u"'")
    return u' '.join(pieces)
+
def dotiret(self, txt):
    """Return ``txt`` with every hyphen turned into a space."""
    pieces = txt.split(u'-')
    return u' '.join(pieces)
+
def firstclean(self, txt):
    """Normalise typographic characters and isolate punctuation with spaces.

    The curly apostrophe becomes ``'`` and the ligature ``œ`` becomes ``oe``.
    Both forms of ellipsis (``...`` and ``…``) become the placeholder
    `` £$£ ``, and each of ``? . ! , ; :`` is surrounded by spaces.

    :param txt: text to clean.
    :returns: the cleaned text.
    """
    # Every literal is unicode: a byte string holding the non-ASCII
    # placeholder would trigger an implicit ASCII decode (and fail) when
    # combined with unicode text on Python 2.
    # Order matters: '...' must be consumed before the single '.' rule.
    substitutions = (
        (u'’', u"'"),
        (u'œ', u'oe'),
        (u'...', u' £$£ '),
        (u'?', u' ? '),
        (u'.', u' . '),
        (u'!', u' ! '),
        (u',', u' , '),
        (u';', u' ; '),
        (u':', u' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
+
def make_cleans(self, txt):
    """Run ``txt`` through every callable of ``self.cleans``, in order.

    Each callable receives the output of the previous one.

    :param txt: text to clean.
    :returns: the result of the last callable, or ``txt`` unchanged when
        ``self.cleans`` is empty.
    """
    cleaned = txt
    for step in self.cleans:
        cleaned = step(cleaned)
    return cleaned
+
def backup_uce(self):
    """Write the pending ``idformesuces`` index through ``self.cf`` and reset it.

    For each key of ``self.corpus.idformesuces`` one row goes to table
    ``uces`` (repr of the key, space-joined reprs of the inner keys) and one
    row goes to table ``eff`` (repr of the key, space-joined reprs of the
    inner values). Nothing is written when the index is empty.

    Afterwards the index is replaced by an empty dict and ``self.count`` is
    set back to 1.
    """
    pending = self.corpus.idformesuces
    if pending != {}:
        log.info('backup %i' % len(pending))
        # repr() replaces the backtick syntax, which no longer exists in
        # Python 3; the produced strings are the same.
        touce = [
            (repr(forme), ' '.join([repr(val) for val in pending[forme].keys()]))
            for forme in pending
        ]
        toeff = [
            (repr(forme), ' '.join([repr(val) for val in pending[forme].values()]))
            for forme in pending
        ]
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', touce)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', toeff)
    # NOTE(review): the reset is assumed to run whether or not anything was
    # written -- confirm against the original indentation.
    self.corpus.idformesuces = {}
    self.count = 1
+
def backup_corpus(self):
    """Insert the texts, their segments and the forms through ``self.ccorpus``.

    For every item of ``self.corpus.ucis`` one row is inserted into
    ``etoiles`` (ident, space-joined ``etoiles``, space-joined ``paras``),
    followed by one ``luces`` row per item of its ``uces``. Then one
    ``formes`` row is inserted per entry of ``self.corpus.formes``.
    The elapsed time is logged at the end.
    """
    log.info('start backup corpus')
    t = time()
    execute = self.ccorpus.execute
    for uci in self.corpus.ucis:
        execute(
            'INSERT INTO etoiles VALUES (?,?,?);',
            (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras)),
        )
        for uce in uci.uces:
            # repr() replaces the backtick syntax removed in Python 3.
            execute(
                'INSERT INTO luces VALUES (?,?,?);',
                (repr(uci.ident), repr(uce.para), repr(uce.ident)),
            )
    for forme in self.corpus.formes:
        # Look the entry up once instead of four times per row.
        word = self.corpus.formes[forme]
        execute(
            'INSERT INTO formes VALUES (?,?,?,?,?);',
            (repr(word.ident), forme, word.lem, word.gram, repr(word.freq)),
        )
    log.info('%f' % (time() - t))
+
def dofinish(self):
    """Store the final summary values in ``self.corpus.parametres``.

    Keys written: ``date``, ``time``, ``ucinb``, ``ucenb``, ``occurrences``,
    ``formesnb`` and ``hapax``. ``self.time`` is formatted as hours, minutes
    and seconds (presumably it holds a duration in seconds -- verify against
    where it is assigned).
    """
    parametres = self.corpus.parametres
    parametres['date'] = datetime.datetime.now().ctime()
    # Round the total once: formatting the leftover seconds with %.0f on its
    # own could display "60s" (e.g. 3599.7 -> "0h 59m 60s").
    minutes, seconds = divmod(round(self.time), 60)
    hours, minutes = divmod(minutes, 60)
    parametres['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    parametres['ucinb'] = self.corpus.getucinb()
    parametres['ucenb'] = self.corpus.getucenb()
    parametres['occurrences'] = self.corpus.gettotocc()
    parametres['formesnb'] = len(self.corpus.formes)
    hapaxnb = self.corpus.gethapaxnb()
    formesnb = parametres['formesnb']
    occurrences = parametres['occurrences']
    # An empty corpus has no forms/occurrences: report 0 % instead of
    # raising ZeroDivisionError.
    pourhapaxf = (float(hapaxnb) / formesnb) * 100 if formesnb else 0.0
    pourhapaxocc = (float(hapaxnb) / occurrences) * 100 if occurrences else 0.0
    parametres['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
+
+
+class BuildFromAlceste(BuildCorpus) :
+ def read_corpus(self, infile) :
+ if self.dlg is not None :
+ self.dlg.Pulse('textes : 0 - segments : 0')
+ self.limitshow = 0
+ self.count = 1
+ if self.corpus.parametres['ucimark'] == 0 :
+ self.testuci = testetoile
+ elif self.corpus.parametres['ucimark'] == 1 :
+ self.testuci = testint
+ txt = []
+ iduci = -1
+ idpara = -1
+ iduce = -1
+ try :
+ with codecs.open(infile, 'r', self.encoding) as f :
+ for linenb, line in enumerate(f) :
+ line = line.rstrip('\n\r')
+ if self.testuci(line) :
+ iduci += 1
+ if txt != [] :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
+ txt = []
+ self.corpus.ucis.append(Uci(iduci, line))
+ else :
+ if iduci > 0 :
+ if self.corpus.ucis[-1].uces == [] :
+ log.info(u'Empty text : %i' % linenb)
+ iduci -= 1
+ self.corpus.ucis.pop()
+ self.corpus.ucis.append(Uci(iduci, line))
+ if self.dlg is not None :
+ if not (iduci + 1) % 10 :
+ self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
+ elif line.startswith(u'-*') :
+ if iduci != -1 :
+ if txt != [] :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
+ txt = []
+ idpara += 1
+ self.corpus.ucis[-1].paras.append(line.split()[0])
+ else :
+ raise Exception('paragrapheOT %i' % linenb)
+ elif line.strip() != '' and iduci != -1 :
+ txt.append(line)
+ if txt != [] and iduci != -1 :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
+ del(txt)
+ else :
+ if iduci != -1 :
+ iduci -= 1
+ self.corpus.ucis.pop()
+ log.info(Exception("Empty text %i" % linenb))