def docharact(self, txt):
    """Replace each run of characters matched by ``self.rule`` with a space.

    ``self.rule`` is spliced between ``[`` and ``]+`` to form a regular
    expression character class; every maximal run of characters matching
    that class is replaced by one single space.

    NOTE(review): the commented-out sample below starts with ``^``, which
    suggests the rule is normally a negated class listing the characters
    to keep -- confirm where ``self.rule`` is assigned.

    :param txt: text to filter.
    :returns: the filtered text.
    """
    # rule = u"^a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇßœŒ’ñ.:,;!?*'_-"
    pattern = u"[" + self.rule + "]+"
    matcher = re.compile(pattern)
    return matcher.sub(' ', txt)
+
def doapos(self, txt):
    """Return ``txt`` with every ASCII apostrophe replaced by a space."""
    apostrophe = u"'"
    blank = u' '
    return txt.replace(apostrophe, blank)
+
def dotiret(self, txt):
    """Return ``txt`` with every hyphen-minus (``-``) replaced by a space."""
    hyphen = u'-'
    blank = u' '
    return txt.replace(hyphen, blank)
+
def firstclean(self, txt):
    """Normalise a few typographic characters and space out punctuation.

    The substitutions are applied one after another, in the order listed:
    the typographic apostrophe becomes ``'``, the ligature ``œ`` becomes
    ``oe``, an ellipsis (three dots or the single ``…`` character) becomes
    the token `` £$£ ``, and each of ``? . ! , ; :`` is surrounded by
    spaces.

    :param txt: text to clean.
    :returns: the cleaned text.
    """
    # Order matters: the three-dot ellipsis has to be consumed before the
    # single '.' rule, otherwise it would be split into three spaced dots.
    substitutions = (
        (u'’', "'"),
        (u'œ', u'oe'),
        ('...', u' £$£ '),
        ('?', ' ? '),
        ('.', ' . '),
        ('!', ' ! '),
        (',', ' , '),
        (';', ' ; '),
        (':', ' : '),
        (u'…', u' £$£ '),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)
    return txt
+
def make_cleans(self, txt):
    """Run ``txt`` through every callable in ``self.cleans``, in order.

    Each callable receives the output of the previous one.

    :param txt: text to clean.
    :returns: the result of the last callable, or ``txt`` unchanged when
        ``self.cleans`` is empty.
    """
    result = txt
    for step in self.cleans:
        result = step(result)
    return result
+
def backup_uce(self):
    """Write the pending ``idformesuces`` index to the database, then reset it.

    For every key of ``self.corpus.idformesuces`` two rows are produced,
    both starting with ``repr(key)``: one holding the space-joined reprs
    of the inner dict's keys (table ``uces``), the other the space-joined
    reprs of its values (table ``eff``).  Nothing is written when the
    index is empty.  In every case the index is replaced by a fresh empty
    dict and ``self.count`` is set back to 1.

    NOTE(review): inner keys are presumably segment ids and inner values
    per-segment frequencies -- confirm against the code filling the index.
    """
    pending = self.corpus.idformesuces
    if pending != {}:
        log.info('backup %i' % len(pending))
        uce_rows = []
        eff_rows = []
        for forme in pending:
            per_uce = pending[forme]
            key = repr(forme)
            uce_rows.append((key, ' '.join([repr(val) for val in per_uce.keys()])))
            eff_rows.append((key, ' '.join([repr(val) for val in per_uce.values()])))
        self.cf.executemany('INSERT INTO uces VALUES (?,?);', uce_rows)
        self.cf.executemany('INSERT INTO eff VALUES (?,?);', eff_rows)
    self.corpus.idformesuces = {}
    self.count = 1
+
def backup_corpus(self):
    """Insert the corpus structure into the database behind ``self.ccorpus``.

    One row per text goes into ``etoiles`` (ident, space-joined
    ``etoiles``, space-joined ``paras``), one row per segment into
    ``luces`` (reprs of the text ident, paragraph and segment ident) and
    one row per entry of ``self.corpus.formes`` into ``formes``.  The
    elapsed time is logged at the end.
    """
    log.info('start backup corpus')
    started = time()
    db = self.ccorpus
    for uci in self.corpus.ucis:
        etoile_row = (uci.ident, ' '.join(uci.etoiles), ' '.join(uci.paras))
        db.execute('INSERT INTO etoiles VALUES (?,?,?);', etoile_row)
        uci_key = repr(uci.ident)
        for uce in uci.uces:
            db.execute('INSERT INTO luces VALUES (?,?,?);',
                       (uci_key, repr(uce.para), repr(uce.ident)))
    formes = self.corpus.formes
    for forme in formes:
        entry = formes[forme]
        db.execute('INSERT INTO formes VALUES (?,?,?,?,?);',
                   (repr(entry.ident), forme, entry.lem, entry.gram, repr(entry.freq)))
    log.info('%f' % (time() - started))
+
def dofinish(self):
    """Store the final statistics of the build in ``self.corpus.parametres``.

    Keys written: ``date`` (current local time, ``ctime`` format),
    ``time`` (``self.time`` split into hours/minutes/seconds), ``ucinb``,
    ``ucenb``, ``occurrences``, ``formesnb`` and ``hapax`` (count of
    hapaxes with its share of the forms and of the occurrences).

    An empty corpus (no forms or no occurrences) reports 0 % for the
    corresponding share instead of raising ``ZeroDivisionError``.

    NOTE(review): ``self.time`` is presumably a duration in seconds --
    confirm where it is set.
    """
    params = self.corpus.parametres
    params['date'] = datetime.datetime.now().ctime()
    minutes, seconds = divmod(self.time, 60)
    hours, minutes = divmod(minutes, 60)
    params['time'] = '%.0fh %.0fm %.0fs' % (hours, minutes, seconds)
    params['ucinb'] = self.corpus.getucinb()
    params['ucenb'] = self.corpus.getucenb()
    params['occurrences'] = self.corpus.gettotocc()
    formesnb = len(self.corpus.formes)
    params['formesnb'] = formesnb
    hapaxnb = self.corpus.gethapaxnb()
    # Guard both ratios: an empty corpus has no forms and no occurrences.
    if formesnb:
        pourhapaxf = (float(hapaxnb) / formesnb) * 100
    else:
        pourhapaxf = 0.0
    if params['occurrences']:
        pourhapaxocc = (float(hapaxnb) / params['occurrences']) * 100
    else:
        pourhapaxocc = 0.0
    params['hapax'] = '%i - %.2f %% des formes - %.2f %% des occurrences' % (hapaxnb, pourhapaxf, pourhapaxocc)
+
class BuildSubCorpus(BuildCorpus):
    """Build a new corpus out of a selection of texts of an existing corpus.

    The selection mode is read from ``parametres``: ``frommeta`` keeps the
    texts sharing at least one entry of ``parametres['meta']`` with their
    ``etoiles``; ``fromtheme`` keeps, inside each text, the paragraphs
    whose marker is listed in ``parametres['meta']``; ``fromcluster`` is
    accepted but selects nothing.
    """

    def __init__(self, corpus, parametres, dlg = None):
        """Select the texts, then connect to the database and build.

        :param corpus: source corpus; the selected text and segment
            objects are taken from it directly (not copied).
        :param parametres: dict read for ``corpus_name``, ``pathout`` and
            the optional ``meta``, ``frommeta``, ``fromtheme`` and
            ``fromcluster`` entries.
        :param dlg: optional dialog object, stored on ``self.dlg``.
        """
        log.info('begin subcorpus...')
        self.dlg = dlg
        self.ori = corpus
        self.infile = None
        self.corpus = Corpus(self, corpus.parametres)
        self.last = 0
        self.encoding = corpus.parametres['encoding']
        self.corpus.parametres['corpus_name'] = parametres['corpus_name']
        self.corpus.pathout = PathOut(filename = corpus.parametres['originalpath'], dirout = parametres['pathout'])
        self.corpus.pathout.createdir(parametres['pathout'])
        self.corpus.parametres['pathout'] = parametres['pathout']
        self.corpus.parametres['meta'] = parametres.get('meta', False)
        self.corpus.parametres['uuid'] = str(uuid4())
        if parametres.get('frommeta', False):
            print('make subtexts')
            selected = []
            for uci in self.ori.ucis:
                # keep the text when it shares at least one value with 'meta'
                if set(parametres['meta']).intersection(uci.etoiles):
                    selected.append(uci)
            self.corpus.ucis = selected
        elif parametres.get('fromtheme', False):
            print('make subtexts from theme')
            # running paragraph index across the whole source corpus
            idpara = 0
            for uci in self.ori.ucis:
                if uci.paras == []:
                    # NOTE(review): a text without paragraph markers still
                    # advances the index by one -- presumably it counts as
                    # one implicit paragraph; confirm.
                    idpara += 1
                    continue
                kept_uces = []
                kept_paras = []
                for et in uci.paras:
                    if et in parametres['meta']:
                        kept_uces.extend([uce for uce in uci.uces if uce.para == idpara])
                        kept_paras.append(et)
                    idpara += 1
                if kept_uces != []:
                    # the source text object itself is trimmed and reused
                    uci.uces = kept_uces
                    uci.paras = kept_paras
                    self.corpus.ucis.append(uci)
        elif parametres.get('fromcluster', False):
            # no selection is performed for this mode
            pass
        # create database
        self.connect()
        self.dobuild()

    def read_corpus(self, infile = None):
        """Renumber the selected texts and segments and re-index their words.

        Texts, paragraphs and segments receive new consecutive identifiers
        starting at 0.  The text of every selected segment is then fetched
        from the source corpus with ``getconcorde``, inserted into the
        ``uces`` table under its new identifier, and each of its
        whitespace-separated words is registered through
        ``add_word_from_forme``.

        :param infile: not used by this method.
        """
        self.olduceid = [uce.ident for uci in self.corpus.ucis for uce in uci.uces]
        new_ids = {}
        next_uce = 0
        para_id = -1
        previous_para = -1
        print('redo text, para and st ident')
        for uci_id, uci in enumerate(self.corpus.ucis):
            uci.ident = uci_id
            for uce in uci.uces:
                uce.uci = uci.ident
                # a change of the old paragraph id opens a new paragraph
                if uce.para != previous_para:
                    para_id += 1
                    previous_para = uce.para
                uce.para = para_id
                new_ids[uce.ident] = next_uce
                uce.ident = next_uce
                next_uce += 1
        print('backup st text and forms')
        for row in self.ori.getconcorde(self.olduceid):
            new_id = new_ids[row[0]]
            self.c.execute('INSERT INTO uces VALUES(?,?);', (repr(new_id), row[1]))
            for word in row[1].split():
                self.corpus.add_word_from_forme(self.ori.formes[word], new_id)
        self.backup_uce()
        print('done')
+
+class BuildFromAlceste(BuildCorpus) :
+ def read_corpus(self, infile) :
+ if self.dlg is not None :
+ self.dlg.Pulse('textes : 0 - segments : 0')
+ self.limitshow = 0
+ self.count = 1
+ if self.corpus.parametres['ucimark'] == 0 :
+ self.testuci = testetoile
+ elif self.corpus.parametres['ucimark'] == 1 :
+ self.testuci = testint
+ txt = []
+ iduci = -1
+ idpara = -1
+ iduce = -1
+ try :
+ with codecs.open(infile, 'r', self.encoding) as f :
+ for linenb, line in enumerate(f) :
+ line = line.rstrip('\n\r')#FIXME .lstrip(codecs.BOM).lstrip(codecs.BOM_UTF8)
+ if self.testuci(line) :
+ iduci += 1
+ if txt != [] :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci - 1)
+ txt = []
+ self.corpus.ucis.append(Uci(iduci, line))
+ else :
+ if iduci > 0 :
+ if self.corpus.ucis[-1].uces == [] :
+ log.info(u'Empty text : %i' % linenb)
+ iduci -= 1
+ self.corpus.ucis.pop()
+ self.corpus.ucis.append(Uci(iduci, line))
+ if self.dlg is not None :
+ if not (iduci + 1) % 10 :
+ self.dlg.Pulse('textes : %i - segments : %i' % (iduci + 1, iduce +1))
+ elif line.startswith(u'-*') :
+ if iduci != -1 :
+ if txt != [] :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
+ txt = []
+ idpara += 1
+ self.corpus.ucis[-1].paras.append(line.split()[0])
+ else :
+ raise Exception('paragrapheOT %i' % linenb)
+ elif line.strip() != '' and iduci != -1 :
+ txt.append(line)
+ if txt != [] and iduci != -1 :
+ iduce, idpara = self.treattxt(txt, iduce, idpara, iduci)
+ del(txt)
+ else :
+ if iduci != -1 :
+ iduci -= 1
+ self.corpus.ucis.pop()
+ log.info(Exception("Empty text %i" % linenb))