-#!/bin/env python
# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
-#Copyright (c) 2008-2013, Pierre Ratinaud
-#License: GNU GPL
+#Copyright (c) 2008-2020 Pierre Ratinaud
+#modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
+#License: GNU/GPL
+#------------------------------------
+# import des modules python
+#------------------------------------
import codecs
import os
+
+#------------------------------------
+# import des modules wx
+#------------------------------------
+import wx
+
+#------------------------------------
+# import des fichiers du projet
+#------------------------------------
from dialog import ExtractDialog
from corpus import Corpus, copycorpus
-import wx
# Sample settings consumed by the `__main__` block at the bottom of this
# module: input corpus path, input/output encodings and the starred
# modalities to look for.
parametres = {
    'filein': 'corpus/lru2.txt',
    'encodein': 'utf8',
    'encodeout': 'utf8',
    'mods': ['*annee_2010', '*annee_2011'],
}
+
def istext(line):
    """Return True when *line* begins with the ``'**** '`` marker.

    The marker is four asterisks followed by one space; this is the same
    prefix that `converttabletocorpus` puts in front of the starred
    variables of each text.

    Args:
        line: the string to inspect.

    Returns:
        bool: True if *line* starts with ``'**** '``, False otherwise.
    """
    return line.startswith('**** ')
def isthem(line):
    """Return True when *line* begins with the ``'-*'`` marker.

    Args:
        line: the string to inspect.

    Returns:
        bool: True if *line* starts with ``'-*'``, False otherwise.
    """
    # NOTE(review): '-*' presumably introduces a thematic section inside a
    # text (see SplitFromThem) -- confirm against the corpus format.
    return line.startswith('-*')
varmod = [val.split('_') for val in line[1:]]
vars = [var[0] for var in varmod]
if variable in vars :
- return '_'.join([variable, varmod[vars.index(variable)][1]]).replace(u'*','')
+ return '_'.join([variable, varmod[vars.index(variable)][1]]).replace('*','')
else :
return False
line = line.split()
for mod in mods :
if mod in line[1:] :
- return mod.replace(u'*','')
+ return mod.replace('*','')
return False
class Extract :
+
def __init__(self, parent, option) :
dial = ExtractDialog(parent, option)
dial.CenterOnParent()
ExtractMods(parametres)
elif option == 'them' :
SplitFromThem(parametres)
- dial.Destroy()
- dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
- dial.ShowModal()
- dial.Destroy()
+ dial.Destroy()
+ dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
+ dial.ShowModal()
+ dial.Destroy()
+ else :
+ dial.Destroy()
+
class SplitFromVar :
+
def __init__(self, parametres) :
self.filein = parametres['filein']
self.var = parametres['var']
else :
keepline = False
if keepline :
- fileout.write(line.encode(self.encodeout))
+ fileout.write(line)
for f in filedict :
filedict[f].close()
+
class SplitFromThem :
+
def __init__(self, parametres) :
self.filein = parametres['filein']
self.them = parametres['them']
self.encodein = parametres['encodein']
self.encodeout = parametres['encodeout']
self.basepath = os.path.dirname(self.filein)
- self.pathout = os.path.join(self.basepath, '_'.join([them.replace(u'-*','') for them in self.them]))
+ self.pathout = os.path.join(self.basepath, '_'.join([them.replace('-*','') for them in self.them]))
self.fileout = open(self.pathout, 'w')
self.doparse()
self.fileout.close()
-
+
def doparse(self):
text = ''
keepline = False
if keepline :
text += line
self.writetext(self.fileout, lastet, text)
-
+
def writetext(self, fileout, lastet, text):
if text != '' :
- self.fileout.write(lastet.encode(self.encodeout) + text.encode(self.encodeout))
-
+ self.fileout.write(lastet + text)
+
class ExtractMods :
+
def __init__(self, parametres) :
self.onefile = parametres.get('onefile', False)
self.filein = parametres['filein']
self.encodeout = parametres['encodeout']
self.basepath = os.path.dirname(self.filein)
if self.onefile :
- filename = os.path.join(self.basepath, '_'.join([mod.replace(u'*','') for mod in self.mods])+'.txt')
+ filename = os.path.join(self.basepath, '_'.join([mod.replace('*','') for mod in self.mods])+'.txt')
self.fileout = open(filename, 'w')
self.doparse()
else :
keepline = False
if keepline :
- fileout.write(line.encode(self.encodeout))
+ fileout.write(line)
if not self.onefile :
for f in filedict :
filedict[f].close()
class SubCorpus(Corpus) :
+
def __init__(self, parent, corpus, sgts) :
Corpus.__init__(self, parent, corpus.parametres)
self.sgts = sgts
self.formes = {}
for forme in self.corpus.formes :
sgtseff = self.corpus.getformeuceseff(forme)
- sgts = set(self.sgts).intersection(sgtseff.keys())
+ sgts = set(self.sgts).intersection(list(sgtseff.keys()))
if len(sgts) :
self.formes[forme] = self.corpus.formes[forme]
self.formes[forme].freq = sum([sgtseff[sgt] for sgt in sgts])
def getlemuces(self, lem) :
return list(set(self.sgts).intersection(self.corpus.getlemuces(lem)))
+
def converttabletocorpus(table, fileout, enc='UTF8'):
    """Print a table in the starred-header corpus layout.

    The first row of *table* supplies the column names. In every row the
    last cell is kept apart: it is ignored in the header row and printed as
    the text line for data rows. Each data row is rendered as a
    ``**** *name_value *name_value ...`` line followed by its last cell.
    The list of column names is printed first.

    Args:
        table: list of rows, each a sequence of str. The header row is
            removed from this list in place.
        fileout: currently unused; the result is only printed.
        enc: currently unused.

    Returns:
        None.

    Raises:
        IndexError: if *table* is empty or a data row has no cell.
    """
    header = table.pop(0)
    # Drop the last column name: the last cell of each row is the text.
    names = header[:-1]
    print(names)
    entries = []
    for row in table:
        # zip() stops at the shorter sequence, so extra cells are ignored.
        stars = ' '.join('*' + '_'.join(pair) for pair in zip(names, row[:-1]))
        entries.append('\n'.join(['**** ' + stars, row[-1]]))
    print('\n'.join(entries))
    # NOTE(review): writing the result to `fileout` (with `enc`) was never
    # implemented; only a commented-out `with open(fileout, 'w')` stub existed.
-
-
-
+# execution directe ???
if __name__ == '__main__':
    # Manual run against the sample settings in `parametres`.
    #SplitFromVar(parametres)
    # ExtractMods.__init__ accepts only the settings dict; the extra
    # positional True that used to be passed here raised TypeError.
    # NOTE(review): if that True was meant to request a single output file,
    # set parametres['onefile'] = True before this call -- confirm intent.
    ExtractMods(parametres)