# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2008-2013, Pierre Ratinaud
-#Lisense: GNU GPL
+#License: GNU GPL
import codecs
import os
else :
return False
def isthem(line):
    """Tell whether *line* is a thematic marker line.

    A line is a thematic marker when it begins with ``-*``. The test is made
    on the line exactly as given: leading whitespace is not skipped.

    Returns True or False.
    """
    # startswith() already yields a bool; no if/else wrapper needed.
    return line.startswith(u'-*')
+
def testvar(line, variable) :
line = line.split()
varmod = [val.split('_') for val in line[1:]]
parametres = dial.make_param()
if option == 'splitvar' :
SplitFromVar(parametres)
- else :
+ elif option == 'mods' :
ExtractMods(parametres)
- dial.Destroy()
- dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
- dial.ShowModal()
- dial.Destroy()
+ elif option == 'them' :
+ SplitFromThem(parametres)
+ dial.Destroy()
+ dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
+ dial.ShowModal()
+ dial.Destroy()
+ else :
+ dial.Destroy()
class SplitFromVar :
def __init__(self, parametres) :
keepline = False
filedict = {}
with codecs.open(self.filein, 'r', self.encodein) as fin :
- for line in fin :
- if istext(line) :
- varmod = testvar(line, self.var)
- if varmod :
- keepline = True
- if varmod not in filedict :
- filename = os.path.join(self.basepath, varmod + '.txt')
- filedict[varmod] = open(filename, 'w')
- fileout = filedict[varmod]
- else :
- keepline = False
- if keepline :
- fileout.write(line.encode(self.encodeout))
+ for line in fin :
+ if istext(line) :
+ varmod = testvar(line, self.var)
+ if varmod :
+ keepline = True
+ if varmod not in filedict :
+ filename = os.path.join(self.basepath, varmod + '.txt')
+ filedict[varmod] = open(filename, 'w')
+ fileout = filedict[varmod]
+ else :
+ keepline = False
+ if keepline :
+ fileout.write(line.encode(self.encodeout))
for f in filedict :
filedict[f].close()
class SplitFromThem :
    """Copy the selected thematic sections of a corpus into one output file.

    The corpus is read line by line. A line accepted by ``istext`` starts a
    new text, and a line accepted by ``isthem`` starts a thematic section.
    For every text that contains at least one kept line, the text's header
    line is written followed by the kept lines (marker lines included).
    """

    def __init__(self, parametres) :
        """Read the settings from *parametres* and run the extraction.

        Keys read from *parametres*:
            'filein'    -- path of the corpus to read
            'them'      -- thematic markers to keep; each one is compared
                           with the whitespace-stripped marker line
            'encodein'  -- encoding used to decode the input file
            'encodeout' -- encoding used to encode what is written

        The output file is created in the directory of the input file. Its
        name is the selected markers with '-*' removed, joined by '_'.
        """
        self.filein = parametres['filein']
        self.them = parametres['them']
        self.encodein = parametres['encodein']
        self.encodeout = parametres['encodeout']
        self.basepath = os.path.dirname(self.filein)
        outname = '_'.join([them.replace(u'-*', '') for them in self.them])
        self.pathout = os.path.join(self.basepath, outname)
        self.fileout = open(self.pathout, 'w')
        try :
            self.doparse()
        finally :
            # Close the output even when reading or decoding the input fails.
            self.fileout.close()

    def doparse(self):
        """Scan the input file and write the kept lines of each text."""
        kept = []
        keepline = False
        lastet = ''
        with codecs.open(self.filein, 'r', self.encodein) as fin :
            for line in fin :
                if istext(line) :
                    # A new text begins: flush what was kept for the previous one.
                    self.writetext(self.fileout, lastet, ''.join(kept))
                    kept = []
                    lastet = line
                    # The selection must not carry over from the previous
                    # text; otherwise this header line would be kept as
                    # content and written a second time by writetext().
                    keepline = False
                if isthem(line) :
                    keepline = line.strip() in self.them
                if keepline :
                    kept.append(line)
        # Flush the last text of the file.
        self.writetext(self.fileout, lastet, ''.join(kept))

    def writetext(self, fileout, lastet, text):
        """Write *lastet* followed by *text* to *fileout*, both encoded.

        Nothing is written when *text* is empty, so a text without any kept
        line leaves no header in the output.
        """
        if text != '' :
            fileout.write(lastet.encode(self.encodeout) + text.encode(self.encodeout))
+
+
class ExtractMods :
def __init__(self, parametres) :
self.onefile = parametres.get('onefile', False)
keepline = False
filedict = {}
with codecs.open(self.filein, 'r', self.encodein) as fin :
- for line in fin :
- if istext(line) :
- modinline = testmod(line, self.mods)
- if modinline :
- keepline = True
- if not self.onefile :
+ for line in fin :
+ if istext(line) :
+ modinline = testmod(line, self.mods)
+ if modinline :
+ keepline = True
+ if not self.onefile :
if modinline not in filedict :
filename = os.path.join(self.basepath, modinline + '.txt')
filedict[modinline] = open(filename, 'w')
fileout = filedict[modinline]
- else :
- fileout = self.fileout
- else :
- keepline = False
- if keepline :
- fileout.write(line.encode(self.encodeout))
+ else :
+ fileout = self.fileout
+ else :
+ keepline = False
+ if keepline :
+ fileout.write(line.encode(self.encodeout))
if not self.onefile :
for f in filedict :
filedict[f].close()
def getlemuces(self, lem) :
    """Return the items of ``self.corpus.getlemuces(lem)`` found in ``self.sgts``.

    The result is a list without duplicates; its order is unspecified
    because it is produced from a set intersection.
    """
    own = set(self.sgts)
    candidates = self.corpus.getlemuces(lem)
    return list(own.intersection(candidates))
-
def converttabletocorpus(table, fileout, enc='UTF8') :
    """Print *table* rendered as a corpus, one ``****`` header per row.

    table   -- sequence of rows. The first row holds the column names. In
               every row the last cell is the text; each other cell is
               paired with its column name as ``*name_value``. Cells are
               joined as strings.
    fileout -- not used yet (see the TODO at the end of the body)
    enc     -- not used yet

    The variable names are printed first, then each row as its header line
    followed by its text. *table* is left untouched, and an empty table
    prints nothing.
    """
    if not table :
        return
    # Slice, do not pop(): the caller's list must keep its header row.
    header, rows = table[0], table[1:]
    var = header[:-1]
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and also parses under Python 3.
    print(var)
    txt = []
    for row in rows :
        pairs = zip(var, row[:-1])
        et = '**** ' + ' '.join(['*' + '_'.join(val) for val in pairs])
        txt.append('\n'.join([et, row[-1]]))
    print('\n'.join(txt))
    # TODO: write the result to `fileout` (presumably encoded with `enc`);
    # only the printed preview exists for now.
+