2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2008-2013, Pierre Ratinaud
9 from dialog import ExtractDialog
10 from corpus import Corpus, copycorpus
# Module-level sample parameters: an input corpus path ('filein') and a list
# of '*annee_...' modality tags ('mods').
# NOTE(review): only these two keys are visible in this excerpt, while the
# constructors that take this dict also index 'encodein' and 'encodeout' --
# confirm they are defined on the lines not shown here.
14 parametres = {'filein' : 'corpus/lru2.txt',
17 'mods' : [u'*annee_2010', u'*annee_2011']}
# Fragment of a definition whose header is outside this excerpt: tests whether
# `line` begins with the u'**** ' prefix. The branch body is not visible here.
20 if line.startswith(u'**** ') :
# testvar(line, variable): build a '<variable>_<value>' label for `variable`
# from the elements of `line`, with every u'*' removed from the result.
# NOTE(review): original lines 26, 29 and 31+ are absent from this excerpt, so
# how `line` is prepared (it is sliced with [1:] below, presumably after being
# split into tokens) and what happens when `variable` is not present cannot be
# confirmed from here.
25 def testvar(line, variable) :
# Split every element after the first one on '_'
# (e.g. u'*annee_2010' -> [u'*annee', u'2010']).
27 varmod = [val.split('_') for val in line[1:]]
# First piece of each split element. Note that `vars` shadows the builtin.
28 vars = [var[0] for var in varmod]
# Join `variable` with the second piece of its matching entry. list.index
# raises ValueError when `variable` is not in `vars`, unless that case is
# guarded on a line not shown.
30 return '_'.join([variable, varmod[vars.index(variable)][1]]).replace(u'*','')
# testmod(line, mods): only the signature and one return statement are visible
# in this excerpt. The visible return yields `mod` with every u'*' removed.
# `mod` is bound on lines not shown -- presumably while iterating over `mods`
# and testing each one against `line`; verify against the full file.
34 def testmod(line, mods) :
38 return mod.replace(u'*','')
# Constructor of a class whose header is outside this excerpt. It creates an
# ExtractDialog, obtains a parameter dict from it, runs SplitFromVar when
# option == 'splitvar' and (as far as the visible lines show) ExtractMods
# otherwise, then builds a 'Done !' message dialog.
#   parent: passed to both dialogs as their parent.
#   option: forwarded to ExtractDialog; the value 'splitvar' selects
#           SplitFromVar.
# NOTE(review): original lines 45, 47, 51, 53 and 55+ are missing here, so the
# test on `res`, the else branch and the display/destruction of the dialogs
# are assumptions to confirm.
43 def __init__(self, parent, option) :
44 dial = ExtractDialog(parent, option)
# Show the dialog modally; `res` receives its result code.
46 res = dial.ShowModal()
# Parameters collected by the dialog, handed unchanged to the class run below.
48 parametres = dial.make_param()
49 if option == 'splitvar' :
50 SplitFromVar(parametres)
52 ExtractMods(parametres)
54 dial = wx.MessageDialog(parent, 'Done !', style = wx.OK)
# Constructor of a class whose header is outside this excerpt (the 'var' key
# suggests it is the one invoked as SplitFromVar(parametres) -- confirm).
#   parametres: mapping that must provide 'filein', 'var', 'encodein' and
#               'encodeout'; they are read by direct indexing, so a missing
#               key raises KeyError on a dict.
59 def __init__(self, parametres) :
60 self.filein = parametres['filein']
61 self.var = parametres['var']
62 self.encodein = parametres['encodein']
63 self.encodeout = parametres['encodeout']
# Directory of the input file; output paths are later joined onto it.
64 self.basepath = os.path.dirname(self.filein)
# Fragment of the splitting routine (its `def` line and the loop/condition
# headers are not visible in this excerpt). Visible behaviour: open
# self.filein decoded with self.encodein, compute a label with
# testvar(line, self.var), open one '<label>.txt' file per distinct label
# under self.basepath, and write lines encoded with self.encodeout to the
# current output file.
# NOTE(review): the conditions deciding which lines are written, and the
# closing of the handles stored in `filedict`, are on lines not shown --
# confirm that every handle is closed.
70 with codecs.open(self.filein, 'r', self.encodein) as fin :
73 varmod = testvar(line, self.var)
# Open each output file only once and cache the handle under its label.
76 if varmod not in filedict :
77 filename = os.path.join(self.basepath, varmod + '.txt')
78 filedict[varmod] = open(filename, 'w')
79 fileout = filedict[varmod]
# NOTE(review): writing encoded bytes to a file opened with 'w' relies on
# Python 2 semantics; under Python 3 a text-mode file rejects bytes.
83 fileout.write(line.encode(self.encodeout))
# Constructor of a class whose header is outside this excerpt (the 'mods' key
# suggests it is the one invoked as ExtractMods(parametres) -- confirm).
#   parametres: mapping providing 'filein', 'mods', 'encodein', 'encodeout'
#               and optionally 'onefile' (False when absent).
88 def __init__(self, parametres) :
89 self.onefile = parametres.get('onefile', False)
90 self.filein = parametres['filein']
91 self.mods = parametres['mods']
92 self.encodein = parametres['encodein']
93 self.encodeout = parametres['encodeout']
# Directory of the input file; output paths are joined onto it.
94 self.basepath = os.path.dirname(self.filein)
# Combined output file named after all modalities joined by '_' with u'*'
# stripped (e.g. 'annee_2010_annee_2011.txt').
# NOTE(review): original line 95 is missing; presumably this file is only
# opened when self.onefile is true -- confirm.
96 filename = os.path.join(self.basepath, '_'.join([mod.replace(u'*','') for mod in self.mods])+'.txt')
97 self.fileout = open(filename, 'w')
# Fragment of the extraction routine (its `def` line and the loop/condition
# headers are not visible in this excerpt). Visible behaviour: open
# self.filein decoded with self.encodein, look up a modality with
# testmod(line, self.mods), select an output file and write lines encoded with
# self.encodeout to it.
# NOTE(review): the conditions deciding which lines are written and the
# closing of the output files are on lines not shown -- confirm.
103 with codecs.open(self.filein, 'r', self.encodein) as fin :
106 modinline = testmod(line, self.mods)
# Per-modality mode: one '<modality>.txt' file per matched modality, opened on
# first use and cached in `filedict`.
109 if not self.onefile :
110 if modinline not in filedict :
111 filename = os.path.join(self.basepath, modinline + '.txt')
112 filedict[modinline] = open(filename, 'w')
113 fileout = filedict[modinline]
# Presumably the else branch (original line 114 is not shown): single-file
# mode reuses the file opened in the constructor.
115 fileout = self.fileout
# NOTE(review): bytes written to a text-mode file -- Python 2 semantics.
119 fileout.write(line.encode(self.encodeout))
# Body of this condition is not visible; presumably it closes the
# per-modality files -- confirm.
120 if not self.onefile :
# SubCorpus: a Corpus built from an existing corpus and restricted to a given
# collection of segment identifiers (`sgts`). It keeps a copy of the source
# corpus in self.corpus and answers queries by intersecting that copy's
# results with the selected segments.
# NOTE(review): original lines 130, 140, 142, 146 and 149 are missing from
# this excerpt; assumptions about them are marked below.
127 class SubCorpus(Corpus) :
#   parent: forwarded to Corpus.__init__.
#   corpus: source corpus; its `parametres` and `pathout` are reused.
#   sgts:   identifiers of the segments to keep.
128 def __init__(self, parent, corpus, sgts) :
129 Corpus.__init__(self, parent, corpus.parametres)
# NOTE(review): self.sgts is read below but never assigned on a visible line;
# presumably the missing line 130 stores `sgts` there -- confirm.
131 self.corpus = copycorpus(corpus)
132 self.corpus.make_lems(self.parametres['lem'])
# Distinct `.uci` values of the selected segments, used as indexes into
# corpus.ucis on the next line.
133 textes = list(set([corpus.getucefromid(sgt).uci for sgt in sgts]))
134 self.ucis = [corpus.ucis[i] for i in textes]
# Keep only the selected segments inside each retained text.
# NOTE(review): these text objects come from `corpus.ucis`, not from the copy
# held in self.corpus, so this filtering appears to modify the caller's
# corpus in place -- confirm this is intended.
135 for texte in self.ucis :
136 texte.uces = [uce for uce in texte.uces if uce.ident in self.sgts]
137 self.make_formes(corpus)
138 self.pathout = corpus.pathout
139 self.parametres['sub'] = self.sgts

# make_formes(corpus): fill self.formes with the forms of self.corpus and set
# each form's `freq` to its total count over the selected segments. The
# `corpus` argument is not used on the visible lines.
141 def make_formes(self, corpus) :
143 for forme in self.corpus.formes :
# Per-segment counts for this form, used as a mapping below (keys(), indexing).
144 sgtseff = self.corpus.getformeuceseff(forme)
# Selected segments in which the form occurs. set(self.sgts) is rebuilt on
# every iteration.
145 sgts = set(self.sgts).intersection(sgtseff.keys())
# NOTE(review): original line 146 is missing; presumably it skips forms that
# occur in none of the selected segments -- confirm.
# The form object is shared with self.corpus, so its `freq` is overwritten
# there too.
147 self.formes[forme] = self.corpus.formes[forme]
148 self.formes[forme].freq = sum([sgtseff[sgt] for sgt in sgts])

# getlemuces(lem): list of the selected segments that also appear in
# self.corpus.getlemuces(lem). The order is that of a set, so it is arbitrary.
150 def getlemuces(self, lem) :
151 return list(set(self.sgts).intersection(self.corpus.getlemuces(lem)))
if __name__ == '__main__' :
    # Manual test entry point: run the modality extraction on the module-level
    # sample parameters.
    #SplitFromVar(parametres)
    # The constructor that reads parametres['mods'] accepts the parameter dict
    # only, so the former call ExtractMods(parametres, True) raised TypeError.
    # The flag is supplied through the 'onefile' key instead, which is the
    # only boolean option that constructor reads (presumed intent of the old
    # positional True -- confirm).
    parametres['onefile'] = True
    ExtractMods(parametres)