2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
4 #Copyright (c) 2008-2012 Pierre Ratinaud
7 #from chemins import ConstructPathOut, StatTxtPathOut, ffr
8 from chemins import PathOut
9 from analysetxt import AnalyseText
10 #from corpus import Corpus
11 from guifunct import getPage, getCorpus
12 from ConfigParser import RawConfigParser
13 from functions import sortedby, progressbar, CreateIraFile, exec_rcode, check_Rresult, DoConf
14 from dialog import StatDialog
15 from openanalyse import OpenAnalyse
16 #from ttparser import *
18 from time import sleep
25 logger = logging.getLogger('iramuteq.textstat')
29 class Stat(AnalyseText) :
def preferences(self):
    """Show the statistics options dialog and record the lemmatisation choice.

    Opens a modal ``StatDialog`` and stores the state of its ``radio_lem``
    control under ``self.parametres['lem']``.

    Returns:
        ``self.parametres`` with its ``'lem'`` entry updated.
    """
    dial = StatDialog(self, self.parent)
    try:
        # NOTE(review): the modal result is not inspected here, so a
        # cancelled dialog is treated like a confirmed one -- confirm
        # whether cancelling should abort instead.
        dial.ShowModal()
        # Radio index 0 switches lemmatisation on (same convention as the
        # legacy constructor kept in comments at the end of this file);
        # any other index yields False.
        lem = dial.radio_lem.GetSelection() == 0
    finally:
        # A modal dialog is not destroyed automatically once it closes.
        dial.Destroy()
    self.parametres['lem'] = lem
    return self.parametres
# NOTE(review): the `def` line of this method is not visible in this excerpt;
# the commented-out legacy code further down suggests it is
# `make_stats(self)` -- confirm.
#
# Builds four ranked tables from the corpus lemmas (total, hapax, active
# forms, supplementary forms), derives global counts from them, stores
# everything in self.result, then runs an R script through exec_rcode and
# checks its result.
#
# Create a progress dialog only when the instance does not already have one.
51 if not 'dlg' in dir(self) :
52 self.dlg = progressbar(self, 7)
54 # formes = self.corpus.formes
56 # self.corpus.make_lems()
# Each entry of `formes` is read through its .freq, .gram and .act attributes.
57 formes = self.corpus.lems
# Rows are [forme, freq, gram]. `tot` keeps the entries seen more than once.
# NOTE(review): the meaning of the two numeric sortedby() arguments is not
# visible here -- verify against functions.sortedby.
58 tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
59 tot = sortedby(tot, 2,1)
# Pair every row with its position so dict() can later key it by that index.
60 tot = [[i, val] for i, val in enumerate(tot)]
# `hapax` keeps the entries seen exactly once.
61 hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1]
62 hapax = sortedby(hapax, 1, 1)
63 hapax = [[i, val] for i, val in enumerate(hapax)]
# Entries flagged act == 1 (stored below under 'formes_actives').
64 act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1]
65 act = sortedby(act, 2, 1)
66 act = [[i, val] for i, val in enumerate(act)]
# Entries flagged act == 2 (stored below under 'formes_supplémentaires').
67 supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]
68 supp = sortedby(supp, 2, 1)
70 #print self.corpus.gethapaxbyuci()
72 supp = [[i, val] for i, val in enumerate(supp)]
73 #self.corpus.pathout = self.dictpathout
74 #self.corpus.make_type_tot()
# Each table becomes {index: [forme, freq, gram]}; 'glob' is a placeholder
# that receives the text summary further down.
76 self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
# val[1][1] is the freq field of a ranked row; every hapax counts for one
# occurrence, hence len(hapax).
77 occurrences = sum([val[1][1] for val in tot]) + len(hapax)
# Hapax share of all occurrences and of all forms (percent), then the mean
# number of occurrences per form.
# NOTE(review): these divisions raise ZeroDivisionError when there are no
# forms or no occurrences.
78 phapax = (float(len(hapax)) / float(occurrences)) * 100
79 phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
80 moy_occu_mot = float(occurrences) / float(len(formes))
# NOTE(review): the statement that first assigns `txt` is not visible in this
# excerpt; the lines below only append to it.
82 txt += 'nombre d\'uci : %i\n' % len(self.corpus.ucis)
83 txt += 'nombre d\'occurrences : %i\n' % occurrences
84 txt += 'nombre de formes : %i\n' % (len(formes))
85 txt += 'moyenne d\'occurrences par forme : %.2f\n' % moy_occu_mot
86 txt += 'nombre d\'hapax : %i (%.2f%% des occurrences - %.2f%% des formes)\n' % (len(hapax), phapax, phapax_forme)
# Debug output on stdout (Python 2 print statement).
87 print float(occurrences), float(len(self.corpus.ucis))
# NOTE(review): raises ZeroDivisionError when the corpus has no uci.
88 txt += 'moyenne d\'occurrences par uci : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))
# Advance the progress dialog, then store the summary text under 'glob'.
90 self.dlg.Update(7, u'Ecriture...')
91 self.result['glob'] = txt
96 tot <- read.csv2("%s", header = FALSE, row.names = 1)
97 hapax <- read.csv2("%s", header = FALSE, row.names = 1)
98 tot <- rbind(tot, hapax)
99 open_file_graph("%s", width = 400, height = 400)
100 plot(log(tot[,1]), log = 'x', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
102 """ % (self.parent.RscriptsPath['Rgraph'], self.pathout['total.csv'], self.pathout['hapax.csv'], self.pathout['zipf.png'])
# The lines just above are the visible part of an R script template; it is
# filled with the Rgraph script path and the total.csv, hapax.csv and zipf.png
# paths. Its opening lines are not visible in this excerpt.
# NOTE(review): tempfile.mktemp only returns a name and is documented as
# deprecated because another process can create the file first.
103 tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
104 with open(tmpscript, 'w') as f :
# Start R without waiting, poll until poll() stops returning None, then let
# check_Rresult inspect the finished process.
# NOTE(review): the bodies of the `with` and `while` statements are not
# visible in this excerpt.
106 pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
107 while pid.poll() == None :
109 check_Rresult(self.parent, pid)
110 #CreateIraFile(self.dictpathout, 0, corpname = os.path.basename(self.corpus.parametre['filename']), section = 'stat')
112 #OpenAnalyse(self.parent, self.pathout['Analyse.ira'])
113 #self.DoLayout(self.parent)
def print_result(self):
    """Write the computed statistics to disk and record the analysis settings.

    Every table in ``self.result`` except ``'glob'`` is written to
    ``<key>.csv`` with one ``;``-separated row per entry, in ascending key
    order. The ``'glob'`` text is written to ``glob.txt``. Both outputs are
    encoded with ``self.parent.syscoding``. Finally the path of
    ``Analyse.ira`` is stored in ``self.parametres['pathout']`` and the
    parameters are passed to ``DoConf().makeoptions``.
    """
    encoding = self.parent.syscoding
    for key in self.result:
        if key == 'glob':
            # 'glob' holds the plain-text summary, not a table of rows;
            # it is written separately below.
            continue
        table = self.result[key]
        # Iterate the keys in sorted order so the rows reach the file in a
        # defined order instead of whatever order the dict yields.
        rows = [
            [table[rank][0], repr(table[rank][1]), table[rank][2]]
            for rank in sorted(table)
        ]
        content = '\n'.join(';'.join(row) for row in rows)
        with open(self.pathout['%s.csv' % key], 'w') as f:
            f.write(content.encode(encoding))
    with open(self.pathout['glob.txt'], 'w') as f:
        f.write(self.result['glob'].encode(encoding))
    self.parametres['pathout'] = self.pathout['Analyse.ira']
    DoConf().makeoptions(['stat'], [self.parametres], self.pathout['Analyse.ira'])
132 # def __init__(self, parent, corpus, cmd = False, lem = True, exp = True):
133 #####################################################################
134 # logger.info('start text stat')
136 # self.parent = parent
137 # self.type = 'alceste'
139 # self.ConfigPath = parent.ConfigPath
140 # self.DictPath = parent.DictPath
141 # self.KeyConf = RawConfigParser()
142 # self.KeyConf.read(self.ConfigPath['key'])
143 # page = getPage(self.parent)
144 # if page is not None :
145 # self.corpus = getCorpus(page)
146 # if self.corpus is not None :
147 # self.pathout = ConstructPathOut(self.corpus.parametre['openpath'], 'Stat')
148 # self.dictpathout = StatTxtPathOut(self.pathout)
149 # self.val = wx.ID_OK
151 # self.corpus = Corpus(parent)
152 # self.corpus.parametre['encodage'] = parent.corpus_encodage
153 # self.corpus.parametre['lang'] = parent.corpus_lang
154 # self.corpus.parametre['filename'] = parent.filename
155 # self.pathout = ConstructPathOut(self.corpus.parametre['filename'], 'Stat')
156 # self.dictpathout = StatTxtPathOut(self.pathout)
157 # self.corpus.dictpathout = self.dictpathout
159 # dial = StatDialog(self,parent)
160 # dial.CenterOnParent()
161 # self.val = dial.ShowModal()
163 # self.val = wx.ID_OK
164 # if self.val == wx.ID_OK :
166 # if dial.radio_lem.GetSelection() == 0 : lem = True
168 # if dial.exp.GetSelection() == 0 : exp = True
170 # self.make_uce = dial.check_uce.GetValue()
171 # self.corpus.parametre['nbforme_uce'] = dial.spin_ctrl_4.GetValue()
172 # self.corpus.parametre['max_actives'] = dial.spin_max_actives.GetValue()
173 # self.corpus.parametre['eff_min_uce'] = self.corpus.parametre['nbforme_uce']
177 # self.make_uce = False
178 # self.corpus.parametre['nbforme_uce'] = None
179 # self.corpus.parametre['eff_min_uce'] = None
180 # self.corpus.parametre['lem'] = lem
181 # self.corpus.parametre['expressions'] = exp
182 # self.corpus.supplementaires = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "2"]
183 # self.corpus.typeactive = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "1"]
186 # if self.val == wx.ID_OK :
187 # if 'supplementaires' not in dir(self.corpus) :
188 # print 'supplementaire'
189 # self.corpus.supplementaires = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "2"]
190 # print self.corpus.supplementaires
192 # print 'corpus supplementaires'
193 # print self.corpus.supplementaires
194 # if 'typeactive' not in dir(self.corpus) :
195 # self.corpus.typeactive = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "1"]
198 # def make_corpus(self) :
200 # self.dlg = progressbar(self, 7)
203 # self.corpus.content = self.parent.content
204 # #print 'ATTENTION : FROM TT'
205 # #prepare_for_treetagger(self.corpus, self.parent)
206 # #get_ucis_from_tt(self.corpus)
208 # ucis_txt, ucis_paras_txt = self.corpus.start_analyse(self.parent, dlg = self.dlg, cmd = self.cmd, fromtt = False)
209 # #self.corpus.make_et_table()
210 # #self.corpus.make_len_uce(self.corpus.get_tot_occ_from_ucis_txt(ucis_txt))
211 ## print 'ATTTTTENTION CHECK_DOUBLON'
212 ## self.corpus.check_double(ucis_txt)
216 # self.dlg.Update(5, '%i UCI...' % len(ucis_paras_txt))
217 # self.corpus.make_ucis_paras_uces(ucis_paras_txt, make_uce = self.make_uce)
220 ## print 'ATTENTION EFF PAR UCI'
221 ## effuci = [[`i`, `len(uce)`] for i, uci in enumerate(self.corpus.ucis_paras_uces) for para in uci for uce in para]
222 ## with open('/home/pierre/fac/identite/taille_uci.csv', 'w') as f :
223 ## f.write('\n'.join([';'.join(val) for val in effuci]))
224 ## print effuci[0:30]
225 ## print max(effuci), min(effuci), float(sum(effuci))/float(len(effuci))
229 # if self.corpus.para_coords != [[] for val in self.corpus.para_coords] :
230 # self.corpus.parametre['para'] = True
232 # self.corpus.parametre['para'] = False
233 # self.corpus.make_etoiles(self.corpus.para_coords)
235 # print 'len(ucis_paras_uces', len(self.corpus.ucis_paras_uces)
238 # self.dlg.Update(6, u'Dictionnaires')
239 # uces, orderuces = self.corpus.make_forms_and_uces()
240 # self.corpus.make_lems(self.parent.lexique)
242 # def make_stats(self):
244 # if not 'dlg' in dir(self) :
245 # self.dlg = progressbar(self, 7)
246 # if not self.corpus.parametre['lem'] :
247 # formes = self.corpus.formes
249 # formes = self.corpus.make_lem_eff()
250 # tot = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][0] > 1]
251 # tot = sortedby(tot, 2,1)
252 # tot = [[i, val] for i, val in enumerate(tot)]
253 # hapax = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][0] == 1]
254 # hapax = sortedby(hapax, 1, 1)
255 # hapax = [[i, val] for i, val in enumerate(hapax)]
256 # act = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][2] in self.corpus.typeactive]
257 # act = sortedby(act, 2, 1)
258 # act = [[i, val] for i, val in enumerate(act)]
259 # supp = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][2] in self.corpus.supplementaires]
260 # supp = sortedby(supp, 2, 1)
261 # supp = [[i, val] for i, val in enumerate(supp)]
262 # self.corpus.dictpathout = self.dictpathout
263 # #self.corpus.make_type_tot()
265 # self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
266 # occurrences = sum([val[1][1] for val in tot]) + len(hapax)
267 # phapax = (float(len(hapax)) / float(occurrences)) * 100
268 # phapax_forme = (float(len(hapax)) / (float(len(formes)) + len(hapax))) * 100
269 # moy_occu_mot = float(occurrences) / float(len(formes))
271 # txt += 'nombre d\'uci : %i\n' % len(self.corpus.ucis)
272 # txt += 'nombre d\'occurrences : %i\n' % occurrences
273 # txt += 'nombre de formes : %i\n' % (len(formes) + len(hapax))
274 # txt += 'moyenne d\'occurrences par forme : %.2f\n' % moy_occu_mot
275 # txt += 'nombre d\'hapax : %i (%.2f%% des occurrences - %.2f%% des formes)\n' % (len(hapax), phapax, phapax_forme)
276 # print float(occurrences), float(len(self.corpus.ucis))
277 # txt += 'moyenne d\'occurrences par uci : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))
279 # self.dlg.Update(7, u'Ecriture...')
280 # self.result['glob'] = txt
281 # self.print_result()
285 # tot <- read.csv2("%s", header = FALSE, row.names = 1)
286 # hapax <- read.csv2("%s", header = FALSE, row.names = 1)
287 # tot <- rbind(tot, hapax)
288 # open_file_graph("%s", width = 400, height = 400)
289 # plot(log(tot[,1]), log = 'x', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
291 # """ % (self.parent.RscriptsPath['Rgraph'], ffr(os.path.join(self.pathout, 'total.csv')), ffr(os.path.join(self.pathout, 'hapax.csv')), self.dictpathout['zipf'])
292 # tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
293 # with open(tmpscript, 'w') as f :
295 # pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
296 # while pid.poll() == None :
298 # check_Rresult(self.parent, pid)
299 # self.corpus.save_corpus(self.dictpathout['db'])
300 # CreateIraFile(self.dictpathout, 0, corpname = os.path.basename(self.corpus.parametre['filename']), section = 'stat')
302 # OpenAnalyse(self.parent, self.dictpathout['ira'])
303 # #self.DoLayout(self.parent)
306 # def print_result(self) :
307 # for key in self.result :
309 # dico = self.result[key]
310 # toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico]
311 # #toprint = [[line[0], `line[1]`] for line in self.result[key]]
312 # output = open(os.path.join(self.pathout,'%s.csv' % key), 'w')
313 # output.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]))
316 # output = open(os.path.join(self.pathout,'%s.txt' % 'glob'), 'w')
317 # output.write(self.result['glob'])