X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=blobdiff_plain;f=textstat.py;h=c25b9dcea30647650b437659f22f85903056131d;hp=2618c51e1994fb8a99fb1fa7c95dee4a5b6a4def;hb=0e02f9566eb56fcb0f16bd070248235f78093ff5;hpb=42a67a41b64a6e0cc3fd2a63a0749e9aa4b9374c diff --git a/textstat.py b/textstat.py index 2618c51..c25b9dc 100644 --- a/textstat.py +++ b/textstat.py @@ -2,24 +2,13 @@ # -*- coding: utf-8 -*- #Author: Pierre Ratinaud #Copyright (c) 2008-2012 Pierre Ratinaud -#Lisense: GNU/GPL +#License: GNU/GPL -#from chemins import ConstructPathOut, StatTxtPathOut, ffr -from chemins import PathOut +from chemins import ffr from analysetxt import AnalyseText -#from corpus import Corpus -from guifunct import getPage, getCorpus -from ConfigParser import RawConfigParser -from functions import sortedby, progressbar, CreateIraFile, exec_rcode, check_Rresult, DoConf -from dialog import StatDialog -from openanalyse import OpenAnalyse -#from ttparser import * +from functions import sortedby, progressbar, exec_rcode, check_Rresult import tempfile from time import sleep -import wx -import os - -print 'TEST LOGGING' import logging logger = logging.getLogger('iramuteq.textstat') @@ -37,10 +26,7 @@ class Stat(AnalyseText) : if self.dlg : if not 'dlg' in dir(self) : self.dlg = progressbar(self, 7) - #if not self.lem : - # formes = self.corpus.formes - #else : - # self.corpus.make_lems() + formes = self.corpus.lems tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1] tot = sortedby(tot, 2,1) @@ -54,44 +40,39 @@ class Stat(AnalyseText) : supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2] supp = sortedby(supp, 2, 1) - #print self.corpus.gethapaxbyuci() - supp = [[i, val] for i, val in enumerate(supp)] - #self.corpus.pathout = self.dictpathout - #self.corpus.make_type_tot() self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''} occurrences = sum([val[1][1] for val in tot]) + len(hapax) phapax = (float(len(hapax)) / float(occurrences)) * 100 phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100 moy_occu_mot = float(occurrences) / float(len(formes)) - txt = 'Globale\n' - txt += 'nombre d\'uci : %i\n' % len(self.corpus.ucis) - txt += 'nombre d\'occurrences : %i\n' % occurrences - txt += 'nombre de formes : %i\n' % (len(formes)) - txt += 'moyenne d\'occurrences par forme : %.2f\n' % moy_occu_mot - txt += 'nombre d\'hapax : %i (%.2f%% des occurrences - %.2f%% des formes)\n' % (len(hapax), phapax, phapax_forme) - print float(occurrences), float(len(self.corpus.ucis)) - txt += 'moyenne d\'occurrences par uci : %.2f' % (float(occurrences)/float(len(self.corpus.ucis))) + txt = ''.join([_(u'Abstract').decode('utf8'), '\n']) + txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)]) + txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences]) + txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))]) + txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n']) + #print float(occurrences), float(len(self.corpus.ucis)) + txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))]) if self.dlg : - self.dlg.Update(7, u'Ecriture...') + self.dlg.Update(7, u'Ecriture...') self.result['glob'] = txt self.print_result() # for Zipf grap txt = """ source("%s") tot <- read.csv2("%s", header = FALSE, row.names = 1) - """ % (self.parent.RscriptsPath['Rgraph'], self.pathout['total.csv']) + """ % (ffr(self.parent.RscriptsPath['Rgraph']), ffr(self.pathout['total.csv'])) if len(hapax) : txt += """ hapax <- read.csv2("%s", header = FALSE, row.names = 1) tot <- rbind(tot, hapax) - """ % self.pathout['hapax.csv'] + """ % ffr(self.pathout['hapax.csv']) txt += """ open_file_graph("%s", width = 400, height = 400) - plot(log(tot[,1]), log = 'x', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16) + plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16) dev.off() - """ % (self.pathout['zipf.png']) + """ % (ffr(self.pathout['zipf.png'])) tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR) with open(tmpscript, 'w') as f : f.write(txt) @@ -99,10 +80,7 @@ class Stat(AnalyseText) : while pid.poll() == None : sleep(0.2) check_Rresult(self.parent, pid) - #CreateIraFile(self.dictpathout, 0, corpname = os.path.basename(self.corpus.parametre['filename']), section = 'stat') if self.dlg : - #OpenAnalyse(self.parent, self.pathout['Analyse.ira']) - #self.DoLayout(self.parent) self.dlg.Destroy() def print_result(self) : @@ -110,201 +88,8 @@ class Stat(AnalyseText) : if key != 'glob' : dico = self.result[key] toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico] - #toprint = [[line[0], `line[1]`] for line in self.result[key]] with open(self.pathout['%s.csv' % key], 'w') as f : f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]).encode(self.parent.syscoding)) else : with open(self.pathout['%s.txt' % 'glob'], 'w') as f : f.write(self.result['glob'].encode(self.parent.syscoding)) - #self.parametres['pathout'] = self.pathout['Analyse.ira'] - #DoConf().makeoptions(['stat'],[self.parametres], self.pathout['Analyse.ira']) - - -#class Stat(): -# def __init__(self, parent, corpus, cmd = False, lem = True, exp = True): -##################################################################### -# logger.info('start text stat') -# self.conf = None -# self.parent = parent -# self.type = 'alceste' -# self.cmd = cmd -# self.ConfigPath = parent.ConfigPath -# self.DictPath = parent.DictPath -# self.KeyConf = RawConfigParser() -# self.KeyConf.read(self.ConfigPath['key']) -# page = getPage(self.parent) -# if page is not None : -# self.corpus = getCorpus(page) -# if self.corpus is not None : -# self.pathout = ConstructPathOut(self.corpus.parametre['openpath'], 'Stat') -# self.dictpathout = StatTxtPathOut(self.pathout) -# self.val = wx.ID_OK -# else : -# self.corpus = Corpus(parent) -# self.corpus.parametre['encodage'] = parent.corpus_encodage -# self.corpus.parametre['lang'] = parent.corpus_lang -# self.corpus.parametre['filename'] = parent.filename -# self.pathout = ConstructPathOut(self.corpus.parametre['filename'], 'Stat') -# self.dictpathout = StatTxtPathOut(self.pathout) -# self.corpus.dictpathout = self.dictpathout -# if not self.cmd : -# dial = StatDialog(self,parent) -# dial.CenterOnParent() -# self.val = dial.ShowModal() -# else : -# self.val = wx.ID_OK -# if self.val == wx.ID_OK : -# if not self.cmd : -# if dial.radio_lem.GetSelection() == 0 : lem = True -# else : lem = False -# if dial.exp.GetSelection() == 0 : exp = True -# else : exp = False -# self.make_uce = dial.check_uce.GetValue() -# self.corpus.parametre['nbforme_uce'] = dial.spin_ctrl_4.GetValue() -# self.corpus.parametre['max_actives'] = dial.spin_max_actives.GetValue() -# self.corpus.parametre['eff_min_uce'] = self.corpus.parametre['nbforme_uce'] -# else : -# lem = True -# exp = True -# self.make_uce = False -# self.corpus.parametre['nbforme_uce'] = None -# self.corpus.parametre['eff_min_uce'] = None -# self.corpus.parametre['lem'] = lem -# self.corpus.parametre['expressions'] = exp -# self.corpus.supplementaires = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "2"] -# self.corpus.typeactive = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "1"] -# self.make_corpus() -# -# if self.val == wx.ID_OK : -# if 'supplementaires' not in dir(self.corpus) : -# print 'supplementaire' -# self.corpus.supplementaires = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "2"] -# print self.corpus.supplementaires -# else : -# print 'corpus supplementaires' -# print self.corpus.supplementaires -# if 'typeactive' not in dir(self.corpus) : -# self.corpus.typeactive = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "1"] -# self.make_stats() -# -# def make_corpus(self) : -# if not self.cmd : -# self.dlg = progressbar(self, 7) -# else : -# self.dlg = None -# self.corpus.content = self.parent.content -# #print 'ATTENTION : FROM TT' -# #prepare_for_treetagger(self.corpus, self.parent) -# #get_ucis_from_tt(self.corpus) -# #qsdfqsdf -# ucis_txt, ucis_paras_txt = self.corpus.start_analyse(self.parent, dlg = self.dlg, cmd = self.cmd, fromtt = False) -# #self.corpus.make_et_table() -# #self.corpus.make_len_uce(self.corpus.get_tot_occ_from_ucis_txt(ucis_txt)) -## print 'ATTTTTENTION CHECK_DOUBLON' -## self.corpus.check_double(ucis_txt) -# del ucis_txt -# -# if not self.cmd : -# self.dlg.Update(5, '%i UCI...' % len(ucis_paras_txt)) -# self.corpus.make_ucis_paras_uces(ucis_paras_txt, make_uce = self.make_uce) -# del ucis_paras_txt -# -## print 'ATTENTION EFF PAR UCI' -## effuci = [[`i`, `len(uce)`] for i, uci in enumerate(self.corpus.ucis_paras_uces) for para in uci for uce in para] -## with open('/home/pierre/fac/identite/taille_uci.csv', 'w') as f : -## f.write('\n'.join([';'.join(val) for val in effuci])) -## print effuci[0:30] -## print max(effuci), min(effuci), float(sum(effuci))/float(len(effuci)) -## qsdfqsdfqsd -# -# -# if self.corpus.para_coords != [[] for val in self.corpus.para_coords] : -# self.corpus.parametre['para'] = True -# else : -# self.corpus.parametre['para'] = False -# self.corpus.make_etoiles(self.corpus.para_coords) -# -# print 'len(ucis_paras_uces', len(self.corpus.ucis_paras_uces) -# -# if not self.cmd : -# self.dlg.Update(6, u'Dictionnaires') -# uces, orderuces = self.corpus.make_forms_and_uces() -# self.corpus.make_lems(self.parent.lexique) -# -# def make_stats(self): -# if not self.cmd : -# if not 'dlg' in dir(self) : -# self.dlg = progressbar(self, 7) -# if not self.corpus.parametre['lem'] : -# formes = self.corpus.formes -# else : -# formes = self.corpus.make_lem_eff() -# tot = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][0] > 1] -# tot = sortedby(tot, 2,1) -# tot = [[i, val] for i, val in enumerate(tot)] -# hapax = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][0] == 1] -# hapax = sortedby(hapax, 1, 1) -# hapax = [[i, val] for i, val in enumerate(hapax)] -# act = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][2] in self.corpus.typeactive] -# act = sortedby(act, 2, 1) -# act = [[i, val] for i, val in enumerate(act)] -# supp = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][2] in self.corpus.supplementaires] -# supp = sortedby(supp, 2, 1) -# supp = [[i, val] for i, val in enumerate(supp)] -# self.corpus.dictpathout = self.dictpathout -# #self.corpus.make_type_tot() -# -# self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''} -# occurrences = sum([val[1][1] for val in tot]) + len(hapax) -# phapax = (float(len(hapax)) / float(occurrences)) * 100 -# phapax_forme = (float(len(hapax)) / (float(len(formes)) + len(hapax))) * 100 -# moy_occu_mot = float(occurrences) / float(len(formes)) -# txt = 'Globale\n' -# txt += 'nombre d\'uci : %i\n' % len(self.corpus.ucis) -# txt += 'nombre d\'occurrences : %i\n' % occurrences -# txt += 'nombre de formes : %i\n' % (len(formes) + len(hapax)) -# txt += 'moyenne d\'occurrences par forme : %.2f\n' % moy_occu_mot -# txt += 'nombre d\'hapax : %i (%.2f%% des occurrences - %.2f%% des formes)\n' % (len(hapax), phapax, phapax_forme) -# print float(occurrences), float(len(self.corpus.ucis)) -# txt += 'moyenne d\'occurrences par uci : %.2f' % (float(occurrences)/float(len(self.corpus.ucis))) -# if not self.cmd : -# self.dlg.Update(7, u'Ecriture...') -# self.result['glob'] = txt -# self.print_result() -# # for Zipf grap -# txt = """ -# source("%s") -# tot <- read.csv2("%s", header = FALSE, row.names = 1) -# hapax <- read.csv2("%s", header = FALSE, row.names = 1) -# tot <- rbind(tot, hapax) -# open_file_graph("%s", width = 400, height = 400) -# plot(log(tot[,1]), log = 'x', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16) -# dev.off() -# """ % (self.parent.RscriptsPath['Rgraph'], ffr(os.path.join(self.pathout, 'total.csv')), ffr(os.path.join(self.pathout, 'hapax.csv')), self.dictpathout['zipf']) -# tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR) -# with open(tmpscript, 'w') as f : -# f.write(txt) -# pid = exec_rcode(self.parent.RPath, tmpscript, wait = False) -# while pid.poll() == None : -# sleep(0.2) -# check_Rresult(self.parent, pid) -# self.corpus.save_corpus(self.dictpathout['db']) -# CreateIraFile(self.dictpathout, 0, corpname = os.path.basename(self.corpus.parametre['filename']), section = 'stat') -# if not self.cmd : -# OpenAnalyse(self.parent, self.dictpathout['ira']) -# #self.DoLayout(self.parent) -# self.dlg.Destroy() -# -# def print_result(self) : -# for key in self.result : -# if key != 'glob' : -# dico = self.result[key] -# toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico] -# #toprint = [[line[0], `line[1]`] for line in self.result[key]] -# output = open(os.path.join(self.pathout,'%s.csv' % key), 'w') -# output.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint])) -# output.close() -# else : -# output = open(os.path.join(self.pathout,'%s.txt' % 'glob'), 'w') -# output.write(self.result['glob']) -# output.close()