...

[iramuteq] / textstat.py
diff --git a/textstat.py b/textstat.py

index ee0e4ef..c25b9dc 100644 (file)
--- a/textstat.py
+++ b/textstat.py
@@ -4,21 +4,11 @@
  #Copyright (c) 2008-2012 Pierre Ratinaud
  #License: GNU/GPL
  
-#from chemins import ConstructPathOut, StatTxtPathOut, ffr
-from chemins import PathOut
+from chemins import ffr
  from analysetxt import AnalyseText
-#from corpus import Corpus
-from guifunct import getPage, getCorpus
-from ConfigParser import RawConfigParser
-from functions import sortedby, progressbar, CreateIraFile, exec_rcode, check_Rresult, DoConf
-from dialog import StatDialog
-from openanalyse import OpenAnalyse
-#from ttparser import * 
+from functions import sortedby, progressbar, exec_rcode, check_Rresult 
  import tempfile
  from time import sleep
-import wx
-import os
-
  import logging
  
  logger = logging.getLogger('iramuteq.textstat')
@@ -36,10 +26,7 @@ class Stat(AnalyseText) :
          if self.dlg :
              if not 'dlg' in dir(self) :
                  self.dlg = progressbar(self, 7)
-        #if not self.lem :
-        #    formes = self.corpus.formes
-        #else :
-        #    self.corpus.make_lems()
+
          formes = self.corpus.lems
          tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
          tot = sortedby(tot, 2,1)
@@ -53,44 +40,39 @@ class Stat(AnalyseText) :
          supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]        
          supp = sortedby(supp, 2, 1)
  
-        #print self.corpus.gethapaxbyuci()
-
          supp = [[i, val] for i, val in enumerate(supp)]
-        #self.corpus.pathout = self.dictpathout
-        #self.corpus.make_type_tot()
  
          self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
          occurrences = sum([val[1][1] for val in tot]) + len(hapax)
          phapax = (float(len(hapax)) / float(occurrences)) * 100
          phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
          moy_occu_mot = float(occurrences) / float(len(formes))
-        txt = 'Globale\n'
-        txt += 'nombre de textes : %i\n' % len(self.corpus.ucis)
-        txt += 'nombre d\'occurrences : %i\n' % occurrences
-        txt += 'nombre de formes : %i\n' % (len(formes))
-        txt += 'moyenne d\'occurrences par forme : %.2f\n' % moy_occu_mot
-        txt += 'nombre d\'hapax : %i (%.2f%% des occurrences - %.2f%% des formes)\n' % (len(hapax), phapax, phapax_forme)
-        print float(occurrences), float(len(self.corpus.ucis))
-        txt += 'moyenne d\'occurrences par texte : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))
+        txt = ''.join([_(u'Abstract').decode('utf8'), '\n'])
+        txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)])
+        txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences])
+        txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))])
+        txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n']) 
+        #print float(occurrences), float(len(self.corpus.ucis))
+        txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
          if self.dlg :
-             self.dlg.Update(7, u'Ecriture...')
+            self.dlg.Update(7, u'Ecriture...')
          self.result['glob'] = txt
          self.print_result()
          # for Zipf graph
          txt = """
          source("%s")
          tot <- read.csv2("%s", header = FALSE, row.names = 1)
-        """ % (self.parent.RscriptsPath['Rgraph'], self.pathout['total.csv'])
+        """ % (ffr(self.parent.RscriptsPath['Rgraph']), ffr(self.pathout['total.csv']))
          if len(hapax) :
              txt += """
              hapax <- read.csv2("%s", header = FALSE, row.names = 1)
              tot <- rbind(tot, hapax)
-            """ % self.pathout['hapax.csv']
+            """ % ffr(self.pathout['hapax.csv'])
          txt += """
          open_file_graph("%s", width = 400, height = 400)
          plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
          dev.off()
-        """ % (self.pathout['zipf.png'])
+        """ % (ffr(self.pathout['zipf.png']))
          tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
          with open(tmpscript, 'w') as f :
              f.write(txt)
@@ -98,10 +80,7 @@ class Stat(AnalyseText) :
          while pid.poll() == None :
              sleep(0.2)
          check_Rresult(self.parent, pid)
-        #CreateIraFile(self.dictpathout, 0, corpname = os.path.basename(self.corpus.parametre['filename']), section = 'stat')
          if self.dlg :
-            #OpenAnalyse(self.parent, self.pathout['Analyse.ira'])
-            #self.DoLayout(self.parent)
              self.dlg.Destroy()
  
      def print_result(self) :
@@ -109,201 +88,8 @@ class Stat(AnalyseText) :
              if key != 'glob' :
                  dico = self.result[key]
                  toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico]
-                #toprint = [[line[0], `line[1]`] for line in self.result[key]]
                  with open(self.pathout['%s.csv' % key], 'w') as f :
                      f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]).encode(self.parent.syscoding))
              else :
                  with open(self.pathout['%s.txt' % 'glob'], 'w') as f :
                      f.write(self.result['glob'].encode(self.parent.syscoding))
-            #self.parametres['pathout'] = self.pathout['Analyse.ira']
-            #DoConf().makeoptions(['stat'],[self.parametres], self.pathout['Analyse.ira'])
-
-
-#class Stat():
-#    def __init__(self, parent, corpus, cmd = False, lem = True, exp = True):
-#####################################################################   
-#        logger.info('start text stat')
-#        self.conf = None
-#        self.parent = parent
-#        self.type = 'alceste'
-#        self.cmd = cmd
-#        self.ConfigPath = parent.ConfigPath
-#        self.DictPath = parent.DictPath
-#        self.KeyConf = RawConfigParser()
-#        self.KeyConf.read(self.ConfigPath['key'])
-#        page = getPage(self.parent)
-#        if page is not None :
-#            self.corpus = getCorpus(page)
-#            if self.corpus is not None :
-#                self.pathout = ConstructPathOut(self.corpus.parametre['openpath'], 'Stat')
-#                self.dictpathout = StatTxtPathOut(self.pathout)
-#                self.val = wx.ID_OK
-#        else :
-#            self.corpus = Corpus(parent)
-#            self.corpus.parametre['encodage'] = parent.corpus_encodage
-#            self.corpus.parametre['lang'] = parent.corpus_lang
-#            self.corpus.parametre['filename'] = parent.filename
-#            self.pathout = ConstructPathOut(self.corpus.parametre['filename'], 'Stat')
-#            self.dictpathout = StatTxtPathOut(self.pathout)
-#            self.corpus.dictpathout = self.dictpathout
-#            if not self.cmd :
-#                dial = StatDialog(self,parent)
-#                dial.CenterOnParent()
-#                self.val = dial.ShowModal()
-#            else :
-#                self.val = wx.ID_OK
-#            if self.val == wx.ID_OK :
-#                if not self.cmd :
-#                    if dial.radio_lem.GetSelection() == 0 : lem = True
-#                    else : lem = False
-#                    if dial.exp.GetSelection() == 0 : exp = True
-#                    else : exp = False
-#                    self.make_uce = dial.check_uce.GetValue()
-#                    self.corpus.parametre['nbforme_uce'] = dial.spin_ctrl_4.GetValue()
-#                    self.corpus.parametre['max_actives'] = dial.spin_max_actives.GetValue()
-#                    self.corpus.parametre['eff_min_uce'] = self.corpus.parametre['nbforme_uce']
-#                else :
-#                    lem = True
-#                    exp = True
-#                    self.make_uce = False
-#                    self.corpus.parametre['nbforme_uce'] = None
-#                    self.corpus.parametre['eff_min_uce'] = None
-#                self.corpus.parametre['lem'] = lem
-#                self.corpus.parametre['expressions'] = exp
-#                self.corpus.supplementaires = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "2"]
-#                self.corpus.typeactive = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "1"]
-#                self.make_corpus()
-#
-#        if self.val == wx.ID_OK :
-#            if 'supplementaires' not in dir(self.corpus) :
-#                print 'supplementaire'
-#                self.corpus.supplementaires = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "2"]
-#                print self.corpus.supplementaires
-#            else :
-#                print 'corpus supplementaires'
-#                print self.corpus.supplementaires
-#            if 'typeactive' not in dir(self.corpus) :
-#                self.corpus.typeactive = [option for option in self.KeyConf.options('KEYS') if self.KeyConf.get('KEYS', option) == "1"]
-#            self.make_stats()
-#
-#    def make_corpus(self) :
-#        if not self.cmd :
-#            self.dlg = progressbar(self, 7)
-#        else :
-#            self.dlg = None
-#        self.corpus.content = self.parent.content
-#        #print 'ATTENTION : FROM TT'
-#        #prepare_for_treetagger(self.corpus, self.parent)
-#        #get_ucis_from_tt(self.corpus)
-#        #qsdfqsdf
-#        ucis_txt, ucis_paras_txt = self.corpus.start_analyse(self.parent, dlg = self.dlg, cmd = self.cmd, fromtt = False)
-#        #self.corpus.make_et_table()
-#        #self.corpus.make_len_uce(self.corpus.get_tot_occ_from_ucis_txt(ucis_txt))
-##        print 'ATTTTTENTION CHECK_DOUBLON'
-##        self.corpus.check_double(ucis_txt)
-#        del ucis_txt
-#        
-#        if not self.cmd :
-#            self.dlg.Update(5, '%i UCI...' % len(ucis_paras_txt))
-#        self.corpus.make_ucis_paras_uces(ucis_paras_txt, make_uce = self.make_uce)
-#        del ucis_paras_txt
-#
-##        print 'ATTENTION EFF PAR UCI'
-##        effuci = [[`i`, `len(uce)`] for i, uci in enumerate(self.corpus.ucis_paras_uces) for para in uci for uce in para]
-##        with open('/home/pierre/fac/identite/taille_uci.csv', 'w') as f :
-##            f.write('\n'.join([';'.join(val) for val in effuci]))
-##        print effuci[0:30]
-##        print max(effuci), min(effuci), float(sum(effuci))/float(len(effuci))
-##        qsdfqsdfqsd
-#
-#
-#        if self.corpus.para_coords != [[] for val in self.corpus.para_coords] :
-#            self.corpus.parametre['para'] = True
-#        else :
-#            self.corpus.parametre['para'] = False
-#        self.corpus.make_etoiles(self.corpus.para_coords)
-#
-#        print 'len(ucis_paras_uces', len(self.corpus.ucis_paras_uces)
-#        
-#        if not self.cmd :
-#            self.dlg.Update(6, u'Dictionnaires')
-#        uces, orderuces = self.corpus.make_forms_and_uces()
-#        self.corpus.make_lems(self.parent.lexique)
-#
-#    def make_stats(self):
-#        if not self.cmd :
-#            if not 'dlg' in dir(self) :
-#                self.dlg = progressbar(self, 7)
-#        if not self.corpus.parametre['lem'] :
-#            formes = self.corpus.formes
-#        else :
-#            formes = self.corpus.make_lem_eff()
-#        tot = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][0] > 1]
-#        tot = sortedby(tot, 2,1)
-#        tot = [[i, val] for i, val in enumerate(tot)]
-#        hapax = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][0] == 1]
-#        hapax = sortedby(hapax, 1, 1)
-#        hapax = [[i, val] for i, val in enumerate(hapax)]
-#        act = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][2] in self.corpus.typeactive]
-#        act = sortedby(act, 2, 1)
-#        act = [[i, val] for i, val in enumerate(act)]
-#        supp = [[forme, formes[forme][0], formes[forme][2]] for forme in formes if formes[forme][2] in self.corpus.supplementaires]
-#        supp = sortedby(supp, 2, 1)
-#        supp = [[i, val] for i, val in enumerate(supp)]
-#        self.corpus.dictpathout = self.dictpathout
-#        #self.corpus.make_type_tot()
-#
-#        self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
-#        occurrences = sum([val[1][1] for val in tot]) + len(hapax)
-#        phapax = (float(len(hapax)) / float(occurrences)) * 100
-#        phapax_forme = (float(len(hapax)) / (float(len(formes)) + len(hapax))) * 100
-#        moy_occu_mot = float(occurrences) / float(len(formes))
-#        txt = 'Globale\n'
-#        txt += 'nombre d\'uci : %i\n' % len(self.corpus.ucis)
-#        txt += 'nombre d\'occurrences : %i\n' % occurrences
-#        txt += 'nombre de formes : %i\n' % (len(formes) + len(hapax))
-#        txt += 'moyenne d\'occurrences par forme : %.2f\n' % moy_occu_mot
-#        txt += 'nombre d\'hapax : %i (%.2f%% des occurrences - %.2f%% des formes)\n' % (len(hapax), phapax, phapax_forme)
-#        print float(occurrences), float(len(self.corpus.ucis))
-#        txt += 'moyenne d\'occurrences par uci : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))
-#        if not self.cmd :
-#             self.dlg.Update(7, u'Ecriture...')
-#        self.result['glob'] = txt
-#        self.print_result()
-#        # for Zipf grap
-#        txt = """
-#        source("%s")
-#        tot <- read.csv2("%s", header = FALSE, row.names = 1)
-#        hapax <- read.csv2("%s", header = FALSE, row.names = 1)
-#        tot <- rbind(tot, hapax)
-#        open_file_graph("%s", width = 400, height = 400)
-#        plot(log(tot[,1]), log = 'x', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
-#        dev.off()
-#        """ % (self.parent.RscriptsPath['Rgraph'], ffr(os.path.join(self.pathout, 'total.csv')), ffr(os.path.join(self.pathout, 'hapax.csv')), self.dictpathout['zipf'])
-#        tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
-#        with open(tmpscript, 'w') as f :
-#            f.write(txt)
-#        pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
-#        while pid.poll() == None :
-#            sleep(0.2)
-#        check_Rresult(self.parent, pid)
-#        self.corpus.save_corpus(self.dictpathout['db'])
-#        CreateIraFile(self.dictpathout, 0, corpname = os.path.basename(self.corpus.parametre['filename']), section = 'stat')
-#        if not self.cmd :
-#            OpenAnalyse(self.parent, self.dictpathout['ira'])
-#            #self.DoLayout(self.parent)
-#            self.dlg.Destroy()
-#
-#    def print_result(self) :
-#        for key in self.result :
-#            if key != 'glob' :
-#                dico = self.result[key]
-#                toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico]
-#                #toprint = [[line[0], `line[1]`] for line in self.result[key]]
-#                output = open(os.path.join(self.pathout,'%s.csv' % key), 'w')
-#                output.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]))
-#                output.close()
-#            else :
-#                output = open(os.path.join(self.pathout,'%s.txt' % 'glob'), 'w')
-#                output.write(self.result['glob'])
-#                output.close()