iramuteq.org Git - iramuteq/blob - textstat.py

   1 #!/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #Author: Pierre Ratinaud
   4 #Copyright (c) 2008-2012 Pierre Ratinaud
   5 #License: GNU/GPL
   6
   7 from chemins import ffr
   8 from analysetxt import AnalyseText
   9 from functions import sortedby, progressbar, exec_rcode, check_Rresult
  10 import tempfile
  11 from time import sleep
  12 import logging
  13 import gettext
  14 _ = gettext.gettext
  15
  16 logger = logging.getLogger('iramuteq.textstat')
  17
  18
  19
  20 class Stat(AnalyseText) :
  21     def doanalyse(self) :
  22         self.make_stats()
  23
  24     def preferences(self) :
  25         return self.parametres
  26
  27     def make_stats(self):
  28         if self.dlg :
  29             if not 'dlg' in dir(self) :
  30                 self.dlg = progressbar(self, 7)
  31
  32         formes = self.corpus.lems
  33         tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
  34         tot = sortedby(tot, 2,1)
  35         tot = [[i, val] for i, val in enumerate(tot)]
  36         hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1]
  37         hapax = sortedby(hapax, 1, 1)
  38         hapax = [[i, val] for i, val in enumerate(hapax)]
  39         act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1]
  40         act = sortedby(act, 2, 1)
  41         act = [[i, val] for i, val in enumerate(act)]
  42         supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]
  43         supp = sortedby(supp, 2, 1)
  44
  45         supp = [[i, val] for i, val in enumerate(supp)]
  46
  47         ucesize = self.corpus.getucesize()
  48         with open(self.pathout['stsize.csv'], 'w') as f :
  49             f.write('\n'.join([`val` for val in ucesize]))
  50
  51         self.result = {u'total' : dict(tot), u'formes_actives' : dict(act), u'formes_supplémentaires' : dict(supp), u'hapax' : dict(hapax), u'glob' : ''}
  52         occurrences = sum([val[1][1] for val in tot]) + len(hapax)
  53         phapax = (float(len(hapax)) / float(occurrences)) * 100
  54         phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
  55         moy_occu_mot = float(occurrences) / float(len(formes))
  56         txt = ''.join([_(u'Abstract').decode('utf8'), '\n'])
  57         txt += ''.join([_(u'Number of texts').decode('utf8'),' : ', '%i\n' % len(self.corpus.ucis)])
  58         txt += ''.join([_(u"Number of occurrences").decode('utf8'),' : %i\n' % occurrences])
  59         txt += ''.join([_(u'Number of forms').decode('utf8'), ' : %i\n' % (len(formes))])
  60         txt += ''.join([_(u"Number of hapax").decode('utf8'),' : %i (%.2f%%' % (len(hapax),phapax), _(u'of occurrences').decode('utf8'), ' - %.2f%% ' % phapax_forme, _(u'of forms').decode('utf8'), ')\n'])
  61         #print float(occurrences), float(len(self.corpus.ucis))
  62         txt += ''.join([_(u"Mean of occurrences by text").decode('utf8'), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
  63         if self.dlg :
  64             self.dlg.Update(7, u'Ecriture...')
  65         self.result['glob'] = txt
  66         self.print_result()
  67         # for Zipf grap
  68         txt = """
  69         source("%s")
  70         tot <- read.csv2("%s", header = FALSE, row.names = 1)
  71         """ % (ffr(self.parent.RscriptsPath['Rgraph']), ffr(self.pathout['total.csv']))
  72         if len(hapax) :
  73             txt += """
  74             hapax <- read.csv2("%s", header = FALSE, row.names = 1)
  75             tot <- rbind(tot, hapax)
  76             """ % ffr(self.pathout['hapax.csv'])
  77         txt += """
  78         open_file_graph("%s", width = 400, height = 400)
  79         plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
  80         dev.off()
  81         """ % (ffr(self.pathout['zipf.png']))
  82         txt += """
  83         stsize <- read.csv2("%s", header=F)
  84         open_file_graph("%s", width = 400, height = 400)
  85         barplot(table(stsize[,1]))
  86         dev.off()
  87         """ % (self.pathout['stsize.csv'], self.pathout['segments_size.png'])
  88         tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
  89         with open(tmpscript, 'w') as f :
  90             f.write(txt)
  91         pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
  92         while pid.poll() == None :
  93             sleep(0.2)
  94         check_Rresult(self.parent, pid)
  95         if self.dlg :
  96             self.dlg.Destroy()
  97
  98     def print_result(self) :
  99         for key in self.result :
 100             if key != 'glob' :
 101                 dico = self.result[key]
 102                 toprint = [[dico[val][0],`dico[val][1]`, dico[val][2]] for val in dico]
 103                 with open(self.pathout['%s.csv' % key], 'w') as f :
 104                     f.write('\n'.join([';'.join([val for val in ligne]) for ligne in toprint]).encode(self.parent.syscoding))
 105             else :
 106                 with open(self.pathout['%s.txt' % 'glob'], 'w') as f :
 107                     f.write(self.result['glob'].encode(self.parent.syscoding, errors='replace'))