1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
11 from time import sleep
17 #------------------------------------
18 # import des fichiers du projet
19 #------------------------------------
20 from chemins import ffr
21 from analysetxt import AnalyseText
22 from functions import sortedby, progressbar, exec_rcode, check_Rresult
# Module-level logger registered under the name 'iramuteq.textstat'.
25 logger = logging.getLogger('iramuteq.textstat')
28 class Stat(AnalyseText) :
def preferences(self):
    """Return the ``parametres`` attribute of this analysis as-is.

    Nothing is computed or modified: the very object stored on the
    instance is handed back to the caller.
    """
    return self.parametres
# NOTE(review): this span is the body of a Stat method whose `def` line is not
# visible in this excerpt (the numbering embedded in the lines jumps from 34 to
# 38), and indentation has been lost. Further gaps in that numbering (67, 70-73,
# 76-77, 81, 84, 86, 90, 94, 97) mean some statements are missing as well; the
# notes below describe only what is visible.
38 # if not 'dlg' in dir(self) :
# Create the progress dialog. The meaning of the argument 7 is defined by
# progressbar() -- presumably the number of steps, TODO confirm.
39 self.dlg = progressbar(self, 7)
# Table of forms taken from corpus.lems; entries expose .freq, .gram and .act.
40 formes = self.corpus.lems
# Forms occurring more than once, as [form, freq, gram] rows. sortedby() is a
# project helper whose (2, 1) arguments are not documented here -- TODO confirm
# the resulting order. enumerate() then pairs every row with its position so
# that dict(...) further down maps position -> row.
41 tot = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq > 1]
42 tot = sortedby(tot, 2, 1)
43 tot = [[i, val] for i, val in enumerate(tot)]
# Hapax: forms occurring exactly once (note the different sortedby arguments).
44 hapax = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].freq == 1]
45 hapax = sortedby(hapax, 1, 1)
46 hapax = [[i, val] for i, val in enumerate(hapax)]
# Forms whose .act flag is 1; stored under 'formes_actives' below.
47 act = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 1]
48 act = sortedby(act, 2, 1)
49 act = [[i, val] for i, val in enumerate(act)]
# Forms whose .act flag is 2; stored under 'formes_supplémentaires' below.
50 supp = [[forme, formes[forme].freq, formes[forme].gram] for forme in formes if formes[forme].act == 2]
51 supp = sortedby(supp, 2, 1)
52 supp = [[i, val] for i, val in enumerate(supp)]
# Dump one repr() per value returned by corpus.getucesize(), one per line.
# This open() passes no encoding, unlike the other open() calls in this file.
53 ucesize = self.corpus.getucesize()
54 with open(self.pathout['stsize.csv'], 'w') as f :
55 f.write('\n'.join([repr(val) for val in ucesize]))
# Result tables keyed by position; 'glob' receives the summary text further down.
56 self.result = {'total' : dict(tot), 'formes_actives' : dict(act), 'formes_supplémentaires' : dict(supp), 'hapax' : dict(hapax), 'glob' : ''}
# Total occurrences = summed frequencies of the repeated forms + one per hapax.
57 occurrences = sum([val[1][1] for val in tot]) + len(hapax)
# NOTE(review): the divisions below raise ZeroDivisionError when `occurrences`
# or len(formes) is 0 (a corpus without any form).
58 phapax = (float(len(hapax)) / float(occurrences)) * 100
59 phapax_forme = (float(len(hapax)) / (float(len(formes)))) * 100
# moy_occu_mot is not used by any line visible in this excerpt.
60 moy_occu_mot = float(occurrences) / float(len(formes))
# Human-readable summary. _() is neither defined nor imported in the visible
# code; presumably a gettext-style translation function installed globally --
# TODO confirm.
61 txt = ''.join([_('Abstract'), '\n'])
62 txt += ''.join([_('Number of texts'),' : ', '%i\n' % len(self.corpus.ucis)])
63 txt += ''.join([_("Number of occurrences"),' : %i\n' % occurrences])
64 txt += ''.join([_('Number of forms'), ' : %i\n' % (len(formes))])
65 txt += ''.join([_("Number of hapax"),' : %i (%.2f%% ' % (len(hapax),phapax), _('of occurrences'), ' - %.2f%% ' % phapax_forme, _('of forms'), ')\n'])
# NOTE(review): also divides by len(self.corpus.ucis); fails on a corpus with no text.
66 txt += ''.join([_("Mean of occurrences by text"), ' : %.2f' % (float(occurrences)/float(len(self.corpus.ucis)))])
68 self.dlg.Update(7, 'Ecriture...')
69 self.result['glob'] = txt
# NOTE(review): the lines numbered 74-91 are fragments of an R script held in
# triple-quoted Python strings formatted with %. The statements opening those
# strings are among the missing lines, so the fragments are left untouched and
# no comment is inserted between them. They read total.csv, hapax.csv and
# stsize.csv back in and target zipf.png and segments_size.png, every path
# going through ffr().
74 tot <- read.csv2("%s", header = FALSE, row.names = 1)
75 """ % (ffr(self.parent.RscriptsPath['Rgraph']), ffr(self.pathout['total.csv']))
78 hapax <- read.csv2("%s", header = FALSE, row.names = 1)
79 tot <- rbind(tot, hapax)
80 """ % ffr(self.pathout['hapax.csv'])
82 open_file_graph("%s", width = 400, height = 400)
83 plot(tot[,1], log = 'xy', xlab='log(rangs)', ylab = 'log(frequences)', col = 'red', pch=16)
85 """ % (ffr(self.pathout['zipf.png']))
87 stsize <- read.csv2("%s", header=F)
88 open_file_graph("%s", width = 400, height = 400)
89 barplot(table(stsize[,1]))
91 """ % (ffr(self.pathout['stsize.csv']), ffr(self.pathout['segments_size.png']))
# NOTE(review): tempfile.mktemp() only returns a name and is documented as
# deprecated because another process can create the file first; mkstemp() is
# the safe replacement.
92 tmpscript = tempfile.mktemp(dir=self.parent.TEMPDIR)
# The statement writing the script into tmpscript (embedded line 94) is missing
# from this excerpt.
93 with open(tmpscript, 'w', encoding='utf8') as f :
# Start R on the script with wait=False; the returned object is polled below, so
# it is presumably subprocess.Popen-like -- TODO confirm against exec_rcode().
95 pid = exec_rcode(self.parent.RPath, tmpscript, wait = False)
# Loop until poll() stops returning None. The loop body (embedded line 97) is
# missing here. `is None` would be the idiomatic comparison.
96 while pid.poll() == None :
# Hand the finished process to check_Rresult(), a project helper not shown here.
98 check_Rresult(self.parent, pid)
def print_result(self):
    """Write every entry of ``self.result`` to its output file.

    ``self.result`` holds two kinds of values:

    * tables, i.e. dicts whose values are ``[form, freq, gram]`` rows; each
      table is written to ``self.pathout['<key>.csv']`` as one
      ``form;freq;gram`` line per row, in the dict's iteration order, with
      the frequency rendered through ``repr()``;
    * the ``'glob'`` entry, a plain string holding the summary text, written
      verbatim to ``self.pathout['glob.txt']``.

    Files are opened with UTF-8 encoding and no trailing newline is added.
    """
    for key, content in self.result.items():
        if key == 'glob':
            # The summary is text, not a table: indexing it like a dict of
            # rows would raise TypeError, so it gets its own branch and is
            # written exactly once.
            with open(self.pathout['glob.txt'], 'w', encoding='utf8') as f:
                f.write(content)
            continue
        lines = [';'.join([row[0], repr(row[1]), row[2]]) for row in content.values()]
        with open(self.pathout['%s.csv' % key], 'w', encoding='utf8') as f:
            f.write('\n'.join(lines))