1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
11 from subprocess import Popen, call, PIPE
23 from shutil import copyfile
26 #from dialog import BugDialog
28 from operator import itemgetter
30 #------------------------------------
31 # import des modules wx
32 #------------------------------------
36 #------------------------------------
37 # import des fichiers du projet
38 #------------------------------------
39 from configparser import ConfigParser
# Module-level logger shared by the whole application.
log = logging.getLogger('iramuteq')

# Names of the similarity/association indices available for similarity
# analyses.  NOTE(review): presumably consumed positionally by the GUI and
# forwarded to the R backend -- confirm against callers before reordering.
indices_simi = ['cooccurrence' ,'pourcentage de cooccurrence','Russel','Jaccard', 'Kulczynski1', 'Kulczynski2', 'Mountford', 'Fager', 'simple matching', 'Hamman', 'Faith', 'Tanimoto', 'Dice', 'Phi', 'Stiles', 'Michael', 'Mozley', 'Yule', 'Yule2', 'Ochiai', 'Simpson', 'Braun-Blanquet','Chi-squared', 'Phi-squared', 'Tschuprow', 'Cramer', 'Pearson', 'binomial']
def open_folder(folder):
    """Open *folder* with the platform's default file manager."""
    if sys.platform == "win32":
        # NOTE(review): the win32 branch body is not visible in this excerpt
        # (presumably os.startfile) -- confirm against the full source.
        # On macOS use 'open', elsewhere assume a freedesktop environment.
        opener ="open" if sys.platform == "darwin" else "xdg-open"
        #call([opener, folder])
        # shell=True with interpolation: only safe because *folder* comes
        # from the application itself, never from untrusted input.
        call(["%s %s &" % (opener, folder)], shell=True)
def normpath_win32(path) :
    """Collapse doubled backslashes in a win32 path; restore a leading
    UNC-style separator.  On non-win32 platforms the path is returned
    unchanged (return statements are elided in this excerpt)."""
    if not sys.platform == 'win32' :
        # (early return elided here)
    while '\\\\' in path :
        path = path.replace('\\\\', '\\')
    # a path that started with '\\\\' (UNC share) lost one separator above
    if path.startswith('\\') and not path.startswith('\\\\') :
    def __init__(self, path = None, encoding = 'utf8'):
        # Handler for a "tgen" definition file (tab-separated:
        # tgen name -> list of word forms).
        self.encoding = encoding

    def __getitem__(self, key):
        # (body elided in this excerpt -- presumably returns self.tgen[key];
        #  confirm against the full source)

    def read(self, path = None):
        # Load the tgen file: one entry per line, fields tab-separated;
        # first field is the tgen name, the remaining ones its word forms.
        with codecs.open(path, 'r', self.encoding) as f :
        tgen = [line.split('\t') for line in tgen.splitlines()]
        tgen = dict([[line[0], line[1:]] for line in tgen])
    def write(self, path = None):
        # Serialize self.tgen back to disk, one tab-separated line per entry.
        with open(path, 'w', encoding='utf8') as f :
            f.write('\n'.join(['\t'.join([val] + self.tgen[val]) for val in self.tgen]))
    def writetable(self, pathout, tgens, totocc):
        # Write a tgens-by-etoiles count table to *pathout* (tab-separated),
        # finishing with a totals row whose name is made unique w.r.t. tgens.
        etoiles = list(totocc.keys())
        with open(pathout, 'w', encoding='utf8') as f :
            line = '\t'.join(['tgens'] + etoiles) + '\n'
            line = '\t'.join([t] + [repr(tgens[t][et]) for et in etoiles]) + '\n'
            # pick a totals-row name that does not clash with an existing tgen
            while totname + repr(i) in tgens :
            totname = totname + repr(i)
            line = '\t'.join([totname] + [repr(totocc[et]) for et in etoiles]) + '\n'
    def __init__(self, filein, syscoding = 'utf8') :
        # JSON-backed registry of corpora, matrices and their analyses.
        self.syscoding = syscoding
        self.openedcorpus = {}   # corpora currently opened, keyed by uuid
        self.openedmatrix = {}   # matrices currently opened, keyed by uuid
        # -- read(): load the JSON history file and build lookup indexes --
        with open(self.filein, 'r') as fjson :
#        d = shelve.open(self.filein, protocol=1)
        self.history = d.get('history', [])
        self.matrix = d.get('matrix', [])
        # derived indexes: uuid -> position / record, for O(1) lookups
        self.ordercorpus = dict([[corpus['uuid'], i] for i, corpus in enumerate(self.history)])
        self.corpus = dict([[corpus['uuid'], corpus] for corpus in self.history])
        self.analyses = dict([[analyse['uuid'], analyse] for corpus in self.history for analyse in corpus.get('analyses', [])])
        self.matrixanalyse = dict([[mat['uuid'], mat] for mat in self.matrix])
        self.ordermatrix = dict([[matrix['uuid'], i] for i, matrix in enumerate(self.matrix)])
        # -- save(): persist the current state back to the JSON file --
        d['history'] = self.history
        d['matrix'] = self.matrix
        with open(self.filein, 'w') as f :
            # default=str: non-JSON-serializable values are stringified
            f.write(json.dumps(d, indent=4, default=str))
        #d = shelve.open(self.filein, protocol=1)
    def add(self, analyse) :
        # Register a corpus or an analysis in the history.  A record that
        # carries a 'corpus' key is an analysis and is attached to its parent
        # corpus; otherwise it is a corpus and is appended to the top level.
        log.info('add to history %s' % analyse.get('corpus_name', 'pas un corpus'))
        tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
        if tosave['uuid'] in self.corpus :
            log.info('problem : this uuid is already in history : %s' % tosave['uuid'])
        if analyse.get('corpus', False) :
            if analyse['uuid'] in self.analyses :
            tosave['corpus'] = analyse['corpus']
            tosave['name'] = analyse['name']
            acorpus_uuid = analyse['corpus']
            if acorpus_uuid in self.corpus :
                if 'analyses' in self.history[self.ordercorpus[acorpus_uuid]] :
                    self.history[self.ordercorpus[acorpus_uuid]]['analyses'].append(tosave)
                    # (else branch elided) first analysis for this corpus:
                    self.history[self.ordercorpus[acorpus_uuid]]['analyses'] = [tosave]
                # parent corpus unknown: keep the analysis as an orphan
                self.orph.append(tosave)
            # record is a corpus, not an analysis
            tosave['corpus_name'] = analyse['corpus_name']
            #self.ordercorpus[tosave['uuid']] = len(history)
            #self.corpus[tosave['uuid']] = analyse
            self.history.append(tosave)
    def addMatrix(self, analyse) :
        # Register a new matrix in the history (tosave construction elided
        # in this excerpt).
        #tosave['matrix_name'] = analyse['matrix_name']
        tosave['analyses'] = []
        self.matrix.append(tosave)
    def addMatrixAnalyse(self, analyse) :
        # Attach an analysis record to its parent matrix, if that matrix is
        # known in the history.
        tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type'], 'matrix' : analyse['matrix']}
        tosave['name'] = analyse['name']
        if tosave['matrix'] in self.ordermatrix :
            self.matrix[self.ordermatrix[tosave['matrix']]]['analyses'].append(tosave)
    def addmultiple(self, analyses) :
        # Register several analyses at once, each attached to its parent
        # corpus when that corpus is already in the history.
        log.info('add multiple')
        for analyse in analyses :
            tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
            corpus = analyse['corpus']
            tosave['corpus'] = corpus
            tosave['name'] = analyse['name']
            if corpus in self.corpus :
                if 'analyses' in self.history[self.ordercorpus[corpus]] :
                    self.history[self.ordercorpus[corpus]]['analyses'].append(tosave)
                    # (else branch elided) first analysis for this corpus:
                    self.history[self.ordercorpus[corpus]]['analyses'] = [tosave]
    def delete(self, analyse, corpus = False) :
        # Remove a corpus, a corpus analysis, a matrix, or a matrix analysis
        # from the history, depending on where its uuid is indexed.
        log.info('delete %s' % analyse.get('name', 'noname'))
        self.history.pop(self.ordercorpus[analyse['uuid']])
        if analyse['uuid'] in self.openedcorpus :
            del self.openedcorpus[analyse['uuid']]
            log.info('delete corpus : %s' % analyse['uuid'])
        elif analyse['uuid'] in self.analyses :
            # analysis attached to a corpus: pop it from the parent's list
            todel = [i for i, ana in enumerate(self.corpus[analyse['corpus']]['analyses']) if ana['uuid'] == analyse['uuid']][0]
            self.history[self.ordercorpus[analyse['corpus']]]['analyses'].pop(todel)
        elif analyse['uuid'] in self.matrixanalyse :
            self.matrix = [mat for mat in self.matrix if mat['uuid'] != analyse['uuid']]
        elif analyse.get('matrix', False) in self.matrixanalyse :
            analyses = self.matrix[self.ordermatrix[analyse['matrix']]]['analyses']
            topop = [i for i, val in enumerate(analyses) if analyse['uuid'] == val['uuid']][0]
            self.matrix[self.ordermatrix[analyse['matrix']]]['analyses'] = analyses
219 def addtab(self, analyse) :
220 self.opened[analyse['uuid']] = analyse
222 def rmtab(self, analyse) :
223 del self.opened[analyse['uuid']]
    def update(self, analyse) :
        # Merge updated fields into the matching history record; the kind of
        # record is inferred from which discriminating key is present.
        if 'matrix_name' in analyse :
            self.matrixanalyse[analyse['uuid']].update(analyse)
        elif 'corpus_name' in analyse :
            self.corpus[analyse['uuid']].update(analyse)
        elif 'corpus' in analyse :
            self.analyses[analyse['uuid']].update(analyse)
            # (elif branch elided) matrix analysis case:
            toupdate = [an for an in self.matrixanalyse[analyse['matrix']]['analyses'] if an['uuid'] == analyse['uuid']]
            toupdate[0].update(analyse)
        # -- clean(): drop history entries whose files vanished on disk --
        # (method header elided in this excerpt)
        corpustodel = [corpus for corpus in self.history if not os.path.exists(corpus['ira'])]
        for corpus in corpustodel :
            print('cleaning :', corpus['corpus_name'])
            self.delete(corpus, corpus = True)
        # analyses whose .ira file no longer exists ('/' defaults to "missing")
        anatodel = [analyse for corpus in self.history for analyse in corpus.get('analyses', []) if not os.path.exists(analyse.get('ira', '/'))]
        for analyse in anatodel :
            print('cleaning :', analyse['name'])
        # -- statistics interior (method header and counter initialisation
        #    elided in this excerpt): aggregate counts and cumulated times
        #    over every corpus / analysis recorded in the history --
        for corpus in self.history :
            analysenb += len(corpus.get('analyses', []))
            analyses = corpus.get('analyses', [])
            for analyse in analyses :
                if os.path.exists(analyse['ira']) :
                    ana = DoConf(analyse['ira']).getoptions()
                    # 'time' is stored as e.g. "1h 2m 3s"
                    time = ana['time'].split()
                    ha += int(time[0].replace('h','')) * 3600
                    ma += int(time[1].replace('m','')) * 60
                    sa += int(time[2].replace('s',''))
            if os.path.exists(corpus['ira']) :
                param = DoConf(corpus['ira']).getoptions()
                time = param.get('time','0h 0m 0s')
                hours += int(time[0].replace('h','')) * 3600
                minutes += int(time[1].replace('m','')) * 60
                secondes += int(time[2].replace('s',''))
                if param.get('originalpath', False) :
                    # count distinct source corpora via their original path
                    if param['originalpath'] in corpusnb :
                        corpusnb[param['originalpath']] += 1
                        tokens += int(param['occurrences'])
                        corpusnb[param['originalpath']] = 1
            if corpus['ira'] in todel :
        print('Nbr total de corpus : %s' % len(self.history))
        corpus_nb = len(corpusnb) + len(todel)
        print('Nbr de corpus différents : %s' % corpus_nb)
        lentodel = len(todel)
        print('Nbr de corpus à supprimer : %s' % lentodel)
        print('Nbr de sous corpus : %s' % subnb)
        print("Nbr total d'occurrences : %s" % tokens)
        print('Moyenne occurrences par corpus : %f' % (tokens/corpus_nb))
        print('---------------------')
        print("Nbr total d'analyses : %s" % analysenb)
        print('Temps total indexation : %f h' % ((hours+minutes+secondes) / 3600))
        print('Temps total analyses : %f h' % ((ha+ma+sa) / 3600))
        # -- __str__ body (its def line is elided in this excerpt) --
        return str(self.history)
    def __init__(self, configfile=None, diff = None, parametres = None) :
        # Thin wrapper around ConfigParser for the analysis .ini files.
        self.configfile = configfile
        # interpolation=None: stored values may contain raw '%' characters
        # that would otherwise trip ConfigParser interpolation -- TODO confirm
        self.conf = ConfigParser(interpolation=None)
        if configfile is not None :
            configfile = normpath_win32(configfile)
            self.conf.read_file(codecs.open(configfile, 'r', 'utf8'))
        if parametres is not None :
            self.doparametres(parametres)

    def doparametres(self, parametres) :
        # (body elided in this excerpt)
324 def getsections(self) :
325 return self.conf.sections()
    def getoptions(self, section = None, diff = None):
        # Convert one config section into a dict, coercing values:
        # digit strings -> int, 'True'/'False' -> bool, '(...)'/'[...]'
        # -> Python literals, anything else kept as a string.
            # default to the first section (guard elided in this excerpt)
            section = self.conf.sections()[0]
        for option in self.conf.options(section) :
            if self.conf.get(section, option).isdigit() :
                parametres[option] = int(self.conf.get(section, option))
            elif self.conf.get(section, option) == 'False' :
                parametres[option] = False
            elif self.conf.get(section, option) == 'True' :
                parametres[option] = True
            elif self.conf.get(section, option).startswith('(') and self.conf.get(section, option).endswith(')') :
                parametres[option] = ast.literal_eval(self.conf.get(section, option))
            elif self.conf.get(section, option).startswith('[') and self.conf.get(section, option).endswith(']') :
                parametres[option] = ast.literal_eval(self.conf.get(section, option))
                # (else branch elided) plain string value:
                parametres[option] = self.conf.get(section, option)
        if 'type' not in parametres :
            parametres['type'] = section
    def makeoptions(self, sections, parametres, outfile = None) :
        # Write one parameter dict per section into the ConfigParser and to
        # *outfile* (defaults to self.configfile).  *parametres* is a list
        # parallel to *sections*.
        for i, section in enumerate(sections) :
            txt += '[%s]\n' % section
            if not self.conf.has_section(section) :
                self.conf.add_section(section)
            for option in parametres[i] :
                if isinstance(parametres[i][option], int) :
                    self.conf.set(section, option, repr(parametres[i][option]))
                    txt += '%s = %i\n' % (option, parametres[i][option])
                elif isinstance(parametres[i][option], str) :
                    self.conf.set(section, option, parametres[i][option])
                    txt += '%s = %s\n' % (option, parametres[i][option])
                elif isinstance(parametres[i][option], wx.Colour) :
                    self.conf.set(section, option, str(parametres[i][option]))
                    txt += '%s = %s\n' % (option, str(parametres[i][option]))
                elif option == 'analyses' :
                    # (fall-through branch elided) other value types:
                    self.conf.set(section, option, repr(parametres[i][option]))
                    txt += '%s = %s\n' % (option, repr(parametres[i][option]))
        # (guard elided) default output file:
            outfile = self.configfile
        outfile = normpath_win32(outfile)
        with open(outfile, 'w', encoding="utf-8") as f :
    def totext(self, parametres) :
        # Render a parameter dict as human-readable "key \t\t: value" lines.
        for val in parametres :
            if isinstance(parametres[val], int) :
                txt.append(' \t\t: '.join([val, repr(parametres[val])]))
            elif isinstance(parametres[val], str) :
                txt.append(' \t\t: '.join([val, parametres[val]]))
            elif val in ['listet', 'stars'] :
                # (fall-through branch elided) other value types via repr:
                txt.append(' \t\t: '.join([val, repr(parametres[val])]))
        return '\n'.join(txt)
def write_tab(tab, fileout) :
    """Write a 2-D table to *fileout* as a ';'-separated CSV file.

    Args:
        tab: iterable of rows (each an iterable of values); non-numeric
            values are quoted (csv.QUOTE_NONNUMERIC).
        fileout: path of the CSV file to create (overwritten if present).
    """
    # Use a context manager so the handle is closed deterministically --
    # the previous version leaked the file object returned by open().
    with open(fileout, 'w', newline='', encoding='utf8') as f :
        csvWriter = csv.writer(f, delimiter=';', quoting = csv.QUOTE_NONNUMERIC)
        csvWriter.writerows(tab)
class BugDialog(wx.Dialog):
    """Modal dialog used to display error reports (see BugReport below)."""

    def __init__(self, *args, **kwds):
        # begin wxGlade: MyDialog.__init__
        kwds["style"] = wx.DEFAULT_DIALOG_STYLE | wx.STAY_ON_TOP
        kwds["size"] = wx.Size(500, 200)
        wx.Dialog.__init__(self, *args, **kwds)
        self.SetTitle(kwds['title'])
        # multi-line read-write text area holding the error text
        self.text_ctrl_1 = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE)
        self.text_ctrl_1.SetBackgroundColour('#DDE8EB')
        self.button_1 = wx.Button(self, wx.ID_OK, "")
        self.__set_properties()

    def __set_properties(self):
        # begin wxGlade: MyDialog.__set_properties
        self.SetMinSize(wx.Size(500, 200))
        self.text_ctrl_1.SetMinSize(wx.Size(500, 200))

    def __do_layout(self):
        # begin wxGlade: MyDialog.__do_layout
        sizer_1 = wx.BoxSizer(wx.VERTICAL)
        sizer_1.Add(self.text_ctrl_1, 1, wx.EXPAND, 0)
        sizer_1.Add(self.button_1, 0, wx.ALIGN_CENTER_HORIZONTAL, 0)
        self.SetSizer(sizer_1)
def CreateIraFile(DictPathOut, clusternb, corpname='corpus_name', section = 'analyse'):
    """Write the analysis metadata (.ira file) describing a clustering run.

    NOTE(review): the explicit close of *fileout* is elided in this excerpt
    -- confirm the full source closes the handle.
    """
    AnalyseConf = ConfigParser()
    AnalyseConf.read(DictPathOut['ira'])
    AnalyseConf.add_section(section)
    date = datetime.datetime.now().ctime()
    AnalyseConf.set(section, 'date', str(date))
    AnalyseConf.set(section, 'clusternb', clusternb)
    AnalyseConf.set(section, 'corpus_name', corpname)
    fileout = open(DictPathOut['ira'], 'w', encoding='utf8')
    AnalyseConf.write(fileout)
def multisort(liste2d, ordre, indices_tri):
    """Sort *liste2d* in place on one or more column indices.

    Python 3 replacement for the old ``cmp``-based sort: rows (indexable
    sequences) are ordered by the columns listed in *indices_tri*, first
    index most significant.  This is achieved by applying Python's stable
    sort once per key, least-significant key first.

    Args:
        liste2d: list of rows (lists or tuples).
        ordre: True for descending order, False for ascending.
        indices_tri: iterable of column indices to sort on.

    Returns:
        The same list object, sorted in place.
    """
    # itemgetter, not attrgetter: rows are indexed by position, not by
    # attribute name.  (The previous version also referenced a non-existent
    # ``indices_tri.Tuple`` attribute, which raised AttributeError.)
    for key in reversed(tuple(indices_tri)):
        liste2d.sort(key=itemgetter(key), reverse=ordre)
    return liste2d
def sortedby(liste2d, direct, *indices):
    """
    sortedby: sort a list of lists (e.g. a table) by one or more indices
    (columns of the table) and return the sorted list

    ex:
    for list = [[2,3],[1,2],[3,1]]:
    sortedby(list,1) will return [[3, 1], [1, 2], [2, 3]],
    sortedby(list,0) will return [[1, 2], [2, 3], [3, 1]]

    NOTE(review): docstring asks why this is not replaced by multisort --
    the two coexist; direct == 2 means descending order here.
    """
    # legacy 2to3 translation kept for reference:
    # nlist = map(lambda x, indices=indices:
    #             map(lambda i, x=x: x[i], indices) + [x],
    # iramuteq passé à 2to3
    # nlist = list(map(lambda x, indices=indices:
    #             list(map(lambda i, x=x: x[i], indices)) + [x],
    # stable sort applied per key, least-significant key first
    for key in reversed(indices):
        liste2d.sort(key=itemgetter(key), reverse=(direct==2))
    # (return statement elided in this excerpt)
    # sorted_list = multisort(liste2d, direct, *indices)
    # nlist.sort(reverse=True)
    # sorted_list = multisort(liste2d, direct, *indices)
    # return [l[-1] for l in nlist]
def add_type(line, dictlem):
    # Append the grammatical type of the lemma in column 4 when it is known
    # (the else branch and return are elided in this excerpt).
    if line[4] in dictlem:
        line.append(dictlem[line[4]])
def treat_line_alceste(i, line) :
    # Normalise one profile row: reformat the p-value in column 5
    # (decimal comma -> point, thresholds for significance) and reorder
    # the columns for display.
    if line[0] == '*' or line[0] == '*****' :
    elif float(line[5].replace(',', '.')) < 0.0001:
    elif float(line[5].replace(',', '.')) > 0.05:
        # not significant above 5%
        line[5] = 'NS (%s)' % str(float(line[5].replace(',', '.')))[0:7]
        # (else branch elided) plain truncated p-value:
        line[5] = str(float(line[5].replace(',', '.')))[0:7]
    return [i, int(line[0]), int(line[1]), float(line[2]), float(line[3]), line[6], line[4], line[5]]
def ReadProfileAsDico(File, Alceste=False, encoding = 'utf8'):
    """Parse a ';'-separated profile file into {cluster label: rows}.

    NOTE(review): FileReader is never closed in the visible lines --
    confirm against the full source.
    """
    print('lecture des profiles')
    FileReader = open(File, 'r', encoding='utf8')
    Filecontent = FileReader.readlines()
    #rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace(',', '.').replace('\r','').split(';') for row in Filecontent]
    rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace('\r','').split(';') for row in Filecontent]
    ClusterNb = rows[0][2]
    # '**' rows carry cluster labels, '****' rows the per-cluster summary
    clusters = [row[2] for row in rows if row[0] == '**']
    valclusters = [row[1:4] for row in rows if row[0] == '****']
    lp = [i for i, line in enumerate(rows) if line[0] == '****']
    # slice the row stream into one profile block per cluster
    prof = [rows[lp[i] + 1:lp[i+1] - 1] for i in range(0, len(lp)-1)] + [rows[lp[-1] + 1:len(rows)]]
    prof = [[add_type(row, dictlem) for row in pr] for pr in prof]
    prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
    # (else branch elided) non-Alceste profiles get a padding column:
    prof = [[line + [''] for line in pr] for pr in prof]
    prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
    for i, cluster in enumerate(clusters):
        DictProfile[cluster] = [valclusters[i]] + prof[i]
def GetTxtProfile(dictprofile, cluster_size) :
    # Render the per-cluster profiles as fixed-width text, one section per
    # class (the proflist initialisation line is elided in this excerpt).
    for classe in range(0, len(dictprofile)) :
        prof = dictprofile[str(classe + 1)]
        clinfo = cluster_size[classe]
        proflist.append('\n'.join([' '.join(['classe %i' % (classe + 1), '-', '%s uce sur %s - %s%%' % (clinfo[0], clinfo[1], clinfo[2])]), '\n'.join(['%5s|%5s|%6s|%6s|%8s|%8s|%20s\t%10s' % tuple([str(val) for val in line]) for line in prof if len(line)==8])]))
    return '\n\n'.join(proflist)
def formatExceptionInfo(maxTBlevel=5):
    """Return (exception name, first argument, formatted traceback) for the
    exception currently being handled; falls back to "<no args>" when the
    exception carries no arguments (the try/except around exc.args is
    elided in this excerpt)."""
    cla, exc, trbk = sys.exc_info()
    excName = cla.__name__
    excArgs = exc.args[0]
    excArgs = "<no args>"
    excTb = traceback.format_tb(trbk, maxTBlevel)
    return (excName, excArgs, excTb)
# function contributed by IUT students
def decoupercharact(chaine, longueur, longueurOptimale, separateurs = None) :
    """Find a good split point in *chaine* close to *longueurOptimale*.

    Walk back from the last character towards the beginning of the string.
    If a '$' is found, split there.  Otherwise keep the candidate separator
    with the best weight/distance ratio.

    Returns (found, words_before_split, remainder).
    """
        # default separators with their weights (guard elided in excerpt)
        separateurs = [['.', 60.0], ['?', 60.0], ['!', 60.0], ['£$£', 60], [':', 50.0], [';', 40.0], [',', 10.0], [' ', 0.1]]
    trouve = False                 # whether a good separator was found
    iDecoupe = 0                   # index of the character where to split

    # cap the working string (at most *longueur* characters)
    longueur = min(longueur, len(chaine) - 1)
    chaineTravail = chaine[:longueur + 1]
    meilleur = ['', 0, 0]  # type, weight and position of the best separator

    # first check whether a '$' marker is present
    indice = chaineTravail.find('$')

    # if not, scan backwards for the best separator
        caractere = chaineTravail[nbCar]
        distance = abs(longueurOptimale - nbCar) + 1
        meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1

        # is the current character a punctuation mark?
        for s in separateurs:
            if caractere == s[0]:
                # it is: keep it if its weight/distance ratio is better
                if s[1] / distance > float(meilleur[1]) / meilleureDistance:
                # and stop the search
        # move to the previous character
    fin = chaine[iDecoupe + 1:]
    retour = chaineTravail[:iDecoupe]
    return len(retour) > 0, retour.split(), fin
    # nothing was found: return the whole string unsplit
    return False, chaine.split(), ''
# User-facing (French) messages for the corpus-formatting errors raised with
# ``raise Exception('<key> <line number>')`` and decoded in BugReport below.
exceptions = {'paragrapheOT' : "Un problème de formatage (présence d'un marqueur de paragraphe (-*) en dehors d'un texte) est survenu à la ligne ",
              'EmptyText' : "Texte vide (probablement un problème de formatage du corpus). Le problème est apparu à la ligne ",
              'CorpusEncoding' : "Problème d'encodage.",
              'TextBeforeTextMark' : "Problème de formatage : du texte avant le premier marqueur de texte (****). Le problème est survenu à la ligne ",
              'MissingAnalyse' : 'Aucun fichier à cet emplacement :\n',
def BugReport(parent, error = None):
    """Show the current exception to the user in a BugDialog.

    Known corpus-formatting errors (keys of ``exceptions``) get a friendly
    message; anything else is shown with its traceback.
    """
    # close any progress dialog still open on the parent window
    for ch in parent.GetChildren():
        if "<class 'wx._windows.ProgressDialog'>" == str(type(ch)):
    excName, exc, excTb = formatExceptionInfo()
    if excName == 'Exception' :
        # "known" errors are raised as 'key linenumber' or just 'key'
        if len(exc.split()) == 2 :
            mss, linenb = exc.split()
            if mss in exceptions :
                txt = exceptions[mss] + linenb
            if exc in exceptions :
                txt = exceptions[exc]
        title = "Information"
        # (else branch elided) unexpected exception: dump the traceback
        txt = '\n !== BUG ==! \n'
        txt += '*************************************\n'
        txt += '\n'.join(excTb).replace('    ', ' ')
        txt += excName + '\n'
    dial = BugDialog(parent, **{'title' : title})
    if 'Rerror' in dir(parent) :
    dial.text_ctrl_1.write(txt)
    dial.CenterOnParent()
def PlaySound(parent):
    """Play the end-of-analysis sound if enabled in the preferences."""
    if parent.pref.getboolean('iramuteq', 'sound') :
        # on GTK2 use aplay directly; elsewhere go through wx.adv.Sound
        if "gtk2" in wx.PlatformInfo:
            error = Popen(['aplay','-q',os.path.join(parent.AppliPath,'son_fin.wav')])
            # (else branch elided)
            sound = wx.adv.Sound(os.path.join(parent.AppliPath, 'son_fin.wav'))
            sound.Play(wx.adv.SOUND_SYNC)
def ReadDicoAsDico(dicopath):
    """Load a tab-separated dictionary file.

    Each non-empty line becomes one entry: the first column is the key,
    the remaining columns the value list.  Double quotes are stripped.
    """
    with open(dicopath, 'r', encoding='UTF8') as handle:
        raw_lines = handle.readlines()
    result = {}
    for raw in raw_lines:
        if raw == '':
            continue
        cleaned = raw.rstrip('\n\r').replace('\n', '').replace('"', '')
        fields = cleaned.split('\t')
        result[fields[0]] = fields[1:]
    return result
def ReadLexique(parent, lang = 'french', filein = None):
    """Load the lemma dictionary for *lang* onto parent.lexique.

    When *filein* is given it overrides the per-language default path
    (branch structure partly elided in this excerpt).
    """
        parent.lexique = ReadDicoAsDico(parent.DictPath.get(lang, 'french'))
        parent.lexique = ReadDicoAsDico(filein)
        parent.lexique = ReadDicoAsDico(filein)
def ReadList(filein, encoding = 'utf8', sep = ';'):
    # Parse a separated-values file into a table; the header row is popped
    # off first, numeric cells are normalised (decimal comma -> point) and
    # rounded to 5 decimals.  Several original lines are elided here.
    with open(filein, 'r', encoding='utf8') as f :
    content = [line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.').split(sep) for line in content.splitlines()]
    #file = codecs.open(filein, 'r', encoding)
    #content = file.readlines()
    first = content.pop(0)
    #first = first.replace('\n', '').replace('\r','').replace('\"', '').split(sep)
    #line = line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.')
    #line = line.split(';')
    don = float('%.5f' % float(val))
def exec_RCMD(rpath, command) :
    """Run ``R CMD INSTALL <command>`` with the R binary at *rpath* and
    return the exit status (return line elided in this excerpt)."""
    log.info('R CMD INSTALL %s' % command)
    # double the backslashes so the path survives the subprocess call on win32
    rpath = rpath.replace('\\','\\\\')
    error = call(["%s" % rpath, 'CMD', 'INSTALL', "%s" % command])
def exec_rcode(rpath, rcode, wait = True, graph = False):
    """Run an R script.

    wait=True  -> blocking ``call`` returning the exit status;
    wait=False -> non-blocking ``Popen`` returning the process object.
    The graph/needX11 branches handle old macOS versions that require X11
    for graphics.  Several branch/return lines are elided in this excerpt.
    """
    log.info("R Script : %s" % rcode)
    if sys.platform == 'darwin' :
            # mac_ver() -> e.g. '10.4.11'; X11 is needed before 10.5
            macversion = platform.mac_ver()[0].split('.')
            if int(macversion[1]) < 5 :
    rpath = rpath.replace('\\','\\\\')
    env = os.environ.copy()
    if sys.platform == 'darwin' and 'LC_ALL' not in env:
        # force a UTF-8 locale so R reads the script correctly
        env['LC_ALL'] = 'en_US.UTF-8'
        # -- no graphics --
        if sys.platform == 'win32':
            error = call(["%s" % rpath, "--vanilla","--slave","-f", "%s" % rcode])
            error = call([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], env = env)
        if sys.platform == 'win32':
            pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
            pid = Popen([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8') #PIPE ou STDOUT ?
        # -- graphics needed --
        if sys.platform == 'win32':
            error = call(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
        elif sys.platform == 'darwin' and needX11:
            os.environ['DISPLAY'] = ':0.0'
            error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
            error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
        if sys.platform == 'win32':
            pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
        elif sys.platform == 'darwin' and needX11:
            os.environ['DISPLAY'] = ':0.0'
            pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
            pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
def check_Rresult(parent, pid) :
    """Inspect the result of an R run (Popen object or exit status) and
    raise / report an error when R failed.  Several branches are elided
    in this excerpt."""
    if isinstance(pid, Popen) :
        if pid.returncode != 0 :
            # collect stdout/stderr for the error report
            error = pid.communicate()
            error = [str(error[0]), error[1]]
            if error[1] is None :
            parent.Rerror = '\n'.join([str(pid.returncode), '\n'.join(error)])
            raise Exception('\n'.join(['Erreur R', '\n'.join(error[1:])]))
            # (non-Popen path elided)
            raise Exception('Erreur R')
def launchcommand(mycommand):
    # (body elided in this excerpt -- presumably a Popen/call wrapper)

def print_liste(filename,liste):
    """Write *liste* (list of rows) to *filename*, ';'-separated, one row
    per line (the loop header is elided in this excerpt)."""
    with open(filename,'w', encoding='utf8') as f :
            f.write(';'.join(graph) +'\n')
def read_list_file(filename, encoding = 'utf8'):
    """Read a ';'-separated file into a list of rows, skipping blank lines
    (the return statement is elided in this excerpt)."""
    with open(filename,'r', encoding='utf8') as f:
        content=f.readlines()
        ncontent=[line.replace('\n','').split(';') for line in content if line.strip() != '']
def progressbar(self, maxi):
    """Create and return a wx.ProgressDialog with *maxi* steps
    (parts of the setup are elided in this excerpt)."""
    ira = wx.GetApp().GetTopWindow()
    prog = wx.ProgressDialog("Traitements",
                             "Veuillez patienter...",
                             style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE | wx.PD_ELAPSED_TIME | wx.PD_CAN_ABORT
    # NOTE(review): original comment says ABORT is not always honoured
    prog.SetSize((400,150))
    #prog.SetIcon(ira._icon)
def treat_var_mod(variables) :
    """Group 'variable_modality' strings into {variable: [modalities]}
    (dict construction lines are elided in this excerpt)."""
    variables = list(set(variables))
    varmod = [variable.split('_') for variable in variables]
    # distinct variable names = first '_'-separated component
    vars = list(set([var[0] for var in varmod if len(var) >=2]))
        mods = ['_'.join(v) for v in varmod if v[0] == var]
#    for variable in variables :
#        if '_' in variable :
#            forme = variable.split('_')
#            if not var in var_mod :
#                var_mod[var] = [variable]
#                if not mod in var_mod[var] :
#                    var_mod[var].append(variable)
def doconcorde(corpus, uces, mots, uci = False) :
    """Build an HTML concordance for the given text segments.

    Returns (ucis_txt, ucestxt): per-segment metadata headers and the
    segment texts with the target word forms highlighted in red.
    Several lines (branches, loop headers) are elided in this excerpt.
    """
        ucestxt1 = [row for row in corpus.getconcorde(uces)]
        # (uci=True branch) whole-text concordance instead of segments:
        ucestxt1 = [row for row in corpus.getuciconcorde(uces)]
    ucestxt1 = dict(ucestxt1)
    # expand each lemma into all of its word forms, then map each form to
    # its highlighted HTML replacement
    listmot = [corpus.getlems()[lem].formes for lem in mots]
    listmot = [corpus.getforme(fid).forme for lem in listmot for fid in lem]
    mothtml = ['<font color=red><b>%s</b></font>' % mot for mot in listmot]
    dmots = dict(list(zip(listmot, mothtml)))
        ucetxt = ucestxt1[uce].split()
        ucetxt = ' '.join([dmots.get(mot, mot) for mot in ucetxt])
            uciid = corpus.getucefromid(uce).uci
            ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[corpus.getucefromid(uce).uci].etoiles) + '<a href="%i_%i"> *%i_%i</a></b></p>' % (uciid, uce, uciid, uce))
            ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[uce].etoiles) + '</b></p>')
        ucestxt.append(ucetxt)
    return ucis_txt, ucestxt
def getallstcarac(corpus, analyse) :
    # Load the characteristic-segments profiles of an analysis.
    # NOTE(review): 'Alceste' is referenced but not defined in the visible
    # lines -- confirm where it comes from in the full source.
    pathout = PathOut(analyse['ira'])
    profils = ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, 'utf8')
def read_chd(filein, fileout):
    """Convert a tab-separated cluster-split table into the nested
    {'name':…, 'children':[…]} JSON tree used for the CHD dendrogram."""
    with open(filein, 'r') as f :
    #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
    content = [line.split('\t') for line in content.splitlines()]
    chd = {'name':1, 'children':[]}
    # 'mere' maps a node label to its dict in the tree, so each split line
    # can attach its two children to the right parent
    for i, line in enumerate(content) :
            chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
            mere[line[1]] = chd['children'][0]
            mere[line[2]] = chd['children'][1]
            if 'children' in mere[line[0]]:
                mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
                mere[line[1]] = mere[line[0]]['children'][-1]
                mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
                mere[line[2]] = mere[line[0]]['children'][-1]
                # (else branch) first children of this parent:
                mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
                mere[line[1]] = mere[line[0]]['children'][-2]
                mere[line[2]] = mere[line[0]]['children'][-1]
    with open(fileout, 'w') as f :
        f.write(json.dumps(chd))
918 translation_languages = {"Afrikaans":"af", "Albanian":"sq", "Amharic":"am", "Arabic":"ar", "Armenian":"hy", "Azeerbaijani":"az", "Basque":"eu", "Belarusian":"be", "Bengali":"bn", "Bosnian":"bs", "Bulgarian":"bg", "Catalan":"ca", "Cebuano":"ceb", "Chichewa":"ny", "Chinese (Simplified)":"zh-CN", "Chinese (Traditional)":"zh-TW", "Corsican":"co", "Croatian":"hr", "Czech":"cs", "Danish":"da", "Dutch":"nl", "English":"en", "Esperanto":"eo", "Estonian":"et", "Filipino":"tl", "Finnish":"fi", "French":"fr", "Frisian":"fy", "Galician":"gl", "Georgian":"ka", "German":"de", "Greek":"el", "Gujarati":"gu", "Haitian Creole":"ht", "Hausa":"ha", "Hawaiian":"haw", "Hebrew":"iw", "Hindi":"hi", "Hmong":"hmn ", "Hungarian":"hu", "Icelandic":"is", "Igbo":"ig", "Indonesian":"id", "Irish":"ga", "Italian":"it", "Japanese":"ja", "Javanese":"jw", "Kannada":"kn", "Kazakh":"kk", "Khmer":"km", "Korean":"ko", "Kurdish":"ku", "Kyrgyz":"ky", "Lao":"lo", "Latin":"la", "Latvian":"lv", "Lithuanian":"lt", "Luxembourgish":"lb", "Macedonian":"mk", "Malagasy":"mg", "Malay":"ms", "Malayalam":"ml", "Maltese":"mt", "Maori":"mi", "Marathi":"mr", "Mongolian":"mn", "Burmese":"my", "Nepali":"ne", "Norwegian":"no", "Pashto":"ps", "Persian":"fa", "Polish":"pl", "Portuguese":"pt", "Punjabi":"ma", "Romanian":"ro", "Russian":"ru", "Samoan":"sm", "Scots Gaelic":"gd", "Serbian":"sr", "Sesotho":"st", "Shona":"sn", "Sindhi":"sd", "Sinhala":"si", "Slovak":"sk", "Slovenian":"sl", "Somali":"so", "Spanish":"es", "Sundanese":"su", "Swahili":"sw", "Swedish":"sv", "Tajik":"tg", "Tamil":"ta", "Telugu":"te", "Thai":"th", "Turkish":"tr", "Ukrainian":"uk", "Urdu":"ur", "Uzbek":"uz", "Vietnamese":"vi", "Welsh":"cy", "Xhosa":"xh", "Yiddish":"yi", "Yoruba":"yo", "Zulu":"zu", }
def gettranslation(words, lf, lt) :
    """Translate *words* from language *lf* to *lt* via the public Google
    translate endpoint; returns one sanitised string per input word.
    The User-Agent header value is elided in this excerpt."""
    import urllib.request, urllib.error, urllib.parse
    agent = {'User-Agent':
    .NET CLR 3.0.04506.30\
    base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
    # words are sent newline-joined in a single request
    totrans = urllib.parse.quote('\n'.join(words))
    link = base_link % (lf, lt, totrans)
    request = urllib.request.Request(link, headers=agent)
    raw_data = urllib.request.urlopen(request).read()
    data = json.loads(raw_data)
    # normalise the returned strings to single '_'-joined tokens
    return [line[0].replace("'", '_').replace(' | ', '|').replace(' ', '_').replace('-','_').replace('\n','') for line in data[0]]
def makenprof(prof, trans, deb=0) :
    """Rebuild a profile block with the translated forms substituted in
    (list initialisation and return are elided in this excerpt).
    *deb* is the row offset where the translations start."""
    nprof.append(prof[0])
    for i, val in enumerate(trans) :
        line = prof[deb+i+1][:]
def treatempty(val) :
    # Replace an empty/whitespace-only value by a placeholder
    # (the return lines are elided in this excerpt).
    if val.strip() == '' :
def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 50) :
    """Translate the word forms of every class profile from *lf* to *lt*.

    The '*****' marker row separates active forms from supplementary forms
    and the '*' marker row starts the etoiles section; at most *maxword*
    forms per section are sent to gettranslation().  Collisions between
    translated forms are disambiguated with trailing '+'.
    Many original lines (try/except scaffolding, dict updates, return) are
    elided in this excerpt.
    """
    for i in range(len(dictprofile)) :
        prof = dictprofile[repr(i+1)]
            # locate the end of the active-forms section
            lenact = prof.index(['*****', '*', '*', '*', '*', '*', '', ''])
            lenact = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
            # locate the supplementary-forms section
            lensup += prof.index(['*', '*', '*', '*', '*', '*', '', ''])
            lensup = lensup - lenact
            lensup += len(prof) - lenact
            if lenact > maxword :
        actori = [line[6] for line in prof[1:nlenact]]
        act = [val.replace('_', ' ') for val in actori]
        act = gettranslation(act, lf, lt)
        for j, val in enumerate(actori) :
            if act[j] not in lems :
                # duplicate translation: make it unique with '+'
                while act[j] in lems :
                    act[j] = act[j] + "+"
        nprof[repr(i+1)] = makenprof(prof, act)
        if lensup > maxword :
        supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
        sup = [val.replace('_', ' ') for val in supori]
        sup = [treatempty(val) for val in sup]
        sup = gettranslation(sup, lf, lt)
        for j, val in enumerate(supori) :
            if sup[j] not in lems :
                while sup[j] in lems :
                    sup[j] = sup[j] + "+"
        nprof[repr(i+1)].append(['*****', '*', '*', '*', '*', '*', '', ''])
        nprof[repr(i+1)] += makenprof(prof, sup, deb=lenact)
            # etoiles section is copied through untranslated
            lenet = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
            nprof[repr(i+1)].append(['*', '*', '*', '*', '*', '*', '', ''])
            nprof[repr(i+1)] += prof[(lenet+1):]
def write_translation_profile(prof, lems, language, dictpathout) :
    """Persist a translated profile and its word mapping as CSV files and
    register them in translations.txt (some lines elided in this excerpt)."""
    if os.path.exists(dictpathout['translations.txt']) :
        with open(dictpathout['translations.txt'], 'r', encoding='utf8') as f :
            translist = f.read()
        translist = [line.split('\t') for line in translist.splitlines()]
    toprint.append(['','','','','',''])
    toprint.append(['***', 'nb classes', repr(len(prof)), '***', '', ''])
    for i in range(len(prof)) :
        toprint.append(['**', 'classe', repr(i+1), '**', '', ''])
        toprint.append(['****'] + prof[repr(i+1)][0] + ['****'])
        # flatten the profile rows; p-values are normalised for the CSV
        rest = [[repr(line[1]), repr(line[2]), repr(line[3]), repr(line[4]), line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[repr(i+1)][1:]]
        for i, line in enumerate(prof[repr(i+1)][1:]) :
                rest[i] = ['*', '*', '*', '*', '*', '*']
            elif line[0] == '*****' :
                rest[i] = ['*****','*','*', '*', '*', '*']
    with open(dictpathout['translation_profile_%s.csv' % language], 'w', encoding='utf8') as f :
        f.write('\n'.join([';'.join(line) for line in toprint]))
    with open(dictpathout['translation_words_%s.csv' % language], 'w', encoding='utf8') as f :
        f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]))
    # register this translation in translations.txt if not already listed
    if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
        translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
    with open(dictpathout['translations.txt'], 'w', encoding='utf8') as f :
        f.write('\n'.join(['\t'.join(line) for line in translist]))
def makesentidict(infile, language) :
    """Build NRC-style sentiment/emotion word lists from a tab-separated
    lexicon and dump them to /tmp/tgenemo.csv (some lines elided here)."""
    with codecs.open(infile,'r', 'utf8') as f :
    content = [line.split('\t') for line in content.splitlines()]
    titles = content.pop(0)
    senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
        # column index of each sentiment in the lexicon header
        sentid[sent] = titles.index(sent)
    # words are taken from the '(fr)' translation column
    frtitle = [val for val in titles if '(fr)' in val]
    frid = titles.index(frtitle[0])
    sentidict = [[line[frid].lower(), [line[sentid[sent]] for sent in senti]] for line in content]
    pos = ['positive'] + [line[0] for line in sentidict if line[1][0] == '1']
    neg = ['negative'] + [line[0] for line in sentidict if line[1][1] == '1']
    anger = ['anger'] + [line[0] for line in sentidict if line[1][2] == '1']
    anticipation = ['anticipation'] + [line[0] for line in sentidict if line[1][3] == '1']
    disgust = ['disgust'] + [line[0] for line in sentidict if line[1][4] == '1']
    fear = ['fear'] + [line[0] for line in sentidict if line[1][5] == '1']
    joy = ['joy'] + [line[0] for line in sentidict if line[1][6] == '1']
    sadness = ['sadness'] + [line[0] for line in sentidict if line[1][7] == '1']
    surprise = ['surprise'] + [line[0] for line in sentidict if line[1][8] == '1']
    trust = ['trust'] + [line[0] for line in sentidict if line[1][9] == '1']
    with open('/tmp/tgenemo.csv', 'w') as f :
        for val in [pos, neg, anger, anticipation, disgust, fear, joy, sadness, surprise, trust] :
            f.write('\t'.join(val) + '\n')
def countsentfromprof(prof, encoding, sentidict) :
    """Count sentiment hits in a ';'-separated profile file
    (the counting part is elided in this excerpt)."""
    with codecs.open(prof, 'r', encoding) as f :
    content = [line.split(';') for line in content.splitlines()]
    content = [[line[0], [int(val) for val in line[1:]]] for line in content]
    content = dict(content)
1088 def iratolexico(infile, outfile, encoding) :
1089 with codecs.open(infile, 'r', encoding) as f :
1091 if line.startswith('**** ') :