1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
11 from subprocess import Popen, call, PIPE
23 from shutil import copyfile
26 #from dialog import BugDialog
28 from operator import itemgetter
30 #------------------------------------
31 # import des modules wx
32 #------------------------------------
36 #------------------------------------
37 # import des fichiers du projet
38 #------------------------------------
39 from configparser import ConfigParser
# Module-level logger shared by the whole application.
log = logging.getLogger('iramuteq')

# Names of the similarity/association indices offered by the similarity analysis
# (passed to the R backend; order matters for the GUI choice widgets).
indices_simi = ['cooccurrence' ,'pourcentage de cooccurrence','Russel','Jaccard', 'Kulczynski1', 'Kulczynski2', 'Mountford', 'Fager', 'simple matching', 'Hamman', 'Faith', 'Tanimoto', 'Dice', 'Phi', 'Stiles', 'Michael', 'Mozley', 'Yule', 'Yule2', 'Ochiai', 'Simpson', 'Braun-Blanquet','Chi-squared', 'Phi-squared', 'Tschuprow', 'Cramer', 'Pearson', 'binomial']
def open_folder(folder):
    # Open `folder` in the platform's file manager.
    # NOTE(review): the body of the win32 branch (and the else) appears
    # truncated in this file — on win32 this presumably used os.startfile.
    if sys.platform == "win32":
    # 'open' on macOS, 'xdg-open' on other unix-likes
    opener ="open" if sys.platform == "darwin" else "xdg-open"
    #call([opener, folder])
    # shell=True because the command string contains '&' to detach
    call(["%s '%s' &" % (opener, folder)], shell=True)
def normpath_win32(path) :
    # Collapse doubled backslashes in a Windows path; no-op on other platforms.
    # NOTE(review): the return statements appear truncated in this file.
    if not sys.platform == 'win32' :
    while '\\\\' in path :
        path = path.replace('\\\\', '\\')
    # a path starting with a single '\' is presumably a UNC share that lost
    # its leading backslash — TODO confirm against callers
    if path.startswith('\\') and not path.startswith('\\\\') :
    # --- TGen: tab-separated "tgen" lexicon (entry name -> list of words) ---
    # NOTE(review): several original lines are missing in this file view.

    def __init__(self, path = None, encoding = 'utf8'):
        # Remember the file encoding used by read().
        self.encoding = encoding

    def __getitem__(self, key):
        # Dictionary-style access to a tgen entry (body truncated here).

    def read(self, path = None):
        # Load the tgen file into self.tgen as {first column: remaining columns}.
        with codecs.open(path, 'r', self.encoding) as f :
        tgen = [line.split('\t') for line in tgen.splitlines()]
        tgen = dict([[line[0], line[1:]] for line in tgen])

    def write(self, path = None):
        # Serialize self.tgen back to a tab-separated file, one entry per line.
        with open(path, 'w', encoding='utf8') as f :
            f.write('\n'.join(['\t'.join([val] + self.tgen[val]) for val in self.tgen]))

    def writetable(self, pathout, tgens, totocc):
        # Write a table of tgen counts per star variable ("etoile") plus a
        # synthetic total row whose name is made unique with a numeric suffix.
        etoiles = list(totocc.keys())
        with open(pathout, 'w', encoding='utf8') as f :
            line = '\t'.join(['tgens'] + etoiles) + '\n'
            line = '\t'.join([t] + [repr(tgens[t][et]) for et in etoiles]) + '\n'
            while totname + repr(i) in tgens :
            totname = totname + repr(i)
            line = '\t'.join([totname] + [repr(totocc[et]) for et in etoiles]) + '\n'
    # --- History: persistent registry of corpora/matrices/analyses,
    # stored as a JSON file (formerly a shelve, see commented lines). ---
    # NOTE(review): parts of __init__/read/write are missing in this file view.

    def __init__(self, filein, syscoding = 'utf8') :
        self.syscoding = syscoding
        # caches of currently opened items, keyed by uuid
        self.openedcorpus = {}
        self.openedmatrix = {}
        # read(): load the JSON history and build uuid -> record lookup tables
        with open(self.filein, 'r') as fjson :
        # d = shelve.open(self.filein, protocol=1)
        self.history = d.get('history', [])
        self.matrix = d.get('matrix', [])
        self.ordercorpus = dict([[corpus['uuid'], i] for i, corpus in enumerate(self.history)])
        self.corpus = dict([[corpus['uuid'], corpus] for corpus in self.history])
        self.analyses = dict([[analyse['uuid'], analyse] for corpus in self.history for analyse in corpus.get('analyses', [])])
        self.matrixanalyse = dict([[mat['uuid'], mat] for mat in self.matrix])
        self.ordermatrix = dict([[matrix['uuid'], i] for i, matrix in enumerate(self.matrix)])
        # write(): dump history and matrix lists back to the JSON file
        d['history'] = self.history
        d['matrix'] = self.matrix
        with open(self.filein, 'w') as f :
            # default=str so non-JSON-serializable values are stringified
            f.write(json.dumps(d, indent=4, default=str))
        #d = shelve.open(self.filein, protocol=1)
    def add(self, analyse) :
        # Register a corpus or analysis record in the history.
        # NOTE(review): several else/return lines are missing in this file view.
        log.info('add to history %s' % analyse.get('corpus_name', 'pas un corpus'))
        # only a minimal projection of the analysis dict is persisted
        tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
        if tosave['uuid'] in self.corpus :
            log.info('problem : this uuid is already in history : %s' % tosave['uuid'])
        if analyse.get('corpus', False) :
            if analyse['uuid'] in self.analyses :
            tosave['corpus'] = analyse['corpus']
            tosave['name'] = analyse['name']
            acorpus_uuid = analyse['corpus']
            if acorpus_uuid in self.corpus :
                if 'analyses' in self.history[self.ordercorpus[acorpus_uuid]] :
                    self.history[self.ordercorpus[acorpus_uuid]]['analyses'].append(tosave)
                self.history[self.ordercorpus[acorpus_uuid]]['analyses'] = [tosave]
            # analysis whose parent corpus is unknown -> orphan list
            self.orph.append(tosave)
        tosave['corpus_name'] = analyse['corpus_name']
        #self.ordercorpus[tosave['uuid']] = len(history)
        #self.corpus[tosave['uuid']] = analyse
        self.history.append(tosave)
    def addMatrix(self, analyse) :
        # Register a matrix record (tosave construction truncated in this view).
        #tosave['matrix_name'] = analyse['matrix_name']
        tosave['analyses'] = []
        self.matrix.append(tosave)

    def addMatrixAnalyse(self, analyse) :
        # Attach an analysis record to its parent matrix, if known.
        tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type'], 'matrix' : analyse['matrix']}
        tosave['name'] = analyse['name']
        if tosave['matrix'] in self.ordermatrix :
            self.matrix[self.ordermatrix[tosave['matrix']]]['analyses'].append(tosave)
    def addmultiple(self, analyses) :
        # Register several analyses at once, attaching each to its parent corpus.
        # NOTE(review): the else branch / save call appear truncated here.
        log.info('add multiple')
        for analyse in analyses :
            tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
            corpus = analyse['corpus']
            tosave['corpus'] = corpus
            tosave['name'] = analyse['name']
            if corpus in self.corpus :
                if 'analyses' in self.history[self.ordercorpus[corpus]] :
                    self.history[self.ordercorpus[corpus]]['analyses'].append(tosave)
                self.history[self.ordercorpus[corpus]]['analyses'] = [tosave]
    def delete(self, analyse, corpus = False) :
        # Remove a corpus, analysis, matrix or matrix-analysis from the history.
        # NOTE(review): some branch headers are missing in this file view.
        log.info('delete %s' % analyse.get('name', 'noname'))
        self.history.pop(self.ordercorpus[analyse['uuid']])
        if analyse['uuid'] in self.openedcorpus :
            del self.openedcorpus[analyse['uuid']]
        log.info('delete corpus : %s' % analyse['uuid'])
        elif analyse['uuid'] in self.analyses :
            # analysis attached to a corpus: pop it from the corpus record
            todel = [i for i, ana in enumerate(self.corpus[analyse['corpus']]['analyses']) if ana['uuid'] == analyse['uuid']][0]
            self.history[self.ordercorpus[analyse['corpus']]]['analyses'].pop(todel)
        elif analyse['uuid'] in self.matrixanalyse :
            self.matrix = [mat for mat in self.matrix if mat['uuid'] != analyse['uuid']]
        elif analyse.get('matrix', False) in self.matrixanalyse :
            analyses = self.matrix[self.ordermatrix[analyse['matrix']]]['analyses']
            topop = [i for i, val in enumerate(analyses) if analyse['uuid'] == val['uuid']][0]
            self.matrix[self.ordermatrix[analyse['matrix']]]['analyses'] = analyses
    def addtab(self, analyse) :
        # Track an analysis as opened in a GUI tab.
        self.opened[analyse['uuid']] = analyse

    def rmtab(self, analyse) :
        # Forget an analysis when its GUI tab is closed.
        del self.opened[analyse['uuid']]

    def update(self, analyse) :
        # Merge new key/values into the stored record, dispatching on record kind.
        # NOTE(review): the trailing else branch header is missing in this view.
        if 'matrix_name' in analyse :
            self.matrixanalyse[analyse['uuid']].update(analyse)
        elif 'corpus_name' in analyse :
            self.corpus[analyse['uuid']].update(analyse)
        elif 'corpus' in analyse :
            self.analyses[analyse['uuid']].update(analyse)
        # matrix sub-analysis: locate it inside its parent matrix record
        toupdate = [an for an in self.matrixanalyse[analyse['matrix']]['analyses'] if an['uuid'] == analyse['uuid']]
        toupdate[0].update(analyse)
        # clean(): drop history entries whose files no longer exist on disk.
        corpustodel = [corpus for corpus in self.history if not os.path.exists(corpus['ira'])]
        for corpus in corpustodel :
            print('cleaning :', corpus['corpus_name'])
            self.delete(corpus, corpus = True)
        # analyses whose result file vanished ('/' default never exists as a file)
        anatodel = [analyse for corpus in self.history for analyse in corpus.get('analyses', []) if not os.path.exists(analyse.get('ira', '/'))]
        for analyse in anatodel :
            print('cleaning :', analyse['name'])
        # stat(): aggregate usage statistics over the whole history and print
        # a human-readable summary (counts, occurrences, cumulated run times).
        # NOTE(review): accumulator initialisations and several branch headers
        # are missing in this file view.
        for corpus in self.history :
            analysenb += len(corpus.get('analyses', []))
            analyses = corpus.get('analyses', [])
            for analyse in analyses :
                if os.path.exists(analyse['ira']) :
                    ana = DoConf(analyse['ira']).getoptions()
                    # 'time' is stored as "Hh Mm Ss"
                    time = ana['time'].split()
                    ha += int(time[0].replace('h','')) * 3600
                    ma += int(time[1].replace('m','')) * 60
                    sa += int(time[2].replace('s',''))
            if os.path.exists(corpus['ira']) :
                param = DoConf(corpus['ira']).getoptions()
                time = param.get('time','0h 0m 0s')
                hours += int(time[0].replace('h','')) * 3600
                minutes += int(time[1].replace('m','')) * 60
                secondes += int(time[2].replace('s',''))
                if param.get('originalpath', False) :
                    # count re-imports of the same source corpus
                    if param['originalpath'] in corpusnb :
                        corpusnb[param['originalpath']] += 1
                        tokens += int(param['occurrences'])
                    corpusnb[param['originalpath']] = 1
            if corpus['ira'] in todel :
        print('Nbr total de corpus : %s' % len(self.history))
        corpus_nb = len(corpusnb) + len(todel)
        print('Nbr de corpus différents : %s' % corpus_nb)
        lentodel = len(todel)
        print('Nbr de corpus à supprimer : %s' % lentodel)
        print('Nbr de sous corpus : %s' % subnb)
        print("Nbr total d'occurrences : %s" % tokens)
        print('Moyenne occurrences par corpus : %f' % (tokens/corpus_nb))
        print('---------------------')
        print("Nbr total d'analyses : %s" % analysenb)
        print('Temps total indexation : %f h' % ((hours+minutes+secondes) / 3600))
        print('Temps total analyses : %f h' % ((ha+ma+sa) / 3600))
        # __str__(): debug representation of the whole history list
        return str(self.history)
    # --- DoConf: thin wrapper around ConfigParser for .ira config files ---

    def __init__(self, configfile=None, diff = None, parametres = None) :
        self.configfile = configfile
        # interpolation=None so literal '%' characters in values are preserved
        self.conf = ConfigParser(interpolation=None)
        if configfile is not None :
            configfile = normpath_win32(configfile)
            self.conf.read_file(codecs.open(configfile, 'r', 'utf8'))
        if parametres is not None :
            self.doparametres(parametres)

    def doparametres(self, parametres) :
        # Body truncated in this file view.

    def getsections(self) :
        # Return the list of section names of the underlying config.
        return self.conf.sections()
    def getoptions(self, section = None, diff = None):
        # Read one section into a dict, coercing values: digits -> int,
        # 'True'/'False' -> bool, '(..)'/'[..]' -> literal tuple/list,
        # anything else kept as string.
        # NOTE(review): parametres init, default-section guard, the final else
        # and the return appear truncated in this file view.
        section = self.conf.sections()[0]
        for option in self.conf.options(section) :
            if self.conf.get(section, option).isdigit() :
                parametres[option] = int(self.conf.get(section, option))
            elif self.conf.get(section, option) == 'False' :
                parametres[option] = False
            elif self.conf.get(section, option) == 'True' :
                parametres[option] = True
            elif self.conf.get(section, option).startswith('(') and self.conf.get(section, option).endswith(')') :
                parametres[option] = ast.literal_eval(self.conf.get(section, option))
            elif self.conf.get(section, option).startswith('[') and self.conf.get(section, option).endswith(']') :
                parametres[option] = ast.literal_eval(self.conf.get(section, option))
            parametres[option] = self.conf.get(section, option)
        # remember which section these options came from
        if 'type' not in parametres :
            parametres['type'] = section
    def makeoptions(self, sections, parametres, outfile = None) :
        # Write one dict of options per section into the config, mirroring the
        # value coercion done by getoptions(), and persist to `outfile`
        # (defaults to self.configfile).
        # NOTE(review): txt init and parts of the 'analyses' branch are
        # truncated in this file view.
        for i, section in enumerate(sections) :
            txt += '[%s]\n' % section
            if not self.conf.has_section(section) :
                self.conf.add_section(section)
            for option in parametres[i] :
                if isinstance(parametres[i][option], int) :
                    self.conf.set(section, option, repr(parametres[i][option]))
                    txt += '%s = %i\n' % (option, parametres[i][option])
                elif isinstance(parametres[i][option], str) :
                    self.conf.set(section, option, parametres[i][option])
                    txt += '%s = %s\n' % (option, parametres[i][option])
                elif isinstance(parametres[i][option], wx.Colour) :
                    self.conf.set(section, option, str(parametres[i][option]))
                    txt += '%s = %s\n' % (option, str(parametres[i][option]))
                elif option == 'analyses' :
                    self.conf.set(section, option, repr(parametres[i][option]))
                    txt += '%s = %s\n' % (option, repr(parametres[i][option]))
        outfile = self.configfile
        outfile = normpath_win32(outfile)
        with open(outfile, 'w', encoding="utf-8") as f :
    def totext(self, parametres) :
        # Render a parameters dict as human-readable "key \t\t: value" lines.
        # NOTE(review): txt init and one branch body are truncated in this view.
        for val in parametres :
            if isinstance(parametres[val], int) :
                txt.append(' \t\t: '.join([val, repr(parametres[val])]))
            elif isinstance(parametres[val], str) :
                txt.append(' \t\t: '.join([val, parametres[val]]))
            elif val in ['listet', 'stars'] :
            txt.append(' \t\t: '.join([val, repr(parametres[val])]))
        return '\n'.join(txt)
def write_tab(tab, fileout) :
    """Write a 2D table to ``fileout`` as a semicolon-separated CSV file.

    Non-numeric cells are quoted (``csv.QUOTE_NONNUMERIC``) so numbers can be
    told apart from strings on re-read.

    Fix: the original opened the file without ever closing it (the handle was
    left to the garbage collector); a context manager guarantees the buffer is
    flushed and the descriptor released.
    """
    with open(fileout, 'w', newline='', encoding='utf8') as f:
        csvWriter = csv.writer(f, delimiter=';', quoting=csv.QUOTE_NONNUMERIC)
        csvWriter.writerows(tab)
class BugDialog(wx.Dialog):
    """Modal dialog used to display error reports (wxGlade-generated layout).

    NOTE(review): parts of __init__/__set_properties/__do_layout are
    truncated in this file view.
    """

    def __init__(self, *args, **kwds):
        # begin wxGlade: MyDialog.__init__
        kwds["style"] = wx.DEFAULT_DIALOG_STYLE | wx.STAY_ON_TOP
        kwds["size"] = wx.Size(500, 200)
        wx.Dialog.__init__(self, *args, **kwds)
        self.SetTitle(kwds['title'])
        # multiline read/write area holding the traceback text
        self.text_ctrl_1 = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE)
        self.text_ctrl_1.SetBackgroundColour('#DDE8EB')
        self.button_1 = wx.Button(self, wx.ID_OK, "")
        self.__set_properties()

    def __set_properties(self):
        # begin wxGlade: MyDialog.__set_properties
        self.SetMinSize(wx.Size(500, 200))
        self.text_ctrl_1.SetMinSize(wx.Size(500, 200))

    def __do_layout(self):
        # begin wxGlade: MyDialog.__do_layout
        sizer_1 = wx.BoxSizer(wx.VERTICAL)
        sizer_1.Add(self.text_ctrl_1, 1, wx.EXPAND, 0)
        sizer_1.Add(self.button_1, 0, wx.ALIGN_CENTER_HORIZONTAL, 0)
        self.SetSizer(sizer_1)
def CreateIraFile(DictPathOut, clusternb, corpname='corpus_name', section = 'analyse'):
    # Write (or extend) the .ira config file describing an analysis result.
    # NOTE(review): one set() call and the file close appear truncated here;
    # the output handle should ideally be a context manager.
    AnalyseConf = ConfigParser()
    AnalyseConf.read(DictPathOut['ira'])
    AnalyseConf.add_section(section)
    date = datetime.datetime.now().ctime()
    AnalyseConf.set(section, 'date', str(date))
    AnalyseConf.set(section, 'clusternb', clusternb)
    AnalyseConf.set(section, 'corpus_name', corpname)
    fileout = open(DictPathOut['ira'], 'w', encoding='utf8')
    AnalyseConf.write(fileout)
def multisort(liste2d, ordre, indices_tri):
    # Multi-key in-place sort of a list of rows (Python 3 replacement for the
    # removed cmp-based sort).
    method intended to replace 'comp' which disappeared in Python 3
    sort of tuples on one of the tuple's elements
    in principle it must return the elements sorted as before,
    as described in the docstring of 'sortedby'
    probably to be improved to make it more general
    by accepting a variable number of parameters ???
    # NOTE(review): this function looks unfinished/broken as written:
    # `indices_tri.Tuple(int, ...)` is not a valid call, and `attrgetter`
    # is not imported here (only `itemgetter` is) — compare with sortedby().
    indices_triTuple = indices_tri.Tuple(int, ...)
    for key in reversed(indices_tri):
        liste2d.sort(key=attrgetter(key), reverse=ordre)
def sortedby(liste2d, direct, *indices):
    # Stable multi-column sort of a table; direct == 2 means descending.
    # NOTE(review): the return statement appears truncated in this file view.
    sortedby: sort a list of lists (e.g. a table) by one or more indices
    (columns of the table) and return the sorted list
    for list = [[2,3],[1,2],[3,1]]:
    sortedby(list,1) will return [[3, 1], [1, 2], [2, 3]],
    sortedby(list,0) will return [[1, 2], [2, 3], [3, 1]]
    isn't it replaced by the 'multisort' method ???
    # nlist = map(lambda x, indices=indices:
    # map(lambda i, x=x: x[i], indices) + [x],
    # iramuteq run through 2to3
    # nlist = list(map(lambda x, indices=indices:
    # list(map(lambda i, x=x: x[i], indices)) + [x],
    # sorting on the last key first keeps the sort stable across keys
    for key in reversed(indices):
        liste2d.sort(key=itemgetter(key), reverse=(direct==2))
    # sorted_list = multisort(liste2d, direct, *indices)
    # nlist.sort(reverse=True)
    # sorted_list = multisort(liste2d, direct, *indices)
    # return [l[-1] for l in nlist]
def add_type(line, dictlem):
    # Append the grammatical type of the lemma (column 4) when known.
    # NOTE(review): the else branch and return are truncated in this view.
    if line[4] in dictlem:
        line.append(dictlem[line[4]])

def treat_line_alceste(i, line) :
    # Normalize one Alceste profile row: reorder columns, convert counts to
    # int, percentages/chi2 to float, and format the p-value (column 5).
    # NOTE(review): the star-row branch bodies are truncated in this view.
    if line[0] == '*' or line[0] == '*****' :
    elif float(line[5].replace(',', '.')) < 0.0001:
    elif float(line[5].replace(',', '.')) > 0.05:
        # non-significant p-values are flagged NS
        line[5] = 'NS (%s)' % str(float(line[5].replace(',', '.')))[0:7]
        line[5] = str(float(line[5].replace(',', '.')))[0:7]
    return [i, int(line[0]), int(line[1]), float(line[2]), float(line[3]), line[6], line[4], line[5]]
def ReadProfileAsDico(File, Alceste=False, encoding = 'utf8'):
    # Parse a profile CSV produced by the R scripts into
    # {cluster label: [cluster stats] + rows}.
    # NOTE(review): dictlem/DictProfile init and the Alceste/else branch
    # headers are truncated in this file view; the reader handle is not
    # explicitly closed.
    print('lecture des profiles')
    FileReader = open(File, 'r', encoding='utf8')
    Filecontent = FileReader.readlines()
    #rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace(',', '.').replace('\r','').split(';') for row in Filecontent]
    rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace('\r','').split(';') for row in Filecontent]
    ClusterNb = rows[0][2]
    # '**' rows carry cluster labels, '****' rows carry cluster statistics
    clusters = [row[2] for row in rows if row[0] == '**']
    valclusters = [row[1:4] for row in rows if row[0] == '****']
    lp = [i for i, line in enumerate(rows) if line[0] == '****']
    # slice the row list into one chunk per cluster
    prof = [rows[lp[i] + 1:lp[i+1] - 1] for i in range(0, len(lp)-1)] + [rows[lp[-1] + 1:len(rows)]]
    prof = [[add_type(row, dictlem) for row in pr] for pr in prof]
    prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
    prof = [[line + [''] for line in pr] for pr in prof]
    prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
    for i, cluster in enumerate(clusters):
        DictProfile[cluster] = [valclusters[i]] + prof[i]
def GetTxtProfile(dictprofile, cluster_size) :
    # Render all cluster profiles as a fixed-width plain-text report.
    # NOTE(review): the proflist initialisation is truncated in this view.
    for classe in range(0, len(dictprofile)) :
        prof = dictprofile[str(classe + 1)]
        clinfo = cluster_size[classe]
        proflist.append('\n'.join([' '.join(['classe %i' % (classe + 1), '-', '%s uce sur %s - %s%%' % (clinfo[0], clinfo[1], clinfo[2])]), '\n'.join(['%5s|%5s|%6s|%6s|%8s|%8s|%20s\t%10s' % tuple([str(val) for val in line]) for line in prof if len(line)==8])]))
    return '\n\n'.join(proflist)
def formatExceptionInfo(maxTBlevel=5):
    # Return (exception name, first argument or placeholder, formatted
    # traceback lines) for the exception currently being handled.
    # NOTE(review): the try/except wrapper around exc.args is truncated here.
    cla, exc, trbk = sys.exc_info()
    excName = cla.__name__
    excArgs = exc.args[0]
    excArgs = "<no args>"
    excTb = traceback.format_tb(trbk, maxTBlevel)
    return (excName, excArgs, excTb)
# function written by the IUT students
def decoupercharact(chaine, longueur, longueurOptimale, separateurs = None) :
    # Split `chaine` near `longueurOptimale` characters at the best separator.
    # Returns (found, head words, remaining tail).
    # NOTE(review): the search loop header and several branch lines are
    # truncated in this file view.
    we start from the last character and walk back to the start of the string.
    If a '$' is found, we are done.
    Otherwise we look for the best candidate, i.e. the highest weight/distance ratio.
    separateurs = [['.', 60.0], ['?', 60.0], ['!', 60.0], ['£$£', 60], [':', 50.0], [';', 40.0], [',', 10.0], [' ', 0.1]]
    trouve = False                 # whether a good separator was found
    iDecoupe = 0                   # index of the character at which to cut
    # cut the string so we work on at most `longueur` characters
    longueur = min(longueur, len(chaine) - 1)
    chaineTravail = chaine[:longueur + 1]
    meilleur = ['', 0, 0]          # type, weight and position of the best separator
    # first check whether a '$' is present
    indice = chaineTravail.find('$')
    # if nothing was found, look for the best separator
    caractere = chaineTravail[nbCar]
    distance = abs(longueurOptimale - nbCar) + 1
    meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
    # check whether the current character is a punctuation mark
    for s in separateurs:
        if caractere == s[0]:
            # it is a punctuation mark
            if s[1] / distance > float(meilleur[1]) / meilleureDistance:
    # and stop the search
    # move to the previous character
    fin = chaine[iDecoupe + 1:]
    retour = chaineTravail[:iDecoupe]
    return len(retour) > 0, retour.split(), fin
    # if nothing was found
    return False, chaine.split(), ''
# User-facing (French) error messages keyed by internal exception tag;
# used by BugReport() below.  NOTE(review): the closing brace of this dict
# appears truncated in this file view.
exceptions = {'paragrapheOT' : "Un problème de formatage (présence d'un marqueur de paragraphe (-*) en dehors d'un texte) est survenu à la ligne ",
              'EmptyText' : "Texte vide (probablement un problème de formatage du corpus). Le problème est apparu à la ligne ",
              'CorpusEncoding' : "Problème d'encodage.",
              'TextBeforeTextMark' : "Problème de formatage : du texte avant le premier marqueur de texte (****). Le problème est survenu à la ligne ",
              'MissingAnalyse' : 'Aucun fichier à cet emplacement :\n',
def BugReport(parent, error = None):
    # Show the current exception in a BugDialog, translating known internal
    # exception tags into user-friendly French messages.
    # NOTE(review): several branch bodies (dialog destruction, else branches,
    # ShowModal) are truncated in this file view.
    for ch in parent.GetChildren():
        if "<class 'wx._windows.ProgressDialog'>" == str(type(ch)):
    excName, exc, excTb = formatExceptionInfo()
    if excName == 'Exception' :
        # internal exceptions carry "tag linenumber" payloads
        if len(exc.split()) == 2 :
            mss, linenb = exc.split()
            if mss in exceptions :
                txt = exceptions[mss] + linenb
        if exc in exceptions :
            txt = exceptions[exc]
        title = "Information"
        # unexpected exception: dump the full traceback
        txt = '\n !== BUG ==! \n'
        txt += '*************************************\n'
        txt += '\n'.join(excTb).replace(' ', ' ')
        txt += excName + '\n'
    dial = BugDialog(parent, **{'title' : title})
    if 'Rerror' in dir(parent) :
    dial.text_ctrl_1.write(txt)
    dial.CenterOnParent()
def PlaySound(parent):
    # Play the "finished" notification sound if enabled in the preferences.
    # NOTE(review): the try/except wrapper appears truncated in this view.
    if parent.pref.getboolean('iramuteq', 'sound') :
        if "gtk2" in wx.PlatformInfo:
            # on GTK, delegate to the ALSA command-line player
            error = Popen(['aplay','-q',os.path.join(parent.AppliPath,'son_fin.wav')])
        sound = wx.adv.Sound(os.path.join(parent.AppliPath, 'son_fin.wav'))
        sound.Play(wx.adv.SOUND_SYNC)
def ReadDicoAsDico(dicopath):
    """Load a tab-separated dictionary file into a dict.

    Each non-empty line is split on tabs after stripping the line ending and
    any double quotes; the first field becomes the key and the remaining
    fields the value list.  Later duplicate keys override earlier ones.
    """
    with open(dicopath, 'r', encoding='UTF8') as handle:
        raw_lines = handle.readlines()
    dico = {}
    for raw in raw_lines:
        if raw == '':
            continue
        fields = raw.rstrip('\n\r').replace('\n', '').replace('"', '').split('\t')
        dico[fields[0]] = fields[1:]
    return dico
def ReadLexique(parent, lang = 'french', filein = None):
    # Load the lexicon for `lang` (or an explicit file) into parent.lexique.
    # NOTE(review): the branch structure (lang vs filein vs fallback) is
    # truncated in this file view.
    parent.lexique = ReadDicoAsDico(parent.DictPath.get(lang, 'french'))
    parent.lexique = ReadDicoAsDico(filein)
    parent.lexique = ReadDicoAsDico(filein)
def ReadList(filein, encoding = 'utf8', sep = ';'):
    # Read a separated-values file into a table; decimal commas are converted
    # to dots so numeric cells can be parsed as floats.
    # NOTE(review): the header handling, row loop and return are truncated in
    # this file view.
    with open(filein, 'r', encoding='utf8') as f :
    content = [line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.').split(sep) for line in content.splitlines()]
    #file = codecs.open(filein, 'r', encoding)
    #content = file.readlines()
    first = content.pop(0)
    #first = first.replace('\n', '').replace('\r','').replace('\"', '').split(sep)
    #line = line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.')
    #line = line.split(';')
    don = float('%.5f' % float(val))
def exec_RCMD(rpath, command) :
    # Install an R package via `R CMD INSTALL <command>`.
    # NOTE(review): the return of the exit status appears truncated here.
    log.info('R CMD INSTALL %s' % command)
    # escape backslashes for Windows paths
    rpath = rpath.replace('\\','\\\\')
    error = call(["%s" % rpath, 'CMD', 'INSTALL', "%s" % command])
def exec_rcode(rpath, rcode, wait = True, graph = False):
    # Run an R script, either blocking (call) or detached (Popen), with
    # special-casing for win32 and for old macOS versions that need X11.
    # NOTE(review): the needX11 setup and the wait/graph branch headers are
    # truncated in this file view.
    log.info("R Script : %s" % rcode)
    if sys.platform == 'darwin' :
        macversion = platform.mac_ver()[0].split('.')
        # macOS < 10.5 needs X11 for R graphics
        if int(macversion[1]) < 5 :
    rpath = rpath.replace('\\','\\\\')
    env = os.environ.copy()
    if sys.platform == 'darwin' and 'LC_ALL' not in env:
        env['LC_ALL'] = 'en_US.UTF-8'
    # --- blocking, no graphics ---
    if sys.platform == 'win32':
        error = call(["%s" % rpath, "--vanilla","--slave","-f", "%s" % rcode])
        error = call([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], env = env)
    # --- non-blocking, no graphics ---
    if sys.platform == 'win32':
        pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
        pid = Popen([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8') # PIPE or STDOUT ?
    # --- blocking, with graphics ---
    if sys.platform == 'win32':
        error = call(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
    elif sys.platform == 'darwin' and needX11:
        os.environ['DISPLAY'] = ':0.0'
        error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
        error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
    # --- non-blocking, with graphics ---
    if sys.platform == 'win32':
        pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
    elif sys.platform == 'darwin' and needX11:
        os.environ['DISPLAY'] = ':0.0'
        pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
        pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
def check_Rresult(parent, pid) :
    # Inspect the result of an R run (Popen handle or exit code) and raise
    # an Exception carrying the R error output on failure.
    # NOTE(review): the non-Popen branch and several lines are truncated in
    # this file view.
    if isinstance(pid, Popen) :
        if pid.returncode != 0 :
            error = pid.communicate()
            error = [str(error[0]), error[1]]
            if error[1] is None :
            # expose the R error to the GUI via the parent frame
            parent.Rerror = '\n'.join([str(pid.returncode), '\n'.join(error)])
            raise Exception('\n'.join(['Erreur R', '\n'.join(error[1:])]))
    raise Exception('Erreur R')
def launchcommand(mycommand):
    # Launch an external command (body truncated in this file view).

def print_liste(filename,liste):
    # Write a list of rows to `filename`, one ';'-joined line per row.
    # NOTE(review): the loop over `liste` is truncated in this view.
    with open(filename,'w', encoding='utf8') as f :
        f.write(';'.join(graph) +'\n')

def read_list_file(filename, encoding = 'utf8'):
    # Read back a ';'-separated list file, skipping blank lines.
    # NOTE(review): the return statement appears truncated in this view.
    with open(filename,'r', encoding='utf8') as f:
        content=f.readlines()
        ncontent=[line.replace('\n','').split(';') for line in content if line.strip() != '']
def progressbar(self, maxi):
    # Build a modal wx progress dialog for long-running treatments.
    # NOTE(review): the maxi validation and the argument list of the
    # ProgressDialog call are truncated in this file view.
    ira = wx.GetApp().GetTopWindow()
    prog = wx.ProgressDialog("Traitements",
                             "Veuillez patienter...",
                             style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE | wx.PD_ELAPSED_TIME | wx.PD_CAN_ABORT
    # NOTE: ABORT is not always handled ???
    prog.SetSize((400,150))
    #prog.SetIcon(ira._icon)
def treat_var_mod(variables) :
    # Group "variable_modality" strings into {variable: [modalities]}.
    # NOTE(review): the result dict init, the loop over vars and the return
    # are truncated in this file view.
    variables = list(set(variables))
    varmod = [variable.split('_') for variable in variables]
    # variable names are the part before the first '_'
    vars = list(set([var[0] for var in varmod if len(var) >=2]))
    mods = ['_'.join(v) for v in varmod if v[0] == var]
    # for variable in variables :
    # if '_' in variable :
    # forme = variable.split('_')
    # if not var in var_mod :
    # var_mod[var] = [variable]
    # if not mod in var_mod[var] :
    # var_mod[var].append(variable)
def doconcorde(corpus, uces, mots, uci = False, fontsize=16) :
    # Build HTML concordances: for each text segment (uce) or text (uci),
    # return the star line and the segment text with target words highlighted.
    # NOTE(review): branch headers and list initialisations are truncated in
    # this file view.
    ucestxt1 = [row for row in corpus.getconcorde(uces)]
    ucestxt1 = [row for row in corpus.getuciconcorde(uces)]
    ucestxt1 = dict(ucestxt1)
    # expand each lemma into all its surface forms
    listmot = [corpus.getlems()[lem].formes for lem in mots]
    listmot = [corpus.getforme(fid).forme for lem in listmot for fid in lem]
    mothtml = ['<font color=red><b>%s</b></font>' % mot for mot in listmot]
    dmots = dict(list(zip(listmot, mothtml)))
    presfont = '<p><b><font size="%i">' % fontsize
    font = '<font size="%i">' % fontsize
    ucetxt = ucestxt1[uce].split()
    # replace each occurrence of a target form by its highlighted version
    ucetxt = ' '.join([dmots.get(mot, mot) for mot in ucetxt])
    uciid = corpus.getucefromid(uce).uci
    ucis_txt.append(presfont + ' '.join(corpus.ucis[corpus.getucefromid(uce).uci].etoiles) + '<a href="%i_%i"> *%i_%i</a></font></b></p>' % (uciid, uce, uciid, uce))
    ucis_txt.append(presfont + ' '.join(corpus.ucis[uce].etoiles) + '</font></b></p>')
    ucestxt.append(font + ucetxt + '</font>')
    return ucis_txt, ucestxt
def getallstcarac(corpus, analyse) :
    # Load the characteristic-segments profile of an analysis.
    # NOTE(review): the rest of the body is truncated in this file view.
    pathout = PathOut(analyse['ira'])
    profils = ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, 'utf8')
def read_chd(filein, fileout):
    # Convert a tab-separated CHD tree description into a nested
    # {'name': ..., 'size': ..., 'children': [...]} JSON structure.
    # NOTE(review): the file read, the `mere` dict init and the if/elif/else
    # headers of the loop are truncated in this file view.
    with open(filein, 'r') as f :
    #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
    content = [line.split('\t') for line in content.splitlines()]
    chd = {'name':1, 'children':[]}
    for i, line in enumerate(content) :
        # root node: its two children are given by the next row's sizes
        chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
        mere[line[1]] = chd['children'][0]
        mere[line[2]] = chd['children'][1]
        if 'children' in mere[line[0]]:
            mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
            mere[line[1]] = mere[line[0]]['children'][-1]
            mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
            mere[line[2]] = mere[line[0]]['children'][-1]
        mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
        mere[line[1]] = mere[line[0]]['children'][-2]
        mere[line[2]] = mere[line[0]]['children'][-1]
    with open(fileout, 'w') as f :
        f.write(json.dumps(chd))
920 translation_languages = {"Afrikaans":"af", "Albanian":"sq", "Amharic":"am", "Arabic":"ar", "Armenian":"hy", "Azeerbaijani":"az", "Basque":"eu", "Belarusian":"be", "Bengali":"bn", "Bosnian":"bs", "Bulgarian":"bg", "Catalan":"ca", "Cebuano":"ceb", "Chichewa":"ny", "Chinese (Simplified)":"zh-CN", "Chinese (Traditional)":"zh-TW", "Corsican":"co", "Croatian":"hr", "Czech":"cs", "Danish":"da", "Dutch":"nl", "English":"en", "Esperanto":"eo", "Estonian":"et", "Filipino":"tl", "Finnish":"fi", "French":"fr", "Frisian":"fy", "Galician":"gl", "Georgian":"ka", "German":"de", "Greek":"el", "Gujarati":"gu", "Haitian Creole":"ht", "Hausa":"ha", "Hawaiian":"haw", "Hebrew":"iw", "Hindi":"hi", "Hmong":"hmn ", "Hungarian":"hu", "Icelandic":"is", "Igbo":"ig", "Indonesian":"id", "Irish":"ga", "Italian":"it", "Japanese":"ja", "Javanese":"jw", "Kannada":"kn", "Kazakh":"kk", "Khmer":"km", "Korean":"ko", "Kurdish":"ku", "Kyrgyz":"ky", "Lao":"lo", "Latin":"la", "Latvian":"lv", "Lithuanian":"lt", "Luxembourgish":"lb", "Macedonian":"mk", "Malagasy":"mg", "Malay":"ms", "Malayalam":"ml", "Maltese":"mt", "Maori":"mi", "Marathi":"mr", "Mongolian":"mn", "Burmese":"my", "Nepali":"ne", "Norwegian":"no", "Pashto":"ps", "Persian":"fa", "Polish":"pl", "Portuguese":"pt", "Punjabi":"ma", "Romanian":"ro", "Russian":"ru", "Samoan":"sm", "Scots Gaelic":"gd", "Serbian":"sr", "Sesotho":"st", "Shona":"sn", "Sindhi":"sd", "Sinhala":"si", "Slovak":"sk", "Slovenian":"sl", "Somali":"so", "Spanish":"es", "Sundanese":"su", "Swahili":"sw", "Swedish":"sv", "Tajik":"tg", "Tamil":"ta", "Telugu":"te", "Thai":"th", "Turkish":"tr", "Ukrainian":"uk", "Urdu":"ur", "Uzbek":"uz", "Vietnamese":"vi", "Welsh":"cy", "Xhosa":"xh", "Yiddish":"yi", "Yoruba":"yo", "Zulu":"zu", }
def gettranslation(words, lf, lt) :
    # Translate a list of words from language `lf` to `lt` via the public
    # Google Translate endpoint; spaces/quotes in results are normalized to
    # underscores so the output stays valid as lemma labels.
    # NOTE(review): the User-Agent dict literal is truncated in this view.
    import urllib.request, urllib.error, urllib.parse
    agent = {'User-Agent':
    .NET CLR 3.0.04506.30\
    base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
    # all words are sent in one request, separated by newlines
    totrans = urllib.parse.quote('\n'.join(words))
    link = base_link % (lf, lt, totrans)
    request = urllib.request.Request(link, headers=agent)
    raw_data = urllib.request.urlopen(request).read()
    data = json.loads(raw_data)
    return [line[0].replace("'", '_').replace(' | ', '|').replace(' ', '_').replace('-','_').replace('\n','') for line in data[0]]
def makenprof(prof, trans, deb=0) :
    # Rebuild profile rows substituting translated forms, starting at row
    # offset `deb`.  NOTE(review): nprof init, the substitution line and the
    # return are truncated in this file view.
    nprof.append(prof[0])
    for i, val in enumerate(trans) :
        line = prof[deb+i+1][:]

def treatempty(val) :
    # Return a placeholder for blank values (else branch truncated here).
    if val.strip() == '' :
def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 50) :
    # Translate cluster profiles (active then supplementary forms, capped at
    # `maxword` each), de-duplicating translations by appending '+'.
    # Returns are truncated in this view; presumably (nprof, lems).
    # NOTE(review): nprof/lems init and several try/except and else lines are
    # missing from this file view.
    for i in range(len(dictprofile)) :
        prof = dictprofile[repr(i+1)]
        # '*****' separates active forms from supplementary forms,
        # '*' marks the star-variables section
        lenact = prof.index(['*****', '*', '*', '*', '*', '*', '', ''])
        lenact = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
        lensup += prof.index(['*', '*', '*', '*', '*', '*', '', ''])
        lensup = lensup - lenact
        lensup += len(prof) - lenact
        if lenact > maxword :
        # translate active forms
        actori = [line[6] for line in prof[1:nlenact]]
        act = [val.replace('_', ' ') for val in actori]
        act = gettranslation(act, lf, lt)
        for j, val in enumerate(actori) :
            if act[j] not in lems :
            # disambiguate duplicated translations with '+' suffixes
            while act[j] in lems :
                act[j] = act[j] + "+"
        nprof[repr(i+1)] = makenprof(prof, act)
        if lensup > maxword :
        # translate supplementary forms
        supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
        sup = [val.replace('_', ' ') for val in supori]
        sup = [treatempty(val) for val in sup]
        sup = gettranslation(sup, lf, lt)
        for j, val in enumerate(supori) :
            if sup[j] not in lems :
            while sup[j] in lems :
                sup[j] = sup[j] + "+"
        nprof[repr(i+1)].append(['*****', '*', '*', '*', '*', '*', '', ''])
        nprof[repr(i+1)] += makenprof(prof, sup, deb=lenact)
        # copy the star-variables section unchanged
        lenet = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
        nprof[repr(i+1)].append(['*', '*', '*', '*', '*', '*', '', ''])
        nprof[repr(i+1)] += prof[(lenet+1):]
def write_translation_profile(prof, lems, language, dictpathout) :
    # Persist a translated profile: the profile CSV, the word mapping, and an
    # index of available translations (translations.txt).
    # NOTE(review): the else branch creating a fresh translist and a couple of
    # condition lines are truncated in this file view.
    if os.path.exists(dictpathout['translations.txt']) :
        with open(dictpathout['translations.txt'], 'r', encoding='utf8') as f :
            translist = f.read()
        translist = [line.split('\t') for line in translist.splitlines()]
    toprint.append(['','','','','',''])
    toprint.append(['***', 'nb classes', repr(len(prof)), '***', '', ''])
    for i in range(len(prof)) :
        toprint.append(['**', 'classe', repr(i+1), '**', '', ''])
        toprint.append(['****'] + prof[repr(i+1)][0] + ['****'])
        # normalize p-values back to plain numbers for the CSV
        rest = [[repr(line[1]), repr(line[2]), repr(line[3]), repr(line[4]), line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[repr(i+1)][1:]]
        for i, line in enumerate(prof[repr(i+1)][1:]) :
            rest[i] = ['*', '*', '*', '*', '*', '*']
            elif line[0] == '*****' :
                rest[i] = ['*****','*','*', '*', '*', '*']
    with open(dictpathout['translation_profile_%s.csv' % language], 'w', encoding='utf8') as f :
        f.write('\n'.join([';'.join(line) for line in toprint]))
    with open(dictpathout['translation_words_%s.csv' % language], 'w', encoding='utf8') as f :
        f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]))
    if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
        translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
        with open(dictpathout['translations.txt'], 'w', encoding='utf8') as f :
            f.write('\n'.join(['\t'.join(line) for line in translist]))
def makesentidict(infile, language) :
    # Build sentiment word lists (NRC-style lexicon) from a tab-separated
    # file and dump them as one row per sentiment to /tmp/tgenemo.csv.
    # NOTE(review): the file read and sentid init/loop header are truncated
    # in this file view; the hard-coded /tmp path is unix-only.
    with codecs.open(infile,'r', 'utf8') as f :
    content = [line.split('\t') for line in content.splitlines()]
    titles = content.pop(0)
    senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
    sentid[sent] = titles.index(sent)
    # locate the column holding the French translation
    frtitle = [val for val in titles if '(fr)' in val]
    frid = titles.index(frtitle[0])
    sentidict = [[line[frid].lower(), [line[sentid[sent]] for sent in senti]] for line in content]
    pos = ['positive'] + [line[0] for line in sentidict if line[1][0] == '1']
    neg = ['negative'] + [line[0] for line in sentidict if line[1][1] == '1']
    anger = ['anger'] + [line[0] for line in sentidict if line[1][2] == '1']
    anticipation = ['anticipation'] + [line[0] for line in sentidict if line[1][3] == '1']
    disgust = ['disgust'] + [line[0] for line in sentidict if line[1][4] == '1']
    fear = ['fear'] + [line[0] for line in sentidict if line[1][5] == '1']
    joy = ['joy'] + [line[0] for line in sentidict if line[1][6] == '1']
    sadness = ['sadness'] + [line[0] for line in sentidict if line[1][7] == '1']
    surprise = ['surprise'] + [line[0] for line in sentidict if line[1][8] == '1']
    trust = ['trust'] + [line[0] for line in sentidict if line[1][9] == '1']
    with open('/tmp/tgenemo.csv', 'w') as f :
        for val in [pos, neg, anger, anticipation, disgust, fear, joy, sadness, surprise, trust] :
            f.write('\t'.join(val) + '\n')
def countsentfromprof(prof, encoding, sentidict) :
    # Count sentiment hits in a profile CSV (first column = word, remaining
    # columns = integer counts).
    # NOTE(review): the file read and the counting logic are truncated in
    # this file view.
    with codecs.open(prof, 'r', encoding) as f :
    content = [line.split(';') for line in content.splitlines()]
    content = [[line[0], [int(val) for val in line[1:]]] for line in content]
    content = dict(content)

def iratolexico(infile, outfile, encoding) :
    # Convert an IRaMuTeQ corpus to Lexico format; '**** ' lines are the
    # text delimiters.  NOTE(review): body mostly truncated in this view.
    with codecs.open(infile, 'r', encoding) as f :
        if line.startswith('**** ') :