1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
7 #------------------------------------
8 # import des modules python
9 #------------------------------------
11 from subprocess import Popen, call, PIPE
23 from shutil import copyfile
26 #from dialog import BugDialog
28 from operator import itemgetter
30 #------------------------------------
31 # import des modules wx
32 #------------------------------------
36 #------------------------------------
37 # import des fichiers du projet
38 #------------------------------------
39 from configparser import ConfigParser
42 log = logging.getLogger('iramuteq')
45 indices_simi = ['cooccurrence' ,'pourcentage de cooccurrence','Russel','Jaccard', 'Kulczynski1', 'Kulczynski2', 'Mountford', 'Fager', 'simple matching', 'Hamman', 'Faith', 'Tanimoto', 'Dice', 'Phi', 'Stiles', 'Michael', 'Mozley', 'Yule', 'Yule2', 'Ochiai', 'Simpson', 'Braun-Blanquet','Chi-squared', 'Phi-squared', 'Tschuprow', 'Cramer', 'Pearson', 'binomial']
47 def open_folder(folder):
48 if sys.platform == "win32":
51 opener ="open" if sys.platform == "darwin" else "xdg-open"
52 #call([opener, folder])
53 call(["%s %s &" % (opener, folder)], shell=True)
55 def normpath_win32(path) :
56 if not sys.platform == 'win32' :
58 while '\\\\' in path :
59 path = path.replace('\\\\', '\\')
60 if path.startswith('\\') and not path.startswith('\\\\') :
65 def __init__(self, path = None, encoding = 'utf8'):
68 self.encoding = encoding
70 def __getitem__(self, key):
73 def read(self, path = None):
76 with codecs.open(path, 'r', self.encoding) as f :
78 tgen = [line.split('\t') for line in tgen.splitlines()]
79 tgen = dict([[line[0], line[1:]] for line in tgen])
83 def write(self, path = None):
86 with open(path, 'w', encoding='utf8') as f :
87 f.write('\n'.join(['\t'.join([val] + self.tgen[val]) for val in self.tgen]))
89 def writetable(self, pathout, tgens, totocc):
90 etoiles = list(totocc.keys())
92 with open(pathout, 'w', encoding='utf8') as f :
93 line = '\t'.join(['tgens'] + etoiles) + '\n'
96 line = '\t'.join([t] + [repr(tgens[t][et]) for et in etoiles]) + '\n'
100 while totname + repr(i) in tgens :
102 totname = totname + repr(i)
103 line = '\t'.join([totname] + [repr(totocc[et]) for et in etoiles]) + '\n'
107 def __init__(self, filein, syscoding = 'utf8') :
109 self.syscoding = syscoding
111 self.openedcorpus = {}
112 self.openedmatrix = {}
120 d = shelve.open(self.filein)
121 self.history = d.get('history', [])
122 self.matrix = d.get('matrix', [])
123 self.ordercorpus = dict([[corpus['uuid'], i] for i, corpus in enumerate(self.history)])
124 self.corpus = dict([[corpus['uuid'], corpus] for corpus in self.history])
125 self.analyses = dict([[analyse['uuid'], analyse] for corpus in self.history for analyse in corpus.get('analyses', [])])
126 self.matrixanalyse = dict([[mat['uuid'], mat] for mat in self.matrix])
127 self.ordermatrix = dict([[matrix['uuid'], i] for i, matrix in enumerate(self.matrix)])
131 d = shelve.open(self.filein)
132 d['history'] = self.history
133 d['matrix'] = self.matrix
136 def add(self, analyse) :
137 log.info('add to history %s' % analyse.get('corpus_name', 'pas un corpus'))
138 tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
139 if tosave['uuid'] in self.corpus :
140 log.info('problem : this uuid is already in history : %s' % tosave['uuid'])
142 if analyse.get('corpus', False) :
143 if analyse['uuid'] in self.analyses :
145 tosave['corpus'] = analyse['corpus']
146 tosave['name'] = analyse['name']
147 acorpus_uuid = analyse['corpus']
148 if acorpus_uuid in self.corpus :
149 if 'analyses' in self.history[self.ordercorpus[acorpus_uuid]] :
150 self.history[self.ordercorpus[acorpus_uuid]]['analyses'].append(tosave)
152 self.history[self.ordercorpus[acorpus_uuid]]['analyses'] = [tosave]
154 self.orph.append(tosave)
156 tosave['corpus_name'] = analyse['corpus_name']
157 #self.ordercorpus[tosave['uuid']] = len(history)
158 #self.corpus[tosave['uuid']] = analyse
159 self.history.append(tosave)
163 def addMatrix(self, analyse) :
165 #tosave['matrix_name'] = analyse['matrix_name']
166 tosave['analyses'] = []
167 self.matrix.append(tosave)
171 def addMatrixAnalyse(self, analyse) :
172 tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type'], 'matrix' : analyse['matrix']}
173 tosave['name'] = analyse['name']
174 if tosave['matrix'] in self.ordermatrix :
175 self.matrix[self.ordermatrix[tosave['matrix']]]['analyses'].append(tosave)
179 def addmultiple(self, analyses) :
180 log.info('add multiple')
181 for analyse in analyses :
182 tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
183 corpus = analyse['corpus']
184 tosave['corpus'] = corpus
185 tosave['name'] = analyse['name']
186 if corpus in self.corpus :
187 if 'analyses' in self.history[self.ordercorpus[corpus]] :
188 self.history[self.ordercorpus[corpus]]['analyses'].append(tosave)
190 self.history[self.ordercorpus[corpus]]['analyses'] = [tosave]
194 def delete(self, analyse, corpus = False) :
195 log.info('delete %s' % analyse.get('name', 'noname'))
197 self.history.pop(self.ordercorpus[analyse['uuid']])
198 if analyse['uuid'] in self.openedcorpus :
199 del self.openedcorpus[analyse['uuid']]
200 log.info('delete corpus : %s' % analyse['uuid'])
201 elif analyse['uuid'] in self.analyses :
202 todel = [i for i, ana in enumerate(self.corpus[analyse['corpus']]['analyses']) if ana['uuid'] == analyse['uuid']][0]
203 self.history[self.ordercorpus[analyse['corpus']]]['analyses'].pop(todel)
204 elif analyse['uuid'] in self.matrixanalyse :
205 self.matrix = [mat for mat in self.matrix if mat['uuid'] != analyse['uuid']]
206 elif analyse.get('matrix', False) in self.matrixanalyse :
207 analyses = self.matrix[self.ordermatrix[analyse['matrix']]]['analyses']
208 topop = [i for i, val in enumerate(analyses) if analyse['uuid'] == val['uuid']][0]
210 self.matrix[self.ordermatrix[analyse['matrix']]]['analyses'] = analyses
214 def addtab(self, analyse) :
215 self.opened[analyse['uuid']] = analyse
217 def rmtab(self, analyse) :
218 del self.opened[analyse['uuid']]
220 def update(self, analyse) :
221 if 'matrix_name' in analyse :
222 self.matrixanalyse[analyse['uuid']].update(analyse)
223 elif 'corpus_name' in analyse :
224 self.corpus[analyse['uuid']].update(analyse)
225 elif 'corpus' in analyse :
226 self.analyses[analyse['uuid']].update(analyse)
228 toupdate = [an for an in self.matrixanalyse[analyse['matrix']]['analyses'] if an['uuid'] == analyse['uuid']]
229 toupdate[0].update(analyse)
234 corpustodel = [corpus for corpus in self.history if not os.path.exists(corpus['ira'])]
236 for corpus in corpustodel :
237 print('cleaning :', corpus['corpus_name'])
238 self.delete(corpus, corpus = True)
239 anatodel = [analyse for corpus in self.history for analyse in corpus.get('analyses', []) if not os.path.exists(analyse.get('ira', '/'))]
240 for analyse in anatodel :
241 print('cleaning :', analyse['name'])
256 for corpus in self.history :
257 analysenb += len(corpus.get('analyses', []))
258 analyses = corpus.get('analyses', [])
259 for analyse in analyses :
260 if os.path.exists(analyse['ira']) :
261 ana = DoConf(analyse['ira']).getoptions()
263 time = ana['time'].split()
264 ha += int(time[0].replace('h','')) * 3600
265 ma += int(time[1].replace('m','')) * 60
266 sa += int(time[2].replace('s',''))
267 if os.path.exists(corpus['ira']) :
268 param = DoConf(corpus['ira']).getoptions()
269 time = param.get('time','0h 0m 0s')
271 hours += int(time[0].replace('h','')) * 3600
272 minutes += int(time[1].replace('m','')) * 60
273 secondes += int(time[2].replace('s',''))
274 if param.get('originalpath', False) :
275 if param['originalpath'] in corpusnb :
276 corpusnb[param['originalpath']] += 1
277 tokens += int(param['occurrences'])
279 corpusnb[param['originalpath']] = 1
284 if corpus['ira'] in todel :
288 print('Nbr total de corpus : %s' % len(self.history))
289 corpus_nb = len(corpusnb) + len(todel)
290 print('Nbr de corpus différents : %s' % corpus_nb)
291 lentodel = len(todel)
292 print('Nbr de corpus à supprimer : %s' % lentodel)
293 print('Nbr de sous corpus : %s' % subnb)
294 print("Nbr total d'occurrences : %s" % tokens)
295 print('Moyenne occurrences par corpus : %f' % (tokens/corpus_nb))
296 print('---------------------')
297 print("Nbr total d'analyses : %s" % analysenb)
298 print('Temps total indexation : %f h' % ((hours+minutes+secondes) / 3600))
299 print('Temps total analyses : %f h' % ((ha+ma+sa) / 3600))
302 return str(self.history)
305 def __init__(self, configfile=None, diff = None, parametres = None) :
306 self.configfile = configfile
307 self.conf = ConfigParser(interpolation=None) # pourquoi ce paramètre ???
309 if configfile is not None :
310 configfile = normpath_win32(configfile)
311 self.conf.read_file(codecs.open(configfile, 'r', 'utf8'))
313 if parametres is not None :
314 self.doparametres(parametres)
316 def doparametres(self, parametres) :
319 def getsections(self) :
320 return self.conf.sections()
322 def getoptions(self, section = None, diff = None):
325 section = self.conf.sections()[0]
326 for option in self.conf.options(section) :
327 if self.conf.get(section, option).isdigit() :
328 parametres[option] = int(self.conf.get(section, option))
329 elif self.conf.get(section, option) == 'False' :
330 parametres[option] = False
331 elif self.conf.get(section, option) == 'True' :
332 parametres[option] = True
333 elif self.conf.get(section, option).startswith('(') and self.conf.get(section, option).endswith(')') :
334 parametres[option] = ast.literal_eval(self.conf.get(section, option))
335 elif self.conf.get(section, option).startswith('[') and self.conf.get(section, option).endswith(']') :
336 parametres[option] = ast.literal_eval(self.conf.get(section, option))
338 parametres[option] = self.conf.get(section, option)
339 if 'type' not in parametres :
340 parametres['type'] = section
343 def makeoptions(self, sections, parametres, outfile = None) :
345 for i, section in enumerate(sections) :
346 txt += '[%s]\n' % section
347 if not self.conf.has_section(section) :
348 self.conf.add_section(section)
349 for option in parametres[i] :
350 if isinstance(parametres[i][option], int) :
351 self.conf.set(section, option, repr(parametres[i][option]))
352 txt += '%s = %i\n' % (option, parametres[i][option])
353 elif isinstance(parametres[i][option], str) :
354 self.conf.set(section, option, parametres[i][option])
355 txt += '%s = %s\n' % (option, parametres[i][option])
356 elif isinstance(parametres[i][option], wx.Colour) :
357 self.conf.set(section, option, str(parametres[i][option]))
358 txt += '%s = %s\n' % (option, str(parametres[i][option]))
359 elif option == 'analyses' :
362 self.conf.set(section, option, repr(parametres[i][option]))
363 txt += '%s = %s\n' % (option, repr(parametres[i][option]))
365 outfile = self.configfile
366 outfile = normpath_win32(outfile)
367 with open(outfile, 'w', encoding="utf-8") as f :
371 def totext(self, parametres) :
374 for val in parametres :
375 if isinstance(parametres[val], int) :
376 txt.append(' \t\t: '.join([val, repr(parametres[val])]))
377 elif isinstance(parametres[val], str) :
378 txt.append(' \t\t: '.join([val, parametres[val]]))
379 elif val in ['listet', 'stars'] :
382 txt.append(' \t\t: '.join([val, repr(parametres[val])]))
383 return '\n'.join(txt)
def write_tab(tab, fileout) :
    """Write a 2D table (list of rows) to *fileout* as a ';'-separated CSV.

    Non-numeric cells are quoted (csv.QUOTE_NONNUMERIC). The original
    opened the file without ever closing it; use a context manager so the
    handle is released (and the buffer flushed) even on error.
    """
    with open(fileout, 'w', newline='', encoding='utf8') as f :
        csvWriter = csv.writer(f, delimiter=';', quoting = csv.QUOTE_NONNUMERIC)
        csvWriter.writerows(tab)
390 class BugDialog(wx.Dialog):
391 def __init__(self, *args, **kwds):
392 # begin wxGlade: MyDialog.__init__
393 kwds["style"] = wx.DEFAULT_DIALOG_STYLE | wx.STAY_ON_TOP
394 kwds["size"] = wx.Size(500, 200)
395 wx.Dialog.__init__(self, *args, **kwds)
396 self.SetTitle(kwds['title'])
397 self.text_ctrl_1 = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE)
398 self.text_ctrl_1.SetBackgroundColour('#DDE8EB')
399 self.button_1 = wx.Button(self, wx.ID_OK, "")
401 self.__set_properties()
405 def __set_properties(self):
406 # begin wxGlade: MyDialog.__set_properties
407 self.SetMinSize(wx.Size(500, 200))
408 self.text_ctrl_1.SetMinSize(wx.Size(500, 200))
412 def __do_layout(self):
413 # begin wxGlade: MyDialog.__do_layout
414 sizer_1 = wx.BoxSizer(wx.VERTICAL)
415 sizer_1.Add(self.text_ctrl_1, 1, wx.EXPAND, 0)
416 sizer_1.Add(self.button_1, 0, wx.ALIGN_CENTER_HORIZONTAL, 0)
417 self.SetSizer(sizer_1)
422 def CreateIraFile(DictPathOut, clusternb, corpname='corpus_name', section = 'analyse'):
423 AnalyseConf = ConfigParser()
424 AnalyseConf.read(DictPathOut['ira'])
425 AnalyseConf.add_section(section)
426 date = datetime.datetime.now().ctime()
427 AnalyseConf.set(section, 'date', str(date))
428 AnalyseConf.set(section, 'clusternb', clusternb)
429 AnalyseConf.set(section, 'corpus_name', corpname)
431 fileout = open(DictPathOut['ira'], 'w', encoding='utf8')
432 AnalyseConf.write(fileout)
435 def multisort(liste2d, ordre, indices_tri):
438 methode destinée à remplacer 'comp' qui a disparu en Python 3
439 tri de tuples sur l'un des éléments du tuple
440 en principe, elle doit renvoyer les éléments triés selon le principe d'avant
441 tel que décrit dans la docstring de 'sortedby'
443 probablement à améliorer pour la rendre d'usage plus général
444 en acceptant un nombre variable de parametres ???
447 indices_triTuple = indices_tri.Tuple(int, ...)
448 for key in reversed(indices_tri):
449 liste2d.sort(key=attrgetter(key), reverse=ordre)
452 def sortedby(liste2d, direct, *indices):
455 sortedby: sort a list of lists (e.g. a table) by one or more indices
456 (columns of the table) and return the sorted list
459 for list = [[2,3],[1,2],[3,1]]:
460 sortedby(list,1) will return [[3, 1], [1, 2], [2, 3]],
461 sortedby(list,0) will return [[1, 2], [2, 3], [3, 1]]
463 elle n'est pas remplacée par la méthode 'multisort' ???
468 # nlist = map(lambda x, indices=indices:
469 # map(lambda i, x=x: x[i], indices) + [x],
472 # iramuteq passé à 2to3
473 # nlist = list(map(lambda x, indices=indices:
474 # list(map(lambda i, x=x: x[i], indices)) + [x],
477 for key in reversed(indices):
478 liste2d.sort(key=itemgetter(key), reverse=(direct==2))
484 # sorted_list = multisort(liste2d, direct, *indices)
487 # nlist.sort(reverse=True)
488 # sorted_list = multisort(liste2d, direct, *indices)
490 # return [l[-1] for l in nlist]
493 def add_type(line, dictlem):
494 if line[4] in dictlem:
495 line.append(dictlem[line[4]])
500 def treat_line_alceste(i, line) :
501 if line[0] == '*' or line[0] == '*****' :
506 elif float(line[5].replace(',', '.')) < 0.0001:
508 elif float(line[5].replace(',', '.')) > 0.05:
509 line[5] = 'NS (%s)' % str(float(line[5].replace(',', '.')))[0:7]
511 line[5] = str(float(line[5].replace(',', '.')))[0:7]
512 return [i, int(line[0]), int(line[1]), float(line[2]), float(line[3]), line[6], line[4], line[5]]
514 def ReadProfileAsDico(File, Alceste=False, encoding = 'utf8'):
516 print('lecture des profiles')
517 FileReader = open(File, 'r', encoding='utf8')
518 Filecontent = FileReader.readlines()
522 #rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace(',', '.').replace('\r','').split(';') for row in Filecontent]
523 rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace('\r','').split(';') for row in Filecontent]
525 ClusterNb = rows[0][2]
527 clusters = [row[2] for row in rows if row[0] == '**']
528 valclusters = [row[1:4] for row in rows if row[0] == '****']
529 lp = [i for i, line in enumerate(rows) if line[0] == '****']
530 prof = [rows[lp[i] + 1:lp[i+1] - 1] for i in range(0, len(lp)-1)] + [rows[lp[-1] + 1:len(rows)]]
532 prof = [[add_type(row, dictlem) for row in pr] for pr in prof]
533 prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
535 prof = [[line + [''] for line in pr] for pr in prof]
536 prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
537 for i, cluster in enumerate(clusters):
538 DictProfile[cluster] = [valclusters[i]] + prof[i]
541 def GetTxtProfile(dictprofile, cluster_size) :
543 for classe in range(0, len(dictprofile)) :
544 prof = dictprofile[str(classe + 1)]
545 clinfo = cluster_size[classe]
546 proflist.append('\n'.join([' '.join(['classe %i' % (classe + 1), '-', '%s uce sur %s - %s%%' % (clinfo[0], clinfo[1], clinfo[2])]), '\n'.join(['%5s|%5s|%6s|%6s|%8s|%8s|%20s\t%10s' % tuple([str(val) for val in line]) for line in prof if len(line)==8])]))
547 return '\n\n'.join(proflist)
549 def formatExceptionInfo(maxTBlevel=5):
550 cla, exc, trbk = sys.exc_info()
552 excName = cla.__name__
556 excArgs = exc.args[0]
558 excArgs = "<no args>"
559 excTb = traceback.format_tb(trbk, maxTBlevel)
560 return (excName, excArgs, excTb)
563 #fonction des etudiants de l'iut
564 def decoupercharact(chaine, longueur, longueurOptimale, separateurs = None) :
566 on part du dernier caractère, et on recule jusqu'au début de la chaîne.
567 Si on trouve un '$', c'est fini.
568 Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
570 separateurs = [['.', 60.0], ['?', 60.0], ['!', 60.0], ['£$£', 60], [':', 50.0], [';', 40.0], [',', 10.0], [' ', 0.1]]
571 trouve = False # si on a trouvé un bon séparateur
572 iDecoupe = 0 # indice du caractere ou il faut decouper
574 # on découpe la chaine pour avoir au maximum 240 caractères
575 longueur = min(longueur, len(chaine) - 1)
576 chaineTravail = chaine[:longueur + 1]
578 meilleur = ['', 0, 0] # type, poids et position du meilleur separateur
580 # on vérifie si on ne trouve pas un '$'
581 indice = chaineTravail.find('$')
586 # si on ne trouve rien, on cherche le meilleur séparateur
589 caractere = chaineTravail[nbCar]
590 distance = abs(longueurOptimale - nbCar) + 1
591 meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
593 # on vérifie si le caractére courant est une marque de ponctuation
594 for s in separateurs:
595 if caractere == s[0]:
596 # si c'est une ponctuation
598 if s[1] / distance > float(meilleur[1]) / meilleureDistance:
606 # et on termine la recherche
609 # on passe au caractère précédant
614 fin = chaine[iDecoupe + 1:]
615 retour = chaineTravail[:iDecoupe]
616 return len(retour) > 0, retour.split(), fin
617 # si on a rien trouvé
618 return False, chaine.split(), ''
621 exceptions = {'paragrapheOT' : "Un problème de formatage (présence d'un marqueur de paragraphe (-*) en dehors d'un texte) est survenu à la ligne ",
622 'EmptyText' : "Texte vide (probablement un problème de formatage du corpus). Le problème est apparu à la ligne ",
623 'CorpusEncoding' : "Problème d'encodage.",
624 'TextBeforeTextMark' : "Problème de formatage : du texte avant le premier marqueur de texte (****). Le problème est survenu à la ligne ",
625 'MissingAnalyse' : 'Aucun fichier à cet emplacement :\n',
628 def BugReport(parent, error = None):
629 for ch in parent.GetChildren():
630 if "<class 'wx._windows.ProgressDialog'>" == str(type(ch)):
632 excName, exc, excTb = formatExceptionInfo()
633 if excName == 'Exception' :
635 if len(exc.split()) == 2 :
636 mss, linenb = exc.split()
637 if mss in exceptions :
638 txt = exceptions[mss] + linenb
642 if exc in exceptions :
643 txt = exceptions[exc]
646 title = "Information"
648 txt = '\n !== BUG ==! \n'
649 txt += '*************************************\n'
650 txt += '\n'.join(excTb).replace(' ', ' ')
651 txt += excName + '\n'
655 dial = BugDialog(parent, **{'title' : title})
656 if 'Rerror' in dir(parent) :
660 dial.text_ctrl_1.write(txt)
661 dial.CenterOnParent()
665 def PlaySound(parent):
666 if parent.pref.getboolean('iramuteq', 'sound') :
668 if "gtk2" in wx.PlatformInfo:
669 error = Popen(['aplay','-q',os.path.join(parent.AppliPath,'son_fin.wav')])
671 sound = wx.adv.Sound(os.path.join(parent.AppliPath, 'son_fin.wav'))
672 sound.Play(wx.adv.SOUND_SYNC)
def ReadDicoAsDico(dicopath):
    """Read a tab-separated dictionary file into a dict.

    Each meaningful line of *dicopath* has the form
    ``key<TAB>val1<TAB>val2...``; the result maps ``key`` to
    ``[val1, val2, ...]``. Double quotes are stripped from the content.

    Fixes: the original filtered on ``line != ''`` which never matched
    (``readlines()`` keeps the trailing newline), so blank lines produced
    a spurious ``''`` key; filter on the stripped line instead. The
    redundant ``.replace('\\n', '')`` after ``rstrip('\\n\\r')`` is dropped.
    """
    with open(dicopath, 'r', encoding='UTF8') as f:
        content = f.readlines()
    lines = [line.rstrip('\n\r').replace('"', '').split('\t')
             for line in content if line.strip() != '']
    return dict([[line[0], line[1:]] for line in lines])
682 def ReadLexique(parent, lang = 'french', filein = None):
685 parent.lexique = ReadDicoAsDico(parent.DictPath.get(lang, 'french'))
687 parent.lexique = ReadDicoAsDico(filein)
692 parent.lexique = ReadDicoAsDico(filein)
694 def ReadList(filein, encoding = 'utf8', sep = ';'):
696 with open(filein, 'r', encoding='utf8') as f :
698 content = [line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.').split(sep) for line in content.splitlines()]
699 #file = codecs.open(filein, 'r', encoding)
700 #content = file.readlines()
702 first = content.pop(0)
703 #first = first.replace('\n', '').replace('\r','').replace('\"', '').split(sep)
707 #line = line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.')
708 #line = line.split(';')
717 don = float('%.5f' % float(val))
723 def exec_RCMD(rpath, command) :
724 log.info('R CMD INSTALL %s' % command)
725 rpath = rpath.replace('\\','\\\\')
726 error = call(["%s" % rpath, 'CMD', 'INSTALL', "%s" % command])
729 def exec_rcode(rpath, rcode, wait = True, graph = False):
730 log.info("R Script : %s" % rcode)
732 if sys.platform == 'darwin' :
734 macversion = platform.mac_ver()[0].split('.')
735 if int(macversion[1]) < 5 :
741 rpath = rpath.replace('\\','\\\\')
742 env = os.environ.copy()
743 if sys.platform == 'darwin' and 'LC_ALL' not in env:
744 env['LC_ALL'] = 'en_US.UTF-8'
747 if sys.platform == 'win32':
748 error = call(["%s" % rpath, "--vanilla","--slave","-f", "%s" % rcode])
750 error = call([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], env = env)
753 if sys.platform == 'win32':
754 pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
756 pid = Popen([rpath, '--slave', "--vanilla", "--encoding=UTF-8", "-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8') #PIPE ou STDOUT ?
760 if sys.platform == 'win32':
761 error = call(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
762 elif sys.platform == 'darwin' and needX11:
763 os.environ['DISPLAY'] = ':0.0'
764 error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
766 error = call([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], env = env, encoding='UTF-8')
769 if sys.platform == 'win32':
770 pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
771 elif sys.platform == 'darwin' and needX11:
772 os.environ['DISPLAY'] = ':0.0'
773 pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
775 pid = Popen([rpath, '--vanilla','--slave', "--encoding=UTF-8","-f %s" % rcode], stderr = PIPE, env = env, encoding='UTF-8')
778 def check_Rresult(parent, pid) :
779 if isinstance(pid, Popen) :
780 if pid.returncode != 0 :
781 error = pid.communicate()
782 error = [str(error[0]), error[1]]
783 if error[1] is None :
785 parent.Rerror = '\n'.join([str(pid.returncode), '\n'.join(error)])
787 raise Exception('\n'.join(['Erreur R', '\n'.join(error[1:])]))
796 raise Exception('Erreur R')
804 def launchcommand(mycommand):
807 def print_liste(filename,liste):
808 with open(filename,'w', encoding='utf8') as f :
810 f.write(';'.join(graph) +'\n')
812 def read_list_file(filename, encoding = 'utf8'):
813 with open(filename,'r', encoding='utf8') as f:
814 content=f.readlines()
815 ncontent=[line.replace('\n','').split(';') for line in content if line.strip() != '']
818 def progressbar(self, maxi):
819 ira = wx.GetApp().GetTopWindow()
825 prog = wx.ProgressDialog("Traitements",
826 "Veuillez patienter...",
829 style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE | wx.PD_ELAPSED_TIME | wx.PD_CAN_ABORT
832 # le ABORT n'est pas géré à tous les coups ???
833 prog.SetSize((400,150))
834 #prog.SetIcon(ira._icon)
837 def treat_var_mod(variables) :
839 variables = list(set(variables))
840 varmod = [variable.split('_') for variable in variables]
841 vars = list(set([var[0] for var in varmod if len(var) >=2]))
843 mods = ['_'.join(v) for v in varmod if v[0] == var]
846 # for variable in variables :
847 # if '_' in variable :
848 # forme = variable.split('_')
851 # if not var in var_mod :
852 # var_mod[var] = [variable]
854 # if not mod in var_mod[var] :
855 # var_mod[var].append(variable)
858 def doconcorde(corpus, uces, mots, uci = False) :
860 ucestxt1 = [row for row in corpus.getconcorde(uces)]
862 ucestxt1 = [row for row in corpus.getuciconcorde(uces)]
863 ucestxt1 = dict(ucestxt1)
866 listmot = [corpus.getlems()[lem].formes for lem in mots]
867 listmot = [corpus.getforme(fid).forme for lem in listmot for fid in lem]
868 mothtml = ['<font color=red><b>%s</b></font>' % mot for mot in listmot]
869 dmots = dict(list(zip(listmot, mothtml)))
871 ucetxt = ucestxt1[uce].split()
872 ucetxt = ' '.join([dmots.get(mot, mot) for mot in ucetxt])
874 uciid = corpus.getucefromid(uce).uci
875 ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[corpus.getucefromid(uce).uci].etoiles) + '<a href="%i_%i"> *%i_%i</a></b></p>' % (uciid, uce, uciid, uce))
877 ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[uce].etoiles) + '</b></p>')
878 ucestxt.append(ucetxt)
879 return ucis_txt, ucestxt
882 def getallstcarac(corpus, analyse) :
883 pathout = PathOut(analyse['ira'])
884 profils = ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, 'utf8')
887 def read_chd(filein, fileout):
888 with open(filein, 'r') as f :
890 #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
891 content = [line.split('\t') for line in content.splitlines()]
892 chd = {'name':1, 'children':[]}
894 for i, line in enumerate(content) :
896 chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
897 mere[line[1]] = chd['children'][0]
898 mere[line[2]] = chd['children'][1]
900 if 'children' in mere[line[0]]:
901 mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
902 mere[line[1]] = mere[line[0]]['children'][-1]
903 mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
904 mere[line[2]] = mere[line[0]]['children'][-1]
906 mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
907 mere[line[1]] = mere[line[0]]['children'][-2]
908 mere[line[2]] = mere[line[0]]['children'][-1]
909 with open(fileout, 'w') as f :
910 f.write(json.dumps(chd))
913 translation_languages = {"Afrikaans":"af", "Albanian":"sq", "Amharic":"am", "Arabic":"ar", "Armenian":"hy", "Azeerbaijani":"az", "Basque":"eu", "Belarusian":"be", "Bengali":"bn", "Bosnian":"bs", "Bulgarian":"bg", "Catalan":"ca", "Cebuano":"ceb", "Chichewa":"ny", "Chinese (Simplified)":"zh-CN", "Chinese (Traditional)":"zh-TW", "Corsican":"co", "Croatian":"hr", "Czech":"cs", "Danish":"da", "Dutch":"nl", "English":"en", "Esperanto":"eo", "Estonian":"et", "Filipino":"tl", "Finnish":"fi", "French":"fr", "Frisian":"fy", "Galician":"gl", "Georgian":"ka", "German":"de", "Greek":"el", "Gujarati":"gu", "Haitian Creole":"ht", "Hausa":"ha", "Hawaiian":"haw", "Hebrew":"iw", "Hindi":"hi", "Hmong":"hmn ", "Hungarian":"hu", "Icelandic":"is", "Igbo":"ig", "Indonesian":"id", "Irish":"ga", "Italian":"it", "Japanese":"ja", "Javanese":"jw", "Kannada":"kn", "Kazakh":"kk", "Khmer":"km", "Korean":"ko", "Kurdish":"ku", "Kyrgyz":"ky", "Lao":"lo", "Latin":"la", "Latvian":"lv", "Lithuanian":"lt", "Luxembourgish":"lb", "Macedonian":"mk", "Malagasy":"mg", "Malay":"ms", "Malayalam":"ml", "Maltese":"mt", "Maori":"mi", "Marathi":"mr", "Mongolian":"mn", "Burmese":"my", "Nepali":"ne", "Norwegian":"no", "Pashto":"ps", "Persian":"fa", "Polish":"pl", "Portuguese":"pt", "Punjabi":"ma", "Romanian":"ro", "Russian":"ru", "Samoan":"sm", "Scots Gaelic":"gd", "Serbian":"sr", "Sesotho":"st", "Shona":"sn", "Sindhi":"sd", "Sinhala":"si", "Slovak":"sk", "Slovenian":"sl", "Somali":"so", "Spanish":"es", "Sundanese":"su", "Swahili":"sw", "Swedish":"sv", "Tajik":"tg", "Tamil":"ta", "Telugu":"te", "Thai":"th", "Turkish":"tr", "Ukrainian":"uk", "Urdu":"ur", "Uzbek":"uz", "Vietnamese":"vi", "Welsh":"cy", "Xhosa":"xh", "Yiddish":"yi", "Yoruba":"yo", "Zulu":"zu", }
916 def gettranslation(words, lf, lt) :
917 import urllib.request, urllib.error, urllib.parse
919 agent = {'User-Agent':
927 .NET CLR 3.0.04506.30\
929 base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
931 totrans = urllib.parse.quote('\n'.join(words))
932 link = base_link % (lf, lt, totrans)
933 request = urllib.request.Request(link, headers=agent)
934 raw_data = urllib.request.urlopen(request).read()
935 data = json.loads(raw_data)
936 return [line[0].replace("'", '_').replace(' | ', '|').replace(' ', '_').replace('-','_').replace('\n','') for line in data[0]]
938 def makenprof(prof, trans, deb=0) :
941 nprof.append(prof[0])
942 for i, val in enumerate(trans) :
943 line = prof[deb+i+1][:]
948 def treatempty(val) :
949 if val.strip() == '' :
954 def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 50) :
957 for i in range(len(dictprofile)) :
958 prof = dictprofile[repr(i+1)]
960 lenact = prof.index(['*****', '*', '*', '*', '*', '*', '', ''])
964 lenact = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
970 lensup += prof.index(['*', '*', '*', '*', '*', '*', '', ''])
971 lensup = lensup - lenact
973 lensup += len(prof) - lenact
975 if lenact > maxword :
979 actori = [line[6] for line in prof[1:nlenact]]
980 act = [val.replace('_', ' ') for val in actori]
981 act = gettranslation(act, lf, lt)
982 for j, val in enumerate(actori) :
983 if act[j] not in lems :
986 while act[j] in lems :
987 act[j] = act[j] + "+"
989 nprof[repr(i+1)] = makenprof(prof, act)
992 if lensup > maxword :
996 supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
997 sup = [val.replace('_', ' ') for val in supori]
998 sup = [treatempty(val) for val in sup]
999 sup = gettranslation(sup, lf, lt)
1000 for j, val in enumerate(supori) :
1001 if sup[j] not in lems :
1004 while sup[j] in lems :
1005 sup[j] = sup[j] + "+"
1007 nprof[repr(i+1)].append(['*****', '*', '*', '*', '*', '*', '', ''])
1008 nprof[repr(i+1)] += makenprof(prof, sup, deb=lenact)
1011 lenet = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
1012 nprof[repr(i+1)].append(['*', '*', '*', '*', '*', '*', '', ''])
1013 nprof[repr(i+1)] += prof[(lenet+1):]
1018 def write_translation_profile(prof, lems, language, dictpathout) :
1019 if os.path.exists(dictpathout['translations.txt']) :
1020 with open(dictpathout['translations.txt'], 'r', encoding='utf8') as f :
1021 translist = f.read()
1022 translist = [line.split('\t') for line in translist.splitlines()]
1026 toprint.append(['','','','','',''])
1027 toprint.append(['***', 'nb classes', repr(len(prof)), '***', '', ''])
1028 for i in range(len(prof)) :
1029 toprint.append(['**', 'classe', repr(i+1), '**', '', ''])
1030 toprint.append(['****'] + prof[repr(i+1)][0] + ['****'])
1031 rest = [[repr(line[1]), repr(line[2]), repr(line[3]), repr(line[4]), line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[repr(i+1)][1:]]
1032 for i, line in enumerate(prof[repr(i+1)][1:]) :
1034 rest[i] = ['*', '*', '*', '*', '*', '*']
1035 elif line[0] == '*****' :
1036 rest[i] = ['*****','*','*', '*', '*', '*']
1038 with open(dictpathout['translation_profile_%s.csv' % language], 'w', encoding='utf8') as f :
1039 f.write('\n'.join([';'.join(line) for line in toprint]))
1040 with open(dictpathout['translation_words_%s.csv' % language], 'w', encoding='utf8') as f :
1041 f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]))
1042 if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
1043 translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
1044 with open(dictpathout['translations.txt'], 'w', encoding='utf8') as f :
1045 f.write('\n'.join(['\t'.join(line) for line in translist]))
1047 def makesentidict(infile, language) :
1048 with codecs.open(infile,'r', 'utf8') as f :
1050 content = [line.split('\t') for line in content.splitlines()]
1051 titles = content.pop(0)
1052 senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
1055 sentid[sent] = titles.index(sent)
1056 frtitle = [val for val in titles if '(fr)' in val]
1057 frid = titles.index(frtitle[0])
1058 sentidict = [[line[frid].lower(), [line[sentid[sent]] for sent in senti]] for line in content]
1059 pos = ['positive'] + [line[0] for line in sentidict if line[1][0] == '1']
1060 neg = ['negative'] + [line[0] for line in sentidict if line[1][1] == '1']
1061 anger = ['anger'] + [line[0] for line in sentidict if line[1][2] == '1']
1062 anticipation = ['anticipation'] + [line[0] for line in sentidict if line[1][3] == '1']
1063 disgust = ['disgust'] + [line[0] for line in sentidict if line[1][4] == '1']
1064 fear = ['fear'] + [line[0] for line in sentidict if line[1][5] == '1']
1065 joy = ['joy'] + [line[0] for line in sentidict if line[1][6] == '1']
1066 sadness = ['sadness'] + [line[0] for line in sentidict if line[1][7] == '1']
1067 surprise = ['surprise'] + [line[0] for line in sentidict if line[1][8] == '1']
1068 trust = ['trust'] + [line[0] for line in sentidict if line[1][9] == '1']
1069 with open('/tmp/tgenemo.csv', 'w') as f :
1070 for val in [pos, neg, anger, anticipation, disgust, fear, joy, sadness, surprise, trust] :
1071 f.write('\t'.join(val) + '\n')
1073 def countsentfromprof(prof, encoding, sentidict) :
1074 with codecs.open(prof, 'r', encoding) as f :
1076 content = [line.split(';') for line in content.splitlines()]
1078 content = [[line[0], [int(val) for val in line[1:]]] for line in content]
1080 content = dict(content)
1083 def iratolexico(infile, outfile, encoding) :
1084 with codecs.open(infile, 'r', encoding) as f :
1086 if line.startswith('**** ') :