iramuteq.org Git - iramuteq/blob - functions.py

   1 #!/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #Author: Pierre Ratinaud
   4 #Copyright (c) 2008-2012 Pierre Ratinaud
   5 #License: GNU/GPL
   6
   7 import wx
   8 import re
   9 from ConfigParser import ConfigParser
  10 from subprocess import Popen, call, PIPE
  11 import thread
  12 import os
  13 import ast
  14 import sys
  15 import csv
  16 import platform
  17 import traceback
  18 import codecs
  19 import locale
  20 import datetime
  21 from copy import copy
  22 from shutil import copyfile
  23 import shelve
  24 import json
  25 #from dialog import BugDialog
  26 import logging
  27
# Logger named 'iramuteq', used by the helpers of this module.
log = logging.getLogger('iramuteq')


# Labels of the available indices (French and English names mixed).
# NOTE(review): presumably callers refer to an index by its position in this
# list -- confirm before reordering or inserting entries.
indices_simi = [u'cooccurrence' ,'pourcentage de cooccurrence',u'Russel',u'Jaccard', 'Kulczynski1', 'Kulczynski2', 'Mountford', 'Fager', 'simple matching', 'Hamman', 'Faith', 'Tanimoto', 'Dice', 'Phi', 'Stiles', 'Michael', 'Mozley', 'Yule', 'Yule2', 'Ochiai', 'Simpson', 'Braun-Blanquet','Chi-squared', 'Phi-squared', 'Tschuprow', 'Cramer', 'Pearson', 'binomial']
  32
  33
  34
  35 def open_folder(folder):
  36     if sys.platform == "win32":
  37         os.startfile(folder)
  38     else:
  39         opener ="open" if sys.platform == "darwin" else "xdg-open"
  40         #call([opener, folder])
  41         call([u"%s %s &" % (opener, folder)], shell=True)
  42
  43 def normpath_win32(path) :
  44     if not sys.platform == 'win32' :
  45         return path
  46     while '\\\\' in path :
  47         path = path.replace('\\\\', '\\')
  48     if path.startswith('\\') and not path.startswith('\\\\') :
  49         path = '\\' + path
  50     return path
  51
  52 class TGen :
  53     def __init__(self, path = None, encoding = 'utf8'):
  54         self.path = path
  55         self.tgen = {}
  56         self.encoding = encoding
  57
  58     def __getitem__(self, key):
  59         return self.tgen[key]
  60
  61     def read(self, path = None):
  62         if path is None :
  63             path = self.path
  64         with codecs.open(path, 'r', self.encoding) as f :
  65             tgen = f.read()
  66         tgen = [line.split('\t') for line in tgen.splitlines()]
  67         tgen = dict([[line[0], line[1:]] for line in tgen])
  68         self.tgen = tgen
  69         self.path = path
  70
  71     def write(self, path = None):
  72         if path is None :
  73             path = self.path
  74         with open(path, 'w') as f :
  75             f.write('\n'.join(['\t'.join([val] + self.tgen[val]) for val in self.tgen]).encode(self.encoding))
  76
  77     def writetable(self, pathout, tgens, totocc):
  78         etoiles = totocc.keys()
  79         etoiles.sort()
  80         with open(pathout, 'w') as f :
  81             line = '\t'.join([u'tgens'] + etoiles) + '\n'
  82             f.write(line.encode(self.encoding))
  83             for t in tgens :
  84                 line = '\t'.join([t] + [`tgens[t][et]` for et in etoiles]) + '\n'
  85                 f.write(line.encode(self.encoding))
  86             i = 0
  87             totname = 'total'
  88             while totname + `i` in tgens :
  89                 i += 1
  90             totname = totname + `i`
  91             line = '\t'.join([totname] + [`totocc[et]` for et in etoiles]) + '\n'
  92             f.write(line.encode(self.encoding))
  93
  94 class History :
  95     def __init__(self, filein, syscoding = 'utf8') :
  96         self.filein = filein
  97         self.syscoding = syscoding
  98         self.corpus = {}
  99         self.openedcorpus = {}
 100         self.openedmatrix = {}
 101         self.orph = []
 102         self.analyses = {}
 103         self.history = []
 104         self.opened = {}
 105         self.read()
 106
 107     def read(self) :
 108         d = shelve.open(self.filein)
 109         self.history = d.get('history', [])
 110         self.matrix = d.get('matrix', [])
 111         self.ordercorpus = dict([[corpus['uuid'], i] for i, corpus in enumerate(self.history)])
 112         self.corpus = dict([[corpus['uuid'], corpus] for corpus in self.history])
 113         self.analyses = dict([[analyse['uuid'], analyse] for corpus in self.history for analyse in corpus.get('analyses', [])])
 114         self.matrixanalyse = dict([[mat['uuid'], mat] for mat in self.matrix])
 115         self.ordermatrix = dict([[matrix['uuid'], i] for i, matrix in enumerate(self.matrix)])
 116         d.close()
 117         d = {}
 118         d['history'] = self.history
 119         d['matrix'] = self.matrix
 120         with open('/home/pierre/hystory.json', 'w') as f :
 121             f.write(json.dumps(d, indent=4, default=str))
 122
 123     def write(self) :
 124         d = shelve.open(self.filein)
 125         d['history'] = self.history
 126         d['matrix'] = self.matrix
 127         d.close()
 128
 129     def add(self, analyse) :
 130         log.info('add to history %s' % analyse.get('corpus_name', 'pas un corpus'))
 131         tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
 132         if tosave['uuid'] in self.corpus :
 133             log.info('problem : this uuid is already in history : %s' % tosave['uuid'])
 134             return
 135         if analyse.get('corpus', False) :
 136             if analyse['uuid'] in self.analyses :
 137                 return
 138             tosave['corpus'] = analyse['corpus']
 139             tosave['name'] = analyse['name']
 140             acorpus_uuid =  analyse['corpus']
 141             if acorpus_uuid in self.corpus :
 142                 if 'analyses' in self.history[self.ordercorpus[acorpus_uuid]] :
 143                     self.history[self.ordercorpus[acorpus_uuid]]['analyses'].append(tosave)
 144                 else :
 145                     self.history[self.ordercorpus[acorpus_uuid]]['analyses'] = [tosave]
 146             else :
 147                 self.orph.append(tosave)
 148         else :
 149             tosave['corpus_name'] = analyse['corpus_name']
 150             #self.ordercorpus[tosave['uuid']] = len(history)
 151             #self.corpus[tosave['uuid']] = analyse
 152             self.history.append(tosave)
 153         self.write()
 154         self.read()
 155
 156     def addMatrix(self, analyse) :
 157         tosave = analyse
 158         #tosave['matrix_name'] = analyse['matrix_name']
 159         tosave['analyses'] = []
 160         self.matrix.append(tosave)
 161         self.write()
 162         self.read()
 163
 164     def addMatrixAnalyse(self, analyse) :
 165         tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type'], 'matrix' : analyse['matrix']}
 166         tosave['name'] = analyse['name']
 167         if tosave['matrix'] in self.ordermatrix :
 168             self.matrix[self.ordermatrix[tosave['matrix']]]['analyses'].append(tosave)
 169         self.write()
 170         self.read()
 171
 172     def addmultiple(self, analyses) :
 173         log.info('add multiple')
 174         for analyse in analyses :
 175             tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
 176             corpus = analyse['corpus']
 177             tosave['corpus'] = corpus
 178             tosave['name'] = analyse['name']
 179             if corpus in self.corpus :
 180                 if 'analyses' in self.history[self.ordercorpus[corpus]] :
 181                     self.history[self.ordercorpus[corpus]]['analyses'].append(tosave)
 182                 else :
 183                     self.history[self.ordercorpus[corpus]]['analyses'] = [tosave]
 184         self.write()
 185         self.read()
 186
 187     def delete(self, analyse, corpus = False) :
 188         log.info('delete %s' % analyse.get('name', 'noname'))
 189         if corpus :
 190             self.history.pop(self.ordercorpus[analyse['uuid']])
 191             if analyse['uuid'] in self.openedcorpus :
 192                 del self.openedcorpus[analyse['uuid']]
 193             log.info('delete corpus : %s' % analyse['uuid'])
 194         elif analyse['uuid'] in self.analyses :
 195             todel = [i for i, ana in enumerate(self.corpus[analyse['corpus']]['analyses']) if ana['uuid'] == analyse['uuid']][0]
 196             self.history[self.ordercorpus[analyse['corpus']]]['analyses'].pop(todel)
 197         elif analyse['uuid'] in self.matrixanalyse :
 198             self.matrix = [mat for mat in self.matrix if mat['uuid'] != analyse['uuid']]
 199         elif analyse.get('matrix', False) in self.matrixanalyse :
 200             analyses = self.matrix[self.ordermatrix[analyse['matrix']]]['analyses']
 201             topop = [i for i, val in enumerate(analyses) if analyse['uuid'] == val['uuid']][0]
 202             analyses.pop(topop)
 203             self.matrix[self.ordermatrix[analyse['matrix']]]['analyses'] = analyses
 204         self.write()
 205         self.read()
 206
 207     def addtab(self, analyse) :
 208         self.opened[analyse['uuid']] = analyse
 209
 210     def rmtab(self, analyse) :
 211         del self.opened[analyse['uuid']]
 212
 213     def update(self, analyse) :
 214         if 'matrix_name' in analyse :
 215             self.matrixanalyse[analyse['uuid']].update(analyse)
 216         elif 'corpus_name' in analyse :
 217             self.corpus[analyse['uuid']].update(analyse)
 218         elif 'corpus' in analyse :
 219             self.analyses[analyse['uuid']].update(analyse)
 220         else :
 221             toupdate = [an for an in self.matrixanalyse[analyse['matrix']]['analyses'] if an['uuid'] == analyse['uuid']]
 222             toupdate[0].update(analyse)
 223         self.write()
 224         self.read()
 225
 226     def clean(self) :
 227         corpustodel = [corpus for corpus in self.history if not os.path.exists(corpus['ira'])]
 228         print corpustodel
 229         for corpus in corpustodel :
 230             print 'cleaning :', corpus['corpus_name']
 231             self.delete(corpus, corpus = True)
 232         anatodel = [analyse for corpus in self.history for analyse in corpus.get('analyses', []) if not os.path.exists(analyse.get('ira', '/'))]
 233         for analyse in anatodel :
 234             print 'cleaning :', analyse['name']
 235             self.delete(analyse)
 236
 237     def dostat(self):
 238         todel = {}
 239         tokens = 0
 240         corpusnb = {}
 241         subnb = 0
 242         analysenb = 0
 243         hours = 0
 244         minutes = 0
 245         secondes = 0
 246         ha = 0
 247         ma = 0
 248         sa = 0
 249         for corpus in self.history :
 250             analysenb += len(corpus.get('analyses', []))
 251             analyses = corpus.get('analyses', [])
 252             for analyse in analyses :
 253                 if os.path.exists(analyse['ira']) :
 254                     ana = DoConf(analyse['ira']).getoptions()
 255                     if 'time' in ana :
 256                         time = ana['time'].split()
 257                         ha += int(time[0].replace('h','')) * 3600
 258                         ma += int(time[1].replace('m','')) * 60
 259                         sa += int(time[2].replace('s',''))
 260             if os.path.exists(corpus['ira']) :
 261                 param = DoConf(corpus['ira']).getoptions()
 262                 time = param.get('time','0h 0m 0s')
 263                 time = time.split()
 264                 hours += int(time[0].replace('h','')) * 3600
 265                 minutes += int(time[1].replace('m','')) * 60
 266                 secondes += int(time[2].replace('s',''))
 267                 if param.get('originalpath', False) :
 268                     if param['originalpath'] in corpusnb :
 269                         corpusnb[param['originalpath']] += 1
 270                         tokens += int(param['occurrences'])
 271                     else :
 272                         corpusnb[param['originalpath']] = 1
 273                     #print param
 274                 else :
 275                     subnb += 1
 276             else :
 277                 if corpus['ira'] in todel :
 278                     todel['ira'] += 1
 279                 else :
 280                     todel['ira'] = 1
 281         print u'Nbr total de corpus : %s' % len(self.history)
 282         corpus_nb = len(corpusnb) + len(todel)
 283         print u'Nbr de corpus différents : %s' % corpus_nb
 284         lentodel = len(todel)
 285         print u'Nbr de corpus à supprimer : %s' % lentodel
 286         print u'Nbr de sous corpus : %s' % subnb
 287         print u"Nbr total d'occurrences : %s" % tokens
 288         print u'Moyenne occurrences par corpus : %f' % (tokens/corpus_nb)
 289         print '---------------------'
 290         print u"Nbr total d'analyses : %s" % analysenb
 291         print u'Temps total indexation : %f h' % ((hours+minutes+secondes) / 3600)
 292         print u'Temps total analyses :  %f h' % ((ha+ma+sa) / 3600)
 293
 294     def __str__(self) :
 295         return str(self.history)
 296
 297 class DoConf :
 298     def __init__(self, configfile=None, diff = None, parametres = None) :
 299         self.configfile = configfile
 300         self.conf = ConfigParser()
 301
 302         if configfile is not None :
 303             configfile = normpath_win32(configfile)
 304             self.conf.readfp(codecs.open(configfile, 'r', 'utf8'))
 305         self.parametres = {}
 306         if parametres is not None :
 307             self.doparametres(parametres)
 308
 309     def doparametres(self, parametres) :
 310         return parametres
 311
 312     def getsections(self) :
 313         return self.conf.sections()
 314
 315     def getoptions(self, section = None, diff = None):
 316         parametres = {}
 317         if section is None :
 318             section = self.conf.sections()[0]
 319         for option in self.conf.options(section) :
 320             if self.conf.get(section, option).isdigit() :
 321                 parametres[option] = int(self.conf.get(section, option))
 322             elif self.conf.get(section, option) == 'False' :
 323                 parametres[option] = False
 324             elif self.conf.get(section, option) == 'True' :
 325                 parametres[option] = True
 326             elif self.conf.get(section, option).startswith('(') and self.conf.get(section, option).endswith(')') :
 327                 parametres[option] = ast.literal_eval(self.conf.get(section, option))
 328             elif self.conf.get(section, option).startswith('[') and self.conf.get(section, option).endswith(']') :
 329                 parametres[option] = ast.literal_eval(self.conf.get(section, option))
 330             else :
 331                 parametres[option] = self.conf.get(section, option)
 332         if 'type' not in parametres :
 333             parametres['type'] = section
 334         return parametres
 335
 336     def makeoptions(self, sections, parametres, outfile = None) :
 337         txt = ''
 338         for i, section in enumerate(sections) :
 339             txt += '[%s]\n' % section
 340             if not self.conf.has_section(section) :
 341                 self.conf.add_section(section)
 342             for option in parametres[i] :
 343                 if isinstance(parametres[i][option], int) :
 344                     self.conf.set(section, option, `parametres[i][option]`)
 345                     txt += '%s = %i\n' % (option, parametres[i][option])
 346                 elif isinstance(parametres[i][option], basestring) :
 347                     self.conf.set(section, option, parametres[i][option].encode('utf8'))
 348                     txt += '%s = %s\n' % (option, parametres[i][option])
 349                 elif isinstance(parametres[i][option], wx.Colour) :
 350                     self.conf.set(section, option, str(parametres[i][option]))
 351                     txt += '%s = %s\n' % (option, str(parametres[i][option]))
 352                 elif option == 'analyses' :
 353                     pass
 354                 else :
 355                     self.conf.set(section, option, `parametres[i][option]`)
 356                     txt += '%s = %s\n' % (option, `parametres[i][option]`)
 357         if outfile is None :
 358             outfile = self.configfile
 359         outfile = normpath_win32(outfile)
 360         with open(outfile, 'w') as f :
 361             f.write(txt.encode('utf8'))
 362             #self.conf.write(f)
 363
 364     def totext(self, parametres) :
 365         #txt = ['Corpus']
 366         txt = []
 367         for val in parametres :
 368             if isinstance(parametres[val], int) :
 369                 txt.append(' \t\t: '.join([val, `parametres[val]`]))
 370             elif isinstance(parametres[val], basestring) :
 371                 txt.append(' \t\t: '.join([val, parametres[val]]))
 372             elif val in ['listet', 'stars'] :
 373                 pass
 374             else :
 375                 txt.append(' \t\t: '.join([val, `parametres[val]`]))
 376         return '\n'.join(txt)
 377
 378
 379 def write_tab(tab, fileout) :
 380         writer = csv.writer(open(fileout, 'wb'), delimiter=';', quoting = csv.QUOTE_NONNUMERIC)
 381         writer.writerows(tab)
 382
 383 class BugDialog(wx.Dialog):
 384     def __init__(self, *args, **kwds):
 385         # begin wxGlade: MyDialog.__init__
 386         kwds["style"] = wx.DEFAULT_DIALOG_STYLE | wx.STAY_ON_TOP
 387         kwds["size"] = wx.Size(500, 200)
 388         wx.Dialog.__init__(self, *args, **kwds)
 389         self.SetTitle(kwds['title'])
 390         self.text_ctrl_1 = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE)
 391         self.text_ctrl_1.SetBackgroundColour('#DDE8EB')
 392         self.button_1 = wx.Button(self, wx.ID_OK, "")
 393
 394         self.__set_properties()
 395         self.__do_layout()
 396         # end wxGlade
 397
 398     def __set_properties(self):
 399         # begin wxGlade: MyDialog.__set_properties
 400         self.SetMinSize(wx.Size(500, 200))
 401         self.text_ctrl_1.SetMinSize(wx.Size(500, 200))
 402
 403         # end wxGlade
 404
 405     def __do_layout(self):
 406         # begin wxGlade: MyDialog.__do_layout
 407         sizer_1 = wx.BoxSizer(wx.VERTICAL)
 408         sizer_1.Add(self.text_ctrl_1, 1, wx.EXPAND, 0)
 409         sizer_1.Add(self.button_1, 0, wx.ALIGN_CENTER_HORIZONTAL, 0)
 410         self.SetSizer(sizer_1)
 411         sizer_1.Fit(self)
 412         self.Layout()
 413
 414
 415 def CreateIraFile(DictPathOut, clusternb, corpname='corpus_name', section = 'analyse'):
 416     AnalyseConf = ConfigParser()
 417     AnalyseConf.read(DictPathOut['ira'])
 418     AnalyseConf.add_section(section)
 419     date = datetime.datetime.now().ctime()
 420     AnalyseConf.set(section, 'date', str(date))
 421     AnalyseConf.set(section, 'clusternb', clusternb)
 422     AnalyseConf.set(section, 'corpus_name', corpname)
 423
 424     fileout = open(DictPathOut['ira'], 'w')
 425     AnalyseConf.write(fileout)
 426     fileout.close()
 427
 428 def sortedby(list, direct, *indices):
 429
 430     """
 431         sortedby: sort a list of lists (e.g. a table) by one or more indices
 432                   (columns of the table) and return the sorted list
 433
 434         e.g.
 435          for list = [[2,3],[1,2],[3,1]]:
 436          sortedby(list,1) will return [[3, 1], [1, 2], [2, 3]],
 437          sortedby(list,0) will return [[1, 2], [2, 3], [3, 1]]
 438     """
 439
 440     nlist = map(lambda x, indices=indices:
 441                  map(lambda i, x=x: x[i], indices) + [x],
 442                  list)
 443     if direct == 1:
 444         nlist.sort()
 445     elif direct == 2:
 446         nlist.sort(reverse=True)
 447     return map(lambda l: l[-1], nlist)
 448
 449 def add_type(line, dictlem):
 450     if line[4] in dictlem:
 451         line.append(dictlem[line[4]])
 452     else :
 453         line.append('')
 454     return line
 455
 456 def treat_line_alceste(i, line) :
 457     if line[0] == '*' or line[0] == '*****' :
 458         return line + ['']
 459     if line[5] == 'NA':
 460         print 'NA', line[5]
 461         pass
 462     elif float(line[5].replace(',', '.')) < 0.0001:
 463         line[5] = '< 0,0001'
 464     elif float(line[5].replace(',', '.')) > 0.05:
 465         line[5] = 'NS (%s)' % str(float(line[5].replace(',', '.')))[0:7]
 466     else:
 467         line[5] = str(float(line[5].replace(',', '.')))[0:7]
 468     return [i, int(line[0]), int(line[1]), float(line[2]), float(line[3]), line[6], line[4], line[5]]
 469
 470 def ReadProfileAsDico(File, Alceste=False, encoding = sys.getdefaultencoding()):
 471     dictlem = {}
 472     print 'lecture des profiles'
 473     FileReader = codecs.open(File, 'r', encoding)
 474     Filecontent = FileReader.readlines()
 475     FileReader.close()
 476     DictProfile = {}
 477     count = 0
 478     #rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace(',', '.').replace('\r','').split(';') for row in Filecontent]
 479     rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace('\r','').split(';') for row in Filecontent]
 480     rows.pop(0)
 481     ClusterNb = rows[0][2]
 482     rows.pop(0)
 483     clusters = [row[2] for row in rows if row[0] == u'**']
 484     valclusters = [row[1:4] for row in rows if row[0] == u'****']
 485     lp = [i for i, line in enumerate(rows) if line[0] == u'****']
 486     prof = [rows[lp[i] + 1:lp[i+1] - 1] for i in range(0, len(lp)-1)] + [rows[lp[-1] + 1:len(rows)]]
 487     if Alceste :
 488         prof = [[add_type(row, dictlem) for row in pr] for pr in prof]
 489         prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
 490     else :
 491         prof = [[line + [''] for line in pr] for pr in prof]
 492         prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
 493     for i, cluster in enumerate(clusters):
 494         DictProfile[cluster] = [valclusters[i]] + prof[i]
 495     return DictProfile
 496
 497 def GetTxtProfile(dictprofile, cluster_size) :
 498     proflist = []
 499     for classe in range(0, len(dictprofile)) :
 500         prof = dictprofile[str(classe + 1)]
 501         clinfo = cluster_size[classe]
 502         proflist.append('\n'.join([' '.join(['classe %i' % (classe + 1), '-', '%s uce sur %s - %s%%' % (clinfo[0], clinfo[1], clinfo[2])]), '\n'.join(['%5s|%5s|%6s|%6s|%8s|%8s|%20s\t%10s' % tuple([str(val) for val in line]) for line in prof if len(line)==8])]))
 503     return '\n\n'.join(proflist)
 504
 505 def formatExceptionInfo(maxTBlevel=5):
 506     cla, exc, trbk = sys.exc_info()
 507     try :
 508         excName = cla.__name__
 509     except :
 510         excName = 'None'
 511     try:
 512         excArgs = exc.args[0]
 513     except :
 514         excArgs = "<no args>"
 515     excTb = traceback.format_tb(trbk, maxTBlevel)
 516     return (excName, excArgs, excTb)
 517
 518
# function written by the IUT students
 520 def decoupercharact(chaine, longueur, longueurOptimale, separateurs = None) :
 521     """
 522         on part du dernier caractère, et on recule jusqu'au début de la chaîne.
 523         Si on trouve un '$', c'est fini.
 524         Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
 525     """
 526     separateurs = [[u'.', 60.0], [u'?', 60.0], [u'!', 60.0], [u'£$£', 60], [u':', 50.0], [u';', 40.0], [u',', 10.0], [u' ', 0.1]]
 527     trouve = False                 # si on a trouvé un bon séparateur
 528     iDecoupe = 0                # indice du caractere ou il faut decouper
 529
 530     # on découpe la chaine pour avoir au maximum 240 caractères
 531     longueur = min(longueur, len(chaine) - 1)
 532     chaineTravail = chaine[:longueur + 1]
 533     nbCar = longueur
 534     meilleur = ['', 0, 0]        # type, poids et position du meilleur separateur
 535
 536     # on vérifie si on ne trouve pas un '$'
 537     indice = chaineTravail.find(u'$')
 538     if indice > -1:
 539         trouve = True
 540         iDecoupe = indice
 541
 542     # si on ne trouve rien, on cherche le meilleur séparateur
 543     if not trouve:
 544         while nbCar >= 0:
 545             caractere = chaineTravail[nbCar]
 546             distance = abs(longueurOptimale - nbCar) + 1
 547             meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
 548
 549             # on vérifie si le caractére courant est une marque de ponctuation
 550             for s in separateurs:
 551                 if caractere == s[0]:
 552                     # si c'est une ponctuation
 553
 554                     if s[1] / distance > float(meilleur[1]) / meilleureDistance:
 555                         # print nbCar, s[0]
 556                         meilleur[0] = s[0]
 557                         meilleur[1] = s[1]
 558                         meilleur[2] = nbCar
 559                         trouve = True
 560                         iDecoupe = nbCar
 561
 562                     # et on termine la recherche
 563                     break
 564
 565             # on passe au caractère précédant
 566             nbCar = nbCar - 1
 567
 568     # si on a trouvé
 569     if trouve:
 570         fin = chaine[iDecoupe + 1:]
 571         retour = chaineTravail[:iDecoupe]
 572         return len(retour) > 0, retour.split(), fin
 573     # si on a rien trouvé
 574     return False, chaine.split(), ''
 575
 576
# User-facing (French) messages, keyed by the first word of the message carried
# by an Exception; BugReport appends the line number to the messages ending
# with "à la ligne ".
exceptions = {'paragrapheOT' : u"Un problème de formatage (présence d'un marqueur de paragraphe (-*) en dehors d'un texte) est survenu à la ligne ",
              'EmptyText' : u"Texte vide (probablement un problème de formatage du corpus). Le problème est apparu à la ligne ",
              'CorpusEncoding' : u"Problème d'encodage.",
              'TextBeforeTextMark' : u"Problème de formatage : du texte avant le premier marqueur de texte (****). Le problème est survenu à la ligne ",
              'MissingAnalyse' : u'Aucun fichier à cet emplacement :\n',
}
 583
 584 def BugReport(parent, error = None):
 585     for ch in parent.GetChildren():
 586         if "<class 'wx._windows.ProgressDialog'>" == str(type(ch)):
 587             ch.Destroy()
 588     excName, exc, excTb = formatExceptionInfo()
 589     if excName == 'Exception' :
 590         print exc
 591         if len(exc.split()) == 2 :
 592             mss, linenb = exc.split()
 593             if mss in exceptions :
 594                 txt = exceptions[mss] + linenb
 595             else :
 596                 txt = exc
 597         else :
 598             if exc in exceptions :
 599                 txt = exceptions[exc]
 600             else :
 601                 txt = exc
 602         title = "Information"
 603     else :
 604         txt = u'            !== BUG ==!       \n'
 605         txt += u'*************************************\n'
 606         txt += '\n'.join(excTb).replace('    ', ' ')
 607         txt += excName + '\n'
 608         txt += `exc`
 609         title = "Bug"
 610
 611     dial = BugDialog(parent, **{'title' : title})
 612     if 'Rerror' in dir(parent) :
 613         txt += parent.Rerror
 614         parent.Rerror = ''
 615     log.info(txt)
 616     dial.text_ctrl_1.write(txt)
 617     dial.CenterOnParent()
 618     dial.ShowModal()
 619     dial.Destroy()
 620
 621 def PlaySound(parent):
 622     if parent.pref.getboolean('iramuteq', 'sound') :
 623         try:
 624             if "gtk2" in wx.PlatformInfo:
 625                 error = Popen(['aplay','-q',os.path.join(parent.AppliPath,'son_fin.wav')])
 626             else :
 627                 sound = wx.Sound(os.path.join(parent.AppliPath, 'son_fin.wav'))
 628                 sound.Play(wx.SOUND_SYNC)
 629         except :
 630             print 'pas de son'
 631
 632 def ReadDicoAsDico(dicopath):
 633     with codecs.open(dicopath, 'r', 'UTF8') as f:
 634         content = f.readlines()
 635     lines = [line.rstrip('\n\r').replace(u'\n', '').replace('"', '').split('\t') for line in content if line != u'']
 636     return dict([[line[0], line[1:]] for line in lines])
 637
 638 def ReadLexique(parent, lang = 'french', filein = None):
 639     if lang != 'other' :
 640         if filein is None :
 641             parent.lexique = ReadDicoAsDico(parent.DictPath.get(lang, 'french'))
 642         else :
 643             parent.lexique = ReadDicoAsDico(filein)
 644     else :
 645         if filein is None :
 646             parent.lexique = {}
 647         else :
 648             parent.lexique = ReadDicoAsDico(filein)
 649
 650 def ReadList(filein, encoding = sys.getdefaultencoding(), sep = ';'):
 651     #file = open(filein)
 652     with codecs.open(filein, 'r', encoding) as f :
 653         content = f.read()
 654     content = [line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.').split(sep) for line in content.splitlines()]
 655     #file = codecs.open(filein, 'r', encoding)
 656     #content = file.readlines()
 657     #file.close()
 658     first = content.pop(0)
 659     #first = first.replace('\n', '').replace('\r','').replace('\"', '').split(sep)
 660     dict = {}
 661     i = 0
 662     for line in content:
 663         #line = line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.')
 664         #line = line.split(';')
 665         nline = [line[0]]
 666         for val in line[1:]:
 667             if val == u'NA' :
 668                 don = ''
 669             else:
 670                 try:
 671                     don = int(val)
 672                 except:
 673                     don = float('%.5f' % float(val))
 674             nline.append(don)
 675         dict[i] = nline
 676         i += 1
 677     return dict, first
 678
 679 def exec_RCMD(rpath, command) :
 680     log.info('R CMD INSTALL %s' % command)
 681     rpath = rpath.replace('\\','\\\\')
 682     error = call(["%s" % rpath, 'CMD', 'INSTALL', "%s" % command])
 683     return error
 684
 685 def exec_rcode(rpath, rcode, wait = True, graph = False):
 686     log.info("R Script : %s" % rcode)
 687     needX11 = False
 688     if sys.platform == 'darwin' :
 689         try :
 690             macversion = platform.mac_ver()[0].split('.')
 691             if int(macversion[1]) < 5 :
 692                 needX11 = True
 693             else :
 694                 needX11 = False
 695         except :
 696             needX11 = False
 697
 698     rpath = rpath.replace('\\','\\\\')
 699     env = os.environ.copy()
 700     if sys.platform == 'darwin' and 'LC_ALL' not in env:
 701         env['LC_ALL'] = 'en_US.UTF-8'
 702     if not graph :
 703         if wait :
 704             if sys.platform == 'win32':
 705                 error = call(["%s" % rpath, "--vanilla","--slave","-f", "%s" % rcode])
 706             else :
 707                 error = call([rpath, '--slave', "--vanilla", "-f %s" % rcode, "--encoding=UTF-8"], env = env)
 708             return error
 709         else :
 710             if sys.platform == 'win32':
 711                 pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
 712             else :
 713                 pid = Popen([rpath, '--slave', "--vanilla", "-f %s" % rcode, "--encoding=UTF-8"], stderr = PIPE, env = env)
 714             return pid
 715     else :
 716         if wait :
 717             if sys.platform == 'win32':
 718                 error = call(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
 719             elif sys.platform == 'darwin' and needX11:
 720                 os.environ['DISPLAY'] = ':0.0'
 721                 error = call([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], env = env)
 722             else :
 723                 error = call([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], env = env)
 724             return error
 725         else :
 726             if sys.platform == 'win32':
 727                 pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
 728             elif sys.platform == 'darwin' and needX11:
 729                 os.environ['DISPLAY'] = ':0.0'
 730                 pid = Popen([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], stderr = PIPE, env = env)
 731             else :
 732                 pid = Popen([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], stderr = PIPE, env = env)
 733             return pid
 734
 735 def check_Rresult(parent, pid) :
 736     if isinstance(pid, Popen) :
 737         if pid.returncode != 0 :
 738             error = pid.communicate()
 739             error = [str(error[0]), error[1]]
 740             if error[1] is None :
 741                 error[1] = 'None'
 742             parent.Rerror = '\n'.join([str(pid.returncode), '\n'.join(error)])
 743             try :
 744                 raise Exception('\n'.join([u'Erreur R', '\n'.join(error[1:])]))
 745             except :
 746                 BugReport(parent)
 747             return False
 748         else :
 749             return True
 750     else :
 751         if pid != 0 :
 752             try :
 753                 raise Exception(u'Erreur R')
 754             except :
 755                 BugReport(parent)
 756             return False
 757         else :
 758             return True
 759
 760
 761 def launchcommand(mycommand):
 762     Popen(mycommand)
 763
 764 def print_liste(filename,liste):
 765     with open(filename,'w') as f :
 766         for graph in liste :
 767             f.write(';'.join(graph).encode(sys.getdefaultencoding(), errors='replace')+'\n')
 768
 769 def read_list_file(filename, encoding = sys.getdefaultencoding()):
 770     with codecs.open(filename,'rU', encoding) as f :
 771         content=f.readlines()
 772         ncontent=[line.replace('\n','').split(';') for line in content if line.strip() != '']
 773     return ncontent
 774
 775 def progressbar(self, maxi) :
 776     ira = wx.GetApp().GetTopWindow()
 777     parent = ira
 778     try :
 779         maxi = int(maxi)
 780     except :
 781         maxi = 1
 782     prog = wx.ProgressDialog("Traitements",
 783                              "Veuillez patienter...",
 784                              maximum=maxi,
 785                              parent=parent,
 786                              style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE | wx.PD_ELAPSED_TIME | wx.PD_CAN_ABORT
 787                              )
 788     prog.SetSize((400,150))
 789     #prog.SetIcon(ira._icon)
 790     return prog
 791
 792 def treat_var_mod(variables) :
 793     var_mod = {}
 794     variables = list(set(variables))
 795     varmod = [variable.split('_') for variable in variables]
 796     vars = list(set([var[0] for var in varmod if len(var) >=2]))
 797     for var in vars :
 798         mods = ['_'.join(v) for v in varmod if v[0] == var]
 799         var_mod[var] = mods
 800
 801 #     for variable in variables :
 802 #         if u'_' in variable :
 803 #             forme = variable.split(u'_')
 804 #             var = forme[0]
 805 #             mod = forme[1]
 806 #             if not var in var_mod :
 807 #                 var_mod[var] = [variable]
 808 #             else :
 809 #                 if not mod in var_mod[var] :
 810 #                     var_mod[var].append(variable)
 811     return var_mod
 812
 813 def doconcorde(corpus, uces, mots, uci = False, et = False) :
 814     if not uci :
 815         ucestxt1 = [row for row in corpus.getconcorde(uces)]
 816     else :
 817         ucestxt1 = [row for row in corpus.getuciconcorde(uces)]
 818     ucestxt1 = dict(ucestxt1)
 819     ucestxt = []
 820     ucis_txt = []
 821     if not et :
 822         listmot = [corpus.getlems()[lem].formes for lem in mots]
 823         listmot = [corpus.getforme(fid).forme for lem in listmot for fid in lem]
 824     else :
 825         listmot = mots
 826     mothtml = ['<font color=red><b>%s</b></font>' % mot for mot in listmot]
 827     dmots = dict(zip(listmot, mothtml))
 828     for uce in uces :
 829         ucetxt = ucestxt1[uce].split()
 830         ucetxt = ' '.join([dmots.get(mot, mot) for mot in ucetxt])
 831         if not uci :
 832             uciid = corpus.getucefromid(uce).uci
 833             ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[corpus.getucefromid(uce).uci].etoiles) + '<a href="%i_%i"> *%i_%i</a></b></p>' % (uciid, uce, uciid, uce))
 834         else :
 835             ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[uce].etoiles) + '</b></p>')
 836         ucestxt.append(ucetxt)
 837     return ucis_txt, ucestxt
 838
 839
def getallstcarac(corpus, analyse) :
   """Load the profiles of an analysis and print them.

   corpus  -- not used by the current body.
   analyse -- mapping whose 'ira' entry is handed to PathOut.

   NOTE(review): this function cannot run as written -- `self` is not
   defined in a module-level function, so `self.encoding` raises NameError.
   `Alceste` is not defined in the part of the file visible here either;
   confirm where it is supposed to come from before using this function.
   """
   pathout = PathOut(analyse['ira'])
   # NOTE(review): `self.encoding` is undefined here (see docstring).
   profils =  ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, self.encoding)
   # debugging output only; nothing is returned
   print profils
 844
 845 def read_chd(filein, fileout):
 846     with open(filein, 'r') as f :
 847         content = f.read()
 848     #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
 849     content = [line.split('\t') for line in content.splitlines()]
 850     chd = {'name':1, 'children':[]}
 851     mere={}
 852     for i, line in enumerate(content) :
 853         if i == 0 :
 854             chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
 855             mere[line[1]] = chd['children'][0]
 856             mere[line[2]] = chd['children'][1]
 857         elif not i % 2 :
 858             if 'children' in mere[line[0]]:
 859                 mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
 860                 mere[line[1]] = mere[line[0]]['children'][-1]
 861                 mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
 862                 mere[line[2]] = mere[line[0]]['children'][-1]
 863             else :
 864                 mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
 865                 mere[line[1]] = mere[line[0]]['children'][-2]
 866                 mere[line[2]] = mere[line[0]]['children'][-1]
 867     with open(fileout, 'w') as f :
 868         f.write(json.dumps(chd))
 869
 870
 871 translation_languages = {"Afrikaans":"af", "Albanian":"sq", "Amharic":"am", "Arabic":"ar", "Armenian":"hy", "Azeerbaijani":"az", "Basque":"eu", "Belarusian":"be", "Bengali":"bn", "Bosnian":"bs", "Bulgarian":"bg", "Catalan":"ca", "Cebuano":"ceb", "Chichewa":"ny", "Chinese (Simplified)":"zh-CN", "Chinese (Traditional)":"zh-TW", "Corsican":"co", "Croatian":"hr", "Czech":"cs", "Danish":"da", "Dutch":"nl", "English":"en", "Esperanto":"eo", "Estonian":"et", "Filipino":"tl", "Finnish":"fi", "French":"fr", "Frisian":"fy", "Galician":"gl", "Georgian":"ka", "German":"de", "Greek":"el", "Gujarati":"gu", "Haitian Creole":"ht", "Hausa":"ha", "Hawaiian":"haw", "Hebrew":"iw", "Hindi":"hi", "Hmong":"hmn ", "Hungarian":"hu", "Icelandic":"is", "Igbo":"ig", "Indonesian":"id", "Irish":"ga", "Italian":"it", "Japanese":"ja", "Javanese":"jw", "Kannada":"kn", "Kazakh":"kk", "Khmer":"km", "Korean":"ko", "Kurdish":"ku", "Kyrgyz":"ky", "Lao":"lo", "Latin":"la", "Latvian":"lv", "Lithuanian":"lt", "Luxembourgish":"lb", "Macedonian":"mk", "Malagasy":"mg", "Malay":"ms", "Malayalam":"ml", "Maltese":"mt", "Maori":"mi", "Marathi":"mr", "Mongolian":"mn", "Burmese":"my", "Nepali":"ne", "Norwegian":"no", "Pashto":"ps", "Persian":"fa", "Polish":"pl", "Portuguese":"pt", "Punjabi":"ma", "Romanian":"ro", "Russian":"ru", "Samoan":"sm", "Scots Gaelic":"gd", "Serbian":"sr", "Sesotho":"st", "Shona":"sn", "Sindhi":"sd", "Sinhala":"si", "Slovak":"sk", "Slovenian":"sl", "Somali":"so", "Spanish":"es", "Sundanese":"su", "Swahili":"sw", "Swedish":"sv", "Tajik":"tg", "Tamil":"ta", "Telugu":"te", "Thai":"th", "Turkish":"tr", "Ukrainian":"uk", "Urdu":"ur", "Uzbek":"uz", "Vietnamese":"vi", "Welsh":"cy", "Xhosa":"xh", "Yiddish":"yi", "Yoruba":"yo", "Zulu":"zu", }
 872
 873
 874 def gettranslation(words, lf, lt) :
 875     import urllib2
 876     import json
 877     agent = {'User-Agent':
 878     "Mozilla/4.0 (\
 879     compatible;\
 880     MSIE 6.0;\
 881     Windows NT 5.1;\
 882     SV1;\
 883     .NET CLR 1.1.4322;\
 884     .NET CLR 2.0.50727;\
 885     .NET CLR 3.0.04506.30\
 886     )"}
 887     base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
 888     print len(words)
 889     totrans = urllib2.quote('\n'.join(words).encode('utf8'))
 890     link = base_link % (lf, lt, totrans)
 891     request = urllib2.Request(link, headers=agent)
 892     raw_data = urllib2.urlopen(request).read()
 893     data = json.loads(raw_data)
 894     return [line[0].decode('utf8', error='replace').replace(u"'", u'_').replace(u' | ', u'|').replace(u' ', u'_').replace(u'-',u'_').replace(u'\n','') for line in data[0]]
 895
 896 def makenprof(prof, trans, deb=0) :
 897     nprof=[]
 898     if deb == 0 :
 899         nprof.append(prof[0])
 900     for i, val in enumerate(trans) :
 901         line = prof[deb+i+1][:]
 902         line[6] = val
 903         nprof.append(line)
 904     return nprof
 905
 906 def treatempty(val) :
 907     if val.strip() == '' :
 908         return '_'
 909     else :
 910         return val
 911
def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 20) :
    """Translate column 6 of the rows of every profile in `dictprofile`.

    corpus      -- not used by the current body.
    dictprofile -- mapping keyed by the repr of 1-based integers ('1', '2',
                   ...) to lists of rows.
    lf, lt      -- source and target language codes passed to
                   gettranslation().
    maxword     -- upper bound applied to the slices of rows sent for
                   translation.

    Returns (nprof, lems): nprof maps the same keys to rewritten row lists
    and lems maps each translated string back to the original value.
    """
    nprof = {}
    lems = {}
    for i in range(len(dictprofile)) :
        # backticks are the Python 2 spelling of repr(): the key is '1', '2', ...
        prof = dictprofile[`i+1`]
        # lenact: index of the '*****' separator row, else of the '*' row,
        # else the number of rows.
        # presumably the rows before it are the "active" forms -- verify
        try :
            lenact = prof.index([u'*****', u'*', u'*', u'*', u'*', u'*', '', ''])
            lensup = -1
        except ValueError:
            try :
                lenact = prof.index([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
                lensup = 0
            except ValueError:
                lenact = len(prof)
                lensup = 0
        # lensup: distance from lenact to the '*' separator row (or to the
        # end of the list), minus one when a '*****' row was found.
        # presumably the number of "supplementary" rows -- verify
        try :
            lensup += prof.index([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
            lensup = lensup - lenact
        except ValueError:
            lensup += len(prof) - lenact
        if lenact != 0 :
            if lenact > maxword :
                nlenact = maxword
            else :
                nlenact = lenact
            # row 0 is skipped; makenprof() copies it unchanged
            actori = [line[6] for line in prof[1:nlenact]]
            act = [val.replace(u'_', u' ') for val in actori]
            act = gettranslation(act, lf, lt)
            for j, val in enumerate(actori) :
                if act[j] not in lems :
                    lems[act[j]] = val
                else :
                    # same translation already recorded: append '+' until
                    # the key is unique
                    while act[j] in lems :
                        act[j] = act[j] + u"+"
                    lems[act[j]] = val
            nprof[`i+1`] = makenprof(prof, act)

        if lensup != 0 :
            if lensup > maxword :
                nlensup = maxword
            else :
                nlensup = lensup
            supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
            sup = [val.replace(u'_', u' ') for val in supori]
            # blank values are sent as '_' (see treatempty)
            sup = [treatempty(val) for val in sup]
            sup = gettranslation(sup, lf, lt)
            for j, val in enumerate(supori) :
                if sup[j] not in lems :
                    lems[sup[j]] = val
                else :
                    while sup[j] in lems :
                        sup[j] = sup[j] + u"+"
                    lems[sup[j]] = val
            # NOTE(review): nprof[`i+1`] only exists when the lenact branch
            # above ran; with lenact == 0 this raises KeyError.
            nprof[`i+1`].append([u'*****', u'*', u'*', u'*', u'*', u'*', '', ''])
            nprof[`i+1`] += makenprof(prof, sup, deb=lenact)

        # rows after the '*' separator are copied without translation; any
        # error raised here (separator missing, key absent, ...) is ignored
        try :
            lenet = prof.index([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
            nprof[`i+1`].append([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
            nprof[`i+1`] += prof[(lenet+1):]
        except :
            pass
    return nprof, lems
 975
 976 def write_translation_profile(prof, lems, language, dictpathout) :
 977     if os.path.exists(dictpathout['translations.txt']) :
 978         with codecs.open(dictpathout['translations.txt'], 'r', 'utf8') as f :
 979             translist = f.read()
 980         translist = [line.split('\t') for line in translist.splitlines()]
 981     else :
 982         translist = []
 983     toprint = []
 984     toprint.append(['','','','','',''])
 985     toprint.append([u'***', u'nb classes', `len(prof)`, u'***', '', ''])
 986     for i in range(len(prof)) :
 987         toprint.append([u'**', u'classe', `i+1`, u'**', '', ''])
 988         toprint.append([u'****'] + prof[`i+1`][0] + [u'****'])
 989         rest = [[`line[1]`, `line[2]`, `line[3]`, `line[4]`, line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[`i+1`][1:]]
 990         for i, line in enumerate(prof[`i+1`][1:]) :
 991             if line[0] == u'*' :
 992                 rest[i] = [u'*', u'*', u'*', u'*', u'*', u'*']
 993             elif line[0] == u'*****' :
 994                 rest[i] = [u'*****',u'*',u'*', u'*', u'*', u'*']
 995         toprint += rest
 996     with open(dictpathout['translation_profile_%s.csv' % language], 'w') as f :
 997         f.write('\n'.join([';'.join(line) for line in toprint]).encode('utf8'))
 998     with open(dictpathout['translation_words_%s.csv' % language], 'w') as f :
 999         f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]).encode('utf8'))
1000     if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
1001         translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
1002         with open(dictpathout['translations.txt'], 'w') as f :
1003             f.write('\n'.join(['\t'.join(line) for line in translist]).encode('utf8'))
1004
def makesentidict(infile, language) :
    """Turn a tab-separated emotion lexicon into one word list per emotion.

    infile   -- UTF-8 file whose first line names the columns; it must hold
                one column per emotion listed below and a column whose title
                contains '(fr)'.
    language -- not used by the current body.

    Writes /tmp/tgenemo.csv: one tab-separated line per emotion, starting
    with the lower-cased emotion name followed by the lower-cased '(fr)'
    words flagged '1' for it.
    """
    with codecs.open(infile,'r', 'utf8') as f :
        table = f.read()
    table = [row.split('\t') for row in table.splitlines()]
    header = table.pop(0)
    senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
    columns = [header.index(name) for name in senti]
    word_column = header.index([title for title in header if '(fr)' in title][0])
    entries = [(row[word_column].lower(), [row[col] for col in columns]) for row in table]
    groups = []
    for rank, name in enumerate(senti) :
        groups.append([name.lower()] + [word for word, flags in entries if flags[rank] == '1'])
    with open('/tmp/tgenemo.csv', 'w') as f :
        for group in groups :
            f.write('\t'.join(group).encode('utf8') + '\n')
1031
def countsentfromprof(prof, encoding, sentidict) :
    """Read a ';'-separated table of counts and print it at three stages.

    prof      -- path of the file; each line is a label followed by integers.
    encoding  -- codec used to decode the file.
    sentidict -- not used by the current body.

    Prints the split rows, the rows with their counts converted to int, and
    the resulting label -> counts dict. Nothing is returned.
    """
    with codecs.open(prof, 'r', encoding) as f :
        rows = [raw.split(';') for raw in f.read().splitlines()]
    print(rows)
    counts = [[row[0], [int(val) for val in row[1:]]] for row in rows]
    print(counts)
    counts = dict(counts)
    print(counts)
1041
1042 def iratolexico(infile, outfile, encoding) :
1043     with codecs.open(infile, 'r', encoding) as f :
1044         for line in f :
1045             if line.startswith(u'**** ') :
1046                 line = line.split()
1047