iramuteq.org Git - iramuteq/blob - functions.py

   1 #!/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #Author: Pierre Ratinaud
   4 #Copyright (c) 2008-2012 Pierre Ratinaud
   5 #License: GNU/GPL
   6
   7 import wx
   8 import re
   9 from ConfigParser import ConfigParser
  10 from subprocess import Popen, call, PIPE
  11 import thread
  12 import os
  13 import ast
  14 import sys
  15 import csv
  16 import platform
  17 import traceback
  18 import codecs
  19 import locale
  20 import datetime
  21 from copy import copy
  22 from shutil import copyfile
  23 import shelve
  24 import json
  25 #from dialog import BugDialog
  26 import logging
  27
  28 log = logging.getLogger('iramuteq')
  29
  30
  31 indices_simi = [u'cooccurrence' ,'pourcentage de cooccurrence',u'Russel',u'Jaccard', 'Kulczynski1', 'Kulczynski2', 'Mountford', 'Fager', 'simple matching', 'Hamman', 'Faith', 'Tanimoto', 'Dice', 'Phi', 'Stiles', 'Michael', 'Mozley', 'Yule', 'Yule2', 'Ochiai', 'Simpson', 'Braun-Blanquet','Chi-squared', 'Phi-squared', 'Tschuprow', 'Cramer', 'Pearson', 'binomial']
  32
  33
  34
  35 def open_folder(folder):
  36     if sys.platform == "win32":
  37         os.startfile(folder)
  38     else:
  39         opener ="open" if sys.platform == "darwin" else "xdg-open"
  40         #call([opener, folder])
  41         call([u"%s %s &" % (opener, folder)], shell=True)
  42
  43 def normpath_win32(path) :
  44     if not sys.platform == 'win32' :
  45         return path
  46     while '\\\\' in path :
  47         path = path.replace('\\\\', '\\')
  48     if path.startswith('\\') and not path.startswith('\\\\') :
  49         path = '\\' + path
  50     return path
  51
  52 class TGen :
  53     def __init__(self, path = None, encoding = 'utf8'):
  54         self.path = path
  55         self.tgen = {}
  56         self.encoding = encoding
  57
  58     def __getitem__(self, key):
  59         return self.tgen[key]
  60
  61     def read(self, path = None):
  62         if path is None :
  63             path = self.path
  64         with codecs.open(path, 'r', self.encoding) as f :
  65             tgen = f.read()
  66         tgen = [line.split('\t') for line in tgen.splitlines()]
  67         tgen = dict([[line[0], line[1:]] for line in tgen])
  68         self.tgen = tgen
  69         self.path = path
  70
  71     def write(self, path = None):
  72         if path is None :
  73             path = self.path
  74         with open(path, 'w') as f :
  75             f.write('\n'.join(['\t'.join([val] + self.tgen[val]) for val in self.tgen]).encode(self.encoding))
  76
  77     def writetable(self, pathout, tgens, totocc):
  78         etoiles = totocc.keys()
  79         etoiles.sort()
  80         with open(pathout, 'w') as f :
  81             line = '\t'.join([u'tgens'] + etoiles) + '\n'
  82             f.write(line.encode(self.encoding))
  83             for t in tgens :
  84                 line = '\t'.join([t] + [`tgens[t][et]` for et in etoiles]) + '\n'
  85                 f.write(line.encode(self.encoding))
  86             i = 0
  87             totname = 'total'
  88             while totname + `i` in tgens :
  89                 i += 1
  90             totname = totname + `i`
  91             line = '\t'.join([totname] + [`totocc[et]` for et in etoiles]) + '\n'
  92             f.write(line.encode(self.encoding))
  93
  94 class History :
  95     def __init__(self, filein, syscoding = 'utf8') :
  96         self.filein = filein
  97         self.syscoding = syscoding
  98         self.corpus = {}
  99         self.openedcorpus = {}
 100         self.openedmatrix = {}
 101         self.orph = []
 102         self.analyses = {}
 103         self.history = []
 104         self.opened = {}
 105         self.read()
 106
 107     def read(self) :
 108         d = shelve.open(self.filein)
 109         self.history = d.get('history', [])
 110         self.matrix = d.get('matrix', [])
 111         self.ordercorpus = dict([[corpus['uuid'], i] for i, corpus in enumerate(self.history)])
 112         self.corpus = dict([[corpus['uuid'], corpus] for corpus in self.history])
 113         self.analyses = dict([[analyse['uuid'], analyse] for corpus in self.history for analyse in corpus.get('analyses', [])])
 114         self.matrixanalyse = dict([[mat['uuid'], mat] for mat in self.matrix])
 115         self.ordermatrix = dict([[matrix['uuid'], i] for i, matrix in enumerate(self.matrix)])
 116         d.close()
 117
 118     def write(self) :
 119         d = shelve.open(self.filein)
 120         d['history'] = self.history
 121         d['matrix'] = self.matrix
 122         d.close()
 123
 124     def add(self, analyse) :
 125         log.info('add to history %s' % analyse.get('corpus_name', 'pas un corpus'))
 126         tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
 127         if tosave['uuid'] in self.corpus :
 128             log.info('problem : this uuid is already in history : %s' % tosave['uuid'])
 129             return
 130         if analyse.get('corpus', False) :
 131             if analyse['uuid'] in self.analyses :
 132                 return
 133             tosave['corpus'] = analyse['corpus']
 134             tosave['name'] = analyse['name']
 135             acorpus_uuid =  analyse['corpus']
 136             if acorpus_uuid in self.corpus :
 137                 if 'analyses' in self.history[self.ordercorpus[acorpus_uuid]] :
 138                     self.history[self.ordercorpus[acorpus_uuid]]['analyses'].append(tosave)
 139                 else :
 140                     self.history[self.ordercorpus[acorpus_uuid]]['analyses'] = [tosave]
 141             else :
 142                 self.orph.append(tosave)
 143         else :
 144             tosave['corpus_name'] = analyse['corpus_name']
 145             #self.ordercorpus[tosave['uuid']] = len(history)
 146             #self.corpus[tosave['uuid']] = analyse
 147             self.history.append(tosave)
 148         self.write()
 149         self.read()
 150
 151     def addMatrix(self, analyse) :
 152         tosave = analyse
 153         #tosave['matrix_name'] = analyse['matrix_name']
 154         tosave['analyses'] = []
 155         self.matrix.append(tosave)
 156         self.write()
 157         self.read()
 158
 159     def addMatrixAnalyse(self, analyse) :
 160         tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type'], 'matrix' : analyse['matrix']}
 161         tosave['name'] = analyse['name']
 162         if tosave['matrix'] in self.ordermatrix :
 163             self.matrix[self.ordermatrix[tosave['matrix']]]['analyses'].append(tosave)
 164         self.write()
 165         self.read()
 166
 167     def addmultiple(self, analyses) :
 168         log.info('add multiple')
 169         for analyse in analyses :
 170             tosave = {'uuid' : analyse['uuid'], 'ira': analyse['ira'], 'type' : analyse['type']}
 171             corpus = analyse['corpus']
 172             tosave['corpus'] = corpus
 173             tosave['name'] = analyse['name']
 174             if corpus in self.corpus :
 175                 if 'analyses' in self.history[self.ordercorpus[corpus]] :
 176                     self.history[self.ordercorpus[corpus]]['analyses'].append(tosave)
 177                 else :
 178                     self.history[self.ordercorpus[corpus]]['analyses'] = [tosave]
 179         self.write()
 180         self.read()
 181
 182     def delete(self, analyse, corpus = False) :
 183         log.info('delete %s' % analyse.get('name', 'noname'))
 184         if corpus :
 185             self.history.pop(self.ordercorpus[analyse['uuid']])
 186             if analyse['uuid'] in self.openedcorpus :
 187                 del self.openedcorpus[analyse['uuid']]
 188             log.info('delete corpus : %s' % analyse['uuid'])
 189         elif analyse['uuid'] in self.analyses :
 190             todel = [i for i, ana in enumerate(self.corpus[analyse['corpus']]['analyses']) if ana['uuid'] == analyse['uuid']][0]
 191             self.history[self.ordercorpus[analyse['corpus']]]['analyses'].pop(todel)
 192         elif analyse['uuid'] in self.matrixanalyse :
 193             self.matrix = [mat for mat in self.matrix if mat['uuid'] != analyse['uuid']]
 194         elif analyse.get('matrix', False) in self.matrixanalyse :
 195             analyses = self.matrix[self.ordermatrix[analyse['matrix']]]['analyses']
 196             topop = [i for i, val in enumerate(analyses) if analyse['uuid'] == val['uuid']][0]
 197             analyses.pop(topop)
 198             self.matrix[self.ordermatrix[analyse['matrix']]]['analyses'] = analyses
 199         self.write()
 200         self.read()
 201
 202     def addtab(self, analyse) :
 203         self.opened[analyse['uuid']] = analyse
 204
 205     def rmtab(self, analyse) :
 206         del self.opened[analyse['uuid']]
 207
 208     def update(self, analyse) :
 209         if 'matrix_name' in analyse :
 210             self.matrixanalyse[analyse['uuid']].update(analyse)
 211         elif 'corpus_name' in analyse :
 212             self.corpus[analyse['uuid']].update(analyse)
 213         elif 'corpus' in analyse :
 214             self.analyses[analyse['uuid']].update(analyse)
 215         else :
 216             toupdate = [an for an in self.matrixanalyse[analyse['matrix']]['analyses'] if an['uuid'] == analyse['uuid']]
 217             toupdate[0].update(analyse)
 218         self.write()
 219         self.read()
 220
 221     def clean(self) :
 222         corpustodel = [corpus for corpus in self.history if not os.path.exists(corpus['ira'])]
 223         print corpustodel
 224         for corpus in corpustodel :
 225             print 'cleaning :', corpus['corpus_name']
 226             self.delete(corpus, corpus = True)
 227         anatodel = [analyse for corpus in self.history for analyse in corpus.get('analyses', []) if not os.path.exists(analyse.get('ira', '/'))]
 228         for analyse in anatodel :
 229             print 'cleaning :', analyse['name']
 230             self.delete(analyse)
 231
 232     def dostat(self):
 233         todel = {}
 234         tokens = 0
 235         corpusnb = {}
 236         subnb = 0
 237         analysenb = 0
 238         hours = 0
 239         minutes = 0
 240         secondes = 0
 241         ha = 0
 242         ma = 0
 243         sa = 0
 244         for corpus in self.history :
 245             analysenb += len(corpus.get('analyses', []))
 246             analyses = corpus.get('analyses', [])
 247             for analyse in analyses :
 248                 if os.path.exists(analyse['ira']) :
 249                     ana = DoConf(analyse['ira']).getoptions()
 250                     if 'time' in ana :
 251                         time = ana['time'].split()
 252                         ha += int(time[0].replace('h','')) * 3600
 253                         ma += int(time[1].replace('m','')) * 60
 254                         sa += int(time[2].replace('s',''))
 255             if os.path.exists(corpus['ira']) :
 256                 param = DoConf(corpus['ira']).getoptions()
 257                 time = param.get('time','0h 0m 0s')
 258                 time = time.split()
 259                 hours += int(time[0].replace('h','')) * 3600
 260                 minutes += int(time[1].replace('m','')) * 60
 261                 secondes += int(time[2].replace('s',''))
 262                 if param.get('originalpath', False) :
 263                     if param['originalpath'] in corpusnb :
 264                         corpusnb[param['originalpath']] += 1
 265                         tokens += int(param['occurrences'])
 266                     else :
 267                         corpusnb[param['originalpath']] = 1
 268                     #print param
 269                 else :
 270                     subnb += 1
 271             else :
 272                 if corpus['ira'] in todel :
 273                     todel['ira'] += 1
 274                 else :
 275                     todel['ira'] = 1
 276         print u'Nbr total de corpus : %s' % len(self.history)
 277         corpus_nb = len(corpusnb) + len(todel)
 278         print u'Nbr de corpus différents : %s' % corpus_nb
 279         lentodel = len(todel)
 280         print u'Nbr de corpus à supprimer : %s' % lentodel
 281         print u'Nbr de sous corpus : %s' % subnb
 282         print u"Nbr total d'occurrences : %s" % tokens
 283         print u'Moyenne occurrences par corpus : %f' % (tokens/corpus_nb)
 284         print '---------------------'
 285         print u"Nbr total d'analyses : %s" % analysenb
 286         print u'Temps total indexation : %f h' % ((hours+minutes+secondes) / 3600)
 287         print u'Temps total analyses :  %f h' % ((ha+ma+sa) / 3600)
 288
 289     def __str__(self) :
 290         return str(self.history)
 291
 292 class DoConf :
 293     def __init__(self, configfile=None, diff = None, parametres = None) :
 294         self.configfile = configfile
 295         self.conf = ConfigParser()
 296
 297         if configfile is not None :
 298             configfile = normpath_win32(configfile)
 299             self.conf.readfp(codecs.open(configfile, 'r', 'utf8'))
 300         self.parametres = {}
 301         if parametres is not None :
 302             self.doparametres(parametres)
 303
 304     def doparametres(self, parametres) :
 305         return parametres
 306
 307     def getsections(self) :
 308         return self.conf.sections()
 309
 310     def getoptions(self, section = None, diff = None):
 311         parametres = {}
 312         if section is None :
 313             section = self.conf.sections()[0]
 314         for option in self.conf.options(section) :
 315             if self.conf.get(section, option).isdigit() :
 316                 parametres[option] = int(self.conf.get(section, option))
 317             elif self.conf.get(section, option) == 'False' :
 318                 parametres[option] = False
 319             elif self.conf.get(section, option) == 'True' :
 320                 parametres[option] = True
 321             elif self.conf.get(section, option).startswith('(') and self.conf.get(section, option).endswith(')') :
 322                 parametres[option] = ast.literal_eval(self.conf.get(section, option))
 323             elif self.conf.get(section, option).startswith('[') and self.conf.get(section, option).endswith(']') :
 324                 parametres[option] = ast.literal_eval(self.conf.get(section, option))
 325             else :
 326                 parametres[option] = self.conf.get(section, option)
 327         if 'type' not in parametres :
 328             parametres['type'] = section
 329         return parametres
 330
 331     def makeoptions(self, sections, parametres, outfile = None) :
 332         txt = ''
 333         for i, section in enumerate(sections) :
 334             txt += '[%s]\n' % section
 335             if not self.conf.has_section(section) :
 336                 self.conf.add_section(section)
 337             for option in parametres[i] :
 338                 if isinstance(parametres[i][option], int) :
 339                     self.conf.set(section, option, `parametres[i][option]`)
 340                     txt += '%s = %i\n' % (option, parametres[i][option])
 341                 elif isinstance(parametres[i][option], basestring) :
 342                     self.conf.set(section, option, parametres[i][option].encode('utf8'))
 343                     txt += '%s = %s\n' % (option, parametres[i][option])
 344                 elif isinstance(parametres[i][option], wx.Colour) :
 345                     self.conf.set(section, option, str(parametres[i][option]))
 346                     txt += '%s = %s\n' % (option, str(parametres[i][option]))
 347                 elif option == 'analyses' :
 348                     pass
 349                 else :
 350                     self.conf.set(section, option, `parametres[i][option]`)
 351                     txt += '%s = %s\n' % (option, `parametres[i][option]`)
 352         if outfile is None :
 353             outfile = self.configfile
 354         outfile = normpath_win32(outfile)
 355         with open(outfile, 'w') as f :
 356             f.write(txt.encode('utf8'))
 357             #self.conf.write(f)
 358
 359     def totext(self, parametres) :
 360         #txt = ['Corpus']
 361         txt = []
 362         for val in parametres :
 363             if isinstance(parametres[val], int) :
 364                 txt.append(' \t\t: '.join([val, `parametres[val]`]))
 365             elif isinstance(parametres[val], basestring) :
 366                 txt.append(' \t\t: '.join([val, parametres[val]]))
 367             elif val in ['listet', 'stars'] :
 368                 pass
 369             else :
 370                 txt.append(' \t\t: '.join([val, `parametres[val]`]))
 371         return '\n'.join(txt)
 372
 373
 374 def write_tab(tab, fileout) :
 375         writer = csv.writer(open(fileout, 'wb'), delimiter=';', quoting = csv.QUOTE_NONNUMERIC)
 376         writer.writerows(tab)
 377
 378 class BugDialog(wx.Dialog):
 379     def __init__(self, *args, **kwds):
 380         # begin wxGlade: MyDialog.__init__
 381         kwds["style"] = wx.DEFAULT_DIALOG_STYLE | wx.STAY_ON_TOP
 382         kwds["size"] = wx.Size(500, 200)
 383         wx.Dialog.__init__(self, *args, **kwds)
 384         self.SetTitle(kwds['title'])
 385         self.text_ctrl_1 = wx.TextCtrl(self, -1, "", style=wx.TE_MULTILINE)
 386         self.text_ctrl_1.SetBackgroundColour('#DDE8EB')
 387         self.button_1 = wx.Button(self, wx.ID_OK, "")
 388
 389         self.__set_properties()
 390         self.__do_layout()
 391         # end wxGlade
 392
 393     def __set_properties(self):
 394         # begin wxGlade: MyDialog.__set_properties
 395         self.SetMinSize(wx.Size(500, 200))
 396         self.text_ctrl_1.SetMinSize(wx.Size(500, 200))
 397
 398         # end wxGlade
 399
 400     def __do_layout(self):
 401         # begin wxGlade: MyDialog.__do_layout
 402         sizer_1 = wx.BoxSizer(wx.VERTICAL)
 403         sizer_1.Add(self.text_ctrl_1, 1, wx.EXPAND, 0)
 404         sizer_1.Add(self.button_1, 0, wx.ALIGN_CENTER_HORIZONTAL, 0)
 405         self.SetSizer(sizer_1)
 406         sizer_1.Fit(self)
 407         self.Layout()
 408
 409
 410 def CreateIraFile(DictPathOut, clusternb, corpname='corpus_name', section = 'analyse'):
 411     AnalyseConf = ConfigParser()
 412     AnalyseConf.read(DictPathOut['ira'])
 413     AnalyseConf.add_section(section)
 414     date = datetime.datetime.now().ctime()
 415     AnalyseConf.set(section, 'date', str(date))
 416     AnalyseConf.set(section, 'clusternb', clusternb)
 417     AnalyseConf.set(section, 'corpus_name', corpname)
 418
 419     fileout = open(DictPathOut['ira'], 'w')
 420     AnalyseConf.write(fileout)
 421     fileout.close()
 422
 423 def sortedby(list, direct, *indices):
 424
 425     """
 426         sortedby: sort a list of lists (e.g. a table) by one or more indices
 427                   (columns of the table) and return the sorted list
 428
 429         e.g.
 430          for list = [[2,3],[1,2],[3,1]]:
 431          sortedby(list,1) will return [[3, 1], [1, 2], [2, 3]],
 432          sortedby(list,0) will return [[1, 2], [2, 3], [3, 1]]
 433     """
 434
 435     nlist = map(lambda x, indices=indices:
 436                  map(lambda i, x=x: x[i], indices) + [x],
 437                  list)
 438     if direct == 1:
 439         nlist.sort()
 440     elif direct == 2:
 441         nlist.sort(reverse=True)
 442     return map(lambda l: l[-1], nlist)
 443
 444 def add_type(line, dictlem):
 445     if line[4] in dictlem:
 446         line.append(dictlem[line[4]])
 447     else :
 448         line.append('')
 449     return line
 450
 451 def treat_line_alceste(i, line) :
 452     if line[0] == '*' or line[0] == '*****' :
 453         return line + ['']
 454     if line[5] == 'NA':
 455         print 'NA', line[5]
 456         pass
 457     elif float(line[5].replace(',', '.')) < 0.0001:
 458         line[5] = '< 0,0001'
 459     elif float(line[5].replace(',', '.')) > 0.05:
 460         line[5] = 'NS (%s)' % str(float(line[5].replace(',', '.')))[0:7]
 461     else:
 462         line[5] = str(float(line[5].replace(',', '.')))[0:7]
 463     return [i, int(line[0]), int(line[1]), float(line[2]), float(line[3]), line[6], line[4], line[5]]
 464
 465 def ReadProfileAsDico(File, Alceste=False, encoding = sys.getdefaultencoding()):
 466     dictlem = {}
 467     print 'lecture des profiles'
 468     FileReader = codecs.open(File, 'r', encoding)
 469     Filecontent = FileReader.readlines()
 470     FileReader.close()
 471     DictProfile = {}
 472     count = 0
 473     #rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace(',', '.').replace('\r','').split(';') for row in Filecontent]
 474     rows = [row.replace('\n', '').replace("'", '').replace('\"', '').replace('\r','').split(';') for row in Filecontent]
 475     rows.pop(0)
 476     ClusterNb = rows[0][2]
 477     rows.pop(0)
 478     clusters = [row[2] for row in rows if row[0] == u'**']
 479     valclusters = [row[1:4] for row in rows if row[0] == u'****']
 480     lp = [i for i, line in enumerate(rows) if line[0] == u'****']
 481     prof = [rows[lp[i] + 1:lp[i+1] - 1] for i in range(0, len(lp)-1)] + [rows[lp[-1] + 1:len(rows)]]
 482     if Alceste :
 483         prof = [[add_type(row, dictlem) for row in pr] for pr in prof]
 484         prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
 485     else :
 486         prof = [[line + [''] for line in pr] for pr in prof]
 487         prof = [[treat_line_alceste(i,line) for i, line in enumerate(pr)] for pr in prof]
 488     for i, cluster in enumerate(clusters):
 489         DictProfile[cluster] = [valclusters[i]] + prof[i]
 490     return DictProfile
 491
 492 def GetTxtProfile(dictprofile, cluster_size) :
 493     proflist = []
 494     for classe in range(0, len(dictprofile)) :
 495         prof = dictprofile[str(classe + 1)]
 496         clinfo = cluster_size[classe]
 497         proflist.append('\n'.join([' '.join(['classe %i' % (classe + 1), '-', '%s uce sur %s - %s%%' % (clinfo[0], clinfo[1], clinfo[2])]), '\n'.join(['%5s|%5s|%6s|%6s|%8s|%8s|%20s\t%10s' % tuple([str(val) for val in line]) for line in prof if len(line)==8])]))
 498     return '\n\n'.join(proflist)
 499
 500 def formatExceptionInfo(maxTBlevel=5):
 501     cla, exc, trbk = sys.exc_info()
 502     try :
 503         excName = cla.__name__
 504     except :
 505         excName = 'None'
 506     try:
 507         excArgs = exc.args[0]
 508     except :
 509         excArgs = "<no args>"
 510     excTb = traceback.format_tb(trbk, maxTBlevel)
 511     return (excName, excArgs, excTb)
 512
 513
 514 #fonction des etudiants de l'iut
 515 def decoupercharact(chaine, longueur, longueurOptimale, separateurs = None) :
 516     """
 517         on part du dernier caractère, et on recule jusqu'au début de la chaîne.
 518         Si on trouve un '$', c'est fini.
 519         Sinon, on cherche le meilleur candidat. C'est-à-dire le rapport poids/distance le plus important.
 520     """
 521     separateurs = [[u'.', 60.0], [u'?', 60.0], [u'!', 60.0], [u'£$£', 60], [u':', 50.0], [u';', 40.0], [u',', 10.0], [u' ', 0.1]]
 522     trouve = False                 # si on a trouvé un bon séparateur
 523     iDecoupe = 0                # indice du caractere ou il faut decouper
 524
 525     # on découpe la chaine pour avoir au maximum 240 caractères
 526     longueur = min(longueur, len(chaine) - 1)
 527     chaineTravail = chaine[:longueur + 1]
 528     nbCar = longueur
 529     meilleur = ['', 0, 0]        # type, poids et position du meilleur separateur
 530
 531     # on vérifie si on ne trouve pas un '$'
 532     indice = chaineTravail.find(u'$')
 533     if indice > -1:
 534         trouve = True
 535         iDecoupe = indice
 536
 537     # si on ne trouve rien, on cherche le meilleur séparateur
 538     if not trouve:
 539         while nbCar >= 0:
 540             caractere = chaineTravail[nbCar]
 541             distance = abs(longueurOptimale - nbCar) + 1
 542             meilleureDistance = abs(longueurOptimale - meilleur[2]) + 1
 543
 544             # on vérifie si le caractére courant est une marque de ponctuation
 545             for s in separateurs:
 546                 if caractere == s[0]:
 547                     # si c'est une ponctuation
 548
 549                     if s[1] / distance > float(meilleur[1]) / meilleureDistance:
 550                         # print nbCar, s[0]
 551                         meilleur[0] = s[0]
 552                         meilleur[1] = s[1]
 553                         meilleur[2] = nbCar
 554                         trouve = True
 555                         iDecoupe = nbCar
 556
 557                     # et on termine la recherche
 558                     break
 559
 560             # on passe au caractère précédant
 561             nbCar = nbCar - 1
 562
 563     # si on a trouvé
 564     if trouve:
 565         fin = chaine[iDecoupe + 1:]
 566         retour = chaineTravail[:iDecoupe]
 567         return len(retour) > 0, retour.split(), fin
 568     # si on a rien trouvé
 569     return False, chaine.split(), ''
 570
 571
# User-facing messages keyed by error code.  BugReport looks up the text of
# an Exception in this table; when the text has the form "<code> <line>",
# the line number is appended to the message.
exceptions = {'paragrapheOT' : u"Un problème de formatage (présence d'un marqueur de paragraphe (-*) en dehors d'un texte) est survenu à la ligne ",
              'EmptyText' : u"Texte vide (probablement un problème de formatage du corpus). Le problème est apparu à la ligne ",
              'CorpusEncoding' : u"Problème d'encodage.",
              'TextBeforeTextMark' : u"Problème de formatage : du texte avant le premier marqueur de texte (****). Le problème est survenu à la ligne ",
              'MissingAnalyse' : u'Aucun fichier à cet emplacement :\n',
}
 578
 579 def BugReport(parent, error = None):
 580     for ch in parent.GetChildren():
 581         if "<class 'wx._windows.ProgressDialog'>" == str(type(ch)):
 582             ch.Destroy()
 583     excName, exc, excTb = formatExceptionInfo()
 584     if excName == 'Exception' :
 585         print exc
 586         if len(exc.split()) == 2 :
 587             mss, linenb = exc.split()
 588             if mss in exceptions :
 589                 txt = exceptions[mss] + linenb
 590             else :
 591                 txt = exc
 592         else :
 593             if exc in exceptions :
 594                 txt = exceptions[exc]
 595             else :
 596                 txt = exc
 597         title = "Information"
 598     else :
 599         txt = u'            !== BUG ==!       \n'
 600         txt += u'*************************************\n'
 601         txt += '\n'.join(excTb).replace('    ', ' ')
 602         txt += excName + '\n'
 603         txt += `exc`
 604         title = "Bug"
 605
 606     dial = BugDialog(parent, **{'title' : title})
 607     if 'Rerror' in dir(parent) :
 608         txt += parent.Rerror
 609         parent.Rerror = ''
 610     log.info(txt)
 611     dial.text_ctrl_1.write(txt)
 612     dial.CenterOnParent()
 613     dial.ShowModal()
 614     dial.Destroy()
 615
 616 def PlaySound(parent):
 617     if parent.pref.getboolean('iramuteq', 'sound') :
 618         try:
 619             if "gtk2" in wx.PlatformInfo:
 620                 error = Popen(['aplay','-q',os.path.join(parent.AppliPath,'son_fin.wav')])
 621             else :
 622                 sound = wx.Sound(os.path.join(parent.AppliPath, 'son_fin.wav'))
 623                 sound.Play(wx.SOUND_SYNC)
 624         except :
 625             print 'pas de son'
 626
 627 def ReadDicoAsDico(dicopath):
 628     with codecs.open(dicopath, 'r', 'UTF8') as f:
 629         content = f.readlines()
 630     lines = [line.rstrip('\n\r').replace(u'\n', '').replace('"', '').split('\t') for line in content if line != u'']
 631     return dict([[line[0], line[1:]] for line in lines])
 632
 633 def ReadLexique(parent, lang = 'french', filein = None):
 634     if lang != 'other' :
 635         if filein is None :
 636             parent.lexique = ReadDicoAsDico(parent.DictPath.get(lang, 'french'))
 637         else :
 638             parent.lexique = ReadDicoAsDico(filein)
 639     else :
 640         if filein is None :
 641             parent.lexique = {}
 642         else :
 643             parent.lexique = ReadDicoAsDico(filein)
 644
 645 def ReadList(filein, encoding = sys.getdefaultencoding(), sep = ';'):
 646     #file = open(filein)
 647     with codecs.open(filein, 'r', encoding) as f :
 648         content = f.read()
 649     content = [line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.').split(sep) for line in content.splitlines()]
 650     #file = codecs.open(filein, 'r', encoding)
 651     #content = file.readlines()
 652     #file.close()
 653     first = content.pop(0)
 654     #first = first.replace('\n', '').replace('\r','').replace('\"', '').split(sep)
 655     dict = {}
 656     i = 0
 657     for line in content:
 658         #line = line.replace('\n', '').replace('\r','').replace('\"', '').replace(',', '.')
 659         #line = line.split(';')
 660         nline = [line[0]]
 661         for val in line[1:]:
 662             if val == u'NA' :
 663                 don = ''
 664             else:
 665                 try:
 666                     don = int(val)
 667                 except:
 668                     don = float('%.5f' % float(val))
 669             nline.append(don)
 670         dict[i] = nline
 671         i += 1
 672     return dict, first
 673
 674 def exec_RCMD(rpath, command) :
 675     log.info('R CMD INSTALL %s' % command)
 676     rpath = rpath.replace('\\','\\\\')
 677     error = call(["%s" % rpath, 'CMD', 'INSTALL', "%s" % command])
 678     return error
 679
 680 def exec_rcode(rpath, rcode, wait = True, graph = False):
 681     log.info("R Script : %s" % rcode)
 682     needX11 = False
 683     if sys.platform == 'darwin' :
 684         try :
 685             macversion = platform.mac_ver()[0].split('.')
 686             if int(macversion[1]) < 5 :
 687                 needX11 = True
 688             else :
 689                 needX11 = False
 690         except :
 691             needX11 = False
 692
 693     rpath = rpath.replace('\\','\\\\')
 694     env = os.environ.copy()
 695     if sys.platform == 'darwin' and 'LC_ALL' not in env:
 696         env['LC_ALL'] = 'en_US.UTF-8'
 697     if not graph :
 698         if wait :
 699             if sys.platform == 'win32':
 700                 error = call(["%s" % rpath, "--vanilla","--slave","-f", "%s" % rcode])
 701             else :
 702                 error = call([rpath, '--slave', "--vanilla", "-f %s" % rcode, "--encoding=UTF-8"], env = env)
 703             return error
 704         else :
 705             if sys.platform == 'win32':
 706                 pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
 707             else :
 708                 pid = Popen([rpath, '--slave', "--vanilla", "-f %s" % rcode, "--encoding=UTF-8"], stderr = PIPE, env = env)
 709             return pid
 710     else :
 711         if wait :
 712             if sys.platform == 'win32':
 713                 error = call(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
 714             elif sys.platform == 'darwin' and needX11:
 715                 os.environ['DISPLAY'] = ':0.0'
 716                 error = call([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], env = env)
 717             else :
 718                 error = call([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], env = env)
 719             return error
 720         else :
 721             if sys.platform == 'win32':
 722                 pid = Popen(["%s" % rpath, '--vanilla','--slave','-f', "%s" % rcode])
 723             elif sys.platform == 'darwin' and needX11:
 724                 os.environ['DISPLAY'] = ':0.0'
 725                 pid = Popen([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], stderr = PIPE, env = env)
 726             else :
 727                 pid = Popen([rpath, '--vanilla','--slave',"-f %s" % rcode, "--encoding=UTF-8"], stderr = PIPE, env = env)
 728             return pid
 729
 730 def check_Rresult(parent, pid) :
 731     if isinstance(pid, Popen) :
 732         if pid.returncode != 0 :
 733             error = pid.communicate()
 734             error = [str(error[0]), error[1]]
 735             if error[1] is None :
 736                 error[1] = 'None'
 737             parent.Rerror = '\n'.join([str(pid.returncode), '\n'.join(error)])
 738             try :
 739                 raise Exception('\n'.join([u'Erreur R', '\n'.join(error[1:])]))
 740             except :
 741                 BugReport(parent)
 742             return False
 743         else :
 744             return True
 745     else :
 746         if pid != 0 :
 747             try :
 748                 raise Exception(u'Erreur R')
 749             except :
 750                 BugReport(parent)
 751             return False
 752         else :
 753             return True
 754
 755
 756 def launchcommand(mycommand):
 757     Popen(mycommand)
 758
 759 def print_liste(filename,liste):
 760     with open(filename,'w') as f :
 761         for graph in liste :
 762             f.write(';'.join(graph).encode(sys.getdefaultencoding(), errors='replace')+'\n')
 763
 764 def read_list_file(filename, encoding = sys.getdefaultencoding()):
 765     with codecs.open(filename,'rU', encoding) as f :
 766         content=f.readlines()
 767         ncontent=[line.replace('\n','').split(';') for line in content if line.strip() != '']
 768     return ncontent
 769
 770 def progressbar(self, maxi) :
 771     ira = wx.GetApp().GetTopWindow()
 772     parent = ira
 773     try :
 774         maxi = int(maxi)
 775     except :
 776         maxi = 1
 777     prog = wx.ProgressDialog("Traitements",
 778                              "Veuillez patienter...",
 779                              maximum=maxi,
 780                              parent=parent,
 781                              style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE | wx.PD_ELAPSED_TIME | wx.PD_CAN_ABORT
 782                              )
 783     prog.SetSize((400,150))
 784     #prog.SetIcon(ira._icon)
 785     return prog
 786
 787 def treat_var_mod(variables) :
 788     var_mod = {}
 789     variables = list(set(variables))
 790     varmod = [variable.split('_') for variable in variables]
 791     vars = list(set([var[0] for var in varmod if len(var) >=2]))
 792     for var in vars :
 793         mods = ['_'.join(v) for v in varmod if v[0] == var]
 794         var_mod[var] = mods
 795
 796 #     for variable in variables :
 797 #         if u'_' in variable :
 798 #             forme = variable.split(u'_')
 799 #             var = forme[0]
 800 #             mod = forme[1]
 801 #             if not var in var_mod :
 802 #                 var_mod[var] = [variable]
 803 #             else :
 804 #                 if not mod in var_mod[var] :
 805 #                     var_mod[var].append(variable)
 806     return var_mod
 807
 808 def doconcorde(corpus, uces, mots, uci = False) :
 809     if not uci :
 810         ucestxt1 = [row for row in corpus.getconcorde(uces)]
 811     else :
 812         ucestxt1 = [row for row in corpus.getuciconcorde(uces)]
 813     ucestxt1 = dict(ucestxt1)
 814     ucestxt = []
 815     ucis_txt = []
 816     listmot = [corpus.getlems()[lem].formes for lem in mots]
 817     listmot = [corpus.getforme(fid).forme for lem in listmot for fid in lem]
 818     mothtml = ['<font color=red><b>%s</b></font>' % mot for mot in listmot]
 819     dmots = dict(zip(listmot, mothtml))
 820     for uce in uces :
 821         ucetxt = ucestxt1[uce].split()
 822         ucetxt = ' '.join([dmots.get(mot, mot) for mot in ucetxt])
 823         if not uci :
 824             uciid = corpus.getucefromid(uce).uci
 825             ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[corpus.getucefromid(uce).uci].etoiles) + '<a href="%i_%i"> *%i_%i</a></b></p>' % (uciid, uce, uciid, uce))
 826         else :
 827             ucis_txt.append('<p><b>' + ' '.join(corpus.ucis[uce].etoiles) + '</b></p>')
 828         ucestxt.append(ucetxt)
 829     return ucis_txt, ucestxt
 830
 831
def getallstcarac(corpus, analyse) :
   """Read the profile file of an analysis and print it.

   corpus : not used by this function
   analyse : mapping whose 'ira' entry is passed to PathOut

   Returns nothing.
   """
   pathout = PathOut(analyse['ira'])
   # NOTE(review): neither `Alceste` nor `self` is defined in this function;
   # unless they exist as module-level names this line raises NameError.
   # Looks like unfinished debugging code - confirm before relying on it.
   profils =  ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, self.encoding)
   print profils
 836
 837 def read_chd(filein, fileout):
 838     with open(filein, 'r') as f :
 839         content = f.read()
 840     #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
 841     content = [line.split('\t') for line in content.splitlines()]
 842     chd = {'name':1, 'children':[]}
 843     mere={}
 844     for i, line in enumerate(content) :
 845         if i == 0 :
 846             chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
 847             mere[line[1]] = chd['children'][0]
 848             mere[line[2]] = chd['children'][1]
 849         elif not i % 2 :
 850             if 'children' in mere[line[0]]:
 851                 mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
 852                 mere[line[1]] = mere[line[0]]['children'][-1]
 853                 mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
 854                 mere[line[2]] = mere[line[0]]['children'][-1]
 855             else :
 856                 mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
 857                 mere[line[1]] = mere[line[0]]['children'][-2]
 858                 mere[line[2]] = mere[line[0]]['children'][-1]
 859     with open(fileout, 'w') as f :
 860         f.write(json.dumps(chd))
 861
 862
 863 translation_languages = {"Afrikaans":"af", "Albanian":"sq", "Amharic":"am", "Arabic":"ar", "Armenian":"hy", "Azeerbaijani":"az", "Basque":"eu", "Belarusian":"be", "Bengali":"bn", "Bosnian":"bs", "Bulgarian":"bg", "Catalan":"ca", "Cebuano":"ceb", "Chichewa":"ny", "Chinese (Simplified)":"zh-CN", "Chinese (Traditional)":"zh-TW", "Corsican":"co", "Croatian":"hr", "Czech":"cs", "Danish":"da", "Dutch":"nl", "English":"en", "Esperanto":"eo", "Estonian":"et", "Filipino":"tl", "Finnish":"fi", "French":"fr", "Frisian":"fy", "Galician":"gl", "Georgian":"ka", "German":"de", "Greek":"el", "Gujarati":"gu", "Haitian Creole":"ht", "Hausa":"ha", "Hawaiian":"haw", "Hebrew":"iw", "Hindi":"hi", "Hmong":"hmn ", "Hungarian":"hu", "Icelandic":"is", "Igbo":"ig", "Indonesian":"id", "Irish":"ga", "Italian":"it", "Japanese":"ja", "Javanese":"jw", "Kannada":"kn", "Kazakh":"kk", "Khmer":"km", "Korean":"ko", "Kurdish":"ku", "Kyrgyz":"ky", "Lao":"lo", "Latin":"la", "Latvian":"lv", "Lithuanian":"lt", "Luxembourgish":"lb", "Macedonian":"mk", "Malagasy":"mg", "Malay":"ms", "Malayalam":"ml", "Maltese":"mt", "Maori":"mi", "Marathi":"mr", "Mongolian":"mn", "Burmese":"my", "Nepali":"ne", "Norwegian":"no", "Pashto":"ps", "Persian":"fa", "Polish":"pl", "Portuguese":"pt", "Punjabi":"ma", "Romanian":"ro", "Russian":"ru", "Samoan":"sm", "Scots Gaelic":"gd", "Serbian":"sr", "Sesotho":"st", "Shona":"sn", "Sindhi":"sd", "Sinhala":"si", "Slovak":"sk", "Slovenian":"sl", "Somali":"so", "Spanish":"es", "Sundanese":"su", "Swahili":"sw", "Swedish":"sv", "Tajik":"tg", "Tamil":"ta", "Telugu":"te", "Thai":"th", "Turkish":"tr", "Ukrainian":"uk", "Urdu":"ur", "Uzbek":"uz", "Vietnamese":"vi", "Welsh":"cy", "Xhosa":"xh", "Yiddish":"yi", "Yoruba":"yo", "Zulu":"zu", }
 864
 865
def gettranslation(words, lf, lt) :
    """Translate a list of words with one request to translate.googleapis.com.

    words : list of strings (presumably unicode, they are encoded to utf8);
            they are joined with newlines and sent as the ``q`` parameter
    lf : source language code (``sl`` parameter of the URL)
    lt : target language code (``tl`` parameter of the URL)

    Returns a list built from the first element of each entry of ``data[0]``
    in the json reply, in which apostrophes, spaces and hyphens are replaced
    by '_', ' | ' by '|' and newlines are removed.
    """
    import urllib2
    import json
    # User-Agent header sent with the request
    agent = {'User-Agent':
    "Mozilla/4.0 (\
    compatible;\
    MSIE 6.0;\
    Windows NT 5.1;\
    SV1;\
    .NET CLR 1.1.4322;\
    .NET CLR 2.0.50727;\
    .NET CLR 3.0.04506.30\
    )"}
    base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
    # debug output: number of words sent
    print len(words)
    # all the words travel in the URL itself, url-quoted
    totrans = urllib2.quote('\n'.join(words).encode('utf8'))
    link = base_link % (lf, lt, totrans)
    request = urllib2.Request(link, headers=agent)
    raw_data = urllib2.urlopen(request).read()
    data = json.loads(raw_data)
    # NOTE(review): json.loads normally returns unicode strings already, so
    # .decode('utf8') presumably relies on the process default encoding -
    # confirm. Nothing here checks that the reply has one entry per word.
    return [line[0].decode('utf8').replace(u"'", u'_').replace(u' | ', u'|').replace(u' ', u'_').replace(u'-',u'_').replace(u'\n','') for line in data[0]]
 887
 888 def makenprof(prof, trans, deb=0) :
 889     nprof=[]
 890     if deb == 0 :
 891         nprof.append(prof[0])
 892     for i, val in enumerate(trans) :
 893         line = prof[deb+i+1][:]
 894         line[6] = val
 895         nprof.append(line)
 896     return nprof
 897
 898 def treatempty(val) :
 899     if val.strip() == '' :
 900         return '_'
 901     else :
 902         return val
 903
def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 50) :
    """Translate the words (column 6) of every class profile.

    corpus : not used by this function
    dictprofile : dict of profiles keyed by the class number as a string
                  ('1', '2', ...); each profile is a list of rows in which
                  two separator rows (starting with u'*****' and u'*') may
                  split the profile into sections
    lf : source language code given to gettranslation
    lt : target language code given to gettranslation
    maxword : upper bound used when slicing each translated section

    Returns (nprof, lems): nprof holds the rewritten profiles under the same
    keys, lems maps each translated word to its original value (a '+' is
    appended to a translation until it is not already a key of lems).
    """
    nprof = {}
    lems = {}
    for i in range(len(dictprofile)) :
        # profiles are keyed by the repr of the 1-based class number
        prof = dictprofile[`i+1`]
        # lenact : index of the first separator row found (u'*****' first,
        # then u'*'), or len(prof) when there is none
        try :
            lenact = prof.index([u'*****', u'*', u'*', u'*', u'*', u'*', '', ''])
            lensup = -1
        except ValueError:
            try :
                lenact = prof.index([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
                lensup = 0
            except ValueError:
                lenact = len(prof)
                lensup = 0
        # lensup : number of rows between the u'*****' row and the u'*' row
        # (or the end of the profile); the -1 above accounts for the
        # separator row itself
        try :
            lensup += prof.index([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
            lensup = lensup - lenact
        except ValueError:
            lensup += len(prof) - lenact
        if lenact != 0 :
            if lenact > maxword :
                nlenact = maxword
            else :
                nlenact = lenact
            # first section (presumably the active forms): row 0 is skipped
            actori = [line[6] for line in prof[1:nlenact]]
            act = [val.replace(u'_', u' ') for val in actori]
            act = gettranslation(act, lf, lt)
            for j, val in enumerate(actori) :
                if act[j] not in lems :
                    lems[act[j]] = val
                else :
                    # make the translation unique before using it as a key
                    while act[j] in lems :
                        act[j] = act[j] + u"+"
                    lems[act[j]] = val
            nprof[`i+1`] = makenprof(prof, act)

        if lensup != 0 :
            if lensup > maxword :
                nlensup = maxword
            else :
                nlensup = lensup
            # second section (presumably the supplementary forms)
            # NOTE(review): this slice yields nlensup - 1 rows, so the last
            # row of the section is left out even when lensup <= maxword -
            # confirm whether that is intended.
            supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
            sup = [val.replace(u'_', u' ') for val in supori]
            sup = [treatempty(val) for val in sup]
            sup = gettranslation(sup, lf, lt)
            for j, val in enumerate(supori) :
                if sup[j] not in lems :
                    lems[sup[j]] = val
                else :
                    while sup[j] in lems :
                        sup[j] = sup[j] + u"+"
                    lems[sup[j]] = val
            # NOTE(review): nprof[`i+1`] only exists if the branch above ran
            nprof[`i+1`].append([u'*****', u'*', u'*', u'*', u'*', u'*', '', ''])
            nprof[`i+1`] += makenprof(prof, sup, deb=lenact)

        # rows after the u'*' separator are copied without translation; any
        # exception raised here (not only a missing separator) is ignored
        try :
            lenet = prof.index([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
            nprof[`i+1`].append([u'*', u'*', u'*', u'*', u'*', u'*', '', ''])
            nprof[`i+1`] += prof[(lenet+1):]
        except :
            pass
    return nprof, lems
 967
 968 def write_translation_profile(prof, lems, language, dictpathout) :
 969     if os.path.exists(dictpathout['translations.txt']) :
 970         with codecs.open(dictpathout['translations.txt'], 'r', 'utf8') as f :
 971             translist = f.read()
 972         translist = [line.split('\t') for line in translist.splitlines()]
 973     else :
 974         translist = []
 975     toprint = []
 976     toprint.append(['','','','','',''])
 977     toprint.append([u'***', u'nb classes', `len(prof)`, u'***', '', ''])
 978     for i in range(len(prof)) :
 979         toprint.append([u'**', u'classe', `i+1`, u'**', '', ''])
 980         toprint.append([u'****'] + prof[`i+1`][0] + [u'****'])
 981         rest = [[`line[1]`, `line[2]`, `line[3]`, `line[4]`, line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[`i+1`][1:]]
 982         for i, line in enumerate(prof[`i+1`][1:]) :
 983             if line[0] == u'*' :
 984                 rest[i] = [u'*', u'*', u'*', u'*', u'*', u'*']
 985             elif line[0] == u'*****' :
 986                 rest[i] = [u'*****',u'*',u'*', u'*', u'*', u'*']
 987         toprint += rest
 988     with open(dictpathout['translation_profile_%s.csv' % language], 'w') as f :
 989         f.write('\n'.join([';'.join(line) for line in toprint]).encode('utf8'))
 990     with open(dictpathout['translation_words_%s.csv' % language], 'w') as f :
 991         f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]).encode('utf8'))
 992     if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
 993         translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
 994         with open(dictpathout['translations.txt'], 'w') as f :
 995             f.write('\n'.join(['\t'.join(line) for line in translist]).encode('utf8'))
 996
 997 def makesentidict(infile, language) :
 998     #'/home/pierre/workspace/iramuteq/dev/langues/NRC/NRC-Emotion-Lexicon.csv'
 999     with codecs.open(infile,'r', 'utf8') as f :
1000         content = f.read()
1001     content = [line.split('\t') for line in content.splitlines()]
1002     titles = content.pop(0)
1003     senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
1004     sentid = {}
1005     for sent in senti :
1006         sentid[sent] = titles.index(sent)
1007     frtitle = [val for val in titles if '(fr)' in val]
1008     frid = titles.index(frtitle[0])
1009     sentidict = [[line[frid].lower(), [line[sentid[sent]] for sent in senti]] for line in content]
1010     pos = ['positive'] + [line[0] for line in sentidict if line[1][0] == '1']
1011     neg = ['negative'] + [line[0] for line in sentidict if line[1][1] == '1']
1012     anger = ['anger'] + [line[0] for line in sentidict if line[1][2] == '1']
1013     anticipation = ['anticipation'] + [line[0] for line in sentidict if line[1][3] == '1']
1014     disgust = ['disgust'] + [line[0] for line in sentidict if line[1][4] == '1']
1015     fear = ['fear'] + [line[0] for line in sentidict if line[1][5] == '1']
1016     joy = ['joy'] + [line[0] for line in sentidict if line[1][6] == '1']
1017     sadness = ['sadness'] + [line[0] for line in sentidict if line[1][7] == '1']
1018     surprise = ['surprise'] + [line[0] for line in sentidict if line[1][8] == '1']
1019     trust = ['trust'] + [line[0] for line in sentidict if line[1][9] == '1']
1020     with open('/tmp/tgenemo.csv', 'w') as f :
1021         for val in [pos, neg, anger, anticipation, disgust, fear, joy, sadness, surprise, trust] :
1022             f.write('\t'.join(val).encode('utf8') + '\n')
1023
1024 def countsentfromprof(prof, encoding, sentidict) :
1025     with codecs.open(prof, 'r', encoding) as f :
1026         content = f.read()
1027     content = [line.split(';') for line in content.splitlines()]
1028     print content
1029     content = [[line[0], [int(val) for val in line[1:]]] for line in content]
1030     print content
1031     content = dict(content)
1032     print content
1033
1034 def iratolexico(infile, outfile, encoding) :
1035     with codecs.open(infile, 'r', encoding) as f :
1036         for line in f :
1037             if line.startswith(u'**** ') :
1038                 line = line.split()
1039