1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2008-2020 Pierre Ratinaud
4 #modification pour python 3 : Laurent Mérat, 6x7 - mai 2020
8 Removes HTML or XML character references and entities from a text string.
10 @param text The HTML (or XML) source text.
11 @return The plain text, as a Unicode string, if necessary.
14 #------------------------------------
15 # import des modules python
16 #------------------------------------
27 from uuid import uuid4
30 #------------------------------------
31 # import des fichiers du projet
32 #------------------------------------
33 from functions import DoConf
34 from chemins import PathOut
37 log = logging.getLogger('iramuteq.tableau')
42 #apos is not in the dictionary
43 html.entities.name2codepoint['apos'] = ord("'")
49 return chr(int(text[3:-1], 16))
51 return chr(int(text[2:-1]))
56 text = chr(html.entities.name2codepoint[text[1:-1]])
59 return text # leave as is
60 return re.sub("&#?\w+;", fixup, text)
62 def UpdateDico(Dico, word, line):
65 Dico[word][1].append(line)
67 Dico[word] = [1, [line]]
69 def copymatrix(tableau):
70 log.info('copy matrix')
71 copymat = Tableau(tableau.parent, parametres = tableau.parametres)
72 copymat.linecontent = copy(tableau.linecontent)
73 copymat.csvtable = copy(tableau.csvtable)
74 copymat.pathout = copy(tableau.pathout)
75 copymat.colnames = copy(tableau.colnames)
76 copymat.rownb = copy(tableau.rownb)
77 copymat.colnb = copy(tableau.colnb)
78 if copymat.csvtable is None :
85 def __init__(self, parent, filename = '', filetype = 'csv', encodage = 'utf-8', parametres = None) :
87 if parametres is None :
88 self.parametres = DoConf(self.parent.ConfigPath['matrix']).getoptions('matrix')
89 self.parametres['pathout'] = PathOut(filename, 'matrix').mkdirout()
90 self.parametres['originalpath'] = filename
91 self.parametres['filetype'] = filetype
92 self.parametres['encodage'] = encodage
93 #self.parametre['pathout'] = os.path.dirname(os.path.abspath(filename))
94 self.parametres['mineff'] = 3
95 self.parametres['syscoding'] = sys.getdefaultencoding()
96 self.parametres['type'] = 'matrix'
97 self.parametres['matrix_name'] = os.path.basename(filename)
98 self.parametres['uuid'] = str(uuid4())
99 self.parametres['shelves'] = os.path.join(self.parametres['pathout'], 'shelve')
100 self.parametres['ira'] = os.path.join(self.parametres['pathout'], 'Matrix.ira')
102 self.parametres = parametres
103 self.pathout = PathOut(filename = filename, dirout = self.parametres['pathout'])
107 self.listactives = None
109 self.linecontent = []
110 self.isbinary = False
112 self.firstrowiscolnames = True
114 self.firstcolisrownames = True
119 #self.parametres = self.parametre
121 def read_tableau(self, fileout) :
122 with open(fileout, 'r', encoding='utf8') as f :
124 self.actives = d['actives']
125 self.sups = d['sups']
126 self.classes = d['classes']
127 self.listactives = d['listactives']
129 self.listet = d['listet']
130 if 'selected_col' in d :
131 self.selected_col = d['selected_col']
133 self.datas = d['datas']
135 self.lchi = d['lchi']
137 self.content = d['content']
142 self.colnames = self.csvtable[0][1:]
143 self.rownb = len(self.linecontent)
144 self.colnb = len(self.linecontent[0])
146 def save_tableau(self, fileout) :
148 d['parametres'] = self.parametres
149 d['actives'] = self.actives
150 d['sups'] = self.sups
151 d['classes'] = self.classes
152 d['listactives'] = self.listactives
153 if 'listet' in dir(self) :
154 d['listet'] = self.listet
155 if 'selected_col' in dir(self) :
156 d['selected_col'] = self.selected_col
157 if 'datas' in dir(self) :
158 d['datas'] = self.datas
159 if 'lchi' in dir(self) :
160 d['lchi'] = self.lchi
161 d['content'] = self.content
162 with open(fileout, 'w', encoding='utf8') as f :
165 def make_content(self) :
166 self.pathout.createdir(self.parametres['pathout'])
167 if self.parametres['filetype'] == 'csv' :
169 elif self.parametres['filetype'] == 'xls' :
171 elif self.parametres['filetype'] == 'ods' :
173 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
175 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
176 self.parent.history.addMatrix(self.parametres)
178 def make_content_simple(self):
179 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
181 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
182 self.parent.history.addMatrix(self.parametres)
186 #print '############## ENCODING IN EXCEL #######################'
187 #datafile = xlrd.open_workbook(self.parametre['filename'], encoding_override="azerazerazer")
188 datafile = xlrd.open_workbook(self.parametres['originalpath'])
189 datatable = datafile.sheet_by_index(self.parametres['sheetnb']-1)
190 self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)).replace('"','').replace(';',' ').replace('\n',' ').replace('\r', ' ').replace('\t', ' ').strip() for j in range(datatable.ncols)] for i in range(datatable.nrows)]
193 doc = ooolib.Calc(opendoc=self.parametres['originalpath'])
194 doc.set_sheet_index(0)
195 (cols, rows) = doc.get_sheet_dimensions()
196 for row in range(1, rows + 1):
198 for col in range(1, cols + 1):
199 data = doc.get_cell_value(col, row)
200 if data is not None :
201 ligne.append(unescape(data[1].replace('"','').replace(';',' ').replace('\n', ' ').replace('\t', ' ').strip()))
204 self.linecontent.append(ligne)
207 with codecs.open(self.parametres['originalpath'], 'r', self.parametres['encodage']) as f :
209 self.linecontent = [line.split(self.parametres['colsep']) for line in content.splitlines()]
210 self.linecontent = [[val.replace('"','').replace(';',' ').replace('\t', ' ').strip() for val in line] for line in self.linecontent]
def write_csvfile(self) :
    """Write ``self.csvtable`` to the file ``self.parametres['csvfile']``.

    Each row of ``self.csvtable`` becomes one line whose cells are joined
    with tabs; lines are joined with ``'\\n'`` and no trailing newline is
    added. The file is opened for writing as UTF-8 text.
    """
    target = self.parametres['csvfile']
    with open(target, 'w', encoding='utf8') as out :
        # Rows are serialised only once the file is open, so a failure
        # while joining leaves the same (truncated) file behind.
        rows = ['\t'.join(cells) for cells in self.csvtable]
        out.write('\n'.join(rows))
216 def make_tmpfile(self) :
217 self.rownb = len(self.linecontent)
218 self.colnb = len(self.linecontent[0])
219 if self.firstrowiscolnames :
220 self.colnames = self.linecontent[0]
221 self.linecontent.pop(0)
224 self.colnames = ['_'.join(['colonne', repr(i)]) for i in range(self.colnb)]
225 if self.firstcolisrownames :
226 self.rownames = [row[0] for row in self.linecontent]
227 self.linecontent = [row[1:] for row in self.linecontent]
229 self.idname = self.colnames[0]
231 self.check_rownames()
233 self.rownames = [repr(i) for i in range(self.rownb)]
234 self.idname = 'identifiant'
235 self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))]
def read_csvfile(self):
    """Load the tab-separated file ``self.parametres['csvfile']``.

    Sets ``self.csvtable`` to every line of the file split on tabs, and
    ``self.linecontent`` to the same rows without the first line and
    without the first cell of each remaining row.

    An empty file gives an empty ``csvtable`` and an empty
    ``linecontent`` rather than raising ``IndexError``.
    """
    with open(self.parametres['csvfile'], 'r', encoding='utf8') as f:
        self.csvtable = [line.split('\t') for line in f.read().splitlines()]
    # Skip the first row by slicing before dropping the first column:
    # unlike pop(0) on the result, this is safe when the file is empty.
    self.linecontent = [line[1:] for line in self.csvtable[1:]]
def extractfrommod(self, col, val):
    """Return the sub-table of rows whose data column ``col`` equals ``val``.

    ``col`` is an index into the data columns; rows of ``self.csvtable``
    are tested at ``col + 1``, i.e. one cell further right. The first row
    of ``self.csvtable`` is skipped.

    The result is a list of rows: a header row made of an empty string
    followed by ``self.colnames``, then every matching row. The header is
    wrapped as a row of its own so that every element of the result is a
    list of cells (it was previously concatenated flat, mixing bare
    strings with row lists).
    """
    header = [''] + self.colnames
    matching = [line for line in self.csvtable[1:] if line[col + 1] == val]
    return [header] + matching
247 def splitfromvar(self, col):
249 for line in self.csvtable[1:] :
252 newtabs[mod].append(line)
254 newtabs[mod] = [line]
256 newtabs[mod].insert(0, [''] + self.colnames)
259 def check_rownames(self) :
260 if len(self.rownames) == len(list(set(self.rownames))) :
261 print('row names ok')
263 print('les noms de lignes ne sont pas uniques, ils sont remplaces')
264 self.rownames = [repr(i) for i in range(self.rownb)]
def make_unique_list(self) :
    """Return the distinct non-blank cell values found in ``self.linecontent``.

    A cell is ignored when it is empty once stripped of surrounding
    whitespace; kept cells are returned unstripped. The result comes from
    a set, so its order is arbitrary.
    """
    distinct = set()
    for row in self.linecontent :
        for cell in row :
            if cell.strip() != '' :
                distinct.add(cell)
    return list(distinct)
269 def make_dico(self, selcol) :
271 for i, line in enumerate(selcol) :
273 if forme.strip() != '' :
274 UpdateDico(dico, forme, i)
277 def select_col(self, listcol) :
278 dc = dict(list(zip(listcol, listcol)))
279 selcol = [[val for i, val in enumerate(row) if i in dc] for row in self.linecontent]
def countmultiple(self, liscol):
    """Return ``self.make_dico`` applied to the columns selected by ``liscol``.

    ``liscol`` is passed unchanged to ``self.select_col``; whatever that
    returns is handed to ``self.make_dico``, whose result is returned.
    """
    selection = self.select_col(liscol)
    return self.make_dico(selection)
def getactlistfromselection(self, listact) :
    """Rebuild ``self.actives`` from the columns ``listact`` and list its counts.

    ``self.actives`` is replaced by ``self.make_dico`` applied to
    ``self.select_col(listact)``. The return value is a list of
    ``[key, self.actives[key][0]]`` pairs, one per key of ``self.actives``
    in iteration order.
    """
    selection = self.select_col(listact)
    self.actives = self.make_dico(selection)
    pairs = []
    for form in self.actives :
        pairs.append([form, self.actives[form][0]])
    return pairs
def make_listactives(self) :
    """Set ``self.listactives`` to the keys of ``self.actives`` that qualify.

    A key qualifies when it is not the string ``'NA'`` and the first item
    of its entry in ``self.actives`` is at least
    ``self.parametres['mineff']``. Keys keep the iteration order of
    ``self.actives``.
    """
    kept = []
    for form in self.actives :
        if form == 'NA' :
            continue
        if self.actives[form][0] >= self.parametres['mineff'] :
            kept.append(form)
    self.listactives = kept
293 def write01(self, fileout, dico, linecontent) :
294 if self.listactives is None :
295 self.listactives = [val for val in dico if val != 'NA' and dico[val][0] >= self.parametres['mineff']]
296 out = [['0' for forme in self.listactives] for line in linecontent]
297 for i, forme in enumerate(self.listactives) :
298 for line in dico[forme][1] :
300 #out = [[self.rownames[i]] + out[i] for i in range(len(linecontent))]
301 #out.insert(0,[self.idname] + self.listactives)
302 out.insert(0, self.listactives)
303 with open(fileout, 'w', encoding='utf8') as f :
304 f.write('\n'.join([';'.join(line) for line in out]))
def make_01_from_selection(self, listact, listsup = None, dowrite = True) :
    """Build ``self.actives`` (and optionally ``self.sups``) and write the active table.

    The columns ``listact`` are selected with ``self.select_col`` and
    turned into ``self.actives`` by ``self.make_dico``; ``self.write01``
    is then called with ``self.pathout['mat01.csv']``, ``self.actives``
    and the selected columns.

    When ``listsup`` is not ``None`` the same selection and ``make_dico``
    steps are applied to it and stored in ``self.sups``; nothing is
    written for it here.

    ``dowrite`` is accepted but never read by this method.
    """
    active_cols = self.select_col(listact)
    self.actives = self.make_dico(active_cols)
    self.write01(self.pathout['mat01.csv'], self.actives, active_cols)
    if listsup is None :
        return
    sup_cols = self.select_col(listsup)
    self.sups = self.make_dico(sup_cols)
314 def make_01_alc_format(self, fileout) :
315 for i, ligne in enumerate(self.linecontent) :
319 UpdateDico(self.sups, forme, i)
321 UpdateDico(self.actives, forme, i)
322 self.listactives = [val for val in self.actives if self.actives[val][0] >= self.parametres['mineff']]
323 table = [['0' for i in range(len(self.listactives))] for j in range(self.rownb)]
324 for i, val in enumerate(self.listactives) :
325 for j, line in enumerate(self.linecontent) :
328 #table = [[self.rownames[i]] + table[i] for i in range(len(self.rownames))]
329 #table.insert(0, [self.idname] + self.listactives)
330 table.insert(0, self.listactives)
331 with open(fileout, 'w', encoding='utf8') as f:
332 f.write('\n'.join([';'.join(line) for line in table]))
def printtable(self, filename, Table, sep = ';'):
    """Write ``Table`` to ``filename`` as delimited UTF-8 text.

    Each row of ``Table`` becomes one line with its cells joined by
    ``sep``; lines are joined with ``'\\n'`` and no trailing newline is
    added.
    """
    with open(filename, 'w', encoding='utf8') as out :
        rows = [sep.join(cells) for cells in Table]
        out.write('\n'.join(rows))
338 def buildprofil(self) :
339 with open(self.pathout['uce'], 'r', encoding='utf8') as filein :
340 content = filein.readlines()
344 for i, line in enumerate(content) :
345 line = line.replace('\n', '').replace('"', '').split(';')
346 UpdateDico(dicocl, line[1], i)
347 lsucecl.append([int(line[0]) - 1, int(line[1])])
348 self.classes = lsucecl
349 nlist = [[nbuce, cl] for nbuce, cl in lsucecl if cl != 0]
350 self.ucecla = len(nlist)
352 self.clnb = len(dicocl) - 1
354 self.clnb = len(dicocl)
356 for active in self.listactives :
358 line0 = [0] * self.clnb
360 for i in range(0, self.clnb) :
361 for uce, cl in nlist:
363 if active in self.linecontent[uce]:
365 if sum(line[1:]) > self.parametres['mineff']:
366 tablecont.append([line[0]] + [repr(don) for don in line if type(don) == type(1)])
368 for sup in self.sups :
370 line0 = [0] * self.clnb
372 for i in range(0, self.clnb) :
373 for uce, cl in nlist:
375 if sup in self.linecontent[uce]:
377 tablecontet.append([line[0]] + [repr(don) for don in line if type(don) == type(1)])
379 self.printtable(self.pathout['ContEtOut'], tablecontet)
380 self.printtable(self.pathout['Contout'], tablecont)
def get_colnames(self) :
    """Return a full-slice copy of ``self.colnames``.

    The copy is shallow: callers may reorder or extend the returned
    sequence without affecting ``self.colnames``.
    """
    names = self.colnames
    return names[:]
385 def make_table_from_classe(self, cl, la) :
386 ln = [line[0] for line in self.classes if line[1] == cl]
387 out = [['0' for col in la] for line in ln]
388 for i, act in enumerate(la) :
389 for j, line in enumerate(ln) :
390 if line in self.actives[act][1] :
392 out.insert(0,[act for act in la])