1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2010 Pierre Ratinaud
15 from functions import DoConf
16 from uuid import uuid4
17 from chemins import PathOut
20 log = logging.getLogger('iramuteq.tableau')
23 # Removes HTML or XML character references and entities from a text string.
25 # @param text The HTML (or XML) source text.
26 # @return The plain text, as a Unicode string, if necessary.
# NOTE(review): the ``def`` lines of the enclosing function(s) are not visible
# in this excerpt; the comments below describe only the statements shown.
30 #apos is not in the dictionary
# Register 'apos' in name2codepoint, mapped to the code point of "'".
31 htmlentitydefs.name2codepoint['apos'] = ord("'")
# Hexadecimal reference: drop the first three characters and the last one,
# then parse the remainder in base 16.
37 return unichr(int(text[3:-1], 16))
# Decimal reference: drop the first two characters and the last one.
39 return unichr(int(text[2:-1]))
# Named entity: drop the first and last characters and look the name up.
44 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
47 return text # leave as is
# Run ``fixup`` on every match of '&', an optional '#', word characters, ';'.
48 return re.sub("&#?\w+;", fixup, text)
def UpdateDico(Dico, word, line):
    """Record one occurrence of ``word`` at index ``line`` in ``Dico``.

    ``Dico`` maps each word to a two-item list ``[count, [line, ...]]``:
    the number of occurrences seen so far and the indices where they were
    seen.  The mapping is updated in place; nothing is returned.
    """
    if word in Dico:
        # Known word: bump the counter and remember this index.
        Dico[word][0] += 1
        Dico[word][1].append(line)
    else:
        # First occurrence: start a new entry.
        Dico[word] = [1, [line]]
# Build a second Tableau carrying the same data as ``tableau``.
# Every attribute below is assigned by plain reference (no copy is made), so
# both objects end up pointing at the same underlying objects.
# NOTE(review): the body of the final ``if`` and any ``return`` statement are
# not visible in this excerpt.
57 def copymatrix(tableau):
58 log.info('copy matrix')
# New instance created with the same parent and the same ``parametres``.
59 copymat = Tableau(tableau.parent, parametres = tableau.parametres)
60 copymat.linecontent = tableau.linecontent
61 copymat.csvtable = tableau.csvtable
62 copymat.pathout = tableau.pathout
63 copymat.colnames = tableau.colnames
64 copymat.rownb = tableau.rownb
65 copymat.colnb = tableau.colnb
# Special handling when the source had no csvtable (handling not visible).
66 if copymat.csvtable is None :
# Set up a matrix object, either from a source file or from saved parameters.
#   parent     -- presumably stored as self.parent (assignment not visible);
#                 self.parent.UserConfigPath is read to locate matrix.cfg
#   filename   -- source file path, recorded as parametres['originalpath']
#   filetype   -- recorded as parametres['filetype'] (default 'csv')
#   encodage   -- recorded as parametres['encodage'] (default 'utf-8')
#   parametres -- existing parameter mapping; None means "build a fresh one"
# NOTE(review): several lines of this method are not visible in this excerpt.
71 def __init__(self, parent, filename = '', filetype = 'csv', encodage = 'utf-8', parametres = None) :
73 if parametres is None :
# Start from the 'matrix' section of matrix.cfg, then fill in the entries
# that are specific to this file.
74 self.parametres = DoConf(os.path.join(self.parent.UserConfigPath,'matrix.cfg')).getoptions('matrix')
75 self.parametres['pathout'] = PathOut(filename, 'matrix').mkdirout()
76 self.parametres['originalpath'] = filename
77 self.parametres['filetype'] = filetype
78 self.parametres['encodage'] = encodage
79 #self.parametre['pathout'] = os.path.dirname(os.path.abspath(filename))
80 self.parametres['mineff'] = 3
81 self.parametres['syscoding'] = sys.getdefaultencoding()
82 self.parametres['type'] = 'matrix'
83 self.parametres['matrix_name'] = os.path.basename(filename)
# Fresh random identifier for this matrix.
84 self.parametres['uuid'] = str(uuid4())
# Both files below live inside the output directory chosen above.
85 self.parametres['shelves'] = os.path.join(self.parametres['pathout'], 'shelve.db')
86 self.parametres['ira'] = os.path.join(self.parametres['pathout'], 'Matrix.ira')
# presumably the else-branch of the test above (the ``else`` line is not
# visible): reuse the caller's mapping as is.
88 self.parametres = parametres
89 self.pathout = PathOut(filename = filename, dirout = self.parametres['pathout'])
# None means "not computed yet"; write01() fills it in when it finds None.
93 self.listactives = None
# Defaults used by make_tmpfile(): first row = column names, first column =
# row names.
98 self.firstrowiscolnames = True
100 self.firstcolisrownames = True
105 #self.parametres = self.parametre
# Restore the analysis state of this matrix from the shelve file ``fileout``.
# NOTE(review): several lines are not visible in this excerpt, so the
# conditions guarding some of the reads below, and whether the shelf is
# closed, cannot be confirmed from here.
107 def read_tableau(self, fileout) :
108 d=shelve.open(fileout)
109 #self.parametres = d['parametres']
110 #if 'syscoding' not in self.parametres :
111 # self.parametres['syscoding'] = sys.getdefaultencoding()
112 self.actives = d['actives']
113 self.sups = d['sups']
114 self.classes = d['classes']
115 self.listactives = d['listactives']
117 self.listet = d['listet']
# 'selected_col' is restored only when the shelf contains it.
118 if 'selected_col' in d :
119 self.selected_col = d['selected_col']
121 self.datas = d['datas']
123 self.lchi = d['lchi']
125 self.content = d['content']
# Derived values: column names are the first csvtable row without its first
# cell; the dimensions are taken from linecontent.
131 self.colnames = self.csvtable[0][1:]
132 self.rownb = len(self.linecontent)
133 self.colnb = len(self.linecontent[0])
def save_tableau(self, fileout) :
    """Persist the analysis state of this matrix into a shelve file.

    ``fileout`` is the path handed to ``shelve.open``.  The entries
    'parametres', 'actives', 'sups', 'classes', 'listactives' and 'content'
    are always written; 'listet', 'selected_col', 'datas' and 'lchi' are
    written only when the corresponding attribute exists on the instance.
    """
    d = shelve.open(fileout)
    try :
        d['parametres'] = self.parametres
        d['actives'] = self.actives
        d['sups'] = self.sups
        d['classes'] = self.classes
        d['listactives'] = self.listactives
        # Optional attributes: only present after some analyses have run.
        for name in ('listet', 'selected_col', 'datas', 'lchi') :
            if hasattr(self, name) :
                d[name] = getattr(self, name)
        d['content'] = self.content
    finally :
        # Always release the shelf so the data is flushed to disk even if
        # one of the attributes above is missing or cannot be pickled.
        d.close()
# Import the source file according to parametres['filetype'] and register the
# resulting matrix.
# NOTE(review): the statement executed in each branch of the if/elif chain is
# not visible in this excerpt.
153 def make_content(self) :
# Ask the path helper to create the output directory.
154 self.pathout.createdir(self.parametres['pathout'])
155 if self.parametres['filetype'] == 'csv' :
157 elif self.parametres['filetype'] == 'xls' :
159 elif self.parametres['filetype'] == 'ods' :
# Location of the csv copy inside the output directory.
161 self.parametres['csvfile'] = os.path.join(self.parametres['pathout'], 'csvfile.csv')
# Hand the 'matrix' parameters to DoConf.makeoptions with the .ira path
# (presumably this writes the .ira file), then add the matrix to the
# parent's history.
163 DoConf().makeoptions(['matrix'],[self.parametres], self.parametres['ira'])
164 self.parent.history.addMatrix(self.parametres)
# NOTE(review): the ``def`` line of this reader is not visible in this excerpt.
# Reads one sheet of the workbook at parametres['originalpath'] into
# self.linecontent as a list of rows of cleaned strings.
168 #print '############## ENCODING IN EXCEL #######################'
169 #datafile = xlrd.open_workbook(self.parametre['filename'], encoding_override="azerazerazer")
170 datafile = xlrd.open_workbook(self.parametres['originalpath'])
# parametres['sheetnb'] minus one is used as the sheet index.
171 datatable = datafile.sheet_by_index(self.parametres['sheetnb']-1)
# Each cell is converted with str(), stripped of '"' and ';', has newlines
# turned into spaces, and is trimmed.
# NOTE(review): on Python 2, str() on a unicode cell containing non-ASCII
# characters raises UnicodeEncodeError -- confirm how such cells are handled.
172 self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)).replace(u'"','').replace(u';','').replace(u'\n',' ').strip() for j in range(datatable.ncols)] for i in range(datatable.nrows)]
# NOTE(review): the ``def`` line of this reader and a few body lines (where
# ``ligne`` is created, and what happens for empty cells) are not visible in
# this excerpt.
# Reads the first sheet of the document at parametres['originalpath'] row by
# row into self.linecontent.
175 doc = ooolib.Calc(opendoc=self.parametres['originalpath'])
176 doc.set_sheet_index(0)
177 (cols, rows) = doc.get_sheet_dimensions()
# Rows and columns are addressed starting from 1.
178 for row in range(1, rows + 1):
180 for col in range(1, cols + 1):
181 data = doc.get_cell_value(col, row)
182 if data is not None :
# data[1] is cleaned ('"' and ';' removed, newlines -> spaces, trimmed) and
# then passed through unescape().
183 ligne.append(unescape(data[1].replace(u'"','').replace(u';','').replace(u'\n', ' ').strip()))
186 self.linecontent.append(ligne)
# NOTE(review): the ``def`` line of this reader and the line that assigns
# ``content`` are not visible in this excerpt.
# The source file is opened for reading with the encoding parametres['encodage'].
189 with codecs.open(self.parametres['originalpath'], 'r', self.parametres['encodage']) as f :
# Split the text into lines, then each line on parametres['colsep'].
191 self.linecontent = [line.split(self.parametres['colsep']) for line in content.splitlines()]
# Remove every double quote from each value and trim surrounding whitespace.
192 self.linecontent = [[val.replace(u'"','').strip() for val in line] for line in self.linecontent]
def write_csvfile(self) :
    """Dump ``self.csvtable`` to the file named by ``parametres['csvfile']``.

    The cells of a row are joined with tabs and the rows with newlines; no
    trailing newline is written.
    """
    with open(self.parametres['csvfile'], 'w') as out :
        rows = ['\t'.join(cells) for cells in self.csvtable]
        out.write('\n'.join(rows))
# Derive dimensions, column names, row names and the csv table from
# self.linecontent.
# NOTE(review): several lines (including the ``else`` lines) are not visible
# in this excerpt, so the exact branch structure cannot be confirmed here.
198 def make_tmpfile(self) :
199 self.rownb = len(self.linecontent)
200 self.colnb = len(self.linecontent[0])
# When the first row holds the column names, take it out of the data.
201 if self.firstrowiscolnames :
202 self.colnames = self.linecontent[0]
203 self.linecontent.pop(0)
# Generated column names of the form 'colonne_<index>'.
206 self.colnames = ['_'.join([u'colonne', `i`]) for i in range(self.colnb)]
# When the first column holds the row names, split it off from every row.
207 if self.firstcolisrownames :
208 self.rownames = [row[0] for row in self.linecontent]
209 self.linecontent = [row[1:] for row in self.linecontent]
211 self.idname = self.colnames[0]
# Replace the row names by generated ones if they are not unique.
213 self.check_rownames()
# Generated row names: the string form of each row index.
215 self.rownames = [`i` for i in range(self.rownb)]
216 self.idname = u'identifiant'
# Header row (id name + column names) followed by one row per row name,
# each made of the row name and that row's values.
217 self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))]
def read_csvfile(self):
    """Load the tab-separated file ``parametres['csvfile']`` back into memory.

    The file is decoded with ``parametres['syscoding']``.  ``self.csvtable``
    receives every row split on tabs; ``self.linecontent`` receives the same
    rows without their first cell and without the first (header) row.
    """
    with codecs.open(self.parametres['csvfile'], 'r', self.parametres['syscoding']) as infile:
        table = []
        for row in infile.read().splitlines():
            table.append(row.split('\t'))
        self.csvtable = table
    # Drop the leading cell of every row, then the header row.
    self.linecontent = [cells[1:] for cells in self.csvtable]
    del self.linecontent[0]
def check_rownames(self) :
    """Make sure ``self.rownames`` contains no duplicates.

    When every name is unique the list is left untouched.  Otherwise the
    names are replaced by the string form of the indices
    ``0 .. self.rownb - 1``.  A short message is printed in both cases.
    """
    if len(self.rownames) == len(set(self.rownames)) :
        print(u'row names ok')
    else :
        print(u'les noms de lignes ne sont pas uniques, ils sont remplaces')
        # repr() is the portable spelling of the Python-2-only backticks.
        self.rownames = [repr(i) for i in range(self.rownb)]
def make_unique_list(self) :
    """Return the distinct non-blank cell values found in ``self.linecontent``.

    A value is blank when it is empty after ``strip()``.  The order of the
    returned list is unspecified.
    """
    distinct = set()
    for row in self.linecontent :
        for cell in row :
            if cell.strip() != '' :
                distinct.add(cell)
    return list(distinct)
# Index the values of the selected columns with UpdateDico().
#   selcol -- iterable of rows; ``i`` below is the position of the row in it
# NOTE(review): the lines that create ``dico``, bind ``forme`` and return the
# result are not visible in this excerpt.
236 def make_dico(self, selcol) :
238 for i, line in enumerate(selcol) :
# Values that are empty after strip() are ignored.
240 if forme.strip() != '' :
241 UpdateDico(dico, forme, i)
def select_col(self, listcol) :
    """Return ``self.linecontent`` restricted to the columns in ``listcol``.

    ``listcol`` is an iterable of column indices.  Within each row the
    values keep their original left-to-right order, whatever the order of
    ``listcol``.  ``self.linecontent`` is not modified.
    """
    # A set gives the constant-time membership test that the original
    # obtained through a throw-away dict.
    wanted = set(listcol)
    return [[val for i, val in enumerate(row) if i in wanted] for row in self.linecontent]
def getactlistfromselection(self, listact) :
    """Index the columns ``listact`` and return ``[value, count]`` pairs.

    The index built by ``make_dico`` from the selected columns is stored in
    ``self.actives``; the returned list has one ``[value, count]`` entry per
    key of that index, where count is the first item of the key's entry.
    """
    self.actives = self.make_dico(self.select_col(listact))
    pairs = []
    for forme in self.actives :
        pairs.append([forme, self.actives[forme][0]])
    return pairs
def make_listactives(self) :
    """Select the active values that are frequent enough.

    ``self.listactives`` receives every key of ``self.actives`` other than
    'NA' whose occurrence count (first item of its ``[count, lines]`` entry)
    is at least ``parametres['mineff']``.
    """
    # Compare the count, not the whole [count, lines] entry: on Python 2 a
    # list always compares greater than an int, so nothing was filtered out.
    self.listactives = [val for val in self.actives if val != 'NA' and self.actives[val][0] >= self.parametres['mineff']]
def write01(self, fileout, dico, linecontent) :
    """Write a presence/absence table to ``fileout``.

    fileout     -- path of the ';'-separated file to write
    dico        -- mapping value -> [count, [row index, ...]]
    linecontent -- the rows being described; only their number is used

    The first line of the file lists ``self.listactives``; each following
    line has one '0'/'1' cell per active value, '1' meaning the value
    occurs in that row.  When ``self.listactives`` is None it is first
    computed from ``dico``: every key except 'NA' whose count is at least
    ``parametres['mineff']``.
    """
    if self.listactives is None :
        # Compare the count, not the whole [count, lines] entry (a list
        # always compares greater than an int on Python 2).
        self.listactives = [val for val in dico if val != 'NA' and dico[val][0] >= self.parametres['mineff']]
    out = [['0' for forme in self.listactives] for line in linecontent]
    for i, forme in enumerate(self.listactives) :
        for line in dico[forme][1] :
            out[line][i] = '1'
    # Header row with the names of the active values.
    out.insert(0, self.listactives)
    with open(fileout, 'w') as f :
        f.write('\n'.join([';'.join(line) for line in out]))
# Build the active (and optionally supplementary) indexes from column
# selections and write the 0/1 table of the active values.
#   listact -- column indices passed to select_col() for the active values
#   listsup -- column indices for the supplementary values, or None
#   dowrite -- not referenced in the lines visible here
# NOTE(review): the end of this method is not visible in this excerpt.
270 def make_01_from_selection(self, listact, listsup = None, dowrite = True) :
271 selcol = self.select_col(listact)
272 self.actives = self.make_dico(selcol)
# The table is written to the path registered as 'mat01.csv' in self.pathout.
273 self.write01(self.pathout['mat01.csv'], self.actives, selcol)
274 if listsup is not None :
275 selcol = self.select_col(listsup)
276 self.sups = self.make_dico(selcol)
# Index the values of self.linecontent into self.sups / self.actives and
# write a 0/1 table of the retained active values to ``fileout``.
# NOTE(review): the lines that iterate over the values of a row, decide
# between "sup" and "active", and fill the table cells are not visible in
# this excerpt.
278 def make_01_alc_format(self, fileout) :
279 for i, ligne in enumerate(self.linecontent) :
283 UpdateDico(self.sups, forme, i)
285 UpdateDico(self.actives, forme, i)
# Keep the active values whose count reaches parametres['mineff'].
286 self.listactives = [val for val in self.actives if self.actives[val][0] >= self.parametres['mineff']]
# One row per data row, one '0' cell per retained active value.
287 table = [['0' for i in range(len(self.listactives))] for j in range(self.rownb)]
288 for i, val in enumerate(self.listactives) :
289 for j, line in enumerate(self.linecontent) :
292 #table = [[self.rownames[i]] + table[i] for i in range(len(self.rownames))]
293 #table.insert(0, [self.idname] + self.listactives)
# Header row, then the table is written with ';' between cells.
294 table.insert(0, self.listactives)
295 with open(fileout, 'w') as f:
296 f.write('\n'.join([';'.join(line) for line in table]))
def printtable(self, filename, Table, sep = ';'):
    """Write ``Table`` (an iterable of rows of strings) to ``filename``.

    Cells are joined with ``sep`` and rows with newlines; no trailing
    newline is written.
    """
    with open(filename, 'w') as out :
        rendered = [sep.join(row) for row in Table]
        out.write('\n'.join(rendered))
# Build two contingency tables (active values and supplementary values
# against classes) from the 'uce' file and write them out with printtable().
# NOTE(review): many lines of this method are not visible in this excerpt
# (initialisation of dicocl, lsucecl, tablecont, tablecontet and ``line``,
# the conditions around the two ``self.clnb`` assignments, and the counting
# statements inside the inner loops).
302 def buildprofil(self) :
303 with open(self.pathout['uce'], 'rU') as filein :
304 content = filein.readlines()
308 for i, line in enumerate(content) :
# Strip the newline and the double quotes, then split the fields on ';'.
309 line = line.replace('\n', '').replace('"', '').split(';')
# Index the lines by their second field.
310 UpdateDico(dicocl, line[1], i)
# Keep [first field - 1, second field] as integers.
311 lsucecl.append([int(line[0]) - 1, int(line[1])])
312 self.classes = lsucecl
# Only the entries whose second item is not 0 are used below.
313 nlist = [[nbuce, cl] for nbuce, cl in lsucecl if cl != 0]
314 self.ucecla = len(nlist)
316 self.clnb = len(dicocl) - 1
318 self.clnb = len(dicocl)
321 for active in self.listactives :
323 line0 = [0] * self.clnb
325 for i in range(0, self.clnb) :
326 for uce, cl in nlist:
328 if active in self.linecontent[uce]:
# NOTE(review): strict '>' here, whereas the listactives filters use '>=' on
# parametres['mineff'] -- confirm this is intended.
330 if sum(line[1:]) > self.parametres['mineff']:
# The row keeps line[0] followed by the string form of every int in ``line``.
331 tablecont.append([line[0]] + [`don` for don in line if type(don) == type(1)])
334 for sup in self.sups :
336 line0 = [0] * self.clnb
338 for i in range(0, self.clnb) :
339 for uce, cl in nlist:
341 if sup in self.linecontent[uce]:
343 tablecontet.append([line[0]] + [`don` for don in line if type(don) == type(1)])
345 self.printtable(self.pathout['ContEtOut'], tablecontet)
346 self.printtable(self.pathout['Contout'], tablecont)
def get_colnames(self) :
    """Return a new list holding the column names.

    The caller gets a shallow copy, so changing the returned list does not
    affect ``self.colnames``.
    """
    names = list(self.colnames)
    return names
# Build a 0/1 table for the rows belonging to class ``cl``.
#   cl -- class value compared with the second item of each self.classes entry
#   la -- iterable of active values; each must be a key of self.actives
# NOTE(review): the statement executed when the test at line 356 succeeds and
# the end of the method (including any return) are not visible in this
# excerpt.
351 def make_table_from_classe(self, cl, la) :
# First items of the self.classes entries whose second item equals ``cl``.
352 ln = [line[0] for line in self.classes if line[1] == cl]
# One row per selected entry, one '0' cell per value of ``la``.
353 out = [['0' for col in la] for line in ln]
354 for i, act in enumerate(la) :
355 for j, line in enumerate(ln) :
# self.actives[act][1] is the list of indices recorded for ``act``.
356 if line in self.actives[act][1] :
# Header row: the values of ``la``.
358 out.insert(0,[act for act in la])
363 #filename = 'corpus/cent3.csv'
364 #filename = 'corpus/agir2sortie.csv'
365 #tab = Tableau('',filename, encodage='utf-8')
366 #tab.parametre['csvfile'] = tab.parametre['filename']
367 #tab.parametre['sep'] = '\t'
368 #tab.firstrowiscolnames = True
369 #tab.firstcolisrownames = False
371 #tab.make_01('corpus/matrice01.csv')