1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
3 #Copyright (c) 2010 Pierre Ratinaud
15 from uuid import uuid4
18 log = logging.getLogger('iramuteq.tableau')
21 # Removes HTML or XML character references and entities from a text string.
23 # @param text The HTML (or XML) source text.
24 # @return The plain text, as a Unicode string, if necessary.
28 #apos is not in the dictionary
29 htmlentitydefs.name2codepoint['apos'] = ord("'")
35 return unichr(int(text[3:-1], 16))
37 return unichr(int(text[2:-1]))
42 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
45 return text # leave as is
46 return re.sub("&#?\w+;", fixup, text)
def UpdateDico(Dico, word, line):
    """Record one occurrence of ``word`` seen at index ``line``.

    ``Dico`` maps each word to a two-item list ``[count, [line, ...]]``.
    The mapping is updated in place and nothing is returned.

    @param Dico The dictionary to update.
    @param word The key whose occurrence is being recorded.
    @param line The index to append to the word's list of lines.
    """
    if word in Dico:
        # Known word: bump its counter and remember where it was seen.
        Dico[word][0] += 1
        Dico[word][1].append(line)
    else:
        # First occurrence: create the entry with a count of one.
        Dico[word] = [1, [line]]
56 def __init__(self, parent, filename = '', filetype = 'csv', encodage = 'utf-8') :
58 self.parametre = {'filename' : filename}
59 self.parametre['filetype'] = filetype
60 self.parametre['encodage'] = encodage
61 self.parametre['pathout'] = os.path.dirname(os.path.abspath(filename))
62 self.parametre['mineff'] = 3
63 self.parametre['syscoding'] = sys.getdefaultencoding()
64 self.parametre['type'] = 'matrix'
65 self.parametre['name'] = 'unNOm'
68 self.listactives = None
73 self.firstrowiscolnames = True
75 self.firstcolisrownames = True
80 self.parametres = self.parametre
def read_tableau(self, fileout) :
    """Restore the table state from the shelve file ``fileout``.

    Reads back the keys that ``save_tableau`` writes. ``parametre``,
    ``actives``, ``classes`` and ``listactives`` are required; ``sups``,
    ``listet``, ``selected_col``, ``datas`` and ``lchi`` are copied onto
    the instance only when present in the shelf.

    @param fileout Path of the shelve file to read.
    """
    d = shelve.open(fileout)
    try:
        self.parametre = d['parametre']
        # Shelves that lack 'syscoding' get the current default encoding.
        if 'syscoding' not in self.parametre:
            self.parametre['syscoding'] = sys.getdefaultencoding()
        self.actives = d['actives']
        self.classes = d['classes']
        self.listactives = d['listactives']
        # Optional entries: leave the attribute untouched when absent.
        for key in ('sups', 'listet', 'selected_col', 'datas', 'lchi'):
            if key in d:
                setattr(self, key, d[key])
    finally:
        # Always release the underlying dbm file, even on a missing key.
        d.close()
def save_tableau(self, fileout) :
    """Persist the table state into the shelve file ``fileout``.

    ``parametre``, ``actives``, ``sups``, ``classes`` and ``listactives``
    are always written. ``listet``, ``selected_col``, ``datas`` and
    ``lchi`` are written only when the instance has those attributes.

    @param fileout Path of the shelve file to write.
    """
    d = shelve.open(fileout)
    try:
        d['parametre'] = self.parametre
        d['actives'] = self.actives
        d['sups'] = self.sups
        d['classes'] = self.classes
        d['listactives'] = self.listactives
        # Optional attributes are stored only if they have been set.
        for key in ('listet', 'selected_col', 'datas', 'lchi'):
            if hasattr(self, key):
                d[key] = getattr(self, key)
    finally:
        # Closing flushes the data and releases the dbm file handle.
        d.close()
118 def make_content(self) :
119 if self.parametre['filetype'] == 'csv' :
121 elif self.parametre['filetype'] == 'xls' :
123 elif self.parametre['filetype'] == 'ods' :
125 self.parametre['csvfile'] = tempfile.mktemp(dir=self.parent.TEMPDIR)
130 #print '############## ENCODING IN EXCEL #######################'
131 #datafile = xlrd.open_workbook(self.parametre['filename'], encoding_override="azerazerazer")
132 datafile = xlrd.open_workbook(self.parametre['filename'])
133 datatable = datafile.sheet_by_index(self.parametre['sheetnb']-1)
134 self.linecontent = [[str(datatable.cell_value(rowx = i, colx = j)) for j in range(datatable.ncols)] for i in range(datatable.nrows)]
137 doc = ooolib.Calc(opendoc=self.parametre['filename'])
138 doc.set_sheet_index(0)
139 (cols, rows) = doc.get_sheet_dimensions()
140 for row in range(1, rows + 1):
142 for col in range(1, cols + 1):
143 data = doc.get_cell_value(col, row)
144 if data is not None :
145 ligne.append(unescape(data[1]))
148 self.linecontent.append(ligne)
151 with codecs.open(self.parametre['filename'], 'r', self.parametre['encodage']) as f :
153 self.linecontent = [line.replace('"','').split(self.parametre['colsep']) for line in content.splitlines()]
def write_csvfile(self) :
    """Dump ``self.csvtable`` to the file at ``self.parametre['csvfile']``.

    Cells of a row are joined with ';' and rows with a newline; no
    trailing newline is written.
    """
    target = self.parametre['csvfile']
    with open(target, 'w') as handle:
        rows = []
        for cells in self.csvtable:
            rows.append(';'.join(cells))
        handle.write('\n'.join(rows))
159 def make_tmpfile(self) :
160 self.rownb = len(self.linecontent)
161 self.colnb = len(self.linecontent[0])
162 if self.firstrowiscolnames :
163 self.colnames = self.linecontent[0]
164 self.linecontent.pop(0)
167 self.colnames = ['_'.join([u'colonne', `i`]) for i in range(self.colnb)]
168 if self.firstcolisrownames :
169 self.rownames = [row[0] for row in self.linecontent]
170 self.linecontent = [row[1:] for row in self.linecontent]
172 self.idname = self.colnames[0]
174 self.check_rownames()
176 self.rownames = [`i` for i in range(self.rownb)]
177 self.idname = u'identifiant'
178 self.csvtable = [[self.idname] + self.colnames] + [[self.rownames[i]] + self.linecontent[i] for i in range(len(self.rownames))]
182 self.parent.content = self.csvtable
183 self.parent.ShowMenu(_("View"))
184 self.parent.ShowMenu(_("Spreadsheet analysis"))
185 self.parent.ShowMenu(_("Text analysis"), False)
186 self.parent.type = "Data"
187 self.parent.DataPop = False
188 self.parent.OnViewData('')
def check_rownames(self) :
    """Make sure ``self.rownames`` holds no duplicates.

    When every name is distinct the names are kept and a confirmation is
    printed. Otherwise a warning is printed and the names are replaced by
    the string forms of ``0 .. self.rownb - 1``.
    """
    if len(self.rownames) == len(set(self.rownames)):
        print(u'row names ok')
    else:
        # Duplicates found: fall back to positional identifiers.
        print(u'les noms de lignes ne sont pas uniques, ils sont remplaces')
        self.rownames = [repr(i) for i in range(self.rownb)]
def make_unique_list(self) :
    """Return the distinct non-blank cell values of ``self.linecontent``.

    A cell counts as blank when it is empty after ``strip()``. The order
    of the returned list is that of set iteration, i.e. unspecified.
    """
    distinct = set()
    for row in self.linecontent:
        for cell in row:
            if cell.strip() != '':
                distinct.add(cell)
    return list(distinct)
def make_dico(self, linecontent) :
    """Index the non-blank values of ``linecontent`` by row.

    Every value that is not empty after ``strip()`` is passed to
    ``UpdateDico`` together with the index of the row it was found in.

    @param linecontent A list of rows, each row a list of strings.
    @return The dictionary filled by ``UpdateDico``.
    """
    dico = {}
    for i, line in enumerate(linecontent):
        for forme in line:
            if forme.strip() != '':
                UpdateDico(dico, forme, i)
    return dico
def select_col(self, listcol) :
    """Extract the columns listed in ``listcol`` from ``self.linecontent``.

    Values keep the order they have in each row, not the order of
    ``listcol``; indices beyond a row's length are simply not matched.

    @param listcol Iterable of column indices to keep.
    @return A new list of rows holding only the selected columns.
    """
    # A set gives constant-time membership tests for each cell.
    wanted = set(listcol)
    return [[val for i, val in enumerate(row) if i in wanted]
            for row in self.linecontent]
def getactlistfromselection(self, listact) :
    """Build ``self.actives`` from the columns in ``listact``.

    The columns are extracted with ``self.select_col`` and indexed with
    ``self.make_dico``; the resulting dictionary is stored in
    ``self.actives``.

    @param listact Column indices forwarded to ``self.select_col``.
    @return A list of ``[value, count]`` pairs, one per key of
            ``self.actives``.
    """
    chosen = self.select_col(listact)
    self.actives = self.make_dico(chosen)
    pairs = []
    for forme in self.actives:
        pairs.append([forme, self.actives[forme][0]])
    return pairs
def make_listactives(self) :
    """Fill ``self.listactives`` with the frequent keys of ``self.actives``.

    A key is kept when it is not ``'NA'`` and its count (the first item
    of its ``[count, lines]`` entry) is at least
    ``self.parametre['mineff']``.
    """
    mineff = self.parametre['mineff']
    # Compare the count, not the whole entry: a list compared with an int
    # is always "greater" in Python 2, which disabled the threshold.
    self.listactives = [val for val in self.actives
                        if val != 'NA' and self.actives[val][0] >= mineff]
def write01(self, fileout, dico, linecontent) :
    """Write a 0/1 presence matrix of the active values to ``fileout``.

    When ``self.listactives`` is ``None`` it is first built from ``dico``:
    every key other than ``'NA'`` whose count is at least
    ``self.parametre['mineff']``. The file holds a header row with the
    active values, then one row per entry of ``linecontent`` in which a
    cell is '1' if that row index is listed for the value in ``dico``.
    Cells are joined with ';' and rows with a newline.

    @param fileout Path of the file to write.
    @param dico Mapping ``value -> [count, [row index, ...]]``.
    @param linecontent The rows the matrix is built for (only its length
           is used).
    """
    if self.listactives is None:
        mineff = self.parametre['mineff']
        # Threshold on the count; comparing the whole entry with an int
        # is always true in Python 2.
        self.listactives = [val for val in dico
                            if val != 'NA' and dico[val][0] >= mineff]
    out = [['0'] * len(self.listactives) for _ in linecontent]
    for i, forme in enumerate(self.listactives):
        for line in dico[forme][1]:
            # Mark the rows in which this value occurs.
            out[line][i] = '1'
    out.insert(0, self.listactives)
    with open(fileout, 'w') as f:
        f.write('\n'.join([';'.join(line) for line in out]))
def make_01_from_selection(self, listact, listsup = None, dowrite = True) :
    """Index the active (and optional supplementary) columns.

    The columns in ``listact`` are extracted and indexed into
    ``self.actives``, then handed to ``self.write01`` together with the
    path ``self.dictpathout['mat01']``. When ``listsup`` is given, its
    columns are indexed into ``self.sups``.

    @param listact Column indices of the active variables.
    @param listsup Column indices of the supplementary variables, or None.
    @param dowrite Not consulted by this method.
    """
    active_cols = self.select_col(listact)
    self.actives = self.make_dico(active_cols)
    self.write01(self.dictpathout['mat01'], self.actives, active_cols)
    if listsup is None:
        return
    sup_cols = self.select_col(listsup)
    self.sups = self.make_dico(sup_cols)
242 def make_01_alc_format(self, fileout) :
243 for i, ligne in enumerate(self.linecontent) :
247 UpdateDico(self.sups, forme, i)
249 UpdateDico(self.actives, forme, i)
250 self.listactives = [val for val in self.actives if self.actives[val][0] >= self.parametre['mineff']]
251 table = [['0' for i in range(len(self.listactives))] for j in range(self.rownb)]
252 for i, val in enumerate(self.listactives) :
253 for j, line in enumerate(self.linecontent) :
256 #table = [[self.rownames[i]] + table[i] for i in range(len(self.rownames))]
257 #table.insert(0, [self.idname] + self.listactives)
258 table.insert(0, self.listactives)
259 with open(fileout, 'w') as f:
260 f.write('\n'.join([';'.join(line) for line in table]))
def printtable(self, filename, Table):
    """Write ``Table`` to ``filename`` as ';'-separated text.

    Each row of ``Table`` becomes one line; lines are joined with a
    newline and no trailing newline is written.

    @param filename Path of the file to (over)write.
    @param Table An iterable of rows, each row an iterable of strings.
    """
    with open(filename, 'w') as handle:
        rendered = []
        for row in Table:
            rendered.append(';'.join(row))
        handle.write('\n'.join(rendered))
266 def buildprofil(self) :
267 with open(self.dictpathout['uce'], 'rU') as filein :
268 content = filein.readlines()
272 for i, line in enumerate(content) :
273 line = line.replace('\n', '').replace('"', '').split(';')
274 UpdateDico(dicocl, line[1], i)
275 lsucecl.append([int(line[0]) - 1, int(line[1])])
276 self.classes = lsucecl
277 nlist = [[nbuce, cl] for nbuce, cl in lsucecl if cl != 0]
278 self.ucecla = len(nlist)
280 self.clnb = len(dicocl) - 1
282 self.clnb = len(dicocl)
285 for active in self.listactives :
287 line0 = [0] * self.clnb
289 for i in range(0, self.clnb) :
290 for uce, cl in nlist:
292 if active in self.linecontent[uce]:
294 if sum(line[1:]) > self.parametre['mineff']:
295 tablecont.append([line[0]] + [`don` for don in line if type(don) == type(1)])
298 for sup in self.sups :
300 line0 = [0] * self.clnb
302 for i in range(0, self.clnb) :
303 for uce, cl in nlist:
305 if sup in self.linecontent[uce]:
307 tablecontet.append([line[0]] + [`don` for don in line if type(don) == type(1)])
309 self.printtable(self.dictpathout['ContEtOut'], tablecontet)
310 self.printtable(self.dictpathout['Contout'], tablecont)
def get_colnames(self) :
    """Return a shallow copy of ``self.colnames``.

    The copy is taken with a full slice, so changes made to the returned
    sequence do not affect ``self.colnames``.
    """
    snapshot = self.colnames[:]
    return snapshot
def make_table_from_classe(self, cl, la) :
    """Build the 0/1 presence table of the values ``la`` for class ``cl``.

    The rows are the entries of ``self.classes`` whose second item equals
    ``cl``, identified by their first item. A cell is '1' when that row
    identifier appears in the line list of the value in ``self.actives``.

    @param cl The class value to select rows for.
    @param la The values (keys of ``self.actives``) forming the columns.
    @return A list of rows: a header row holding the values of ``la``,
            followed by one row of '0'/'1' strings per selected line.
    """
    ln = [line[0] for line in self.classes if line[1] == cl]
    out = [['0' for col in la] for line in ln]
    for i, act in enumerate(la):
        # Build the lookup once per value instead of scanning the list
        # for every selected line.
        present = set(self.actives[act][1])
        for j, line in enumerate(ln):
            if line in present:
                out[j][i] = '1'
    out.insert(0, [act for act in la])
    return out
327 #filename = 'corpus/cent3.csv'
328 #filename = 'corpus/agir2sortie.csv'
329 #tab = Tableau('',filename, encodage='utf-8')
330 #tab.parametre['csvfile'] = tab.parametre['filename']
331 #tab.parametre['sep'] = '\t'
332 #tab.firstrowiscolnames = True
333 #tab.firstcolisrownames = False
335 #tab.make_01('corpus/matrice01.csv')