+
def get_stat_by_cluster(self, outf):
    """Write per-cluster lexical statistics to the file *outf*.

    Clusters are the entries of ``self.lc`` and are numbered from 1 in
    the output. One line is written per cluster, fields separated by
    ``;`` and lines separated by a newline (no header, no trailing
    newline):

        cluster number; occurrences; forms; hapax; cluster size; hapax/forms

    * occurrences -- sum, over every form, of the per-member values
      returned by ``self.getformeuceseff(forme)`` for the members of
      the cluster;
    * forms -- number of forms having at least one member in the cluster;
    * hapax -- number of those forms whose ``freq`` equals 1;
    * cluster size -- ``len()`` of the cluster;
    * hapax/forms -- float ratio, written as ``0.0`` when the cluster
      matched no form (this used to raise ZeroDivisionError).

    :param outf: path of the file to create or overwrite.
    """
    log.info('get_stat_by_cluster')
    t1 = time()

    sizes = [len(cl) for cl in self.lc]
    # Sets are built once so every membership test below is O(1).
    members = [set(cl) for cl in self.lc]
    nb_clusters = len(members)
    occurrences = [0] * nb_clusters
    formescl = [0] * nb_clusters
    hapaxcl = [0] * nb_clusters

    for forme in self.formes:
        # Presumably a mapping {member id: frequency of the form in that
        # member}; it is only used through keys() and item access here.
        formeuceeff = self.getformeuceseff(forme)
        uce_ids = formeuceeff.keys()
        is_hapax = self.formes[forme].freq == 1
        for idx, cluster in enumerate(members):
            concern = cluster.intersection(uce_ids)
            if not concern:
                continue
            occurrences[idx] += sum(formeuceeff[uce] for uce in concern)
            formescl[idx] += 1
            # NOTE(review): hapax are counted only for the clusters the
            # form occurs in -- confirm this nesting matches the intent.
            if is_hapax:
                hapaxcl[idx] += 1

    rows = []
    for idx in range(nb_clusters):
        nb_forms = formescl[idx]
        # Guard the ratio: a cluster without any form must not abort
        # the whole export.
        if nb_forms:
            ratio = float(hapaxcl[idx]) / float(nb_forms)
        else:
            ratio = 0.0
        # repr() is the portable spelling of the old backtick syntax.
        rows.append(';'.join([
            repr(idx + 1),
            repr(occurrences[idx]),
            repr(nb_forms),
            repr(hapaxcl[idx]),
            repr(sizes[idx]),
            repr(ratio),
        ]))

    with open(outf, 'w') as f:
        f.write('\n'.join(rows))
    log.info('%f' % (time() - t1))