unicode...

[iramuteq] / corpus.py
diff --git a/corpus.py b/corpus.py

index ec27764..5a18f62 100644 (file)
--- a/corpus.py
+++ b/corpus.py
@@ -77,10 +77,10 @@ class Corpus :
                  gramtype = self.parent.lexique[word][1]
                  lem = self.parent.lexique[word][0]
              elif word.isdigit() :
-                gramtype = 'num'
+                gramtype = u'num'
                  lem = word
              else :
-                gramtype = 'nr'
+                gramtype = u'nr'
                  lem = word
              self.formes[word] =  Word(word, gramtype, len(self.formes), lem)
              self.idformesuces[self.formes[word].ident] = {self.ucis[-1].uces[-1].ident : 1}
@@ -433,6 +433,18 @@ class Corpus :
                  f.write(etline.encode(self.parametres['syscoding']) + '\n')
                  f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
  
+    def export_classe(self, outf, classe, lem = False) :
+        sts = self.lc[classe] 
+        res = self.getconcorde(sts)
+        self.make_iduces()
+        with open(outf, 'w') as f :
+            for uce in res :
+                guce = uce[1]
+                f.write(' '.join(self.ucis[self.iduces[uce[0]].uci].etoiles).encode(self.parametres['syscoding']) + '\n')
+                if lem :
+                    guce = ' '.join([self.formes[forme].lem for forme in guce.split()])
+                f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
+
      def make_and_write_sparse_matrix_from_uces(self, actives, outfile, listuce = False) :
          log.info('make_and_write_sparse_matrix_from_uces %s' % outfile)
          nbl = 0
@@ -502,7 +514,7 @@ class Corpus :
                  self.lems[lem].act = 2
              elif self.lems[lem].gram in gramact :
                  self.lems[lem].act = 1
-            elif gramsup is not None :
+            elif gramsup is not None and self.lems[lem].gram not in gramact:
                  if self.lems[lem].gram in gramsup :
                      self.lems[lem].act = 2
                  else :
@@ -707,27 +719,37 @@ class Corpus :
          self.lc0 = self.lc.pop(0)
          #return ucecl
      
-    def get_stat_by_cluster(self, outf) :
+    def get_stat_by_cluster(self, outf, lclasses = None) :
          log.info('get_stat_by_cluster')
+        if lclasses is None :
+            lclasses = self.lc
          t1 = time()
-        occurrences = dict([[i + 1, 0] for i in range(len(self.lc))])
-        formescl = dict([[i + 1, 0] for i in range(len(self.lc))])
-        hapaxcl = dict([[i + 1, 0] for i in range(len(self.lc))])
-        lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(self.lc)])
-        sets = [set(cl) for cl in self.lc]
+        occurrences = dict([[i + 1, 0] for i in range(len(lclasses))])
+        formescl = dict([[i + 1, 0] for i in range(len(lclasses))])
+        hapaxcl = dict([[i + 1, 0] for i in range(len(lclasses))])
+        lenclasses = dict([[i+1,len(cl)] for i, cl in enumerate(lclasses)])
+        sets = [set(cl) for cl in lclasses]
          for forme in self.formes :
              formeuceeff = self.getformeuceseff(forme)
-            for i, classe in enumerate(self.lc) :
+            for i, classe in enumerate(lclasses) :
                  concern = sets[i].intersection(formeuceeff.keys())
                  if len(concern) :
                      occurrences[i+1] += sum([formeuceeff[uce] for uce in concern])
                      formescl[i+1] += 1
                      if self.formes[forme].freq == 1 :
                          hapaxcl[i+1] += 1
-        toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
-        with open(outf, 'w') as f :
-            f.write(toprint)
          log.info('%f' % (time() - t1))        
+        if outf is not None :
+            toprint = '\n'.join([';'.join([`i`, `occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`]) for i in occurrences])
+            with open(outf, 'w') as f :
+                f.write(toprint)
+        else :
+            return [[`occurrences[i]`, `formescl[i]`, `hapaxcl[i]`, `lenclasses[i]`, `float(hapaxcl[i])/float(formescl[i])`] for i in occurrences]
+
+    def get_stat_by_et(self, outf, etoiles) :
+        lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
+        stats = self.get_stat_by_cluster(None, lclasses)
+        stats = [[etoiles[i]] + val for i, val in enumerate(stats)]
  
      def gethapaxbyet(self, etoiles) :
          hapaxuces = [self.getlemuces(forme)[0] for forme in self.lems if self.lems[forme].freq == 1]
@@ -786,6 +808,22 @@ class Corpus :
          with open('/tmp/testhapxuce.html','w') as f :
              f.write(txt)
  
+    def export_dictionary(self, fileout, syscoding) :
+        listformes = [[self.formes[forme].freq, forme, self.formes[forme].lem, self.formes[forme].gram] for forme in self.formes]
+        listformes.sort(reverse = True)
+        listformes = [forme[1:] + [`forme[0]`] for forme in listformes]
+        with open(fileout, 'w') as f :
+            f.write('\n'.join(['\t'.join(forme) for forme in listformes]).encode(syscoding))
+
+    def export_lems(self, fileout, syscoding) :
+        self.make_idformes()
+        listlem = [[lem, '\t'.join(['\t'.join([self.idformes[forme].forme, `self.lems[lem].formes[forme]`]) for forme in self.lems[lem].formes])] for lem in self.lems]
+        listlem.sort()
+        with open(fileout, 'w') as f :
+            f.write('\n'.join(['\t'.join(lem) for lem in listlem]).encode(syscoding))
+
+
+
  
  class MakeUciStat :
      def __init__(self, corpus) :
@@ -1043,7 +1081,7 @@ class BuildCorpus :
      def firstclean(self, txt) :
          txt = txt.replace(u'’',"'")
          txt = txt.replace(u'œ', u'oe')
-        return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', ' £$£ ')
+        return txt.replace('...',u' £$£ ').replace('?',' ? ').replace('.',' . ').replace('!', ' ! ').replace(',',' , ').replace(';', ' ; ').replace(':',' : ').replace(u'…', u' £$£ ')
  
      def make_cleans(self, txt) :
          for clean in self.cleans :
@@ -1216,7 +1254,10 @@ class Builder :
              parametres['originalpath'] = parent.filename
              PathOut().createdir(parametres['pathout'])
              ReadLexique(self.parent, lang = parametres['lang'])
-            self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
+            if parametres['lang'] != 'other' and  os.path.exists(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp')):
+                self.parent.expressions = ReadDicoAsDico(self.parent.DictPath.get(parametres['lang']+'_exp', 'french_exp'))
+            else :
+                self.parent.expressions = {}
              self.parametres = parametres
          else :
              if self.dlg is not None :