1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
5 #------------------------------------
6 # import des modules python
7 #------------------------------------
12 from subprocess import *
14 from xml.dom import minidom, Node
# --- Hard-coded configuration: absolute paths used by the pipeline below ---
# Directory of Italian plain-text files to tag.
17 txtdir = '/home/pierre/workspace/iramuteq/dev/langues/italian'
# TreeTagger command script for Italian (UTF-8 parameter file).
19 treetagger = '/home/pierre/prog/treetagger/cmd/tree-tagger-italian-utf8'
# Intermediate tagged dictionary (written by treat_formes, read by PostTreat).
20 fileout = '/home/pierre/workspace/iramuteq/dev/langues/lexique_it_t1.txt'
# Italian stopword list (one word per line, UTF-8) consumed by PostTreat.
21 stopword = '/home/pierre/workspace/iramuteq/dev/langues/IT_NEW_stopwords_utf8.txt'
# Final lexicon produced by PostTreat.
22 lexique = '/home/pierre/workspace/iramuteq/dev/langues/lexique_it.txt'
# Wikipedia XML dump, parsed via SAX in the commented-out driver at file end.
23 xmlfile = '/home/pierre/workspace/iramuteq/dev/langues/itwiki-latest-pages-articles.xml'
28 class WikiPediaHandler(xml.sax.ContentHandler):
# SAX content handler that accumulates character data from a Wikipedia XML
# dump and periodically hands the buffered text to a Parser (self.sparser)
# for TreeTagger processing.
# NOTE(review): this excerpt is elided (the embedded original line numbers
# jump), so the initialisation of self.totreat, self.last and self.tottitle,
# and the element-name tests, happen in lines not shown here.
29 def __init__(self, sparser) :
# Keep a reference to the Parser that does the actual tagging work.
35 self.sparser = sparser
37 def startElement(self, name, attrs):
# Flush the current forme dictionary to disk and remember its size so the
# growth per batch can be measured below -- presumably triggered on a
# specific element name in an elided condition; TODO confirm.
39 self.sparser.treat_formes()
40 self.last = len(self.sparser.formes)
# Once enough raw text fragments have been buffered, tag them in one batch.
44 if len(self.totreat) > 100000 :
45 self.diff = len(self.sparser.formes) - self.last
46 self.sparser.doparsewiki(' '.join(self.totreat))
# Progress report (Python 2 print statement).
48 print 'titres :', self.tottitle
54 # for item in attrs.items():
56 def characters(self, content) :
# Buffer character data for the next tagging batch.
58 self.totreat.append(content)
61 def __init__(self, txtdir, encodage, treetagger, fileout) :
# Store run configuration for the tagging pipeline.
# NOTE(review): the enclosing class header (presumably the Parser used by
# WikiPediaHandler) and further assignments (e.g. self.txtdir, self.formes)
# are in elided lines of this excerpt.
63 self.encodage = encodage
66 self.fileout = fileout
68 #self.treat_formes(fileout)
70 def clean(self, txt) :
# Normalise raw text before feeding it to TreeTagger: keep only
# letters/digits/accented characters/basic punctuation, unify the
# typographic apostrophe, split words on ' and -, and surround sentence
# punctuation with spaces so the tagger sees each mark as its own token.
72 keep_caract = u"a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇß’ñ.:,;!?\n*'_-"
73 list_keep = u"[^" + keep_caract + "]+"
74 txt = re.sub(list_keep, ' ', txt)
75 txt = txt.replace(u'’',u"'")
76 txt = txt.replace(u'\'',u' ').replace(u'-', u' ')
77 txt = txt.replace(u'?',u' ? ').replace(u'.',u' . ').replace(u'!', u' ! ').replace(u',',u' , ').replace(u';', u' ; ').replace(u':',u' : ')
# Collapse all runs of whitespace (including newlines) to single spaces.
78 txt = ' '.join(txt.split())
# NOTE(review): the 'return txt' presumably follows in an elided line --
# callers (doparsewiki and the directory loop) use the result.
81 def update_dict(self, tmpfile) :
# Merge TreeTagger output (one 'word<TAB>POS<TAB>lemma' line per token)
# into self.formes: key = (word, POS), value = [count, lemma].
82 with codecs.open(tmpfile, 'r', 'utf8') as f :
# NOTE(review): the line reading f into 'content' is elided here.
84 content = [line.split('\t') for line in content.splitlines()]
85 for forme in content :
# Skip unknown lemmas, punctuation/symbol/sentence-end tags,
# and cardinal numbers (TreeTagger tags them '@card@').
86 if (forme[2] == u'<unknown>') or (forme[1] in [u'PON', u'<unknown>', u'SYM', u'SENT']) or (forme[1]==u'NUM' and forme[2]==u'@card@') :
# (body of this branch -- presumably 'pass' -- is elided)
88 elif (forme[0], forme[1]) in self.formes :
89 self.formes[(forme[0], forme[1])][0] += 1
# (the 'else :' line is elided)
91 self.formes[(forme[0], forme[1])] = [1, forme[2]]
# Progress: number of distinct (word, POS) pairs so far (Python 2 print).
92 print len(self.formes)
94 def treat_formes(self) :
# Collapse self.formes ((word, POS) -> [count, lemma]) to one entry per
# word, keeping the most frequent POS, then write the result to
# self.fileout as sorted 'word<TAB>POS<TAB>lemma<TAB>count' lines.
# NOTE(review): the initialisation of 'nformes' (presumably {}) is elided.
97 for forme in self.formes :
98 if forme[0] in nformes :
# Keep the POS reading with the highest count for this word.
99 if self.formes[forme][0] > nformes[forme[0]][0] :
100 nformes[forme[0]] = [self.formes[forme][0], forme[1], self.formes[forme][1]]
# (the 'else :' line is elided)
102 nformes[forme[0]] = [self.formes[forme][0], forme[1], self.formes[forme][1]]
103 with open(self.fileout, 'w') as f :
# Backticks are the Python 2 repr() shorthand: they stringify the count.
104 toprint = [[forme, nformes[forme][1], nformes[forme][2], `nformes[forme][0]`] for forme in nformes]
105 toprint = sorted(toprint)
106 toprint = '\n'.join(['\t'.join(line) for line in toprint])
107 f.write(toprint.encode('utf8'))
110 def doparsewiki(self, content) :
# Tag one batch of wiki text: clean it, dump it to /tmp/tmptxt, pipe it
# through TreeTagger into /tmp/tttmp, then merge the tags into self.formes.
111 content = self.clean(content)
112 with open('/tmp/tmptxt', 'w') as f :
113 f.write(content.encode('utf8'))
# 'cat' feeds the temp file into TreeTagger's stdin via a pipe.
114 p1 = Popen(['cat', '/tmp/tmptxt'], stdout = PIPE)
115 with open('/tmp/tttmp', 'w') as f :
116 p2 = Popen([treetagger], stdin = p1.stdout, stdout = f)
# Wait for TreeTagger to finish before reading its output file.
117 out = p2.communicate()
118 self.update_dict('/tmp/tttmp')
# NOTE(review): the 'def' line of this method is elided from the excerpt;
# the body below tags every file of self.txtdir exactly the way
# doparsewiki() tags a wiki batch.
121 files = os.listdir(self.txtdir)
# (the loop header -- presumably 'for fpath in files :' -- is elided)
123 fpath = os.path.join(self.txtdir, fpath)
125 with codecs.open(fpath, 'r', self.encodage) as f :
# (the read of f into 'content' is elided)
127 content = self.clean(content)
128 with open('/tmp/tmptxt', 'w') as f :
129 f.write(content.encode('utf8'))
130 p1 = Popen(['cat', '/tmp/tmptxt'], stdout = PIPE)
131 with open('/tmp/tttmp', 'w') as f :
132 p2 = Popen([treetagger], stdin = p1.stdout, stdout = f)
133 out = p2.communicate()
134 self.update_dict('/tmp/tttmp')
138 def __init__(self, infile, outfile, stopw = None) :
# Post-process the dictionary written by treat_formes(): read infile
# ('word<TAB>POS<TAB>lemma<TAB>count' lines), drop hapaxes (count == '1'),
# normalise the POS tag via treatline(), optionally apply a stopword
# list, and write the result to outfile.
# NOTE(review): the enclosing class header (presumably 'class PostTreat')
# and the initialisation of self.dictg / self.formes / self.lems are in
# elided lines of this excerpt.
140 with codecs.open(infile, 'r', 'utf8') as f :
# (the read of f into 'content' is elided)
142 content = [line.split('\t') for line in content.splitlines()]
143 content = [self.treatline(line) for line in content if line[3] != '1']
146 if stopw is not None :
147 with codecs.open(stopw, 'r', 'utf8') as f :
# (the read of f into 'stw' is elided)
149 self.stw = stw.splitlines()
150 content = self.dostopword(content)
151 self.printcontent(content, outfile)
# NOTE(review): the two lines below may belong to a separate elided method;
# they re-tally the grammatical categories recorded in self.formes.
153 for forme in self.formes :
154 self.dictg[self.formes[forme][2]] = self.dictg.get(self.formes[forme][2],0) + 1
159 def treatline(self, line) :
# Normalise one dictionary line [word, POS, lemma, count]: keep only the
# lowercased POS prefix before ':' (e.g. 'VER:fin' -> 'ver'), tally it
# in self.dictg, and return [word, lemma, gram, int(count)].
160 gram = line[1].split(u':')[0].lower()
161 self.dictg[gram] = self.dictg.get(gram, 0) + 1
162 return [line[0], line[2], gram, int(line[3])]
164 def dostopword(self, content) :
# Index the entries by word (self.formes) and lemma (self.lems), then
# mark stopwords: an existing adj/adv/ver/nom entry gets the '_sup'
# suffix appended to its category; a stopword absent from the dictionary
# is added with category 'sw' and count 0.
# Returns the sorted [word, lemma, category] triples.
165 for line in content :
166 self.formes[line[0]] = line
167 self.lems[line[1]] = line
168 for word in self.stw :
169 if word in self.formes :
# Trace each stopword hit before and after retagging (Python 2 print).
170 print word, self.formes[word]
171 if self.formes[word][2] in ['adj','adv','ver','nom'] :
172 self.formes[word][2] = self.formes[word][2] + '_sup'
173 print self.formes[word]
# (the 'else :' line is elided)
175 self.formes[word] = [word, word, 'sw', 0]
176 return sorted([[forme, self.formes[forme][1], self.formes[forme][2]] for forme in self.formes])
178 def printcontent(self, content, outfile) :
# Write the [word, lemma, category] triples to outfile, one
# tab-separated UTF-8 encoded line per entry.
179 with open(outfile, 'w') as f :
180 f.write('\n'.join(['\t'.join(line) for line in content]).encode('utf8'))
# --- Script entry: alternative pipeline drivers ---
# The commented-out lines run either the Wikipedia-dump pipeline (SAX parse
# of xmlfile through WikiPediaHandler) or the plain-text-directory pipeline.
185 #sparser = Parser('', encodage, treetagger, fileout)
186 #parser = xml.sax.make_parser()
187 #parser.setContentHandler(WikiPediaHandler(sparser))
188 #parser.parse(open(xmlfile,"r"))
189 ##Parser(txtdir, encodage, treetagger, fileout)
# Active step: post-process the already-tagged lexicon with the stopwords.
190 PostTreat(fileout, lexique, stopword)