1 # -*- coding: utf-8 -*-
2 #Author: Pierre Ratinaud
5 #------------------------------------
6 # import des modules python
7 #------------------------------------
12 from subprocess import *
14 from xml.dom import minidom, Node
# --- Hard-coded configuration: absolute paths used by the pipeline below ---
# Directory of Italian plain-text files to tag.
17 txtdir = '/home/pierre/workspace/iramuteq/dev/langues/italian'
# TreeTagger command script for Italian (UTF-8 parameter file).
19 treetagger = '/home/pierre/prog/treetagger/cmd/tree-tagger-italian-utf8'
# Intermediate tagged dictionary (written by treat_formes, read by PostTreat).
20 fileout = '/home/pierre/workspace/iramuteq/dev/langues/lexique_it_t1.txt'
# Italian stopword list (one word per line, UTF-8) consumed by PostTreat.
21 stopword = '/home/pierre/workspace/iramuteq/dev/langues/IT_NEW_stopwords_utf8.txt'
# Final lexicon produced by PostTreat.
22 lexique = '/home/pierre/workspace/iramuteq/dev/langues/lexique_it.txt'
# Wikipedia XML dump, parsed via SAX in the commented-out driver at file end.
23 xmlfile = '/home/pierre/workspace/iramuteq/dev/langues/itwiki-latest-pages-articles.xml'
28 class WikiPediaHandler(xml.sax.ContentHandler):
# SAX content handler that accumulates character data from a Wikipedia XML
# dump and periodically hands the buffered text to a Parser (self.sparser)
# for TreeTagger processing.
# NOTE(review): this excerpt is elided (the embedded original line numbers
# jump), so the initialisation of self.totreat, self.last and self.tottitle,
# and the element-name tests, happen in lines not shown here.
29 def __init__(self, sparser) :
# Keep a reference to the Parser that does the actual tagging work.
35 self.sparser = sparser
37 def startElement(self, name, attrs):
# Flush the current forme dictionary to disk and remember its size so the
# growth per batch can be measured below -- presumably triggered on a
# specific element name in an elided condition; TODO confirm.
39 self.sparser.treat_formes()
40 self.last = len(self.sparser.formes)
# Once enough raw text fragments have been buffered, tag them in one batch.
44 if len(self.totreat) > 100000 :
45 self.diff = len(self.sparser.formes) - self.last
46 self.sparser.doparsewiki(' '.join(self.totreat))
# Progress report (Python 2 print statement).
48 print 'titres :', self.tottitle
54 # for item in attrs.items():
56 def characters(self, content) :
# Buffer character data for the next tagging batch.
58 self.totreat.append(content)
61 def __init__(self, txtdir, encodage, treetagger, fileout) :
# Store run configuration for the tagging pipeline.
# NOTE(review): the enclosing class header (presumably the Parser used by
# WikiPediaHandler) and further assignments (e.g. self.txtdir, self.formes)
# are in elided lines of this excerpt.
63 self.encodage = encodage
66 self.fileout = fileout
68 #self.treat_formes(fileout)
70 def clean(self, txt) :
# Normalise raw text before feeding it to TreeTagger: keep only
# letters/digits/accented characters/basic punctuation, unify the
# typographic apostrophe, split words on ' and -, and surround sentence
# punctuation with spaces so the tagger sees each mark as its own token.
72 keep_caract = u"a-zA-Z0-9àÀâÂäÄáÁéÉèÈêÊëËìÌîÎïÏòÒôÔöÖùÙûÛüÜçÇß’ñ.:,;!?\n*'_-"
73 list_keep = u"[^" + keep_caract + "]+"
74 txt = re.sub(list_keep, ' ', txt)
75 txt = txt.replace(u'’',u"'")
76 txt = txt.replace(u'\'',u' ').replace(u'-', u' ')
77 txt = txt.replace(u'?',u' ? ').replace(u'.',u' . ').replace(u'!', u' ! ').replace(u',',u' , ').replace(u';', u' ; ').replace(u':',u' : ')
# Collapse all runs of whitespace (including newlines) to single spaces.
78 txt = ' '.join(txt.split())
# NOTE(review): the 'return txt' presumably follows in an elided line --
# callers (doparsewiki and the directory loop) use the result.
81 def update_dict(self, tmpfile) :
# Merge TreeTagger output (one 'word<TAB>POS<TAB>lemma' line per token)
# into self.formes: key = (word, POS), value = [count, lemma].
82 with codecs.open(tmpfile, 'r', 'utf8') as f :
# NOTE(review): the line reading f into 'content' is elided here.
84 content = [line.split('\t') for line in content.splitlines()]
85 for forme in content :
# Skip unknown lemmas, punctuation/symbol/sentence-end tags,
# and cardinal numbers (TreeTagger tags them '@card@').
86 if (forme[2] == u'<unknown>') or (forme[1] in [u'PON', u'<unknown>', u'SYM', u'SENT']) or (forme[1]==u'NUM' and forme[2]==u'@card@') :
# (body of this branch -- presumably 'pass' -- is elided)
88 elif (forme[0], forme[1]) in self.formes :
89 self.formes[(forme[0], forme[1])][0] += 1
# (the 'else :' line is elided)
91 self.formes[(forme[0], forme[1])] = [1, forme[2]]
# Progress: number of distinct (word, POS) pairs so far (Python 2 print).
92 print len(self.formes)
94 def treat_formes(self) :
# Collapse self.formes ((word, POS) -> [count, lemma]) to one entry per
# word, keeping the most frequent POS, then write the result to
# self.fileout as sorted 'word<TAB>POS<TAB>lemma<TAB>count' lines.
# NOTE(review): the initialisation of 'nformes' (presumably {}) is elided.
97 for forme in self.formes :
98 if forme[0] in nformes :
# Keep the POS reading with the highest count for this word.
99 if self.formes[forme][0] > nformes[forme[0]][0] :
100 nformes[forme[0]] = [self.formes[forme][0], forme[1], self.formes[forme][1]]
# (the 'else :' line is elided)
102 nformes[forme[0]] = [self.formes[forme][0], forme[1], self.formes[forme][1]]
103 with open(self.fileout, 'w') as f :
# Backticks are the Python 2 repr() shorthand: they stringify the count.
104 toprint = [[forme, nformes[forme][1], nformes[forme][2], `nformes[forme][0]`] for forme in nformes]
105 toprint = sorted(toprint)
106 toprint = '\n'.join(['\t'.join(line) for line in toprint])
107 f.write(toprint.encode('utf8'))
110 def doparsewiki(self, content) :
# Tag one batch of wiki text: clean it, dump it to /tmp/tmptxt, pipe it
# through TreeTagger into /tmp/tttmp, then merge the tags into self.formes.
111 content = self.clean(content)
112 with open('/tmp/tmptxt', 'w') as f :
113 f.write(content.encode('utf8'))
# 'cat' feeds the temp file into TreeTagger's stdin via a pipe.
114 p1 = Popen(['cat', '/tmp/tmptxt'], stdout = PIPE)
115 with open('/tmp/tttmp', 'w') as f :
116 p2 = Popen([treetagger], stdin = p1.stdout, stdout = f)
# Wait for TreeTagger to finish before reading its output file.
117 out = p2.communicate()
118 self.update_dict('/tmp/tttmp')
# NOTE(review): the 'def' line of this method is elided from the excerpt;
# the body below tags every file of self.txtdir exactly the way
# doparsewiki() tags a wiki batch.
121 files = os.listdir(self.txtdir)
# (the loop header -- presumably 'for fpath in files :' -- is elided)
123 fpath = os.path.join(self.txtdir, fpath)
125 with codecs.open(fpath, 'r', self.encodage) as f :
# (the read of f into 'content' is elided)
127 content = self.clean(content)
128 with open('/tmp/tmptxt', 'w') as f :
129 f.write(content.encode('utf8'))
130 p1 = Popen(['cat', '/tmp/tmptxt'], stdout = PIPE)
131 with open('/tmp/tttmp', 'w') as f :
132 p2 = Popen([treetagger], stdin = p1.stdout, stdout = f)
133 out = p2.communicate()
134 self.update_dict('/tmp/tttmp')
138 def __init__(self, infile, outfile, stopw = None) :
# Post-process the dictionary written by treat_formes(): read infile
# ('word<TAB>POS<TAB>lemma<TAB>count' lines), drop hapaxes (count == '1'),
# normalise the POS tag via treatline(), optionally apply a stopword
# list, and write the result to outfile.
# NOTE(review): the enclosing class header (presumably 'class PostTreat')
# and the initialisation of self.dictg / self.formes / self.lems are in
# elided lines of this excerpt.
140 with codecs.open(infile, 'r', 'utf8') as f :
# (the read of f into 'content' is elided)
142 content = [line.split('\t') for line in content.splitlines()]
143 content = [self.treatline(line) for line in content if line[3] != '1']
146 if stopw is not None :
147 with codecs.open(stopw, 'r', 'utf8') as f :
# (the read of f into 'stw' is elided)
149 self.stw = stw.splitlines()
150 content = self.dostopword(content)
151 self.printcontent(content, outfile)
# NOTE(review): the two lines below may belong to a separate elided method;
# they re-tally the grammatical categories recorded in self.formes.
153 for forme in self.formes :
154 self.dictg[self.formes[forme][2]] = self.dictg.get(self.formes[forme][2],0) + 1
159 def treatline(self, line) :
# Normalise one dictionary line [word, POS, lemma, count]: keep only the
# lowercased POS prefix before ':' (e.g. 'VER:fin' -> 'ver'), tally it
# in self.dictg, and return [word, lemma, gram, int(count)].
160 gram = line[1].split(u':')[0].lower()
161 self.dictg[gram] = self.dictg.get(gram, 0) + 1
162 return [line[0], line[2], gram, int(line[3])]
164 def dostopword(self, content) :
# Index the entries by word (self.formes) and lemma (self.lems), then
# mark stopwords: an existing adj/adv/ver/nom entry gets the '_sup'
# suffix appended to its category; a stopword absent from the dictionary
# is added with category 'sw' and count 0.
# Returns the sorted [word, lemma, category] triples.
165 for line in content :
166 self.formes[line[0]] = line
167 self.lems[line[1]] = line
168 for word in self.stw :
169 if word in self.formes :
# Trace each stopword hit before and after retagging (Python 2 print).
170 print word, self.formes[word]
171 if self.formes[word][2] in ['adj','adv','ver','nom'] :
172 self.formes[word][2] = self.formes[word][2] + '_sup'
173 print self.formes[word]
# (the 'else :' line is elided)
175 self.formes[word] = [word, word, 'sw', 0]
176 return sorted([[forme, self.formes[forme][1], self.formes[forme][2]] for forme in self.formes])
178 def printcontent(self, content, outfile) :
# Write the [word, lemma, category] triples to outfile, one
# tab-separated UTF-8 encoded line per entry.
179 with open(outfile, 'w') as f :
180 f.write('\n'.join(['\t'.join(line) for line in content]).encode('utf8'))
# --- Script entry: alternative pipeline drivers ---
# The commented-out lines run either the Wikipedia-dump pipeline (SAX parse
# of xmlfile through WikiPediaHandler) or the plain-text-directory pipeline.
185 #sparser = Parser('', encodage, treetagger, fileout)
186 #parser = xml.sax.make_parser()
187 #parser.setContentHandler(WikiPediaHandler(sparser))
188 #parser.parse(open(xmlfile,"r"))
189 ##Parser(txtdir, encodage, treetagger, fileout)
# Active step: post-process the already-tagged lexicon with the stopwords.
190 PostTreat(fileout, lexique, stopword)