2 # -*- coding: utf-8 -*-
3 #Author: Pierre Ratinaud
5 #a simple treetagger parser
10 from functions import ReadDicoAsDico
11 from itertools import izip, chain
14 #infile = '/home/pierre/prog/treetagger/cmd/allenglish.txt'
# Hard-coded, developer-specific paths. In the portion of the file reviewed
# here they are referenced only by commented-out exploratory code.
# Tagged text that the commented-out code opened with the latin1 codec.
infile = '/home/pierre/fac/cablegate/cablestagger.txt'
# Targets of the commented-out print_list(words / actives / supps, ...) calls.
word_list = '/home/pierre/fac/cablegate/liste_de_mots.csv'
actives_list = '/home/pierre/fac/cablegate/actives.csv'
supps_list = '/home/pierre/fac/cablegate/supplementaires.csv'
def prepare_for_treetagger(corpus, parent, fileout=None):
    """Mark expressions in the corpus, squeeze spaces and dump the text to a file.

    Going by the function name, the written file is presumably the input
    later handed to TreeTagger -- nothing in this function runs the tagger.

    corpus: object exposing ``parametre['lang']``, ``find_expression()`` and
        a ``content`` text attribute (rewritten in place).
    parent: object whose ``DictPath`` mapping holds the expression
        dictionaries under ``<lang>_exp`` keys.
    fileout: destination path; defaults to the historical hard-coded
        location so existing two-argument calls behave as before.
    """
    # Local import: no module-level ``import re`` is visible in the reviewed
    # part of the file, and a repeated import is harmless.
    import re

    if fileout is None:
        fileout = '/home/pierre/workspace/iramuteq/corpus/corpus_pour_tt.txt'
    lang = corpus.parametre['lang']
    # NOTE(review): 'french_exp' looks like a DictPath key rather than a file
    # path, so resolve it through DictPath first; the bare string is kept as
    # the last resort, which is what the previous code always returned.
    fallback = parent.DictPath.get('french_exp', 'french_exp')
    dico_path = parent.DictPath.get(lang + '_exp', fallback)
    expressions = ReadDicoAsDico(dico_path)
    corpus.find_expression(expressions)
    #corpus.content = re.sub(u'[-]+', ' ', corpus.content)
    # Collapse every run of spaces into a single space.
    corpus.content = re.sub(u'[ ]+', ' ', corpus.content)
    #FIXME: replace . with ' . '
    #corpus.quick_clean2()
    # NOTE(review): no explicit encoding is given; confirm which encoding
    # the consumer of this file expects before changing it.
    with open(fileout, 'w') as f:
        f.write(corpus.content)
def partition(alist, indices):
    """Split *alist* into consecutive slices at the given cut positions.

    alist: any sliceable sequence.
    indices: iterable of ascending cut positions.  It is copied into a list
        first, so a one-shot iterator is handled correctly (sharing it
        between the start and stop streams skipped every other cut).

    Returns a generator yielding ``alist[0:i0]``, ``alist[i0:i1]``, ...,
    ``alist[ilast:]``; with no cut positions the single slice ``alist[0:]``
    is produced.
    """
    cuts = list(indices)
    # Built-in zip exists on both Python 2 and 3, unlike itertools.izip.
    bounds = zip([0] + cuts, cuts + [None])
    return (alist[start:stop] for start, stop in bounds)
def partition_uci(uci):
    """Split one text unit at every token whose first field starts with '*'.

    uci: sequence of indexable tokens; only ``token[0]`` is inspected.

    Returns a generator of slices of *uci*: the tokens before the first
    starred token (possibly empty), then one slice per starred token running
    up to the next starred token or the end.

    NOTE(review): a few lines between the ``def`` and the index computation
    are not visible in the reviewed source; only the visible logic is kept.
    """
    indices = [i for i, forme in enumerate(uci) if forme[0].startswith(u'*')]
    bounds = zip([0] + indices, indices + [None])
    # Slice with i:j -- the former ``uci[i,j]`` indexed with a tuple and
    # raised TypeError on lists and tuples.
    return (uci[start:stop] for start, stop in bounds)
def dodict(inlist, indict):
    """Record in *indict* the positions at which each item of *inlist* occurs.

    inlist: iterable of sequences; each item is converted to a tuple to
        serve as a dictionary key.
    indict: dictionary updated in place, mapping ``tuple(item)`` to the
        ascending list of indices where that item appears.

    The body previously referred to an undefined name ``tot`` instead of the
    ``indict`` parameter, which raised NameError on the first item.
    """
    for position, forme in enumerate(inlist):
        indict.setdefault(tuple(forme), []).append(position)
# Tokens that treat_forme handles as punctuation (the empty token included).
# Built once instead of on every call.
_TREAT_FORME_PONCT = frozenset([
    u',', u'', u'``', u"''", u':', u'#', u')', '(', u'!', u'?', u';', u'-',
    u'.', u'...',
])


def treat_forme(forme, tformes):
    """Return the stored form key for one tagged token.

    forme: indexable token; item 0 is the word and item 1 the tag, of which
        only the part before the first ':' is used.
    tformes: mapping from ``(word, tag)`` to the form key.

    Raises KeyError when the ``(word, tag)`` pair is missing from *tformes*.

    NOTE(review): the body of the punctuation branch is not visible in the
    reviewed source; returning an empty string is an assumption -- confirm
    against the real file.
    """
    if forme[0] in _TREAT_FORME_PONCT:
        return u''
    tforme = (forme[0], forme[1].split(':')[0])
    return tformes[tforme]
def make_formes_and_lems(inlist):
    """Build the form, lemma and (word, tag) lookup tables from tagged tokens.

    NOTE(review): several short lines of this function are not visible in
    the reviewed source: the dictionary initialisations, the loop headers,
    the ``else`` branches and the definitions of ``word``, ``lem`` and
    ``tforme``.  They were reconstructed from the visible statements and
    from ``treat_forme``, which looks ``tformes`` up with
    ``(forme[0], forme[1].split(':')[0])`` -- confirm against the real file.

    inlist: iterable of indexable tokens; item 0 is the word, item 1 the tag
        (only the part before the first ':' is kept) and item 2 is assumed
        to be the lemma.

    Returns a tuple ``(formes, lems, tformes)``:
      formes: key -> ``[0, {}, tag, rank]`` where rank is the number of
          entries already stored when the key was added.
      lems: lemma -> list of the ``formes`` keys carrying that lemma.
      tformes: ``(word, tag)`` -> key in ``formes``.
    """
    unknown = u'<unknown>'
    formes = {}
    lems = {}
    formes_lems = {}
    tformes = {}
    for forme in inlist:
        word = forme[0]
        gram = forme[1].split(':')[0]
        lem = forme[2]
        tforme = (word, gram)
        if tforme in tformes:
            # Pair already registered: only upgrade an unknown lemma.
            key = tformes[tforme]
            if formes_lems[key] == unknown and lem != unknown:
                formes_lems[key] = lem
            continue
        # Same spelling under another tag: append '@' until the key is free.
        # The former extra test ``formes[nword] != gram`` compared a list
        # with a string, was always true, and is therefore dropped.
        key = word
        while key in formes:
            key = u'@'.join([key, ''])
        formes[key] = [0, {}, gram, len(formes)]
        tformes[tforme] = key
        formes_lems[key] = lem
    for forme in formes:
        # A form whose lemma is still unknown becomes its own lemma.
        if formes_lems[forme] == unknown:
            formes_lems[forme] = forme
        lems.setdefault(formes_lems[forme], []).append(forme)
    return formes, lems, tformes
def make_ucis_txt_formes_from_tt(corpus, tformes):
    """Turn every tagged text unit of the corpus into a space-joined string.

    corpus: object whose ``ucis_txt`` attribute is an iterable of token
        sequences.
    tformes: lookup table handed unchanged to ``treat_forme``.

    Returns one string per text unit, made of the ``treat_forme`` result of
    each token separated by single spaces.
    """
    converted = []
    for uci in corpus.ucis_txt:
        converted.append([treat_forme(forme, tformes) for forme in uci])
    return [' '.join(keys) for keys in converted]
def get_ucis_from_tt(corpus):
    """Parse tab-separated tagged text held in ``corpus.content``.

    Each line is split on tabs.  A token equal to '****' opens a new text
    unit; within a unit, everything up to and including the last token that
    starts with '*' is stored as metadata and the rest as text.

    Side effects on *corpus*:
      ucis: list of ``[[starred tokens...], '']``, one entry per text unit.
      ucis_txt: list of the remaining token tuples of each text unit.
      formes: form table returned by ``make_formes_and_lems``.

    Returns the list of strings built by ``make_ucis_txt_formes_from_tt``.

    NOTE(review): a few short lines of this function are not visible in the
    reviewed source; only the visible logic is kept.  ``lems`` is computed
    but not used by any visible statement.
    """
    content_split = [tuple(line.split('\t')) for line in corpus.content.splitlines()]
    #print [i for i, line in enumerate(content_split) if line[0] == u'****']
    ponct = frozenset([u',', u'', u'``', u"''", u':', u'#', u')', '(', u'!', u'?', u';', u'-', u'.', u'...'])
    lformes = [forme for forme in set(content_split)
               if not forme[0].startswith(u'*') and forme[0] not in ponct]
    formes, lems, tformes = make_formes_and_lems(lformes)
    marks = [i for i, line in enumerate(content_split) if line[0] == u'****']
    ucis = []
    cuts = []
    for uci in partition(content_split, marks):
        # partition() yields an empty leading piece when the content starts
        # with '****'; it used to make max() fail on an empty list.
        if not uci:
            continue
        stars = [i for i, forme in enumerate(uci) if forme[0].startswith(u'*')]
        # Indices are ascending, so the last one is the maximum.  A piece
        # without any starred token keeps all its tokens as text (cut == -1).
        cuts.append(stars[-1] if stars else -1)
        ucis.append(uci)
    corpus.ucis = [[[et[0] for et in uci[:cut + 1]], ''] for uci, cut in zip(ucis, cuts)]
    corpus.ucis_txt = [uci[cut + 1:] for uci, cut in zip(ucis, cuts)]
    corpus.formes = formes
    return make_ucis_txt_formes_from_tt(corpus, tformes)
121 #with codecs.open(infile, 'r', 'latin1') as f :
122 # #content = [line.split('\t') for line in f]
124 #print time.time() - t1
126 #c1 = content.splitlines()
128 #c1s = [val.split('\t') for val in c1]
132 #sc1 = [val.split('\t') for val in sc1]
135 #formes = [val for val in sc1 if not val[0].isdigit()]
138 #sformes = [val[0] for val in formes]
143 #def make_dicts(inlist) :
146 # for i, forme in enumerate(inlist) :
147 # if tuple(forme) in tot :
148 # tot[tuple(forme)].append(i)
150 # tot[tuple(forme)] = [i]
151 # if forme[1] in totgram :
152 # totgram[forme[1]] += 1
154 # totgram[forme[1]] = 1
155 # return tot, totgram
156 #tot, totgram = make_dicts(c1s)
157 #print 'dico', time.time() - t4
160 # key_file = '/home/pierre/fac/cablegate/keys_english.txt'
161 # with open(key_file, 'r') as f :
163 # keys = keys.splitlines()
164 # keys = [line.split('\t') for line in keys]
169 #kact = [key[0] for key in keys if key[2] == '1']
170 #ksup = [key[0] for key in keys if key[2] == '2']
172 #actives = [[len(tot[forme]), forme[0], forme[1], forme[2]] for forme in tot if forme[1] in kact and len(tot[forme]) > 3]
174 #supps = [[len(tot[forme]), forme[0], forme[1], forme[2]] for forme in tot if forme[1] in ksup and len(tot[forme]) > 3]
177 #words = [[len(tot[word]), word[0], word[1], word[2]] for word in tot]
179 ##hapax = [word for word in words if word[3] == 1]
181 #def print_list(thelist, fileout) :
182 # with open(fileout, 'w') as f :
183 # f.write('\n'.join(['\t'.join(['\t'.join(list(line[1:])), `line[0]`]) for line in thelist]).encode('latin1'))
184 #print_list(words, word_list)
185 #print_list(actives, actives_list)
186 #print_list(supps, supps_list)
187 #print time.time() - t4
191 # return [val.tolist() for val in numpy.split(numpy.array(c1s),[i for i, line in enumerate(c1s) if line[0] == u'****'])]
193 ##def make_ucil(c1s) :
194 #from itertools import izip, chain
195 #def partition(alist, indices):
196 # pairs = izip(chain([0], indices), chain(indices, [None]))
197 # return (alist[i:j] for i, j in pairs)
199 #def partition_uci(uci) :
203 # indices = [i for i, forme in enumerate(uci) if forme[0].startswith(u'*')]
204 # pairs = izip(chain([0], indices), chain(indices, [None]))
205 # return (uci[i,j] for i, j in pairs)
207 ##ucis = make_uci(c1s)
209 #ucis = partition(c1s, [i for i, line in enumerate(c1s) if line[0] == u'****'])
210 #print time.time() - t2
215 ## print max([i for i, forme in enumerate(uci) if forme[0].startswith(u'*')])
217 #ucis = [uci for uci in ucis]
219 #indices_max_et = [max([i for i, forme in enumerate(uci) if forme[0].startswith(u'*')]) for uci in ucis]
220 ##ucis2 = [partition_uci(uci) for uci in ucis]
222 #print len(indices_max_et)
225 #etoiles = [uci[:indices_max_et[i] + 1] for i, uci in enumerate(ucis)]
226 #ucis = [uci[indices_max_et[i] + 1:] for i, uci in enumerate(ucis)]
231 ##ucis = [[val for val in uci if val[1] != 'PUN'] for uci in ucis]
232 #ind_sent = [[i for i, forme in enumerate(uci) if forme[1] == 'SENT'] for uci in ucis]
233 #print time.time() - t3
235 #print len(ucis), len(ind_sent)
236 ##inuformes = [i for i,forme in enumerate(sformes) if sformes.count(forme) > 1]
237 ##inuformes = [formes[i] for i, forme in enumerate(sformes) if forme in sformes[i+1:]]
238 ##nonunique = [forme for forme in formes if formes.count(forme) > 1]
240 #split_sents = [[partition(uci, ind_sent[i])] for i, uci in enumerate(ucis)]
241 #PUNCT = [u',', u'', u'``', u"''", u':', u'#', u')', '(', u'!', u'?', u';', u'-', u'.']
242 #split_sents = [[val for sent in uci for val in sent] for uci in split_sents]
243 #split_sents = [[val for sent in uci for val in sent if val[0] not in PUNCT and not val[0].isdigit()] for uci in split_sents]
244 ##for i in range(0,1) :
245 ## for sent in split_sents[i] :
248 ##nuformes = [formes[i] for i in inuformes]