#!/bin/env python
# -*- coding: utf-8 -*-
#Author: Pierre Ratinaud
#Copyright (c) 2014, Pierre Ratinaud
#License: GNU GPL
"""Turn a CSV export of tweets into a plain-text corpus file.

The CSV must have a header row containing the columns ``created_at`` and
``text``.  Each tweet that is kept is written as a metadata line
(``**** *date_<first token of created_at>``) followed by the cleaned text.

``ParseDMI`` does the conversion; ``ImportDMI`` is the wx front end that
asks for the input/output paths and offers to open the result.

This module is Python 2 code (``cStringIO``, ``unicode``, ``.next()``
iterators, ``str.decode``).
"""

import csv
import codecs
import cStringIO
import itertools
from parse_factiva_xml import PrefImport
import wx
import os
from functions import BugReport

# Stand-alone use, outside the GUI:
#     ParseDMI('/path/to/export.csv', '/path/to/corpus.txt', 'utf8')


class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        # Wrap the byte stream in a decoding reader for ``encoding``.
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: one decoded line, re-encoded as UTF-8.
        return self.reader.next().encode("utf-8")


class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.

    Every row is returned as a list of ``unicode`` objects.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # The Python 2 csv module works on byte strings, so feed it UTF-8.
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row of ``unicode`` cells to the target stream."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` with ``writerow``."""
        for row in rows:
            self.writerow(row)


class ParseDMI:
    """Convert a tweet CSV file into a corpus text file.

    The whole conversion runs in the constructor; the output file is closed
    before the constructor returns, whether or not an error occurred.
    """

    def __init__(self, filein, fileout, encodeout='utf8', onlyrt=True,
                 cleanurl=True, cleanRT=True, cleanAt=True):
        """Read ``filein`` and write the corpus to ``fileout``.

        filein    -- path of the CSV file; its first row must name the
                     columns ``created_at`` and ``text``.
        fileout   -- path of the text file to create (overwritten).
        encodeout -- encoding used for the output file.
        onlyrt    -- when true, tweets detected as retweets (see ``isRT``)
                     are NOT written; when false every tweet is written.
                     NOTE(review): the name suggests the opposite; the
                     behaviour is kept as the original code implements it.
        cleanurl  -- drop words starting with ``http``.
        cleanRT   -- drop each ``RT`` word and the word that follows it.
        cleanAt   -- drop words starting with ``@``.

        Raises ValueError if a required column is missing from the header.
        """
        self.encodeout = encodeout
        self.outf = open(fileout, 'w')
        try:
            with open(filein, 'rb') as f:
                reader = UnicodeReader(f)
                for linenb, row in enumerate(reader):
                    if linenb == 0:
                        # Header row: locate the columns we need.
                        first = row
                        create_dateid = first.index('created_at')
                        textid = first.index('text')
                        # Trace of the detected header, as in the original.
                        print(first)
                        continue
                    if not row:
                        # Blank CSV lines yield empty rows; nothing to do.
                        continue
                    text = self.washtweet(row[textid])
                    # Retweet detection must happen before cleanRT removes
                    # the leading "RT" marker.
                    isrt = self.isRT(text)
                    if cleanurl:
                        text = self.cleanurl(text)
                    if cleanRT:
                        text = self.cleanRT(text)
                    if cleanAt:
                        text = self.cleanAt(text)
                    meta = self.makemetadata(row, {'date': create_dateid})
                    if not (onlyrt and isrt):
                        self.write_tweet(meta, text)
        finally:
            # Make sure the corpus is flushed to disk before anyone reads it.
            self.outf.close()

    def write_tweet(self, meta, text):
        """Append one tweet: metadata line, text line, then a newline."""
        self.outf.write('\n'.join([meta, text, '']).encode(self.encodeout))

    def makemetadata(self, row, parametres=None):
        """Build the ``****`` metadata line for ``row``.

        ``parametres`` maps a variable name to a column index in ``row``.
        The ``'date'`` key produces ``*date_<first whitespace-separated
        token of the cell>``; any other key produces ``<key>_<cell>``
        unchanged (the caller supplies any ``*`` prefix itself).
        """
        line = [u'****']
        parametres = parametres or {}
        for val in parametres:
            if val == 'date':
                line.append('_'.join([u'*date', row[parametres[val]].split()[0]]))
            else:
                line.append('_'.join([val, row[parametres[val]]]))
        return ' '.join(line)

    def washtweet(self, text):
        """Normalise quotes, blank out ``*`` and strip surrounding space.

        ``*`` is replaced by a space -- presumably because it marks
        metadata in the output format (see ``makemetadata``).
        """
        text = text.replace(u'RT“', u'RT ')
        text = text.replace(u'*', ' ')
        for val in u'”«»“"':
            text = text.replace(val, ' " ')
        # The original called text.strip() and discarded the result.
        return text.strip()

    def isRT(self, tweet):
        """Return True when ``tweet`` starts with ``'RT '``."""
        return tweet.startswith('RT ')

    def cleanurl(self, tweet):
        """Remove every whitespace-separated word starting with ``http``."""
        return ' '.join([word for word in tweet.split() if not word.startswith('http')])

    def cleanAt(self, tweet):
        """Remove every whitespace-separated word starting with ``@``."""
        return ' '.join([word for word in tweet.split() if not word.startswith('@')])

    def cleanRT(self, text):
        """Remove each ``RT`` word together with the word following it.

        An ``RT`` that is the very last word is left in place.  Indices to
        drop are collected in a set: the original tested membership on a
        one-shot iterator, which was exhausted by the first miss, so only
        a leading ``RT`` was ever removed.
        """
        tweet = text.split()
        last = len(tweet) - 1
        tovire = set()
        for i, word in enumerate(tweet):
            if word == 'RT' and i != last:
                tovire.update((i, i + 1))
        return ' '.join([word for i, word in enumerate(tweet) if i not in tovire])


class ImportDMI:
    """wx front end: ask for the files, run ``ParseDMI``, offer to open."""

    def __init__(self, parent, parametres):
        """Store the arguments and immediately run ``parse``.

        parent     -- used as parent window for the dialogs; its
                      ``filename`` attribute and ``OpenText`` method are
                      used when the user chooses to open the new corpus.
        parametres -- stored on the instance; not otherwise used here.
        """
        self.ira = parent
        self.parametres = parametres
        self.parse()

    def parse(self):
        """Show the import dialog and convert the selected CSV file.

        Any exception raised during conversion or while opening the
        corpus is handed to ``BugReport``.
        """
        self.dial = PrefImport(self.ira, methode='dmi')
        val = self.dial.ShowModal()
        if val != wx.ID_OK:
            self.dial.Destroy()
            return
        csvfile = self.dial.dbb.GetValue()
        corp_out = self.dial.fbb.GetValue()
        self.dial.Destroy()
        busy = wx.BusyInfo(_("Please wait...").decode('utf8'))
        wx.SafeYield()
        try:
            ParseDMI(csvfile, corp_out, 'utf8')
            # Dropping the reference dismisses the busy box.  Rebinding
            # (instead of ``del``) keeps the name defined, so the handler
            # below cannot fail with UnboundLocalError.
            busy = None
            msg = '\n'.join([_(u"Corpus created :").decode('utf8'), corp_out, _(u"Do you want to open it in IRaMuTeQ ?").decode('utf8')])
            dlg = wx.MessageDialog(self.ira, msg, _(u'Information').decode('utf8'), wx.YES_NO | wx.ICON_INFORMATION | wx.STAY_ON_TOP)
            dlg.CenterOnParent()
            val = dlg.ShowModal()
            dlg.Destroy()
            if val == wx.ID_YES:
                self.ira.filename = os.path.abspath(corp_out)
                self.ira.OpenText()
        except Exception:
            busy = None
            BugReport(self.ira)