From: Pierre Ratinaud Date: Sun, 4 Jan 2015 14:41:29 +0000 (+0100) Subject: dmi X-Git-Url: http://iramuteq.org/git?p=iramuteq;a=commitdiff_plain;h=54d82dcc766acdc38be4b6c16831d5856ceb9d1f;ds=sidebyside dmi --- diff --git a/images/twitter.png b/images/twitter.png new file mode 100644 index 0000000..145b7c9 Binary files /dev/null and b/images/twitter.png differ diff --git a/iramuteq.py b/iramuteq.py index 7060820..fc77952 100644 --- a/iramuteq.py +++ b/iramuteq.py @@ -56,6 +56,7 @@ from corpus import Builder, SubBuilder from checkinstall import CreateIraDirectory, CheckRPath, FindRPAthWin32, FindRPathNix, CheckRPackages, IsNew, UpgradeConf, CopyConf, RLibsAreInstalled from chemins import RscriptsPath, ConstructConfigPath, ConstructDicoPath, ConstructGlobalPath, PathOut from parse_factiva_xml import ImportFactiva +from parse_dmi import ImportDMI from tools import Extract from tree import LeftTree @@ -97,6 +98,7 @@ ID_Fact_xml = wx.NewId() ID_Fact_mail = wx.NewId() ID_Fact_copy = wx.NewId() ID_exportmeta = wx.NewId() +ID_importdmi = wx.NewId() ########################################################## #elements de configuration ########################################################## @@ -195,6 +197,7 @@ images_analyses = { 'subcorpusthema' : 'subcorpusthema.png', 'preferences' : 'preferences.png', 'exportmetatable' : 'exportmetatable.png', + 'importdmi' : 'twitter.png' } ##################################################################### @@ -264,6 +267,10 @@ class IraFrame(wx.Frame): item.SetBitmap(self.images_analyses['europress']) file_menu.AppendItem(item) + item = wx.MenuItem(file_menu, ID_importdmi, _(u"Import from DMI-TCAT (exp.)").decode('utf8'), _(u"Import from DMI-TCAT (exp.)").decode('utf8')) + item.SetBitmap(self.images_analyses['importdmi']) + file_menu.AppendItem(item) + menuFactiva = wx.Menu() fact_from_xml = wx.MenuItem(menuFactiva, ID_Fact_xml, _(u"from xml").decode('utf8')) fact_from_xml.SetBitmap(self.images_analyses['factiva_xml']) @@ -441,6 
+448,8 @@ class IraFrame(wx.Frame): tb1.AddSeparator() tb1.AddLabelTool(ID_ImportEuro, "ImportEuro", self.images_analyses['europress'], shortHelp= _(u"Import from Europress").decode('utf8'), longHelp=_(u"Import from Europress").decode('utf8')) tb1.AddSeparator() + tb1.AddLabelTool(ID_importdmi, "ImportDMI", self.images_analyses['importdmi'], shortHelp= _(u"Import from DMI-TCAT (exp.)").decode('utf8'), longHelp=_(u"Import from DMI-TCAT (exp.)").decode('utf8')) + tb1.AddSeparator() tb1.AddLabelTool(ID_Fact_xml, "ImportFactxml", self.images_analyses['factiva_xml'], shortHelp= _(u"Factiva from xml").decode('utf8'), longHelp=_(u"Factiva from xml").decode('utf8')) tb1.AddLabelTool(ID_Fact_mail, "ImportFactmail", self.images_analyses['factiva_mail'], shortHelp= _(u"Factiva from mail").decode('utf8'), longHelp=_(u"Factiva from mail").decode('utf8')) tb1.AddLabelTool(ID_Fact_copy, "ImportFactcopy", self.images_analyses['factiva_copy'], shortHelp= _(u"Factiva from copy/paste").decode('utf8'), longHelp=_(u"Factiva from copy/paste").decode('utf8')) @@ -593,6 +602,7 @@ class IraFrame(wx.Frame): self.Bind(wx.EVT_MENU, self.OnPref, id=wx.ID_PREFERENCES) self.Bind(wx.EVT_MENU, self.OnImportTXM, id=ID_ImportTXM) self.Bind(wx.EVT_MENU, self.OnImportEuropress, id=ID_ImportEuro) + self.Bind(wx.EVT_MENU, self.OnImportDMI, id=ID_importdmi) self.Bind(wx.EVT_MENU, self.OnExportMeta, id=ID_exportmeta) self.Bind(wx.EVT_CLOSE, self.OnClose) ################################################################## @@ -1097,6 +1107,9 @@ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, États-Unis.""" except : BugReport(self) + def OnImportDMI(self, evt): + ImportDMI(self, {}) + def OnExportMeta(self, evt, corpus = None): if corpus is None : corpus = self.tree.getcorpus() diff --git a/parse_dmi.py b/parse_dmi.py new file mode 100644 index 0000000..5c74020 --- /dev/null +++ b/parse_dmi.py @@ -0,0 +1,179 @@ +#!/bin/env python +# -*- coding: utf-8 -*- +#Author: Pierre Ratinaud +#Copyright (c) 
#Copyright (c) 2014, Pierre Ratinaud
#License: GNU GPL

import csv, codecs, cStringIO
import itertools
from parse_factiva_xml import PrefImport
import wx
import os
from functions import BugReport

#filein = '/home/pierre/workspace/iramuteq/dev/dmi-tcat/travail_dmi.csv'
#fileout = '/home/pierre/workspace/iramuteq/dev/dmi-tcat/corpus.txt'


class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        # Wrap the byte stream in a decoding reader for the source encoding.
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: hand back the next line as UTF-8 bytes.
        return self.reader.next().encode("utf-8")


class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.

    Each row is returned as a list of unicode strings.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # The csv module is fed UTF-8 bytes, whatever the encoding of "f".
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row (an iterable of unicode strings) to the stream."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of "rows" with writerow()."""
        for row in rows:
            self.writerow(row)


class ParseDMI:
    """
    Convert a CSV export of tweets into a text corpus.

    The first CSV row is the header and must contain the columns
    'created_at' and 'text'.  For every kept tweet, a metadata line
    ("**** *date_<first token of created_at>") followed by the cleaned
    tweet text is written to "fileout".  The whole conversion runs in the
    constructor.

    Parameters:
        filein    -- path of the CSV file to read (read as UTF-8)
        fileout   -- path of the corpus file to write
        encodeout -- encoding of the corpus file
        onlyrt    -- despite its name: when true, tweets whose washed text
                     starts with 'RT ' are skipped; when false every tweet
                     is written
        cleanurl  -- remove the words starting with 'http'
        cleanRT   -- remove each 'RT' marker and the word that follows it
        cleanAt   -- remove the words starting with '@'

    Raises ValueError when a required column is missing from the header.
    """

    def __init__(self, filein, fileout, encodeout ='utf8', onlyrt = True, cleanurl = True, cleanRT = True, cleanAt = True):
        self.encodeout = encodeout
        self.outf = open(fileout, 'w')
        try:
            with open(filein, 'rb') as f:
                for linenb, row in enumerate(UnicodeReader(f)):
                    if linenb == 0:
                        # Header row: locate the columns we need.
                        create_dateid = row.index('created_at')
                        textid = row.index('text')
                        continue
                    if not row:
                        # csv yields [] for a blank line; nothing to import.
                        continue
                    text = self.washtweet(row[textid])
                    # Retweet detection must happen before 'RT' is cleaned.
                    isrt = self.isRT(text)
                    if cleanurl:
                        text = self.cleanurl(text)
                    if cleanRT:
                        text = self.cleanRT(text)
                    if cleanAt:
                        text = self.cleanAt(text)
                    meta = self.makemetadata(row, {'date' : create_dateid})
                    if not (onlyrt and isrt):
                        self.write_tweet(meta, text)
        finally:
            # Always flush and release the corpus file: the caller may open
            # it right after the conversion.
            self.outf.close()

    def write_tweet(self, meta, text):
        """Write the metadata line and the tweet text, one per line."""
        self.outf.write('\n'.join([meta, text, '']).encode(self.encodeout))

    def makemetadata(self, row, parametres = None):
        """
        Build the metadata line of one tweet.

        "parametres" maps a variable name to the index of its column in
        "row".  The 'date' variable is written as "*date_<value>" using
        only the first whitespace-separated token of the column; any other
        variable is written as "<name>_<value>".
        NOTE(review): no '*' is prepended for variables other than 'date';
        confirm whether callers are expected to include it in the name.
        """
        if parametres is None:
            parametres = {}
        line = [u'****']
        for val in parametres:
            if val == 'date':
                line.append('_'.join([u'*date', row[parametres[val]].split()[0]]))
            else:
                line.append('_'.join([val, row[parametres[val]]]))
        return ' '.join(line)

    def washtweet(self, text):
        """
        Normalise a raw tweet: separate 'RT' from an opening curly quote,
        replace '*' with a space, turn every kind of double quote into a
        spaced straight quote, and strip surrounding whitespace.
        """
        text = text.replace(u'RT“', u'RT ')
        text = text.replace(u'*', ' ')
        for val in u'”«»“"':
            text = text.replace(val, ' " ')
        # str.strip() returns a new string; the result has to be kept.
        return text.strip()

    def isRT(self, tweet):
        """Return True when the tweet starts with the 'RT ' marker."""
        return tweet.startswith('RT ')

    def cleanurl(self, tweet):
        """Return the tweet without the words starting with 'http'."""
        return ' '.join([word for word in tweet.split() if not word.startswith('http')])

    def cleanAt(self, tweet):
        """Return the tweet without the words starting with '@'."""
        return ' '.join([word for word in tweet.split() if not word.startswith('@')])

    def cleanRT(self, text):
        """
        Return the text without any 'RT' word and the word following it.

        An 'RT' that is the last word of the text is left untouched.
        """
        tweet = text.split()
        last = len(tweet) - 1
        # A real set is required here: a membership test on a bare
        # itertools.chain iterator consumes it, so only the first lookups
        # would see the indexes to drop.
        tovire = set(itertools.chain.from_iterable(
            (i, i + 1) for i, word in enumerate(tweet) if word == 'RT' and i != last))
        return ' '.join([word for i, word in enumerate(tweet) if i not in tovire])


class ImportDMI:
    """
    Import dialog workflow: ask for a CSV file and an output corpus path,
    convert the CSV with ParseDMI and offer to open the resulting corpus.

    "parent" is stored as self.ira and used as parent window of the dialogs;
    "parametres" is stored unchanged.  The workflow starts from the
    constructor.
    """

    def __init__(self, parent, parametres):
        self.ira = parent
        self.parametres = parametres
        self.parse()

    def parse(self):
        """Run the import; any error is reported through BugReport."""
        self.dial = PrefImport(self.ira, methode='dmi')
        accepted = self.dial.ShowModal() == wx.ID_OK
        if accepted:
            csvfile = self.dial.dbb.GetValue()
            corp_out = self.dial.fbb.GetValue()
        self.dial.Destroy()
        if not accepted:
            return
        busy = wx.BusyInfo(_("Please wait...").decode('utf8'))
        wx.SafeYield()
        try:
            ParseDMI(csvfile, corp_out, 'utf8')
            # Drop the busy window before showing the question.  Rebinding
            # (instead of "del") keeps the name defined for the handler
            # below if a later statement of this block fails.
            busy = None
            msg = '\n'.join([_(u"Corpus created :").decode('utf8'), corp_out, _(u"Do you want to open it in IRaMuTeQ ?").decode('utf8')])
            dlg = wx.MessageDialog(self.ira, msg, _(u'Information').decode('utf8'), wx.YES_NO | wx.ICON_INFORMATION | wx.STAY_ON_TOP)
            dlg.CenterOnParent()
            val = dlg.ShowModal()
            dlg.Destroy()
            if val == wx.ID_YES:
                self.ira.filename = os.path.abspath(corp_out)
                self.ira.OpenText()
        except:
            # Top-level GUI boundary: same catch-all + BugReport convention
            # as the other import handlers of the application.
            busy = None
            BugReport(self.ira)

#ParseDMI(filein, fileout, 'utf8')