from checkinstall import CreateIraDirectory, CheckRPath, FindRPAthWin32, FindRPathNix, CheckRPackages, IsNew, UpgradeConf, CopyConf, RLibsAreInstalled
from chemins import RscriptsPath, ConstructConfigPath, ConstructDicoPath, ConstructGlobalPath, PathOut
from parse_factiva_xml import ImportFactiva
+from parse_dmi import ImportDMI
from tools import Extract
from tree import LeftTree
ID_Fact_mail = wx.NewId()
ID_Fact_copy = wx.NewId()
ID_exportmeta = wx.NewId()
+ID_importdmi = wx.NewId()
##########################################################
#elements de configuration
##########################################################
'subcorpusthema' : 'subcorpusthema.png',
'preferences' : 'preferences.png',
'exportmetatable' : 'exportmetatable.png',
+ 'importdmi' : 'twitter.png'
}
#####################################################################
item.SetBitmap(self.images_analyses['europress'])
file_menu.AppendItem(item)
+ item = wx.MenuItem(file_menu, ID_importdmi, _(u"Import from DMI-TCAT (exp.)").decode('utf8'), _(u"Import from DMI-TCAT (exp.)").decode('utf8'))
+ item.SetBitmap(self.images_analyses['importdmi'])
+ file_menu.AppendItem(item)
+
menuFactiva = wx.Menu()
fact_from_xml = wx.MenuItem(menuFactiva, ID_Fact_xml, _(u"from xml").decode('utf8'))
fact_from_xml.SetBitmap(self.images_analyses['factiva_xml'])
tb1.AddSeparator()
tb1.AddLabelTool(ID_ImportEuro, "ImportEuro", self.images_analyses['europress'], shortHelp= _(u"Import from Europress").decode('utf8'), longHelp=_(u"Import from Europress").decode('utf8'))
tb1.AddSeparator()
+ tb1.AddLabelTool(ID_importdmi, "ImportDMI", self.images_analyses['importdmi'], shortHelp= _(u"Import from DMI-TCAT (exp.)").decode('utf8'), longHelp=_(u"Import from DMI-TCAT (exp.)").decode('utf8'))
+ tb1.AddSeparator()
tb1.AddLabelTool(ID_Fact_xml, "ImportFactxml", self.images_analyses['factiva_xml'], shortHelp= _(u"Factiva from xml").decode('utf8'), longHelp=_(u"Factiva from xml").decode('utf8'))
tb1.AddLabelTool(ID_Fact_mail, "ImportFactmail", self.images_analyses['factiva_mail'], shortHelp= _(u"Factiva from mail").decode('utf8'), longHelp=_(u"Factiva from mail").decode('utf8'))
tb1.AddLabelTool(ID_Fact_copy, "ImportFactcopy", self.images_analyses['factiva_copy'], shortHelp= _(u"Factiva from copy/paste").decode('utf8'), longHelp=_(u"Factiva from copy/paste").decode('utf8'))
self.Bind(wx.EVT_MENU, self.OnPref, id=wx.ID_PREFERENCES)
self.Bind(wx.EVT_MENU, self.OnImportTXM, id=ID_ImportTXM)
self.Bind(wx.EVT_MENU, self.OnImportEuropress, id=ID_ImportEuro)
+ self.Bind(wx.EVT_MENU, self.OnImportDMI, id=ID_importdmi)
self.Bind(wx.EVT_MENU, self.OnExportMeta, id=ID_exportmeta)
self.Bind(wx.EVT_CLOSE, self.OnClose)
##################################################################
except :
BugReport(self)
+ def OnImportDMI(self, evt):
+ ImportDMI(self, {})
+
def OnExportMeta(self, evt, corpus = None):
if corpus is None :
corpus = self.tree.getcorpus()
--- /dev/null
+#!/bin/env python
+# -*- coding: utf-8 -*-
+#Author: Pierre Ratinaud
+#Copyright (c) 2014, Pierre Ratinaud
+#License: GNU GPL
+
+import csv, codecs, cStringIO
+import itertools
+from parse_factiva_xml import PrefImport
+import wx
+import os
+from functions import BugReport
+
+#filein = '/home/pierre/workspace/iramuteq/dev/dmi-tcat/travail_dmi.csv'
+#fileout = '/home/pierre/workspace/iramuteq/dev/dmi-tcat/corpus.txt'
+
class UTF8Recoder:
    """
    Iterator that reads an encoded stream and re-encodes its content to UTF-8.

    f        -- binary file-like object to read from
    encoding -- name of the codec used to decode f

    Each item produced is one chunk returned by the codec's stream reader
    (a line of the input), encoded as a UTF-8 byte string.
    """

    def __init__(self, f, encoding):
        # Wrap the raw stream in the codec's StreamReader so that iterating
        # over it yields decoded text.
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # The builtin next() works whether the wrapped reader exposes
        # .next() or .__next__(), instead of hard-coding one method name.
        return next(self.reader).encode("utf-8")

    # Alias so that the object also satisfies the __next__ iterator protocol.
    __next__ = next
+
class UnicodeReader:
    """
    CSV reader over the file "f", whose content is in the given encoding.

    The stream is first recoded to UTF-8 through UTF8Recoder, parsed by
    csv.reader, and every cell of every row is handed back as unicode.
    Extra keyword arguments are forwarded to csv.reader.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        recoded = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # csv.reader hands back UTF-8 byte strings; decode each cell.
        cells = self.reader.next()
        decoded = []
        for cell in cells:
            decoded.append(unicode(cell, "utf-8"))
        return decoded
+
class UnicodeWriter:
    """
    CSV writer that sends rows to the file "f" in the given encoding.

    Rows are first formatted as UTF-8 CSV into an in-memory buffer, then
    re-encoded into the target encoding and written to the stream.
    Extra keyword arguments are forwarded to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
        # csv.writer output goes to this intermediate buffer, not to f.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)

    def writerow(self, row):
        utf8_cells = []
        for cell in row:
            utf8_cells.append(cell.encode("utf-8"))
        self.writer.writerow(utf8_cells)
        # Take the UTF-8 CSV text out of the buffer and decode it ...
        formatted = self.queue.getvalue().decode("utf-8")
        # ... then write it to the target stream in the target encoding.
        self.stream.write(self.encoder.encode(formatted))
        # Reset the buffer for the next row.
        self.queue.truncate(0)

    def writerows(self, rows):
        for single_row in rows:
            self.writerow(single_row)
+
class ParseDMI:
    """
    Convert a CSV export of tweets (DMI-TCAT) into a text corpus file.

    All the work is done by the constructor: the CSV file is read row by
    row, each tweet text is cleaned according to the flags, and the tweet
    is written to the output file as a '****' metadata line followed by
    the text line.
    """

    def __init__(self, filein, fileout, encodeout ='utf8', onlyrt = True, cleanurl = True, cleanRT = True, cleanAt = True):
        """
        filein    -- path of the CSV file; its first row is the header and
                     must contain the 'created_at' and 'text' columns
                     (ValueError otherwise)
        fileout   -- path of the corpus file to create
        encodeout -- encoding used when writing the corpus
        onlyrt    -- when true, tweets starting with 'RT ' are not written
        cleanurl  -- remove the words starting with 'http'
        cleanRT   -- remove 'RT' markers and the word that follows them
        cleanAt   -- remove the words starting with '@'
        """
        self.encodeout = encodeout
        self.outf = open(fileout, 'w')
        try:
            with open(filein, 'rb') as f:
                reader = UnicodeReader(f)
                for linenb, row in enumerate(reader):
                    if linenb == 0:
                        # Header row: locate the columns that are used.
                        create_dateid = row.index('created_at')
                        textid = row.index('text')
                        continue
                    if not row:
                        # csv yields an empty list for blank lines.
                        continue
                    text = self.washtweet(row[textid])
                    # Retweet detection must happen before the 'RT' marker
                    # is stripped by the cleaning steps below.
                    isrt = self.isRT(text)
                    if cleanurl:
                        text = self.cleanurl(text)
                    if cleanRT:
                        text = self.cleanRT(text)
                    if cleanAt:
                        text = self.cleanAt(text)
                    meta = self.makemetadata(row, {'date' : create_dateid})
                    # NOTE(review): despite its name, onlyrt=True *excludes*
                    # retweets; this is the existing behaviour, kept as is.
                    if not (onlyrt and isrt):
                        self.write_tweet(meta, text)
        finally:
            # Always release the output file, even when parsing fails, so
            # the corpus is flushed before anything else opens it.
            self.outf.close()

    def write_tweet(self, meta, text):
        """Write the metadata line and the text line of one tweet."""
        self.outf.write('\n'.join([meta, text, '']).encode(self.encodeout))

    def makemetadata(self, row, parametres = None):
        """
        Build the '****' metadata line of a tweet.

        parametres maps a variable name to the index of its column in row.
        The 'date' entry is written as '*date_<first token of the cell>';
        any other entry is written as '<name>_<cell>'.
        """
        if parametres is None:
            parametres = {}
        line = [u'****']
        for val, colid in parametres.items():
            if val == 'date':
                # Keep only the first whitespace-separated token of the cell.
                line.append('_'.join([u'*date', row[colid].split()[0]]))
            else:
                line.append('_'.join([val, row[colid]]))
        return ' '.join(line)

    def washtweet(self, text):
        """Normalise quotes, remove '*' and strip surrounding whitespace."""
        text = text.replace(u'RT“', u'RT ')
        text = text.replace(u'*', ' ')
        for val in u'”«»“"':
            text = text.replace(val, ' " ')
        # str.strip() returns a new string: the result has to be kept.
        return text.strip()

    def isRT(self, tweet):
        """Return True when the tweet starts with the 'RT ' marker."""
        return tweet.startswith('RT ')

    def cleanurl(self, tweet):
        """Remove every word starting with 'http'."""
        return ' '.join([word for word in tweet.split() if not word.startswith('http')])

    def cleanAt(self, tweet):
        """Remove every word starting with '@'."""
        return ' '.join([word for word in tweet.split() if not word.startswith('@')])

    def cleanRT(self, text):
        """
        Remove every 'RT' word together with the word that follows it.

        An 'RT' that is the last word of the text is left untouched.
        """
        tweet = text.split()
        last = len(tweet) - 1
        # A set is used on purpose: membership tests against an iterator
        # would consume it and miss every 'RT' that is not the first word.
        tovire = set()
        for i, word in enumerate(tweet):
            if word == 'RT' and i != last:
                tovire.update((i, i + 1))
        return ' '.join([word for i, word in enumerate(tweet) if i not in tovire])
+
class ImportDMI:
    """
    Interactive import of a DMI-TCAT CSV export.

    Instantiating the class runs the whole import: see parse().

    parent     -- parent window; used as parent of the dialogs and, on
                  request, to open the resulting corpus
    parametres -- stored on the instance; not used by this class itself
    """

    def __init__(self, parent, parametres):
        self.ira = parent
        self.parametres = parametres
        self.parse()

    def parse(self):
        """
        Ask for the input and output files, build the corpus with ParseDMI
        and offer to open it.

        Any error raised while parsing or opening the corpus is reported
        through BugReport.
        """
        self.dial = PrefImport(self.ira, methode='dmi')
        val = self.dial.ShowModal()
        if val != wx.ID_OK:
            self.dial.Destroy()
            return
        # Read the chosen paths before the dialog is destroyed.
        csvfile = self.dial.dbb.GetValue()
        corp_out = self.dial.fbb.GetValue()
        self.dial.Destroy()
        busy = wx.BusyInfo(_("Please wait...").decode('utf8'))
        wx.SafeYield()
        try:
            try:
                ParseDMI(csvfile, corp_out, 'utf8')
            finally:
                # Drop the busy indicator exactly once, before any dialog is
                # shown. Deleting it again in the error handler would raise
                # UnboundLocalError and hide the real error.
                del busy
            msg = '\n'.join([_(u"Corpus created :").decode('utf8'), corp_out, _(u"Do you want to open it in IRaMuTeQ ?").decode('utf8')])
            dlg = wx.MessageDialog(self.ira, msg, _(u'Information').decode('utf8'), wx.YES_NO | wx.ICON_INFORMATION | wx.STAY_ON_TOP)
            dlg.CenterOnParent()
            val = dlg.ShowModal()
            dlg.Destroy()
            if val == wx.ID_YES:
                self.ira.filename = os.path.abspath(corp_out)
                self.ira.OpenText()
        except Exception:
            BugReport(self.ira)
+
+#ParseDMI(filein, fileout, 'utf8')
\ No newline at end of file