+ profils = ReadProfileAsDico(pathout['PROFILE_OUT'], Alceste, 'utf8')
+ print(profils)
+
def read_chd(filein, fileout):
    """Convert a CHD (descending hierarchical classification) result file
    into a JSON tree of ``{'name': ..., 'size': ..., 'children': [...]}``
    nodes and write it to *fileout*.

    The input is tab-separated. Lines alternate: even lines describe a
    split (parent name, then the two child names) and the following odd
    line carries the two children's sizes — TODO confirm against the
    writer of this file format.
    """
    with open(filein, 'r') as f :
        content = f.read()
    #content = [line[3:].replace('"',"").replace(' ','') for line in content.splitlines()]
    content = [line.split('\t') for line in content.splitlines()]
    # root node; its two children come from the very first line
    chd = {'name':1, 'children':[]}
    # mere maps a node name -> its dict, so later splits can attach children
    mere={}
    for i, line in enumerate(content) :
        if i == 0 :
            # first split: children of the root, sizes taken from the next line
            chd['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
            mere[line[1]] = chd['children'][0]
            mere[line[2]] = chd['children'][1]
        elif not i % 2 :
            # even lines only: line[0] is the parent, line[1]/line[2] the children
            if 'children' in mere[line[0]]:
                mere[line[0]]['children'].append({'name': line[1],'size' : content[i+1][0]})
                mere[line[1]] = mere[line[0]]['children'][-1]
                mere[line[0]]['children'].append({'name': line[2],'size' : content[i+1][1]})
                mere[line[2]] = mere[line[0]]['children'][-1]
            else :
                mere[line[0]]['children'] = [{'name': line[1],'size' : content[i+1][0]}, {'name':line[2], 'size': content[i+1][1]}]
                mere[line[1]] = mere[line[0]]['children'][-2]
                mere[line[2]] = mere[line[0]]['children'][-1]
    # json is expected to be imported at module level
    with open(fileout, 'w') as f :
        f.write(json.dumps(chd))
+
+
# Mapping of display language name -> Google Translate language code,
# used to populate the translation UI and to build API requests.
# Fixes vs. the previous version: "Azerbaijani" was misspelled, "Hmong"
# carried a trailing space in its code, and Punjabi's code is "pa"
# (not "ma") per Google's language-code list.  "Hebrew":"iw" and
# "Javanese":"jw" are the legacy codes Google still accepts.
translation_languages = {
    "Afrikaans": "af", "Albanian": "sq", "Amharic": "am", "Arabic": "ar",
    "Armenian": "hy", "Azerbaijani": "az", "Basque": "eu", "Belarusian": "be",
    "Bengali": "bn", "Bosnian": "bs", "Bulgarian": "bg", "Catalan": "ca",
    "Cebuano": "ceb", "Chichewa": "ny", "Chinese (Simplified)": "zh-CN",
    "Chinese (Traditional)": "zh-TW", "Corsican": "co", "Croatian": "hr",
    "Czech": "cs", "Danish": "da", "Dutch": "nl", "English": "en",
    "Esperanto": "eo", "Estonian": "et", "Filipino": "tl", "Finnish": "fi",
    "French": "fr", "Frisian": "fy", "Galician": "gl", "Georgian": "ka",
    "German": "de", "Greek": "el", "Gujarati": "gu", "Haitian Creole": "ht",
    "Hausa": "ha", "Hawaiian": "haw", "Hebrew": "iw", "Hindi": "hi",
    "Hmong": "hmn", "Hungarian": "hu", "Icelandic": "is", "Igbo": "ig",
    "Indonesian": "id", "Irish": "ga", "Italian": "it", "Japanese": "ja",
    "Javanese": "jw", "Kannada": "kn", "Kazakh": "kk", "Khmer": "km",
    "Korean": "ko", "Kurdish": "ku", "Kyrgyz": "ky", "Lao": "lo",
    "Latin": "la", "Latvian": "lv", "Lithuanian": "lt", "Luxembourgish": "lb",
    "Macedonian": "mk", "Malagasy": "mg", "Malay": "ms", "Malayalam": "ml",
    "Maltese": "mt", "Maori": "mi", "Marathi": "mr", "Mongolian": "mn",
    "Burmese": "my", "Nepali": "ne", "Norwegian": "no", "Pashto": "ps",
    "Persian": "fa", "Polish": "pl", "Portuguese": "pt", "Punjabi": "pa",
    "Romanian": "ro", "Russian": "ru", "Samoan": "sm", "Scots Gaelic": "gd",
    "Serbian": "sr", "Sesotho": "st", "Shona": "sn", "Sindhi": "sd",
    "Sinhala": "si", "Slovak": "sk", "Slovenian": "sl", "Somali": "so",
    "Spanish": "es", "Sundanese": "su", "Swahili": "sw", "Swedish": "sv",
    "Tajik": "tg", "Tamil": "ta", "Telugu": "te", "Thai": "th",
    "Turkish": "tr", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz",
    "Vietnamese": "vi", "Welsh": "cy", "Xhosa": "xh", "Yiddish": "yi",
    "Yoruba": "yo", "Zulu": "zu",
}
+
+
def gettranslation(words, lf, lt) :
    """Translate a list of *words* from language *lf* to *lt* via the
    unofficial Google Translate web endpoint, one word per line.

    Returns the translated strings normalised to single tokens (spaces,
    quotes, dashes replaced by underscores, newlines stripped).
    Network I/O: raises urllib errors on failure.
    """
    import urllib.request, urllib.error, urllib.parse
    import json
    # Spoof a browser User-Agent.  The backslash continuations keep this a
    # single string literal; NOTE(review): any leading whitespace on the
    # continuation lines becomes part of the header value — harmless here.
    agent = {'User-Agent':
    "Mozilla/4.0 (\
compatible;\
MSIE 6.0;\
Windows NT 5.1;\
SV1;\
.NET CLR 1.1.4322;\
.NET CLR 2.0.50727;\
.NET CLR 3.0.04506.30\
)"}
    base_link = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=%s&tl=%s&dt=t&q=%s"
    print(len(words))
    # one word per line so the service translates them independently
    totrans = urllib.parse.quote('\n'.join(words))
    link = base_link % (lf, lt, totrans)
    request = urllib.request.Request(link, headers=agent)
    raw_data = urllib.request.urlopen(request).read()
    data = json.loads(raw_data)
    # data[0] is a list of [translated, original, ...] segments; normalise
    # each translated segment so it is usable as a single lemma downstream
    return [line[0].replace("'", '_').replace(' | ', '|').replace(' ', '_').replace('-','_').replace('\n','') for line in data[0]]
+
def makenprof(prof, trans, deb=0) :
    """Rebuild a profile with column 6 of each row replaced by the
    corresponding translation.

    :param prof: list of profile rows (row 0 is the header).
    :param trans: translated words for rows prof[deb+1:deb+1+len(trans)].
    :param deb: row offset; when 0 the header row is kept at the front.
    :return: a new list of rows; *prof* itself is left untouched.
    """
    translated = [prof[0]] if deb == 0 else []
    for offset, word in enumerate(trans):
        row = list(prof[deb + offset + 1])
        row[6] = word
        translated.append(row)
    return translated
+
def treatempty(val) :
    """Return '_' when *val* is empty or whitespace-only, else *val* unchanged."""
    return '_' if not val.strip() else val
+
def translateprofile(corpus, dictprofile, lf='it', lt='fr', maxword = 50) :
    """Translate the word column of every class profile from *lf* to *lt*.

    :param corpus: unused in this function — NOTE(review): kept for the
        caller's interface, verify before removing.
    :param dictprofile: dict mapping class number (as str, 1-based) to a
        list of profile rows.  Marker rows of all '*' separate sections:
        a '*****' row ends the active forms, a '*' row ends the
        supplementary forms — TODO confirm against the profile writer.
    :param maxword: translate at most this many words per section
        (keeps the HTTP request to the translation service small).
    :return: (translated profile dict, dict mapping translated word ->
        original word).
    """
    nprof = {}
    lems = {}
    for i in range(len(dictprofile)) :
        prof = dictprofile[repr(i+1)]
        # locate the section separators and derive the section lengths
        try :
            lenact = prof.index(['*****', '*', '*', '*', '*', '*', '', ''])
            lensup = -1
        except ValueError:
            try :
                lenact = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
                lensup = 0
            except ValueError:
                lenact = len(prof)
                lensup = 0
        try :
            lensup += prof.index(['*', '*', '*', '*', '*', '*', '', ''])
            lensup = lensup - lenact
        except ValueError:
            lensup += len(prof) - lenact
        if lenact != 0 :
            # cap the number of active forms sent to the translator
            if lenact > maxword :
                nlenact = maxword
            else :
                nlenact = lenact
            actori = [line[6] for line in prof[1:nlenact]]
            # underscores stand for spaces in lemmas; undo before translating
            act = [val.replace('_', ' ') for val in actori]
            act = gettranslation(act, lf, lt)
            # make duplicated translations unique by appending '+'
            for j, val in enumerate(actori) :
                if act[j] not in lems :
                    lems[act[j]] = val
                else :
                    while act[j] in lems :
                        act[j] = act[j] + "+"
                    lems[act[j]] = val
            nprof[repr(i+1)] = makenprof(prof, act)

        if lensup != 0 :
            # same treatment for the supplementary forms
            if lensup > maxword :
                nlensup = maxword
            else :
                nlensup = lensup
            supori = [line[6] for line in prof[(1+lenact):(lenact+nlensup)]]
            sup = [val.replace('_', ' ') for val in supori]
            sup = [treatempty(val) for val in sup]
            sup = gettranslation(sup, lf, lt)
            for j, val in enumerate(supori) :
                if sup[j] not in lems :
                    lems[sup[j]] = val
                else :
                    while sup[j] in lems :
                        sup[j] = sup[j] + "+"
                    lems[sup[j]] = val
            nprof[repr(i+1)].append(['*****', '*', '*', '*', '*', '*', '', ''])
            nprof[repr(i+1)] += makenprof(prof, sup, deb=lenact)

        try :
            # copy the trailing section (after the '*' marker) verbatim
            lenet = prof.index(['*', '*', '*', '*', '*', '*', '', ''])
            nprof[repr(i+1)].append(['*', '*', '*', '*', '*', '*', '', ''])
            nprof[repr(i+1)] += prof[(lenet+1):]
        except :
            pass
    return nprof, lems
+
def write_translation_profile(prof, lems, language, dictpathout) :
    """Write a translated profile and its word mapping to CSV files and
    register both file names in the translations list.

    :param prof: dict mapping class number (as str, 1-based) to a list of
        profile rows; row 0 is the header, each data row has at least 8
        columns with the (translated) word at index 6 and the p-value at 7.
    :param lems: dict mapping translated word -> original word.
    :param language: language label embedded in the output file names.
    :param dictpathout: dict mapping logical file names to real paths.
    """
    # load the existing translations registry, if any
    if os.path.exists(dictpathout['translations.txt']) :
        with open(dictpathout['translations.txt'], 'r', encoding='utf8') as f :
            translist = f.read()
        translist = [line.split('\t') for line in translist.splitlines()]
    else :
        translist = []
    toprint = []
    toprint.append(['','','','','',''])
    toprint.append(['***', 'nb classes', repr(len(prof)), '***', '', ''])
    for i in range(len(prof)) :
        toprint.append(['**', 'classe', repr(i+1), '**', '', ''])
        toprint.append(['****'] + prof[repr(i+1)][0] + ['****'])
        # normalise the p-value column: '< 0,0001' -> numeric, strip 'NS (...)'
        rest = [[repr(line[1]), repr(line[2]), repr(line[3]), repr(line[4]), line[6], line[7].replace('< 0,0001', '0.00009').replace('NS (','').replace(')','')] for line in prof[repr(i+1)][1:]]
        # fix: use a distinct index 'j' here — the original reused 'i' and
        # shadowed the outer loop variable
        for j, line in enumerate(prof[repr(i+1)][1:]) :
            if line[0] == '*' :
                rest[j] = ['*', '*', '*', '*', '*', '*']
            elif line[0] == '*****' :
                rest[j] = ['*****','*','*', '*', '*', '*']
        toprint += rest
    with open(dictpathout['translation_profile_%s.csv' % language], 'w', encoding='utf8') as f :
        f.write('\n'.join([';'.join(line) for line in toprint]))
    with open(dictpathout['translation_words_%s.csv' % language], 'w', encoding='utf8') as f :
        f.write('\n'.join(['\t'.join([val, lems[val]]) for val in lems]))
    # register the pair of files once
    if 'translation_profile_%s.csv' % language not in [val[0] for val in translist] :
        translist.append(['translation_profile_%s.csv' % language, 'translation_words_%s.csv' % language])
    with open(dictpathout['translations.txt'], 'w', encoding='utf8') as f :
        f.write('\n'.join(['\t'.join(line) for line in translist]))
+
def makesentidict(infile, language, outfile='/tmp/tgenemo.csv') :
    """Build a sentiment lexicon from an NRC-style tab-separated file and
    write it as one tab-joined row per emotion category.

    :param infile: UTF-8 file whose header names the ten sentiment columns
        and a translated-word column containing '(fr)'.
    :param language: currently unused — NOTE(review): kept for interface
        compatibility, verify with callers before removing.
    :param outfile: destination path; defaults to the previously hard-coded
        '/tmp/tgenemo.csv' for backward compatibility.
    """
    with codecs.open(infile,'r', 'utf8') as f :
        content = f.read()
    content = [line.split('\t') for line in content.splitlines()]
    titles = content.pop(0)
    senti = ['Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']
    # column index of each sentiment in the input header
    sentid = {sent: titles.index(sent) for sent in senti}
    # locate the column holding the French form, e.g. "Word (fr)"
    frtitle = [val for val in titles if '(fr)' in val]
    frid = titles.index(frtitle[0])
    # [lower-cased word, ['0'/'1' flag per sentiment]]
    sentidict = [[line[frid].lower(), [line[sentid[sent]] for sent in senti]] for line in content]
    with open(outfile, 'w') as f :
        # one row per emotion: its lower-cased name, then every flagged word
        for k, name in enumerate(sent.lower() for sent in senti) :
            row = [name] + [line[0] for line in sentidict if line[1][k] == '1']
            f.write('\t'.join(row) + '\n')
+
def countsentfromprof(prof, encoding, sentidict) :
    """Read a per-category sentiment count file and return it as a dict.

    The file has one ';'-separated line per category: a label followed by
    integer counts.  Fixes vs. the previous version: the leftover debug
    prints are removed and the computed dict is returned instead of being
    discarded.

    :param prof: path of the counts file.
    :param encoding: text encoding of the file.
    :param sentidict: unused here — NOTE(review): verify with callers
        before removing.
    :return: dict mapping label -> list of int counts.
    """
    with codecs.open(prof, 'r', encoding) as f :
        content = f.read()
    rows = [line.split(';') for line in content.splitlines()]
    return {row[0]: [int(val) for val in row[1:]] for row in rows}
+
+def iratolexico(infile, outfile, encoding) :
+ with codecs.open(infile, 'r', encoding) as f :
+ for line in f :
+ if line.startswith('**** ') :
+ line = line.split()
+