X-Git-Url: http://iramuteq.org/git?a=blobdiff_plain;ds=sidebyside;f=corpus.py;h=080b980ff7bbe13c4ff0d665db751a2af92fbfd7;hb=80f4bfad30ece8835cb1f91349b1dda36439e4ca;hp=9b417885ac77f2f97bf6e2cdeb86f4b95211d99a;hpb=c038ef7892cf106654fcd0d35389584513b2ec1d;p=iramuteq
diff --git a/corpus.py b/corpus.py
old mode 100644
new mode 100755
index 9b41788..080b980
--- a/corpus.py
+++ b/corpus.py
@@ -36,7 +36,7 @@ def copycorpus(corpus) :
def CopyUce(uce) :
return Uce(uce.ident, uce.para, uce.uci)
-
+
def CopyUci(uci):
nuci = Uci(uci.ident, '')
@@ -44,7 +44,7 @@ def CopyUci(uci):
nuci.uces = [CopyUce(uce) for uce in uci.uces]
nuci.paras = copy(uci.paras)
return nuci
-
+
class Corpus :
@@ -259,6 +259,11 @@ class Corpus :
uces = [[val[0], '\n'.join([row[1] for row in self.getconcorde(val[1])])] for val in uces]
return uces
+ def getuciconcorde_uces(self, uciid, uceid) :
+ uces = [uce.ident for uce in self.ucis[uciid].uces]
+ uces = [row for row in self.getconcorde(uces)]
+ return uces
+
def getwordconcorde(self, word) :
return self.getconcorde(self.getworduces(word))
@@ -271,7 +276,7 @@ class Corpus :
def getallucis(self):
uces = [row[1] for row in self.getalluces()]
return [[uci.ident, '\n'.join([uces[uce.ident] for uce in uci.uces])] for uci in self.ucis]
-
+
def getucesfrometoile(self, etoile) :
return [uce.ident for uci in self.ucis for uce in uci.uces if etoile in uci.etoiles]
@@ -296,7 +301,7 @@ class Corpus :
else :
idpara += 1
return etoileuces
-
+
def getetoileucis(self):
etoileuces = {}
for uci in self.ucis :
@@ -343,7 +348,7 @@ class Corpus :
self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
else :
self.lems = dict([[forme, Lem(self, self.formes[forme])] for forme in self.formes])
-
+
def make_lems_from_dict(self, dictionnaire, dolem = True) :
log.info('make lems from dict')
self.lems = {}
@@ -367,7 +372,7 @@ class Corpus :
self.lems[self.formes[forme].lem] = Lem(self, self.formes[forme])
else :
self.lems[forme] = Lem(self, self.formes[forme])
-
+
def make_idformes(self) :
self.idformes = dict([[self.formes[forme].ident, self.formes[forme]] for forme in self.formes])
@@ -376,6 +381,7 @@ class Corpus :
self.iduces = dict([[uce.ident, uce] for uci in self.ucis for uce in uci.uces])
def make_lexitable(self, mineff, etoiles, gram = 0) :
+ log.info('making lexical table...')
if gram == 0 :
grams = {1:'', 2:''}
else :
@@ -398,7 +404,7 @@ class Corpus :
tab.append(line)
tab.insert(0, [''] + etoiles)
return tab
-
+
def make_tgen_table(self, tgen, etoiles, tot = None):
lclasses = [self.getucesfrometoile(etoile) for etoile in etoiles]
sets = [set(cl) for cl in lclasses]
@@ -600,7 +606,7 @@ class Corpus :
f.write(guce.encode(self.parametres['syscoding']) + '\n\n')
def export_classe(self, outf, classe, lem = False, uci = False) :
- sts = self.lc[classe - 1]
+ sts = self.lc[classe - 1]
if not uci :
res = self.getconcorde(sts)
self.make_iduces()
@@ -658,7 +664,7 @@ class Corpus :
nbl += 1
f.write(''.join([' '.join([`uce+1`, `i+1`,`1`]),'\n']))
f.seek(0)
- with open(outfile, 'w') as ffin :
+ with open(outfile, 'w') as ffin :
ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucenb(), len(actives), nbl))
for line in f :
ffin.write(line)
@@ -676,7 +682,7 @@ class Corpus :
nbl += 1
f.write(''.join([' '.join([`uci+1`, `i+1`,`1`]),'\n']))
f.seek(0)
- with open(outfile, 'w') as ffin :
+ with open(outfile, 'w') as ffin :
ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (self.getucinb(), len(actives), nbl))
for line in f :
ffin.write(line)
@@ -695,12 +701,12 @@ class Corpus :
for uce in uces_ok :
f.write(''.join([' '.join([`duces[uce]+1`,`i+1`,`1`]),'\n']))
f.seek(0)
- with open(outfile, 'w') as ffin :
+ with open(outfile, 'w') as ffin :
ffin.write("%%%%MatrixMarket matrix coordinate integer general\n%i %i %i\n" % (len(uces), len(actives), nbl))
for line in f :
ffin.write(line)
os.remove(outfile + '~')
-
+
def make_table_with_classe(self, uces, list_act, uci = False) :
table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
uces = dict([[uce, i] for i, uce in enumerate(uces)])
@@ -713,8 +719,8 @@ class Corpus :
for uce in lemuces :
table_uce[uces[uce]][i] = 1
table_uce.insert(0, list_act)
- return table_uce
-
+ return table_uce
+
def make_pondtable_with_classe(self, uces, list_act) :
table_uce = [[0 for val in list_act] for line in range(0,len(uces))]
uces = dict([[uce, i] for i, uce in enumerate(uces)])
@@ -724,7 +730,7 @@ class Corpus :
for uce in lemuces :
table_uce[uces[uce]][i] = uceseff[uce]
table_uce.insert(0, list_act)
- return table_uce
+ return table_uce
def parse_active(self, gramact, gramsup = None) :
log.info('parse actives')
@@ -784,6 +790,7 @@ class Corpus :
tab = [[line[0]] + [`val` for val in line[1:]] for line in tab if sum(line[1:]) >= 3]
with open(fileout, 'w') as f :
f.write('\n'.join([';'.join(line) for line in tab]).encode(self.parametres['syscoding']))
+ f.write('\n')
def make_etoiles(self) :
etoiles = set([])
@@ -857,8 +864,13 @@ class Corpus :
etoileuces = self.getetoileuces()
else :
etoileuces = self.getetoileucis()
- etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if len(etoileuces[et]) > 1])
+ print 'etoilesuces ok'
+ etoileuces = dict([[et, etoileuces[et]] for et in etoileuces if
+ len(etoileuces[et]) > 1 ]) #and not et.startswith(u'*reference_')
+ print len(etoileuces)
+ print 'etoilesuces ok2'
with open(fileout, 'w') as f :
+ print 'write...'
f.write('\n'.join([';'.join([et] + [`len(set(etoileuces[et]).intersection(classe))` for classe in ucecl]) for et in etoileuces]).encode(self.parametres['syscoding']))
#etoiles = self.make_etoiles()
#with open(fileout, 'w') as f :
@@ -900,6 +912,32 @@ class Corpus :
txt += '' % (color[ucecl[uce[0]]]) + uce[1] + '
'
return txt + '\n