# class UDlexPT - the PortiLexicon-UD
#   it reads the dictionary files from the directory that holds this module
#   (or from an explicit directory passed to the constructor)
#   - the directory should contain WORDmaster.txt plus the 12 tags .tsv files
#
# member functions:
#   UDlexPT - the constructor
#   sget(self, word):
#       get the entries for a word - returns a list of 3-element lists
#       (empty if absent)
#   exists(self, word):
#       returns True if the word exists
#   pget(self, word, tag):
#       get the entries of a word for a specific tag - return similar to sget
#   pexists(self, word, tag):
#       returns True if this word has at least one entry for tag
#   theTags(self, word):
#       returns an array of all tags of a word - empty if absent of the lexicon

from os import path


class UDlexPT:
    """In-memory lexicon loaded from WORDmaster.txt and one .tsv file per tag.

    Attributes:
        tags: the 12 tag names handled by the lexicon, in a fixed order.
        master: dict mapping a word to the list of tags WORDmaster.txt gives it.
        t: list parallel to ``tags``; ``t[i]`` maps a word to a list of
            ``[column2, column3]`` pairs read from ``<tags[i]>.tsv``.
        words: number of (non-blank) lines read from WORDmaster.txt.
        entries: number of (non-blank) lines read from all .tsv files.
    """

    def __init__(self, directory=None, *, encoding=None, verbose=True):
        """Load the lexicon files and optionally print a summary table.

        Args:
            directory: folder holding WORDmaster.txt and the .tsv files.
                Defaults to the directory of this module.
            encoding: text encoding passed to ``open``. ``None`` keeps the
                platform default, as before.
                NOTE(review): the lexicon is Portuguese, so an explicit
                encoding is probably wanted - confirm the files' encoding.
            verbose: when True (default) print the load report to stdout.

        Raises:
            OSError: if one of the expected files cannot be opened.
            ValueError: if WORDmaster.txt names a tag not in ``self.tags``.
            IndexError: if a non-blank line has fewer fields than expected.
        """
        if directory is None:
            directory = path.dirname(__file__)

        self.tags = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
                     "NOUN", "NUM", "PRON", "SCONJ", "VERB"]
        self.master = {}
        self.words = 0
        self.entries = 0

        tag_count = len(self.tags)
        words_per_tag = [0] * tag_count        # words carrying the tag
        single_tag_words = [0] * tag_count     # words carrying only this tag
        entries_per_tag = [0] * tag_count      # lines in the tag's .tsv file

        # path.join (not "+ '/'") so an empty dirname stays relative instead
        # of pointing at the filesystem root.
        master_file = path.join(directory, "WORDmaster.txt")
        with open(master_file, encoding=encoding) as infile:
            for line in infile:
                # Strip only the newline: slicing off the last character
                # would eat data when the final line is unterminated.
                line = line.rstrip("\n")
                if not line:
                    continue
                buf = line.split(",")
                word_tags = buf[1].split(" ")
                self.master[buf[0]] = word_tags
                self.words += 1
                if len(word_tags) == 1:
                    single_tag_words[self.tags.index(word_tags[0])] += 1
                for tag in word_tags:
                    words_per_tag[self.tags.index(tag)] += 1

        self.t = []
        for i, tag in enumerate(self.tags):
            table = {}
            tag_file = path.join(directory, tag + ".tsv")
            with open(tag_file, encoding=encoding) as infile:
                for line in infile:
                    line = line.rstrip("\n")
                    if not line:
                        continue
                    buf = line.split("\t")
                    table.setdefault(buf[0], []).append([buf[1], buf[2]])
                    self.entries += 1
                    entries_per_tag[i] += 1
            self.t.append(table)

        if verbose:
            self._print_summary(words_per_tag, single_tag_words,
                                entries_per_tag)

    def _print_summary(self, words_per_tag, single_tag_words, entries_per_tag):
        """Print the load report: one line of totals plus LaTeX table rows."""
        print("UDlexPT read with", self.words, "distinct words and",
              self.entries, "entries")
        # NOTE(review): this header row has four columns while the data rows
        # below carry a fifth one (entries per tag) - kept as is.
        print("{:5} & {:6} & {:6} & {:6} \\\\ \\hline".format(
            "tag", "total", "amb", "non-amb"))
        row = "{:5} & {:6} & {:6} & {:6} & {:6} \\\\ \\hline"
        for i, tag in enumerate(self.tags):
            print(row.format(tag,
                             words_per_tag[i],
                             words_per_tag[i] - single_tag_words[i],
                             single_tag_words[i],
                             entries_per_tag[i]))
        total_single = sum(single_tag_words)
        print(row.format("total", self.words, self.words - total_single,
                         total_single, sum(entries_per_tag)))

    def _table_for(self, tag):
        """Return the word table of ``tag``; an empty dict for unknown tags."""
        try:
            return self.t[self.tags.index(tag)]
        except ValueError:
            return {}

    def sget(self, word):
        """Get the entries for a word.

        Returns a list of ``[column2, tag, column3]`` lists, one per .tsv
        line of the word, following the tag order recorded in WORDmaster.txt
        (presumably lemma, tag, features - confirm against the data files).
        The list is empty when the word is absent. A tag listed in
        WORDmaster.txt without a matching .tsv line is skipped rather than
        raising.
        """
        ans = []
        for tag in self.master.get(word, []):
            for first, second in self._table_for(tag).get(word, []):
                ans.append([first, tag, second])
        return ans

    def exists(self, word):
        """Return True if the word is listed in WORDmaster.txt."""
        return word in self.master

    def pget(self, word, tag):
        """Get the entries of a word for a specific tag.

        Same entry layout as ``sget``. Returns an empty list when the word
        has no entry for ``tag``, including when ``tag`` is not one of the
        lexicon tags.
        """
        return [[first, tag, second]
                for first, second in self._table_for(tag).get(word, [])]

    def pexists(self, word, tag):
        """Return True if this word has at least one entry for tag.

        Tags outside ``self.tags`` simply yield False.
        """
        return word in self._table_for(tag)

    def theTags(self, word):
        """Return the list of all tags of a word - empty if absent.

        The list returned for a known word is the one stored in
        ``self.master`` (not a copy), so callers should not mutate it.
        """
        return self.master.get(word, [])