# sociolome/tools/framenet/naive_identifier.py
# Author: Gosse Minnema
# Initial commit (05922fb)
from collections import defaultdict
from itertools import product
from typing import *
import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer
# Single shared WordNet lemmatizer instance; FrameIdentifier calls
# lemmatize() once per POS tag for every token it processes.
lemmatizer = WordNetLemmatizer()

# Manual token -> candidate-lemma overrides applied on top of WordNet
# lemmatization when looking tokens up in the FrameNet lexical-unit map:
# contractions ('s, 're, 've), irregular forms (men, saw, felt, fell),
# modal fragments (could, 'ca' from "can't"), spelling variants, and
# multiword LU heads (little -> 'a little', bit -> 'a bit',
# have -> 'have to'). These are extra lookup candidates, not replacements.
# NOTE(review): 'flavor' -> 'flavour' maps American to British spelling,
# the opposite direction of 'neighbour' -> 'neighbor' — presumably each
# target matches the spelling FrameNet's LU list actually uses; verify.
manual = {
    '\'s': 'be',
    '\'re': 'be',
    '\'ve': 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}
def load_framenet_corpus(version):
    """Download (if needed) and return the NLTK FrameNet corpus reader.

    Args:
        version: version string; any string containing '1.5' or '1.7'
            (e.g. 'fn1.7') selects that FrameNet release.

    Returns:
        The NLTK corpus reader for the requested FrameNet version.

    Raises:
        NotImplementedError: if `version` names neither 1.5 nor 1.7.
    """
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    elif '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    else:
        # Previously raised a bare NotImplementedError; include the offending
        # value so the caller can see what went wrong.
        raise NotImplementedError(
            f'Unsupported FrameNet version: {version!r} (expected 1.5 or 1.7)'
        )
def is_word(s: str):
    """Return True iff every character of `s` is alphabetic or one of
    space, hyphen, or apostrophe (vacuously True for the empty string)."""
    extra_allowed = " -'"
    return all(ch.isalpha() or ch in extra_allowed for ch in s)
def lu_to_frame(version: str):
    """Build a map from lexical-unit lemma (POS tag stripped) to the set of
    FrameNet frame names that list that lexical unit.

    Args:
        version: FrameNet version string passed to `load_framenet_corpus`.

    Returns:
        defaultdict(set) mapping a lowercased, cleaned LU lemma (e.g. 'run',
        'give up') to the set of frame names containing it.

    Raises:
        ValueError: if a lexical-unit name does not have exactly one '.'.
    """
    fn = load_framenet_corpus(version)
    # NOTE(review): clearing the private _bad_statuses attribute appears to
    # suppress NLTK's complaints about LUs with problematic status; this
    # relies on NLTK internals — confirm against the installed NLTK version.
    fn._bad_statuses = []
    map_no_pos = defaultdict(set)
    for frame in fn.frames():
        for lu in frame.lexUnit:
            # LU names look like 'run.v' / 'give up.v': lemma, dot, POS tag.
            # Was an `assert` (stripped under -O); raise explicitly instead.
            if lu.count('.') != 1:
                raise ValueError(f'Unexpected lexical unit name: {lu!r}')
            lexicon, _pos = lu.split('.')
            lexicon = lexicon.lower()
            # Keep only word-like parts (letters, spaces, hyphens,
            # apostrophes); drops parts containing digits or punctuation.
            lexicon = ' '.join(filter(is_word, lexicon.split()))
            if lexicon == '':
                continue
            map_no_pos[lexicon].add(frame.name)
    fn._bad_statuses = []
    return map_no_pos
class FrameIdentifier:
    """Naive FrameNet frame identifier.

    Looks up each (lemmatized) token, and each token bigram, in a lexical
    unit -> frames map built from FrameNet 1.7, and returns the union of
    all frames found.
    """

    # Two-character English ordinal suffixes; the numeric part is whatever
    # precedes the last two characters ('21st' -> '21').
    _ORDINAL_SUFFIXES = ('st', 'nd', 'rd', 'th')

    def __init__(self):
        # Build the LU->frames map once; patch in a few existential
        # multiword entries that the FrameNet LU list does not cover.
        lf_map = lu_to_frame('1.7')
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)

    @staticmethod
    def _expand(token: str) -> List[str]:
        """Candidate lookup forms for one token: the token itself, its
        WordNet lemma for every POS tag, plus `manual` overrides.

        The `manual` lookup deliberately iterates the list while appending
        to it, so an override of an appended lemma would also be picked up
        (cascading lookup) — preserved from the original implementation.
        """
        cands = [token]
        for pos_tag in 'asrnv':
            cands.append(lemmatizer.lemmatize(token, pos_tag))
        for cand in cands:
            if cand in manual:
                cands.append(manual[cand])
        return list(set(cands))

    def __call__(self, tokens: List[str]) -> List[str]:
        """Return the (deduplicated, unordered) list of frame names evoked
        by `tokens`; [] if any token contains non-word characters.
        """
        if len(tokens) == 1:
            tok = tokens[0]
            if tok.isnumeric():
                return ['Cardinal_numbers']
            # BUGFIX: originally only the 'th' suffix was recognized, so
            # '1st', '2nd', '3rd', '21st', ... were missed.
            if tok.endswith(self._ORDINAL_SUFFIXES) and tok[:-2].isnumeric():
                return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        if not all(is_word(t) for t in tokens):
            return []
        candidates = [self._expand(t) for t in tokens]
        frames: List[str] = []
        # Unigram lookups.
        for cand_set in candidates:
            for cand in cand_set:
                frames.extend(self.lf_map.get(cand, ()))
        # Bigram lookups over all candidate combinations of adjacent tokens.
        for left, right in zip(candidates, candidates[1:]):
            for pair in product(left, right):
                frames.extend(self.lf_map.get(' '.join(pair), ()))
        return list(set(frames))