from collections import defaultdict
from itertools import product
from typing import Dict, List, Set

import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


# Hand-written lemma overrides: contractions, irregular forms, spelling
# variants, and multiword triggers that the WordNet lemmatizer misses.
manual = {
    "'s": 'be',
    "'re": 'be',
    "'ve": 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}


def load_framenet_corpus(version):
    """Download (if needed) and return the matching NLTK FrameNet reader."""
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    elif '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    else:
        raise NotImplementedError(f'Unsupported FrameNet version: {version}')


def is_word(s: str) -> bool:
    # A "word" here is letters plus spaces, hyphens, and apostrophes only.
    return all(c.isalpha() or c in " -'" for c in s)
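# Illustrative behaviour (examples added here, not in the original source):
#   is_word("mother-in-law's") -> True   (letters, hyphen, apostrophe)
#   is_word('3rd')             -> False  (contains a digit)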


def lu_to_frame(version: str) -> Dict[str, Set[str]]:
    """Map each lexical-unit lemma (POS tag dropped) to the names of the
    frames it evokes."""
    fn = load_framenet_corpus(version)
    # The reader accumulates 'Problem'-status lexical units in _bad_statuses
    # and complains about them; clear it before and after iterating.
    fn._bad_statuses = []
    map_no_pos = defaultdict(set)
    for frame in fn.frames():
        for lu in frame.lexUnit:
            # Lexical-unit names look like 'run.v': a lemma, one dot, a POS tag.
            assert lu.count('.') == 1
            lexicon, _pos = lu.split('.')
            lexicon = lexicon.lower()
            # Keep only the word-like parts of multiword lemmas.
            lexicon = ' '.join(filter(is_word, lexicon.split()))
            if lexicon == '':
                continue
            map_no_pos[lexicon].add(frame.name)
    fn._bad_statuses = []
    return map_no_pos
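# Illustrative sketch of the resulting mapping; the exact frame sets depend
# on the FrameNet release (the names below are real frames, listed partially):
#   lu_to_frame('1.7')['run'] -> {'Self_motion', 'Fluidic_motion', ...}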


class FrameIdentifier:
    def __init__(self):
        lf_map = lu_to_frame('1.7')
        # Multiword existential triggers FrameNet has no lexical units for.
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)

    def __call__(self, tokens: List[str]) -> List[str]:
        # Numbers and '-th' ordinals (e.g. '4th'; '1st'/'2nd'/'3rd' are not
        # matched) are handled before any lexicon lookup.
        if len(tokens) == 1 and tokens[0].isnumeric():
            return ['Cardinal_numbers']
        if len(tokens) == 1 and tokens[0].endswith('th') and tokens[0][:-2].isnumeric():
            return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        frames = []

        if not all(is_word(t) for t in tokens):
            return []

        # Expand each token into a candidate set: the surface form, its WordNet
        # lemma under every POS tag, and any manual overrides. Appending while
        # iterating lets overrides chain, e.g. 've' -> 'have' -> 'have to'.
        candidates: List[List[str]] = []
        for token in tokens:
            t2s = [token]
            for _pos in 'asrnv':
                t2s.append(lemmatizer.lemmatize(token, _pos))
            for t_ in t2s:
                if t_ in manual:
                    t2s.append(manual[t_])
            candidates.append(list(set(t2s)))

        # Unigram lookup: any candidate form that names a lexical unit.
        for t2s in candidates:
            for t in t2s:
                if t in self.lf_map:
                    frames.extend(self.lf_map[t])
        # Bigram lookup: every pairing of adjacent tokens' candidate forms.
        for t1, t2 in zip(candidates, candidates[1:]):
            for ts in product(t1, t2):
                t = ' '.join(ts)
                if t in self.lf_map:
                    frames.extend(self.lf_map[t])

        return list(set(frames))
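

# Minimal usage sketch (the tokens below are hypothetical examples, not from
# the original source). The first call downloads FrameNet 1.7 through nltk,
# which can take a while.
if __name__ == '__main__':
    fid = FrameIdentifier()
    print(fid(['felt']))  # frames evoked by 'feel', via the override table
    print(fid(['42']))    # ['Cardinal_numbers']
    print(fid(['4th']))   # ['Ordinal_numbers']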