import pickle

import nltk
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

nltk.download('punkt')
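
# Cue-word lists for the five ambiguity categories checked below. Each token
# of an input sentence is compared against these words by embedding cosine
# similarity.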
lexical_AMB = ['bound', 'break', 'content', 'call', 'continue', 'contract', 'count', 'direct', 'even', 'express', 'form', 'forward', 'function', 'job',
'level', 'name', 'notice', 'number', 'out', 'position', 'record', 'reference', 'subject', 'string', 'switch', 'throw', 'translate', 'try', 'under']
referential_AMB = ['everyone', 'everything', 'someone',
'something', 'anything', 'anyone', 'itself', 'yourself']
coordination_AMB = ['also', 'if then', 'unless', 'if and only if']
scope_AMB = ['all', 'any', 'few', 'little', 'many', 'much', 'several', 'some']
vague_AMB = ['good', 'better', 'worse', 'available', 'common', 'capability', 'easy', 'full', 'maximum',
'minimum', 'quickly', 'random', 'recently', 'sufficient', 'sufficiently', 'simple', 'useful', 'various']
stopwords_custom = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourselves',
'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'they',
'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
'to', 'from', 'up', 'down', 'in', 'on', 'off', 'over', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
'where', 'why', 'how', 'both', 'each', 'more', 'most', 'other', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
"hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
"needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
punctuation = ['.', ',', ';', '?']
# Precomputed embeddings of the cue words, one pickle per category. Each
# pickle maps a cue word to its sentence-transformer embedding tensor
# (see how the encoded dicts are indexed by cue word below).
def _load_encoded(path):
    with open(path, "rb") as f:
        return pickle.load(f)

lexical_encoded = _load_encoded("lexical_encoded.pickel")
vague_encoded = _load_encoded("vague_encoded.pickel")
referential_encoded = _load_encoded("referential_encoded.pickel")
coordination_encoded = _load_encoded("coordination_encoded.pickel")
scope_encoded = _load_encoded("scope_encoded.pickel")
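
# A minimal, hypothetical sketch for regenerating those pickles, assuming they
# hold exactly the word-to-embedding mapping described above. This helper is
# illustrative and not part of the original pipeline:
def rebuild_encoding_caches():
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    for name, words in [("lexical", lexical_AMB), ("vague", vague_AMB),
                        ("referential", referential_AMB),
                        ("coordination", coordination_AMB),
                        ("scope", scope_AMB)]:
        # One embedding tensor per cue word, keyed by the word itself.
        encoded = {w: model.encode(w, convert_to_tensor=True) for w in words}
        with open(name + "_encoded.pickel", "wb") as f:
            pickle.dump(encoded, f)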
class AmbiguityDetector:
    def __init__(self):
        # Embedding model used to compare sentence tokens against the cue words.
        self.model = SentenceTransformer(
            'sentence-transformers/all-MiniLM-L6-v2')
    def sentence_ambiguity(self, sentence):
        model = self.model
        # Lowercase every token, then drop stopwords and punctuation in a
        # single pass.
        tokens = [token.lower() for token in word_tokenize(sentence)]
        filtered_tokens = [token for token in tokens
                           if token not in stopwords_custom
                           and token not in punctuation]
        # Per-category similarity scores, keyed as "token+cue_word".
        ambiguity = {"lexical": {}, "referential": {}, "scope": {},
                     "vague": {}, "coordination": {}}
        ambiguous_words = []
        words_set = []  # one (token, category) pair per match

        # Compare each remaining token against every cue word; a cosine
        # similarity of at least 0.6 counts as a match for that category.
        categories = [("lexical", lexical_AMB, lexical_encoded),
                      ("scope", scope_AMB, scope_encoded),
                      ("referential", referential_AMB, referential_encoded),
                      ("vague", vague_AMB, vague_encoded),
                      ("coordination", coordination_AMB, coordination_encoded)]
        for token in filtered_tokens:
            token_emb = model.encode(token, convert_to_tensor=True)
            for category, cue_words, cue_embs in categories:
                for cue in cue_words:
                    cos_sim = float(util.pytorch_cos_sim(token_emb,
                                                         cue_embs[cue]))
                    if cos_sim >= 0.6:
                        ambiguous_words.append(token)
                        words_set.append((token, category))
                        ambiguity[category][token + "+" + cue] = cos_sim

        # Deduplicate matched tokens while preserving first-seen order.
        ambiguity["words"] = list(dict.fromkeys(ambiguous_words))
        # Despite the key name, this holds the (token, category) pairs for all
        # five categories; it is what the method returns.
        ambiguity["lexical_st"] = words_set
        return ambiguity["lexical_st"]
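

# Hypothetical usage sketch:
if __name__ == "__main__":
    detector = AmbiguityDetector()
    matches = detector.sentence_ambiguity(
        "The system should respond quickly to any user request.")
    # A list of (token, category) pairs, e.g. ("quickly", "vague").
    print(matches)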