hasanriaz121 committed
Commit 98eb826
1 Parent(s): 14bb62b

ambiguity detection added

.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+
+
+ def update_array(array_a, array_b):
+     # Create a dictionary to store the mappings from the first value to the second value
+     mapping = {}
+
+     # Populate the dictionary using the tuples from array A
+     for a, b in array_a:
+         mapping[a] = b
+
+     # Iterate through the tuples in array B
+     for i, (a, b) in enumerate(array_b):
+         if b is None and a in mapping:
+             # Replace the tuple in array B with the value from array A
+             array_b[i] = (a, mapping[a])
+
+ def amb_texts(text):
+     tokens = re.split(r'(\s+)', text)
+     # tokens = [token for token in tokens if token.strip() != '']
+     ambs=a.sentence_ambiguity(text)
+     res=list()
+     for i in tokens:
+         res.append((i,None))
+     update_array(ambs,res)
+     # print(tokens,text)
+     return res
+ def greet(name):
+     return "Hello " + name + "!!"
+
+ iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+ iface.launch()
.ipynb_checkpoints/detector-checkpoint.py ADDED
@@ -0,0 +1,99 @@
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from sentence_transformers import SentenceTransformer, util
+ import numpy
+ # from nltk.stem import WordNetLemmatizer
+ import pickle
+ import re
+
+ nltk.download('punkt')
+
+
+ class AmbguityDetector:
+
+     def __init__(self):
+         self.model = SentenceTransformer(
+             'sentence-transformers/all-MiniLM-L6-v2')
+
+     def sentence_ambiguity(self, sentence):
+
+         model = self.model
+         tokens = word_tokenize(sentence)
+         filtered_tokens = list()
+         for token in tokens:
+             if token not in stopwords_custom:
+                 filtered_tokens.append(token)
+
+         for i in filtered_tokens:
+             filtered_tokens[filtered_tokens.index(i)] = i.lower()
+             if i in punctuation:
+                 filtered_tokens.remove(i)
+
+         lexical = dict()
+         scope = dict()
+         referential = dict()
+         vague = dict()
+         coordination = dict()
+         ambiguity = dict()
+         ambiguous_words = list()
+         words_set=list()
+
+         for i in filtered_tokens:
+             temp = model.encode(i, convert_to_tensor=True)
+             for j in lexical_AMB:
+                 temp2 = lexical_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"lexical"))
+                     lexical[i+"+"+j] = cos_sim[0]
+
+             for j in scope_AMB:
+                 temp2 = scope_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"scope"))
+                     scope[i+"+"+j] = cos_sim[0]
+
+             for j in referential_AMB:
+                 temp2 = referential_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"referential"))
+                     referential[i+"+"+j] = cos_sim[0]
+
+             for j in vague_AMB:
+                 temp2 = vague_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"vague"))
+                     vague[i+"+"+j] = cos_sim[0]
+
+             for j in coordination_AMB:
+                 temp2 = coordination_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"coordination"))
+                     coordination[i+"+"+j] = cos_sim[0]
+
+         ambiguous_words = list(dict.fromkeys(ambiguous_words))
+         ambiguity["lexical"] = lexical
+         ambiguity["referential"] = referential
+         ambiguity["scope"] = scope
+         ambiguity["vague"] = vague
+         ambiguity["coordination"] = coordination
+         ambiguity["words"] = ambiguous_words
+         ambiguity["lexical_st"]=words_set
+
+         # print(filtered_tokens)
+         # print(ambiguity)
+         return ambiguity["lexical_st"]
__pycache__/detector.cpython-39.pyc ADDED
Binary file (4.38 kB).
 
app.py CHANGED
@@ -1,7 +1,48 @@
  import gradio as gr
+ import re
+ from detector import AmbguityDetector
 
+ a=AmbguityDetector()
+
+ def update_array(array_a, array_b):
+     # Create a dictionary to store the mappings from the first value to the second value
+     mapping = {}
+
+     # Populate the dictionary using the tuples from array A
+     for a, b in array_a:
+         mapping[a] = b
+
+     # Iterate through the tuples in array B
+     for i, (a, b) in enumerate(array_b):
+         if b is None and a in mapping:
+             # Replace the tuple in array B with the value from array A
+             array_b[i] = (a, mapping[a])
+
+ def amb_texts(text):
+     tokens = re.split(r'(\s+)', text)
+     # tokens = [token for token in tokens if token.strip() != '']
+     ambs=a.sentence_ambiguity(text)
+     res=list()
+     for i in tokens:
+         res.append((i,None))
+     update_array(ambs,res)
+     # print(tokens,text)
+     return res
  def greet(name):
      return "Hello " + name + "!!"
 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+ iface = gr.Interface(fn=amb_texts, inputs= [
+     gr.Textbox(
+         label="Input",
+         info="Find ambiguities in the following",
+         lines=3,
+         value="The test can only continue if it receives all inputs from previous page.",
+     ),
+     ], outputs= gr.HighlightedText(
+         label="Ambiguity Detection",
+         combine_adjacent=True,
+         show_legend=True,
+         color_map={"lexical": "blue","scope":"yellow","referential":"orange","coordination":"pink","vague":"red"}),
+     theme=gr.themes.Base())
+
  iface.launch()
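
For context, update_array mutates the token list in place: each label reported by the detector is copied onto the matching word, while whitespace tokens keep a None label, which is the pair format gr.HighlightedText renders. A small illustrative call (hypothetical token values, not output captured from the app):

ambs = [("continue", "lexical"), ("all", "scope")]    # shape of sentence_ambiguity output
res = [("The", None), (" ", None), ("continue", None), (" ", None), ("all", None)]
update_array(ambs, res)
# res -> [("The", None), (" ", None), ("continue", "lexical"), (" ", None), ("all", "scope")]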
coordination_encoded.pickel ADDED
Binary file (7.45 kB).
 
detector.py ADDED
@@ -0,0 +1,130 @@
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from sentence_transformers import SentenceTransformer, util
+ import numpy
+ # from nltk.stem import WordNetLemmatizer
+ import pickle
+ import re
+
+ nltk.download('punkt')
+
+ lexical_AMB = ['bound', 'break', 'content', 'call', 'continue', 'contract', 'count', 'direct', 'even', 'express', 'form', 'forward', 'function', 'job',
+                'level', 'name', 'notice', 'number', 'out', 'position', 'record', 'reference', 'subject', 'string', 'switch', 'throw', 'translate', 'try', 'under']
+ referential_AMB = ['everyone', 'everything', 'someone',
+                    'something', 'anything', 'anyone', 'itself', 'yourself']
+ coordination_AMB = ['also', 'if then', 'unless', 'if and only if']
+ scope_AMB = ['all', 'any', 'few', 'little', 'many', 'much', 'several', 'some']
+ vague_AMB = ['good', 'better', 'worse', 'available', 'common', 'capability', 'easy', 'full', 'maximum',
+              'minimum', 'quickly', 'random', 'recently', 'sufficient', 'sufficiently', 'simple', 'useful', 'various']
+ stopwords_custom = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourselves',
+                     'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'they',
+                     'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
+                     'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
+                     'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
+                     'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
+                     'to', 'from', 'up', 'down', 'in', 'on', 'off', 'over', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
+                     'where', 'why', 'how', 'both', 'each', 'more', 'most', 'other', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
+                     'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
+                     'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
+                     "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
+                     "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
+
+ punctuation = ['.', ',', ';', '?']
+
+ # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+ lexical_encoded = pickle.load(open("lexical_encoded.pickel", "rb"))
+ vague_encoded = pickle.load(open("vague_encoded.pickel", "rb"))
+ referential_encoded = pickle.load(open("referential_encoded.pickel", "rb"))
+ coordination_encoded = pickle.load(open("coordination_encoded.pickel", "rb"))
+ scope_encoded = pickle.load(open("scope_encoded.pickel", "rb"))
+
+ class AmbguityDetector:
+
+     def __init__(self):
+         self.model = SentenceTransformer(
+             'sentence-transformers/all-MiniLM-L6-v2')
+
+     def sentence_ambiguity(self, sentence):
+
+         model = self.model
+         tokens = word_tokenize(sentence)
+         filtered_tokens = list()
+         for token in tokens:
+             if token not in stopwords_custom:
+                 filtered_tokens.append(token)
+
+         for i in filtered_tokens:
+             filtered_tokens[filtered_tokens.index(i)] = i.lower()
+             if i in punctuation:
+                 filtered_tokens.remove(i)
+
+         lexical = dict()
+         scope = dict()
+         referential = dict()
+         vague = dict()
+         coordination = dict()
+         ambiguity = dict()
+         ambiguous_words = list()
+         words_set=list()
+
+         for i in filtered_tokens:
+             temp = model.encode(i, convert_to_tensor=True)
+             for j in lexical_AMB:
+                 temp2 = lexical_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"lexical"))
+                     lexical[i+"+"+j] = cos_sim[0]
+
+             for j in scope_AMB:
+                 temp2 = scope_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"scope"))
+                     scope[i+"+"+j] = cos_sim[0]
+
+             for j in referential_AMB:
+                 temp2 = referential_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"referential"))
+                     referential[i+"+"+j] = cos_sim[0]
+
+             for j in vague_AMB:
+                 temp2 = vague_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"vague"))
+                     vague[i+"+"+j] = cos_sim[0]
+
+             for j in coordination_AMB:
+                 temp2 = coordination_encoded[j]
+                 cos_sim = util.pytorch_cos_sim(
+                     temp, temp2).numpy().reshape([1, ])
+                 if(cos_sim[0] >= 0.6):
+                     ambiguous_words.append(i)
+                     words_set.append((i,"coordination"))
+                     coordination[i+"+"+j] = cos_sim[0]
+
+         ambiguous_words = list(dict.fromkeys(ambiguous_words))
+         ambiguity["lexical"] = lexical
+         ambiguity["referential"] = referential
+         ambiguity["scope"] = scope
+         ambiguity["vague"] = vague
+         ambiguity["coordination"] = coordination
+         ambiguity["words"] = ambiguous_words
+         ambiguity["lexical_st"]=words_set
+
+         # print(filtered_tokens)
+         # print(ambiguity)
+         return ambiguity["lexical_st"]
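
A quick way to exercise the detector outside the Gradio app (illustrative snippet, not part of this commit, assuming the five .pickel files sit next to detector.py):

from detector import AmbguityDetector

det = AmbguityDetector()
# sentence_ambiguity returns a list of (token, ambiguity_class) pairs,
# e.g. something like [('continue', 'lexical'), ('all', 'scope'), ...]
pairs = det.sentence_ambiguity("The test can only continue if it receives all inputs.")
print(pairs)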
lexical_encoded.pickel ADDED
Binary file (53.2 kB).
 
referential_encoded.pickel ADDED
Binary file (14.8 kB).
 
scope_encoded.pickel ADDED
Binary file (14.7 kB).
 
vague_encoded.pickel ADDED
Binary file (33.1 kB).
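
The five *_encoded.pickel blobs are read by detector.py as dictionaries keyed by the keyword strings in lexical_AMB, scope_AMB, referential_AMB, vague_AMB and coordination_AMB, one sentence-transformer tensor per keyword, so they were presumably produced by encoding those lists with the same all-MiniLM-L6-v2 model. A minimal sketch of such a helper (hypothetical, not part of this commit; keyword lists truncated for brevity):

import pickle
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Full keyword lists live in detector.py; only a few entries are shown here.
keyword_lists = {
    'lexical_encoded.pickel': ['bound', 'break', 'continue'],
    'scope_encoded.pickel': ['all', 'any', 'few'],
    'referential_encoded.pickel': ['everyone', 'everything', 'someone'],
    'coordination_encoded.pickel': ['also', 'unless'],
    'vague_encoded.pickel': ['good', 'better', 'worse'],
}

for filename, words in keyword_lists.items():
    # One tensor per keyword, matching how detector.py indexes lexical_encoded[j].
    encoded = {w: model.encode(w, convert_to_tensor=True) for w in words}
    with open(filename, 'wb') as f:
        pickle.dump(encoded, f)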