Commit
•
1a873d5
1
Parent(s):
e166efd
Upload Final.ipynb
Browse files- Final.ipynb +65 -61
Final.ipynb
CHANGED
@@ -2,35 +2,19 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
12 |
-
"
|
13 |
-
"
|
14 |
-
"Probability: 0.
|
15 |
-
"Cosine Similarity: 0.
|
16 |
-
"\n",
|
17 |
-
"
|
18 |
-
"Probability: 0.9892023205757141\n",
|
19 |
-
"Cosine Similarity: 0.002990148961544037\n",
|
20 |
-
"\n",
|
21 |
-
"Candidate: Siya ay namatay kanina .\n",
|
22 |
-
"Probability: 0.9889046549797058\n",
|
23 |
-
"Cosine Similarity: -0.04294966533780098\n",
|
24 |
-
"\n",
|
25 |
-
"Candidate: Siya ay nagbitiw kanina .\n",
|
26 |
-
"Probability: 0.9842618703842163\n",
|
27 |
-
"Cosine Similarity: -0.029277324676513672\n",
|
28 |
-
"\n",
|
29 |
-
"Candidate: Siya ay nahuli kanina .\n",
|
30 |
-
"Probability: 0.9830281734466553\n",
|
31 |
-
"Cosine Similarity: -0.02716892771422863\n",
|
32 |
-
"\n",
|
33 |
-
"Original sentence POS Tags: ['PRS', 'LM', 'VBTF', 'RBW', 'PMP']\n"
|
34 |
]
|
35 |
}
|
36 |
],
|
@@ -39,16 +23,20 @@
|
|
39 |
"import nltk\n",
|
40 |
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
|
41 |
"import joblib\n",
|
|
|
42 |
"\n",
|
43 |
-
"
|
44 |
-
"tokenizer_mlm = AutoTokenizer.from_pretrained(\"
|
45 |
-
"model_mlm = AutoModelForMaskedLM.from_pretrained(\"
|
46 |
"\n",
|
47 |
"# Load pre-trained BERT tokenizer and model for sequence classification\n",
|
48 |
"tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
|
49 |
"model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
|
|
|
|
|
50 |
"crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
|
51 |
"\n",
|
|
|
52 |
"def word_features(sent, i):\n",
|
53 |
" word = sent[i][0]\n",
|
54 |
" pos = sent[i][1]\n",
|
@@ -98,38 +86,35 @@
|
|
98 |
" 'prev2pos': prev2pos \n",
|
99 |
" }\n",
|
100 |
"\n",
|
101 |
-
"
|
|
|
102 |
"\n",
|
|
|
103 |
"tokens = nltk.word_tokenize(new_sentence)\n",
|
104 |
-
"\n",
|
105 |
-
"tagged_tokens = []\n",
|
106 |
-
"\n",
|
107 |
-
"for token in tokens:\n",
|
108 |
-
" pos_tag = nltk.pos_tag([token])[0][1]\n",
|
109 |
-
" tagged_tokens.append((token, pos_tag))\n",
|
110 |
"\n",
|
111 |
"# Extract features for each token in the new sentence\n",
|
112 |
"features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
|
113 |
"\n",
|
114 |
-
"# Use the
|
115 |
-
"predicted_labels = crf_model.predict([features])[0]\n",
|
116 |
-
"\n",
|
117 |
-
"# Forward pass through the model for sequence classification\n",
|
118 |
"inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
|
119 |
"with torch.no_grad():\n",
|
120 |
" outputs_cls = model_cls(**inputs_cls)\n",
|
121 |
-
"\n",
|
122 |
-
"# Get softmax probabilities\n",
|
123 |
"probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
|
124 |
-
"\n",
|
125 |
-
"# Get the most probable class\n",
|
126 |
"predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
|
127 |
"\n",
|
128 |
"# Check if the sentence is grammatically correct\n",
|
129 |
-
"if predicted_class == 1
|
130 |
" print(\"The sentence is grammatically correct.\")\n",
|
131 |
"else:\n",
|
132 |
-
" #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
" grammar_correction_candidates = []\n",
|
134 |
"\n",
|
135 |
" # Iterate over each word and mask it, then predict the masked word\n",
|
@@ -161,43 +146,62 @@
|
|
161 |
" # Get embeddings for the masked word and the candidate word\n",
|
162 |
" original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
|
163 |
" candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
|
164 |
-
"\n",
|
165 |
" # Compute cosine similarity between original masked word and predicted word\n",
|
166 |
" similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
|
167 |
-
"\n",
|
168 |
" replaced_words = masked_words.copy()\n",
|
169 |
" replaced_words[i] = candidate_mlm\n",
|
170 |
" corrected_sentence = \" \".join(replaced_words).split() # Split and join to remove extra spaces\n",
|
171 |
" corrected_sentence = \" \".join(corrected_sentence) # Join words without extra spaces\n",
|
172 |
-
"\n",
|
173 |
" # Tokenize the corrected sentence for sequence classification\n",
|
174 |
" inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
|
175 |
"\n",
|
176 |
-
" # Forward pass through the model for sequence classification\n",
|
177 |
" with torch.no_grad():\n",
|
178 |
" outputs_cls = model_cls(**inputs_cls)\n",
|
179 |
"\n",
|
180 |
-
" # Get softmax probabilities\n",
|
181 |
-
"
|
182 |
"\n",
|
183 |
-
" #
|
184 |
-
"
|
185 |
"\n",
|
186 |
-
" # Append the corrected sentence along with its probability and class\n",
|
187 |
-
" grammar_correction_candidates.append((corrected_sentence, probabilities[predicted_class], similarity))\n",
|
188 |
"\n",
|
189 |
" # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
|
190 |
" grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)\n",
|
191 |
"\n",
|
192 |
-
" # Print the top 5 most probable grammar correction candidates with high cosine similarity\n",
|
193 |
-
" print(\"Grammar correction candidates:\")\n",
|
194 |
-
" for candidate, probability, cosine_similarity in grammar_correction_candidates[:5]:\n",
|
195 |
-
" print(\"Candidate:\", candidate)\n",
|
196 |
-
" print(\"Probability:\", probability)\n",
|
197 |
-
" print(\"Cosine Similarity:\", cosine_similarity)\n",
|
198 |
-
" print()\n",
|
199 |
"\n",
|
200 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
]
|
202 |
}
|
203 |
],
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 12,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
12 |
+
"Original sentence: Magigising siya kanina dahil sa ingay\n",
|
13 |
+
"Sentence: Nagulat siya kanina dahil sa ingay\n",
|
14 |
+
"Correctness Probability: 0.9978345036506653\n",
|
15 |
+
"Cosine Similarity: 0.22926439344882965\n",
|
16 |
+
"Levenshtein Score: 82\n",
|
17 |
+
"[('Nagulat siya kanina dahil sa ingay', 0.9978345036506653, 0.22926439344882965, 82)]\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
]
|
19 |
}
|
20 |
],
|
|
|
23 |
"import nltk\n",
|
24 |
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
|
25 |
"import joblib\n",
|
26 |
+
"from fuzzywuzzy import fuzz\n",
|
27 |
"\n",
|
28 |
+
"\n",
|
29 |
+
"tokenizer_mlm = AutoTokenizer.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
|
30 |
+
"model_mlm = AutoModelForMaskedLM.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
|
31 |
"\n",
|
32 |
"# Load pre-trained BERT tokenizer and model for sequence classification\n",
|
33 |
"tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
|
34 |
"model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
|
35 |
+
"\n",
|
36 |
+
"# Load CRF Model for POS Tagging\n",
|
37 |
"crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
|
38 |
"\n",
|
39 |
+
"# Define function to extract word features\n",
|
40 |
"def word_features(sent, i):\n",
|
41 |
" word = sent[i][0]\n",
|
42 |
" pos = sent[i][1]\n",
|
|
|
86 |
" 'prev2pos': prev2pos \n",
|
87 |
" }\n",
|
88 |
"\n",
|
89 |
+
"# Read the sentence to check from user input\n",
|
90 |
+
"new_sentence = input(\"Sentence: \")\n",
|
91 |
"\n",
|
92 |
+
"# Tokenize the new sentence and get POS tags\n",
|
93 |
"tokens = nltk.word_tokenize(new_sentence)\n",
|
94 |
+
"tagged_tokens = [nltk.pos_tag([token])[0] for token in tokens]\n",
|
|
|
|
|
|
|
|
|
|
|
95 |
"\n",
|
96 |
"# Extract features for each token in the new sentence\n",
|
97 |
"features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
|
98 |
"\n",
|
99 |
+
"# Use the BERT classifier to check if the sentence is grammatically correct\n",
|
|
|
|
|
|
|
100 |
"inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
|
101 |
"with torch.no_grad():\n",
|
102 |
" outputs_cls = model_cls(**inputs_cls)\n",
|
|
|
|
|
103 |
"probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
|
|
|
|
|
104 |
"predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
|
105 |
"\n",
|
106 |
"# Check if the sentence is grammatically correct\n",
|
107 |
+
"if predicted_class == 1:\n",
|
108 |
" print(\"The sentence is grammatically correct.\")\n",
|
109 |
"else:\n",
|
110 |
+
" # Use the CRF model to predict POS tags for the tokens\n",
|
111 |
+
" predicted_labels = crf_model.predict([features])[0]\n",
|
112 |
+
"\n",
|
113 |
+
" # Combine tokens with predicted labels\n",
|
114 |
+
" predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
|
115 |
+
"\n",
|
116 |
+
" print(\"Original sentence:\", new_sentence)\n",
|
117 |
+
"\n",
|
118 |
" grammar_correction_candidates = []\n",
|
119 |
"\n",
|
120 |
" # Iterate over each word and mask it, then predict the masked word\n",
|
|
|
146 |
" # Get embeddings for the masked word and the candidate word\n",
|
147 |
" original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
|
148 |
" candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
|
149 |
+
" \n",
|
150 |
" # Compute cosine similarity between original masked word and predicted word\n",
|
151 |
" similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
|
152 |
+
" \n",
|
153 |
" replaced_words = masked_words.copy()\n",
|
154 |
" replaced_words[i] = candidate_mlm\n",
|
155 |
" corrected_sentence = \" \".join(replaced_words).split() # Split and join to remove extra spaces\n",
|
156 |
" corrected_sentence = \" \".join(corrected_sentence) # Join words without extra spaces\n",
|
157 |
+
" \n",
|
158 |
" # Tokenize the corrected sentence for sequence classification\n",
|
159 |
" inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
|
160 |
"\n",
|
161 |
+
" # Forward pass through the sequence classifier (class index 1 = grammatically correct)\n",
|
162 |
" with torch.no_grad():\n",
|
163 |
" outputs_cls = model_cls(**inputs_cls)\n",
|
164 |
"\n",
|
165 |
+
" # Get softmax probabilities for class indicating grammatically correct sentences\n",
|
166 |
+
" probability = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()[1]\n",
|
167 |
"\n",
|
168 |
+
" # Append the corrected sentence along with its probability and cosine similarity\n",
|
169 |
+
" grammar_correction_candidates.append((corrected_sentence, probability, similarity))\n",
|
170 |
"\n",
|
|
|
|
|
171 |
"\n",
|
172 |
" # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
|
173 |
" grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)\n",
|
174 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
"\n",
|
176 |
+
" threshold = 60 # Adjust this threshold according to your requirement\n",
|
177 |
+
" # Initialize the list of best candidates (the size check below caps it at 1 entry, not 5)\n",
|
178 |
+
" top_candidates = []\n",
|
179 |
+
"\n",
|
180 |
+
" # Iterate over each candidate and keep only the highest-cosine-similarity one(s); capacity is currently 1\n",
|
181 |
+
" for candidate, probability, cosine_similarity in grammar_correction_candidates:\n",
|
182 |
+
" fuzzy_match_score = fuzz.ratio(new_sentence, candidate)\n",
|
183 |
+
" \n",
|
184 |
+
" # Fill the list until it reaches its capacity (currently 1 candidate)\n",
|
185 |
+
" if len(top_candidates) < 1:\n",
|
186 |
+
" top_candidates.append((candidate, probability, cosine_similarity, fuzzy_match_score))\n",
|
187 |
+
" # Sort the top_candidates based on cosine similarity in descending order\n",
|
188 |
+
" top_candidates.sort(key=lambda x: x[2], reverse=True)\n",
|
189 |
+
" else:\n",
|
190 |
+
" # Compare the cosine similarity of the current candidate with the lowest similarity in the top_candidates\n",
|
191 |
+
" min_similarity = min(top_candidates, key=lambda x: x[2])[2]\n",
|
192 |
+
" if cosine_similarity > min_similarity:\n",
|
193 |
+
" # Replace the candidate with the lowest similarity in the top_candidates list\n",
|
194 |
+
" min_index = top_candidates.index(min(top_candidates, key=lambda x: x[2]))\n",
|
195 |
+
" top_candidates[min_index] = (candidate, probability, cosine_similarity, fuzzy_match_score)\n",
|
196 |
+
" # Sort the top_candidates based on cosine similarity in descending order\n",
|
197 |
+
" top_candidates.sort(key=lambda x: x[2], reverse=True)\n",
|
198 |
+
"\n",
|
199 |
+
" for idx, (candidate, probability, cosine_similarity, fuzzy_match_score) in enumerate(top_candidates):\n",
|
200 |
+
" print(\"Sentence:\", candidate)\n",
|
201 |
+
" print(\"Correctness Probability:\", probability)\n",
|
202 |
+
" print(\"Cosine Similarity:\", cosine_similarity)\n",
|
203 |
+
" print(\"Levenshtein Score:\", fuzzy_match_score)\n",
|
204 |
+
" print(top_candidates)\n"
|
205 |
]
|
206 |
}
|
207 |
],
|