zklmorales committed on
Commit
1a873d5
1 Parent(s): e166efd

Upload Final.ipynb

Browse files
Files changed (1) hide show
  1. Final.ipynb +65 -61
Final.ipynb CHANGED
@@ -2,35 +2,19 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 14,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
12
- "Grammar correction candidates:\n",
13
- "Candidate: Siya ay nagising kanina .\n",
14
- "Probability: 0.9917004704475403\n",
15
- "Cosine Similarity: 0.18928596377372742\n",
16
- "\n",
17
- "Candidate: Siya ay dumating kanina .\n",
18
- "Probability: 0.9892023205757141\n",
19
- "Cosine Similarity: 0.002990148961544037\n",
20
- "\n",
21
- "Candidate: Siya ay namatay kanina .\n",
22
- "Probability: 0.9889046549797058\n",
23
- "Cosine Similarity: -0.04294966533780098\n",
24
- "\n",
25
- "Candidate: Siya ay nagbitiw kanina .\n",
26
- "Probability: 0.9842618703842163\n",
27
- "Cosine Similarity: -0.029277324676513672\n",
28
- "\n",
29
- "Candidate: Siya ay nahuli kanina .\n",
30
- "Probability: 0.9830281734466553\n",
31
- "Cosine Similarity: -0.02716892771422863\n",
32
- "\n",
33
- "Original sentence POS Tags: ['PRS', 'LM', 'VBTF', 'RBW', 'PMP']\n"
34
  ]
35
  }
36
  ],
@@ -39,16 +23,20 @@
39
  "import nltk\n",
40
  "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
41
  "import joblib\n",
 
42
  "\n",
43
- "# Load pre-trained RoBERTa tokenizer and model for MLM\n",
44
- "tokenizer_mlm = AutoTokenizer.from_pretrained(\"fine_tuned_model\")\n",
45
- "model_mlm = AutoModelForMaskedLM.from_pretrained(\"fine_tuned_model\")\n",
46
  "\n",
47
  "# Load pre-trained BERT tokenizer and model for sequence classification\n",
48
  "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
49
  "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
 
 
50
  "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
51
  "\n",
 
52
  "def word_features(sent, i):\n",
53
  " word = sent[i][0]\n",
54
  " pos = sent[i][1]\n",
@@ -98,38 +86,35 @@
98
  " 'prev2pos': prev2pos \n",
99
  " }\n",
100
  "\n",
101
- "new_sentence = \"Siya ay magigising kanina.\"\n",
 
102
  "\n",
 
103
  "tokens = nltk.word_tokenize(new_sentence)\n",
104
- "\n",
105
- "tagged_tokens = []\n",
106
- "\n",
107
- "for token in tokens:\n",
108
- " pos_tag = nltk.pos_tag([token])[0][1]\n",
109
- " tagged_tokens.append((token, pos_tag))\n",
110
  "\n",
111
  "# Extract features for each token in the new sentence\n",
112
  "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
113
  "\n",
114
- "# Use the trained CRF model to predict labels for the tokens\n",
115
- "predicted_labels = crf_model.predict([features])[0]\n",
116
- "\n",
117
- "# Forward pass through the model for sequence classification\n",
118
  "inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
119
  "with torch.no_grad():\n",
120
  " outputs_cls = model_cls(**inputs_cls)\n",
121
- "\n",
122
- "# Get softmax probabilities\n",
123
  "probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
124
- "\n",
125
- "# Get the most probable class\n",
126
  "predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
127
  "\n",
128
  "# Check if the sentence is grammatically correct\n",
129
- "if predicted_class == 1: # Assuming class 0 represents grammatical correctness\n",
130
  " print(\"The sentence is grammatically correct.\")\n",
131
  "else:\n",
132
- " # Proceed with grammar correction candidates\n",
 
 
 
 
 
 
 
133
  " grammar_correction_candidates = []\n",
134
  "\n",
135
  " # Iterate over each word and mask it, then predict the masked word\n",
@@ -161,43 +146,62 @@
161
  " # Get embeddings for the masked word and the candidate word\n",
162
  " original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
163
  " candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
164
- "\n",
165
  " # Compute cosine similarity between original masked word and predicted word\n",
166
  " similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
167
- "\n",
168
  " replaced_words = masked_words.copy()\n",
169
  " replaced_words[i] = candidate_mlm\n",
170
  " corrected_sentence = \" \".join(replaced_words).split() # Split and join to remove extra spaces\n",
171
  " corrected_sentence = \" \".join(corrected_sentence) # Join words without extra spaces\n",
172
- "\n",
173
  " # Tokenize the corrected sentence for sequence classification\n",
174
  " inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
175
  "\n",
176
- " # Forward pass through the model for sequence classification\n",
177
  " with torch.no_grad():\n",
178
  " outputs_cls = model_cls(**inputs_cls)\n",
179
  "\n",
180
- " # Get softmax probabilities\n",
181
- " probabilities = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
182
  "\n",
183
- " # Get the most probable class\n",
184
- " predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
185
  "\n",
186
- " # Append the corrected sentence along with its probability and class\n",
187
- " grammar_correction_candidates.append((corrected_sentence, probabilities[predicted_class], similarity))\n",
188
  "\n",
189
  " # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
190
  " grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)\n",
191
  "\n",
192
- " # Print the top 5 most probable grammar correction candidates with high cosine similarity\n",
193
- " print(\"Grammar correction candidates:\")\n",
194
- " for candidate, probability, cosine_similarity in grammar_correction_candidates[:5]:\n",
195
- " print(\"Candidate:\", candidate)\n",
196
- " print(\"Probability:\", probability)\n",
197
- " print(\"Cosine Similarity:\", cosine_similarity)\n",
198
- " print()\n",
199
  "\n",
200
- "print(\"Original sentence POS Tags:\", predicted_labels)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  ]
202
  }
203
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 12,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
12
+ "Original sentence: Magigising siya kanina dahil sa ingay\n",
13
+ "Sentence: Nagulat siya kanina dahil sa ingay\n",
14
+ "Correctness Probability: 0.9978345036506653\n",
15
+ "Cosine Similarity: 0.22926439344882965\n",
16
+ "Levenshtein Score: 82\n",
17
+ "[('Nagulat siya kanina dahil sa ingay', 0.9978345036506653, 0.22926439344882965, 82)]\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ]
19
  }
20
  ],
 
23
  "import nltk\n",
24
  "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
25
  "import joblib\n",
26
+ "from fuzzywuzzy import fuzz\n",
27
  "\n",
28
+ "\n",
29
+ "tokenizer_mlm = AutoTokenizer.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
30
+ "model_mlm = AutoModelForMaskedLM.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
31
  "\n",
32
  "# Load pre-trained BERT tokenizer and model for sequence classification\n",
33
  "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
34
  "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
35
+ "\n",
36
+ "# Load CRF Model for POS Tagging\n",
37
  "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
38
  "\n",
39
+ "# Define function to extract word features\n",
40
  "def word_features(sent, i):\n",
41
  " word = sent[i][0]\n",
42
  " pos = sent[i][1]\n",
 
86
  " 'prev2pos': prev2pos \n",
87
  " }\n",
88
  "\n",
89
+ "# Define the new sentence\n",
90
+ "new_sentence = input(\"Sentence: \")\n",
91
  "\n",
92
+ "# Tokenize the new sentence and get POS tags\n",
93
  "tokens = nltk.word_tokenize(new_sentence)\n",
94
+ "tagged_tokens = [nltk.pos_tag([token])[0] for token in tokens]\n",
 
 
 
 
 
95
  "\n",
96
  "# Extract features for each token in the new sentence\n",
97
  "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
98
  "\n",
99
+ "# Use the BERT classifier to check if the sentence is grammatically correct\n",
 
 
 
100
  "inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
101
  "with torch.no_grad():\n",
102
  " outputs_cls = model_cls(**inputs_cls)\n",
 
 
103
  "probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
 
 
104
  "predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
105
  "\n",
106
  "# Check if the sentence is grammatically correct\n",
107
+ "if predicted_class == 1:\n",
108
  " print(\"The sentence is grammatically correct.\")\n",
109
  "else:\n",
110
+ " # Use the CRF model to predict POS tags for the tokens\n",
111
+ " predicted_labels = crf_model.predict([features])[0]\n",
112
+ "\n",
113
+ " # Combine tokens with predicted labels\n",
114
+ " predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
115
+ "\n",
116
+ " print(\"Original sentence:\", new_sentence)\n",
117
+ "\n",
118
  " grammar_correction_candidates = []\n",
119
  "\n",
120
  " # Iterate over each word and mask it, then predict the masked word\n",
 
146
  " # Get embeddings for the masked word and the candidate word\n",
147
  " original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
148
  " candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
149
+ " \n",
150
  " # Compute cosine similarity between original masked word and predicted word\n",
151
  " similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
152
+ " \n",
153
  " replaced_words = masked_words.copy()\n",
154
  " replaced_words[i] = candidate_mlm\n",
155
  " corrected_sentence = \" \".join(replaced_words).split() # Split and join to remove extra spaces\n",
156
  " corrected_sentence = \" \".join(corrected_sentence) # Join words without extra spaces\n",
157
+ " \n",
158
  " # Tokenize the corrected sentence for sequence classification\n",
159
  " inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
160
  "\n",
161
+ " # Forward pass through the sequence-classification model (predicts class 1 or 0)\n",
162
  " with torch.no_grad():\n",
163
  " outputs_cls = model_cls(**inputs_cls)\n",
164
  "\n",
165
+ " # Get softmax probabilities for class indicating grammatically correct sentences\n",
166
+ " probability = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()[1]\n",
167
  "\n",
168
+ " # Append the corrected sentence along with its probability and cosine similarity\n",
169
+ " grammar_correction_candidates.append((corrected_sentence, probability, similarity))\n",
170
  "\n",
 
 
171
  "\n",
172
  " # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
173
  " grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)\n",
174
  "\n",
 
 
 
 
 
 
 
175
  "\n",
176
+ " threshold = 60 # Adjust this threshold according to your requirement\n",
177
+ " # Initialize a list to store the top candidates (the length check below caps it at one entry)\n",
178
+ " top_candidates = []\n",
179
+ "\n",
180
+ " # Iterate over each candidate and keep the one(s) with the highest cosine similarity\n",
181
+ " for candidate, probability, cosine_similarity in grammar_correction_candidates:\n",
182
+ " fuzzy_match_score = fuzz.ratio(new_sentence, candidate)\n",
183
+ " \n",
184
+ " # Fill the list until it holds one candidate; after that, only replace on higher cosine similarity\n",
185
+ " if len(top_candidates) < 1:\n",
186
+ " top_candidates.append((candidate, probability, cosine_similarity, fuzzy_match_score))\n",
187
+ " # Sort the top_candidates based on cosine similarity in descending order\n",
188
+ " top_candidates.sort(key=lambda x: x[2], reverse=True)\n",
189
+ " else:\n",
190
+ " # Compare the cosine similarity of the current candidate with the lowest similarity in the top_candidates\n",
191
+ " min_similarity = min(top_candidates, key=lambda x: x[2])[2]\n",
192
+ " if cosine_similarity > min_similarity:\n",
193
+ " # Replace the candidate with the lowest similarity in the top_candidates list\n",
194
+ " min_index = top_candidates.index(min(top_candidates, key=lambda x: x[2]))\n",
195
+ " top_candidates[min_index] = (candidate, probability, cosine_similarity, fuzzy_match_score)\n",
196
+ " # Sort the top_candidates based on cosine similarity in descending order\n",
197
+ " top_candidates.sort(key=lambda x: x[2], reverse=True)\n",
198
+ "\n",
199
+ " for idx, (candidate, probability, cosine_similarity, fuzzy_match_score) in enumerate(top_candidates):\n",
200
+ " print(\"Sentence:\", candidate)\n",
201
+ " print(\"Correctness Probability:\", probability)\n",
202
+ " print(\"Cosine Similarity:\", cosine_similarity)\n",
203
+ " print(\"Levenshtein Score:\", fuzzy_match_score)\n",
204
+ " print(top_candidates)\n"
205
  ]
206
  }
207
  ],