zklmorales
/

Filipino_Grammar_Emendation

TensorBoard

Safetensors

Model card Files Files and versions Metrics Training metrics Community

zklmorales committed on Apr 1

Commit

1a873d5

•

1 Parent(s): e166efd

Upload Final.ipynb

Browse files

Files changed (1) hide show

Final.ipynb +65 -61

Final.ipynb CHANGED Viewed

@@ -2,35 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Grammar correction candidates:\n",
-      "Candidate: Siya ay nagising kanina .\n",
-      "Probability: 0.9917004704475403\n",
-      "Cosine Similarity: 0.18928596377372742\n",
-      "\n",
-      "Candidate: Siya ay dumating kanina .\n",
-      "Probability: 0.9892023205757141\n",
-      "Cosine Similarity: 0.002990148961544037\n",
-      "\n",
-      "Candidate: Siya ay namatay kanina .\n",
-      "Probability: 0.9889046549797058\n",
-      "Cosine Similarity: -0.04294966533780098\n",
-      "\n",
-      "Candidate: Siya ay nagbitiw kanina .\n",
-      "Probability: 0.9842618703842163\n",
-      "Cosine Similarity: -0.029277324676513672\n",
-      "\n",
-      "Candidate: Siya ay nahuli kanina .\n",
-      "Probability: 0.9830281734466553\n",
-      "Cosine Similarity: -0.02716892771422863\n",
-      "\n",
-      "Original sentence POS Tags: ['PRS', 'LM', 'VBTF', 'RBW', 'PMP']\n"
      ]
     }
    ],
@@ -39,16 +23,20 @@
     "import nltk\n",
     "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
     "import joblib\n",
     "\n",
-    "# Load pre-trained RoBERTa tokenizer and model for MLM\n",
-    "tokenizer_mlm = AutoTokenizer.from_pretrained(\"fine_tuned_model\")\n",
-    "model_mlm = AutoModelForMaskedLM.from_pretrained(\"fine_tuned_model\")\n",
     "\n",
     "# Load pre-trained BERT tokenizer and model for sequence classification\n",
     "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
     "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
     "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
     "\n",
     "def word_features(sent, i):\n",
     "    word = sent[i][0]\n",
     "    pos = sent[i][1]\n",
@@ -98,38 +86,35 @@
     "            'prev2pos': prev2pos           \n",
     "           }\n",
     "\n",
-    "new_sentence = \"Siya ay magigising kanina.\"\n",
     "\n",
     "tokens = nltk.word_tokenize(new_sentence)\n",
-    "\n",
-    "tagged_tokens = []\n",
-    "\n",
-    "for token in tokens:\n",
-    "    pos_tag = nltk.pos_tag([token])[0][1]\n",
-    "    tagged_tokens.append((token, pos_tag))\n",
     "\n",
     "# Extract features for each token in the new sentence\n",
     "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
     "\n",
-    "# Use the trained CRF model to predict labels for the tokens\n",
-    "predicted_labels = crf_model.predict([features])[0]\n",
-    "\n",
-    "# Forward pass through the model for sequence classification\n",
     "inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
     "with torch.no_grad():\n",
     "    outputs_cls = model_cls(**inputs_cls)\n",
-    "\n",
-    "# Get softmax probabilities\n",
     "probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
-    "\n",
-    "# Get the most probable class\n",
     "predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
     "\n",
     "# Check if the sentence is grammatically correct\n",
-    "if predicted_class == 1:  # Assuming class 0 represents grammatical correctness\n",
     "    print(\"The sentence is grammatically correct.\")\n",
     "else:\n",
-    "    # Proceed with grammar correction candidates\n",
     "    grammar_correction_candidates = []\n",
     "\n",
     "    # Iterate over each word and mask it, then predict the masked word\n",
@@ -161,43 +146,62 @@
     "                # Get embeddings for the masked word and the candidate word\n",
     "                original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
     "                candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
-    "\n",
     "                # Compute cosine similarity between original masked word and predicted word\n",
     "                similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
-    "\n",
     "                replaced_words = masked_words.copy()\n",
     "                replaced_words[i] = candidate_mlm\n",
     "                corrected_sentence = \" \".join(replaced_words).split()  # Split and join to remove extra spaces\n",
     "                corrected_sentence = \" \".join(corrected_sentence)  # Join words without extra spaces\n",
-    "\n",
     "                # Tokenize the corrected sentence for sequence classification\n",
     "                inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
     "\n",
-    "                # Forward pass through the model for sequence classification\n",
     "                with torch.no_grad():\n",
     "                    outputs_cls = model_cls(**inputs_cls)\n",
     "\n",
-    "                # Get softmax probabilities\n",
-    "                probabilities = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
     "\n",
-    "                # Get the most probable class\n",
-    "                predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
     "\n",
-    "                # Append the corrected sentence along with its probability and class\n",
-    "                grammar_correction_candidates.append((corrected_sentence, probabilities[predicted_class], similarity))\n",
     "\n",
     "    # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
     "    grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)\n",
     "\n",
-    "    # Print the top 5 most probable grammar correction candidates with high cosine similarity\n",
-    "    print(\"Grammar correction candidates:\")\n",
-    "    for candidate, probability, cosine_similarity in grammar_correction_candidates[:5]:\n",
-    "        print(\"Candidate:\", candidate)\n",
-    "        print(\"Probability:\", probability)\n",
-    "        print(\"Cosine Similarity:\", cosine_similarity)\n",
-    "        print()\n",
     "\n",
-    "print(\"Original sentence POS Tags:\", predicted_labels)\n"
    ]
   }
  ],

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Original sentence: Magigising siya kanina dahil sa ingay\n",
+      "Sentence: Nagulat siya kanina dahil sa ingay\n",
+      "Correctness Probability: 0.9978345036506653\n",
+      "Cosine Similarity: 0.22926439344882965\n",
+      "Levenshtein Score: 82\n",
+      "[('Nagulat siya kanina dahil sa ingay', 0.9978345036506653, 0.22926439344882965, 82)]\n"
      ]
     }
    ],
     "import nltk\n",
     "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
     "import joblib\n",
+    "from fuzzywuzzy import fuzz\n",
     "\n",
+    "\n",
+    "tokenizer_mlm = AutoTokenizer.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
+    "model_mlm = AutoModelForMaskedLM.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
     "\n",
     "# Load pre-trained BERT tokenizer and model for sequence classification\n",
     "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
     "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
+    "\n",
+    "# Load CRF Model for POS Tagging\n",
     "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
     "\n",
+    "# Define function to extract word features\n",
     "def word_features(sent, i):\n",
     "    word = sent[i][0]\n",
     "    pos = sent[i][1]\n",
     "            'prev2pos': prev2pos           \n",
     "           }\n",
     "\n",
+    "# Define the new sentence\n",
+    "new_sentence = input(\"Sentence: \")\n",
     "\n",
+    "# Tokenize the new sentence and get POS tags\n",
     "tokens = nltk.word_tokenize(new_sentence)\n",
+    "tagged_tokens = [nltk.pos_tag([token])[0] for token in tokens]\n",
     "\n",
     "# Extract features for each token in the new sentence\n",
     "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
     "\n",
+    "# Use the BERT classifier to check if the sentence is grammatically correct\n",
     "inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
     "with torch.no_grad():\n",
     "    outputs_cls = model_cls(**inputs_cls)\n",
     "probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
     "predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
     "\n",
     "# Check if the sentence is grammatically correct\n",
+    "if predicted_class == 1:\n",
     "    print(\"The sentence is grammatically correct.\")\n",
     "else:\n",
+    "    # Use the CRF model to predict POS tags for the tokens\n",
+    "    predicted_labels = crf_model.predict([features])[0]\n",
+    "\n",
+    "    # Combine tokens with predicted labels\n",
+    "    predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
+    "\n",
+    "    print(\"Original sentence:\", new_sentence)\n",
+    "\n",
     "    grammar_correction_candidates = []\n",
     "\n",
     "    # Iterate over each word and mask it, then predict the masked word\n",
     "                # Get embeddings for the masked word and the candidate word\n",
     "                original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
     "                candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
+    "                \n",
     "                # Compute cosine similarity between original masked word and predicted word\n",
     "                similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
+    "                \n",
     "                replaced_words = masked_words.copy()\n",
     "                replaced_words[i] = candidate_mlm\n",
     "                corrected_sentence = \" \".join(replaced_words).split()  # Split and join to remove extra spaces\n",
     "                corrected_sentence = \" \".join(corrected_sentence)  # Join words without extra spaces\n",
+    "                \n",
     "                # Tokenize the corrected sentence for sequence classification\n",
     "                inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
     "\n",
+    "                # Forward pass through the model for sequence classification 1 or 0 \n",
     "                with torch.no_grad():\n",
     "                    outputs_cls = model_cls(**inputs_cls)\n",
     "\n",
+    "                # Get softmax probabilities for class indicating grammatically correct sentences\n",
+    "                probability = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()[1]\n",
     "\n",
+    "                # Append the corrected sentence along with its probability and cosine similarity\n",
+    "                grammar_correction_candidates.append((corrected_sentence, probability, similarity))\n",
     "\n",
     "\n",
     "    # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
     "    grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)\n",
     "\n",
     "\n",
+    "    threshold = 60  # Adjust this threshold according to your requirement\n",
+    "    # Initialize a list to store the top 5 candidates\n",
+    "    top_candidates = []\n",
+    "\n",
+    "    # Iterate over each candidate and keep track of the top 5 based on cosine similarity\n",
+    "    for candidate, probability, cosine_similarity in grammar_correction_candidates:\n",
+    "        fuzzy_match_score = fuzz.ratio(new_sentence, candidate)\n",
+    "        \n",
+    "        # Check if the current candidate should be included in the top 5\n",
+    "        if len(top_candidates) < 1:\n",
+    "            top_candidates.append((candidate, probability, cosine_similarity, fuzzy_match_score))\n",
+    "            # Sort the top_candidates based on cosine similarity in descending order\n",
+    "            top_candidates.sort(key=lambda x: x[2], reverse=True)\n",
+    "        else:\n",
+    "            # Compare the cosine similarity of the current candidate with the lowest similarity in the top_candidates\n",
+    "            min_similarity = min(top_candidates, key=lambda x: x[2])[2]\n",
+    "            if cosine_similarity > min_similarity:\n",
+    "                # Replace the candidate with the lowest similarity in the top_candidates list\n",
+    "                min_index = top_candidates.index(min(top_candidates, key=lambda x: x[2]))\n",
+    "                top_candidates[min_index] = (candidate, probability, cosine_similarity, fuzzy_match_score)\n",
+    "                # Sort the top_candidates based on cosine similarity in descending order\n",
+    "                top_candidates.sort(key=lambda x: x[2], reverse=True)\n",
+    "\n",
+    "    for idx, (candidate, probability, cosine_similarity, fuzzy_match_score) in enumerate(top_candidates):\n",
+    "        print(\"Sentence:\", candidate)\n",
+    "        print(\"Correctness Probability:\", probability)\n",
+    "        print(\"Cosine Similarity:\", cosine_similarity)\n",
+    "        print(\"Levenshtein Score:\", fuzzy_match_score)\n",
+    "        print(top_candidates)\n"
    ]
   }
  ],