File size: 9,907 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\fuzzywuzzy\\fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
      "  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original sentence: Magugulat ako kanina dahil sa pagsabog\n",
      "Sentence: Nagulat ako kanina dahil sa pagsabog\n",
      "Correctness Probability: 0.9976696372032166\n",
      "Cosine Similarity: 0.20241191983222961\n",
      "Levenshtein Score: 75\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import nltk\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
    "import joblib\n",
    "from fuzzywuzzy import fuzz\n",
    "\n",
    "\n",
    "tokenizer_mlm = AutoTokenizer.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
    "model_mlm = AutoModelForMaskedLM.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n",
    "\n",
    "# Load pre-trained BERT tokenizer and model for sequence classification\n",
    "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
    "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
    "\n",
    "# Load CRF Model for POS Tagging\n",
    "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
    "\n",
    "# Define function to extract word features\n",
    "def word_features(sent, i):\n",
    "    word = sent[i][0]\n",
    "    pos = sent[i][1]\n",
    "    \n",
    "    # first word\n",
    "    if i == 0:\n",
    "        prevword = '<START>'\n",
    "        prevpos = '<START>'\n",
    "    else:\n",
    "        prevword = sent[i-1][0]\n",
    "        prevpos = sent[i-1][1]\n",
    "        \n",
    "    # first or second word\n",
    "    if i == 0 or i == 1:\n",
    "        prev2word = '<START>'\n",
    "        prev2pos = '<START>'\n",
    "    else:\n",
    "        prev2word = sent[i-2][0]\n",
    "        prev2pos = sent[i-2][1]\n",
    "    \n",
    "    # last word\n",
    "    if i == len(sent) - 1:\n",
    "        nextword = '<END>'\n",
    "        nextpos = '<END>'\n",
    "    else:\n",
    "        nextword = sent[i+1][0]\n",
    "        nextpos = sent[i+1][1]\n",
    "    \n",
    "    # suffixes and prefixes\n",
    "    pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]\n",
    "    suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]\n",
    "    \n",
    "    return {'word':word,            \n",
    "            'prevword': prevword,\n",
    "            'prevpos': prevpos,  \n",
    "            'nextword': nextword, \n",
    "            'nextpos': nextpos,          \n",
    "            'suff_1': suff_1,  \n",
    "            'suff_2': suff_2,  \n",
    "            'suff_3': suff_3,  \n",
    "            'suff_4': suff_4, \n",
    "            'pref_1': pref_1,  \n",
    "            'pref_2': pref_2,  \n",
    "            'pref_3': pref_3, \n",
    "            'pref_4': pref_4,\n",
    "            'prev2word': prev2word,\n",
    "            'prev2pos': prev2pos           \n",
    "           }\n",
    "\n",
    "# Define the new sentence\n",
    "new_sentence = input(\"Sentence: \")\n",
    "\n",
    "# Tokenize the new sentence and get POS tags\n",
    "tokens = nltk.word_tokenize(new_sentence)\n",
    "tagged_tokens = [nltk.pos_tag([token])[0] for token in tokens]\n",
    "\n",
    "# Extract features for each token in the new sentence\n",
    "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
    "\n",
    "# Use the BERT classifier to check if the sentence is grammatically correct\n",
    "inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n",
    "with torch.no_grad():\n",
    "    outputs_cls = model_cls(**inputs_cls)\n",
    "probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
    "predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
    "\n",
    "# Check if the sentence is grammatically correct\n",
    "if predicted_class == 1:\n",
    "    print(\"The sentence is grammatically correct.\")\n",
    "else:\n",
    "    # Use the CRF model to predict POS tags for the tokens\n",
    "    predicted_labels = crf_model.predict([features])[0]\n",
    "\n",
    "    # Combine tokens with predicted labels\n",
    "    predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
    "\n",
    "    print(\"Original sentence:\", new_sentence)\n",
    "\n",
    "    grammar_correction_candidates = []\n",
    "\n",
    "    # Iterate over each word and mask it, then predict the masked word\n",
    "    for i, (token, predicted_label) in enumerate(zip(tokens, predicted_labels)):\n",
    "        # Check if the predicted label is a verb\n",
    "        if predicted_label.startswith('VB'):\n",
    "            # Mask the word\n",
    "            masked_words = tokens.copy()\n",
    "            masked_words[i] = tokenizer_mlm.mask_token\n",
    "            masked_sentence = \" \".join(masked_words)\n",
    "\n",
    "            # Tokenize the masked sentence\n",
    "            tokens_mlm = tokenizer_mlm(masked_sentence, return_tensors=\"pt\")\n",
    "\n",
    "            # Get the position of the masked token\n",
    "            masked_index = torch.where(tokens_mlm[\"input_ids\"] == tokenizer_mlm.mask_token_id)[1][0]\n",
    "\n",
    "            # Get the logits for the masked token\n",
    "            with torch.no_grad():\n",
    "                outputs = model_mlm(**tokens_mlm)\n",
    "                predictions_mlm = outputs.logits\n",
    "\n",
    "            # Get the top predicted words for the masked token\n",
    "            top_predictions_mlm = torch.topk(predictions_mlm[0, masked_index], k=5)\n",
    "            candidates_mlm = [tokenizer_mlm.decode(idx.item()) for idx in top_predictions_mlm.indices]\n",
    "\n",
    "            # Reconstruct the sentence with each candidate\n",
    "            for candidate_mlm in candidates_mlm:\n",
    "                # Get embeddings for the masked word and the candidate word\n",
    "                original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n",
    "                candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n",
    "                \n",
    "                # Compute cosine similarity between original masked word and predicted word\n",
    "                similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n",
    "                fuzzy_match_score = fuzz.ratio(token, candidate_mlm)\n",
    "\n",
    "                replaced_words = masked_words.copy()\n",
    "                replaced_words[i] = candidate_mlm\n",
    "                corrected_sentence = \" \".join(replaced_words).split()  # Split and join to remove extra spaces\n",
    "                corrected_sentence = \" \".join(corrected_sentence)  # Join words without extra spaces\n",
    "                \n",
    "                # Tokenize the corrected sentence for sequence classification\n",
    "                inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
    "\n",
    "                # Forward pass through the model for sequence classification 1 or 0 \n",
    "                with torch.no_grad():\n",
    "                    outputs_cls = model_cls(**inputs_cls)\n",
    "\n",
    "                # Get softmax probabilities for class indicating grammatically correct sentences\n",
    "                probability = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()[1]\n",
    "\n",
    "                # Append the corrected sentence along with its probability and cosine similarity\n",
    "                grammar_correction_candidates.append((corrected_sentence, probability, similarity, fuzzy_match_score))\n",
    "\n",
    "\n",
    "    # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n",
    "    grammar_correction_candidates.sort(key=lambda x: (x[3], x[1], x[2]), reverse=True)\n",
    "\n",
    "if grammar_correction_candidates:\n",
    "    candidate, probability, cosine_similarity, fuzzy_match_score = grammar_correction_candidates[0]\n",
    "    print(\"Sentence:\", candidate)\n",
    "    print(\"Correctness Probability:\", probability)\n",
    "    print(\"Cosine Similarity:\", cosine_similarity)\n",
    "    print(\"Levenshtein Score:\", fuzzy_match_score)\n",
    "\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}