File size: 13,010 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Pupunta', 'VBAF'), ('ako', 'PRS'), ('kanina', 'RBW'), ('sa', 'CCT'), ('mall', 'NNP'), ('upang', 'CCB'), ('bumili', 'VBAF')]\n"
     ]
    }
   ],
   "source": [
    "import joblib\n",
    "import nltk\n",
    "\n",
    "# Load the saved CRF model\n",
    "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
    "\n",
    "def word_features(sent, i):\n",
    "    word = sent[i][0]\n",
    "    pos = sent[i][1]\n",
    "    \n",
    "    # first word\n",
    "    if i == 0:\n",
    "        prevword = '<START>'\n",
    "        prevpos = '<START>'\n",
    "    else:\n",
    "        prevword = sent[i-1][0]\n",
    "        prevpos = sent[i-1][1]\n",
    "        \n",
    "    # first or second word\n",
    "    if i == 0 or i == 1:\n",
    "        prev2word = '<START>'\n",
    "        prev2pos = '<START>'\n",
    "    else:\n",
    "        prev2word = sent[i-2][0]\n",
    "        prev2pos = sent[i-2][1]\n",
    "    \n",
    "    # last word\n",
    "    if i == len(sent) - 1:\n",
    "        nextword = '<END>'\n",
    "        nextpos = '<END>'\n",
    "    else:\n",
    "        nextword = sent[i+1][0]\n",
    "        nextpos = sent[i+1][1]\n",
    "    \n",
    "    # suffixes and prefixes\n",
    "    pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]\n",
    "    suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]\n",
    "    \n",
    "    return {'word':word,            \n",
    "            'prevword': prevword,\n",
    "            'prevpos': prevpos,  \n",
    "            'nextword': nextword, \n",
    "            'nextpos': nextpos,          \n",
    "            'suff_1': suff_1,  \n",
    "            'suff_2': suff_2,  \n",
    "            'suff_3': suff_3,  \n",
    "            'suff_4': suff_4, \n",
    "            'pref_1': pref_1,  \n",
    "            'pref_2': pref_2,  \n",
    "            'pref_3': pref_3, \n",
    "            'pref_4': pref_4,\n",
    "            'prev2word': prev2word,\n",
    "            'prev2pos': prev2pos           \n",
    "           }\n",
    "\n",
    "new_sentence = \"Pupunta ako kanina sa mall upang bumili\"\n",
    "\n",
    "# Tokenize the new sentence\n",
    "tokens = nltk.word_tokenize(new_sentence)\n",
    "\n",
    "\n",
    "tagged_tokens = []\n",
    "\n",
    "for token in tokens:\n",
    "    pos_tag = nltk.pos_tag([token])[0][1]\n",
    "    tagged_tokens.append((token, pos_tag))\n",
    "\n",
    "\n",
    "# Extract features for each token in the new sentence\n",
    "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
    "\n",
    "# Use the trained CRF model to predict labels for the tokens\n",
    "predicted_labels = crf_model.predict([features])[0]\n",
    "\n",
    "# Combine tokens with predicted labels\n",
    "predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
    "\n",
    "print(predicted_tokens_with_labels)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence is grammatically correct.\n",
      "Probabilities: [0.00594444340094924, 0.9940555095672607]\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
    "\n",
    "new_sentence = \"Pupunta ako kahapon sa siyudad upang bumili ang mga gamit ko\"\n",
    "\n",
    "# Tokenize the input text\n",
    "inputs = tokenizer(new_sentence, return_tensors=\"pt\")\n",
    "\n",
    "# Forward pass through the model\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs)\n",
    "\n",
    "# Get the predicted class (label) from the model output\n",
    "predicted_class = torch.argmax(outputs.logits, dim=1).item()\n",
    "\n",
    "# Get softmax probabilities\n",
    "probabilities = torch.softmax(outputs.logits, dim=1).squeeze().tolist()\n",
    "\n",
    "# Print the prediction and probabilities\n",
    "if predicted_class == 1:\n",
    "    print(\"Sentence is grammatically correct.\")\n",
    "else:\n",
    "    print(\"Sentence is grammatically wrong.\")\n",
    "\n",
    "print(\"Probabilities:\", probabilities)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original sentence: Tumakbo ang mga bata mula sa pagsabog\n",
      "Grammar correction candidates:\n",
      "Patay ang mga bata mula sa pagsabog Probability: 0.9976784586906433\n",
      "Alisin ang mga bata mula sa pagsabog Probability: 0.9921312928199768\n",
      "Turuan ang mga bata mula sa pagsabog Probability: 0.9664002060890198\n",
      "Hanapin ang mga bata mula sa pagsabog Probability: 0.9470312595367432\n",
      "Sinusuportahan ang mga bata mula sa pagsabog Probability: 0.9317439198493958\n",
      "['VBTS', 'DTC', 'DTCP', 'NNC', 'RBL', 'CCT', 'NNC']\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n",
    "\n",
    "# Load pre-trained RoBERTa tokenizer and model for MLM\n",
    "tokenizer_mlm = AutoTokenizer.from_pretrained(\"fine_tuned_model\")\n",
    "model_mlm = AutoModelForMaskedLM.from_pretrained(\"fine_tuned_model\")\n",
    "\n",
    "# Load pre-trained BERT tokenizer and model for sequence classification\n",
    "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n",
    "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n",
    "\n",
    "def word_features(sent, i):\n",
    "    word = sent[i][0]\n",
    "    pos = sent[i][1]\n",
    "    \n",
    "    # first word\n",
    "    if i == 0:\n",
    "        prevword = '<START>'\n",
    "        prevpos = '<START>'\n",
    "    else:\n",
    "        prevword = sent[i-1][0]\n",
    "        prevpos = sent[i-1][1]\n",
    "        \n",
    "    # first or second word\n",
    "    if i == 0 or i == 1:\n",
    "        prev2word = '<START>'\n",
    "        prev2pos = '<START>'\n",
    "    else:\n",
    "        prev2word = sent[i-2][0]\n",
    "        prev2pos = sent[i-2][1]\n",
    "    \n",
    "    # last word\n",
    "    if i == len(sent) - 1:\n",
    "        nextword = '<END>'\n",
    "        nextpos = '<END>'\n",
    "    else:\n",
    "        nextword = sent[i+1][0]\n",
    "        nextpos = sent[i+1][1]\n",
    "    \n",
    "    # suffixes and prefixes\n",
    "    pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]\n",
    "    suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]\n",
    "    \n",
    "    return {'word':word,            \n",
    "            'prevword': prevword,\n",
    "            'prevpos': prevpos,  \n",
    "            'nextword': nextword, \n",
    "            'nextpos': nextpos,          \n",
    "            'suff_1': suff_1,  \n",
    "            'suff_2': suff_2,  \n",
    "            'suff_3': suff_3,  \n",
    "            'suff_4': suff_4, \n",
    "            'pref_1': pref_1,  \n",
    "            'pref_2': pref_2,  \n",
    "            'pref_3': pref_3, \n",
    "            'pref_4': pref_4,\n",
    "            'prev2word': prev2word,\n",
    "            'prev2pos': prev2pos           \n",
    "           }\n",
    "\n",
    "new_sentence = \"Tumakbo ang mga bata mula sa pagsabog\"\n",
    "\n",
    "tokens = nltk.word_tokenize(new_sentence)\n",
    "\n",
    "tagged_tokens = []\n",
    "\n",
    "for token in tokens:\n",
    "    pos_tag = nltk.pos_tag([token])[0][1]\n",
    "    tagged_tokens.append((token, pos_tag))\n",
    "\n",
    "# Extract features for each token in the new sentence\n",
    "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
    "\n",
    "# Use the trained CRF model to predict labels for the tokens\n",
    "predicted_labels = crf_model.predict([features])[0]\n",
    "\n",
    "# Combine tokens with predicted labels\n",
    "predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
    "\n",
    "print(\"Original sentence:\", new_sentence)\n",
    "\n",
    "grammar_correction_candidates = []\n",
    "\n",
    "# Iterate over each word and mask it, then predict the masked word\n",
    "for i, (token, predicted_label) in enumerate(zip(tokens, predicted_labels)):\n",
    "    # Check if the predicted label is a verb\n",
    "    if predicted_label.startswith('VB'):\n",
    "        # Mask the word\n",
    "        masked_words = tokens.copy()\n",
    "        masked_words[i] = tokenizer_mlm.mask_token\n",
    "        masked_sentence = \" \".join(masked_words)\n",
    "\n",
    "        # Tokenize the masked sentence\n",
    "        tokens_mlm = tokenizer_mlm(masked_sentence, return_tensors=\"pt\")\n",
    "\n",
    "        # Get the position of the masked token\n",
    "        masked_index = torch.where(tokens_mlm[\"input_ids\"] == tokenizer_mlm.mask_token_id)[1][0]\n",
    "\n",
    "        # Get the logits for the masked token\n",
    "        with torch.no_grad():\n",
    "            outputs = model_mlm(**tokens_mlm)\n",
    "            predictions_mlm = outputs.logits\n",
    "\n",
    "        # Get the top predicted words for the masked token\n",
    "        top_predictions_mlm = torch.topk(predictions_mlm[0, masked_index], k=5)\n",
    "        candidates_mlm = [tokenizer_mlm.decode(idx.item()) for idx in top_predictions_mlm.indices]\n",
    "\n",
    "        # Reconstruct the sentence with each candidate\n",
    "        for candidate_mlm in candidates_mlm:\n",
    "            replaced_words = masked_words.copy()\n",
    "            replaced_words[i] = candidate_mlm\n",
    "            corrected_sentence = \" \".join(replaced_words).split()  # Split and join to remove extra spaces\n",
    "            corrected_sentence = \" \".join(corrected_sentence)  # Join words without extra spaces\n",
    "            \n",
    "            # Tokenize the corrected sentence for sequence classification\n",
    "            inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n",
    "\n",
    "            # Forward pass through the model for sequence classification\n",
    "            with torch.no_grad():\n",
    "                outputs_cls = model_cls(**inputs_cls)\n",
    "\n",
    "            # Get softmax probabilities\n",
    "            probabilities = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n",
    "            \n",
    "            # Get the most probable class\n",
    "            predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n",
    "\n",
    "            # Append the corrected sentence along with its probability and class\n",
    "            grammar_correction_candidates.append((corrected_sentence, probabilities[predicted_class]))\n",
    "\n",
    "# Sort the grammar correction candidates by their probabilities in descending order\n",
    "grammar_correction_candidates.sort(key=lambda x: x[1], reverse=True)\n",
    "\n",
    "# Print the top 5 most probable grammar correction candidates\n",
    "print(\"Grammar correction candidates:\")\n",
    "for candidate, probability in grammar_correction_candidates:\n",
    "    print(candidate, \"Probability:\", probability)\n",
    "print(predicted_labels)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nagising\n",
      "67\n"
     ]
    }
   ],
   "source": [
    "from fuzzywuzzy import fuzz\n",
    "\n",
    "original_word = \"Gigisingin\"\n",
    "suggestions = [\"Tatakbo\", \"Nagising\", \"Hihiga\", \"Kakain\"]\n",
    "\n",
    "threshold = 60\n",
    "\n",
    "for suggestion in suggestions:\n",
    "    similarity_score = fuzz.ratio(original_word, suggestion)\n",
    "    if similarity_score >= threshold:\n",
    "        print(suggestion)\n",
    "        print(fuzz.ratio(original_word, suggestion))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}