{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tagalog POS tagging and grammar-correction experiments\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. tags a sentence with a saved CRF part-of-speech model,\n",
    "2. classifies a sentence as grammatically correct or wrong with a fine-tuned sequence classifier,\n",
    "3. masks each verb, lets a masked language model propose replacements, and ranks the rewritten sentences by the classifier's probability of being grammatically correct,\n",
    "4. filters word suggestions by fuzzy string similarity.\n",
    "\n",
    "Outputs were cleared when the code was restructured; use **Restart Kernel**, then **Run All** to regenerate them. Results recorded from earlier runs are quoted in the notes of each section.\n",
    "\n",
    "`nltk.word_tokenize` and `nltk.pos_tag` need their NLTK data packages to be installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import joblib\n",
    "import nltk\n",
    "import torch\n",
    "from fuzzywuzzy import fuzz\n",
    "from transformers import (\n",
    "    AutoModelForMaskedLM,\n",
    "    AutoModelForSequenceClassification,\n",
    "    AutoTokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the pickled CRF POS tagger. Set the CRF_MODEL_PATH environment\n",
    "# variable to override; the default is the path the notebook was written with.\n",
    "CRF_MODEL_PATH = os.environ.get('CRF_MODEL_PATH', r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
    "\n",
    "# Masked language model used to propose replacement words\n",
    "# (presumably a local directory next to the notebook -- confirm).\n",
    "MLM_MODEL_NAME = 'fine_tuned_model'\n",
    "\n",
    "# Sequence classifier used to judge grammaticality.\n",
    "CLS_MODEL_NAME = 'zklmorales/bert_finetuned'\n",
    "\n",
    "# Class index that this notebook reports as 'grammatically correct';\n",
    "# any other predicted class is reported as 'grammatically wrong'.\n",
    "CORRECT_CLASS = 1\n",
    "\n",
    "# Number of masked-LM replacements requested for each masked verb.\n",
    "TOP_K = 5\n",
    "\n",
    "# Number of ranked correction candidates to print.\n",
    "MAX_CANDIDATES_SHOWN = 5\n",
    "\n",
    "# Minimum fuzz.ratio score for a suggestion to be kept.\n",
    "SIMILARITY_THRESHOLD = 60"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.\n",
    "# Only point CRF_MODEL_PATH at a model file you trust.\n",
    "crf_model = joblib.load(CRF_MODEL_PATH)\n",
    "\n",
    "tokenizer_mlm = AutoTokenizer.from_pretrained(MLM_MODEL_NAME)\n",
    "model_mlm = AutoModelForMaskedLM.from_pretrained(MLM_MODEL_NAME)\n",
    "\n",
    "# The classifier is loaded once and shared by every section below.\n",
    "tokenizer_cls = AutoTokenizer.from_pretrained(CLS_MODEL_NAME)\n",
    "model_cls = AutoModelForSequenceClassification.from_pretrained(CLS_MODEL_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_features(sent, i):\n",
    "    \"\"\"Return the CRF feature dict for the token at position ``i``.\n",
    "\n",
    "    Args:\n",
    "        sent: Sequence of ``(word, pos)`` pairs for one sentence.\n",
    "        i: Index of the token to describe.\n",
    "\n",
    "    Returns:\n",
    "        dict of string features: the word itself, its 1-4 character prefixes\n",
    "        and suffixes, and the word/POS of the previous, second-previous and\n",
    "        next tokens ('' where that neighbour does not exist).\n",
    "    \"\"\"\n",
    "    def neighbour(j):\n",
    "        # Neighbours that fall outside the sentence are encoded as ''.\n",
    "        if 0 <= j < len(sent):\n",
    "            return sent[j][0], sent[j][1]\n",
    "        return '', ''\n",
    "\n",
    "    word = sent[i][0]\n",
    "    prevword, prevpos = neighbour(i - 1)\n",
    "    prev2word, prev2pos = neighbour(i - 2)\n",
    "    nextword, nextpos = neighbour(i + 1)\n",
    "\n",
    "    return {\n",
    "        'word': word,\n",
    "        'prevword': prevword,\n",
    "        'prevpos': prevpos,\n",
    "        'nextword': nextword,\n",
    "        'nextpos': nextpos,\n",
    "        'suff_1': word[-1:],\n",
    "        'suff_2': word[-2:],\n",
    "        'suff_3': word[-3:],\n",
    "        'suff_4': word[-4:],\n",
    "        'pref_1': word[:1],\n",
    "        'pref_2': word[:2],\n",
    "        'pref_3': word[:3],\n",
    "        'pref_4': word[:4],\n",
    "        'prev2word': prev2word,\n",
    "        'prev2pos': prev2pos,\n",
    "    }\n",
    "\n",
    "\n",
    "def pos_tag_sentence(sentence):\n",
    "    \"\"\"Tokenize ``sentence`` and predict one CRF label per token.\n",
    "\n",
    "    Returns:\n",
    "        ``(tokens, predicted_labels)`` -- the NLTK tokens and the labels\n",
    "        predicted for them by ``crf_model``.\n",
    "    \"\"\"\n",
    "    tokens = nltk.word_tokenize(sentence)\n",
    "\n",
    "    # Each token is tagged on its own, exactly as the notebook has always done.\n",
    "    # NOTE(review): nltk.pos_tag defaults to an English tagger; presumably the\n",
    "    # CRF was trained on features built the same way -- confirm before changing.\n",
    "    tagged_tokens = [(token, nltk.pos_tag([token])[0][1]) for token in tokens]\n",
    "\n",
    "    features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
    "    predicted_labels = crf_model.predict([features])[0]\n",
    "    return tokens, predicted_labels\n",
    "\n",
    "\n",
    "def classify_grammar(sentence):\n",
    "    \"\"\"Run the sequence classifier on ``sentence``.\n",
    "\n",
    "    Returns:\n",
    "        ``(predicted_class, probabilities)`` -- the arg-max class index and the\n",
    "        list of softmax probabilities, one per class.\n",
    "    \"\"\"\n",
    "    inputs = tokenizer_cls(sentence, return_tensors='pt')\n",
    "    with torch.no_grad():\n",
    "        logits = model_cls(**inputs).logits\n",
    "    predicted_class = torch.argmax(logits, dim=1).item()\n",
    "    # One sentence is classified at a time, so row 0 is the only row.\n",
    "    probabilities = torch.softmax(logits, dim=1)[0].tolist()\n",
    "    return predicted_class, probabilities\n",
    "\n",
    "\n",
    "def mask_fill_candidates(tokens, index, top_k=TOP_K):\n",
    "    \"\"\"Return the masked LM's ``top_k`` replacements for ``tokens[index]``.\n",
    "\n",
    "    The token is replaced by the tokenizer's mask token, the sentence is\n",
    "    re-joined with spaces, and the highest-scoring vocabulary entries at the\n",
    "    mask position are decoded to strings.\n",
    "    \"\"\"\n",
    "    masked_words = list(tokens)\n",
    "    masked_words[index] = tokenizer_mlm.mask_token\n",
    "    encoded = tokenizer_mlm(' '.join(masked_words), return_tensors='pt')\n",
    "\n",
    "    # Column index of the first mask token in the (single) encoded sequence.\n",
    "    masked_position = torch.where(encoded['input_ids'] == tokenizer_mlm.mask_token_id)[1][0]\n",
    "\n",
    "    with torch.no_grad():\n",
    "        logits = model_mlm(**encoded).logits\n",
    "\n",
    "    top_predictions = torch.topk(logits[0, masked_position], k=top_k)\n",
    "    return [tokenizer_mlm.decode(idx.item()) for idx in top_predictions.indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. POS-tag a sentence with the CRF model\n",
    "\n",
    "An earlier run printed:\n",
    "\n",
    "`[('Pupunta', 'VBAF'), ('ako', 'PRS'), ('kanina', 'RBW'), ('sa', 'CCT'), ('mall', 'NNP'), ('upang', 'CCB'), ('bumili', 'VBAF')]`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_sentence = 'Pupunta ako kanina sa mall upang bumili'\n",
    "\n",
    "tokens, predicted_labels = pos_tag_sentence(new_sentence)\n",
    "\n",
    "# Combine tokens with predicted labels\n",
    "predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
    "\n",
    "print(predicted_tokens_with_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Classify a sentence as grammatically correct or wrong\n",
    "\n",
    "An earlier run reported this sentence as grammatically correct, with probabilities `[0.00594444340094924, 0.9940555095672607]`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_sentence = 'Pupunta ako kahapon sa siyudad upang bumili ang mga gamit ko'\n",
    "\n",
    "predicted_class, probabilities = classify_grammar(new_sentence)\n",
    "\n",
    "# Print the prediction and probabilities\n",
    "if predicted_class == CORRECT_CLASS:\n",
    "    print('Sentence is grammatically correct.')\n",
    "else:\n",
    "    print('Sentence is grammatically wrong.')\n",
    "\n",
    "print('Probabilities:', probabilities)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Generate and rank verb replacements\n",
    "\n",
    "Every token whose CRF label starts with `VB` is masked in turn, the masked language model proposes replacements, and each rewritten sentence is scored with the classifier's probability for the *grammatically correct* class. Candidates are ranked by that probability.\n",
    "\n",
    "Earlier versions ranked by the probability of whichever class was predicted, so a sentence confidently classified as wrong could rank first. Under that old ranking a previous run listed, in order: *Patay*, *Alisin*, *Turuan*, *Hanapin*, *Sinusuportahan* (each followed by `ang mga bata mula sa pagsabog`), with CRF labels `['VBTS', 'DTC', 'DTCP', 'NNC', 'RBL', 'CCT', 'NNC']`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_sentence = 'Tumakbo ang mga bata mula sa pagsabog'\n",
    "\n",
    "tokens, predicted_labels = pos_tag_sentence(new_sentence)\n",
    "\n",
    "print('Original sentence:', new_sentence)\n",
    "\n",
    "grammar_correction_candidates = []\n",
    "\n",
    "for i, predicted_label in enumerate(predicted_labels):\n",
    "    # Only verbs are candidates for replacement.\n",
    "    if not predicted_label.startswith('VB'):\n",
    "        continue\n",
    "\n",
    "    for candidate_mlm in mask_fill_candidates(tokens, i):\n",
    "        replaced_words = list(tokens)\n",
    "        replaced_words[i] = candidate_mlm\n",
    "        # Split and re-join to collapse any extra whitespace from decoding.\n",
    "        corrected_sentence = ' '.join(' '.join(replaced_words).split())\n",
    "\n",
    "        _, probabilities = classify_grammar(corrected_sentence)\n",
    "\n",
    "        # Score with P(grammatically correct), not with the probability of the\n",
    "        # predicted class: the latter is also high for confidently-wrong sentences.\n",
    "        grammar_correction_candidates.append((corrected_sentence, probabilities[CORRECT_CLASS]))\n",
    "\n",
    "# Sort the grammar correction candidates by their probabilities in descending order\n",
    "grammar_correction_candidates.sort(key=lambda pair: pair[1], reverse=True)\n",
    "\n",
    "# Print the most probable grammar correction candidates\n",
    "print('Grammar correction candidates:')\n",
    "for candidate, probability in grammar_correction_candidates[:MAX_CANDIDATES_SHOWN]:\n",
    "    print(candidate, 'Probability:', probability)\n",
    "print(predicted_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Filter suggestions by fuzzy similarity\n",
    "\n",
    "An earlier run kept only `Nagising`, with a score of 67."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "original_word = 'Gigisingin'\n",
    "suggestions = ['Tatakbo', 'Nagising', 'Hihiga', 'Kakain']\n",
    "\n",
    "for suggestion in suggestions:\n",
    "    similarity_score = fuzz.ratio(original_word, suggestion)\n",
    "    if similarity_score >= SIMILARITY_THRESHOLD:\n",
    "        print(suggestion)\n",
    "        print(similarity_score)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}