{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "c:\\Users\\Admin\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\fuzzywuzzy\\fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n", " warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Original sentence: Magugulat ako kanina dahil sa pagsabog\n", "Sentence: Nagulat ako kanina dahil sa pagsabog\n", "Correctness Probability: 0.9976696372032166\n", "Cosine Similarity: 0.20241191983222961\n", "Levenshtein Score: 75\n" ] } ], "source": [ "import torch\n", "import nltk\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM\n", "import joblib\n", "from fuzzywuzzy import fuzz\n", "\n", "\n", "tokenizer_mlm = AutoTokenizer.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n", "model_mlm = AutoModelForMaskedLM.from_pretrained(\"zklmorales/bert_mlm_fine-tuned\")\n", "\n", "# Load pre-trained BERT tokenizer and model for sequence classification\n", "tokenizer_cls = AutoTokenizer.from_pretrained(\"zklmorales/bert_finetuned\")\n", "model_cls = AutoModelForSequenceClassification.from_pretrained(\"zklmorales/bert_finetuned\")\n", "\n", "# Load CRF Model for POS Tagging\n", "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n", "\n", "# Define function to extract word features\n", "def word_features(sent, i):\n", " word = sent[i][0]\n", " pos = sent[i][1]\n", " \n", " # first word\n", " if i == 0:\n", " prevword = ''\n", " prevpos = ''\n", " else:\n", " prevword = sent[i-1][0]\n", " prevpos = sent[i-1][1]\n", " \n", " # first or second word\n", " if i == 0 or i == 1:\n", " prev2word = ''\n", " prev2pos = ''\n", " else:\n", " prev2word = sent[i-2][0]\n", " prev2pos = sent[i-2][1]\n", " \n", " # last word\n", " if i == len(sent) - 1:\n", " nextword = ''\n", " nextpos = ''\n", " else:\n", " nextword = sent[i+1][0]\n", " nextpos = sent[i+1][1]\n", " \n", " # suffixes and prefixes\n", " pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]\n", " suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]\n", " \n", " return {'word':word, \n", " 'prevword': prevword,\n", " 'prevpos': prevpos, \n", " 'nextword': nextword, \n", " 'nextpos': nextpos, \n", " 'suff_1': suff_1, \n", " 'suff_2': suff_2, \n", " 'suff_3': suff_3, \n", " 'suff_4': suff_4, \n", " 'pref_1': pref_1, \n", " 'pref_2': pref_2, \n", " 'pref_3': pref_3, \n", " 'pref_4': pref_4,\n", " 'prev2word': prev2word,\n", " 'prev2pos': prev2pos \n", " }\n", "\n", "# Define the new sentence\n", "new_sentence = input(\"Sentence: \")\n", "\n", "# Tokenize the new sentence and get POS tags\n", "tokens = nltk.word_tokenize(new_sentence)\n", "tagged_tokens = [nltk.pos_tag([token])[0] for token in tokens]\n", "\n", "# Extract features for each token in the new sentence\n", "features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n", "\n", "# Use the BERT classifier to check if the sentence is grammatically correct\n", "inputs_cls = tokenizer_cls(new_sentence, return_tensors=\"pt\")\n", "with torch.no_grad():\n", " outputs_cls = model_cls(**inputs_cls)\n", "probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()\n", "predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()\n", "\n", "# Check if the sentence is grammatically correct\n", "if predicted_class == 1:\n", " print(\"The sentence is grammatically correct.\")\n", "else:\n", " # Use the CRF model to predict POS tags for the tokens\n", " predicted_labels = crf_model.predict([features])[0]\n", "\n", " # Combine tokens with predicted labels\n", " predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n", "\n", " print(\"Original sentence:\", new_sentence)\n", "\n", " grammar_correction_candidates = []\n", "\n", " # Iterate over each word and mask it, then predict the masked word\n", " for i, (token, predicted_label) in enumerate(zip(tokens, predicted_labels)):\n", " # Check if the predicted label is a verb\n", " if predicted_label.startswith('VB'):\n", " # Mask the word\n", " masked_words = tokens.copy()\n", " masked_words[i] = tokenizer_mlm.mask_token\n", " masked_sentence = \" \".join(masked_words)\n", "\n", " # Tokenize the masked sentence\n", " tokens_mlm = tokenizer_mlm(masked_sentence, return_tensors=\"pt\")\n", "\n", " # Get the position of the masked token\n", " masked_index = torch.where(tokens_mlm[\"input_ids\"] == tokenizer_mlm.mask_token_id)[1][0]\n", "\n", " # Get the logits for the masked token\n", " with torch.no_grad():\n", " outputs = model_mlm(**tokens_mlm)\n", " predictions_mlm = outputs.logits\n", "\n", " # Get the top predicted words for the masked token\n", " top_predictions_mlm = torch.topk(predictions_mlm[0, masked_index], k=5)\n", " candidates_mlm = [tokenizer_mlm.decode(idx.item()) for idx in top_predictions_mlm.indices]\n", "\n", " # Reconstruct the sentence with each candidate\n", " for candidate_mlm in candidates_mlm:\n", " # Get embeddings for the masked word and the candidate word\n", " original_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(token, add_special_tokens=False))).mean(dim=0)\n", " candidate_embedding = model_mlm.get_input_embeddings()(torch.tensor(tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False))).mean(dim=0)\n", " \n", " # Compute cosine similarity between original masked word and predicted word\n", " similarity = torch.nn.functional.cosine_similarity(original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)).item()\n", " fuzzy_match_score = fuzz.ratio(token, candidate_mlm)\n", "\n", " replaced_words = masked_words.copy()\n", " replaced_words[i] = candidate_mlm\n", " corrected_sentence = \" \".join(replaced_words).split() # Split and join to remove extra spaces\n", " corrected_sentence = \" \".join(corrected_sentence) # Join words without extra spaces\n", " \n", " # Tokenize the corrected sentence for sequence classification\n", " inputs_cls = tokenizer_cls(corrected_sentence, return_tensors=\"pt\")\n", "\n", " # Forward pass through the model for sequence classification 1 or 0 \n", " with torch.no_grad():\n", " outputs_cls = model_cls(**inputs_cls)\n", "\n", " # Get softmax probabilities for class indicating grammatically correct sentences\n", " probability = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()[1]\n", "\n", " # Append the corrected sentence along with its probability and cosine similarity\n", " grammar_correction_candidates.append((corrected_sentence, probability, similarity, fuzzy_match_score))\n", "\n", "\n", " # Sort the grammar correction candidates by their probabilities and cosine similarities in descending order\n", " grammar_correction_candidates.sort(key=lambda x: (x[3], x[1], x[2]), reverse=True)\n", "\n", "if grammar_correction_candidates:\n", " candidate, probability, cosine_similarity, fuzzy_match_score = grammar_correction_candidates[0]\n", " print(\"Sentence:\", candidate)\n", " print(\"Correctness Probability:\", probability)\n", " print(\"Cosine Similarity:\", cosine_similarity)\n", " print(\"Levenshtein Score:\", fuzzy_match_score)\n", "\n", "\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" } }, "nbformat": 4, "nbformat_minor": 2 }