In [1]:
import joblib
import nltk

# Location of the pickled CRF POS-tagging model. Kept in one named constant so
# this machine-specific absolute path can be changed in a single place.
CRF_MODEL_PATH = r'D:\Thesis\POS Tag Automation\crf_model.pkl'

# Load the saved CRF model.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
crf_model = joblib.load(CRF_MODEL_PATH)

def word_features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of ``(word, pos)`` pairs making up one sentence.
        i: Index of the token to describe.

    Returns:
        dict: The token itself (``word``), the word and POS of the previous,
        second-previous and next tokens (empty strings where the sentence
        boundary is crossed), and the 1-4 character prefixes and suffixes
        of the word.
    """
    word = sent[i][0]
    last_index = len(sent) - 1

    # Context tokens fall back to empty strings at the sentence boundaries.
    prevword, prevpos = ('', '') if i == 0 else (sent[i - 1][0], sent[i - 1][1])
    prev2word, prev2pos = ('', '') if i in (0, 1) else (sent[i - 2][0], sent[i - 2][1])
    nextword, nextpos = ('', '') if i == last_index else (sent[i + 1][0], sent[i + 1][1])

    # Slicing never raises: words shorter than the affix length simply yield
    # the whole word.
    return {
        'word': word,
        'prevword': prevword,
        'prevpos': prevpos,
        'nextword': nextword,
        'nextpos': nextpos,
        'suff_1': word[-1:],
        'suff_2': word[-2:],
        'suff_3': word[-3:],
        'suff_4': word[-4:],
        'pref_1': word[:1],
        'pref_2': word[:2],
        'pref_3': word[:3],
        'pref_4': word[:4],
        'prev2word': prev2word,
        'prev2pos': prev2pos,
    }

# Sentence to run through the CRF tagger
new_sentence = "Pupunta ako kanina sa mall upang bumili"

# Break the sentence into word tokens
tokens = nltk.word_tokenize(new_sentence)

# Tag every token on its own (a one-element list per nltk.pos_tag call) and
# pair each token with the tag it received
tagged_tokens = [(tok, nltk.pos_tag([tok])[0][1]) for tok in tokens]

# One feature dictionary per token position
features = [
    word_features(tagged_tokens, position)
    for position, _ in enumerate(tagged_tokens)
]

# predict() is called with a one-sentence batch; keep its only result
batch_predictions = crf_model.predict([features])
predicted_labels = batch_predictions[0]

# Pair each token with the label predicted for it
predicted_tokens_with_labels = [
    (tok, label) for tok, label in zip(tokens, predicted_labels)
]

print(predicted_tokens_with_labels)


[('Pupunta', 'VBAF'), ('ako', 'PRS'), ('kanina', 'RBW'), ('sa', 'CCT'), ('mall', 'NNP'), ('upang', 'CCB'), ('bumili', 'VBAF')]


In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and sequence-classification model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("zklmorales/bert_finetuned")
model = AutoModelForSequenceClassification.from_pretrained("zklmorales/bert_finetuned")

new_sentence = "Pupunta ako kahapon sa siyudad upang bumili ang mga gamit ko"

# Encode the sentence as PyTorch tensors
inputs = tokenizer(new_sentence, return_tensors="pt")

# Inference only: no gradients are needed
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits

# Softmax over the class dimension, flattened to a plain Python list
probabilities = torch.softmax(logits, dim=1).squeeze().tolist()

# Index of the highest-scoring class
predicted_class = torch.argmax(logits, dim=1).item()

# Class 1 is reported as "correct"; any other class as "wrong"
message = (
    "Sentence is grammatically correct."
    if predicted_class == 1
    else "Sentence is grammatically wrong."
)
print(message)

print("Probabilities:", probabilities)


Sentence is grammatically correct.
Probabilities: [0.00594444340094924, 0.9940555095672607]


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM

# Masked-language model and its tokenizer, both read from the "fine_tuned_model"
# checkpoint (described by the original author as RoBERTa -- not verifiable
# from this file).
model_mlm = AutoModelForMaskedLM.from_pretrained("fine_tuned_model")
tokenizer_mlm = AutoTokenizer.from_pretrained("fine_tuned_model")

# Sequence-classification model and its tokenizer, both read from the
# "zklmorales/bert_finetuned" checkpoint (described by the original author as BERT).
model_cls = AutoModelForSequenceClassification.from_pretrained("zklmorales/bert_finetuned")
tokenizer_cls = AutoTokenizer.from_pretrained("zklmorales/bert_finetuned")

def word_features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    NOTE: this re-defines the ``word_features`` function from the first cell
    and shadows it; both must keep producing the same feature keys. Consider
    moving the function to a shared module and importing it instead.

    Args:
        sent: Sequence of ``(word, pos)`` pairs making up one sentence.
        i: Index of the token to describe.

    Returns:
        dict: The token itself (``word``), the word and POS of the previous,
        second-previous and next tokens (empty strings where the sentence
        boundary is crossed), and the 1-4 character prefixes and suffixes
        of the word.
    """
    word = sent[i][0]
    last_index = len(sent) - 1

    # Context tokens fall back to empty strings at the sentence boundaries.
    prevword, prevpos = ('', '') if i == 0 else (sent[i - 1][0], sent[i - 1][1])
    prev2word, prev2pos = ('', '') if i in (0, 1) else (sent[i - 2][0], sent[i - 2][1])
    nextword, nextpos = ('', '') if i == last_index else (sent[i + 1][0], sent[i + 1][1])

    # Slicing never raises: words shorter than the affix length simply yield
    # the whole word.
    return {
        'word': word,
        'prevword': prevword,
        'prevpos': prevpos,
        'nextword': nextword,
        'nextpos': nextpos,
        'suff_1': word[-1:],
        'suff_2': word[-2:],
        'suff_3': word[-3:],
        'suff_4': word[-4:],
        'pref_1': word[:1],
        'pref_2': word[:2],
        'pref_3': word[:3],
        'pref_4': word[:4],
        'prev2word': prev2word,
        'prev2pos': prev2pos,
    }

# Index of the "grammatically correct" class in the classifier output (the
# classifier cell earlier in this notebook reports class 1 as correct).
CORRECT_CLASS = 1

# Number of masked-LM suggestions to try for each verb.
TOP_K = 5

new_sentence = "Tumakbo ang mga bata mula sa pagsabog"

tokens = nltk.word_tokenize(new_sentence)

# Tag each token on its own (one-element list per nltk.pos_tag call) so the
# CRF feature extractor receives (word, pos) pairs.
tagged_tokens = [(token, nltk.pos_tag([token])[0][1]) for token in tokens]

# Extract features for each token in the new sentence
features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]

# Use the trained CRF model to predict labels for the tokens
predicted_labels = crf_model.predict([features])[0]

# Combine tokens with predicted labels
predicted_tokens_with_labels = list(zip(tokens, predicted_labels))

print("Original sentence:", new_sentence)

grammar_correction_candidates = []

# Mask each verb in turn and let the masked-LM propose replacements
for i, (token, predicted_label) in enumerate(zip(tokens, predicted_labels)):
    # Only tokens whose CRF label starts with 'VB' are replaced
    if not predicted_label.startswith('VB'):
        continue

    # Mask the word
    masked_words = tokens.copy()
    masked_words[i] = tokenizer_mlm.mask_token
    masked_sentence = " ".join(masked_words)

    # Tokenize the masked sentence
    tokens_mlm = tokenizer_mlm(masked_sentence, return_tensors="pt")

    # Column index of the first mask token in the (single) input row
    masked_index = torch.where(tokens_mlm["input_ids"] == tokenizer_mlm.mask_token_id)[1][0]

    # Get the logits for the masked token
    with torch.no_grad():
        outputs = model_mlm(**tokens_mlm)
    predictions_mlm = outputs.logits

    # Decode the TOP_K highest-scoring vocabulary entries for the masked slot
    top_predictions_mlm = torch.topk(predictions_mlm[0, masked_index], k=TOP_K)
    candidates_mlm = [tokenizer_mlm.decode(idx.item()) for idx in top_predictions_mlm.indices]

    # Rebuild the sentence with each candidate and score it
    for candidate_mlm in candidates_mlm:
        replaced_words = masked_words.copy()
        replaced_words[i] = candidate_mlm
        # split + join collapses any extra whitespace the decoded candidate carries
        corrected_sentence = " ".join(" ".join(replaced_words).split())

        # Tokenize the corrected sentence for sequence classification
        inputs_cls = tokenizer_cls(corrected_sentence, return_tensors="pt")

        # Forward pass through the model for sequence classification
        with torch.no_grad():
            outputs_cls = model_cls(**inputs_cls)

        # Get softmax probabilities
        probabilities = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()

        # Rank by the probability that the sentence is grammatically CORRECT.
        # Using the probability of whichever class won would push sentences
        # that are confidently classified as wrong to the top of the list.
        grammar_correction_candidates.append(
            (corrected_sentence, probabilities[CORRECT_CLASS])
        )

# Sort the grammar correction candidates by their probabilities in descending order
grammar_correction_candidates.sort(key=lambda x: x[1], reverse=True)

# Print every candidate (TOP_K per verb), most probably grammatical first
print("Grammar correction candidates:")
for candidate, probability in grammar_correction_candidates:
    print(candidate, "Probability:", probability)
print(predicted_labels)


Original sentence: Tumakbo ang mga bata mula sa pagsabog
Grammar correction candidates:
Patay ang mga bata mula sa pagsabog Probability: 0.9976784586906433
Alisin ang mga bata mula sa pagsabog Probability: 0.9921312928199768
Turuan ang mga bata mula sa pagsabog Probability: 0.9664002060890198
Hanapin ang mga bata mula sa pagsabog Probability: 0.9470312595367432
Sinusuportahan ang mga bata mula sa pagsabog Probability: 0.9317439198493958
['VBTS', 'DTC', 'DTCP', 'NNC', 'RBL', 'CCT', 'NNC']


In [17]:
from fuzzywuzzy import fuzz

original_word = "Gigisingin"
suggestions = ["Tatakbo", "Nagising", "Hihiga", "Kakain"]

# Minimum fuzz.ratio similarity score a suggestion needs in order to be printed
threshold = 60

for suggestion in suggestions:
    # Score each suggestion once and reuse the value for both the filter and
    # the printout (the score was previously recomputed just to print it).
    similarity_score = fuzz.ratio(original_word, suggestion)
    if similarity_score >= threshold:
        print(suggestion)
        print(similarity_score)


Nagising
67
