In [14]:
import torch
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
import joblib

# Checkpoint identifiers and the CRF pickle location, named once so the
# loading calls below read uniformly.
MLM_CHECKPOINT = "fine_tuned_model"
CLS_CHECKPOINT = "zklmorales/bert_finetuned"
CRF_MODEL_PATH = r'D:\Thesis\POS Tag Automation\crf_model.pkl'

# Masked-language-model pair (RoBERTa): proposes replacement words for a masked token.
tokenizer_mlm = AutoTokenizer.from_pretrained(MLM_CHECKPOINT)
model_mlm = AutoModelForMaskedLM.from_pretrained(MLM_CHECKPOINT)

# Sequence-classification pair (BERT): scores a whole sentence.
tokenizer_cls = AutoTokenizer.from_pretrained(CLS_CHECKPOINT)
model_cls = AutoModelForSequenceClassification.from_pretrained(CLS_CHECKPOINT)

# Trained CRF tagger, deserialized with joblib.
# NOTE: joblib.load unpickles the file, so only load a pickle you trust.
crf_model = joblib.load(CRF_MODEL_PATH)

def word_features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of ``(word, pos)`` pairs describing one sentence.
        i: Index of the token to describe.

    Returns:
        A dict holding the token itself, its first and last 1-4 characters,
        and the word/POS of the two preceding tokens and of the following
        token. Slots before the start of the sentence are filled with
        ``'<START>'`` and the slot after the last token with ``'<END>'``.
    """
    # Only the word of the current token is used; its own POS tag is not part
    # of the feature set, so it is not read here.
    word = sent[i][0]

    # Previous token (boundary marker for the first word).
    if i == 0:
        prevword = prevpos = '<START>'
    else:
        prevword, prevpos = sent[i - 1][0], sent[i - 1][1]

    # Token two positions back (boundary marker for the first two words).
    if i in (0, 1):
        prev2word = prev2pos = '<START>'
    else:
        prev2word, prev2pos = sent[i - 2][0], sent[i - 2][1]

    # Following token (boundary marker for the last word).
    if i == len(sent) - 1:
        nextword = nextpos = '<END>'
    else:
        nextword, nextpos = sent[i + 1][0], sent[i + 1][1]

    # Slicing never raises on short words: a word shorter than the affix
    # length simply yields the whole word.
    return {
        'word': word,
        'prevword': prevword,
        'prevpos': prevpos,
        'nextword': nextword,
        'nextpos': nextpos,
        'suff_1': word[-1:],
        'suff_2': word[-2:],
        'suff_3': word[-3:],
        'suff_4': word[-4:],
        'pref_1': word[:1],
        'pref_2': word[:2],
        'pref_3': word[:3],
        'pref_4': word[:4],
        'prev2word': prev2word,
        'prev2pos': prev2pos,
    }

new_sentence = "Siya ay magigising kanina."

# Index of the classifier output that this script treats as "grammatical".
# NOTE(review): taken from the `predicted_class == 1` check this script has
# always used; confirm against the classifier's label mapping
# (e.g. model_cls.config.id2label).
GRAMMATICAL_CLASS = 1

# How many masked-LM proposals to consider per verb, and how many ranked
# candidates to print at the end.
TOP_K_PREDICTIONS = 5
TOP_N_REPORTED = 5

tokens = nltk.word_tokenize(new_sentence)

# Each token is tagged on its own (a one-element list per call), so the tag
# does not depend on the neighbouring words.
tagged_tokens = [(token, nltk.pos_tag([token])[0][1]) for token in tokens]

# Extract features for each token in the new sentence
features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]

# Use the trained CRF model to predict labels for the tokens
predicted_labels = crf_model.predict([features])[0]

# Classify the original sentence
inputs_cls = tokenizer_cls(new_sentence, return_tensors="pt")
with torch.no_grad():
    outputs_cls = model_cls(**inputs_cls)

# Softmax probabilities and the most probable class for the original sentence
probabilities_cls = torch.softmax(outputs_cls.logits, dim=1).squeeze().tolist()
predicted_class = torch.argmax(outputs_cls.logits, dim=1).item()

if predicted_class == GRAMMATICAL_CLASS:
    print("The sentence is grammatically correct.")
else:
    # Collect (corrected sentence, P(grammatical), cosine similarity) tuples.
    grammar_correction_candidates = []

    # The embedding layer does not change between iterations; look it up once.
    embedding_layer = model_mlm.get_input_embeddings()

    for i, (token, predicted_label) in enumerate(zip(tokens, predicted_labels)):
        # Only words the CRF labelled as verbs are candidates for replacement.
        if not predicted_label.startswith('VB'):
            continue

        # Mask the verb and let the masked LM propose replacements.
        masked_words = tokens.copy()
        masked_words[i] = tokenizer_mlm.mask_token
        masked_sentence = " ".join(masked_words)

        tokens_mlm = tokenizer_mlm(masked_sentence, return_tensors="pt")

        # Column index of the (first) mask token in the encoded sentence.
        masked_index = torch.where(tokens_mlm["input_ids"] == tokenizer_mlm.mask_token_id)[1][0]

        with torch.no_grad():
            outputs = model_mlm(**tokens_mlm)
            predictions_mlm = outputs.logits

            # Mean input embedding of the original word; it is the same for
            # every candidate, so compute it once per masked word.
            original_ids = tokenizer_mlm.encode(token, add_special_tokens=False)
            original_embedding = embedding_layer(torch.tensor(original_ids)).mean(dim=0)

        # Top predicted vocabulary entries for the masked position.
        top_predictions_mlm = torch.topk(predictions_mlm[0, masked_index], k=TOP_K_PREDICTIONS)
        candidates_mlm = [tokenizer_mlm.decode(idx.item()) for idx in top_predictions_mlm.indices]

        for candidate_mlm in candidates_mlm:
            candidate_ids = tokenizer_mlm.encode(candidate_mlm, add_special_tokens=False)

            # A blank candidate would just delete the verb, and an empty id
            # list cannot be embedded; neither is a usable correction.
            if not candidate_mlm.strip() or not candidate_ids:
                continue

            with torch.no_grad():
                candidate_embedding = embedding_layer(torch.tensor(candidate_ids)).mean(dim=0)

            # Cosine similarity between the original word and the candidate.
            similarity = torch.nn.functional.cosine_similarity(
                original_embedding.unsqueeze(0), candidate_embedding.unsqueeze(0)
            ).item()

            # Rebuild the sentence; split/join collapses the extra whitespace
            # that decoded tokens may carry.
            replaced_words = masked_words.copy()
            replaced_words[i] = candidate_mlm
            corrected_sentence = " ".join(" ".join(replaced_words).split())

            # Score the rewritten sentence. Separate names are used so the
            # classification of the original sentence is not overwritten.
            candidate_inputs = tokenizer_cls(corrected_sentence, return_tensors="pt")
            with torch.no_grad():
                candidate_outputs = model_cls(**candidate_inputs)

            # Rank by the probability of the *grammatical* class. Using the
            # probability of whichever class won would push a confidently
            # ungrammatical rewrite to the top of the list.
            grammatical_probability = torch.softmax(candidate_outputs.logits, dim=1)[0, GRAMMATICAL_CLASS].item()

            grammar_correction_candidates.append((corrected_sentence, grammatical_probability, similarity))

    # Sort by grammaticality probability, then cosine similarity, both descending.
    grammar_correction_candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)

    # Print the highest-ranked grammar correction candidates
    print("Grammar correction candidates:")
    for candidate, probability, cosine_similarity in grammar_correction_candidates[:TOP_N_REPORTED]:
        print("Candidate:", candidate)
        print("Probability:", probability)
        print("Cosine Similarity:", cosine_similarity)
        print()

print("Original sentence POS Tags:", predicted_labels)


Grammar correction candidates:
Candidate: Siya ay nagising kanina .
Probability: 0.9917004704475403
Cosine Similarity: 0.18928596377372742

Candidate: Siya ay dumating kanina .
Probability: 0.9892023205757141
Cosine Similarity: 0.002990148961544037

Candidate: Siya ay namatay kanina .
Probability: 0.9889046549797058
Cosine Similarity: -0.04294966533780098

Candidate: Siya ay nagbitiw kanina .
Probability: 0.9842618703842163
Cosine Similarity: -0.029277324676513672

Candidate: Siya ay nahuli kanina .
Probability: 0.9830281734466553
Cosine Similarity: -0.02716892771422863

Original sentence POS Tags: ['PRS', 'LM', 'VBTF', 'RBW', 'PMP']
