In [None]:
import joblib
import nltk
import csv

# Load the saved CRF model from its pickled file on disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only point this at a .pkl that comes from a trusted source.
crf_model = joblib.load(r'D:\Thesis\POS Tag Automation\crf_model.pkl')

def word_features(sent, i):
    """Build the feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of ``(word, pos)`` pairs describing one sentence.
        i: Index of the token to describe.

    Returns:
        A dict holding the token text, its 1-4 character prefixes and
        suffixes, and the word/POS of the two preceding tokens and of the
        following token. Neighbours that fall outside the sentence are
        reported as empty strings.
    """
    # Only the token text is needed here: the returned feature set has no
    # entry for the current token's own POS tag, so it is not read.
    word = sent[i][0]

    # Previous token ('' when this is the first word).
    if i == 0:
        prevword, prevpos = '', ''
    else:
        prevword, prevpos = sent[i - 1][0], sent[i - 1][1]

    # Token two positions back ('' for the first and second word).
    if i == 0 or i == 1:
        prev2word, prev2pos = '', ''
    else:
        prev2word, prev2pos = sent[i - 2][0], sent[i - 2][1]

    # Next token ('' when this is the last word).
    if i == len(sent) - 1:
        nextword, nextpos = '', ''
    else:
        nextword, nextpos = sent[i + 1][0], sent[i + 1][1]

    # Slicing never raises, so words shorter than four characters simply
    # yield the whole word for the longer prefixes/suffixes.
    return {
        'word': word,
        'prevword': prevword,
        'prevpos': prevpos,
        'nextword': nextword,
        'nextpos': nextpos,
        'suff_1': word[-1:],
        'suff_2': word[-2:],
        'suff_3': word[-3:],
        'suff_4': word[-4:],
        'pref_1': word[:1],
        'pref_2': word[:2],
        'pref_3': word[:3],
        'pref_4': word[:4],
        'prev2word': prev2word,
        'prev2pos': prev2pos,
    }

# Function to process a sentence and output tokens with their POS tags
def process_sentence(sentence, label):
    """Tokenize ``sentence`` and tag every token with the loaded CRF model.

    Args:
        sentence: Raw sentence text.
        label: Value to repeat once for every token of the sentence.

    Returns:
        A tuple ``(tokens, pos_tags, labels)`` of three equally long lists:
        the tokens, the tag ``crf_model`` predicted for each token, and
        ``label`` repeated once per token. All three lists are empty when
        the sentence yields no tokens.
    """
    tokens = nltk.word_tokenize(sentence)
    if not tokens:
        # Nothing to tag; avoid handing the model an empty sequence.
        return [], [], []

    # NOTE(review): every token is tagged on its own, so nltk.pos_tag sees no
    # sentence context. Kept unchanged because the saved model's input
    # features were presumably built the same way -- confirm against the
    # training code before switching to nltk.pos_tag(tokens).
    tagged_tokens = [(token, nltk.pos_tag([token])[0][1]) for token in tokens]
    features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]

    # The model is given a batch containing one sentence; take that sentence's tags.
    predicted_labels = crf_model.predict([features])[0]

    # zip() keeps tokens and tags paired one-to-one in the returned lists.
    paired = list(zip(tokens, predicted_labels))
    input_tokens = [token for token, _ in paired]
    pos_tags = [tag for _, tag in paired]
    return input_tokens, pos_tags, [label] * len(input_tokens)

# Input CSV file path (raw string so the Windows backslashes are kept literally
# instead of being parsed as escape sequences).
input_csv_file = r"D:\Thesis\Datasets\preprocessed_dataset.csv"
# Output CSV file path
output_csv_file = "testing_bert_finetune.csv"

# Read (sentence, label) rows from the input CSV and write one output row per
# token, with every sentence wrapped in a [CLS] row and a [SEP] row.
with open(input_csv_file, 'r', newline='', encoding='utf-8') as csv_input_file, \
        open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_output_file:
    reader = csv.reader(csv_input_file)
    writer = csv.writer(csv_output_file)

    # Write header to output CSV file
    writer.writerow(['sentence', 'pos_tag', 'label'])

    # Skip header row in input CSV file; the default keeps an empty input
    # file from raising StopIteration.
    next(reader, None)

    for row in reader:
        # csv.reader yields an empty list for a blank line; there is nothing
        # to process in that case.
        if not row:
            continue
        sentence, label = row[0], row[1]

        # Process the sentence to obtain tokens with POS tags and labels
        tokens, pos_tags, labels = process_sentence(sentence, label)

        # Opening [CLS] marker row
        writer.writerow(['[CLS]', '[CLS]', '1'])
        # One row per token: the token, its tag wrapped as [POS_<tag>], and
        # the sentence label.
        writer.writerows(
            [token, '[POS_' + pos_tag + ']', token_label]
            for token, pos_tag, token_label in zip(tokens, pos_tags, labels)
        )
        # Closing [SEP] marker row at the end of the sentence
        writer.writerow(['[SEP]', '[SEP]', '1'])
