In [1]:
import transformers 

 from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")
model = AutoModelForTokenClassification.from_pretrained("syke9p3/bert-tagalog-base-uncased-pos-tagger")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [48]:
import pandas as pd
import torch

symbol_tags = {
 ".": "PMP",
 "!": "PME",
 "?": "PMQ",
 ",": "PMC",
 ";": "PMSC",
 "@": "PMS",
 "/": "PMS",
 "+": "PMS",
 "*": "PMS",
 "(": "PMS",
 ")": "PMS",
 "\"": "PMS",
 "'": "PMS",
 "~": "PMS",
 "&": "PMS",
 "%": "PMS",
 "$": "PMS",
 "#": "PMS",
 "=": "PMS",
 "-": "PMS",
 ":": "PMS"
}

def process_sentence(sentence):
 tokens = tokenizer(sentence, return_tensors="pt", padding=False, truncation=True)

 with torch.no_grad():
 outputs = model(**tokens)

 predictions = torch.argmax(outputs.logits, dim=-1)
 predicted_tags = [model.config.id2label[tag_id] for tag_id in predictions[0].tolist()]

 tokens_list = tokens['input_ids'].tolist()[0] 

 # Replace the POS tags for punctuation symbols
 for i, token in enumerate(tokens_list):
 if tokenizer.convert_ids_to_tokens(token) in symbol_tags:
 predicted_tags[i] = symbol_tags[tokenizer.convert_ids_to_tokens(token)]

 result = [(tokenizer.convert_ids_to_tokens(token), tag) for token, tag in zip(tokens_list, predicted_tags) if token not in [tokenizer.pad_token_id, tokenizer.sep_token_id, tokenizer.cls_token_id]]
 return result

def process_csv(file_path):
 df = pd.read_csv(file_path)
 data = []
 sentences = []
 for index, row in df.iterrows():
 sentence = row['Sentence']
 tags = process_sentence(sentence)
 sentence_data = []
 for token, tag in tags:
 sentence_data.append((token, tag))
 sentences.append(sentence_data)
 return sentences

output = process_csv(r"D:\Thesis\Datasets\COpy - Sheet1.csv")
print(output)


[[('iminumungkahi', 'VBTR'), ('ni', 'DTP'), ('senador', 'NNP'), ('franklin', 'NNP'), ('drilon', 'NNP'), ('na', 'CCP'), ('tanggalin', 'VBOF'), ('na', 'CCP'), ('lang', 'RBI'), ('ang', 'DTC'), ('pork', 'FW'), ('barrel', 'FW'), ('na', 'CCP'), ('mariin', 'JJD'), ('namang', 'RBI_CCP'), ('tinutulan', 'VBTS'), ('ni', 'DTP'), ('quezon', 'NNP'), ('city', 'NNP'), ('rep', 'NNP'), ('.', 'PMP'), ('feliciano', 'NNP'), ('belmonte', 'NNP'), ('.', 'PMP')], [('bibigyan', 'VBTF'), ('ng', 'CCB'), ('legal', 'JJD'), ('assistance', 'FW'), ('ng', 'CCB'), ('philippine', 'NNP'), ('national', 'NNP'), ('police', 'NNP'), ('(', 'PMS'), ('pnp', 'NNPA'), (')', 'PMS'), ('ang', 'DTC'), ('babaeng', 'NNC_CCP'), ('pulis', 'NNC'), ('na', 'CCP'), ('sinaktan', 'VBTS'), ('ng', 'CCB'), ('sinibak', 'VBTS'), ('na', 'CCP'), ('hepe', 'NNC'), ('ng', 'CCB'), ('eastern', 'NNP'), ('police', 'NNP'), ('district', 'NNP'), ('(', 'PMS'), ('epd', 'NNPA'), (')', 'PMS'), ('.', 'PMP')], [('sinabi', 'VBTS'), ('pa', 'RBI'), ('ni', 'DTP'), ('albay

In [49]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

In [5]:
train_set,test_set = train_test_split(output,train_size=0.85,test_size=0.15,random_state=101)
train_set,val_set = train_test_split(output,train_size=0.80,test_size=0.20,random_state=101)

In [6]:
# extract features from a given sentence
def word_features(sent, i):
 word = sent[i][0]
 pos = sent[i][1]
 
 # first word
 if i == 0:
 prevword = ''
 prevpos = ''
 else:
 prevword = sent[i-1][0]
 prevpos = sent[i-1][1]
 
 # first or second word
 if i == 0 or i == 1:
 prev2word = ''
 prev2pos = ''
 else:
 prev2word = sent[i-2][0]
 prev2pos = sent[i-2][1]
 
 # last word
 if i == len(sent) - 1:
 nextword = ''
 nextpos = ''
 else:
 nextword = sent[i+1][0]
 nextpos = sent[i+1][1]
 
 # suffixes and prefixes
 pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]
 suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]
 
 return {'word':word, 
 'prevword': prevword,
 'prevpos': prevpos, 
 'nextword': nextword, 
 'nextpos': nextpos, 
 'suff_1': suff_1, 
 'suff_2': suff_2, 
 'suff_3': suff_3, 
 'suff_4': suff_4, 
 'pref_1': pref_1, 
 'pref_2': pref_2, 
 'pref_3': pref_3, 
 'pref_4': pref_4,
 'prev2word': prev2word,
 'prev2pos': prev2pos 
 }


In [7]:
print(train_set[0][0:5])

word_features(train_set[0],3)

[('wala', 'VBN'), ('aniya', 'PRS'), ('siyang', 'PRS_CCP'), ('pakialam', 'NNC'), ('kung', 'CCR')]


{'word': 'pakialam',
 'prevword': 'siyang',
 'prevpos': 'PRS_CCP',
 'nextword': 'kung',
 'nextpos': 'CCR',
 'suff_1': 'm',
 'suff_2': 'am',
 'suff_3': 'lam',
 'suff_4': 'alam',
 'pref_1': 'p',
 'pref_2': 'pa',
 'pref_3': 'pak',
 'pref_4': 'paki',
 'prev2word': 'aniya',
 'prev2pos': 'PRS'}

In [8]:
def sent2features(sent):
 return [word_features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
 return [postag for word, postag in sent]

def sent2tokens(sent):
 return [word for word, postag in sent] 

In [9]:
X_train = [sent2features(s) for s in train_set]
y_train = [sent2labels(s) for s in train_set]

X_valid = [sent2features(s) for s in val_set]
y_valid = [sent2labels(s) for s in val_set]

X_test = [sent2features(s) for s in test_set]
y_test = [sent2labels(s) for s in test_set]

In [10]:
print(X_train[0][1:10])

[{'word': 'aniya', 'prevword': 'wala', 'prevpos': 'VBN', 'nextword': 'siyang', 'nextpos': 'PRS_CCP', 'suff_1': 'a', 'suff_2': 'ya', 'suff_3': 'iya', 'suff_4': 'niya', 'pref_1': 'a', 'pref_2': 'an', 'pref_3': 'ani', 'pref_4': 'aniy', 'prev2word': '', 'prev2pos': ''}, {'word': 'siyang', 'prevword': 'aniya', 'prevpos': 'PRS', 'nextword': 'pakialam', 'nextpos': 'NNC', 'suff_1': 'g', 'suff_2': 'ng', 'suff_3': 'ang', 'suff_4': 'yang', 'pref_1': 's', 'pref_2': 'si', 'pref_3': 'siy', 'pref_4': 'siya', 'prev2word': 'wala', 'prev2pos': 'VBN'}, {'word': 'pakialam', 'prevword': 'siyang', 'prevpos': 'PRS_CCP', 'nextword': 'kung', 'nextpos': 'CCR', 'suff_1': 'm', 'suff_2': 'am', 'suff_3': 'lam', 'suff_4': 'alam', 'pref_1': 'p', 'pref_2': 'pa', 'pref_3': 'pak', 'pref_4': 'paki', 'prev2word': 'aniya', 'prev2pos': 'PRS'}, {'word': 'kung', 'prevword': 'pakialam', 'prevpos': 'NNC', 'nextword': 'ilan', 'nextpos': 'PRQ', 'suff_1': 'g', 'suff_2': 'ng', 'suff_3': 'ung', 'suff_4': 'kung', 'pref_1': 'k', 'pref

In [11]:
y_train[0]

['VBN',
 'PRS',
 'PRS_CCP',
 'NNC',
 'CCR',
 'PRQ',
 'RBI',
 'DTC',
 '[PAD]',
 'VBAF',
 'CCT',
 'NNC',
 'CCR',
 'JJD',
 'CCP',
 'RBF',
 'VBTF',
 'PRO',
 'CCB',
 'RBF',
 'VBS',
 'CCB',
 'PRI',
 'PMP']

In [12]:
crf = sklearn_crfsuite.CRF(
 algorithm='lbfgs',
 c1=0.1,
 c2=0.1,
 max_iterations=100,
 all_possible_transitions=True
)
try:
 crf.fit(X_train, y_train)
except AttributeError:
 pass
predictions = crf.predict(X_test)

In [60]:
from sklearn.metrics import classification_report

# Get the list of unique labels
labels = list(crf.classes_)

# Predict labels for the validation set
y_pred = crf.predict(X_valid)

from sklearn.preprocessing import MultiLabelBinarizer

# Convert labels to strings
y_valid_str = [[str(label)] for label in y_valid]
y_pred_str = [[str(label)] for label in y_pred]

# Convert labels to binary array format
mlb = MultiLabelBinarizer()
y_valid_bin = mlb.fit_transform(y_valid_str)
y_pred_bin = mlb.transform(y_pred_str)

# Print classification report
print(classification_report(y_valid_bin, y_pred_bin, labels=labels))





UFuncTypeError: ufunc 'maximum' did not contain a loop with signature matching types (dtype(' None

In [41]:
new_sentence = "Tumakbo ako "

# Tokenize the new sentence
tokens = nltk.word_tokenize(new_sentence)


tagged_tokens = []

for token in tokens:
 pos_tag = nltk.pos_tag([token])[0][1]
 tagged_tokens.append((token, pos_tag))


# Extract features for each token in the new sentence
features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]

# Use the trained CRF model to predict labels for the tokens
predicted_labels = crf.predict([features])[0]

# Combine tokens with predicted labels
predicted_tokens_with_labels = list(zip(tokens, predicted_labels))

print(predicted_tokens_with_labels)



[('Tumakbo', 'VBTS'), ('ako', 'PRS')]


In [36]:
import joblib

# Assuming crf_model is your trained CRF model
joblib.dump(crf, 'crf_model.pkl')


['crf_model.pkl']