import numpy as np
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import gradio as gr
import re

# Download the Brown corpus and the universal tagset mapping
nltk.download('brown')
nltk.download('universal_tagset')

# Brown corpus sentences as lists of (word, tag) pairs, using universal tags
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
# Hand-crafted example sentence used to replace one Brown sentence
sentence = [
    ('The', 'DET'),
    ('dog', 'NOUN'),
    ('jumps', 'VERB'),
    ('over', 'ADP'),
    ('the', 'DET'),
    ('car', 'NOUN')
]

# tagged_sents returns a lazy corpus view; convert to a list so it can be modified
corpus = list(corpus)
corpus[21058] = sentence
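# Optional sanity check (illustrative addition, not part of the original pipeline):
# confirm the custom sentence is now in place at index 21058.
assert corpus[21058] == sentence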
def word_features(sentence, i, prev_tag):
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0,  # first word of the sentence
        'is_last': i == len(sentence) - 1,  # last word of the sentence
        'is_capitalized': word[0].upper() == word[0],  # first letter is uppercase
        'is_all_caps': word.upper() == word,  # word is all uppercase
        'is_all_lower': word.lower() == word,  # word is all lowercase
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-un': word[:2] == 'un',  # word starts with "un"
        'prefix-re': word[:2] == 're',  # word starts with "re"
        'prefix-over': word[:4] == 'over',  # word starts with "over"
        'prefix-dis': word[:3] == 'dis',  # word starts with "dis"
        'prefix-mis': word[:3] == 'mis',  # word starts with "mis"
        'prefix-pre': word[:3] == 'pre',  # word starts with "pre"
        'prefix-non': word[:3] == 'non',  # word starts with "non"
        'prefix-de': word[:2] == 'de',  # word starts with "de"
        'prefix-in': word[:2] == 'in',  # word starts with "in"
        'prefix-en': word[:2] == 'en',  # word starts with "en"
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word[-2:] == 'ed',  # word ends with "ed"
        'suffix-ing': word[-3:] == 'ing',  # word ends with "ing"
        'suffix-es': word[-2:] == 'es',  # word ends with "es"
        'suffix-s': word[-1] == 's',  # word ends with "s"
        'suffix-ly': word[-2:] == 'ly',  # word ends with "ly"
        'suffix-ment': word[-4:] == 'ment',  # word ends with "ment"
        'suffix-er': word[-2:] == 'er',  # word ends with "er"
        'prev_word': '' if i == 0 else sentence[i-1][0],
        'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
        'has_hyphen': '-' in word,  # word contains a hyphen
        'is_numeric': word.isdigit(),  # word is numeric
        'capitals_inside': word[1:].lower() != word[1:],  # capital letters after the first character
        'is_first_capital': word[0].upper() == word[0],  # same check as is_capitalized
        'suffix-s_and-prev_tag_noun': word[-1] == 's' and prev_tag == 'NOUN',  # ends with "s" and previous tag is NOUN
        'prev_tag': prev_tag,
    }
    return features
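# Illustrative check (added for clarity, not part of the original script):
# inspect a few of the features generated for "jumps" in the hand-crafted
# sentence above, with the previous tag assumed to be 'NOUN'.
example_features = word_features(sentence, 2, 'NOUN')
print(example_features['word'], example_features['suffix-s'], example_features['prev_tag'])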
X = []
y = []
for sentence in corpus:
    X_sentence = []
    y_sentence = []
    for i in range(len(sentence)):
        X_sentence.append(word_features(sentence, i, '' if i == 0 else sentence[i-1][1]))
        y_sentence.append(sentence[i][1])
    X.append(X_sentence)
    y.append(y_sentence)

# Split the data into training and testing sets (80/20)
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]
# Train a CRF model on the training data
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
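# Optional inspection (illustrative addition): the fitted model exposes learned
# transition weights between tag pairs; printing the strongest ones is a quick
# sanity check that the CRF has picked up sensible tag-to-tag patterns.
from collections import Counter
top_transitions = Counter(crf.transition_features_).most_common(5)
for (tag_from, tag_to), weight in top_transitions:
    print(f"{tag_from} -> {tag_to}: {weight:.3f}")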
# Make predictions on the test data and evaluate the performance
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))
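# Optional per-tag breakdown (illustrative addition): flat_classification_report
# from sklearn_crfsuite.metrics reports precision, recall and F1 for each tag.
print(metrics.flat_classification_report(y_test, y_pred, labels=list(crf.classes_), digits=3))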
def predict_tags(sentence):
    tokens = sentence.split()
    # Dummy (word, tag) pairs so word_features can index the sentence
    tokens2 = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # Use the model's prediction for the previous token as prev_tag
            prev_tag = crf.predict([features])[0][i-1]
        features.append(word_features(tokens2, i, prev_tag))
    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
# Example usage
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)
def tagging(text):
    # Separate trailing punctuation from words so it is tagged as its own token
    text = re.sub(r'(\S)([.,;:!?])', r'\1 \2', text.strip())
    tagged_list = predict_tags(text)
    output = ''.join(f"{word}[{tag}] " for word, tag in tagged_list)
    return output.strip()
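# Quick local check of the Gradio wrapper (illustrative addition): punctuation
# should come out as its own tagged token before the interface is launched.
print(tagging("The dog walks over the car."))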
interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft()
)
interface.launch(inline=False)