"""Train a CRF part-of-speech tagger on the Brown corpus and serve it via Gradio.

Running this module downloads the Brown corpus and the universal tagset
mapping, trains a ``sklearn_crfsuite.CRF`` on the first 80% of the sentences,
prints the flat accuracy on the remaining 20%, prints the tags predicted for
an example sentence, and finally launches a Gradio text interface.
"""
import numpy as np
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import gradio as gr

nltk.download('brown')
nltk.download('universal_tagset')

corpus = nltk.corpus.brown.tagged_sents(tagset='universal')

sentence = [
    ('The', 'DET'),
    ('dog', 'NOUN'),
    ('jumps', 'VERB'),
    ('over', 'ADP'),
    ('the', 'DET'),
    ('car', 'NOUN'),
]

# Materialise the corpus view so one entry can be overwritten with the
# hand-tagged sentence above.
corpus = list(corpus)
corpus[21058] = sentence


def word_features(sentence, i, prev_tag):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs. Only the word element
            (index 0) of each pair is read here.
        i: Index of the token to describe.
        prev_tag: Tag assigned to the preceding token (``''`` for the first
            token).

    Returns:
        A dict mapping feature names to string or boolean values.
    """
    word = sentence[i][0]
    last_index = len(sentence) - 1
    # Slices instead of word[0] / word[-1] so an empty token cannot raise
    # IndexError; for non-empty words the values are identical.
    first_char = word[:1]
    starts_upper = first_char.upper() == first_char
    ends_with_s = word.endswith('s')

    features = {
        'word': word,
        'is_first': i == 0,
        'is_last': i == last_index,
        'is_capitalized': starts_upper,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # startswith() keeps the compared length equal to the prefix length;
        # comparing a 4-character slice against a 3-letter prefix (or a
        # 3-character slice against a 2-letter prefix) can only match when
        # the whole word equals the prefix.
        'prefix-un': word.startswith('un'),
        'prefix-re': word.startswith('re'),
        'prefix-over': word.startswith('over'),
        'prefix-dis': word.startswith('dis'),
        'prefix-mis': word.startswith('mis'),
        'prefix-pre': word.startswith('pre'),
        'prefix-non': word.startswith('non'),
        'prefix-de': word.startswith('de'),
        'prefix-in': word.startswith('in'),
        'prefix-en': word.startswith('en'),
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word.endswith('ed'),
        'suffix-ing': word.endswith('ing'),
        'suffix-es': word.endswith('es'),
        'suffix-s': ends_with_s,
        'suffix-ly': word.endswith('ly'),
        'suffix-ment': word.endswith('ment'),
        'suffix-er': word.endswith('er'),
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        'next_word': '' if i == last_index else sentence[i + 1][0],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first is an upper-case letter.
        'capitals_inside': word[1:].lower() != word[1:],
        # Same value as 'is_capitalized'; both keys are kept so the feature
        # set exposed to the model keeps its original names.
        'is_first_capital': starts_upper,
        'suffix-s_and-prev_tag_noun': ends_with_s and prev_tag == 'NOUN',
        'prev_tag': prev_tag,
    }
    return features


# Build per-sentence feature and label sequences. During training the
# previous-tag feature is taken from the gold tag of the preceding token.
X = []
y = []
for tagged_sentence in corpus:
    X.append([
        word_features(
            tagged_sentence,
            i,
            '' if i == 0 else tagged_sentence[i - 1][1],
        )
        for i in range(len(tagged_sentence))
    ])
    y.append([tag for _, tag in tagged_sentence])

# Split the data into training and testing sets (first 80% / last 20%).
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]

# Train a CRF model on the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Make predictions on the test data and evaluate the performance.
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))


def predict_tags(sentence):
    """Tag a whitespace-separated sentence with the trained CRF.

    Features are built left to right. Because each token's features include
    the previous token's tag, the tag of token ``i - 1`` is predicted from
    the features built so far *before* the features of token ``i`` are
    computed, mirroring how the training features use the preceding tag.

    Args:
        sentence: Raw input string; tokens are obtained with ``str.split()``.

    Returns:
        A list of ``(token, tag)`` pairs, empty when the input contains no
        tokens.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag; avoid handing an empty sequence to the model.
        return []

    # word_features expects (word, tag) pairs; the tag slot is unused.
    untagged = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # `features` currently holds tokens 0..i-1, so the last
            # predicted label is the tag of the immediately preceding token.
            prev_tag = crf.predict([features])[0][-1]
        features.append(word_features(untagged, i, prev_tag))

    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))


# Example usage
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)


def tagging(input):
    """Return the sentence with each word followed by its tag in brackets.

    Args:
        input: Raw sentence text from the UI. The name shadows the builtin
            but is kept so keyword callers are unaffected.

    Returns:
        A string of ``word[TAG] `` segments, one per token; each segment,
        including the last, ends with a space.
    """
    tagged_list = predict_tags(input)
    output = ''.join(f"{word}[{tag}] " for word, tag in tagged_list)
    return output


interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)

interface.launch(inline=False, share=True)