import numpy as np
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import gradio as gr
import re

# Download the Brown corpus and the universal tagset mapping
nltk.download('brown')
nltk.download('universal_tagset')

# Brown corpus sentences as lists of (word, tag) pairs, using universal tags
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
# Hand-crafted example sentence used to replace one Brown sentence
sentence = [
    ('The', 'DET'),
    ('dog', 'NOUN'),
    ('jumps', 'VERB'),
    ('over', 'ADP'),
    ('the', 'DET'),
    ('car', 'NOUN')
]

# tagged_sents returns a lazy corpus view; convert to a list so it can be modified
corpus = list(corpus)
corpus[21058] = sentence
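# Optional sanity check (illustrative addition, not part of the original pipeline):
# confirm the custom sentence is now in place at index 21058.
assert corpus[21058] == sentence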
def word_features(sentence, i, prev_tag):
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0,  # first word of the sentence
        'is_last': i == len(sentence) - 1,  # last word of the sentence
        'is_capitalized': word[0].upper() == word[0],  # first letter is uppercase
        'is_all_caps': word.upper() == word,  # word is all uppercase
        'is_all_lower': word.lower() == word,  # word is all lowercase
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-un': word[:2] == 'un',  # word starts with "un"
        'prefix-re': word[:2] == 're',  # word starts with "re"
        'prefix-over': word[:4] == 'over',  # word starts with "over"
        'prefix-dis': word[:3] == 'dis',  # word starts with "dis"
        'prefix-mis': word[:3] == 'mis',  # word starts with "mis"
        'prefix-pre': word[:3] == 'pre',  # word starts with "pre"
        'prefix-non': word[:3] == 'non',  # word starts with "non"
        'prefix-de': word[:2] == 'de',  # word starts with "de"
        'prefix-in': word[:2] == 'in',  # word starts with "in"
        'prefix-en': word[:2] == 'en',  # word starts with "en"
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word[-2:] == 'ed',  # word ends with "ed"
        'suffix-ing': word[-3:] == 'ing',  # word ends with "ing"
        'suffix-es': word[-2:] == 'es',  # word ends with "es"
        'suffix-s': word[-1] == 's',  # word ends with "s"
        'suffix-ly': word[-2:] == 'ly',  # word ends with "ly"
        'suffix-ment': word[-4:] == 'ment',  # word ends with "ment"
        'suffix-er': word[-2:] == 'er',  # word ends with "er"
        'prev_word': '' if i == 0 else sentence[i-1][0],
        'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
        'has_hyphen': '-' in word,  # word contains a hyphen
        'is_numeric': word.isdigit(),  # word is numeric
        'capitals_inside': word[1:].lower() != word[1:],  # capital letters after the first character
        'is_first_capital': word[0].upper() == word[0],  # same check as is_capitalized
        'suffix-s_and-prev_tag_noun': word[-1] == 's' and prev_tag == 'NOUN',  # ends with "s" and previous tag is NOUN
        'prev_tag': prev_tag,
    }
    return features
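# Illustrative check (added for clarity, not part of the original script):
# inspect a few of the features generated for "jumps" in the hand-crafted
# sentence above, with the previous tag assumed to be 'NOUN'.
example_features = word_features(sentence, 2, 'NOUN')
print(example_features['word'], example_features['suffix-s'], example_features['prev_tag'])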
X = []
y = []
for sentence in corpus:
    X_sentence = []
    y_sentence = []
    for i in range(len(sentence)):
        X_sentence.append(word_features(sentence, i, '' if i == 0 else sentence[i-1][1]))
        y_sentence.append(sentence[i][1])
    X.append(X_sentence)
    y.append(y_sentence)

# Split the data into training and testing sets (80/20)
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]
# Train a CRF model on the training data
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
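# Optional inspection (illustrative addition): the fitted model exposes learned
# transition weights between tag pairs; printing the strongest ones is a quick
# sanity check that the CRF has picked up sensible tag-to-tag patterns.
from collections import Counter
top_transitions = Counter(crf.transition_features_).most_common(5)
for (tag_from, tag_to), weight in top_transitions:
    print(f"{tag_from} -> {tag_to}: {weight:.3f}")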
# Make predictions on the test data and evaluate the performance
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))
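# Optional per-tag breakdown (illustrative addition): flat_classification_report
# from sklearn_crfsuite.metrics reports precision, recall and F1 for each tag.
print(metrics.flat_classification_report(y_test, y_pred, labels=list(crf.classes_), digits=3))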
def predict_tags(sentence):
    tokens = sentence.split()
    # Dummy (word, tag) pairs so word_features can index the sentence
    tokens2 = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # Use the model's prediction for the previous token as prev_tag
            prev_tag = crf.predict([features])[0][i-1]
        features.append(word_features(tokens2, i, prev_tag))
    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
# Example usage
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)
def tagging(text):
    # Separate trailing punctuation from words so it is tagged as its own token
    text = re.sub(r'(\S)([.,;:!?])', r'\1 \2', text.strip())
    tagged_list = predict_tags(text)
    output = ''.join(f"{word}[{tag}] " for word, tag in tagged_list)
    return output.strip()
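# Quick local check of the Gradio wrapper (illustrative addition): punctuation
# should come out as its own tagged token before the interface is launched.
print(tagging("The dog walks over the car."))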
interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft()
)
interface.launch(inline=False)