Spaces:
Sleeping
Sleeping
madhavkotecha
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -6,4 +6,133 @@ import gradio as gr
|
|
6 |
|
7 |
nltk.download('brown')
|
8 |
nltk.download('universal_tagset')
|
9 |
-
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
# Fetch the Brown corpus and the universal-tagset mapping (no-ops if cached).
nltk.download('brown')
nltk.download('universal_tagset')

# Brown corpus as sentences of (word, universal-tag) pairs.
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')

# Hand-tagged sample sentence that overwrites one corpus entry below.
sentence = [
    ('The', 'DET'),
    ('dog', 'NOUN'),
    ('jumps', 'VERB'),
    ('over', 'ADP'),
    ('the', 'DET'),
    ('car', 'NOUN'),
]

# Materialize the lazy corpus view so a single entry can be replaced.
corpus = list(corpus)
corpus[21058] = sentence
def word_features(sentence, i, prev_tag):
    """Build the CRF feature dict for the word at position ``i`` of ``sentence``.

    Parameters
    ----------
    sentence : list of (word, tag) pairs; only the word part is read here.
    i : int
        Index of the target word within ``sentence``.
    prev_tag : str
        Tag of the previous word ('' for the first word).

    Returns
    -------
    dict mapping feature names to str/bool values.
    """
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0,  # first word of the sentence
        'is_last': i == len(sentence) - 1,  # last word of the sentence
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,  # word is in uppercase
        'is_all_lower': word.lower() == word,  # word is in lowercase
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # BUG FIX: the original compared slices of the wrong length
        # (e.g. word[:4] == 'dis', word[:3] == 'de'), so most of these
        # prefix flags could never be True. str.startswith is exact.
        'prefix-un': word.startswith('un'),
        'prefix-re': word.startswith('re'),
        'prefix-over': word.startswith('over'),
        'prefix-dis': word.startswith('dis'),
        'prefix-mis': word.startswith('mis'),
        'prefix-pre': word.startswith('pre'),
        'prefix-non': word.startswith('non'),
        'prefix-de': word.startswith('de'),
        'prefix-in': word.startswith('in'),
        'prefix-en': word.startswith('en'),
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word.endswith('ed'),  # past-tense marker
        'suffix-ing': word.endswith('ing'),  # gerund marker
        'suffix-es': word.endswith('es'),
        'suffix-s': word.endswith('s'),  # plural / 3rd-person marker
        'suffix-ly': word.endswith('ly'),  # adverb marker
        'suffix-ment': word.endswith('ment'),
        'suffix-er': word.endswith('er'),
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        'next_word': '' if i == len(sentence) - 1 else sentence[i + 1][0],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:],
        # NOTE(review): duplicates 'is_capitalized'; kept so the feature
        # set (and any saved model) stays compatible.
        'is_first_capital': word[0].upper() == word[0],
        'suffix-s_and-prev_tag_noun': word.endswith('s') and prev_tag == 'NOUN',
        'prev_tag': prev_tag,
    }
    return features
# Turn each tagged sentence into a parallel (feature-dict list, tag list)
# pair; the gold tag of the previous token feeds the prev_tag feature.
X = []
y = []
for sentence in corpus:
    feats = []
    labels = []
    for idx in range(len(sentence)):
        prev = '' if idx == 0 else sentence[idx - 1][1]
        feats.append(word_features(sentence, idx, prev))
        labels.append(sentence[idx][1])
    X.append(feats)
    y.append(labels)
# Hold out the final 20% of sentences for evaluation.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Fit a linear-chain CRF with L-BFGS and elastic-net regularization.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization weight
    c2=0.1,  # L2 regularization weight
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Token-level accuracy on the held-out sentences.
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))
def predict_tags(sentence):
    """Tag a raw sentence string with the trained CRF.

    The sentence is whitespace-tokenized; tags are decoded left to right,
    feeding each predicted tag back in as the next token's ``prev_tag``
    feature.

    Parameters
    ----------
    sentence : str

    Returns
    -------
    list of (token, predicted_tag) pairs.
    """
    tokens = sentence.split()
    # Dummy (word, '') pairs so word_features can index words uniformly.
    pairs = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    # BUG FIX: the original updated prev_tag only AFTER building token i's
    # features, so token i saw the tag of token i-2 (and an unused
    # prev_prev_tag variable hinted at the confusion). Update first.
    for i in range(len(tokens)):
        if i > 0:
            # Decode the prefix built so far to get the tag of token i-1.
            prev_tag = crf.predict([features])[0][i - 1]
        features.append(word_features(pairs, i, prev_tag))

    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
114 |
+
# Example usage: tag a fresh sentence and show the (token, tag) pairs.
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)
119 |
+
def tagging(input):
    """Gradio handler: annotate each token of ``input`` as ``word[TAG]``.

    BUG FIX: the original joined over an undefined name ``input_list``,
    raising NameError on every call; it must iterate the predicted pairs.
    """
    tagged_list = predict_tags(input)
    output = ' '.join(f"{word}[{tag}] " for word, tag in tagged_list)
    return output
125 |
+
# Gradio UI: one textbox in, tagged sentence out.
interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False, share=True)