madhavkotecha committed on
Commit
135bc84
·
verified ·
1 Parent(s): faad496

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -1
app.py CHANGED
@@ -6,4 +6,133 @@ import gradio as gr
6
 
7
  nltk.download('brown')
8
  nltk.download('universal_tagset')
9
- corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  nltk.download('brown')
8
  nltk.download('universal_tagset')
9
# Hand-written tagged sentence that is spliced into the corpus below.
sentence = list(zip(
    ('The', 'dog', 'jumps', 'over', 'the', 'car'),
    ('DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN'),
))

# Load the Brown corpus with universal tags and copy it into a plain list so
# that a single entry can be overwritten by index.
corpus = list(nltk.corpus.brown.tagged_sents(tagset='universal'))
corpus[21058] = sentence
21
+
22
def word_features(sentence, i, prev_tag):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs. Only the word (element 0)
            of each pair is read here.
        i: Index of the token to describe.
        prev_tag: Tag of the preceding token, or ``''`` for the first token.

    Returns:
        A dict mapping feature names to string or boolean values.
    """
    word = sentence[i][0]
    # Slices instead of word[0] / word[-1] so an empty token yields '' rather
    # than raising IndexError; for non-empty words the values are identical.
    first_char = word[:1]
    last_char = word[-1:]
    at_start = i == 0
    at_end = i == len(sentence) - 1
    starts_upper = first_char.upper() == first_char
    ends_with_s = last_char == 's'

    features = {
        'word': word,
        'is_first': at_start,  # token opens the sentence
        'is_last': at_end,  # token closes the sentence
        'is_capitalized': starts_upper,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # startswith() keeps the compared length in sync with the prefix.
        # The previous fixed-width slices (e.g. word[:4] == 'dis') could only
        # match when the whole word was exactly the prefix.
        'prefix-un': word.startswith('un'),
        'prefix-re': word.startswith('re'),
        'prefix-over': word.startswith('over'),
        'prefix-dis': word.startswith('dis'),
        'prefix-mis': word.startswith('mis'),
        'prefix-pre': word.startswith('pre'),
        'prefix-non': word.startswith('non'),
        'prefix-de': word.startswith('de'),
        'prefix-in': word.startswith('in'),
        'prefix-en': word.startswith('en'),
        'suffix-1': last_char,
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word.endswith('ed'),
        'suffix-ing': word.endswith('ing'),
        'suffix-es': word.endswith('es'),
        'suffix-s': ends_with_s,
        'suffix-ly': word.endswith('ly'),
        'suffix-ment': word.endswith('ment'),
        'suffix-er': word.endswith('er'),
        'prev_word': '' if at_start else sentence[i - 1][0],
        'next_word': '' if at_end else sentence[i + 1][0],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first changes under lower().
        'capitals_inside': word[1:].lower() != word[1:],
        'is_first_capital': starts_upper,  # same value as 'is_capitalized'
        # Word ends in "s" directly after a token tagged NOUN.
        'suffix-s_and-prev_tag_noun': ends_with_s and prev_tag == 'NOUN',
        'prev_tag': prev_tag,
    }
    return features
64
+
65
# X holds one list of feature dicts per sentence, y the matching gold tags.
X = []
y = []
for sentence in corpus:
    gold_tags = [pair[1] for pair in sentence]
    # Each token is given the gold tag of its left neighbour ('' at index 0).
    X.append([
        word_features(sentence, idx, '' if idx == 0 else gold_tags[idx - 1])
        for idx in range(len(gold_tags))
    ])
    y.append(gold_tags)


# First 80% of the sentences are used for training, the remainder for testing.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Fit the CRF on the training portion.
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

# Tag the held-out sentences and report token-level accuracy.
y_pred = crf.predict(X_test)

print(metrics.flat_accuracy_score(y_test, y_pred))
98
+
99
def predict_tags(sentence):
    """Tag a whitespace-separated sentence with the trained CRF.

    Features are built left to right. Before the features for token ``i`` are
    computed, the tokens seen so far are tagged and the tag predicted for
    token ``i - 1`` is passed in as ``prev_tag``, matching how training
    supplies the tag of the immediately preceding token.

    Args:
        sentence: Raw input string; it is split on whitespace.

    Returns:
        A list of ``(token, tag)`` pairs, empty when the input has no tokens.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag; skip the model call entirely.
        return []

    # word_features expects (word, tag) pairs; the tag slot is unused here.
    tokens2 = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # `features` currently covers tokens 0..i-1, so the last predicted
            # tag belongs to token i-1. This must happen before the append
            # below, otherwise prev_tag lags one position behind.
            prev_tag = crf.predict([features])[0][-1]
        features.append(word_features(tokens2, i, prev_tag))

    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
112
+
113
+
114
+ # Example usage
115
# Start-up smoke test: tag one sample sentence and print the (token, tag) pairs.
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)
118
+
119
def tagging(input):
    """Format the predicted tags for display in the Gradio output box.

    Args:
        input: Sentence typed by the user. The parameter name is kept for
            compatibility even though it shadows the builtin.

    Returns:
        A single string of ``word[TAG]`` items separated by one space.
    """
    tagged_list = predict_tags(input)
    # The original iterated over an undefined name (`input_list`), which raised
    # NameError on every call; the pairs to format are in `tagged_list`.
    output = ' '.join(f"{word}[{tag}]" for word, tag in tagged_list)
    return output
123
+
124
+
125
# Text boxes for the sentence to tag and for the tagged result.
sentence_box = gr.Textbox(
    label="Input Sentence",
    placeholder="Enter your sentence here...",
)
tagged_box = gr.Textbox(
    label="Tagged Output",
    placeholder="Tagged sentence appears here...",
)

# Wire the tagging function to the two boxes and start the app.
interface = gr.Interface(
    fn=tagging,
    inputs=sentence_box,
    outputs=tagged_box,
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False, share=True)
138
+