Spaces:
Sleeping
Sleeping
madhavkotecha
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -6,4 +6,133 @@ import gradio as gr
|
|
6 |
|
7 |
nltk.download('brown')
|
8 |
nltk.download('universal_tagset')
|
9 |
-
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
# Fetch the Brown corpus and the universal-tagset mapping (no-ops if cached).
nltk.download('brown')
nltk.download('universal_tagset')

# Brown corpus as sentences of (word, universal-tag) pairs.
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')

# Hand-tagged sample sentence that overwrites one corpus entry below.
sentence = [
    ('The', 'DET'),
    ('dog', 'NOUN'),
    ('jumps', 'VERB'),
    ('over', 'ADP'),
    ('the', 'DET'),
    ('car', 'NOUN'),
]

# Materialize the lazy corpus view so a single entry can be replaced.
corpus = list(corpus)
corpus[21058] = sentence
def word_features(sentence, i, prev_tag):
    """Build the CRF feature dict for the word at position ``i`` of ``sentence``.

    Parameters
    ----------
    sentence : list of (word, tag) pairs; only the word part is read here.
    i : int
        Index of the target word within ``sentence``.
    prev_tag : str
        Tag of the previous word ('' for the first word).

    Returns
    -------
    dict mapping feature names to str/bool values.
    """
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0,  # first word of the sentence
        'is_last': i == len(sentence) - 1,  # last word of the sentence
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,  # word is in uppercase
        'is_all_lower': word.lower() == word,  # word is in lowercase
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # BUG FIX: the original compared slices of the wrong length
        # (e.g. word[:4] == 'dis', word[:3] == 'de'), so most of these
        # prefix flags could never be True. str.startswith is exact.
        'prefix-un': word.startswith('un'),
        'prefix-re': word.startswith('re'),
        'prefix-over': word.startswith('over'),
        'prefix-dis': word.startswith('dis'),
        'prefix-mis': word.startswith('mis'),
        'prefix-pre': word.startswith('pre'),
        'prefix-non': word.startswith('non'),
        'prefix-de': word.startswith('de'),
        'prefix-in': word.startswith('in'),
        'prefix-en': word.startswith('en'),
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word.endswith('ed'),  # past-tense marker
        'suffix-ing': word.endswith('ing'),  # gerund marker
        'suffix-es': word.endswith('es'),
        'suffix-s': word.endswith('s'),  # plural / 3rd-person marker
        'suffix-ly': word.endswith('ly'),  # adverb marker
        'suffix-ment': word.endswith('ment'),
        'suffix-er': word.endswith('er'),
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        'next_word': '' if i == len(sentence) - 1 else sentence[i + 1][0],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:],
        # NOTE(review): duplicates 'is_capitalized'; kept so the feature
        # set (and any saved model) stays compatible.
        'is_first_capital': word[0].upper() == word[0],
        'suffix-s_and-prev_tag_noun': word.endswith('s') and prev_tag == 'NOUN',
        'prev_tag': prev_tag,
    }
    return features
# Turn each tagged sentence into a parallel (feature-dict list, tag list)
# pair; the gold tag of the previous token feeds the prev_tag feature.
X = []
y = []
for sentence in corpus:
    feats = []
    labels = []
    for idx in range(len(sentence)):
        prev = '' if idx == 0 else sentence[idx - 1][1]
        feats.append(word_features(sentence, idx, prev))
        labels.append(sentence[idx][1])
    X.append(feats)
    y.append(labels)
# Hold out the final 20% of sentences for evaluation.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Fit a linear-chain CRF with L-BFGS and elastic-net regularization.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization weight
    c2=0.1,  # L2 regularization weight
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Token-level accuracy on the held-out sentences.
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))
def predict_tags(sentence):
    """Tag a raw sentence string with the trained CRF.

    The sentence is whitespace-tokenized; tags are decoded left to right,
    feeding each predicted tag back in as the next token's ``prev_tag``
    feature.

    Parameters
    ----------
    sentence : str

    Returns
    -------
    list of (token, predicted_tag) pairs.
    """
    tokens = sentence.split()
    # Dummy (word, '') pairs so word_features can index words uniformly.
    pairs = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    # BUG FIX: the original updated prev_tag only AFTER building token i's
    # features, so token i saw the tag of token i-2 (and an unused
    # prev_prev_tag variable hinted at the confusion). Update first.
    for i in range(len(tokens)):
        if i > 0:
            # Decode the prefix built so far to get the tag of token i-1.
            prev_tag = crf.predict([features])[0][i - 1]
        features.append(word_features(pairs, i, prev_tag))

    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
114 |
+
# Example usage: tag a fresh sentence and show the (token, tag) pairs.
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)
119 |
+
def tagging(input):
    """Gradio handler: annotate each token of ``input`` as ``word[TAG]``.

    BUG FIX: the original joined over an undefined name ``input_list``,
    raising NameError on every call; it must iterate the predicted pairs.
    """
    tagged_list = predict_tags(input)
    output = ' '.join(f"{word}[{tag}] " for word, tag in tagged_list)
    return output
125 |
+
# Gradio UI: one textbox in, tagged sentence out.
interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False, share=True)