Spaces:

cswamy
/

named_entity_recognition

Runtime error

App Files Files Community

cswamy committed on Sep 10, 2023

Commit

84c08ac

•

1 Parent(s): 98656aa

first commit

Browse files

Files changed (6) hide show

.gitattributes +1 -0
app.py +127 -0
bertbasecased_finetuned_conll.pth +3 -0
class_names.txt +9 -0
model.py +23 -0
requirements.txt +2 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+bertbasecased_finetuned_conll.pth filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gradio as gr
+import os
+import re
+import torch
+from model import create_bertcased_ner
+from timeit import default_timer as timer
+from typing import Tuple, Dict
# Read class names from class_names.txt.
# Blank lines are skipped so that a trailing newline (or stray empty line) in
# the file cannot add an empty class and change the number of labels, which
# would make the saved state dict below fail to load.
with open("class_names.txt", "r", encoding="utf-8") as f:
  class_names = [line.strip() for line in f if line.strip()]
# Setup model and tokenizer
model, tokenizer = create_bertcased_ner(class_names)
# Load the fine-tuned weights onto the CPU
model.load_state_dict(
    torch.load(
        f="bertbasecased_finetuned_conll.pth",
        map_location=torch.device("cpu")
    ))
# The app only runs inference, so make evaluation mode explicit
# (this disables dropout).
model.eval()
# Predict function
# Display names for the entity types, ordered to match the B-/I- label id
# pairs (1-2, 3-4, 5-6, 7-8) that follow the non-entity label id 0.
_ENTITY_CATEGORIES = ("Person", "Organisation", "Location", "Miscellaneous")


def predict(new_text:str,
            model:torch.nn.Module,
            tokenizer,
            device:torch.device) -> Tuple[list, float]:
  """
  Function for named entity recognition on new text.

  The text is split on whitespace, every word is labelled by the model and
  adjacent words that belong to the same entity are merged into one result.

  Args:
    new_text(str): A new sentence to classify entities on.
    model(torch.nn.Module): Trained pytorch model for NER. It is called with
      the tokenizer output as keyword arguments and must return an object
      with a `logits` attribute.
    tokenizer: tokenizer for the model. Its output must provide `items()`
      and `word_ids()`.
    device(torch.device): Device the input tensors are moved to.
  Returns:
    Tuple of (results, pred_time). `results` is a list of dicts with the keys
    "word" (entity text with punctuation removed) and "pred" (one of
    "Person", "Organisation", "Location", "Miscellaneous"). `pred_time` is
    the elapsed time in seconds rounded to 5 decimals.
  """
  # Start timer
  start_time = timer()
  words = new_text.split()
  # Nothing to tokenize for empty / whitespace-only input
  if not words:
    return [], round(timer() - start_time, 5)
  # truncation=True keeps over-long inputs within the model's maximum length
  # instead of failing inside the model.
  encoding = tokenizer(words, is_split_into_words=True, truncation=True)
  model_inputs = {k: torch.tensor(v).unsqueeze(dim=0).to(device) for k, v in encoding.items()}
  # Inference only: no autograd graph is needed
  with torch.no_grad():
    outputs = model(**model_inputs)
  label_ids = torch.argmax(outputs.logits, dim=-1).squeeze(dim=0).tolist()
  # Remove CLS and SEP positions from both lists
  word_ids = encoding.word_ids()[1:-1]
  label_ids = label_ids[1:-1]
  # A word takes the label of its first sub-token that is not a non-entity
  # (label id 0); words without any entity sub-token are left out.
  word_labels = {}
  for word_id, label_id in zip(word_ids, label_ids):
    if word_id is None or label_id == 0:
      continue
    word_labels.setdefault(word_id, label_id)
  # Create list with words from original text and predictions
  results_list = []
  last_word_id = None  # index of the last word added to the newest entity
  for word_id, label_id in word_labels.items():
    # Label ids come in (B-, I-) pairs per category; ids past the known
    # range fall back to the last category.
    category_ix = min((label_id - 1) // 2, len(_ENTITY_CATEGORIES) - 1)
    category = _ENTITY_CATEGORIES[category_ix]
    clean_word = re.sub(r'[^\w\s]', '', words[word_id])
    # Even ids are I- (inside) tags. They extend the newest entity only when
    # the word directly follows it and has the same category; an I- tag with
    # nothing to attach to starts a new entity instead of raising.
    extends_previous = (label_id % 2 == 0
                        and bool(results_list)
                        and last_word_id == word_id - 1
                        and results_list[-1]["pred"] == category)
    if extends_previous:
      results_list[-1]["word"] += ' ' + clean_word
    else:
      results_list.append({"word": clean_word, "pred": category})
    last_word_id = word_id
  # Calculate prediction time
  pred_time = round(timer() - start_time, 5)
  return results_list, pred_time
# Create custom display function
def display(results_list):
    """
    Render prediction results as an HTML table.

    Args:
      results_list: Iterable of dicts with the keys "word" and "pred".
    Returns:
      HTML string with a header row followed by one row per item. Cell
      values are HTML-escaped because they originate from user-entered text.
    """
    from html import escape

    rows = "".join(
        f"<tr><td>{escape(str(item['word']))}</td><td>{escape(str(item['pred']))}</td></tr>"
        for item in results_list
    )
    return "<table><tr><th>Word</th><th>Prediction</th></tr>" + rows + "</table>"
# Create examples list
examples_list = ["Barack Obama was the 44th President of the United States.",
                "Islington is a borough in the city of London.",
                "United Nations is headquartered in New York City."]
# Create Gradio app
title = "Named Entity Recognition 🔎"
description = "Bert finetuned model for named entity recognition!"
article = "Finetuned on the conll2003 dataset"


def run_ner(new_text):
    """
    Adapter between the single Gradio textbox and `predict`.

    `predict` needs the model, tokenizer and device in addition to the text,
    and returns a list of dicts; the interface supplies only the text and
    expects an HTML string plus a number.

    Args:
      new_text: Sentence entered by the user.
    Returns:
      Tuple of (HTML table of the recognised entities, prediction time in
      seconds).
    """
    results_list, pred_time = predict(new_text=new_text,
                                      model=model,
                                      tokenizer=tokenizer,
                                      device=torch.device("cpu"))
    return display(results_list), pred_time


demo = gr.Interface(fn=run_ner,
                    inputs=gr.Textbox(placeholder="Enter sentence here..."),
                    outputs=[gr.HTML(),
                             gr.Number(label="Prediction_time (sec)")],
                    examples=examples_list,
                    title=title,
                    description=description,
                    article=article)
# Launch gradio
demo.launch()

bertbasecased_finetuned_conll.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0bcbad40b0b3f525fccddac6366579b3e80e1565ed799ccea8b7556e9f8c1ca
+size 430991289

class_names.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+O
+B-PER
+I-PER
+B-ORG
+I-ORG
+B-LOC
+I-LOC
+B-MISC
+I-MISC

model.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from transformers import AutoModelForTokenClassification, AutoTokenizer
def create_bertcased_ner(class_names):
  """
  Build the tokenizer and token-classification model from the
  "bert-base-cased" checkpoint.

  Args:
    class_names: List of class names; the position of a name in the list is
      used as its label id.
  Returns:
    Tuple of (model, tokenizer).
  """
  checkpoint = "bert-base-cased"
  # Label maps in both directions, keyed by position in class_names
  id2label = dict(enumerate(class_names))
  label2id = {label: idx for idx, label in id2label.items()}
  # Tokenizer first, then the model configured with the label maps
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = AutoModelForTokenClassification.from_pretrained(
      checkpoint,
      id2label=id2label,
      label2id=label2id,
  )
  return model, tokenizer

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch==1.12.0
2	+ gradio==3.1.4