mvy committed
Commit: 10b2302
1 Parent(s): 58e3f16
Files changed (2):
  1. app.py +8 -2
  2. ner.py +111 -59
app.py CHANGED
@@ -21,9 +21,15 @@ examples = [
     ],
 ]
 
+ner = NER('knowledgator/UTC-DeBERTa-small')
+
 gradio_app = gr.Interface(
-    NER.ner,
-    inputs = ['text', gr.Textbox(placeholder="Enter sentence here..."), gr.Number(value=0.0, label="treshold")],
+    ner,
+    inputs = [
+        'text',
+        gr.Textbox(placeholder="Enter sentence here..."),
+        gr.Number(value=0.0, label="threshold")
+    ],
     outputs = [gr.HighlightedText()],
     examples=examples,
     theme="huggingface",
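The substantive change: gr.Interface now receives a callable NER instance instead of the unbound NER.ner classmethod, the one-line inputs list is split for readability, and the misspelled "treshold" label is corrected. Below is a minimal sketch of how the three input components map onto the instance call; the direct invocation and the example strings are illustrative assumptions, not part of the commit.

    from ner import NER

    ner = NER('knowledgator/UTC-DeBERTa-small')

    # gr.Interface passes the three components positionally, matching
    # NER.__call__(labels, text, threshold):
    #   'text' component -> comma-separated label string
    #   gr.Textbox       -> the text to tag
    #   gr.Number        -> minimum entity score
    result = ner('person, city', 'Albert Einstein moved to Princeton.', 0.5)

    # result has the shape {"text": ..., "entities": [...]} that
    # gr.HighlightedText expects.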
ner.py CHANGED
@@ -1,78 +1,130 @@
+from typing import Tuple
+import string
+
 from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
 import spacy
 import torch
 
-nlp = spacy.load('en_core_web_sm', disable = ['lemmatizer', 'parser', 'tagger', 'ner'])
-nlp.add_pipe('sentencizer')
-
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-
-
 class NER:
-    model_name = 'knowledgator/UTC-DeBERTa-small'
-    prompt="""
+    prompt: str = """
 Identify entities in the text having the following classes:
 {}
 
 Text:
-"""
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForTokenClassification.from_pretrained(model_name)
-    ner_pipeline = pipeline(
-        "ner",
-        model=model,
-        tokenizer=tokenizer,
-        aggregation_strategy='first',
-        batch_size=12,
-        device=device
-    )
+"""
+
+    def __init__(self, model_name: str, sents_batch: int=10):
+        self.sents_batch = sents_batch
+
+        self.nlp: spacy.Language = spacy.load(
+            'en_core_web_sm',
+            disable = ['lemmatizer', 'parser', 'tagger', 'ner']
+        )
+        self.nlp.add_pipe('sentencizer')
+
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+        self.pipeline = pipeline(
+            "ner",
+            model=model,
+            tokenizer=tokenizer,
+            aggregation_strategy='first',
+            batch_size=12,
+            device=device
+        )
 
-    @classmethod
-    def chunkanize(cls, text, prompt_ = '', n_sents = 10):
-        doc = nlp(text)
+
+    def get_last_sentence_id(self, i: int, sentences_len: int) -> int:
+        return min(i + self.sents_batch, sentences_len) - 1
+
+
+    def chunkanize(self, text: str) -> Tuple[list[str], list[int]]:
+        doc = self.nlp(text)
         chunks = []
         starts = []
-        start = 0
-        end = 0
-        proc = False
-        for id, sent in enumerate(doc.sents, start=1):
-            if not proc:
-                start = sent[0].idx
-                starts.append(start)
-                proc = True
-            end = sent[-1].idx+len(sent[-1].text)
-            if id%n_sents==0:
-                chunk_text = prompt_+text[start:end]
-                chunks.append(chunk_text)
-                proc = False
-        if proc:
-            chunk_text = prompt_+text[start:end]
-            chunks.append(chunk_text)
+        sentences = list(doc.sents)
+
+        for i in range(0, len(sentences), self.sents_batch):
+            start = sentences[i].start_char
+            starts.append(start)
+
+            last_sentence = self.get_last_sentence_id(i, len(sentences))
+            end = sentences[last_sentence].end_char
+
+            chunks.append(text[start:end])
         return chunks, starts
+
+
+    def get_inputs(
+        self, chunks: list[str], labels: list[str]
+    ) -> Tuple[list[str], list[int]]:
+        inputs = []
+        prompts_lens = []
+
+        for label in labels:
+            prompt = self.prompt.format(label)
+            prompts_lens.append(len(prompt))
+            for chunk in chunks:
+                inputs.append(prompt + chunk)
+
+        return inputs, prompts_lens
 
 
     @classmethod
-    def ner(cls, labels, text, treshold = 0.):
-        chunks, starts, classes = [], [], []
-        label2prompt_len = {}
-        for label in labels.split(', '):
-            prompt_ = cls.prompt.format(label)
-            prompt_len = len(prompt_)
-            label2prompt_len[label] = prompt_len
-            curr_chunks, curr_starts = cls.chunkanize(text, prompt_)
-            curr_labels = [label for _ in range(len(curr_chunks))]
-            chunks+=curr_chunks
-            starts+=curr_starts
-            classes+=curr_labels
+    def clean_span(
+        cls, start: int, end: int, span: str
+    ) -> Tuple[int, int, str]:
+        if len(span) >= 1:
+            if span[0] in string.punctuation:
+                return cls.clean_span(start+1, end, span[1:])
+            if span[-1] in string.punctuation:
+                return cls.clean_span(start, end-1, span[:-1])
+        return start, end, span.strip()
+
+
+    def predict(
+        self,
+        text: str,
+        inputs: list[str],
+        labels: list[str],
+        chunks_starts: list[int],
+        prompts_lens: list[int],
+        threshold: float
+    ) -> list[dict[str, any]]:
         outputs = []
-        for id, output in enumerate(cls.ner_pipeline(chunks)):
-            label = classes[id]
-            prompt_len = label2prompt_len[label]
-            start = starts[id]-prompt_len
+
+        for id, output in enumerate(self.pipeline(inputs)):
+            label = labels[id//len(chunks_starts)]
+            shift = chunks_starts[id%len(chunks_starts)] - prompts_lens[id//len(chunks_starts)]
             for ent in output:
-                if ent['score']>treshold:
-                    ent['start'] += start
-                    ent['end'] += start
-                    ent['entity'] = label
-                    outputs.append(ent)
+                start = ent['start'] + shift + 1
+                end = ent['end'] + shift
+                start, end, span = self.clean_span(start, end, text[start:end])
+                if not span:
+                    continue
+
+                if ent['score'] >= threshold:
+                    outputs.append({
+                        'span': span,
+                        'start': start,
+                        'end': end,
+                        'entity': label
+                    })
+        return outputs
+
+
+    def __call__(
+        self, labels: str, text: str, threshold: float=0.
+    ) -> dict[str, any]:
+        labels_list = [label.strip() for label in labels.split(',')]
+
+        chunks, chunks_starts = self.chunkanize(text)
+        inputs, prompts_lens = self.get_inputs(chunks, labels_list)
+
+        outputs = self.predict(
+            text, inputs, labels_list, chunks_starts, prompts_lens, threshold
+        )
+        print(outputs)
         return {"text": text, "entities": outputs}