Spaces:

vonewman
/

ner_app

Runtime error

App Files Community

vonewman committed on Oct 28, 2023

Commit

f5b6e30

1 Parent(s): bbf7868

use predict_ner_labels

Browse files

Files changed (1) hide show

app.py +53 -72

app.py CHANGED Viewed

@@ -1,125 +1,106 @@
 import streamlit as st
 import pandas as pd
-import numpy as np
 import re
 import json
-import base64
-import uuid
 import transformers
-from datasets import Dataset,load_dataset, load_from_disk
 from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
 st.set_page_config(
-    page_title="Named Entity Recognition Wolof", page_icon="📘"
 )
-def convert_df(df:pd.DataFrame):
-     return df.to_csv(index=False).encode('utf-8')
-#@st.cache
-def convert_json(df:pd.DataFrame):
     result = df.to_json(orient="index")
     parsed = json.loads(result)
     json_string = json.dumps(parsed)
-    #st.json(json_string, expanded=True)
     return json_string
-st.title("📘Named Entity Recognition Wolof")
-@st.cache(allow_output_mutation=True)
 def load_model():
     model = AutoModelForTokenClassification.from_pretrained("vonewman/wolof-finetuned-ner")
     trainer = Trainer(model=model)
     tokenizer = AutoTokenizer.from_pretrained("vonewman/wolof-finetuned-ner")
     return trainer, model, tokenizer
-id2tag = {0: 'O',
-         1: 'B-LOC',
-         2: 'B-PER',
-         3: 'I-PER',
-         4: 'B-ORG',
-         5: 'I-DATE',
-         6: 'B-DATE',
-         7: 'I-ORG',
-         8: 'I-LOC'
-        }
-def tag_sentence(text:str):
-      # convert our text to a tokenized sequence
-      inputs = tokenizer(text, truncation=True, return_tensors="pt")
-      # get outputs
-      outputs = model(**inputs)
-      # convert to probabilities with softmax
-      probs = outputs[0][0].softmax(1)
-      # get the tags with the highest probability
-      word_tags = [(tokenizer.decode(inputs['input_ids'][0][i].item()), id2tag[tagid.item()], np.round(probs[i][tagid].item() *100,2) )
-                    for i, tagid in enumerate (probs.argmax(axis=1))]
-      df=pd.DataFrame(word_tags, columns=['word', 'tag', 'probability'])
-      return df
-with st.form(key='my_form'):
     x1 = st.text_input(label='Enter a sentence:', max_chars=250)
-    print(x1)
     submit_button = st.form_submit_button(label='🏷️ Create tags')
 if submit_button:
-    if re.sub('\s+','',x1)=='':
         st.error('Please enter a non-empty sentence.')
     elif re.match(r'\A\s*\w+\s*\Z', x1):
         st.error("Please enter a sentence with at least one word")
     else:
         st.markdown("### Tagged Sentence")
         st.header("")
-        Trainer, model, tokenizer = load_model()
-        results=tag_sentence(x1)
         cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
         with c1:
-            #csvbutton = download_button(results, "results.csv", "📥 Download .csv")
-            csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(results),
-                                           file_name= "results.csv", mime='text/csv', key='csv')
         with c2:
-            #textbutton = download_button(results, "results.txt", "📥 Download .txt")
-            textbutton = st.download_button(label="📥 Download .txt", data=convert_df(results),
-                                            file_name= "results.text", mime='text/plain',  key='text')
         with c3:
-            #jsonbutton = download_button(results, "results.json", "📥 Download .json")
-            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results),
-                                            file_name= "results.json", mime='application/json',  key='json')
         st.header("")
         c1, c2, c3 = st.columns([1, 3, 1])
-        with c2:
-             st.table(results.style.background_gradient(subset=['probability']).format(precision=2))
 st.header("")
 st.header("")
 st.header("")
 with st.expander("ℹ️ - About this app", expanded=True):
     st.write(
         """
 -   The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.
--   The available entitites are: *corporation*, *location*, *person* and *date*.
--   The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
--   The model uses the **byte-level BPE tokenizer**. Each sentece is first tokenized.
-       """
-    )

 import streamlit as st
 import pandas as pd
 import re
 import json
 import transformers
+import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
 st.set_page_config(
+    page_title="Named Entity Recognition Wolof",
+    page_icon="📘"
 )
+def convert_df(df: pd.DataFrame):
+    return df.to_csv(index=False).encode('utf-8')
+def convert_json(df: pd.DataFrame):
     result = df.to_json(orient="index")
     parsed = json.loads(result)
     json_string = json.dumps(parsed)
     return json_string
 def load_model():
     model = AutoModelForTokenClassification.from_pretrained("vonewman/wolof-finetuned-ner")
     trainer = Trainer(model=model)
     tokenizer = AutoTokenizer.from_pretrained("vonewman/wolof-finetuned-ner")
     return trainer, model, tokenizer
+def predict_ner_labels(model, tokenizer, sentence):
+    use_cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+    if use_cuda:
+        model = model.cuda()
+    text = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
+    mask = text['attention_mask'].to(device)
+    input_id = text['input_ids'].to(device)
+    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)
+    logits = model(input_id, mask, None)
+    logits_clean = logits[0][label_ids != -100]
+    predictions = logits_clean.argmax(dim=1).tolist()
+    prediction_label = [id2tag[i] for i in predictions]
+    return prediction_label
+id2tag = {0: 'O', 1: 'B-LOC', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-DATE', 6: 'B-DATE', 7: 'I-ORG', 8: 'I-LOC'}
+def tag_sentence(text):
+    trainer, model, tokenizer = load_model()
+    predictions = predict_ner_labels(model, tokenizer, text)
+    df = pd.DataFrame(predictions, columns=['tag'])
+    df['word'] = text.split()
+    df['probability'] = 100.0  # Vous pouvez ajuster cette valeur selon vos besoins
+    return df
+st.title("📘 Named Entity Recognition Wolof")
+with st.form(key='my_form'):
     x1 = st.text_input(label='Enter a sentence:', max_chars=250)
     submit_button = st.form_submit_button(label='🏷️ Create tags')
 if submit_button:
+    if re.sub('\s+', '', x1) == '':
         st.error('Please enter a non-empty sentence.')
     elif re.match(r'\A\s*\w+\s*\Z', x1):
         st.error("Please enter a sentence with at least one word")
     else:
         st.markdown("### Tagged Sentence")
         st.header("")
+        results = tag_sentence(x1)
         cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
         with c1:
+            csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(results),
+                                           file_name="results.csv", mime='text/csv', key='csv')
         with c2:
+            textbutton = st.download_button(label="📥 Download .txt", data=convert_df(results),
+                                            file_name="results.text", mime='text/plain', key='text')
         with c3:
+            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results),
+                                            file_name="results.json", mime='application/json', key='json')
         st.header("")
         c1, c2, c3 = st.columns([1, 3, 1])
+        with c2:
+            st.table(results.style.background_gradient(subset=['probability']).format(precision=2))
 st.header("")
 st.header("")
 st.header("")
 with st.expander("ℹ️ - About this app", expanded=True):
     st.write(
         """
 -   The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.
+-   The available entities are: *corporation*, *location*, *person*, and *date*.
+-   The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
+-   The model uses the **byte-level BPE tokenizer**. Each sentence is first tokenized.
+        """
+    )