Spaces:

vonewman
/

ner_app

Runtime error

App Files Files Community

ner_app / app.py

vonewman

Update app.py

345d423 11 months ago

raw

history blame contribute delete

No virus

4.59 kB

	import streamlit as st
	import pandas as pd
	import re
	import json
	import transformers
	import torch
	from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

	# Browser-tab title and icon; Streamlit expects this before other st.* calls.
	st.set_page_config(page_icon="📘", page_title="Named Entity Recognition Wolof")

	def convert_df(df: pd.DataFrame):
	    """Serialize *df* as CSV (without the index column) and return UTF-8 bytes."""
	    csv_text = df.to_csv(index=False)
	    return csv_text.encode('utf-8')

	def convert_json(df: pd.DataFrame):
	    """Return *df* as a JSON string keyed by row index.

	    The pandas output is parsed and re-dumped with the standard ``json``
	    module, so the returned text uses ``json.dumps`` default formatting.
	    """
	    by_row = json.loads(df.to_json(orient="index"))
	    return json.dumps(by_row)

	@st.cache_resource
	def load_model():
	    """Load the fine-tuned Wolof NER checkpoint and return it ready for use.

	    The result is cached with ``st.cache_resource`` so the checkpoint is
	    loaded once per server process instead of on every call; without the
	    cache each request reloads the weights several times, because more than
	    one function in this file calls ``load_model``.

	    Returns:
	        A ``(trainer, model, tokenizer)`` tuple. The ``Trainer`` is kept in
	        the tuple only so existing callers that unpack three values keep
	        working.
	    """
	    checkpoint = "vonewman/wolof-finetuned-ner"
	    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
	    trainer = Trainer(model=model)
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    return trainer, model, tokenizer

	def align_word_ids(texts, tokenizer=None, label_all_tokens=False):
	    """Build a per-token marker list telling which tokens carry a prediction.

	    The text is tokenized with the same settings used for inference
	    (padding/truncation to 218 tokens). Each token position receives:

	    * ``-100`` when the tokenizer reports no word for it (``word_ids()``
	      yields ``None``),
	    * ``1`` for the first token of each word,
	    * ``1`` for later tokens of the same word when ``label_all_tokens`` is
	      true, otherwise ``-100``.

	    Args:
	        texts: Input passed straight to the tokenizer.
	        tokenizer: Optional tokenizer to use. When omitted, the one returned
	            by ``load_model()`` is used, as before.
	        label_all_tokens: Whether continuation tokens of a word are kept.
	            Defaults to ``False``, which matches the previous effective
	            behaviour (the name used to be undefined and the resulting
	            ``NameError`` was silently turned into ``-100``).

	    Returns:
	        A list of ints (``1`` or ``-100``), one per token position.
	    """
	    if tokenizer is None:
	        # Backward-compatible path for callers that pass only the text.
	        _trainer, _model, tokenizer = load_model()

	    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=218, truncation=True)
	    continuation_label = 1 if label_all_tokens else -100

	    label_ids = []
	    previous_word_idx = None
	    for word_idx in tokenized_inputs.word_ids():
	        if word_idx is None:
	            label_ids.append(-100)
	        elif word_idx != previous_word_idx:
	            # First token of a new word.
	            label_ids.append(1)
	        else:
	            # Further token of the word already seen at the previous position.
	            label_ids.append(continuation_label)
	        previous_word_idx = word_idx
	    return label_ids

	def predict_ner_labels(model, tokenizer, sentence):
	    """Predict one NER tag per kept token position of *sentence*.

	    The sentence is tokenized (padded/truncated to 218 tokens), run through
	    *model*, and the arg-max class of every position that
	    ``align_word_ids`` does not mark with ``-100`` is mapped through
	    ``id2tag``.

	    Args:
	        model: Token-classification model; it is moved to CUDA when available.
	        tokenizer: Tokenizer matching *model*.
	        sentence: Raw input text.

	    Returns:
	        A list of tag strings, in token order.
	    """
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model = model.to(device)

	    encoding = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
	    attention_mask = encoding['attention_mask'].to(device)
	    input_ids = encoding['input_ids'].to(device)

	    # Add a leading batch dimension so the mask lines up with the model output.
	    label_ids = torch.tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

	    # Inference only: skip autograd bookkeeping to save memory and time.
	    with torch.no_grad():
	        outputs = model(input_ids, attention_mask, None)

	    # NOTE(review): assumes the first element of the model output is the
	    # logits tensor (true when no labels are supplied, so no loss is
	    # returned) - confirm for the deployed checkpoint.
	    kept_logits = outputs[0][label_ids != -100]
	    predictions = kept_logits.argmax(dim=1).tolist()
	    return [id2tag[i] for i in predictions]

	# Maps the model's class index (position in this list) to its tag string.
	id2tag = dict(enumerate(
	    ['O', 'B-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-DATE', 'B-DATE', 'I-ORG', 'I-LOC']
	))

	def tag_sentence(text):
	    """Tag *text* and return a DataFrame with ``words`` and ``tags`` columns.

	    Words are obtained with ``text.split()`` and paired positionally with
	    the predicted tags. If the two sequences differ in length (for example
	    when the input was truncated by the tokenizer), only the positions
	    present in both are kept, instead of raising ``ValueError`` while
	    building the DataFrame.

	    Args:
	        text: Sentence to tag.

	    Returns:
	        ``pd.DataFrame`` with one row per (word, tag) pair.
	    """
	    _trainer, model, tokenizer = load_model()
	    predictions = predict_ner_labels(model, tokenizer, text)
	    # Build a DataFrame with the "words" and "tags" columns; zip stops at the
	    # shorter sequence so mismatched lengths cannot crash the app.
	    rows = list(zip(text.split(), predictions))
	    df = pd.DataFrame(rows, columns=['words', 'tags'])
	    return df

	st.title("📘 Named Entity Recognition Wolof")

	# Input form: the script reruns with submit_button == True once the user
	# presses the button.
	with st.form(key='my_form'):
	    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
	    submit_button = st.form_submit_button(label='🏷️ Create tags')

	if submit_button:
	    # Raw string: '\s' in a normal literal is an invalid escape sequence.
	    if re.sub(r'\s+', '', x1) == '':
	        st.error('Please enter a non-empty sentence.')
	    # NOTE(review): this pattern only matches a single word wrapped in exactly
	    # one whitespace character on each side, which does not obviously agree
	    # with the error message below - confirm the intended validation.
	    elif re.match(r'\A\s\w+\s\Z', x1):
	        st.error("Please enter a sentence with at least one word")
	    else:
	        st.markdown("### Tagged Sentence")
	        st.header("")
	        results = tag_sentence(x1)

	        # Three download buttons in the middle columns; the outer two columns
	        # are only spacers.
	        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
	        with c1:
	            csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(results),
	                                           file_name="results.csv", mime='text/csv', key='csv')
	        with c2:
	            # The file name now uses the ".txt" extension announced by the label
	            # (it was "results.text"); the content is the same CSV text.
	            textbutton = st.download_button(label="📥 Download .txt", data=convert_df(results),
	                                            file_name="results.txt", mime='text/plain', key='text')
	        with c3:
	            jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results),
	                                            file_name="results.json", mime='application/json', key='json')

	        st.header("")
	        # Show the tagged words centred in the page.
	        c1, c2, c3 = st.columns([1, 3, 1])
	        with c2:
	            st.table(results[['words', 'tags']])

	# Three empty headers act as vertical spacing above the footer.
	for _ in range(3):
	    st.header("")

	# Collapsible description of the app, expanded by default.
	about_box = st.expander("ℹ️ - About this app", expanded=True)
	with about_box:
	    st.write(
	        """
	- The Named Entity Recognition Wolof app is a tool that performs named entity recognition in Wolof.
	- The available entities are: corporation, location, person, and date.
	- The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
	- The model uses the byte-level BPE tokenizer. Each sentence is first tokenized.
	"""
	    )