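"""Streamlit app: extract the word/phrase from a tweet that expresses a given sentiment.

The task is framed as extractive question answering: the chosen sentiment acts as the
"question" and the tweet as the "context", and the fine-tuned RoBERTa model predicts
the answer span. To run locally (assuming Streamlit is installed): streamlit run app.py
"""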
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering
from datasets import Dataset
import streamlit as st
# Load the saved roberta-base tokenizer to turn the text into the input IDs the model expects.
model_checkpoint = "Modfiededition/roberta-fine-tuned-tweet-sentiment-extractor"
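# Cache the tokenizer and model across Streamlit reruns so they are only loaded once.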
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_tokenizer():
    return AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer = load_tokenizer()

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model = load_model()
# Prompts
st.title("Tweet Sentiment Extractor")
# Take the text/tweet input.
textbox = st.text_area('Write your text in this box:', '', height=100, max_chars=500)
option = st.selectbox(
    'Pick the sentiment',
    ('positive', 'negative', 'neutral'))
python_dict = {"text": [textbox], "sentiment": [option]}
dataset = Dataset.from_dict(python_dict)
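# Maximum token length for the tokenized (sentiment, tweet) pair; shorter inputs are
# padded to this size and longer tweets are truncated to it.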
MAX_LENGTH = 105
button = st.button('Click here to extract the word/phrase from the text with the given sentiment: {0}'.format(option))
if button:
    if not textbox:
        st.markdown("#### Please write something in the textbox above.")
    else:
        with st.spinner('In progress...'):
            def process_data(examples):
                # The sentiment acts as the "question" and the tweet as the "context",
                # matching the extractive question-answering setup the model expects.
                questions = examples["sentiment"]
                context = examples["text"]
                inputs = tokenizer(
                    questions,
                    context,
                    max_length=MAX_LENGTH,
                    truncation="only_second",  # truncate only the tweet if the pair runs long
                    padding="max_length",
                    return_offsets_mapping=True,
                )
                # Set the offset mapping to None for every token that is not a context
                # token, so post-processing can skip question and special tokens.
                for i in range(len(inputs["input_ids"])):
                    offset = inputs["offset_mapping"][i]
                    sequence_ids = inputs.sequence_ids(i)
                    inputs["offset_mapping"][i] = [
                        o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
                    ]
                return inputs
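            # Example of the masking above: for the pair ("positive", "I love it"),
            # tokens from "positive" and the special tokens get offset None, while
            # tokens from "I love it" keep their (start, end) character offsets.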
            processed_raw_data = dataset.map(
                process_data,
                batched=True,
            )
            tf_raw_dataset = processed_raw_data.to_tf_dataset(
                columns=["input_ids", "attention_mask"],
                shuffle=False,
                batch_size=1,
            )
            # Final predictions.
            outputs = model.predict(tf_raw_dataset)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Post-processing: use start_logits and end_logits to pick the final
            # answer span from the given context.
            n_best = 20
            def predict_answers(inputs):
                predicted_answer = []
                for i in range(len(inputs["offset_mapping"])):
                    start_logit = inputs["start_logits"][i]
                    end_logit = inputs["end_logits"][i]
                    context = inputs["text"][i]
                    offset = inputs["offset_mapping"][i]
                    # Consider only the n_best highest-scoring start and end positions.
                    start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                    end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                    flag = False
                    for start_index in start_indexes:
                        for end_index in end_indexes:
                            # Skip answers that are not fully inside the context.
                            if offset[start_index] is None or offset[end_index] is None:
                                continue
                            # Skip answers whose end comes before their start.
                            if end_index < start_index:
                                continue
                            flag = True
                            answer = context[offset[start_index][0] : offset[end_index][1]]
                            predicted_answer.append(answer)
                            break
                        if flag:
                            break
                    if not flag:
                        # No valid span was found; fall back to an empty answer.
                        predicted_answer.append("")
                return {"predicted_answer": predicted_answer}
            processed_raw_data.set_format("pandas")
            processed_raw_df = processed_raw_data[:]
            processed_raw_df["start_logits"] = start_logits.tolist()
            processed_raw_df["end_logits"] = end_logits.tolist()
            processed_raw_df["text"] = python_dict["text"]
            final_data = Dataset.from_pandas(processed_raw_df)
            final_data = final_data.map(predict_answers, batched=True)
            st.markdown("## " + final_data["predicted_answer"][0])