# web_qa / app.py
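# Run locally with: streamlit run app.py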
import streamlit as st
from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering
import requests
from bs4 import BeautifulSoup
import numpy as np
import torch
import torch.nn.functional as F
# Sliding-window parameters: long pages are split into overlapping windows
MAX_LEN = 512  # maximum tokens per window (model limit)
STRIDE = 100   # tokens of overlap between consecutive windows
# Answer filtering parameters
MAX_ANSWER_LEN = 30    # maximum answer span length, in tokens
MIN_CONFIDENCE = 0.9   # minimum start*end probability to keep a span
# MarkupLM checkpoint fine-tuned for question answering on WebSRC
MODEL_STR = "microsoft/markuplm-base-finetuned-websrc"
# Load markuplm model
processor = MarkupLMProcessor.from_pretrained(MODEL_STR)
model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)
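# Browser-like User-Agent; many sites block the default python-requests agent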
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
}
# User Input
input_url = st.text_input(
    label="Enter URL of page to scrape",
    value="https://finance.yahoo.com/quote/META/",
    key="url",
)
input_question = st.text_input(
    label="Enter Question",
    value="What is the market capitalization?",
    key="question",
)
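# Streamlit reruns the whole script on every input change, so the scrape
# and model inference below execute each time a field is edited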
st.write("Getting html page ...")
# Request page and fail loudly on HTTP errors
page = requests.get(input_url, headers=headers, timeout=10)
page.raise_for_status()
# Parse page with beautifulsoup
soup = BeautifulSoup(page.content, "html.parser")
# Extract page body
body = soup.find('body')
html_string = str(body)
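# The processor tokenizes the question and HTML together; truncation applies
# only to the HTML ("only_second"), and return_overflowing_tokens splits long
# pages into windows of MAX_LEN tokens that overlap by STRIDE tokens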
encoding = processor(
    html_string,
    questions=input_question,
    return_tensors="pt",
    truncation="only_second",
    stride=STRIDE,
    max_length=MAX_LEN,
    return_overflowing_tokens=True,
    padding=True,
)
# Postprocess encoding: the model's forward() does not accept the
# overflow_to_sample_mapping key, so drop it
del encoding['overflow_to_sample_mapping']
# Zero out token_type_ids across all windows (fill_ modifies in place)
encoding['token_type_ids'].fill_(0)
# Number of overlapping windows, and the index of the first </s> token,
# which marks the end of the question; answers must start after it
n_segments = encoding['input_ids'].shape[0]
question_index = encoding[0].tokens.index('</s>')
# Run model
with torch.no_grad():
    outputs = model(**encoding)
# Get start and end probabilities
start_probs = F.softmax(outputs.start_logits, dim=1).numpy()
end_probs = F.softmax(outputs.end_logits, dim=1).numpy()
# Extract and filter answers for each window
answers = []
for i in range(n_segments):
    start_index = np.argmax(start_probs[i])
    end_index = np.argmax(end_probs[i])
    confidence = start_probs[i].max() * end_probs[i].max()
    # Keep spans that are well-formed, short enough, confident enough, and
    # located after the question separator (i.e. inside the HTML)
    if (
        start_index > question_index
        and start_index < end_index
        and end_index - start_index <= MAX_ANSWER_LEN
        and confidence > MIN_CONFIDENCE
    ):
        # Decode the predicted span from this window
        predict_answer_tokens = encoding.input_ids[i, start_index : end_index + 1]
        answer = processor.decode(predict_answer_tokens, skip_special_tokens=True)
        answers.append({"answer": answer, "confidence": confidence})
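# Show the most confident candidate first; overlapping windows can produce
# duplicate or near-duplicate spans
answers = sorted(answers, key=lambda a: a["confidence"], reverse=True)
if not answers:
    st.write("No answer found above the confidence threshold.")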
# Print answers
for answer in answers:
    st.write(answer)
st.write("Done!")