# web_qa/app.py: Streamlit app for question answering over scraped web pages with MarkupLM
import streamlit as st
from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering
import requests
from bs4 import BeautifulSoup
import numpy as np
import torch
import torch.nn.functional as F
# Sliding-window encoding parameters
MAX_LEN = 512  # maximum tokens per window (model limit)
STRIDE = 100   # token overlap between consecutive windows
# Answer filtering parameters
MAX_ANSWER_LEN = 30   # maximum answer span length, in tokens
MIN_CONFIDENCE = 0.9  # minimum start * end probability to keep an answer
# Model name
MODEL_STR = "microsoft/markuplm-base-finetuned-websrc"
# Load markuplm model
processor = MarkupLMProcessor.from_pretrained(MODEL_STR)
model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)
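# Note: Streamlit re-runs this script on every widget interaction, so the model
# is reloaded each time. A common optimization (a sketch, not part of the
# original code) is to wrap loading in a cached function:
#
#   @st.cache_resource
#   def load_model():
#       processor = MarkupLMProcessor.from_pretrained(MODEL_STR)
#       model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)
#       return processor, model
#
#   processor, model = load_model()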
# Send a desktop-browser User-Agent; some sites block or alter content for the default requests UA
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
}
# User input
input_url = st.text_input(
    label="Enter URL of page to scrape",
    value="https://finance.yahoo.com/quote/META/",
    key="url",
)
input_question = st.text_input(
    label="Enter question",
    value="What is the market capitalization?",
    key="question",
)
st.write("Getting html page ...")
# Request page
page = requests.get(input_url, headers=headers)
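# Fail fast on HTTP errors (4xx/5xx) rather than parsing an error page
page.raise_for_status()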
# Parse page with BeautifulSoup
soup = BeautifulSoup(page.content, "html.parser")
# Extract page body
body = soup.find('body')
html_string = str(body)
# Encode question + HTML; long pages are split into overlapping windows of
# MAX_LEN tokens with STRIDE tokens of overlap, truncating only the HTML side
encoding = processor(html_string, questions=input_question, return_tensors="pt",
                     truncation="only_second", stride=STRIDE, max_length=MAX_LEN,
                     return_overflowing_tokens=True, padding=True)
# Drop the overflow-to-sample mapping, which the model's forward() does not
# accept, and zero out token_type_ids
del encoding['overflow_to_sample_mapping']
encoding['token_type_ids'].fill_(0)
# Number of overflow windows, and the index of the first '</s>' token, which
# closes the question segment. Every window repeats the same question prefix,
# so the index from window 0 holds for all of them; valid answer spans must
# start after it
n_segments = encoding['input_ids'].shape[0]
question_index = encoding[0].tokens.index('</s>')
# Run model
with torch.no_grad():
    outputs = model(**encoding)
# Get start and end probabilities
start_probs = F.softmax(outputs.start_logits, dim=1).numpy()
end_probs = F.softmax(outputs.end_logits, dim=1).numpy()
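# start_probs / end_probs have shape (n_segments, seq_len): per-window
# probability distributions over which token position starts / ends the answer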
# Extract and filter answers for each window
answers = []
for i in range(n_segments):
    start_index = np.argmax(start_probs[i])
    end_index = np.argmax(end_probs[i])
    confidence = start_probs[i][start_index] * end_probs[i][end_index]
    # Keep spans that start after the question, are not overlong, and are confident
    if (question_index < start_index < end_index
            and end_index - start_index <= MAX_ANSWER_LEN
            and confidence > MIN_CONFIDENCE):
        predict_answer_tokens = encoding.input_ids[i, start_index : end_index + 1]
        answer = processor.decode(predict_answer_tokens, skip_special_tokens=True)
        answers.append({"answer": answer, "confidence": confidence})
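# Overlapping windows can surface the same span more than once; as a simple
# heuristic (not part of the original logic), keep the highest-confidence
# occurrence of each distinct answer string and sort by confidence
best = {}
for a in answers:
    if a["answer"] not in best or a["confidence"] > best[a["answer"]]["confidence"]:
        best[a["answer"]] = a
answers = sorted(best.values(), key=lambda a: a["confidence"], reverse=True)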
# Print answers
for answer in answers:
    st.write(answer)
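# Tell the user when no span cleared the confidence threshold
if not answers:
    st.write("No answer found above the confidence threshold.")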
st.write("Done!")