Spaces:

naveed92
/

web_qa

Sleeping

App Files Files Community

naveed92 committed on Jul 22, 2023

Commit

8c0af92

•

1 Parent(s): 454903d

Create app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import streamlit as st
+from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering
+import requests
+from bs4 import BeautifulSoup
+import numpy as np
+import torch
+import torch.nn.functional as F
+# Prediction Parameters
+MAX_LEN = 512
+STRIDE = 100
+# Answer filtering parameters
+MAX_ANSWER_LEN = 30
+MIN_CONFIDENCE = 0.9
+# Model name
+MODEL_STR = "microsoft/markuplm-base-finetuned-websrc"
+# Load markuplm model
+processor = MarkupLMProcessor.from_pretrained(MODEL_STR)
+model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)
+headers = {
+    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
+}
+# User Input
+input_url = st.text_input(
+    label="Enter url of page to scrape",
+    value="https://www.opentable.com/carlo-and-johnny",
+    key="url",
+)
+input_question = st.text_input(
+    label="Enter Question",
+    value="What is the food on the menu?",
+    key="question",
+)
+st.write("Getting html page ...")
+# Request page
+page = requests.get(input_url, headers=headers)
+# Parse page with beautifulsoup
+soup = BeautifulSoup(page.content, "html.parser")
+# Extract page body
+body = soup.find('body')
+html_string = str(body)
+len(html_string)
+# Process input string
+encoding = processor(html_string, questions=input_question, return_tensors="pt", truncation="only_second",
+                     stride=STRIDE, max_length=MAX_LEN, return_overflowing_tokens=True, padding=True)
+# Postprocess encoding
+del encoding['overflow_to_sample_mapping']
+encoding['token_type_ids'] = encoding['token_type_ids'].fill_(0)
+# Keep index of question for future use
+n_segments = encoding['input_ids'].shape[0]
+question_index = encoding[0].tokens.index('</s>')
+# Run model
+with torch.no_grad():
+    outputs = model(**encoding)
+# Get start and end probabilities
+start_probs = F.softmax(outputs.start_logits, dim=1).numpy()
+end_probs = F.softmax(outputs.end_logits, dim=1).numpy()
+# Extract and filter answers for each window
+answers = []
+for i in range(n_segments):
+    start_index = np.argmax(start_probs[i])
+    end_index = np.argmax(end_probs[i])
+    confidence = max(start_probs[i]) * max(end_probs[i])
+    if end_index > start_index and end_index - start_index <= MAX_ANSWER_LEN and start_index > question_index and end_index > question_index and confidence > MIN_CONFIDENCE:
+        predict_answer_tokens = encoding.input_ids[0, start_index : end_index + 1]
+        answer = processor.decode(predict_answer_tokens, skip_special_tokens=True)
+        answers.append({"answer": answer, "confidence": confidence})
+# Print answers
+for answer in answers:
+    st.write(answer)
+st.write("Done!")