import streamlit as st
from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering
import requests
from bs4 import BeautifulSoup
import numpy as np
import torch
import torch.nn.functional as F

# Prediction parameters
MAX_LEN = 512
STRIDE = 100

# Answer filtering parameters
MAX_ANSWER_LEN = 30
MIN_CONFIDENCE = 0.9

# Model name
MODEL_STR = "microsoft/markuplm-base-finetuned-websrc"

# Load the MarkupLM processor and QA model, cached so Streamlit reruns
# don't reload the weights on every widget interaction
@st.cache_resource
def load_model():
    processor = MarkupLMProcessor.from_pretrained(MODEL_STR)
    model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)
    return processor, model

processor, model = load_model()

# Send a browser-like user agent so sites are less likely to block the request
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
}

# User input
input_url = st.text_input(
    label="Enter url of page to scrape",
    value="https://finance.yahoo.com/quote/META/",
    key="url",
)
input_question = st.text_input(
    label="Enter question",
    value="What is the market capitalization?",
    key="question",
)

st.write("Getting html page ...")

# Request the page
page = requests.get(input_url, headers=headers)

# Parse the page with BeautifulSoup and keep only the <body>
soup = BeautifulSoup(page.content, "html.parser")
body = soup.find("body")
html_string = str(body)

# Tokenize question + HTML; pages longer than MAX_LEN tokens are split into
# overlapping windows ("only_second" truncates the HTML, never the question)
encoding = processor(
    html_string,
    questions=input_question,
    return_tensors="pt",
    truncation="only_second",
    stride=STRIDE,
    max_length=MAX_LEN,
    return_overflowing_tokens=True,
    padding=True,
)

# Postprocess encoding: drop the overflow mapping (only needed when batching
# multiple documents) and zero out the segment ids, as the model expects
del encoding["overflow_to_sample_mapping"]
encoding["token_type_ids"].fill_(0)

# Index of the token that closes the question ("</s>" in the RoBERTa-style
# vocabulary MarkupLM uses); valid answers must start after it
n_segments = encoding["input_ids"].shape[0]
question_index = encoding[0].tokens.index("</s>")

# Run the model
with torch.no_grad():
    outputs = model(**encoding)

# Convert start/end logits to probabilities
start_probs = F.softmax(outputs.start_logits, dim=1).numpy()
end_probs = F.softmax(outputs.end_logits, dim=1).numpy()

# Extract the best answer span from each window, keeping only spans that are
# well-formed, short enough, inside the HTML portion, and confident enough
answers = []
for i in range(n_segments):
    start_index = np.argmax(start_probs[i])
    end_index = np.argmax(end_probs[i])
    confidence = start_probs[i][start_index] * end_probs[i][end_index]
    if (
        start_index < end_index
        and end_index - start_index <= MAX_ANSWER_LEN
        and start_index > question_index
        and confidence > MIN_CONFIDENCE
    ):
        predict_answer_tokens = encoding.input_ids[i, start_index : end_index + 1]
        answer = processor.decode(predict_answer_tokens, skip_special_tokens=True)
        answers.append({"answer": answer, "confidence": confidence})

# Print answers
for answer in answers:
    st.write(answer)

st.write("Done!")
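
# Optional refinement (a sketch, not part of the flow above): because the
# sliding windows overlap, the same span can be extracted from several
# windows. Deduplicating by answer text and surfacing the highest-confidence
# result gives a cleaner display. Assumes the `answers` list built above.
if answers:
    best_by_text = {}
    for a in answers:
        prev = best_by_text.get(a["answer"])
        if prev is None or a["confidence"] > prev["confidence"]:
            best_by_text[a["answer"]] = a
    ranked = sorted(best_by_text.values(), key=lambda a: a["confidence"], reverse=True)
    st.write("Best answer:", ranked[0])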