naveed92 committed on
Commit
8c0af92
1 Parent(s): 454903d

Create app.py

Files changed (1)
  1. app.py +99 -0
app.py ADDED
@@ -0,0 +1,99 @@
+ import streamlit as st
+
+ from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering
+
+ import requests
+ from bs4 import BeautifulSoup
+
+ import numpy as np
+
+ import torch
+ import torch.nn.functional as F
+
+ # Prediction parameters: window size and overlap (in tokens)
+ MAX_LEN = 512
+ STRIDE = 100
+
+ # Answer filtering parameters
+ MAX_ANSWER_LEN = 30
+ MIN_CONFIDENCE = 0.9
+
+ # Model name
+ MODEL_STR = "microsoft/markuplm-base-finetuned-websrc"
+
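+ # Note: 512 matches MarkupLM's maximum input length, so MAX_LEN is pinned to
+ # the model limit; STRIDE sets how many tokens consecutive windows share when
+ # a long page is split into segments below.
+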
+ # Load the MarkupLM processor and QA model fine-tuned on WebSRC
+ processor = MarkupLMProcessor.from_pretrained(MODEL_STR)
+ model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)
+
+ # Send a browser-like user-agent so the request is less likely to be blocked
+ headers = {
+     'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
+ }
+
+ # User input
+ input_url = st.text_input(
+     label="Enter url of page to scrape",
+     value="https://www.opentable.com/carlo-and-johnny",
+     key="url",
+ )
+
+ input_question = st.text_input(
+     label="Enter Question",
+     value="What is the food on the menu?",
+     key="question",
+ )
+
+ st.write("Getting html page ...")
+
+ # Request page
+ page = requests.get(input_url, headers=headers)
+
+ # Parse page with BeautifulSoup
+ soup = BeautifulSoup(page.content, "html.parser")
+
+ # Extract page body as an HTML string
+ body = soup.find('body')
+ html_string = str(body)
+
+ # Encode the question and page; long pages are split into overlapping windows
+ encoding = processor(html_string, questions=input_question, return_tensors="pt", truncation="only_second",
+                      stride=STRIDE, max_length=MAX_LEN, return_overflowing_tokens=True, padding=True)
+
+ # Postprocess encoding: drop the overflow mapping, which the model's forward
+ # pass does not accept, and zero out token_type_ids
+ del encoding['overflow_to_sample_mapping']
+ encoding['token_type_ids'] = encoding['token_type_ids'].fill_(0)
+
+ # Number of windows, and the index of the '</s>' token that closes the
+ # question; valid answers must start after it
+ n_segments = encoding['input_ids'].shape[0]
+ question_index = encoding[0].tokens.index('</s>')
+
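+ # Note: with return_overflowing_tokens=True, each extra window repeats the
+ # question and re-reads the last STRIDE page tokens of the previous window,
+ # so the single forward pass below scores every window as one batch.
+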
+ # Run model on all windows at once
+ with torch.no_grad():
+     outputs = model(**encoding)
+
+ # Get start and end probabilities over token positions in each window
+ start_probs = F.softmax(outputs.start_logits, dim=1).numpy()
+ end_probs = F.softmax(outputs.end_logits, dim=1).numpy()
+
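+ # The start and end positions are predicted independently, so a window's
+ # argmax pair can be inconsistent (e.g. end before start); the filter in the
+ # loop below discards such spans rather than searching for the best valid pair.
+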
+ # Extract and filter answers for each window
+ answers = []
+
+ for i in range(n_segments):
+
+     start_index = np.argmax(start_probs[i])
+     end_index = np.argmax(end_probs[i])
+     confidence = start_probs[i].max() * end_probs[i].max()
+
+     # Keep spans that are well-formed, short enough, located after the
+     # question, and predicted with high confidence
+     if (end_index > start_index
+             and end_index - start_index <= MAX_ANSWER_LEN
+             and start_index > question_index
+             and end_index > question_index
+             and confidence > MIN_CONFIDENCE):
+
+         # Decode the predicted span from window i
+         predict_answer_tokens = encoding.input_ids[i, start_index : end_index + 1]
+         answer = processor.decode(predict_answer_tokens, skip_special_tokens=True)
+
+         answers.append({"answer": answer, "confidence": confidence})
+
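+ # Because windows overlap, the same span can clear the filter in more than
+ # one window; every hit is reported as-is, with no deduplication or ranking.
+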
+ # Print answers
+ for answer in answers:
+     st.write(answer)
+
+ st.write("Done!")