walter1 committed on
Commit
f225398
1 Parent(s): 8e90a66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -2
app.py CHANGED
@@ -1,9 +1,99 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  from datetime import datetime
 
 
 
4
 
5
- question_answerer = pipeline("question-answering")
6
- question_answerer2 = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  iface = gr.Interface(fn=greet, inputs="text", outputs="text")
9
  iface.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  from datetime import datetime
4
+ import pandas as pd
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
 
8
+ import re
9
+
10
# Pages to scrape: each entry names a benefit, the sub-topic ("core") it
# covers, and one or more gov.uk URLs. When a topic spans several pages the
# "link" value is a comma-separated list (split by the scraping loop below).
benefits = [
{"benefitName": "Universal Credit", "coreName": "Overview", "link": "https://www.gov.uk/universal-credit/"},
{"benefitName": "Universal Credit", "coreName": "Eligibility", "link": "https://www.gov.uk/universal-credit/eligibility"},
{"benefitName": "Universal Credit", "coreName": "how much can I get​", "link": "https://www.gov.uk/universal-credit/what-youll-get,https://www.gov.uk/universal-credit/how-youre-paid"},
{"benefitName": "Universal Credit", "coreName": "how to apply/claim", "link": "https://www.gov.uk/universal-credit/how-to-claim"},
]
16
+
17
+
18
def requestPage(link, timeout=30):
    """Fetch *link* over HTTP and return the parsed BeautifulSoup document.

    Parameters
    ----------
    link : str
        Absolute URL of the page to fetch.
    timeout : int, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (4xx/5xx); previously
        such error pages were silently parsed as if they were content.
    requests.Timeout
        If the server does not respond in time; previously a dead host
        could hang the scrape forever because no timeout was set.
    """
    page = requests.get(link, timeout=timeout)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    return soup
25
+
26
def scrapeTable(table):
    """Flatten an HTML <table> element into plain text.

    Returns one line per <tbody> row, with each cell rendered as
    "<column header>: <cell text>" and cells joined by single spaces.

    Parameters
    ----------
    table : bs4.element.Tag
        A <table> tag that has both <thead> (one header row) and <tbody>.
    """
    # recursive=False restricts the scan to the header row's direct cell
    # children; the original bare find_all() also picked up tags nested
    # inside a cell, which could misalign headers against row cells.
    # (Row handling below already used recursive=False — now consistent.)
    columns = [col.text.strip() for col in table.thead.tr.find_all(recursive=False)]
    # NOTE: a stray no-op expression statement `columns` (notebook residue)
    # was removed here — it had no effect.

    rows = table.tbody.find_all(recursive=False)
    clean_rows = ""

    for row in rows:
        elements = ["{}: {}".format(columns[index], element.text.strip()) for index, element in enumerate(row.find_all(recursive=False))]
        elements = " ".join(elements)
        clean_rows += elements + "\n"

    return clean_rows
40
+
41
+
42
+
43
def scrapePage(page):
    """Extract the readable text ("corpus") of one gov.uk guide page.

    Walks the guide's content container: the part title goes first, then
    each top-level fragment of the govspeak body. <ul> lists are collapsed
    onto one line with "{;}" as the item delimiter, <table> fragments are
    flattened via scrapeTable, and everything else contributes its text.

    page: a BeautifulSoup document as returned by requestPage.
    Returns the accumulated text as a single string.
    """
    # Scrape the text
    corpus = ""

    # starting from the main page
    # NOTE(review): assumes the gov.uk guide layout — a div#guide-contents
    # wrapper containing h1.part-title and a div.gem-c-govspeak body. If the
    # page does not match, the .find() calls return None and the attribute
    # accesses below raise AttributeError.
    content = page.find('div', {"id":"guide-contents"})

    title = content.find('h1', {"class":"part-title"})
    title = title.text.strip()
    corpus += title +"\n\n"

    # progress trace — this runs during the import-time scrape below
    print(title)

    content = content.find('div', {"class":"gem-c-govspeak"})

    # Only direct children: each is one paragraph/list/table of the body.
    fragments = content.find_all(recursive=False)
    for frag in fragments:
        text= frag.text.strip()
        if frag.name == 'ul':
            # Collapse list items onto one line, "{;}"-delimited, so a list
            # stays a single unit of context text.
            clean = re.sub('\n+', "{;}", text)
            corpus += "{;}" + clean
        elif frag.name == 'table':
            corpus += scrapeTable(frag)
        else:
            corpus += text

        # Newline after every fragment (inside the loop), so fragments are
        # line-separated in the corpus. NOTE(review): indentation was
        # reconstructed from a diff dump — confirm this line belongs inside
        # the loop rather than after it.
        corpus += "\n"

    return corpus
74
+
75
+
76
# Import-time scrape: fetch every linked page for each benefit entry and
# attach the combined text as that entry's question-answering context.
# NOTE(review): this performs live network requests when the module is
# imported, so app startup blocks until all pages are fetched.
for benefit in benefits:
    # One entry may span several pages; "link" is comma-separated.
    links = benefit['link'].split(',')
    print(benefit['benefitName'], benefit['coreName'], len(links))

    context = ""
    for link in links:
        page = requestPage(link)
        context += scrapePage(page)

    # Mutates the entries of `benefits` in place.
    benefit['context'] = context
    benefit['contextLen'] = len(context)
    print("--------------------------------")
88
+
89
+
90
# Distinct labels present in the scraped data: the benefit names and the
# four "core" question categories. Order is unspecified (set-derived), as
# in the original. The redundant list(...) wrapped inside set(...) and a
# dead bare-tuple expression (notebook residue) were removed; a set
# comprehension replaces list(set(list(map(lambda ...)))).
benefitsClasses = list({entry['benefitName'] for entry in benefits})
core4Classes = list({entry['coreName'] for entry in benefits})
95
+
96
# BUG FIX: the interface was wired to an undefined name `greet` (the QA
# pipeline had been commented out and no handler was ever defined), so the
# app crashed with NameError at startup. Define the handler as a
# question-answering front end over the scraped benefits guidance, which
# is the evident intent of this file (pipeline import + scraped contexts).
question_answerer = pipeline("question-answering")


def greet(question):
    """Answer *question* using the text scraped from the benefit pages.

    question: free-text question typed into the Gradio textbox.
    Returns the extracted answer span as a string.
    """
    # Concatenate every scraped context into one document for the model.
    # .get() guards entries whose scrape failed and lack a 'context' key.
    context = " ".join(entry.get('context', '') for entry in benefits)
    result = question_answerer(question=question, context=context)
    return result['answer']


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()