import re
from datetime import datetime

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Each entry pairs a benefit with one of the four core questions and the
# GOV.UK page(s) whose text serves as the QA context (comma-separated links)
benefits = [
  {"benefitName": "Universal Credit", "coreName": "what is this benefit", "link": "https://www.gov.uk/universal-credit/"},
  {"benefitName": "Universal Credit", "coreName": "who can apply", "link": "https://www.gov.uk/universal-credit/eligibility"},
  {"benefitName": "Universal Credit", "coreName": "how much can I get", "link": "https://www.gov.uk/universal-credit/what-youll-get,https://www.gov.uk/universal-credit/how-youre-paid"},
  {"benefitName": "Universal Credit", "coreName": "how to apply", "link": "https://www.gov.uk/universal-credit/how-to-claim"}
]


def requestPage(link):
  """Fetch a page and return its parsed BeautifulSoup tree."""
  page = requests.get(link)
  page.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page

  soup = BeautifulSoup(page.content, "html.parser")

  return soup
  
def scrapeTable(table):
  """Flatten an HTML table into one 'Header: value' line per row."""
  # Header cells of the first header row (assumes no nested tags inside them)
  columns = [col.text.strip() for col in table.thead.tr.find_all()]

  rows = table.tbody.find_all(recursive=False)
  clean_rows = ""

  for row in rows:
    # Pair each cell with its column header, e.g. "Header: cell text"
    elements = ["{}: {}".format(columns[index], element.text.strip()) for index, element in enumerate(row.find_all(recursive=False))]
    clean_rows += " ".join(elements) + "\n"

  return clean_rows
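# Illustrative output for a two-column rates table (values are placeholders):
#   Your circumstances: Single  Monthly standard allowance: £X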



def scrapePage(page):
  """Extract the guide text from a GOV.UK page into a single flat string."""
  corpus = ""

  # The guide body lives in the #guide-contents container
  content = page.find('div', {"id": "guide-contents"})

  title = content.find('h1', {"class": "part-title"})
  title = title.text.strip()
  corpus += title + "\n\n"

  print(title)

  content = content.find('div', {"class": "gem-c-govspeak"})

  # Walk the top-level fragments, flattening lists and tables as we go
  fragments = content.find_all(recursive=False)
  for frag in fragments:
    text = frag.text.strip()
    if frag.name == 'ul':
      # Mark list-item boundaries with {;} so they survive the flattening
      clean = re.sub(r'\n+', "{;}", text)
      corpus += "{;}" + clean
    elif frag.name == 'table':
      corpus += scrapeTable(frag)
    else:
      corpus += text

    corpus += "\n"

  return corpus
  
  
# Scrape every linked page and attach the combined text as each entry's context
for benefit in benefits:
  links = benefit['link'].split(',')
  print(benefit['benefitName'], benefit['coreName'], len(links))

  context = ""
  for link in links:
    page = requestPage(link)
    context += scrapePage(page)

  benefit['context'] = context
  benefit['contextLen'] = len(context)
  print("--------------------------------")


# Distinct benefit names and core question types: the label sets a classifier
# could choose from when routing a question to a context
benefitsClasses = list(set(map(lambda x: x['benefitName'], benefits)))
core4Classes = list(set(map(lambda x: x['coreName'], benefits)))

# With no model argument, the question-answering pipeline falls back to its
# default SQuAD-fine-tuned model
question_answerer = pipeline("question-answering")


# For this demo the benefit and question type are hard-coded rather than
# predicted from benefitsClasses / core4Classes
coreName = 'how much can I get'

def testQA(question):
  predictedBenefit = "Universal Credit"
  predictedCore = coreName

  # Look up the scraped context for the (benefit, question type) pair and
  # extract the answer span from it
  start = datetime.now()
  context = list(filter(lambda x: x['benefitName'] == predictedBenefit and x['coreName'] == predictedCore, benefits))[0]
  answer = question_answerer(question=question, context=context['context'])['answer']
  elapsed = (datetime.now() - start).total_seconds()  # inference time; not surfaced in the UI

  return answer
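# e.g. testQA("How much is the Universal Credit standard allowance?") returns
# whatever span the model extracts from the scraped "what you'll get" pages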
  
iface = gr.Interface(fn=testQA, inputs="text", outputs="text")
iface.launch()
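# launch() serves the app locally by default; pass share=True if you need a
# temporary public link, e.g. iface.launch(share=True)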