StevenLimcorn committed on
Commit
c651eb0
1 Parent(s): e992df1

Initial scraper deployment

Browse files
Files changed (2) hide show
  1. app.py +69 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from distutils.log import error
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import json
5
+ import re
6
+ from urllib.parse import urlparse
7
+ import gradio as gr
8
+
9
+
10
def extract_wikipedia_text(raw_text, language):
    """Group cleaned paragraph text into sections delimited by headlines.

    Args:
        raw_text: Iterable of parsed elements in document order. Each element
            must expose ``.name`` and ``.text``. An element whose name is
            ``"span"`` is treated as a section headline; every other element
            is treated as paragraph text.
        language: Value interpolated into the ``text-<language>`` result key.

    Returns:
        A list of ``{"text-<language>": section_text}`` dicts, one per
        non-empty section, in document order. Paragraphs belonging to the
        same section are joined with a single space.
    """
    key = f"text-{language}"
    contents = []
    parts = []

    for element in raw_text:
        if element.name == "span":
            # A headline closes the current section; empty sections are skipped.
            if parts:
                contents.append({key: " ".join(parts)})
                parts = []
            continue
        clean_text = preprocessing(element.text)
        if clean_text:
            parts.append(clean_text)

    # Flush the text that follows the last headline. Without this, the final
    # section (or the whole page when no headline is present) would be lost.
    if parts:
        contents.append({key: " ".join(parts)})
    return contents
29
+
30
+
31
def preprocessing(text):
    """Return *text* with bracketed citations and newlines removed.

    Args:
        text: Raw text content of a parsed element.

    Returns:
        The text with every ``[...]`` group that sits on a single line
        removed, leading/trailing whitespace stripped, and any remaining
        newline characters deleted.
    """
    # Raw string: "\[" in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    without_citations = re.sub(r"\[.*?]", "", text).strip()
    # Drop newlines left inside the text (no replacement character is added).
    return without_citations.replace("\n", "")
37
+
38
+
39
def scrape(url):
    """Download a Wikipedia article and return its text grouped by section.

    Args:
        url: Full article URL. The first label of the host name
            (e.g. ``en`` in ``en.wikipedia.org``) is used as the language
            code in the output keys.

    Returns:
        A dict with the keys ``"source"`` (the given URL),
        ``"title-<language>"`` (text of the ``h1#firstHeading`` element) and
        ``"pages"`` (the list produced by ``extract_wikipedia_text``).

    Raises:
        RuntimeError: If the page cannot be fetched or the server responds
            with an HTTP error status.
        ValueError: If the page has no ``h1#firstHeading`` element.
    """
    language = urlparse(url).netloc.split(".")[0]

    try:
        # A timeout keeps the UI from hanging forever on an unresponsive host.
        page = requests.get(
            url, headers={"user-agent": "Mozilla/5.0"}, timeout=10
        )
        page.raise_for_status()
    except requests.RequestException as err:
        # Fail with a clear message instead of continuing without a page.
        raise RuntimeError(f"Failed to fetch {url!r}: {err}") from err

    soup = BeautifulSoup(page.content, "html.parser")

    heading = soup.find("h1", {"id": "firstHeading"})
    if heading is None:
        raise ValueError(f"No article heading found at {url!r}")
    title = heading.get_text().strip()

    # Headlines and paragraphs, in document order.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    return {"source": url, f"title-{language}": title, "pages": contents}
53
+
54
+
55
# Gradio UI: a URL text box, a JSON viewer for the result, and a button that
# runs `scrape` on the entered URL.
# NOTE(review): the original indentation was lost in this view; the nesting of
# the widgets inside `gr.Row()` below is assumed — confirm against the repo.
with gr.Blocks() as demo:
    # Plain string: the original f-string prefix had no placeholders.
    gr.Markdown(
        """
<center>
<h1>Wikipedia Scraper 📜</h1>
</center>
"""
    )
    with gr.Row():
        inp = gr.Textbox(placeholder="Wikipedia URL")
        out = gr.JSON()
    btn = gr.Button("Scrape")
    btn.click(fn=scrape, inputs=inp, outputs=out)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ beautifulsoup4==4.11.1
2
+ gradio==3.1.7
3
+ requests==2.28.1