import json
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
import gradio as gr


def extract_wikipedia_text(raw_text, language):
    # Walk headline spans and paragraphs in document order, merging the
    # paragraphs of each section into a single text entry.
    contents = []
    paragraph = ""
    for element in raw_text:
        if element.name == "span":
            # A headline span marks the next section: flush what we have.
            if paragraph == "":
                continue
            contents.append({f"text-{language}": paragraph})
            paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if clean_text == "":
                continue
            if paragraph != "":
                clean_text = " " + clean_text
            paragraph += clean_text
    # Flush the trailing paragraph so text after the last headline is kept.
    if paragraph != "":
        contents.append({f"text-{language}": paragraph})
    return contents

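
# Shape sketch (illustrative values only): with language="en", a lead
# paragraph, a headline span, and one more paragraph become
# [{"text-en": "Lead text..."}, {"text-en": "Section text..."}].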

def preprocessing(text):
    # Remove square-bracketed fragments, i.e. citation markers such as [1].
    # The raw string avoids the invalid "\[" escape in a plain string literal.
    clean_text = re.sub(r"\[.*?\]", "", text).strip()
    # Drop embedded newlines.
    clean_text = clean_text.replace("\n", "")
    return clean_text

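
# Illustrative example (hypothetical input):
#   preprocessing("Paris[1] is the capital.\n")  ->  "Paris is the capital."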

def scrape(url):
    # The language code is the subdomain, e.g. "en" in en.wikipedia.org.
    language = urlparse(url).netloc.split(".")[0]
    try:
        page = requests.get(url, headers={"user-agent": "Mozilla/5.0"}, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        # Surface the failure in the JSON output instead of crashing later
        # on an undefined soup.
        return {"error": str(err)}
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # Section headlines (h2-h4) and paragraphs, in document order.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    return json_output

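
# Illustrative call (any live article works; exact contents depend on the page):
#   scrape("https://en.wikipedia.org/wiki/Web_scraping")
#   -> {"source": "...", "title-en": "Web scraping", "pages": [{"text-en": "..."}, ...]}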

with gr.Blocks() as demo:
    gr.Markdown(
        """
        <center>
        <h1>Wikipedia Scraper 📜</h1>
        </center>
        """
    )
    with gr.Row():
        inp = gr.Textbox(placeholder="Wikipedia URL")
        out = gr.JSON()
    btn = gr.Button("Scrape")
    btn.click(fn=scrape, inputs=inp, outputs=out)

demo.launch()