# NOTE(review): the lines below were Hugging Face file-viewer scrape residue
# (author avatar caption, commit message, hash, "raw history blame", file size)
# and were not valid Python; preserved here as a comment header.
# Author: w11wo — commit 566a5ab ("modified output filename") — 2.23 kB
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import gradio as gr
import json
def extract_wikipedia_text(raw_text, language):
    """Group Wikipedia paragraph text into sections split at headlines.

    Args:
        raw_text: iterable of bs4 elements in document order — a mix of
            headline ``<span>`` tags and ``<p>`` paragraph tags (as produced
            by the CSS selector in ``scrape``).
        language: language code (e.g. "en") used to build each output key.

    Returns:
        List of dicts of the form ``{f"text-{language}": paragraph}``, one
        entry per section's accumulated text.
    """
    contents = []
    paragraph = ""
    for element in raw_text:
        # A headline <span> marks the start of the next section:
        # flush whatever paragraph text has accumulated so far.
        if element.name == "span":
            if paragraph == "":
                continue
            contents.append({f"text-{language}": paragraph})
            paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if clean_text == "":
                continue
            # Join consecutive <p> fragments with a single space.
            if paragraph != "":
                clean_text = " " + clean_text
            paragraph += clean_text
    # Bug fix: the final section has no trailing headline, so text still
    # accumulated when the loop ends used to be silently dropped. Flush it.
    if paragraph != "":
        contents.append({f"text-{language}": paragraph})
    return contents
def preprocessing(text):
    """Clean raw Wikipedia text: drop ``[...]`` citation markers and newlines.

    Args:
        text: raw text extracted from a Wikipedia HTML element.

    Returns:
        The cleaned, stripped text (may be the empty string).
    """
    # Raw string fixes the invalid escape sequence "\[" the original plain
    # string produced (a warning in modern Python); the non-greedy match
    # removes citations such as [1] or [a] without swallowing between-bracket text.
    clean_text = re.sub(r"\[.*?\]", "", text).strip()
    # Remove embedded newlines so each paragraph is a single line.
    clean_text = clean_text.replace("\n", "")
    return clean_text
def scrape(url):
    """Scrape a Wikipedia article into a dict and dump it to a JSON file.

    Args:
        url: full Wikipedia article URL,
            e.g. "https://en.wikipedia.org/wiki/Python_(programming_language)".

    Returns:
        Tuple of ``(json_output, filename)`` — the scraped content dict and
        the path of the JSON file it was written to.

    Raises:
        requests.RequestException: if the page cannot be fetched or returns
            an HTTP error status.
    """
    # The subdomain of a Wikipedia URL is the language code ("en", "id", ...).
    language = urlparse(url).netloc.split(".")[0]
    # Bug fix: the old bare `except:` printed "error" and then fell through
    # to use an undefined `soup`, crashing with a NameError. Fetch/HTTP
    # errors now propagate (Gradio surfaces exceptions to the user).
    page = requests.get(url, headers={"user-agent": "Mozilla/5.0"})
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # Section headlines (h2-h4) and body paragraphs, in document order.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    # Use the article slug (last URL path segment) as the output filename.
    filename = f"{url.split('/')[-1]}.json"
    # Explicit UTF-8 + ensure_ascii=False keeps non-Latin article text readable.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_output, f, ensure_ascii=False)
    return json_output, filename
# CSS to keep the JSON preview inside a 400px scrollable panel.
custom_css = "#json-output { max-height: 400px; overflow-y: auto; }"

with gr.Blocks(css=custom_css) as demo:
    # Page header.
    gr.Markdown(
        f"""
    <center>
    <h1>Wikipedia Scraper πŸ“œ</h1>
    </center>
    """
    )
    # Left: URL entry. Right: JSON preview stacked above the download link.
    with gr.Row():
        url_box = gr.Textbox(placeholder="Wikipedia URL")
        with gr.Column():
            json_view = gr.JSON(elem_id="json-output")
            download_file = gr.File()
    # Scraping runs on button click and fills both outputs.
    scrape_button = gr.Button("Scrape")
    scrape_button.click(fn=scrape, inputs=url_box, outputs=[json_view, download_file])

demo.launch(debug=True)