Spaces:

bookbot
/

Wikipedia-Scraper

Build error

App Files Files Community

Wikipedia-Scraper / app.py

w11wo

modified output filename

566a5ab over 1 year ago

raw history blame contribute delete

No virus

2.23 kB

	import requests
	from bs4 import BeautifulSoup
	import re
	from urllib.parse import urlparse
	import gradio as gr
	import json


	def extract_wikipedia_text(raw_text, language):
	    """Group paragraph text into one entry per section.

	    Walks ``raw_text`` in order. Any element whose ``name`` is ``"span"`` is
	    treated as a section headline and closes the section collected so far;
	    every other element contributes its cleaned ``text`` to the current
	    section, with fragments joined by a single space.

	    Args:
	        raw_text: Iterable of elements exposing ``name`` and ``text``
	            attributes (``scrape`` passes the result of ``soup.select``).
	        language: Suffix used for the key of each entry
	            (``text-<language>``).

	    Returns:
	        A list of ``{"text-<language>": section_text}`` dicts in document
	        order. Sections without any non-empty text are omitted.
	    """
	    key = f"text-{language}"
	    contents = []
	    fragments = []

	    def flush():
	        # Emit the section collected so far, if it holds any text.
	        if fragments:
	            contents.append({key: " ".join(fragments)})
	            fragments.clear()

	    for element in raw_text:
	        if element.name == "span":
	            # A headline marks the start of the next section.
	            flush()
	            continue
	        clean_text = preprocessing(element.text)
	        if clean_text:
	            fragments.append(clean_text)

	    # The last section has no following headline to close it; without this
	    # final flush its text (or the whole article, when no headline was
	    # matched at all) would be dropped.
	    flush()
	    return contents


	def preprocessing(text):
	    """Strip citation markers and line breaks from a text fragment.

	    Args:
	        text: Raw text of a single element.

	    Returns:
	        ``text`` with every ``[...]`` span removed (shortest match, never
	        crossing a newline), surrounding whitespace stripped, and any
	        remaining newline characters deleted.
	    """
	    # Raw string: "\[" inside a normal literal is an invalid escape
	    # sequence, which newer Python versions warn about at compile time.
	    without_citations = re.sub(r"\[.*?]", "", text).strip()
	    # Newlines are deleted outright, not replaced by a space.
	    return without_citations.replace("\n", "")


	def scrape(url):
	    """Download a Wikipedia article and save its text as a JSON file.

	    The language code is the first label of the URL's host name (``en`` for
	    ``en.wikipedia.org``); it is used as the suffix of the ``title-`` and
	    ``text-`` keys in the output.

	    Args:
	        url: Address of the article page.

	    Returns:
	        A ``(json_output, filename)`` tuple: the dict with ``source``,
	        ``title-<language>`` and ``pages`` keys, and the name of the JSON
	        file it was written to (relative to the working directory).

	    Raises:
	        gr.Error: If the page cannot be downloaded, answers with an HTTP
	            error status, or contains no ``h1#firstHeading`` element.
	    """
	    language = urlparse(url).netloc.split(".")[0]

	    try:
	        page = requests.get(
	            url,
	            headers={"user-agent": "Mozilla/5.0"},
	            timeout=30,  # never hang the UI on an unresponsive server
	        )
	        page.raise_for_status()
	    except requests.RequestException as err:
	        # The old handler printed "error" and carried on with `soup`
	        # unbound; stop here with a message the user can act on instead.
	        raise gr.Error(f"Could not download {url!r}: {err}") from err

	    soup = BeautifulSoup(page.content, "html.parser")

	    heading = soup.find("h1", {"id": "firstHeading"})
	    if heading is None:
	        raise gr.Error(f"No article title found at {url!r}")
	    title = heading.get_text().strip()

	    # Section headlines (h2-h4) and paragraphs, in document order.
	    # NOTE(review): this relies on headlines being wrapped in
	    # `span.mw-headline`; confirm that current Wikipedia markup still does.
	    raw_text = soup.select(
	        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
	    )
	    contents = extract_wikipedia_text(raw_text, language)
	    json_output = {"source": url, f"title-{language}": title, "pages": contents}

	    # Name the file after the last URL segment; dropping a trailing slash
	    # first avoids ending up with a bare ".json".
	    filename = f"{url.rstrip('/').split('/')[-1]}.json"
	    with open(filename, "w", encoding="utf-8") as f:
	        json.dump(json_output, f)
	    return json_output, filename


	# CSS for the element with id "json-output": cap its height at 400px and
	# let it scroll vertically.
	style_sheet = "#json-output { max-height: 400px; overflow-y: auto; }"
	# Build the Gradio UI; the Blocks container is bound to `demo`.
	# NOTE(review): indentation was lost in this copy, so which components sit
	# inside the Row / Column contexts below cannot be determined from here --
	# confirm against the original file.
	with gr.Blocks(css=style_sheet) as demo:
	# Page heading, written as centered HTML inside a Markdown component.
	gr.Markdown(
	f"""
	<center>
	<h1>Wikipedia Scraper 📜</h1>
	</center>
	"""
	)
	with gr.Row():
	# Text input for the article URL.
	inp = gr.Textbox(placeholder="Wikipedia URL")
	with gr.Column():
	# JSON viewer; its elem_id matches the selector in style_sheet.
	out = gr.JSON(elem_id="json-output")
	# File component that receives the filename returned by scrape.
	out_download = gr.File()
	btn = gr.Button("Scrape")
	# On click, call scrape with the textbox value and send its two return
	# values to the JSON viewer and the file component, in that order.
	btn.click(fn=scrape, inputs=inp, outputs=[out, out_download])

	# Start the app with debug mode enabled.
	demo.launch(debug=True)