"""Gradio app that scrapes a Wikipedia article into a JSON structure."""

# NOTE(review): `distutils` was removed from the standard library in Python
# 3.12 and `error` is not used anywhere in this file; the import is kept only
# because other code may rely on it. Consider deleting it.
from distutils.log import error
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urlparse
import gradio as gr

# Bracketed spans such as "[1]" or "[citation needed]" (non-greedy, one line).
_CITATION_RE = re.compile(r"\[.*?]")

# Upper bound for the HTTP request so a stalled server cannot hang the UI.
REQUEST_TIMEOUT_SECONDS = 10


def extract_wikipedia_text(raw_text, language: str) -> list:
    """Group paragraph text into pages separated by section headlines.

    Args:
        raw_text: Iterable of parsed elements exposing ``.name`` and ``.text``.
            An element whose ``name`` is ``"span"`` is treated as a headline;
            every other element is treated as body text.
        language: Language code used to build the ``text-<language>`` key.

    Returns:
        A list of ``{"text-<language>": paragraph}`` dicts, one per non-empty
        run of body text between headlines. Text following the last headline
        (or all text, when there is no headline at all) is included as the
        final entry.
    """
    key = f"text-{language}"
    contents = []
    parts = []

    for element in raw_text:
        if element.name == "span":
            # A headline closes the paragraph collected so far (if any).
            if parts:
                contents.append({key: " ".join(parts)})
                parts = []
            continue

        clean_text = preprocessing(element.text)
        if clean_text:
            parts.append(clean_text)

    # Flush the trailing paragraph; without this the text after the final
    # headline would be silently dropped.
    if parts:
        contents.append({key: " ".join(parts)})

    return contents


def preprocessing(text: str) -> str:
    """Return *text* without bracketed citations, outer whitespace or newlines."""
    # Strip first, then drop newlines, so inner spacing is left untouched.
    without_citations = _CITATION_RE.sub("", text).strip()
    return without_citations.replace("\n", "")


def scrape(url: str) -> dict:
    """Download a Wikipedia article and convert it to a JSON-ready dict.

    The language code is taken from the first label of the URL's host
    (e.g. ``"en"`` for ``en.wikipedia.org``).

    Args:
        url: Address of the Wikipedia article.

    Returns:
        ``{"source": url, "title-<language>": title, "pages": [...]}`` where
        ``pages`` is the output of :func:`extract_wikipedia_text`.

    Raises:
        RuntimeError: If the page cannot be fetched (network failure, invalid
            URL, timeout or an HTTP error status).
        ValueError: If the page has no ``h1#firstHeading`` element.
    """
    language = urlparse(url).netloc.split(".")[0]

    try:
        page = requests.get(
            url,
            headers={"user-agent": "Mozilla/5.0"},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        page.raise_for_status()
    except requests.RequestException as err:
        # Fail loudly with the real cause instead of printing "error" and
        # crashing later on an unbound variable.
        raise RuntimeError(f"Could not fetch {url!r}: {err}") from err

    soup = BeautifulSoup(page.content, "html.parser")

    heading = soup.find("h1", {"id": "firstHeading"})
    if heading is None:
        raise ValueError(f"No article heading (h1#firstHeading) found at {url!r}")
    title = heading.get_text().strip()

    # Section headlines (h2-h4) and paragraphs, in document order.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)

    return {"source": url, f"title-{language}": title, "pages": contents}


# NOTE(review): the original indentation of this UI section was lost; the
# nesting below (textbox and JSON view side by side in one row, button
# underneath) is a reconstruction - confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        f"""

Wikipedia Scraper 📜

"""
    )
    with gr.Row():
        inp = gr.Textbox(placeholder="Wikipedia URL")
        out = gr.JSON()
    btn = gr.Button("Scrape")
    btn.click(fn=scrape, inputs=inp, outputs=out)

demo.launch()