import requests from bs4 import BeautifulSoup import re from urllib.parse import urlparse import gradio as gr import json def extract_wikipedia_text(raw_text, language): contents = [] paragraph = "" for element in raw_text: # detected next headline if element.name == "span": if paragraph == "": continue contents.append({f"text-{language}": paragraph}) paragraph = "" else: clean_text = preprocessing(element.text) if clean_text == "": continue if paragraph != "": clean_text = " " + clean_text paragraph += clean_text return contents def preprocessing(text): # remove square brackets a.k.a citations clean_text = re.sub("\[.*?]", "", text).strip() # remove \n clean_text = clean_text.replace("\n", "") return clean_text def scrape(url): language = urlparse(url).netloc.split(".")[0] try: page = requests.get(url, headers={"user-agent": "Mozilla/5.0"}) soup = BeautifulSoup(page.content, "html.parser") except: print("error") title = soup.find("h1", {"id": "firstHeading"}).get_text().strip() raw_text = soup.select( "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p" ) contents = extract_wikipedia_text(raw_text, language) json_output = {"source": url, f"title-{language}": title, "pages": contents} filename = f"{url.split('/')[-1]}.json" with open(filename, "w") as f: json.dump(json_output, f) return json_output, filename style_sheet = "#json-output { max-height: 400px; overflow-y: auto; }" with gr.Blocks(css=style_sheet) as demo: gr.Markdown( f"""