File size: 2,227 Bytes
c651eb0
 
 
 
 
dffeab2
c651eb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566a5ab
 
dffeab2
566a5ab
c651eb0
 
dffeab2
 
c651eb0
 
 
 
 
 
 
 
 
dffeab2
 
 
c651eb0
dffeab2
c651eb0
dffeab2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import gradio as gr
import json


def extract_wikipedia_text(raw_text, language):
    """Group cleaned element text into per-section paragraphs.

    ``raw_text`` is the mixed sequence of headline ``<span>`` tags and ``<p>``
    tags selected in :func:`scrape`; a span marks a section boundary. Returns
    a list of ``{f"text-{language}": paragraph}`` dicts, one per non-empty
    section.
    """
    contents = []
    paragraph = ""

    for element in raw_text:
        # A headline span ends the section we have been accumulating.
        if element.name == "span":
            if paragraph == "":
                continue
            contents.append({f"text-{language}": paragraph})
            paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if clean_text == "":
                continue
            # Join consecutive <p> fragments with a single space.
            if paragraph != "":
                clean_text = " " + clean_text
            paragraph += clean_text
    # BUG FIX: text after the final headline was never flushed, so the last
    # section of every article was silently dropped.
    if paragraph != "":
        contents.append({f"text-{language}": paragraph})
    return contents


def preprocessing(text):
    """Return *text* with ``[...]`` citation markers and newlines removed.

    Also strips leading/trailing whitespace. Returns ``""`` for text that is
    empty after cleaning.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence
    # (DeprecationWarning now, SyntaxError in future Python versions).
    clean_text = re.sub(r"\[.*?]", "", text).strip()
    # Drop newlines embedded inside a single paragraph's text.
    clean_text = clean_text.replace("\n", "")
    return clean_text


def scrape(url):
    """Scrape a Wikipedia article and dump it to a local JSON file.

    Returns ``(json_output, filename)`` where ``json_output`` is the dict
    written to disk and ``filename`` is the JSON file's name (derived from
    the last URL path segment).

    Raises ``RuntimeError`` if the page cannot be fetched.
    """
    # Language code is the subdomain, e.g. "en" from en.wikipedia.org.
    language = urlparse(url).netloc.split(".")[0]
    try:
        page = requests.get(
            url, headers={"user-agent": "Mozilla/5.0"}, timeout=30
        )
        page.raise_for_status()
    except requests.RequestException as err:
        # BUG FIX: the bare except only printed "error" and fell through,
        # guaranteeing a NameError on `soup` below. Fail loudly instead.
        raise RuntimeError(f"failed to fetch {url}") from err
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # Section headlines (h2-h4) interleaved with paragraph tags, in
    # document order, so extract_wikipedia_text can split on headlines.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    filename = f"{url.split('/')[-1]}.json"
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_output, f)
    return json_output, filename


# Keep the JSON preview scrollable instead of growing without bound.
page_css = "#json-output { max-height: 400px; overflow-y: auto; }"

with gr.Blocks(css=page_css) as demo:
    # Plain string: the original f-string had no placeholders.
    gr.Markdown(
        """

    <center>

    <h1>Wikipedia Scraper 📜</h1>

    </center>

    """
    )
    with gr.Row():
        url_box = gr.Textbox(placeholder="Wikipedia URL")
        with gr.Column():
            json_view = gr.JSON(elem_id="json-output")
            download_file = gr.File()
    scrape_btn = gr.Button("Scrape")
    # Scrape on click; show the JSON inline and offer the dumped file.
    scrape_btn.click(fn=scrape, inputs=url_box, outputs=[json_view, download_file])

demo.launch(debug=True)