File size: 2,227 Bytes
c651eb0
 
 
 
 
dffeab2
c651eb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566a5ab
 
dffeab2
566a5ab
c651eb0
 
dffeab2
 
c651eb0
 
 
 
 
 
 
 
 
dffeab2
 
 
c651eb0
dffeab2
c651eb0
dffeab2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import gradio as gr
import json


def extract_wikipedia_text(raw_text, language):
    """Group cleaned element text into per-section paragraphs.

    ``raw_text`` is the mixed sequence of headline ``<span>`` tags and ``<p>``
    tags selected in :func:`scrape`; a span marks a section boundary. Returns
    a list of ``{f"text-{language}": paragraph}`` dicts, one per non-empty
    section.
    """
    contents = []
    paragraph = ""

    for element in raw_text:
        # A headline span ends the section we have been accumulating.
        if element.name == "span":
            if paragraph == "":
                continue
            contents.append({f"text-{language}": paragraph})
            paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if clean_text == "":
                continue
            # Join consecutive <p> fragments with a single space.
            if paragraph != "":
                clean_text = " " + clean_text
            paragraph += clean_text
    # BUG FIX: text after the final headline was never flushed, so the last
    # section of every article was silently dropped.
    if paragraph != "":
        contents.append({f"text-{language}": paragraph})
    return contents


def preprocessing(text):
    """Return *text* with ``[...]`` citation markers and newlines removed.

    Also strips leading/trailing whitespace. Returns ``""`` for text that is
    empty after cleaning.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence
    # (DeprecationWarning now, SyntaxError in future Python versions).
    clean_text = re.sub(r"\[.*?]", "", text).strip()
    # Drop newlines embedded inside a single paragraph's text.
    clean_text = clean_text.replace("\n", "")
    return clean_text


def scrape(url):
    """Scrape a Wikipedia article and dump it to a local JSON file.

    Returns ``(json_output, filename)`` where ``json_output`` is the dict
    written to disk and ``filename`` is the JSON file's name (derived from
    the last URL path segment).

    Raises ``RuntimeError`` if the page cannot be fetched.
    """
    # Language code is the subdomain, e.g. "en" from en.wikipedia.org.
    language = urlparse(url).netloc.split(".")[0]
    try:
        page = requests.get(
            url, headers={"user-agent": "Mozilla/5.0"}, timeout=30
        )
        page.raise_for_status()
    except requests.RequestException as err:
        # BUG FIX: the bare except only printed "error" and fell through,
        # guaranteeing a NameError on `soup` below. Fail loudly instead.
        raise RuntimeError(f"failed to fetch {url}") from err
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # Section headlines (h2-h4) interleaved with paragraph tags, in
    # document order, so extract_wikipedia_text can split on headlines.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    filename = f"{url.split('/')[-1]}.json"
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_output, f)
    return json_output, filename


# Keep the JSON preview scrollable instead of growing without bound.
page_css = "#json-output { max-height: 400px; overflow-y: auto; }"

with gr.Blocks(css=page_css) as demo:
    # Plain string: the original f-string had no placeholders.
    gr.Markdown(
        """

    <center>

    <h1>Wikipedia Scraper 📜</h1>

    </center>

    """
    )
    with gr.Row():
        url_box = gr.Textbox(placeholder="Wikipedia URL")
        with gr.Column():
            json_view = gr.JSON(elem_id="json-output")
            download_file = gr.File()
    scrape_btn = gr.Button("Scrape")
    # Scrape on click; show the JSON inline and offer the dumped file.
    scrape_btn.click(fn=scrape, inputs=url_box, outputs=[json_view, download_file])

demo.launch(debug=True)