File size: 4,154 Bytes
ee6241d d5cee61 ee6241d b58cd97 ee6241d 70e1662 84e72a9 70e1662 ee6241d 2d0d147 c583e39 0fe8bc1 2d0d147 ee6241d c6d76a5 0fe8bc1 ee6241d d5cee61 bd02494 9e110d0 ee6241d 00fc816 6605b07 d5cee61 70e1662 ee6241d 7ca549f bd02494 ca3f295 6ebd9e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import gradio as gr
import wikipedia
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
def wikipediaScrap(article_name, wikipedia_language = "en - English"):
    """Fetch a Wikipedia article and build a word cloud from its text.

    Args:
        article_name: Article title passed as-is to ``wikipedia.page``.
        wikipedia_language: Label of the form ``"<code> - <name>"``. Only the
            part before the first ``" - "`` is used, as the argument to
            ``wikipedia.set_lang``. If that part is empty the language is
            left as previously set.

    Returns:
        A 5-tuple ``(title, content, page_url, linked_pages, figure)``:
        the page title, its full text, its URL, the titles of linked pages
        joined by newlines, and a matplotlib ``Figure`` showing the word
        cloud of the page text.

    Raises:
        Any exception raised by ``wikipedia.page`` or ``WordCloud.generate``
        propagates unchanged; nothing is caught here.
    """
    # Imported locally so this block does not depend on a file-level import.
    from matplotlib.figure import Figure

    language_code = wikipedia_language.partition(" - ")[0]
    if language_code:
        wikipedia.set_lang(language_code)

    et_page = wikipedia.page(article_name)
    content = et_page.content

    # Create and generate a word cloud image from the article text.
    cloud = WordCloud(font_path="HelveticaWorld-Regular.ttf").generate(content)

    # Draw on a fresh, standalone Figure rather than pyplot's global current
    # figure: pyplot state is shared by every request in the process, so
    # repeated calls would keep stacking images on the same axes.
    figure = Figure()
    axes = figure.subplots()
    axes.imshow(cloud, interpolation="bilinear")
    axes.axis("off")

    return (
        et_page.title,
        content,
        et_page.url,
        "\n".join(et_page.links),
        figure,
    )
# Custom stylesheet handed to gr.Blocks(css=...).
# Raw string on purpose: the selector `.hover\:bg-orange-50:hover` needs a
# literal backslash, and in a non-raw literal `\:` is an invalid escape
# sequence that newer Python versions warn about. The resulting string value
# is unchanged.
css = r"""
footer {display:none !important}
.output-markdown{display:none !important}
footer {visibility: hidden}
.gr-button-lg {
z-index: 14;
width: 113px;
height: 30px;
left: 0px;
top: 0px;
padding: 0px;
cursor: pointer !important;
background: none rgb(17, 20, 45) !important;
border: none !important;
text-align: center !important;
font-size: 14px !important;
font-weight: 500 !important;
color: rgb(255, 255, 255) !important;
line-height: 1 !important;
border-radius: 6px !important;
transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
box-shadow: none !important;
}
.gr-button-lg:hover{
z-index: 14;
width: 113px;
height: 30px;
left: 0px;
top: 0px;
padding: 0px;
cursor: pointer !important;
background: none rgb(66, 133, 244) !important;
border: none !important;
text-align: center !important;
font-size: 14px !important;
font-weight: 500 !important;
color: rgb(255, 255, 255) !important;
line-height: 1 !important;
border-radius: 6px !important;
transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}
#component-14 textarea[data-testid="textbox"] { height: 178px !important}
#component-17 textarea[data-testid="textbox"] { height: 178px !important}
#component-21 textarea[data-testid="textbox"] { height: 178px !important}
#component-20 tr:hover{
background-color: rgb(229,225,255) !important;
}
.output-image {max-height: 11rem !important;}
.output-image img {max-height: 17rem !important;}
.hover\:bg-orange-50:hover {
--tw-bg-opacity: 1 !important;
background-color: rgb(229,225,255) !important;
}
"""
# Mapping returned by wikipedia.languages(); its keys are used below as the
# prefix of each dropdown label and its values as the suffix.
ini_dict = wikipedia.languages()
# split dictionary into keys and values
items = ini_dict.items()
keys = list(ini_dict)
values = list(ini_dict.values())
# Dropdown labels of the form "<key> - <value>", e.g. "en - English".
language = [code + " - " + name for code, name in items]
# Preselect English by its label instead of a fixed list position: the list
# comes from wikipedia.languages() and its length/order is not under this
# file's control, so a hard-coded index can point at the wrong entry or be
# out of range.
_DEFAULT_LANGUAGE = "en - English"
if _DEFAULT_LANGUAGE in language:
    default_language = _DEFAULT_LANGUAGE
else:
    default_language = language[0] if language else None

# NOTE(review): block nesting below was reconstructed from a listing whose
# indentation had been stripped; component creation order is preserved, but
# confirm the row/column layout against the deployed app.
with gr.Blocks(title="Wikipedia Article Scrape | Data Science Dojo", css = css) as demo:
    # Inputs: article name and language selector.
    with gr.Row():
        inp = gr.Textbox(placeholder="Enter the name of wikipedia article", label="Wikipedia article name")
        lan = gr.Dropdown(label=" Select Language", choices=language, value=default_language, interactive=True)
    btn = gr.Button("Start scraping", elem_id="dsd_button")

    # Outputs: article metadata next to the word cloud.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## About""")
            title = gr.Textbox(label="Article title")
            url = gr.Textbox(label="Article URL")
        with gr.Column():
            gr.Markdown("""## Wordcloud""")
            wordcloud = gr.Plot()

    gr.Markdown("""### Content""")
    with gr.Row():
        content = gr.Textbox(label="Content")

    gr.Markdown("""### Linked Articles""")
    with gr.Row():
        linked = gr.Textbox(label="Linked Articles")

    # Output order must match the tuple returned by wikipediaScrap:
    # (title, content, page_url, linked_pages, plot).
    btn.click(fn=wikipediaScrap, inputs=[inp, lan], outputs=[title, content, url, linked, wordcloud])

    with gr.Row():
        gr.Examples(examples = [["Eiffel Tower", "en - English"], ["Eiffel tower", 'ur - اردو']], fn=wikipediaScrap, inputs=[inp, lan], outputs=[title, content, url, linked, wordcloud], cache_examples=True)

demo.launch()