import gradio as gr
import wikipedia
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def wikipediaScrap(article_name, wikipedia_language="en - English"):
  """Fetch a Wikipedia article and build a word cloud from its content."""
  # Dropdown values look like "en - English"; keep only the language code.
  wikipedia_language = wikipedia_language.split(" - ")[0]

  if wikipedia_language:
    wikipedia.set_lang(wikipedia_language)

  et_page = wikipedia.page(article_name)
  title = et_page.title
  content = et_page.content
  page_url = et_page.url
  linked_pages = et_page.links

  text = content

  # Create and generate a word cloud image:
  wordcloud = WordCloud(font_path="HelveticaWorld-Regular.ttf").generate(text)

  # Render the generated image on the current matplotlib figure:
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis("off")

  return title, content, page_url, "\n".join(linked_pages), plt

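# Custom CSS: hide the Gradio footer, style the scrape button, and size the
# output textareas and word-cloud image.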
css = """
footer {display:none !important}
.output-markdown{display:none !important}
footer {visibility: hidden} 

.gr-button-lg {
    z-index: 14;
    width: 113px;
    height: 30px;
    left: 0px;
    top: 0px;
    padding: 0px;
    cursor: pointer !important; 
    background: none rgb(17, 20, 45) !important;
    border: none !important;
    text-align: center !important;
    font-size: 14px !important;
    font-weight: 500 !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: none !important;
}
.gr-button-lg:hover{
    z-index: 14;
    width: 113px;
    height: 30px;
    left: 0px;
    top: 0px;
    padding: 0px;
    cursor: pointer !important; 
    background: none rgb(66, 133, 244) !important;
    border: none !important;
    text-align: center !important;
    font-size: 14px !important;
    font-weight: 500 !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}


#component-14 textarea[data-testid="textbox"] { height: 178px !important}
#component-17 textarea[data-testid="textbox"] { height: 178px !important}
#component-21 textarea[data-testid="textbox"] { height: 178px !important}
#component-20 tr:hover{
    background-color: rgb(229,225,255) !important;
}

.output-image {max-height: 11rem !important;}
.output-image img {max-height: 17rem !important;}

.hover\:bg-orange-50:hover {
    --tw-bg-opacity: 1 !important;
    background-color: rgb(229,225,255) !important;
}
"""

# Build the dropdown choices ("code - name") from Wikipedia's language map.
ini_dict = wikipedia.languages()

language = []
for code, name in ini_dict.items():
    language.append(code + " - " + name)


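# Gradio UI: article name and language inputs; outputs for the article title,
# URL, word cloud, full content, and linked articles.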
with gr.Blocks(title="Wikipedia Article Scrape | Data Science Dojo", css = css) as demo:
    with gr.Row():
      inp = gr.Textbox(placeholder="Enter the name of a Wikipedia article", label="Wikipedia article name")
      lan = gr.Dropdown(label="Select Language", choices=language, value=language[108], interactive=True)
      
    btn = gr.Button("Start scraping", elem_id="dsd_button")
    with gr.Row():
      with gr.Column():
        gr.Markdown("""## About""")
        title = gr.Textbox(label="Article title")
        url = gr.Textbox(label="Article URL")
      with gr.Column():
        gr.Markdown("""## Wordcloud""")
        wordcloud = gr.Plot()
    gr.Markdown("""### Content""")
    with gr.Row():
      content = gr.Textbox(label="Content")
    gr.Markdown("""### Linked Articles""")
    with gr.Row():
      linked = gr.Textbox(label="Linked Articles")
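    # Run the scraper when the button is clicked.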
    btn.click(fn=wikipediaScrap, inputs=[inp, lan], outputs=[title, content, url, linked, wordcloud])
    with gr.Row():
      gr.Examples(examples = [["Eiffel Tower", "en - English"], ["Eiffel tower", 'ur - اردو']], fn=wikipediaScrap, inputs=[inp, lan], outputs=[title, content, url, linked, wordcloud], cache_examples=True)
  
demo.launch()