datasciencedojo committed on
Commit
ee6241d
1 Parent(s): 9268412

Create new file

Browse files
Files changed (1) hide show
  1. app.py +72 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import wikipedia
3
+ import numpy as np
4
+ import pandas as pd
5
+ from os import path
6
+ from PIL import Image
7
+ from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
8
+ import matplotlib.pyplot as plt
9
+
10
def wikipediaScrap(article_name, wikipedia_language = "en"):
    """Fetch a Wikipedia article and render a word cloud of its text.

    Args:
        article_name: Title (or search term) passed to ``wikipedia.page``.
        wikipedia_language: Wikipedia language code. A falsy value (for
            example an empty textbox) falls back to ``"en"``.

    Returns:
        A 5-tuple ``(title, content, page_url, linked_pages, figure)`` where
        ``linked_pages`` is the article's link titles joined by newlines and
        ``figure`` is a Matplotlib ``Figure`` showing the word cloud.

    Raises:
        Nothing is caught here: any exception raised by ``wikipedia.page``
        or by ``WordCloud`` propagates to the caller.
    """
    # Imported locally so this function does not depend on pyplot's
    # implicit "current figure" for drawing.
    from matplotlib.figure import Figure

    # Always set the language. The wikipedia package keeps the language as
    # module-level state, so skipping this call for a blank value would
    # silently reuse whatever language an earlier request selected.
    wikipedia.set_lang(wikipedia_language or "en")

    et_page = wikipedia.page(article_name)
    title = et_page.title
    content = et_page.content
    page_url = et_page.url
    linked_pages = et_page.links

    # Create and generate a word cloud image.
    # The font file is looked up relative to the working directory.
    # NOTE(review): presumably chosen to cover non-Latin scripts - confirm.
    wordcloud = WordCloud(font_path="HelveticaWorld-Regular.ttf").generate(content)

    # Draw on a fresh figure for every call. Reusing pyplot's global figure
    # would stack images from successive calls on the same axes and share
    # one figure between unrelated requests.
    fig = Figure()
    ax = fig.add_subplot(111)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")

    return title, content, page_url, "\n".join(linked_pages), fig
30
+
31
# Custom stylesheet handed to gr.Blocks(css=...).
# Raw string: the selectors below contain CSS backslash escapes (\: \[ \]),
# which are invalid escape sequences in a normal Python string literal.
# Declarations inside a rule are separated by ';' - a ',' makes the whole
# declaration invalid, so the button colours were previously ignored.
# The brackets in the Tailwind-style class name must be escaped, otherwise
# the selector is not valid CSS and the rule is dropped.
css = r"""
footer {display:none !important}
.output-markdown{display:none !important}
footer {visibility: hidden}
#dsd_button {background: purple; color: white}

textarea[data-testid="textbox"] { height: 178px !important}

.max-h-\[30rem\] {max-height: 18rem !important;}

.hover\:bg-orange-50:hover {
    --tw-bg-opacity: 1 !important;
    background-color: rgb(229,225,255) !important;
}

"""
47
+
48
# Gradio UI: two inputs and a button on top, then the article summary next
# to its word cloud, followed by the full content and the linked articles.
# NOTE(review): nesting below is reconstructed from a flattened listing -
# confirm the row/column membership against the deployed layout.
with gr.Blocks(title="Wikipedia Article Scrap | Datascience Dojo", css = css) as demo:
    # --- Inputs ---------------------------------------------------------
    with gr.Row():
        inp = gr.Textbox(
            placeholder="Enter the name of wikipedia article",
            label="Wikipedia article name",
        )
        lan = gr.Textbox(
            placeholder="Enter the language code",
            label="Language",
        )
    btn = gr.Button("Start Scraping", elem_id="dsd_button")

    # --- Summary and word cloud, side by side -----------------------------
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## About""")
            title = gr.Textbox(label="Article title")
            url = gr.Textbox(label="Article URL")
        with gr.Column():
            gr.Markdown("""## Wordcloud""")
            wordcloud = gr.Plot()

    # --- Full article text ------------------------------------------------
    gr.Markdown("""### Content""")
    with gr.Row():
        content = gr.Textbox(label="Content")

    # --- Titles of pages the article links to -----------------------------
    gr.Markdown("""### Linked Articles""")
    with gr.Row():
        linked = gr.Textbox(label="Linked Articles")

    # --- Examples: each row is [article name, language code] ---------------
    with gr.Row():
        example_queries = [["Eiffel Tower", "en"], ["Eiffel tower", 'ur']]
        gr.Examples(
            examples=example_queries,
            fn=wikipediaScrap,
            inputs=[inp, lan],
            outputs=[title, content, url, linked, wordcloud],
            cache_examples=True,
        )

    # Output order must match the tuple returned by wikipediaScrap.
    btn.click(
        fn=wikipediaScrap,
        inputs=[inp, lan],
        outputs=[title, content, url, linked, wordcloud],
    )

demo.launch()