Upload 3 files
- gnews2.py +75 -0
- packages.txt +1 -0
- requirements.txt +6 -0
gnews2.py
ADDED
@@ -0,0 +1,75 @@
import gradio as gr
import requests
import time
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from transformers import pipeline


# Fetch Google News results for a query and attach sentiment scores to the titles
def news_and_analysis(query):

    # Percent-encode special characters in a text string
    def encode_special_characters(text):
        encoded_text = ''
        special_characters = {'&': '%26', '=': '%3D', '+': '%2B', ' ': '%20'}  # Add more special characters as needed
        for char in text.lower():
            encoded_text += special_characters.get(char, char)
        return encoded_text

    query2 = encode_special_characters(query)
    url = f"https://news.google.com/search?q={query2}&hl=en-US&gl=in&ceid=US%3Aen&num=3"

    response = requests.get(url, verify=False)  # verify=False disables SSL certificate checks
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each search result is an <article>; rewrite the relative links to absolute URLs
    articles = soup.find_all('article')
    links = [article.find('a')['href'] for article in articles]
    links = [link.replace("./articles/", "https://news.google.com/articles/") for link in links]

    # The article text comes back newline-separated: source, ..., title, time, author
    news_text = [article.get_text(separator='\n') for article in articles]
    news_text_split = [text.split('\n') for text in news_text]

    news_df = pd.DataFrame({
        'Title': [text[2] for text in news_text_split],
        'Source': [text[0] for text in news_text_split],
        'Time': [text[3] if len(text) > 3 else 'Missing' for text in news_text_split],
        'Author': [text[4].split('By ')[-1] if len(text) > 4 else 'Missing' for text in news_text_split],
        'Link': links
    })

    news_df = news_df.loc[0:5, :]  # keep only the first six results

    # Headless Chrome session; only needed if the headline scraping below is re-enabled
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    classification = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

    news_df['Sentiment'] = ''
    for i in range(len(news_df)):
        # driver.get(news_df.loc[i, 'Link'])
        # time.sleep(10)
        # headline = driver.find_element('xpath', '//h1').text
        # news_df.loc[i, 'Headline'] = headline
        title = news_df.loc[i, 'Title']
        news_df.loc[i, 'Sentiment'] = str(classification(title))
    print(news_df)

    return news_df


with gr.Blocks() as demo:
    topic = gr.Textbox(label="Topic for which you want Google news and sentiment analysis")
    btn = gr.Button(value="Submit")
    btn.click(news_and_analysis, inputs=topic, outputs=gr.Dataframe())

demo.launch()
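One note on the query encoding: the hand-rolled encode_special_characters helper covers only four characters. The standard library's urllib.parse.quote percent-encodes the full reserved set and could replace it; a minimal sketch (the query string is just an illustration):

from urllib.parse import quote

# quote() percent-encodes every reserved character, not just &, =, + and space
query = quote("AI & robotics".lower())  # -> 'ai%20%26%20robotics'
url = f"https://news.google.com/search?q={query}&hl=en-US&gl=in&ceid=US%3Aen&num=3"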
packages.txt
ADDED
@@ -0,0 +1 @@
chromium-driver
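On Hugging Face Spaces, packages.txt lists Debian packages to install in the container, so chromium-driver supplies the chromedriver binary that Selenium drives. If Selenium's automatic driver resolution cannot find it, the path can be passed explicitly; a minimal sketch, assuming the Debian package's usual install location:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
# /usr/bin/chromedriver is an assumption: the usual path for Debian's chromium-driver package
driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=options)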
requirements.txt
ADDED
@@ -0,0 +1,6 @@
selenium==4.21.0
gradio>=3.40.1
transformers
tensorflow
beautifulsoup4
requests
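requirements.txt pulls in transformers for the sentiment model; its pipeline returns a list of dicts, which gnews2.py stores stringified in the Sentiment column. A minimal sketch of the raw call and of extracting just the label, with an illustrative sentence and score:

from transformers import pipeline

classifier = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")
result = classifier("Stocks rally on strong earnings")  # e.g. [{'label': 'POS', 'score': 0.99}]
label = result[0]['label']  # this model's labels are POS, NEG and NEU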