66596 committed on
Commit
a151177
1 Parent(s): b95af79

initial commit

Files changed (2)
  1. requirements.txt +16 -0
  2. streamlit_app.py +297 -0
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ streamlit==1.34.0
+ requests==2.31.0
+ regex==2024.5.15
+ beautifulsoup4==4.12.3
+ urllib3==2.2.1
+ newspaper3k==0.2.8
+ pandas==2.2.2
+ lxml_html_clean==0.1.1
+ tweet-preprocessor==0.6.0
+ transformers==4.41.0
+ torch==2.3.0
+ torchaudio==2.3.0
+ torchvision==0.18.0
+ google-api-python-client==2.131.0
+ goose3==3.1.19
+ selenium==4.21.0
streamlit_app.py ADDED
@@ -0,0 +1,297 @@
+ import streamlit as st
+ import re
+ import requests
+ from newspaper import Article
+ from newspaper import Config
+ import preprocessor as p
+ import pandas as pd
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import numpy as np
+ import torch.nn.functional as F
+ from goose3 import Goose
+ from goose3.configuration import Configuration
+ from bs4 import BeautifulSoup
+
+ st.write("""
+ # ESG Prediction App
+
+ This is a Proof of Concept for a company ESG (Environmental, Social, and Governance) risk prediction application.
+ """)
+
+ company = st.text_input("Company", placeholder="PT Adaro Minerals Indonesia Tbk")
+
+ GOOGLE = 'https://www.google.com/search'
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
+
+ API_KEY = 'AIzaSyDCfIltnvAQ3lvpovRXydRMhGQ-VxkboQ4'
+ SEARCH_ENGINE_ID = 'e586ee8a6c7e64d7b'
+
+ from googleapiclient.discovery import build
+ import math
+
+ def google_search(search_term, api_key, cse_id, **kwargs):
+     service = build("customsearch", "v1", developerKey=api_key)
+
+     num_search_results = kwargs['num']
+     if num_search_results > 100:
+         raise NotImplementedError('Google Custom Search API supports max of 100 results')
+     elif num_search_results > 10:
+         kwargs['num'] = 10  # this cannot be > 10 in API call
+         calls_to_make = math.ceil(num_search_results / 10)
+     else:
+         calls_to_make = 1
+
+     kwargs['start'] = start_item = 1
+     items_to_return = []
+     while calls_to_make > 0:
+         res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
+         items_to_return.extend(res['items'])
+         calls_to_make -= 1
+         start_item += 10
+         kwargs['start'] = start_item
+         leftover = num_search_results - start_item + 1
+         if 0 < leftover < 10:
+             kwargs['num'] = leftover
+
+     return items_to_return
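+ # Paging note: the Custom Search API returns at most 10 items per call, so
+ # google_search issues ceil(num/10) requests, advancing 'start' by 10 each
+ # time and shrinking 'num' on the final call, collecting roughly
+ # num_search_results items (assuming the query has that many results).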
+
+ if company:
+     print(f'Run: {company}')
+     links = []
+     news_text = []
+
+     query = f'{company} after:2023-01-01'
+     response = google_search(query, API_KEY, SEARCH_ENGINE_ID, num=10)
+
+     url_collection = [item['link'] for item in response]
+     import os
+     os.environ['ST_USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+
+     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+     config = Config()
+     config.browser_user_agent = user_agent
+     config.request_timeout = 60
+     config.fetch_images = False
+     config.memoize_articles = True
+     config.language = 'id'
+
+     # p.set_options(p.OPT.MENTION, p.OPT.EMOJI, p.OPT.HASHTAG, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.URL)
+
+     def cleaner(text):
+         text = re.sub("@[A-Za-z0-9]+", "", text)  # Remove @ sign
+         text = text.replace("#", "").replace("_", "")  # Remove hashtag sign but keep the text
+         # text = p.clean(text)  # Clean text from any mention, emoji, hashtag, reserved words (such as FAV, RT), smiley, and url
+         text = text.strip().replace("\n", "")
+         return text
+
+     for url in url_collection:
+         if "http" not in url:
+             continue
+         lang = "id"
+         if "eco-business.com" in url or "thejakartapost.com" in url or "marketforces.org.au" in url or "jakartaglobe.id" in url:
+             lang = "en"
+
+         ### Selenium
+         # from selenium import webdriver
+         # from selenium.webdriver.chrome.options import Options
+         # from goose3 import Goose
+
+         # options = Options()
+         # options.headless = True
+         # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+         # driver = webdriver.Chrome(options=options)
+         # # url = 'https://example.com/news-article'
+         # driver.get(url)
+
+         # html = driver.page_source
+         # driver.quit()
+
+         # g = Goose()
+         # article = g.extract(raw_html=html)
+
+         # print(article.cleaned_text)
+         # news_text.append(article.cleaned_text)
+         ###
+
+         # article = Article(url, language=lang, config=config)
+         # article.download()
+         # article.parse()
+         # article_clean = cleaner(article.text)
+
+         # url = 'https://example.com/news-article'
+         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+
+         response = requests.get(url, headers=headers)
+         # html = response.text
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         g = Goose()
+         article = g.extract(raw_html=str(soup))
+
+         # print(url)
+         # print(soup)
+         # news_empty = True
+
+         possible_class = ['detail', 'body-content', 'article-content', 'detail-konten', 'DetailBlock']
+         excluded_sentence = ['Komentar menjadi tanggung-jawab Anda sesuai UU ITE', 'Dapatkan berita terbaru dari kami Ikuti langkah ini untuk mendapatkan notifikasi:']
+
+         if not article.cleaned_text:
+             article_content = soup.find('div', class_=possible_class)
+             if article_content and article_content.get_text() not in excluded_sentence:
+                 news_text.append(article_content.get_text())
+                 news_empty = False
+                 # print(f'{url} News Exist using POSSIBLE CLASS')
+         else:
+             if article.cleaned_text not in excluded_sentence:
+                 news_text.append(article.cleaned_text)
+                 news_empty = False
+                 # print(f'{url} News Exist using ARTICLE CLEANED TEXT')
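+         # Extraction strategy: Goose's cleaned_text is preferred; when it is empty,
+         # fall back to a <div> lookup over the candidate class names in possible_class.
+         # Boilerplate sentences listed in excluded_sentence are skipped in both paths.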
+
+         # if news_empty:
+         #     print(f'Cannot Get URL: {url}')
+         #     print(soup)
+
+         # print(article.cleaned_text)
+
+
+
+         # goose = Goose()
+         # config = Configuration()
+         # config.strict = False  # turn off strict exception handling
+         # config.browser_user_agent = 'Mozilla 5.0'  # set the browser agent string
+         # config.http_timeout = 5.05  # set http timeout in seconds
+
+         # with Goose(config) as g:
+         #     article = goose.extract(url=url)
+
+         # news_text.append(article.cleaned_text)
+
+     df = pd.DataFrame({
+         'news': news_text
+     })
+
+     # Load the ESG classification tokenizer and model
+     tokenizer_esg = AutoTokenizer.from_pretrained("didev007/ESG-indobert-model")
+     model_esg = AutoModelForSequenceClassification.from_pretrained("didev007/ESG-indobert-model")
+
+     # Load the sentiment analysis tokenizer and model
+     tokenizer_sentiment = AutoTokenizer.from_pretrained("adhityaprimandhika/distillbert_sentiment_analysis")
+     model_sentiment = AutoModelForSequenceClassification.from_pretrained("adhityaprimandhika/distillbert_sentiment_analysis")
+
+     def get_chunk_weights(num_chunks):
+         center = num_chunks / 2
+         sigma = num_chunks / 4
+         weights = [np.exp(-0.5 * ((i - center) / sigma) ** 2) for i in range(num_chunks)]
+         weights = np.array(weights)
+         return weights / weights.sum()
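+     # get_chunk_weights yields a normalized Gaussian profile over chunk positions,
+     # so chunks near the middle of an article weigh more than the head and tail.
+     # For example, get_chunk_weights(4) is roughly [0.06, 0.26, 0.43, 0.26] (sums to 1).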
+
+     def tokenize_and_chunk(text, tokenizer, chunk_size=512):
+         inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+         input_ids = inputs['input_ids'][0]
+
+         chunks = [input_ids[i:i+chunk_size] for i in range(0, len(input_ids), chunk_size)]
+         return chunks
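+     # Note: with truncation=True and no max_length, the tokenizer may already cap
+     # the input at the model's maximum length, in which case long articles yield a
+     # single chunk here; truncation=False would be needed to chunk the full article.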
+
+     def esg_category(chunks, model):
+         num_chunks = len(chunks)
+         weights = get_chunk_weights(num_chunks)
+
+         esg_scores = np.zeros(4)
+         labels = ["none", "E", "S", "G"]
+
+         for i, chunk in enumerate(chunks):
+             inputs = {'input_ids': chunk.unsqueeze(0)}
+             outputs = model(**inputs)
+             logits = outputs.logits
+             probs = F.softmax(logits, dim=1).detach().numpy()[0]
+             esg_scores += weights[i] * probs
+
+         predicted_class = esg_scores.argmax()
+         aggregated_esg = labels[predicted_class]
+
+         return aggregated_esg
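+     # The article-level ESG label is the argmax of the weighted average of
+     # per-chunk softmax probabilities over ["none", "E", "S", "G"], using the
+     # Gaussian chunk weights above.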
+
+     def sentiment_analysis(text, tokenizer, model):
+         inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+         outputs = model(**inputs)
+         logits = outputs.logits
+         predicted_class = torch.argmax(logits, dim=1).item()
+         labels = ["positive", "neutral", "negative"]
+         predicted_sentiment = labels[predicted_class]
+         return predicted_sentiment
+
+     def apply_model_to_dataframe(df, tokenizer_esg, model_esg, tokenizer_sentiment, model_sentiment, text_column='news'):
+         esg_categories = []
+         sentiments = []
+         for text in df[text_column]:
+             if isinstance(text, str):
+                 chunks = tokenize_and_chunk(text, tokenizer_esg)
+                 esg = esg_category(chunks, model_esg)
+                 sentiment = sentiment_analysis(text, tokenizer_sentiment, model_sentiment)
+                 esg_categories.append(esg)
+                 sentiments.append(sentiment)
+             else:
+                 esg_categories.append("none")
+                 sentiments.append("neutral")
+
+         df['aggregated_esg'] = esg_categories
+         df['sentiment'] = sentiments
+         return df
+
+     result_data = apply_model_to_dataframe(df, tokenizer_esg, model_esg, tokenizer_sentiment, model_sentiment)
+
+     grouped_counts = df.groupby(['aggregated_esg', 'sentiment']).size().reset_index(name='count')
+     data = grouped_counts.pivot(index='aggregated_esg', columns='sentiment', values='count')
+     required_columns_sentiment = ['negative', 'positive', 'neutral']
+     for col in required_columns_sentiment:
+         if col not in data.columns:
+             data[col] = 0
+
+     # Handle potential missing values
+     data['negative'] = data['negative'].fillna(0)
+     data['positive'] = data['positive'].fillna(0)
+     data['neutral'] = data['neutral'].fillna(0)
+
+     # print(data)
+
+     data['count'] = (data['negative'] + data['positive'] + data['neutral'])
+     data['total'] = data['negative']/data['count'] + data['positive']*(-0.2)/data['count']
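+     # Heuristic per-category score: the share of negative articles raises 'total',
+     # the share of positive articles lowers it slightly (weight -0.2), and neutral
+     # coverage only dilutes both shares through 'count'.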
+     # data['total'] = data['negative'] + data['positive']*(-1)
+     if 'none' in data.index:  # drop the row with no ESG signal
+         data = data.drop('none')
+     # data
+
+     total = data['total'].sum()
+
+     # Min-max normalization
+     min_esg = -1
+     max_esg = 2
+     min_score = 0
+     max_score = 60
+
+     ESG_score = ((total - min_esg) / (max_esg - min_esg)) * (max_score - min_score) + min_score
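+     # Example of the mapping: a raw total of 0.5 becomes
+     # ((0.5 - (-1)) / (2 - (-1))) * 60 = 30, which falls in the 'Medium' band
+     # of the categorization below.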
+
+     def esg_risk_categorization(esg_score):
+         if esg_score <= 10:
+             return 'Negligible'
+         elif 10 < esg_score <= 20:
+             return 'Low'
+         elif 20 < esg_score <= 30:
+             return 'Medium'
+         elif 30 < esg_score <= 40:
+             return 'High'
+         else:
+             return 'Severe'
+
+     risk = esg_risk_categorization(ESG_score)
+
+     # st.dataframe(df)
+
+     st.write(company)
+     # print(f'ESG Score Prediction: {ESG_score}')
+     st.write(f'ESG Score Prediction: {ESG_score}')
+     st.write(f'ESG Category Risk Prediction: {risk}')
+