salmanmapkar committed on
Commit
9f1049d
1 Parent(s): 40ef3b4

Initial Commit

Browse files
Files changed (2) hide show
  1. app.py +188 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import concurrent.futures
import threading
from io import BytesIO

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from pyxlsb import open_workbook as open_xlsb
+
9
+ if 'df' not in st.session_state:
10
+ st.session_state['df'] = None
11
+
12
+ if 'is_df' not in st.session_state:
13
+ st.session_state['is_df'] = False
14
+
15
+ headers = {
16
+ 'authority': 'cdn.jwplayer.com',
17
+ 'accept': '*/*',
18
+ 'accept-language': 'en-US,en;q=0.5',
19
+ 'origin': 'https://hotcopper.com.au',
20
+ 'referer': 'https://hotcopper.com.au/',
21
+ 'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Brave";v="122"',
22
+ 'sec-ch-ua-mobile': '?0',
23
+ 'sec-ch-ua-platform': '"Windows"',
24
+ 'sec-fetch-dest': 'empty',
25
+ 'sec-fetch-mode': 'cors',
26
+ 'sec-fetch-site': 'cross-site',
27
+ 'sec-gpc': '1',
28
+ 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
29
+ }
30
+
31
+ cookies = {
32
+ 'xf_show_post_view': '0',
33
+ 'xf_threads_terms_conditions_pop': '1',
34
+ 'hc_user_tracker': 'ZAXlB7LEHd0eT6X4QQfAOFkXms373LrE',
35
+ 'xf_user': '971779%2C5e49c37ac9fa56c1a2798923703d4978b9f8427d',
36
+ 'xf_session': '16mn22etenrnkf7aimqacdu026',
37
+ }
38
+
39
+ post_links = []
40
+
41
+ post_data = {
42
+ 'username': [],
43
+ 'number_of_posts_by_user': [],
44
+ 'number_of_great_analysis_for_user': [],
45
+ 'post_date': [],
46
+ 'post_time': [],
47
+ 'post_id': [],
48
+ 'number_of_upvotes': [],
49
+ 'stock_pill': [],
50
+ 'stock_pill_link': [],
51
+ 'price_at_posting': [],
52
+ 'sentiment': [],
53
+ 'disclosure': [],
54
+ 'message': [],
55
+ 'reply_post_id': [],
56
+ 'reply_post_url': []
57
+ }
58
+
59
+ def get_number_of_posts(company_code, max_page_count=999999999):
60
+ url = f'https://hotcopper.com.au/asx/{company_code}/discussion/page-{max_page_count**3}'
61
+
62
+ response = requests.get(url, headers=headers, cookies=cookies)
63
+ number_of_posts = 0
64
+
65
+ if url != response.url:
66
+ number_of_posts = int(response.url.split('-')[-1])
67
+ else:
68
+ return get_number_of_posts(company_code, max_page_count*3)
69
+ return number_of_posts
70
+
71
+ def get_all_posts(url):
72
+ global post_links
73
+ response = requests.get(url, headers=headers, cookies=cookies)
74
+ soup = BeautifulSoup(response.text, 'html.parser')
75
+ posts = [f"https://hotcopper.com.au{post['href']}" for post in soup.find_all('a', class_='subject-a')]
76
+ post_links.extend(posts)
77
+ return posts
78
+
79
+ def get_post(url):
80
+ response = requests.get(url, headers=headers, cookies=cookies)
81
+ soup = BeautifulSoup(response.text, 'html.parser')
82
+ username = soup.find('div', class_='user-username').text.strip()
83
+ number_of_posts_by_user = soup.find('div', class_='user-post-num').text.replace(',','').replace('Posts.', '').strip()
84
+ try:
85
+ number_of_great_analysis_for_user = soup.find('div', class_='user-ga-count').text.replace(',','').replace('lightbulb Created with Sketch.','').strip()
86
+ except:
87
+ number_of_great_analysis_for_user = 0
88
+ post_date = soup.find('div', class_='post-metadata-date').text.replace('Posted:','').strip()
89
+ post_time = soup.find('div', class_='post-metadata-time').text.replace('Time:', '').strip()
90
+ post_id = url.split('=')[-1]
91
+ try:
92
+ number_of_upvotes = soup.find('div', class_='votes-num has-not-voted').text.strip()
93
+ except:
94
+ number_of_upvotes = 0
95
+ stock_pill = soup.find('span', class_='stock-pill').text.strip()
96
+ stock_pill_link = f"https://hotcopper.com.au{soup.find('span', class_='stock-pill').find('a')['href']}"
97
+ for meta_detail in soup.find_all('span', class_='meta-details'):
98
+ if 'Price at posting:' in meta_detail.text:
99
+ price_at_posting = meta_detail.text.replace('Price at posting:', '').strip()
100
+ if 'Sentiment:' in meta_detail.text:
101
+ sentiment = meta_detail.text.replace('Sentiment:', '').strip()
102
+ if 'Disclosure' in meta_detail.text:
103
+ disclosure = meta_detail.text.replace('Disclosure:', '').strip()
104
+
105
+ message = soup.find('blockquote', class_='message-text ugc baseHtml').get_text(strip=True)
106
+ if '↑' in message:
107
+ message = message.split('↑')[1]
108
+ if soup.find('a', class_='AttributionLink') is not None:
109
+ reply_post_id = soup.find('a', class_='AttributionLink')['data-hash']
110
+ reply_post_url = f"https://hotcopper.com.au/{soup.find('a', class_='AttributionLink')['href']}"
111
+ else:
112
+ reply_post_id = None
113
+ reply_post_url = None
114
+ post_data['username'].append(username)
115
+ post_data['number_of_posts_by_user'].append(number_of_posts_by_user)
116
+ post_data['number_of_great_analysis_for_user'].append(number_of_great_analysis_for_user)
117
+ post_data['post_date'].append(post_date)
118
+ post_data['post_time'].append(post_time)
119
+ post_data['post_id'].append(post_id)
120
+ post_data['number_of_upvotes'].append(number_of_upvotes)
121
+ post_data['stock_pill'].append(stock_pill)
122
+ post_data['stock_pill_link'].append(stock_pill_link)
123
+ post_data['price_at_posting'].append(price_at_posting)
124
+ post_data['sentiment'].append(sentiment)
125
+ post_data['disclosure'].append(disclosure)
126
+ post_data['message'].append(message)
127
+ post_data['reply_post_id'].append(reply_post_id)
128
+ post_data['reply_post_url'].append(reply_post_url)
129
+
130
+ return post_data
131
+
132
+ @st.cache_data
133
+ def convert_df(df: pd.DataFrame):
134
+ return df.to_excel('',index=False, engine='openpyxl', sheet_name='Sheet1')
135
+
136
+ def to_excel(df):
137
+ output = BytesIO()
138
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
139
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
140
+ writer.save()
141
+ processed_data = output.getvalue()
142
+ return processed_data
143
+
144
+ def track_progress(futures, total, message):
145
+ progress_bar = st.empty()
146
+ completed = 0
147
+ for future in concurrent.futures.as_completed(futures):
148
+ completed += 1
149
+ progress_bar.progress(completed / total, f'Scraped {completed}/{total} {message}...')
150
+
151
+ st.title('Thread Scraper')
152
+
153
+ st.sidebar.title('Settings')
154
+ company_code = st.sidebar.text_input('Company Code', 'ZIP').lower()
155
+ scrape = st.sidebar.button('Scrape')
156
+
157
+ if st.session_state['is_df']:
158
+ with st.spinner('Creating database ..'):
159
+ df = pd.DataFrame(post_data)
160
+ csv = to_excel(df)
161
+ st.download_button(
162
+ label="Download Data",
163
+ data=to_excel(st.session_state['df']),
164
+ file_name=f'{company_code.upper()}.xlsx',
165
+ mime='application/vnd.ms-excel',
166
+ )
167
+
168
+ if scrape:
169
+ with st.spinner('Getting number of post pages to scrape ..'):
170
+ number_of_posts = get_number_of_posts(company_code)
171
+
172
+ with st.spinner('Getting all post links ..'):
173
+ pages = [f'https://hotcopper.com.au/asx/{company_code}/discussion/page-{page_number}' for page_number in range(1, get_number_of_posts(company_code)+1)]
174
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
175
+ futures = [executor.submit(get_all_posts, url) for url in pages]
176
+ track_progress(futures, len(pages), 'post pages' )
177
+
178
+ with st.spinner('Getting all post data ..'):
179
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
180
+ futures = [executor.submit(get_post, url) for url in post_links]
181
+ track_progress(futures, len(post_links), 'posts')
182
+
183
+ st.session_state['df'] = pd.DataFrame(post_data)
184
+ st.session_state['is_df'] = True
185
+
186
+
187
+
188
+
requirements.txt ADDED
Binary file (2.58 kB). View file