pranked03 commited on
Commit
6cb7011
1 Parent(s): 92e09ea

Upload 4 files

Browse files
Files changed (4) hide show
  1. all_funcs.py +373 -0
  2. app.py +153 -0
  3. create_table.py +206 -0
  4. requirements.txt +7 -0
all_funcs.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import validators
2
+ from selectorlib import Extractor
3
+ import requests
4
+ import json
5
+ import time
6
+ import csv
7
+ from dateutil.parser import parse
8
+ import sys, os
9
+ import re
10
+ from datetime import date, datetime
11
+ import numpy as np
12
+ import math
13
+ import concurrent.futures
14
+ import boto3
15
+ import botocore
16
+ from io import StringIO
17
+ import pandas as pd
18
+ import streamlit as st
19
+ import streamlit.components.v1 as components
20
+ import base64
21
+ import uuid
22
+ #import pyperclip
23
+ #from IPython.core.display import HTML
24
+ from bokeh.plotting import figure
25
+ import plotly.express as px
26
+ import plotly.graph_objects as go
27
+
28
+
29
# In[2]:


# SECURITY: AWS credentials were previously hard-coded here and committed to
# source control -- any key that ever appeared in this file must be rotated.
# Read them from the environment instead; when unset, boto3 falls back to its
# default credential chain (shared config, instance role, etc.).
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Low-level S3 client (get_object etc.) for the ap-south-1 region.
s3 = boto3.client("s3",
                  region_name='ap-south-1',
                  aws_access_key_id=AWS_ACCESS_KEY_ID,
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

# High-level S3 resource (Bucket/Object helpers) with the same credentials.
res = boto3.resource("s3",
                     region_name='ap-south-1',
                     aws_access_key_id=AWS_ACCESS_KEY_ID,
                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
44
def getrate(df):
    """Compute review-velocity statistics for one product's review DataFrame.

    Normalizes and sorts df['date'], keeps only verified purchases, then
    returns a 10-tuple:
      df_len             -- number of verified reviews
      deltaT             -- days between oldest and newest review (min 1)
      rate               -- reviews per day (df_len / deltaT)
      ind_time_diff      -- per-review age in days (oldest-first)
      ind_rating         -- per-review rating scaled to 0..1 (0 when missing)
      ind_verified       -- raw 'verified' column values
      ind_helped         -- per-review helpful-vote count + 1
      count_of_day       -- reviews no older than 100 days
      count_of_five_star -- of those, how many are 5-star
      ind_hun_days       -- ages of the reviews within the last 100 days

    NOTE(review): mutates *df* (date column rewritten, in-place sort).
    """
    ind_time_diff = []
    ind_rating = []
    ind_helped = []
    count_of_day = 0
    count_of_five_star = 0

    #print(min(df['date']))

    # Normalize dates to YYYY-MM-DD strings and sort oldest-first.
    df['date'] = pd.to_datetime(df.date, infer_datetime_format = True)
    df['date'] = df['date'].apply(lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
    df.sort_values(by = 'date', inplace = True, ascending=True)
    #df.to_csv('data.csv', index=False)
    # Only verified purchases count toward the stats.
    df = df.query('verified == 1')
    df_len = len(df)
    d0 = parse(min(df['date']))  # oldest review date
    d1 = parse(max(df['date']))  # newest review date
    today = parse(date.today().strftime("%Y-%m-%d"))
    # Age of every review in whole days.
    for i in df["date"].values:
        ind_time_diff.append((today-parse(i)).days)
    # Count reviews that landed within the last 100 days.
    for i in ind_time_diff:
        if i <=100:
            count_of_day+=1
    #print(count_of_day)
    # List is sorted oldest-first, so the newest count_of_day ages are the tail.
    ind_hun_days = ind_time_diff[len(ind_time_diff)-count_of_day:]
    # Ratings scaled to 0..1; missing/blank ratings become 0.
    for i in range(0, len(df['rating'].values)):
        if df['rating'].values[i] == None or df['rating'].values[i] == "" or df['rating'].values[i] == "None":
            ind_rating.append(0)
        else:
            ind_rating.append(float(df['rating'].values[i])/5)
    # Rescale the last-100-days ratings back to 1..5 and count the 5-stars.
    ind_rating_count_of_day = [i*5 for i in ind_rating[len(ind_time_diff)-count_of_day:]]
    for i in ind_rating_count_of_day:
        if i == 5:
            count_of_five_star += 1
    ind_verified = df['verified'].values
    # Helpful votes: +1 so every review carries at least weight 1; values like
    # "1,234" get their thousands separators stripped first.
    for i in range(0, len(df['helped'].values)):
        if df['helped'].values[i] == None:
            ind_helped.append(1)
        else:
            if str(df['helped'].values[i]).isdigit() == True:
                ind_helped.append(int(df['helped'].values[i]) + 1)
            else:
                df['helped'].values[i] = df['helped'].values[i].split(",")
                df['helped'].values[i] = "".join(df['helped'].values[i])
                ind_helped.append(int(df['helped'].values[i]) + 1)

    # Review span in days; clamp to 1 so rate never divides by zero when all
    # reviews share a single date.
    deltaT = abs((d1-d0).days)
    if deltaT == 0:
        deltaT = 1
    #print(deltaT)
    rate = (df_len/deltaT)
    #revenue = rate * int(p[1])
    #print(df_len)
    """print(df['date'])
    print(d0, d1, deltaT)
    print(int(p[1]))
    print(revenue)"""

    return df_len, deltaT, rate, ind_time_diff, ind_rating, ind_verified, ind_helped, count_of_day, count_of_five_star, ind_hun_days
103
+ #p = ["", "1"]
104
+ #df_len, deltaT, rate, revenue = getrate(p)
105
+
106
+
107
+ # In[4]:
108
+
109
+
110
def recordlinks(name, df_len, deltaT, rate, url):
    """Append one product's summary stats as a row of datalist.csv.

    Args:
        name: product title.
        df_len: number of verified reviews counted.
        deltaT: days between first and last review.
        rate: reviews per day (df_len / deltaT).
        url: product-review URL the stats came from.
    """
    to_insert = {
        'product': name,
        'num_reviews': df_len,
        'deltaT': deltaT,
        'rate': rate,
        'url': url,
    }
    # Append-only; the file is expected to already carry its header row.
    # (The original also ran a pd.read_csv here whose result was never used --
    # removed as dead I/O.)
    with open('datalist.csv', 'a', newline="") as savefile:
        writer = csv.DictWriter(savefile, fieldnames=["product", 'num_reviews', "deltaT", "rate", "url"])
        writer.writerow(to_insert)
    print("Saved Data!")
123
+
124
+
125
+ # In[5]:
126
+
127
+
128
def scrape(url, e):
    """Fetch *url* with browser-like headers and run extractor *e* on the HTML.

    Returns the extractor's result dict, or None when the request appears to
    have been blocked (status code above 500).
    """
    headers = {
        'authority': 'www.amazon.in',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US,en-IN;q=0.9,en;q=0.8',
    }

    response = requests.get(url, headers=headers)
    if response.status_code > 500:
        # Amazon serves a distinctive message when it detects scraping.
        was_blocked = "To discuss automated access to Amazon data please contact" in response.text
        if was_blocked:
            print("Page %s was blocked by Amazon. Please try using better proxies %d\n"%(url, response.status_code))
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,response.status_code))
        return None
    return e.extract(response.text)
152
+
153
+
154
+ # In[6]:
155
+
156
+
157
def finding_data(data, url):
    """Normalize one scraped review page's fields in place.

    For every review dict in data['reviews']:
      - title: None becomes the string "None"
      - product/url: stamped from the page title and *url*
      - rating: "4.0 out of 5 stars" -> "4.0" ("None" when absent/unparsable)
      - date: "Reviewed in India on 5 January 2021" -> "01-05-2021"
      - helped: "3 people found..." -> "3" ("One" -> "1"); None -> 0
      - verified: "Verified Purchase" -> "1"; None -> "0"

    Returns *data* (also mutated in place). Fixes over the original:
    identity comparisons with None, and the bare except narrowed to the
    exceptions the rating munging can actually raise.
    """
    if data:
        for r in data['reviews']:
            if r["title"] is None:
                r["title"] = "None"
            r["product"] = data["product_title"]
            r['url'] = url
            try:
                r['rating'] = r['rating'].split(' out of')[0]
            except (AttributeError, KeyError):
                # rating missing, or not a string (e.g. None has no .split)
                r['rating'] = "None"

            # The review date follows "...on <date>".
            date_posted = r['date'].split('on ')[-1]
            r['date'] = parse(date_posted).strftime('%m-%d-%Y')
            if r['helped'] is not None:
                r['helped'] = r['helped'].split(" ")[0]
                if r['helped'] == "One":
                    r['helped'] = "1"
            else:
                r['helped'] = 0
            if r['verified'] is not None:
                r['verified'] = r['verified'].split(" ")[0]
                if r['verified'] == "Verified":
                    r['verified'] = "1"
            else:
                r['verified'] = "0"

    #print(data)
    return data
189
+
190
+ # In[7]:
191
+
192
+ # In[8]:
193
+
194
+
195
def get_nextpage(data):
    """Resolve the extractor's relative next-page link against amazon.in."""
    next_path = data["next_page"]
    return "".join(["https://www.amazon.in", next_path])
197
+
198
+
199
+ # In[9]:
200
+
201
+
202
def clear_none():
    """Reset data.csv to contain only its header row.

    Fixes over the original: removed a redundant explicit close() inside the
    with-block, and added newline="" (the csv-module convention that prevents
    doubled blank lines on Windows).
    """
    with open('data.csv', 'w+', encoding="utf-8", errors="ignore", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=["title","content","date", "author","rating","product","url", "verified", "helped"])
        writer.writeheader()
210
+ #clear_none()
211
+
212
+
213
+ # In[27]:
214
+
215
+
216
def get_details(link):
    """Scrape a product page for its price and Amazon star rating.

    Retries the scrape up to 15 times while 'amazon_given_rating' is missing.
    Returns (price, amazon_rating): price is the cleaned numeric string or the
    placeholder ["", "1"] when absent; amazon_rating is e.g. "4.2" or "-".

    NOTE(review): relies on a global `price_e` (presumably a selectorlib
    Extractor) that is NOT defined in this module -- confirm the importing
    script provides it before calling.
    """
    weight = 0  # parsed-weight placeholder; extraction is commented out below
    count = 0
    details = scrape(link, price_e)
    # Amazon sometimes serves pages without the rating block; bounded retry.
    while details['amazon_given_rating'] == None and count < 15:
        details = scrape(link, price_e)
        print("count: " + str(count))
        count += 1

    if details["price"] == None:
        # Placeholder shape expected by downstream code when no price found.
        details["price"] = ["", "1"]
    else:
        # NOTE(review): checks for the literal character "x" -- presumably
        # meant to detect the "\xa0" (non-breaking space) separator in price
        # text like "\u20b9\xa01,234.00"; confirm against real scrape output.
        if "x" in details["price"]:
            details["price"] = details["price"].split("\xa0")
            # Strip thousands separators, then drop the paise/decimal part.
            details["price"][1] = details["price"][1].split(",")
            details["price"][1] = ["".join(details["price"][1])]
            details["price"][1] = details["price"][1][0].split(".")[0]
        else:
            # Drop the leading currency symbol and keep the remaining digits.
            details["price"] = list(details["price"])
            details["price"].pop(0)
            details["price"] = "".join(details["price"])
        #print(details["price"])

    if details["amazon_given_rating"] == None:
        amazon_rating = "-"
    else:
        # "4.2 out of 5 stars" -> "4.2"
        amazon_rating = details["amazon_given_rating"].split(" out")[0]

    # Fall back to the secondary info selector when the primary one is empty.
    if (details['info'] == None) and (details['info2'] != None):
        details['info'] = details['info2']
        details['info2'] = None

    if details['info'] != None:
        info = details['info']
        #weight = info.split("Weight ")[1][0]
    # Debug output left in by the author.
    print(amazon_rating)
    print(details)

    return details["price"], amazon_rating
259
+
260
+
261
+ # In[28]:
262
+
263
+
264
def relative_rates(timediff, allrating, allverified, all_helped):
    """Score each product from its reviews' age, rating, helpfulness, verification.

    One review's weight is exp(-age**0.25) * rating * (helped / max_helped),
    scaled down by 0.1 when the review is not a verified purchase. The result
    is one rounded sum per product.

    Note: overwrites the entries of *timediff* in place with the per-review
    weights (same side effect as the original implementation).
    """
    # Normalization factor: the single largest helpful-vote count anywhere.
    norm_fact = max(max(helped_row) for helped_row in all_helped)

    for prod in range(len(timediff)):
        for rev in range(len(timediff[prod])):
            age = timediff[prod][rev]
            weight = np.exp(-(age ** (1 / 4))) * allrating[prod][rev] * (all_helped[prod][rev] / norm_fact)
            if int(allverified[prod][rev]) != 1:
                weight = weight * 0.1
            timediff[prod][rev] = round(weight, 5)

    return [round(sum(weights), 5) for weights in timediff]
281
+
282
+
283
+ # In[29]:
284
+
285
+ # In[30]:
286
+
287
+
288
def find_all_links(link, num):
    """Expand one review URL into its per-page URLs (10 reviews per page).

    Replaces the query string of *link* with pageNumber=1..ceil(num/10) and
    resolves each page against the amazon.in domain via get_nextpage.
    """
    pieces = link.split("?")
    page_total = math.ceil(int(num) / 10)
    collected = []
    for page_no in range(1, page_total + 1):
        pieces[1] = "pageNumber=" + str(page_no)
        collected.append(get_nextpage({"next_page": "?".join(pieces)}))
    return collected
298
+
299
+
300
+
301
+ # In[31]:
302
+
303
+
304
def upload(res, asin, file_name):
    """Push the local data.csv to s3://productreviewsdata/alldata/<asin>.csv.

    NOTE: the *file_name* argument is ignored -- the object key is always
    rebuilt from *asin* (parameter kept for interface compatibility).
    """
    key = "alldata/" + (asin + ".csv")
    res.Bucket("productreviewsdata").upload_file("data.csv", key)
308
+
309
+
310
+ # In[32]:
311
+
312
+
313
def find_asin(link):
    """Extract the ASIN from an amazon.in URL.

    Recognizes the path segments /product-reviews/<asin> (taken verbatim) and
    /dp/<asin...> or /product/<asin...> (truncated to 10 characters). When the
    URL contains several markers, the last one wins (as before).

    Raises:
        ValueError: when no marker segment is present. (The original raised
        UnboundLocalError here; callers wrap this in a bare except, so the
        clearer exception type is backward-compatible.)
    """
    asin = None
    parts = link.split("/")
    for i, segment in enumerate(parts):
        if segment == "product-reviews":
            asin = parts[i + 1]
        elif segment in ("dp", "product"):
            asin = parts[i + 1][0:10]
    if asin is None:
        raise ValueError("no ASIN found in URL: %s" % link)
    return asin
323
+
324
+
325
+ # In[33]:
326
+
327
+
328
def get_total_reviews(data):
    """Parse the review-count text into an int, caching it back into *data*.

    Expects text like "4.2 out of 5 stars | 1,234 global ratings": takes the
    first token after "| ", strips thousands separators, and stores/returns
    the integer count (mutating data['total_reviews'], as before).
    """
    after_pipe = data['total_reviews'].split("| ")[1]
    count_token = after_pipe.split(" ")[0]
    total = int(count_token.replace(",", ""))
    data["total_reviews"] = total
    return total
333
+
334
def myFunc(e):
    # Sort-key helper: pulls the computed "Our Rating" value out of a
    # comparison-row dict so rows can be ordered by score.
    return e["Our Rating"]
336
def list_down():
    """Render the current comparison list in the Streamlit page.

    For each saved URL: an expander titled with the bare product URL, an "X"
    remove-button keyed by the list index (main() watches these keys to pop
    links), and an embedded Amazon ad-widget iframe for the product's ASIN.
    """
    all_the_asin = []
    for l in range(0, len(st.session_state.linksFinal)):
        col1, col2= st.columns([2, 0.5])
        # Title is the URL with any "/ref..." suffix trimmed off.
        exp = col1.expander(st.session_state.linksFinal[l].split("/ref")[0])
        col2.button("X", key=str(l))
        ASIN = find_asin(st.session_state.linksFinal[l])
        all_the_asin.append(ASIN)
        # Amazon ad-widget URL parameterized by the ASIN (shown inside the expander).
        the_link = """https://ws-in.amazon-adsystem.com/widgets/q?ServiceVersion=20070822&OneJS=1&Operation=GetAdHtml&MarketPlace=IN&source=ss&ref=as_ss_li_til&ad_type=product_link&tracking_id=universalcont-21&language=en_IN&marketplace=amazon&region=IN&placement="""+ASIN+"""&asins="""+ASIN+"""&show_border=true&link_opens_in_new_window=true"""
        with exp:
            components.iframe(the_link, height=240, width=120)
347
+
348
+
349
+
350
+ #print(globals()["col"])
351
+ #print(globals()["col_an"])
352
+ #for n, val in enumerate(st.session_state["final"]):
353
+ # globals()["var%d"%n] = val
354
+
355
def create_vars(func_col):
    """Render one ad-widget iframe plus remove-button into each Streamlit column.

    Args:
        func_col: the list of column objects returned by st.columns(...).

    NOTE(review): stashes each column into a module-level global ("var0",
    "var1", ...) via globals() -- fragile across reruns and presumably only
    done to address the columns by index; confirm before refactoring.
    """
    for n, val in enumerate(func_col):
        globals()["var%d"%n] = val
    for n in range(0, len(func_col)):
        with globals()["var"+str(n)]:
            try:
                ASIN = find_asin(st.session_state.linksFinal[n])
                the_link = """https://ws-in.amazon-adsystem.com/widgets/q?ServiceVersion=20070822&OneJS=1&Operation=GetAdHtml&MarketPlace=IN&source=ss&ref=as_ss_li_til&ad_type=product_link&tracking_id=universalcont-21&language=en_IN&marketplace=amazon&region=IN&placement="""+ASIN+"""&asins="""+ASIN+"""&show_border=true&link_opens_in_new_window=true"""
                components.iframe(the_link, height=240, width=120)
                # Button key is the column/list index; main() pops the matching link.
                st.button("X", key=str(n))
            except Exception as e:
                # Surface any rendering/ASIN failure in the UI instead of crashing.
                st.write(e)
367
def create_graph(fig, df):
    """Add a cumulative review-count trace for one product to *fig*.

    Normalizes df['date'] to YYYY-MM-DD strings and sorts by date in place,
    then plots review index (1..n) against date. The trace is labelled with
    the first 20 characters of the product title. Mutates *df*; returns *fig*.
    """
    df['date'] = pd.to_datetime(df.date, infer_datetime_format=True)
    df['date'] = df['date'].apply(lambda ts: pd.Timestamp(ts).strftime('%Y-%m-%d'))
    df.sort_values(by='date', ascending=True, inplace=True)
    cumulative = list(range(1, len(df) + 1))
    label = list(set(df["product"]))[0][0:20] + "..."
    fig.add_trace(go.Scatter(x=df["date"], y=cumulative, name=label))
    return fig
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import uuid
3
+ import boto3
4
+ import botocore
5
+ import streamlit.components.v1 as components
6
+ from streamlit_autorefresh import st_autorefresh
7
+ import requests
8
+ from all_funcs import *
9
+ from create_table import *
10
import os

# SECURITY: AWS credentials were previously hard-coded here and committed to
# source control -- any key that ever appeared in this file must be rotated.
# Read them from the environment instead; when unset, boto3 falls back to its
# default credential chain.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Low-level client (get_object) and high-level resource (Object.put) handles
# for the ap-south-1 region.
s3 = boto3.client("s3",
                  region_name='ap-south-1',
                  aws_access_key_id=AWS_ACCESS_KEY_ID,
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

res = boto3.resource("s3",
                     region_name='ap-south-1',
                     aws_access_key_id=AWS_ACCESS_KEY_ID,
                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
22
# Hide Streamlit chrome (hamburger menu, footer) and a few generated widget
# classes via injected CSS.
# NOTE(review): the css-* class names are build-specific hashes and may break
# on Streamlit upgrades.
st.markdown("""
<style>
#MainMenu{visibility: hidden;}
td.css-57lzw8:nth-of-type(4){}
footer, label.css-zyb2jl, img.css-1jhkrss, button.css-bl767a {visibility: hidden;}
.copy-button{color:red;}

</style>

""", unsafe_allow_html=True)

# One-time session-state initialisation (first rerun of a fresh session).
if "iden" not in st.session_state:
    st.session_state["iden"] = None          # "<uuid>.txt" once a comparison ID exists
    st.session_state["sesInBucket"] = None   # True when the ID was found in S3
    st.session_state['dataInBucket'] = None  # raw saved-sessions file body from S3
    st.session_state['linksFinal'] = []      # URLs currently being compared
    st.session_state['editLinks'] = []
    st.session_state['chosen'] = ""          # currently selected saved comparison
    # NOTE(review): a button below also uses key="refresh" -- potential
    # session-state key collision; confirm intended.
    st.session_state["refresh"] = ""

# Sidebar slot that later shows the comparison ID, download button, etc.
id_place_con = st.sidebar.container()
43
+
44
def from_session():
    """Load saved comparisons from the S3 session body and offer a selectbox.

    st.session_state.dataInBucket holds comma-separated comparison blocks,
    each a newline-separated list of URLs. Picking a different comparison
    replaces st.session_state.linksFinal with its (de-duplicated) URLs.

    Returns:
        The list of raw comparison blocks (strings).
    """
    already_in_body = st.session_state.dataInBucket
    sessions_here = already_in_body.split(",")
    a = []
    # Human-facing labels "Comparison 1".."Comparison N" for the selectbox.
    indices = [("Comparison "+ str(num)) for num in range(1, len(sessions_here)+1)]
    comparison_data_con = st.sidebar.container()
    with comparison_data_con:
        chosen = st.selectbox("Choose Session:", indices)
        # De-duplicate the chosen block's URLs; drop the blank-line artifact
        # left by trailing newlines.
        a = list(set(sessions_here[indices.index(chosen)].split("\n")))
        a.remove("")
        if st.session_state["refresh"] == True:
            st.session_state.linksFinal = []
        # Only overwrite the working list when the selection actually changed.
        if chosen != st.session_state.chosen:
            st.session_state.chosen = chosen
            st.session_state["a"] = a
            st.session_state["linksFinal"] = a
    return sessions_here
61
+
62
def main():
    """Main app body, shown once a comparison ID exists in session state."""
    #if "hey" not in st.session_state:
    # Force one quick rerun (limit=2) so widgets drawn after state changes settle.
    count = st_autorefresh(interval=1, limit=2, key="hey")
    #st.write(st.session_state)
    # Comparison-ID panel in the sidebar.
    id_place_con.text("Comparison ID:")
    id_place_con.code(st.session_state.iden.replace(".txt", ""))
    id_place_con.download_button("Download ID", st.session_state.iden.replace(".txt", ""), file_name="Session ID.txt")
    id_place_con.warning("Keep Comparison ID to access and save your comparisons.")
    id_place_con.markdown("<hr>", unsafe_allow_html=True)
    # Saved comparisons only exist when the ID was found in the bucket.
    if st.session_state.sesInBucket==True:
        sessions_here = from_session()
    else:
        sessions_here = []
    #st.write(st.session_state.linksFinal)
    # Any clicked "X" button (its key is the numeric list index) removes its link.
    if len(st.session_state) > 1:
        for k in st.session_state:
            if st.session_state[k] == True and k.isdigit():
                st.session_state["linksFinal"].pop(int(k))

    # URL entry form: probe the URL and accept only reachable amazon.in links
    # that are not already in the list.
    with st.sidebar.form(key='my_form'):
        placeholder = st.empty()
        s = placeholder.text_input(label='Enter URL')
        submit = st.form_submit_button(label='Submit')
        if submit:
            try:
                check_paste = requests.get(s)
                if s in st.session_state["linksFinal"] or s.find("amazon.in") == -1:
                    pass
                else:
                    st.session_state["linksFinal"].append(s)

            except:
                st.error('Not a valid URL')
    conf1, refre1 = st.sidebar.columns([1, 1])
    confirm = conf1.button("Compare")
    refresh = refre1.button("Empty List", key="refresh")
    if refresh:
        st.session_state.linksFinal = []

    # Show the ad-widget columns for whatever is in the list.
    if len(st.session_state.linksFinal) == 0:
        pass
    else:
        exp=st.expander("Expand", expanded=True)
        with exp:
            create_vars(st.columns(len(st.session_state.linksFinal)))
    # "Compare": build the table/graph and persist this comparison to S3.
    if confirm:
        string = create_table(st.session_state.linksFinal)
        save_data_in_session(string, st.session_state.sesInBucket, sessions_here)
110
#count = st_autorefresh(interval=1, limit=2)
# Entry point: run the app when a comparison ID exists, otherwise show the
# enter-an-ID / create-an-ID onboarding flow in the sidebar.
if st.session_state.iden != None:
    main()

else:
    enter_it = st.sidebar.container()
    lol2 = st.sidebar.container()
    create_it = st.sidebar.container()
    with enter_it:
        textPlace = st.empty()
        produce_error = st.empty()
        enter_uni_id = textPlace.text_input("Enter Comparison ID if you have one:")
        if enter_uni_id == "":
            pass
        else:
            try:
                # Look the ID up in S3; success loads the saved sessions body.
                check_iden = s3.get_object(Bucket="productreviewsdata", Key="sessions/"+enter_uni_id+".txt")
                st.session_state.iden = enter_uni_id + ".txt"
                st.session_state.sesInBucket = True
                st.session_state.dataInBucket = already_in_body = check_iden["Body"].read().decode()
                textPlace.empty()
                produce_error.empty()
            except Exception as e:
                produce_error.error("Comparison ID not found!")

    with lol2:
        or_thing = st.empty()
        or_thing.write("OR")
    with create_it:
        create_it_button = st.empty()
        thing = create_it_button.button("Create Comparison ID")
        if thing == True:
            # Fresh comparison: random UUID; nothing exists in the bucket yet.
            iden = str(uuid.uuid4())
            st.session_state["iden"] = iden + ".txt"
            st.session_state.sesInBucket = False
    if st.session_state.iden != None:
        # ID just entered/created: clear the onboarding widgets and run the app.
        textPlace.empty()
        or_thing.empty()
        create_it_button.empty()
        produce_error.empty()
        main()
151
+
152
+
153
+
create_table.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import validators
2
+ from selectorlib import Extractor
3
+ import requests
4
+ import json
5
+ import time
6
+ import csv
7
+ from dateutil.parser import parse
8
+ import sys, os
9
+ import re
10
+ from datetime import date, datetime
11
+ import numpy as np
12
+ import math
13
+ import concurrent.futures
14
+ import boto3
15
+ import botocore
16
+ from io import StringIO
17
+ import pandas as pd
18
+ import streamlit as st
19
+ import streamlit.components.v1 as components
20
+ import base64
21
+ import uuid
22
+ #import pyperclip
23
+ #from IPython.core.display import HTML
24
+ from bokeh.plotting import figure
25
+ import plotly.express as px
26
+ import plotly.graph_objects as go
27
+ from all_funcs import *
28
+
29
+
30
def create_table(theurls):
    """Build the comparison table and review graph for a list of product URLs.

    For each URL: derive the ASIN, fetch its cached review CSV from S3, and
    accumulate per-product stats via getrate(). When no CSV exists yet, drop
    an empty one in the bucket (which acts as a "please scrape this" request)
    and show a processing message instead. Only when every URL had usable
    data is the sorted table plus Plotly graph rendered.

    Returns:
        A newline-joined string of the URLs (or review URLs) making up this
        comparison, used as the session record by save_data_in_session().
    """
    e = Extractor.from_yaml_file('selectors.yml')
    all_five_star = []
    all_time_diff = []
    all_hun_days = []
    all_rating = []
    all_verified = []
    all_helped = []
    urls_used = []
    product_names = []
    all_reviews = []
    all_amazon_ratings = []
    all_count_of_day = []
    string = ""
    fig = go.Figure()
    prime = False  # stays True only while every URL so far yielded data
    today = parse(date.today().strftime("%Y-%m-%d"))
    url_dataframe = pd.DataFrame()

    spin = st.empty()
    stat = st.empty()
    print(theurls)
    for i in theurls:
        try:
            asin = find_asin(i)
            print(asin)
            # A real ASIN is exactly 10 characters.
            if len(asin) != 10:
                raise ValueError
        except:
            st.error("ASIN NUMBER NOT FOUND IN URL! PLEASE CHECK FORMAT OF URL")
            prime = False
            break
        file_name = asin+'.csv'
        print(file_name)
        try:
            # Cached review data for this ASIN, if a scrape already ran.
            df = s3.get_object(Bucket='productreviewsdata', Key="alldata/"+file_name)
            body = df["Body"].read().decode('utf-8')
            df_data = pd.read_csv(StringIO(body))
            try:
                title = list(set(df_data["product"]))[0]
                print(list(set(df_data["title"])))
                # A lone "-" title marks a product with zero reviews.
                if list(set(df_data["title"]))[0] == "-":
                    st.error(title + " has 0 reviews. Please remove it from your list and try again!")
                    break

            except IndexError:
                # Empty CSV: scrape was requested but not fulfilled yet.
                string = string + "https://www.amazon.in/product-reviews/"+asin+"\n"
                break
            stat.info("Getting " + title + "....")
            product_names.append(title)
            try:
                all_amazon_ratings.append(str(list(set(df_data["amazon_rating"]))[0]))
            except:
                # Older CSVs may lack the amazon_rating column.
                all_amazon_ratings.append("-")
            urls_used.append(list(set(df_data["url"]))[0])
            string = string+list(set(df_data["url"]))[0]+"\n"
            #st.write(df_data)
            if len(df_data)==0:
                pass
                #string = string + "https://www.amazon.in/product-reviews/"+asin+"\n"
                #st.write(string)
            else:
                # Accumulate this product's stats for the combined table/graph.
                fig = create_graph(fig, df_data)
                df_len, deltaT, rate, ind_time_diff, ind_rating, ind_verified, ind_helped, count_of_day, count_of_five_star, ind_hun_days = getrate(df_data)
                #print(df_len)
                all_reviews.append(str(df_len))
                all_time_diff.append(ind_time_diff)
                all_rating.append(ind_rating)
                all_verified.append(ind_verified)
                all_helped.append(ind_helped)
                all_count_of_day.append(count_of_day)
                all_five_star.append(count_of_five_star)
                all_hun_days.append(ind_hun_days)
                prime=True

        except botocore.exceptions.ClientError:
            # No CSV for this ASIN yet: put an empty one in the bucket as a
            # scrape request and record the review URL for the session.
            st.info("Request sent for " + asin)
            create_df = pd.DataFrame({"title":[], "content": [], 'date':[], "author": [], "rating":[], "product":[], "url":[], "verified":[], "helped": [], "amazon_rating": []})
            bucket = 'productreviewsdata'
            csv_buffer = StringIO()
            create_df.to_csv(csv_buffer, index=False)
            res.Object(bucket, 'alldata/'+asin+'.csv').put(Body=csv_buffer.getvalue())
            string = string + "https://www.amazon.in/product-reviews/"+asin+"\n"
            prime=False
    dataf = pd.DataFrame({'Product': [],
                          'Our Rating': [],
                          'Total Verified Purchases': [],
                          'No. of Verified Purchases in last 100 days':[],
                          'No. of Verified Purchases that have 5 stars in the last 100 days':[],
                          'Amazon Rating': [],
                          'URL': []})

    # Render only when every compared link produced data.
    if prime and len(all_time_diff) == len(st.session_state["linksFinal"]):
        fig.update_layout(
            title="Graph of reviews",
            xaxis_title="Date",
            yaxis_title="No. of Reviews",
            legend_title="Products",
            font=dict(
                family="Courier New, monospace",
                color="black"))
        rates = relative_rates(all_time_diff, all_rating, all_verified, all_helped)
        for record in range(0, len(urls_used)):
            #dataf.append([product_names[record], all_reviews[record], rates[record], all_amazon_ratings[record]])

            to_insert = {
                'Product': product_names[record][:70]+"...",
                'Our Rating': rates[record],
                'Total Verified Purchases': all_reviews[record],
                'No. of Verified Purchases in last 100 days': str(all_count_of_day[record]),
                'No. of Verified Purchases that have 5 stars in the last 100 days': str(all_five_star[record]),
                'Amazon Rating': all_amazon_ratings[record],
                'URL': urls_used[record]
            }
            # NOTE(review): DataFrame.append was removed in pandas 2.0 --
            # migrate to pd.concat when upgrading pandas.
            dataf = dataf.append(to_insert, ignore_index=True)
        # Best score first.
        dataf = dataf.sort_values(by=['Our Rating'], ascending=False)
        dataf.set_index('Product', inplace=True)
        stat.empty()
        #st.table(dataf.style.format({"Total Reviews": "{:.0f}"}))

        st.table(dataf)
        st.plotly_chart(fig)
        #st.dataframe(dataf)
    else:
        # At least one product is still pending a scrape.
        stat.empty()
        #reqs_spin.empty()
        spin.info("Your request is being processed...")

        time.sleep(10)
    #st.write(string)
    return string
161
+
162
def save_data_in_session(string, prime_session, sessions_here):
    """Persist the current comparison's URL list to the S3 sessions file.

    Args:
        string: newline-joined URLs returned by create_table().
        prime_session: True when the session file already exists in S3.
        sessions_here: previously saved comparison blocks (from from_session()).

    Saves only when the URL count matches the links being compared (i.e. the
    comparison completed), and skips exact duplicates of an already-saved
    comparison.
    """
    if prime_session ==True:
        s_check = string.split("\n")
        # Strip the empty strings left by trailing newlines.
        try:
            while True:
                s_check.remove("")
        except ValueError:
            pass
        print("THIS")
        print(s_check)
        if len(s_check) != len(st.session_state.linksFinal):
            # Incomplete comparison (some product still pending) -- don't save.
            pass
        else:
            for ses in sessions_here:
                ses_check = ses.split("\n")
                try:
                    while True:
                        ses_check.remove("")
                except ValueError:
                    pass
                print("ses_check")
                print(ses_check)
                if set(s_check) == set(ses_check):
                    # Identical comparison already saved -- skip (breaks out
                    # of the for, bypassing its else-clause below).
                    break
            else:
                # for/else: no duplicate found -- append as a new
                # comma-separated block and overwrite the session object.
                print("HIIIIIIIIIIIIII")
                string = st.session_state.dataInBucket+",\n"+string
                st.success("Session Saved")
                res.Object('productreviewsdata', 'sessions/'+st.session_state["iden"]).put(Body=string)

    else:
        # First save for a freshly created ID: no duplicate check needed.
        s_check = string.split("\n")
        try:
            while True:
                s_check.remove("")
        except ValueError:
            pass
        if len(s_check) !=len(st.session_state.linksFinal):
            pass
        else:
            st.success("Session Saved")
            res.Object('productreviewsdata', 'sessions/'+st.session_state["iden"]).put(Body=string)
+
205
+
206
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ python-dateutil
+ requests
+ selectorlib
+ numpy
+ boto3
+ pandas
+ streamlit
+ streamlit-autorefresh
+ plotly
+ validators
+ bokeh