poemsforaphrodite commited on
Commit
7b55067
·
verified ·
1 Parent(s): 7eda627

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +339 -0
app.py CHANGED
@@ -1,3 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def main():
2
  setup_streamlit()
3
  client_config = load_config()
 
1
+ # Standard library imports
2
+ import datetime
3
+ import base64
4
+ import os
5
+
6
+ # Related third-party imports
7
+ import streamlit as st
8
+ from streamlit_elements import elements
9
+ from google_auth_oauthlib.flow import Flow
10
+ from googleapiclient.discovery import build
11
+ from dotenv import load_dotenv
12
+ import pandas as pd
13
+ import searchconsole
14
+ import cohere
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ import requests
17
+ from bs4 import BeautifulSoup
18
+
19
# Load variables from a local .env file into os.environ (no-op if the file is absent).
load_dotenv()

# Initialize Cohere client
# NOTE: raises KeyError at import time if COHERE_API_KEY is not set.
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(COHERE_API_KEY)

# Configuration: Set to True if running locally, False if running on Streamlit Cloud
IS_LOCAL = False

# Constants
# Search types accepted by the Search Console searchAnalytics API.
SEARCH_TYPES = ["web", "image", "video", "news", "discover", "googleNews"]
# Labels offered in the date-range dropdown; translated to day spans by calc_date_range().
DATE_RANGE_OPTIONS = [
    "Last 7 Days",
    "Last 30 Days",
    "Last 3 Months",
    "Last 6 Months",
    "Last 12 Months",
    "Last 16 Months",
    "Custom Range"
]
# Device filter options; "All Devices" disables the filter.
DEVICE_OPTIONS = ["All Devices", "desktop", "mobile", "tablet"]
# Dimensions always available; 'device' is appended for known search types.
BASE_DIMENSIONS = ["page", "query", "country", "date"]
MAX_ROWS = 250_000        # row cap passed to the GSC query
DF_PREVIEW_ROWS = 100     # rows shown in the preview expander
44
+
45
+ # -------------
46
+ # Streamlit App Configuration
47
+ # -------------
48
+
49
def setup_streamlit():
    """Configure the Streamlit page and render the app header.

    Must run before any other Streamlit output (set_page_config requirement).
    """
    st.set_page_config(page_title="Simple Google Search Console Data", layout="wide")
    # Fix: title previously read "Relenvacy" — user-visible typo.
    st.title("GSC Relevancy Score Calculator")
    st.divider()
54
+
55
def init_session_state():
    """Seed st.session_state with a default for every widget-backed key.

    Existing values are left untouched so reruns keep the user's choices.
    """
    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)
    defaults = {
        'selected_property': None,
        'selected_search_type': 'web',
        'selected_date_range': 'Last 7 Days',
        'start_date': week_ago,
        'end_date': today,
        'selected_dimensions': ['page', 'query'],
        'selected_device': 'All Devices',
        'custom_start_date': week_ago,
        'custom_end_date': today,
    }
    for key, default in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default
74
+
75
+ # -------------
76
+ # Data Processing Functions
77
+ # -------------
78
+
79
def fetch_content(url):
    """Download *url* and return its visible text content.

    On any request failure the error message string is returned instead of
    raising, matching the original contract (callers embed the result).
    """
    try:
        # Fix: requests.get() without a timeout can hang the app indefinitely
        # on an unresponsive server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        # NOTE(review): the error text is returned as if it were page content,
        # so the embedding step cannot distinguish failures — consider a
        # sentinel value instead. Preserved for backward compatibility.
        return str(e)
88
+
89
def generate_embeddings(text_list, model_type):
    """Embed *text_list* with Cohere; returns [] for an empty input list.

    model_type 'english' selects embed-english-v3.0, anything else the
    multilingual model.
    """
    if not text_list:
        return []
    model_name = ('embed-english-v3.0'
                  if model_type == 'english'
                  else 'embed-multilingual-v3.0')
    result = co.embed(
        model=model_name,
        texts=text_list,
        input_type='search_document',
    )
    return result.embeddings
97
+
98
def calculate_relevancy_scores(df, model_type):
    """Return *df* with a 'relevancy_score' column added.

    Each row's query embedding is compared with its own page embedding
    (diagonal of the pairwise similarity matrix). Any failure is reported
    as a warning and scores fall back to 0 instead of raising.
    """
    try:
        page_texts = [fetch_content(url) for url in df['page']]
        page_vectors = generate_embeddings(page_texts, model_type)
        query_vectors = generate_embeddings(df['query'].tolist(), model_type)
        scores = cosine_similarity(query_vectors, page_vectors).diagonal()
        return df.assign(relevancy_score=scores)
    except Exception as e:
        st.warning(f"Error calculating relevancy scores: {e}")
        return df.assign(relevancy_score=0)
109
+
110
def process_gsc_data(df):
    """Reduce raw GSC rows to one row per page, keeping the top query.

    Steps: keep only rows ranking outside the top 10 (position > 10),
    sort by impressions descending, and keep the highest-impression query
    for each page. A 'relevancy_score' column of 0 is added if absent.

    Returns a DataFrame with columns:
    page, query, clicks, impressions, ctr, position, relevancy_score.
    """
    df_filtered = df[df['position'] > 10].copy()

    # Highest-impression rows first, then the first occurrence per page is
    # exactly that page's top query.
    df_sorted = df_filtered.sort_values(['impressions'], ascending=[False])
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first')

    # Bug fix: the previous code reassigned relevancy_score from
    # df_sorted.groupby('page')[...].first().values, whose page-sorted order
    # did not match df_unique's impression-sorted order, scrambling scores
    # across pages. The row kept by drop_duplicates already carries its own
    # correct score, so no reassignment is needed.
    if 'relevancy_score' not in df_unique.columns:
        df_unique['relevancy_score'] = 0

    return df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
127
+
128
+ # -------------
129
+ # Google Authentication Functions
130
+ # -------------
131
+
132
def load_config():
    """Build the Google OAuth client configuration from the environment.

    Requires CLIENT_ID and CLIENT_SECRET env vars; raises KeyError if
    either is missing.
    """
    web_settings = {
        "client_id": os.environ["CLIENT_ID"],
        "client_secret": os.environ["CLIENT_SECRET"],
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "redirect_uris": ["https://poemsforaphrodite-gscpro.hf.space/"],
    }
    return {"web": web_settings}
143
+
144
def init_oauth_flow(client_config):
    """Create an OAuth flow scoped to read-only Search Console access."""
    redirect_uri = client_config["web"]["redirect_uris"][0]
    return Flow.from_client_config(
        client_config,
        scopes=["https://www.googleapis.com/auth/webmasters.readonly"],
        redirect_uri=redirect_uri,
    )
152
+
153
def google_auth(client_config):
    """Return (flow, authorization_url) for the Google sign-in step."""
    oauth_flow = init_oauth_flow(client_config)
    authorization_url, _state = oauth_flow.authorization_url(prompt="consent")
    return oauth_flow, authorization_url
157
+
158
def auth_search_console(client_config, credentials):
    """Wrap OAuth *credentials* into an authenticated searchconsole session."""
    token_fields = ("token", "refresh_token", "token_uri",
                    "client_id", "client_secret", "scopes")
    token = {field: getattr(credentials, field) for field in token_fields}
    # id_token may legitimately be absent on the credentials object.
    token["id_token"] = getattr(credentials, "id_token", None)
    return searchconsole.authenticate(client_config=client_config, credentials=token)
169
+
170
+ # -------------
171
+ # Data Fetching Functions
172
+ # -------------
173
+
174
def list_gsc_properties(credentials):
    """Return the Search Console site URLs visible to *credentials*.

    Falls back to a one-element placeholder list when the account has none.
    """
    service = build('webmasters', 'v3', credentials=credentials)
    response = service.sites().list().execute()
    urls = [entry['siteUrl'] for entry in response.get('siteEntry', [])]
    return urls or ["No properties found"]
178
+
179
def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
    """Run a GSC query for *webproperty* and return the processed DataFrame.

    A device filter is applied only when 'device' is among the dimensions
    and a specific device was chosen. On any API/processing error the error
    is displayed and an empty DataFrame is returned.
    """
    query = (webproperty.query
             .range(start_date, end_date)
             .search_type(search_type)
             .dimension(*dimensions))
    wants_device_filter = ('device' in dimensions
                           and device_type
                           and device_type != 'All Devices')
    if wants_device_filter:
        query = query.filter('device', 'equals', device_type.lower())
    try:
        raw = query.limit(MAX_ROWS).get().to_dataframe()
        return process_gsc_data(raw)
    except Exception as e:
        show_error(e)
        return pd.DataFrame()
189
+
190
def fetch_data_loading(webproperty, search_type, start_date, end_date, dimensions, device_type=None, model_type='english'):
    """Fetch GSC data, attach relevancy scores, and return the result.

    Fix: the previous version implicitly returned None when the fetch came
    back empty, forcing callers to special-case it; an (empty) DataFrame is
    now always returned.
    """
    with st.spinner('Fetching data and calculating relevancy scores...'):
        df = fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type)
        if df.empty:
            return df
        df = calculate_relevancy_scores(df, model_type)
        # Re-process so the final frame reflects the new scores.
        return process_gsc_data(df)
197
+
198
+ # -------------
199
+ # Utility Functions
200
+ # -------------
201
+
202
def update_dimensions(selected_search_type):
    """Return the selectable dimensions; known search types also get 'device'."""
    if selected_search_type in SEARCH_TYPES:
        return BASE_DIMENSIONS + ['device']
    return BASE_DIMENSIONS
204
+
205
def calc_date_range(selection, custom_start=None, custom_end=None):
    """Translate a date-range label into a (start_date, end_date) pair.

    'Custom Range' returns the supplied dates when both are given and falls
    back to the last 7 days otherwise. Unknown labels yield (today, today).
    """
    days_back = {
        'Last 7 Days': 7,
        'Last 30 Days': 30,
        'Last 3 Months': 90,
        'Last 6 Months': 180,
        'Last 12 Months': 365,
        'Last 16 Months': 480,
    }
    today = datetime.date.today()
    if selection == 'Custom Range':
        if custom_start and custom_end:
            return custom_start, custom_end
        return today - datetime.timedelta(days=7), today
    return today - datetime.timedelta(days=days_back.get(selection, 0)), today
221
+
222
def show_error(e):
    """Display exception *e* as a Streamlit error banner."""
    st.error(f"An error occurred: {e}")
224
+
225
def property_change():
    """on_change callback: persist the property dropdown's value in session state."""
    st.session_state.selected_property = st.session_state['selected_property_selector']
227
+
228
+ # -------------
229
+ # File & Download Operations
230
+ # -------------
231
+
232
def show_dataframe(report):
    """Preview the first DF_PREVIEW_ROWS rows of *report* inside an expander."""
    with st.expander("Preview the First 100 Rows (Unique Pages with Top Query)"):
        st.dataframe(report.head(DF_PREVIEW_ROWS))
235
+
236
def download_csv_link(report):
    """Render an HTML anchor that downloads *report* as a CSV file.

    The CSV is base64-embedded in a data: URI (utf-8-sig so Excel detects
    the encoding).
    """
    csv_text = report.to_csv(index=False, encoding='utf-8-sig')
    encoded = base64.b64encode(csv_text.encode()).decode()
    link = f'<a href="data:file/csv;base64,{encoded}" download="search_console_data.csv">Download CSV File</a>'
    st.markdown(link, unsafe_allow_html=True)
243
+
244
+ # -------------
245
+ # Streamlit UI Components
246
+ # -------------
247
+
248
def show_google_sign_in(auth_url):
    """Render the sidebar sign-in button; clicking it reveals the OAuth link.

    NOTE(review): Streamlit reruns the script on each interaction, so the
    link is only visible during the run where the button was pressed —
    confirm this is the intended UX.
    """
    with st.sidebar:
        if st.button("Sign in with Google"):
            st.write('Please click the link below to sign in:')
            st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
253
+
254
def show_property_selector(properties, account):
    """Render the property dropdown and return the matching webproperty object.

    Defaults to the previously selected property when it is still available,
    otherwise to the first entry.
    """
    if st.session_state.selected_property in properties:
        default_index = properties.index(st.session_state.selected_property)
    else:
        default_index = 0
    chosen = st.selectbox(
        "Select a Search Console Property:",
        properties,
        index=default_index,
        key='selected_property_selector',
        on_change=property_change,
    )
    return account[chosen]
264
+
265
def show_search_type_selector():
    """Dropdown for the GSC search type, defaulting to the session's value."""
    current_index = SEARCH_TYPES.index(st.session_state.selected_search_type)
    return st.selectbox(
        "Select Search Type:",
        SEARCH_TYPES,
        index=current_index,
        key='search_type_selector',
    )
272
+
273
def show_model_type_selector():
    """Dropdown choosing between the English-only and multilingual embedding models."""
    model_choices = ["english", "multilingual"]
    return st.selectbox(
        "Select the embedding model:",
        model_choices,
        key='model_type_selector',
    )
279
+
280
def show_date_range_selector():
    """Dropdown for the reporting date range, defaulting to the session's value."""
    current_index = DATE_RANGE_OPTIONS.index(st.session_state.selected_date_range)
    return st.selectbox(
        "Select Date Range:",
        DATE_RANGE_OPTIONS,
        index=current_index,
        key='date_range_selector',
    )
287
+
288
def show_custom_date_inputs():
    """Render start/end date pickers and persist both values in session state."""
    st.session_state.custom_start_date = st.date_input("Start Date", st.session_state.custom_start_date)
    st.session_state.custom_end_date = st.date_input("End Date", st.session_state.custom_end_date)
291
+
292
def show_dimensions_selector(search_type):
    """Multiselect of the report dimensions valid for *search_type*."""
    dimension_choices = update_dimensions(search_type)
    return st.multiselect(
        "Select Dimensions:",
        dimension_choices,
        default=st.session_state.selected_dimensions,
        key='dimensions_selector',
    )
300
+
301
def show_paginated_dataframe(report, rows_per_page=20):
    """Render *report* as a paginated HTML table with clickable page URLs.

    Fixes over the previous version:
    - works on a copy, so the caller's DataFrame is no longer mutated;
    - an empty report no longer shows "Page 1 of 0" (total_pages >= 1);
    - current_page is clamped so it stays valid if the report shrinks
      between reruns.
    """
    report = report.copy()
    # Integer positions read better in the rendered table.
    report['position'] = report['position'].astype(int)

    def make_clickable(url):
        return f'<a href="{url}" target="_blank">{url}</a>'

    report['clickable_url'] = report['page'].apply(make_clickable)

    # Put the clickable URL first and order rows by impressions.
    columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
    report = report[columns].sort_values('impressions', ascending=False)

    total_rows = len(report)
    # max(1, ...) keeps an empty report from producing "Page 1 of 0".
    total_pages = max(1, (total_rows - 1) // rows_per_page + 1)

    if 'current_page' not in st.session_state:
        st.session_state.current_page = 1
    # Clamp in case the data shrank since the page number was stored.
    st.session_state.current_page = min(st.session_state.current_page, total_pages)

    col1, col2, col3 = st.columns([1, 3, 1])
    with col1:
        if st.button("Previous", disabled=st.session_state.current_page == 1):
            st.session_state.current_page -= 1
    with col2:
        st.write(f"Page {st.session_state.current_page} of {total_pages}")
    with col3:
        if st.button("Next", disabled=st.session_state.current_page == total_pages):
            st.session_state.current_page += 1

    start_idx = (st.session_state.current_page - 1) * rows_per_page
    end_idx = start_idx + rows_per_page

    # to_html with escape=False preserves the anchor tags built above.
    st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
336
+ # -------------
337
+ # Main Streamlit App Function
338
+ # -------------
339
+
340
  def main():
341
  setup_streamlit()
342
  client_config = load_config()