poemsforaphrodite committed on
Commit
67a6275
1 Parent(s): 9f4df6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -70
app.py CHANGED
@@ -120,8 +120,6 @@ def init_session_state():
120
  st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
121
  if 'custom_end_date' not in st.session_state:
122
  st.session_state.custom_end_date = datetime.date.today()
123
- if 'relevancy_scores' not in st.session_state:
124
- st.session_state.relevancy_scores = {}
125
  #logging.info("Session state initialized")
126
 
127
  # -------------
@@ -223,21 +221,20 @@ def fetch_content(url, query):
223
  except requests.RequestException:
224
  return ""
225
 
226
- def calculate_relevance_score(page_content, query, co, model_type='english'):
 
227
  try:
228
- if not page_content.strip():
229
- st.warning("Page content is empty. Cannot calculate relevance score.")
230
- return 0
231
- model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
232
- embeddings = co.embed(texts=[page_content, query], model=model, input_type=['search_document', 'search_query'])
233
- page_embedding = embeddings.embeddings[0]
234
- query_embedding = embeddings.embeddings[1]
235
- if not any(page_embedding) or not any(query_embedding):
236
- st.warning("One of the embeddings is empty. Returning a score of 0.")
237
  return 0
 
 
 
238
  score = cosine_similarity([query_embedding], [page_embedding])[0][0]
 
239
  return score
240
  except Exception as e:
 
241
  st.error(f"Error calculating relevance score: {str(e)}")
242
  return 0
243
 
@@ -247,47 +244,46 @@ def normalize_url(url):
247
  def analyze_competitors(row, co, custom_url=None, country_code=None):
248
  query = row['query']
249
  our_url = normalize_url(row['page'])
250
- model_type = row.get('model_type', 'english') # Assuming you store model_type per row
251
-
252
  competitor_data = get_serp_results(query, country_code)
253
-
254
  results = []
255
- our_url_found = False # Flag to check if our URL is in the results
256
-
257
  for data in competitor_data:
258
  competitor_url = normalize_url(data['url'])
259
- score = calculate_relevance_score(data['content'], query, co, model_type=model_type)
260
- is_our = competitor_url == our_url
261
- if is_our:
262
- our_url_found = True
263
  results.append({
264
  'Position': data['position'],
265
  'URL': competitor_url,
266
  'Score': score,
267
- 'is_our_url': is_our
268
  })
269
-
270
- # Retrieve "Our Score" from the main data table
271
- our_score = st.session_state.relevancy_scores.get(our_url, 0)
272
-
273
- if not our_url_found:
274
  results.append({
275
  'Position': len(results) + 1,
276
- 'URL': f"{our_url} (Our URL)",
277
  'Score': our_score,
278
  'is_our_url': True
279
  })
280
-
281
- # Sort results by position in ascending order
282
  results = sorted(results, key=lambda x: x['Position'])
283
-
284
  # Create DataFrame
285
  results_df = pd.DataFrame(results)
286
  results_df['Position'] = results_df['Position'].astype(int)
287
-
 
 
 
 
 
288
  # Keep only the columns we want to display
289
  results_df = results_df[['Position', 'URL', 'Score']]
290
-
291
  return results_df
292
 
293
  def show_competitor_analysis(row, co, country_code):
@@ -411,10 +407,19 @@ def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, d
411
 
412
 
413
  def calculate_relevancy_scores(df, model_type):
414
- for index, row in df.iterrows():
415
- if pd.isna(row['relevancy_score']) or row['relevancy_score'] == 0:
416
- score = calculate_single_relevancy(row)
417
- df.at[index, 'relevancy_score'] = score
 
 
 
 
 
 
 
 
 
418
  return df
419
 
420
  # -------------
@@ -517,9 +522,7 @@ def show_model_type_selector():
517
  def calculate_single_relevancy(row):
518
  page_content = fetch_content(row['page'], row['query'])
519
  query = row['query']
520
- model_type = st.session_state.get('model_type_selector', 'english') # Retrieve from session state
521
- score = calculate_relevance_score(page_content, query, co, model_type=model_type)
522
- st.session_state.relevancy_scores[normalize_url(row['page'])] = score # Ensure score is stored
523
  return score
524
 
525
  def compare_with_top_result(row, co, country_code):
@@ -540,11 +543,9 @@ def compare_with_top_result(row, co, country_code):
540
  our_content = fetch_content(our_url, query)
541
  top_content = top_result['content']
542
 
543
- # Retrieve "Our Score" from the main data table
544
- our_score = st.session_state['relevancy_scores'].get(normalize_url(our_url), 0)
545
-
546
  # Calculate relevancy scores
547
- top_score = calculate_relevance_score(top_content, query, co, model_type=row.get('model_type', 'english'))
 
548
 
549
  # Prepare prompt for GPT-4
550
  prompt = f"""
@@ -575,12 +576,12 @@ def compare_with_top_result(row, co, country_code):
575
 
576
  # Display results
577
  st.subheader("Content Comparison Analysis")
578
- st.write(f"**Query:** {query}")
579
- st.write(f"**Top-ranking URL:** {top_url}")
580
- st.write(f"**Our URL:** {our_url}")
581
- st.write(f"**Top-ranking score:** {top_score:.4f}")
582
- st.write(f"**Our score:** {our_score:.4f}")
583
- st.write("**Analysis:**")
584
  st.write(analysis)
585
  except Exception as e:
586
  st.error(f"Error in GPT-4 analysis: {str(e)}")
@@ -628,16 +629,12 @@ def show_tabular_data(df, co, country_code):
628
  )
629
  if st.button("Click here to calculate relevancy for selected pages"):
630
  selected_indices = [i for i, selected in enumerate(st.session_state.selected_rows) if selected]
631
- if selected_indices:
632
- progress_bar = st.progress(0)
633
- for i, index in enumerate(selected_indices):
634
- if pd.isna(df.at[index, 'relevancy_score']) or df.at[index, 'relevancy_score'] == 0:
635
- df.at[index, 'relevancy_score'] = calculate_single_relevancy(df.iloc[index])
636
- progress_bar.progress((i + 1) / len(selected_indices))
637
- st.success(f"Calculated relevancy scores for {len(selected_indices)} selected rows.")
638
- st.session_state.report_data = df # Update the report_data in session state
639
- else:
640
- st.warning("No rows selected. Please select at least one row to calculate relevancy.")
641
 
642
  # Display column headers
643
  cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
@@ -650,8 +647,9 @@ def show_tabular_data(df, co, country_code):
650
  cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
651
 
652
  # Checkbox for row selection
653
- cols[0].checkbox("Select", key=f"select_{i}", value=st.session_state.selected_rows[i],
654
- on_change=lambda idx=i: update_selected_rows(idx))
 
655
 
656
  # Truncate and make the URL clickable
657
  truncated_url = row.page[:30] + '...' if len(row.page) > 30 else row.page
@@ -680,7 +678,7 @@ def show_tabular_data(df, co, country_code):
680
  if st.session_state[competitor_state_key]:
681
  st.write(f"Competitor Analysis for: {row.query}")
682
  with st.spinner('Analyzing competitors...'):
683
- results_df = analyze_competitors(row, co, country_code=country_code)
684
 
685
  # Sort the results by Position in ascending order
686
  results_df = results_df.sort_values('Position', ascending=True).reset_index(drop=True)
@@ -720,11 +718,11 @@ def show_tabular_data(df, co, country_code):
720
  st.warning("Your page's relevancy score is in the lower half of the results. Consider optimizing your content.")
721
  else:
722
  st.error(f"Our page '{row.page}' is not in the results. This indicates an error in fetching or processing the page.")
723
-
724
  if compare_state_key not in st.session_state:
725
  st.session_state[compare_state_key] = False
726
 
727
- if cols[8].button("Compare Your Relevancy Score to the Page In First Place", key=compare_button_key):
728
  st.session_state[compare_state_key] = True
729
 
730
  if st.session_state[compare_state_key]:
@@ -734,9 +732,6 @@ def show_tabular_data(df, co, country_code):
734
 
735
  return df # Return the updated dataframe
736
 
737
- def update_selected_rows(idx):
738
- st.session_state.selected_rows[idx] = not st.session_state.selected_rows[idx]
739
-
740
  def show_date_range_selector():
741
  # logging.info("Showing date range selector")
742
  return st.selectbox(
@@ -889,8 +884,7 @@ def main():
889
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
890
  st.write("Data fetched successfully.")
891
 
892
- updated_df = show_tabular_data(st.session_state.report_data, co, country_code)
893
- st.session_state.report_data = updated_df # Update the report_data with the potentially modified dataframe
894
 
895
  download_csv_link(st.session_state.report_data)
896
  elif st.session_state.report_data is not None:
 
120
  st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
121
  if 'custom_end_date' not in st.session_state:
122
  st.session_state.custom_end_date = datetime.date.today()
 
 
123
  #logging.info("Session state initialized")
124
 
125
  # -------------
 
221
  except requests.RequestException:
222
  return ""
223
 
224
def calculate_relevance_score(page_content, query, co):
    """Return the cosine similarity between the embeddings of a page and a query.

    Args:
        page_content: Text of the page to score. ``None``, an empty string or a
            whitespace-only string short-circuits to a score of 0.
        query: Search query the page is scored against.
        co: Embedding client exposing ``embed(texts=..., model=..., input_type=...)``
            whose result has an ``embeddings`` list (presumably a Cohere
            client, given the model name -- confirm against the caller).

    Returns:
        The similarity score produced by ``cosine_similarity``, or 0 when the
        page has no usable text or when any step raises.
    """
    try:
        # Blank content carries no signal; skip the two embedding requests
        # instead of sending whitespace to the API.
        if not page_content or not page_content.strip():
            return 0

        # The page and the query are embedded separately because each one
        # needs its own input_type.
        page_embedding = co.embed(
            texts=[page_content],
            model='embed-english-v3.0',
            input_type='search_document',
        ).embeddings[0]
        query_embedding = co.embed(
            texts=[query],
            model='embed-english-v3.0',
            input_type='search_query',
        ).embeddings[0]

        # cosine_similarity returns a 1x1 matrix for a single pair.
        return cosine_similarity([query_embedding], [page_embedding])[0][0]
    except Exception as e:
        # Best-effort scoring: surface the problem in the UI and fall back to 0
        # so one failing page does not abort the whole analysis.
        st.error(f"Error calculating relevance score: {str(e)}")
        return 0
240
 
 
244
def analyze_competitors(row, co, custom_url=None, country_code=None):
    """Score every SERP result for a row's query and return them as a table.

    Args:
        row: Mapping with at least ``'query'`` and ``'page'`` keys.
        co: Embedding client forwarded to ``calculate_relevance_score``.
        custom_url: Accepted for interface compatibility; not used here.
        country_code: Forwarded to ``get_serp_results``.

    Returns:
        A DataFrame with ``Position``, ``URL`` and ``Score`` columns, sorted by
        position. The row for our own page has `` (Our URL)`` appended to its
        URL. If our page is not among the SERP results, it is added as a final
        row with position ``len(results) + 1``.
    """
    query = row['query']
    our_url = normalize_url(row['page'])

    results = []
    for data in get_serp_results(query, country_code):
        competitor_url = normalize_url(data['url'])
        score = calculate_relevance_score(data['content'], query, co)
        results.append({
            'Position': data['position'],
            'URL': competitor_url,
            'Score': score,
            'is_our_url': competitor_url == our_url,
        })

    # Only fetch and embed our own page when the SERP did not already include
    # it; otherwise the score would be computed and then thrown away.
    if not any(entry['is_our_url'] for entry in results):
        our_content = fetch_content(our_url, query)
        our_score = calculate_relevance_score(our_content, query, co)
        results.append({
            'Position': len(results) + 1,
            'URL': our_url,
            'Score': our_score,
            'is_our_url': True,
        })

    # Sort results by position
    results = sorted(results, key=lambda entry: entry['Position'])

    # Create DataFrame
    results_df = pd.DataFrame(results)
    results_df['Position'] = results_df['Position'].astype(int)

    # Mark our URL so it stands out once the helper column is dropped
    results_df['URL'] = results_df.apply(
        lambda entry: f"{entry['URL']} (Our URL)" if entry['is_our_url'] else entry['URL'],
        axis=1,
    )

    # Keep only the columns we want to display
    return results_df[['Position', 'URL', 'Score']]
288
 
289
  def show_competitor_analysis(row, co, country_code):
 
407
 
408
 
409
def calculate_relevancy_scores(df, model_type):
    """Return a copy of ``df`` with a ``relevancy_score`` column for every row.

    Each row's page is fetched, pages and queries are embedded with
    ``generate_embeddings``, and each query is scored against its own page.

    Args:
        df: DataFrame with ``'page'`` and ``'query'`` columns.
        model_type: Forwarded to ``generate_embeddings``.

    Returns:
        ``df`` with ``relevancy_score`` assigned. If any step raises, a warning
        is shown and every score is set to 0.
    """
    with st.spinner('Calculating relevancy scores...'):
        try:
            # fetch_content takes (url, query), so pair each page with the
            # query from the same row. Calling it with the URL alone raised a
            # TypeError that the handler below turned into all-zero scores.
            page_contents = [
                fetch_content(url, query)
                for url, query in zip(df['page'], df['query'])
            ]
            page_embeddings = generate_embeddings(page_contents, model_type)
            query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
            # Entry (i, j) compares query i with page j; only same-row pairs
            # matter, hence the diagonal.
            relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
            df = df.assign(relevancy_score=relevancy_scores)
        except Exception as e:
            # Best-effort: keep the table usable and tell the user why the
            # scores are missing.
            st.warning(f"Error calculating relevancy scores: {e}")
            df = df.assign(relevancy_score=0)
    return df
424
 
425
  # -------------
 
522
def calculate_single_relevancy(row):
    """Fetch one row's page and score it against that row's query.

    ``row`` must provide ``'page'`` and ``'query'``. The embedding client is
    read from the enclosing scope as ``co`` (not defined in this function).
    Returns whatever ``calculate_relevance_score`` returns.
    """
    page_url = row['page']
    query = row['query']
    fetched_text = fetch_content(page_url, query)
    return calculate_relevance_score(fetched_text, query, co)
527
 
528
  def compare_with_top_result(row, co, country_code):
 
543
  our_content = fetch_content(our_url, query)
544
  top_content = top_result['content']
545
 
 
 
 
546
  # Calculate relevancy scores
547
+ our_score = calculate_relevance_score(our_content, query, co)
548
+ top_score = calculate_relevance_score(top_content, query, co)
549
 
550
  # Prepare prompt for GPT-4
551
  prompt = f"""
 
576
 
577
  # Display results
578
  st.subheader("Content Comparison Analysis")
579
+ st.write(f"Query: {query}")
580
+ st.write(f"Top-ranking URL: {top_url}")
581
+ st.write(f"Our URL: {our_url}")
582
+ st.write(f"Top-ranking score: {top_score:.4f}")
583
+ st.write(f"Our score: {our_score:.4f}")
584
+ st.write("Analysis:")
585
  st.write(analysis)
586
  except Exception as e:
587
  st.error(f"Error in GPT-4 analysis: {str(e)}")
 
629
  )
630
  if st.button("Click here to calculate relevancy for selected pages"):
631
  selected_indices = [i for i, selected in enumerate(st.session_state.selected_rows) if selected]
632
+ with st.spinner('Calculating relevancy scores...'):
633
+ for index in selected_indices:
634
+ if pd.isna(df.iloc[index]['relevancy_score']) or df.iloc[index]['relevancy_score'] == 0:
635
+ df.iloc[index, df.columns.get_loc('relevancy_score')] = calculate_single_relevancy(df.iloc[index])
636
+ st.success(f"Calculated relevancy scores for {len(selected_indices)} selected rows.")
637
+ st.experimental_rerun()
 
 
 
 
638
 
639
  # Display column headers
640
  cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
 
647
  cols = st.columns([0.5, 3, 2, 1, 1, 1, 1, 1, 1])
648
 
649
  # Checkbox for row selection
650
+ cols[0].checkbox("", key=f"select_{i}", value=st.session_state.selected_rows[i],
651
+ on_change=lambda idx=i: setattr(st.session_state, 'selected_rows',
652
+ [True if j == idx else x for j, x in enumerate(st.session_state.selected_rows)]))
653
 
654
  # Truncate and make the URL clickable
655
  truncated_url = row.page[:30] + '...' if len(row.page) > 30 else row.page
 
678
  if st.session_state[competitor_state_key]:
679
  st.write(f"Competitor Analysis for: {row.query}")
680
  with st.spinner('Analyzing competitors...'):
681
+ results_df = analyze_competitors(row._asdict(), co, country_code=country_code)
682
 
683
  # Sort the results by Position in ascending order
684
  results_df = results_df.sort_values('Position', ascending=True).reset_index(drop=True)
 
718
  st.warning("Your page's relevancy score is in the lower half of the results. Consider optimizing your content.")
719
  else:
720
  st.error(f"Our page '{row.page}' is not in the results. This indicates an error in fetching or processing the page.")
721
+
722
  if compare_state_key not in st.session_state:
723
  st.session_state[compare_state_key] = False
724
 
725
+ if st.button("Compare Your Relevancy Score to the Page In First Place", key=compare_button_key):
726
  st.session_state[compare_state_key] = True
727
 
728
  if st.session_state[compare_state_key]:
 
732
 
733
  return df # Return the updated dataframe
734
 
 
 
 
735
  def show_date_range_selector():
736
  # logging.info("Showing date range selector")
737
  return st.selectbox(
 
884
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
885
  st.write("Data fetched successfully.")
886
 
887
+ st.session_state.report_data = show_tabular_data(st.session_state.report_data, co, country_code)
 
888
 
889
  download_csv_link(st.session_state.report_data)
890
  elif st.session_state.report_data is not None: