poemsforaphrodite commited on
Commit
4cfdb9d
1 Parent(s): 66111ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -22
app.py CHANGED
@@ -17,6 +17,7 @@ import cohere
17
  from sklearn.metrics.pairwise import cosine_similarity
18
  import requests
19
  from bs4 import BeautifulSoup
 
20
 
21
  # Configure logging
22
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -28,6 +29,11 @@ logging.info("Environment variables loaded")
28
  COHERE_API_KEY = os.environ["COHERE_API_KEY"]
29
  co = cohere.Client(COHERE_API_KEY)
30
  logging.info("Cohere client initialized")
 
 
 
 
 
31
 
32
  # Configuration: Set to True if running locally, False if running on Streamlit Cloud
33
  IS_LOCAL = False
@@ -83,18 +89,48 @@ def init_session_state():
83
  # Data Processing Functions
84
  # -------------
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def fetch_content(url):
87
- logging.debug(f"Fetching content from URL: {url}")
88
  try:
89
  response = requests.get(url)
90
  response.raise_for_status()
91
  soup = BeautifulSoup(response.text, 'html.parser')
92
- content = soup.get_text(separator=' ', strip=True)
93
- logging.debug(f"Content fetched successfully from URL: {url}")
94
- return content
95
  except requests.RequestException as e:
96
- logging.error(f"Error fetching content from URL: {url} - {e}")
97
- return str(e)
98
 
99
  def generate_embeddings(text_list, model_type):
100
  logging.debug(f"Generating embeddings for model type: {model_type}")
@@ -108,21 +144,39 @@ def generate_embeddings(text_list, model_type):
108
  logging.debug(f"Embeddings generated successfully for model type: {model_type}")
109
  return embeddings
110
 
111
- def calculate_relevancy_scores(df, model_type):
112
- logging.info("Calculating relevancy scores")
113
- try:
114
- page_contents = [fetch_content(url) for url in df['page']]
115
- page_embeddings = generate_embeddings(page_contents, model_type)
116
- query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
117
- relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
118
- df = df.assign(relevancy_score=relevancy_scores)
119
- logging.info("Relevancy scores calculated successfully")
120
- except Exception as e:
121
- logging.error(f"Error calculating relevancy scores: {e}")
122
- st.warning(f"Error calculating relevancy scores: {e}")
123
- df = df.assign(relevancy_score=0)
124
- return df
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def process_gsc_data(df):
127
  logging.info("Processing GSC data")
128
  df_sorted = df.sort_values(['impressions'], ascending=[False])
@@ -466,15 +520,23 @@ def main():
466
 
467
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
468
  st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
469
-
470
  if st.button("Calculate Relevancy Scores"):
471
  st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
472
- show_paginated_dataframe(st.session_state.report_data)
 
 
 
 
 
 
 
473
  download_csv_link(st.session_state.report_data)
474
  elif st.session_state.report_data is not None:
475
  st.warning("No data found for the selected criteria.")
476
  logging.warning("No data found for the selected criteria")
477
 
 
478
  if __name__ == "__main__":
479
  logging.info("Running main function")
480
  main()
 
17
  from sklearn.metrics.pairwise import cosine_similarity
18
  import requests
19
  from bs4 import BeautifulSoup
20
+ from apify_client import ApifyClient
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
29
  COHERE_API_KEY = os.environ["COHERE_API_KEY"]
30
  co = cohere.Client(COHERE_API_KEY)
31
  logging.info("Cohere client initialized")
32
+ if not APIFY_API_TOKEN:
33
+ st.error("APIFY_API_TOKEN is not set in the environment variables. Please set it and restart the application.")
34
+
35
+ # Initialize the ApifyClient with the API token
36
+ client = ApifyClient(APIFY_API_TOKEN)
37
 
38
  # Configuration: Set to True if running locally, False if running on Streamlit Cloud
39
  IS_LOCAL = False
 
89
  # Data Processing Functions
90
  # -------------
91
 
92
def get_serp_results(query):
    """Fetch top organic Google SERP URLs for *query* via the Apify actor.

    Returns a list of result URLs (first page, up to 5 results), or an
    empty list when the Apify token is missing, the actor run fails, or
    the dataset contains no organic results.
    """
    if not APIFY_API_TOKEN:
        st.error("Apify API token is not set. Unable to fetch SERP results.")
        return []

    # Actor input: a single query, first page only, desktop results,
    # with all HTML/icon capture switched off to keep the run cheap.
    run_input = {
        "queries": query,
        "resultsPerPage": 5,
        "maxPagesPerQuery": 1,
        "languageCode": "",
        "mobileResults": False,
        "includeUnfilteredResults": False,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeIcons": False,
    }

    try:
        # Run the actor synchronously, then pull every item from the
        # dataset the finished run produced.
        run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
        items = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if not items or 'organicResults' not in items[0]:
            st.warning("No organic results found in the SERP data.")
            return []
        return [entry['url'] for entry in items[0]['organicResults']]
    except Exception as e:
        st.error(f"Error fetching SERP results: {str(e)}")
        return []
124
+
125
def fetch_content(url):
    """Download *url* and return its visible text content.

    Strips markup with BeautifulSoup and collapses the page into
    whitespace-separated text. On any request failure a Streamlit
    warning is shown and an empty string is returned, so callers can
    keep scoring the remaining pages.
    """
    try:
        # BUG FIX: the original call had no timeout, so a single slow or
        # unresponsive host could hang the whole Streamlit run.
        # requests.Timeout is a RequestException subclass, so the
        # existing handler below still catches it.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        st.warning(f"Error fetching content from {url}: {e}")
        return ""
134
 
135
  def generate_embeddings(text_list, model_type):
136
  logging.debug(f"Generating embeddings for model type: {model_type}")
 
144
  logging.debug(f"Embeddings generated successfully for model type: {model_type}")
145
  return embeddings
146
 
147
def calculate_relevance_score(page_content, query, co):
    """Cosine similarity between a page's content and a search query.

    Embeds each side separately with Cohere's embed-english-v3.0 model,
    using the input_type that matches its role ('search_document' for
    the page, 'search_query' for the query), and returns their cosine
    similarity as a single float.
    """
    doc_vec = co.embed(
        texts=[page_content],
        model='embed-english-v3.0',
        input_type='search_document',
    ).embeddings[0]
    query_vec = co.embed(
        texts=[query],
        model='embed-english-v3.0',
        input_type='search_query',
    ).embeddings[0]
    similarity_matrix = cosine_similarity([query_vec], [doc_vec])
    return similarity_matrix[0][0]
151
+
152
def show_competitor_analysis(row, co):
    """Render a 'Check Competitors' button for *row* and, on click, show
    how our page's relevancy score ranks against the top SERP results.

    *row* must carry 'query', 'page' and 'relevancy_score'; *co* is the
    Cohere client used to embed competitor pages.
    """
    if st.button("Check Competitors", key=f"comp_{row['page']}"):
        with st.spinner('Analyzing competitors...'):
            results_df = analyze_competitors(row, co)
        st.write("Relevancy Score Comparison:")
        st.dataframe(results_df)

        # BUG FIX: the previous code derived the rank from the
        # DataFrame's index labels, which survive sort_values() and
        # therefore reflect the PRE-sort row order — since our page is
        # appended last by analyze_competitors, the reported rank was
        # always len(results_df). Use the positional row number in the
        # sorted frame instead.
        sorted_urls = results_df['url'].tolist()
        our_rank = sorted_urls.index(row['page']) + 1
        st.write(f"Our page ranks {our_rank} out of {len(results_df)} in terms of relevancy score.")
161
+
162
 
163
def analyze_competitors(row, co):
    """Score the top SERP competitors for *row*'s query against that query.

    Fetches competitor URLs via Apify, scores each page's fetched
    content with Cohere embeddings, appends our own page with its
    precomputed 'relevancy_score', and returns a DataFrame with columns
    'url' and 'relevancy_score' sorted from most to least relevant.
    """
    query = row['query']
    our_url = row['page']
    our_score = row['relevancy_score']

    competitor_urls = get_serp_results(query)

    results = []
    for url in competitor_urls:
        # BUG FIX: skip our own page when it appears in the SERP — it is
        # appended below with its already-computed score, and the old
        # code produced a duplicate row with a freshly computed score.
        if url == our_url:
            continue
        content = fetch_content(url)
        score = calculate_relevance_score(content, query, co)
        results.append({'url': url, 'relevancy_score': score})

    results.append({'url': our_url, 'relevancy_score': our_score})
    # BUG FIX: reset the index so row position matches rank; sort_values
    # alone keeps stale pre-sort labels, which misled rank computations
    # downstream.
    results_df = (
        pd.DataFrame(results)
        .sort_values('relevancy_score', ascending=False)
        .reset_index(drop=True)
    )
    return results_df
180
  def process_gsc_data(df):
181
  logging.info("Processing GSC data")
182
  df_sorted = df.sort_values(['impressions'], ascending=[False])
 
520
 
521
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
522
  st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
523
+
524
  if st.button("Calculate Relevancy Scores"):
525
  st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
526
+
527
+ for index, row in st.session_state.report_data.iterrows():
528
+ st.write(f"Query: {row['query']}")
529
+ st.write(f"Page: {row['page']}")
530
+ st.write(f"Relevancy Score: {row['relevancy_score']:.4f}")
531
+ show_competitor_analysis(row, co)
532
+ st.divider()
533
+
534
  download_csv_link(st.session_state.report_data)
535
  elif st.session_state.report_data is not None:
536
  st.warning("No data found for the selected criteria.")
537
  logging.warning("No data found for the selected criteria")
538
 
539
+
540
  if __name__ == "__main__":
541
  logging.info("Running main function")
542
  main()