poemsforaphrodite commited on
Commit
4cfdb9d
1 Parent(s): 66111ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -22
app.py CHANGED
@@ -17,6 +17,7 @@ import cohere
17
  from sklearn.metrics.pairwise import cosine_similarity
18
  import requests
19
  from bs4 import BeautifulSoup
 
20
 
21
  # Configure logging
22
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -28,6 +29,11 @@ logging.info("Environment variables loaded")
28
  COHERE_API_KEY = os.environ["COHERE_API_KEY"]
29
  co = cohere.Client(COHERE_API_KEY)
30
  logging.info("Cohere client initialized")
 
 
 
 
 
31
 
32
  # Configuration: Set to True if running locally, False if running on Streamlit Cloud
33
  IS_LOCAL = False
@@ -83,18 +89,48 @@ def init_session_state():
83
  # Data Processing Functions
84
  # -------------
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def fetch_content(url):
87
- logging.debug(f"Fetching content from URL: {url}")
88
  try:
89
  response = requests.get(url)
90
  response.raise_for_status()
91
  soup = BeautifulSoup(response.text, 'html.parser')
92
- content = soup.get_text(separator=' ', strip=True)
93
- logging.debug(f"Content fetched successfully from URL: {url}")
94
- return content
95
  except requests.RequestException as e:
96
- logging.error(f"Error fetching content from URL: {url} - {e}")
97
- return str(e)
98
 
99
  def generate_embeddings(text_list, model_type):
100
  logging.debug(f"Generating embeddings for model type: {model_type}")
@@ -108,21 +144,39 @@ def generate_embeddings(text_list, model_type):
108
  logging.debug(f"Embeddings generated successfully for model type: {model_type}")
109
  return embeddings
110
 
111
- def calculate_relevancy_scores(df, model_type):
112
- logging.info("Calculating relevancy scores")
113
- try:
114
- page_contents = [fetch_content(url) for url in df['page']]
115
- page_embeddings = generate_embeddings(page_contents, model_type)
116
- query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
117
- relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
118
- df = df.assign(relevancy_score=relevancy_scores)
119
- logging.info("Relevancy scores calculated successfully")
120
- except Exception as e:
121
- logging.error(f"Error calculating relevancy scores: {e}")
122
- st.warning(f"Error calculating relevancy scores: {e}")
123
- df = df.assign(relevancy_score=0)
124
- return df
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def process_gsc_data(df):
127
  logging.info("Processing GSC data")
128
  df_sorted = df.sort_values(['impressions'], ascending=[False])
@@ -466,15 +520,23 @@ def main():
466
 
467
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
468
  st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
469
-
470
  if st.button("Calculate Relevancy Scores"):
471
  st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
472
- show_paginated_dataframe(st.session_state.report_data)
 
 
 
 
 
 
 
473
  download_csv_link(st.session_state.report_data)
474
  elif st.session_state.report_data is not None:
475
  st.warning("No data found for the selected criteria.")
476
  logging.warning("No data found for the selected criteria")
477
 
 
478
  if __name__ == "__main__":
479
  logging.info("Running main function")
480
  main()
 
17
  from sklearn.metrics.pairwise import cosine_similarity
18
  import requests
19
  from bs4 import BeautifulSoup
20
+ from apify_client import ApifyClient
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
29
  COHERE_API_KEY = os.environ["COHERE_API_KEY"]
30
  co = cohere.Client(COHERE_API_KEY)
31
  logging.info("Cohere client initialized")
32
+ if not APIFY_API_TOKEN:
33
+ st.error("APIFY_API_TOKEN is not set in the environment variables. Please set it and restart the application.")
34
+
35
+ # Initialize the ApifyClient with the API token
36
+ client = ApifyClient(APIFY_API_TOKEN)
37
 
38
  # Configuration: Set to True if running locally, False if running on Streamlit Cloud
39
  IS_LOCAL = False
 
89
  # Data Processing Functions
90
  # -------------
91
 
92
def get_serp_results(query):
    """Fetch top organic Google SERP URLs for *query* via the Apify actor.

    Returns a list of result URLs (first page, up to 5 results), or an
    empty list when the Apify token is missing, the actor run fails, or
    the dataset contains no organic results.
    """
    if not APIFY_API_TOKEN:
        st.error("Apify API token is not set. Unable to fetch SERP results.")
        return []

    # Actor input: a single query, first page only, desktop results,
    # with all HTML/icon capture switched off to keep the run cheap.
    run_input = {
        "queries": query,
        "resultsPerPage": 5,
        "maxPagesPerQuery": 1,
        "languageCode": "",
        "mobileResults": False,
        "includeUnfilteredResults": False,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeIcons": False,
    }

    try:
        # Run the actor synchronously, then pull every item from the
        # dataset the finished run produced.
        run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
        items = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if not items or 'organicResults' not in items[0]:
            st.warning("No organic results found in the SERP data.")
            return []
        return [entry['url'] for entry in items[0]['organicResults']]
    except Exception as e:
        st.error(f"Error fetching SERP results: {str(e)}")
        return []
124
+
125
def fetch_content(url):
    """Download *url* and return its visible text content.

    Strips markup with BeautifulSoup and collapses the page into
    whitespace-separated text. On any request failure a Streamlit
    warning is shown and an empty string is returned, so callers can
    keep scoring the remaining pages.
    """
    try:
        # BUG FIX: the original call had no timeout, so a single slow or
        # unresponsive host could hang the whole Streamlit run.
        # requests.Timeout is a RequestException subclass, so the
        # existing handler below still catches it.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        st.warning(f"Error fetching content from {url}: {e}")
        return ""
134
 
135
  def generate_embeddings(text_list, model_type):
136
  logging.debug(f"Generating embeddings for model type: {model_type}")
 
144
  logging.debug(f"Embeddings generated successfully for model type: {model_type}")
145
  return embeddings
146
 
147
def calculate_relevance_score(page_content, query, co):
    """Cosine similarity between a page's content and a search query.

    Embeds each side separately with Cohere's embed-english-v3.0 model,
    using the input_type that matches its role ('search_document' for
    the page, 'search_query' for the query), and returns their cosine
    similarity as a single float.
    """
    doc_vec = co.embed(
        texts=[page_content],
        model='embed-english-v3.0',
        input_type='search_document',
    ).embeddings[0]
    query_vec = co.embed(
        texts=[query],
        model='embed-english-v3.0',
        input_type='search_query',
    ).embeddings[0]
    similarity_matrix = cosine_similarity([query_vec], [doc_vec])
    return similarity_matrix[0][0]
151
+
152
def show_competitor_analysis(row, co):
    """Render a 'Check Competitors' button for *row* and, on click, show
    how our page's relevancy score ranks against the top SERP results.

    *row* must carry 'query', 'page' and 'relevancy_score'; *co* is the
    Cohere client used to embed competitor pages.
    """
    if st.button("Check Competitors", key=f"comp_{row['page']}"):
        with st.spinner('Analyzing competitors...'):
            results_df = analyze_competitors(row, co)
        st.write("Relevancy Score Comparison:")
        st.dataframe(results_df)

        # BUG FIX: the previous code derived the rank from the
        # DataFrame's index labels, which survive sort_values() and
        # therefore reflect the PRE-sort row order — since our page is
        # appended last by analyze_competitors, the reported rank was
        # always len(results_df). Use the positional row number in the
        # sorted frame instead.
        sorted_urls = results_df['url'].tolist()
        our_rank = sorted_urls.index(row['page']) + 1
        st.write(f"Our page ranks {our_rank} out of {len(results_df)} in terms of relevancy score.")
161
+
162
 
163
def analyze_competitors(row, co):
    """Score the top SERP competitors for *row*'s query against that query.

    Fetches competitor URLs via Apify, scores each page's fetched
    content with Cohere embeddings, appends our own page with its
    precomputed 'relevancy_score', and returns a DataFrame with columns
    'url' and 'relevancy_score' sorted from most to least relevant.
    """
    query = row['query']
    our_url = row['page']
    our_score = row['relevancy_score']

    competitor_urls = get_serp_results(query)

    results = []
    for url in competitor_urls:
        # BUG FIX: skip our own page when it appears in the SERP — it is
        # appended below with its already-computed score, and the old
        # code produced a duplicate row with a freshly computed score.
        if url == our_url:
            continue
        content = fetch_content(url)
        score = calculate_relevance_score(content, query, co)
        results.append({'url': url, 'relevancy_score': score})

    results.append({'url': our_url, 'relevancy_score': our_score})
    # BUG FIX: reset the index so row position matches rank; sort_values
    # alone keeps stale pre-sort labels, which misled rank computations
    # downstream.
    results_df = (
        pd.DataFrame(results)
        .sort_values('relevancy_score', ascending=False)
        .reset_index(drop=True)
    )
    return results_df
180
  def process_gsc_data(df):
181
  logging.info("Processing GSC data")
182
  df_sorted = df.sort_values(['impressions'], ascending=[False])
 
520
 
521
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
522
  st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
523
+
524
  if st.button("Calculate Relevancy Scores"):
525
  st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
526
+
527
+ for index, row in st.session_state.report_data.iterrows():
528
+ st.write(f"Query: {row['query']}")
529
+ st.write(f"Page: {row['page']}")
530
+ st.write(f"Relevancy Score: {row['relevancy_score']:.4f}")
531
+ show_competitor_analysis(row, co)
532
+ st.divider()
533
+
534
  download_csv_link(st.session_state.report_data)
535
  elif st.session_state.report_data is not None:
536
  st.warning("No data found for the selected criteria.")
537
  logging.warning("No data found for the selected criteria")
538
 
539
+
540
  if __name__ == "__main__":
541
  logging.info("Running main function")
542
  main()