Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -24,6 +24,7 @@ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %
|
|
24 |
|
25 |
load_dotenv()
|
26 |
logging.info("Environment variables loaded")
|
|
|
27 |
|
28 |
# Load API credentials from the environment, then initialize the Cohere client
|
29 |
APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
|
@@ -31,10 +32,13 @@ COHERE_API_KEY = os.environ["COHERE_API_KEY"]
|
|
31 |
co = cohere.Client(COHERE_API_KEY)
|
32 |
logging.info("Cohere client initialized")
|
33 |
if not APIFY_API_TOKEN:
|
|
|
34 |
st.error("APIFY_API_TOKEN is not set in the environment variables. Please set it and restart the application.")
|
35 |
|
36 |
# Initialize the ApifyClient with the API token
|
37 |
client = ApifyClient(APIFY_API_TOKEN)
|
|
|
|
|
38 |
|
39 |
# Configuration: Set to True if running locally, False if running on Streamlit Cloud
|
40 |
IS_LOCAL = False
|
@@ -91,7 +95,9 @@ def init_session_state():
|
|
91 |
# -------------
|
92 |
|
93 |
def get_serp_results(query):
|
|
|
94 |
if not APIFY_API_TOKEN:
|
|
|
95 |
st.error("Apify API token is not set. Unable to fetch SERP results.")
|
96 |
return []
|
97 |
|
@@ -108,56 +114,89 @@ def get_serp_results(query):
|
|
108 |
}
|
109 |
|
110 |
try:
|
|
|
111 |
# Run the Actor and wait for it to finish
|
112 |
run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
|
|
|
113 |
|
114 |
# Fetch results from the run's dataset
|
|
|
115 |
results = list(client.dataset(run["defaultDatasetId"]).iterate_items())
|
|
|
116 |
|
117 |
if results and 'organicResults' in results[0]:
|
118 |
-
|
|
|
|
|
119 |
else:
|
|
|
120 |
st.warning("No organic results found in the SERP data.")
|
121 |
return []
|
122 |
except Exception as e:
|
|
|
123 |
st.error(f"Error fetching SERP results: {str(e)}")
|
124 |
return []
|
125 |
|
|
|
|
|
|
|
126 |
def fetch_content(url):
|
|
|
127 |
try:
|
128 |
response = requests.get(url)
|
129 |
response.raise_for_status()
|
130 |
soup = BeautifulSoup(response.text, 'html.parser')
|
131 |
-
|
|
|
|
|
132 |
except requests.RequestException as e:
|
|
|
133 |
st.warning(f"Error fetching content from {url}: {e}")
|
134 |
return ""
|
135 |
|
136 |
-
def generate_embeddings(text_list, model_type):
    """Embed a list of texts with the Cohere model selected by ``model_type``.

    Args:
        text_list: Texts to embed. An empty/falsy value short-circuits.
        model_type: ``'english'`` selects the English v3 model; any other
            value selects the multilingual v3 model.

    Returns:
        The ``embeddings`` attribute of the Cohere response, or ``[]`` when
        there is nothing to embed.
    """
    logging.debug(f"Generating embeddings for model type: {model_type}")

    # Nothing to send to the API: bail out before building the request.
    if not text_list:
        logging.warning("Text list is empty, returning empty embeddings")
        return []

    if model_type == 'english':
        model_name = 'embed-english-v3.0'
    else:
        model_name = 'embed-multilingual-v3.0'

    # Texts are embedded as documents (as opposed to search queries).
    vectors = co.embed(
        model=model_name,
        texts=text_list,
        input_type='search_document',
    ).embeddings

    logging.debug(f"Embeddings generated successfully for model type: {model_type}")
    return vectors
|
147 |
-
|
148 |
def calculate_relevance_score(page_content, query, co):
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
def show_competitor_analysis(row, co):
    """Render a "Check Competitors" button and, when clicked, the comparison.

    Args:
        row: Mapping/Series with at least ``'page'`` and ``'query'``.
        co: Cohere client, passed through to ``analyze_competitors``.

    Returns:
        None. Output is written to the Streamlit page.
    """
    # The key includes the query as well as the page: the same page can be
    # listed for several queries, and Streamlit rejects duplicate widget keys.
    button_key = f"comp_{row['page']}_{row['query']}"
    if not st.button("Check Competitors", key=button_key):
        return

    with st.spinner('Analyzing competitors...'):
        results_df = analyze_competitors(row, co)
        st.write("Relevancy Score Comparison:")
        st.dataframe(results_df)

        # Rank must come from the row's *position*, not its index label:
        # a frame that was sorted without resetting its index still carries
        # the pre-sort labels. NOTE(review): assumes analyze_competitors
        # returns rows ordered best-first - confirm against its definition.
        ranked = results_df.reset_index(drop=True)
        positions = ranked.index[ranked['url'] == row['page']]
        if len(positions) == 0:
            st.warning("Our page was not found in the competitor comparison.")
            return

        our_rank = int(positions[0]) + 1
        st.write(f"Our page ranks {our_rank} out of {len(results_df)} in terms of relevancy score.")
|
162 |
|
163 |
|
@@ -523,9 +562,11 @@ def main():
|
|
523 |
st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
|
524 |
|
525 |
if st.button("Calculate Relevancy Scores"):
|
|
|
526 |
st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
|
527 |
|
528 |
for index, row in st.session_state.report_data.iterrows():
|
|
|
529 |
st.write(f"Query: {row['query']}")
|
530 |
st.write(f"Page: {row['page']}")
|
531 |
st.write(f"Relevancy Score: {row['relevancy_score']:.4f}")
|
@@ -534,10 +575,11 @@ def main():
|
|
534 |
|
535 |
download_csv_link(st.session_state.report_data)
|
536 |
elif st.session_state.report_data is not None:
|
|
|
537 |
st.warning("No data found for the selected criteria.")
|
538 |
-
logging.warning("No data found for the selected criteria")
|
539 |
|
540 |
|
541 |
if __name__ == "__main__":
    # Entry point when the file is executed as a script: record the start
    # in the log, then hand control to main().
    logging.info("Running main function")
    main()
|
|
|
|
24 |
|
25 |
load_dotenv()
|
26 |
logging.info("Environment variables loaded")
|
27 |
+
logger = logging.getLogger(__name__)
|
28 |
|
29 |
# Load API credentials from the environment, then initialize the Cohere client
|
30 |
APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
|
|
|
32 |
co = cohere.Client(COHERE_API_KEY)
|
33 |
logging.info("Cohere client initialized")
|
34 |
if not APIFY_API_TOKEN:
|
35 |
+
logger.error("APIFY_API_TOKEN is not set in the environment variables.")
|
36 |
st.error("APIFY_API_TOKEN is not set in the environment variables. Please set it and restart the application.")
|
37 |
|
38 |
# Initialize the ApifyClient with the API token
|
39 |
client = ApifyClient(APIFY_API_TOKEN)
|
40 |
+
# Record that the ApifyClient has been created
|
41 |
+
logger.info("ApifyClient initialized")
|
42 |
|
43 |
# Configuration: Set to True if running locally, False if running on Streamlit Cloud
|
44 |
IS_LOCAL = False
|
|
|
95 |
# -------------
|
96 |
|
97 |
def get_serp_results(query):
|
98 |
+
logger.info(f"Getting SERP results for query: {query}")
|
99 |
if not APIFY_API_TOKEN:
|
100 |
+
logger.error("Apify API token is not set. Unable to fetch SERP results.")
|
101 |
st.error("Apify API token is not set. Unable to fetch SERP results.")
|
102 |
return []
|
103 |
|
|
|
114 |
}
|
115 |
|
116 |
try:
|
117 |
+
logger.debug(f"Calling Apify Actor with input: {run_input}")
|
118 |
# Run the Actor and wait for it to finish
|
119 |
run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
|
120 |
+
logger.info(f"Apify Actor run completed. Run ID: {run.get('id')}")
|
121 |
|
122 |
# Fetch results from the run's dataset
|
123 |
+
logger.debug(f"Fetching results from dataset ID: {run.get('defaultDatasetId')}")
|
124 |
results = list(client.dataset(run["defaultDatasetId"]).iterate_items())
|
125 |
+
logger.info(f"Fetched {len(results)} results from Apify dataset")
|
126 |
|
127 |
if results and 'organicResults' in results[0]:
|
128 |
+
urls = [item['url'] for item in results[0]['organicResults']]
|
129 |
+
logger.info(f"Extracted {len(urls)} URLs from organic results")
|
130 |
+
return urls
|
131 |
else:
|
132 |
+
logger.warning("No organic results found in the SERP data.")
|
133 |
st.warning("No organic results found in the SERP data.")
|
134 |
return []
|
135 |
except Exception as e:
|
136 |
+
logger.exception(f"Error fetching SERP results: {str(e)}")
|
137 |
st.error(f"Error fetching SERP results: {str(e)}")
|
138 |
return []
|
139 |
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
def fetch_content(url, timeout=10):
    """Download a page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may wait indefinitely on an unresponsive
            host, which would stall the whole competitor analysis.

    Returns:
        The page text with markup removed and fragments joined by single
        spaces, or ``""`` when the request fails (connection error, timeout,
        or a non-2xx status).
    """
    logger.info(f"Fetching content from URL: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: one bad competitor URL must not abort the analysis.
        logger.error(f"Error fetching content from {url}: {e}")
        st.warning(f"Error fetching content from {url}: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.get_text(separator=' ', strip=True)
    logger.debug(f"Fetched {len(content)} characters from {url}")
    return content
|
156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
def calculate_relevance_score(page_content, query, co):
    """Score how relevant a page is to a query via embedding similarity.

    The page is embedded as a ``search_document`` and the query as a
    ``search_query`` with the English v3 model; the score is the cosine
    similarity of the two vectors.

    Args:
        page_content: Text of the page to score.
        query: Search query to compare against.
        co: Cohere client exposing ``embed``.

    Returns:
        The similarity as a plain ``float``. ``0.0`` is returned when there
        is no content to embed or when embedding/scoring fails.
    """
    logger.info(f"Calculating relevance score for query: {query}")

    # fetch_content() returns "" when a download fails; skip the API call
    # instead of sending an empty document and reporting a spurious error.
    if not page_content or not page_content.strip():
        logger.warning("No page content to embed; relevance score defaults to 0")
        return 0.0

    try:
        page_embedding = co.embed(
            texts=[page_content],
            model='embed-english-v3.0',
            input_type='search_document',
        ).embeddings[0]
        query_embedding = co.embed(
            texts=[query],
            model='embed-english-v3.0',
            input_type='search_query',
        ).embeddings[0]
        # Convert to a built-in float so every return path has the same type.
        score = float(cosine_similarity([query_embedding], [page_embedding])[0][0])
        logger.debug(f"Relevance score calculated: {score}")
        return score
    except Exception as e:
        # Boundary with an external service: log the traceback, surface the
        # problem in the UI, and fall back to a neutral score.
        logger.exception(f"Error calculating relevance score: {str(e)}")
        st.error(f"Error calculating relevance score: {str(e)}")
        return 0.0
|
169 |
+
|
170 |
+
def analyze_competitors(row, co):
    """Compare our page's relevancy score with the pages ranking for its query.

    Args:
        row: Mapping/Series with ``'query'``, ``'page'`` and
            ``'relevancy_score'`` for our page.
        co: Cohere client, passed to ``calculate_relevance_score``.

    Returns:
        A DataFrame with columns ``url`` and ``relevancy_score``, sorted from
        highest to lowest score, with a fresh 0-based index so that a row's
        index equals its rank minus one.
    """
    query = row['query']
    our_url = row['page']
    our_score = row['relevancy_score']
    logger.info(f"Analyzing competitors for query: {query}")

    competitor_urls = get_serp_results(query)

    results = []
    for url in competitor_urls:
        if url == our_url:
            # Our own page is appended below with its stored score; scoring
            # it again here would list it twice in the comparison.
            continue
        content = fetch_content(url)
        score = calculate_relevance_score(content, query, co)
        results.append({'url': url, 'relevancy_score': score})

    results.append({'url': our_url, 'relevancy_score': our_score})

    # reset_index is required: sort_values keeps the pre-sort labels, and the
    # caller derives our rank from the index of the matching row.
    results_df = (
        pd.DataFrame(results)
        .sort_values('relevancy_score', ascending=False)
        .reset_index(drop=True)
    )

    logger.info(f"Competitor analysis completed. {len(results)} results obtained.")
    return results_df
|
189 |
|
190 |
def show_competitor_analysis(row, co):
    """Render a "Check Competitors" button and, when clicked, the comparison.

    Args:
        row: Mapping/Series with at least ``'page'`` and ``'query'``.
        co: Cohere client, passed through to ``analyze_competitors``.

    Returns:
        None. Output is written to the Streamlit page and the log.
    """
    # The key includes the query as well as the page: the same page can be
    # listed for several queries, and Streamlit rejects duplicate widget keys.
    button_key = f"comp_{row['page']}_{row['query']}"
    if not st.button("Check Competitors", key=button_key):
        return

    logger.info(f"Competitor analysis requested for page: {row['page']}")
    with st.spinner('Analyzing competitors...'):
        results_df = analyze_competitors(row, co)
        st.write("Relevancy Score Comparison:")
        st.dataframe(results_df)

        # Rank must come from the row's *position* in the sorted frame, not
        # its index label: sort_values() keeps the pre-sort labels, so the
        # label of our row reflects insertion order rather than rank.
        ranked = results_df.reset_index(drop=True)
        positions = ranked.index[ranked['url'] == row['page']]
        if len(positions) == 0:
            logger.warning(f"Page {row['page']} missing from competitor results")
            st.warning("Our page was not found in the competitor comparison.")
            return

        our_rank = int(positions[0]) + 1
        logger.info(f"Our page ranks {our_rank} out of {len(results_df)} in terms of relevancy score.")
        st.write(f"Our page ranks {our_rank} out of {len(results_df)} in terms of relevancy score.")
|
201 |
|
202 |
|
|
|
562 |
st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
|
563 |
|
564 |
if st.button("Calculate Relevancy Scores"):
|
565 |
+
logger.info("Calculating relevancy scores for all rows")
|
566 |
st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
|
567 |
|
568 |
for index, row in st.session_state.report_data.iterrows():
|
569 |
+
logger.debug(f"Processing row {index}: Query: {row['query']}, Page: {row['page']}")
|
570 |
st.write(f"Query: {row['query']}")
|
571 |
st.write(f"Page: {row['page']}")
|
572 |
st.write(f"Relevancy Score: {row['relevancy_score']:.4f}")
|
|
|
575 |
|
576 |
download_csv_link(st.session_state.report_data)
|
577 |
elif st.session_state.report_data is not None:
|
578 |
+
logger.warning("No data found for the selected criteria.")
|
579 |
st.warning("No data found for the selected criteria.")
|
|
|
580 |
|
581 |
|
582 |
if __name__ == "__main__":
    # Entry point when the file is executed as a script. Both messages go
    # through the module-level logger, matching the rest of the file,
    # instead of mixing it with direct calls on the root logging module.
    logger.info("Running main function")
    main()
    logger.info("Script completed")
|