poemsforaphrodite committed
Commit 4cfdb9d • Parent: 66111ac
Update app.py
app.py
CHANGED
@@ -17,6 +17,7 @@ import cohere
 from sklearn.metrics.pairwise import cosine_similarity
 import requests
 from bs4 import BeautifulSoup
+from apify_client import ApifyClient
 
 # Configure logging
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
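
The new import pulls in Apify's Python client, which is not part of the standard library; presumably the Space's requirements.txt gains a matching entry (an assumption — this commit only touches app.py):

# requirements.txt (hypothetical companion change, not shown in this commit)
apify-client
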
@@ -28,6 +29,11 @@ logging.info("Environment variables loaded")
 COHERE_API_KEY = os.environ["COHERE_API_KEY"]
 co = cohere.Client(COHERE_API_KEY)
 logging.info("Cohere client initialized")
+if not APIFY_API_TOKEN:
+    st.error("APIFY_API_TOKEN is not set in the environment variables. Please set it and restart the application.")
+
+# Initialize the ApifyClient with the API token
+client = ApifyClient(APIFY_API_TOKEN)
 
 # Configuration: Set to True if running locally, False if running on Streamlit Cloud
 IS_LOCAL = False
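
The guard above tests APIFY_API_TOKEN, but its assignment sits outside the diff context; it is presumably read from the environment near where COHERE_API_KEY is loaded. A minimal sketch of that assumed line:

# Assumed setup (not shown in the diff): os.environ.get returns None when
# the variable is unset, which is what makes `if not APIFY_API_TOKEN` fire.
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")
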
@@ -83,18 +89,48 @@ def init_session_state():
 # Data Processing Functions
 # -------------
 
+def get_serp_results(query):
+    if not APIFY_API_TOKEN:
+        st.error("Apify API token is not set. Unable to fetch SERP results.")
+        return []
+
+    run_input = {
+        "queries": query,
+        "resultsPerPage": 5,
+        "maxPagesPerQuery": 1,
+        "languageCode": "",
+        "mobileResults": False,
+        "includeUnfilteredResults": False,
+        "saveHtml": False,
+        "saveHtmlToKeyValueStore": False,
+        "includeIcons": False,
+    }
+
+    try:
+        # Run the Actor and wait for it to finish
+        run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
+
+        # Fetch results from the run's dataset
+        results = list(client.dataset(run["defaultDatasetId"]).iterate_items())
+
+        if results and 'organicResults' in results[0]:
+            return [item['url'] for item in results[0]['organicResults']]
+        else:
+            st.warning("No organic results found in the SERP data.")
+            return []
+    except Exception as e:
+        st.error(f"Error fetching SERP results: {str(e)}")
+        return []
+
 def fetch_content(url):
-    logging.debug(f"Fetching content from URL: {url}")
     try:
         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-
-        logging.debug(f"Content fetched successfully from URL: {url}")
-        return content
+        return soup.get_text(separator=' ', strip=True)
     except requests.RequestException as e:
-
-        return
+        st.warning(f"Error fetching content from {url}: {e}")
+        return ""
 
 def generate_embeddings(text_list, model_type):
     logging.debug(f"Generating embeddings for model type: {model_type}")
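
For orientation, the two helpers added in this hunk compose into a search-then-scrape step: get_serp_results returns the organic result URLs for a query, and fetch_content turns each URL into plain page text. A short usage sketch (the query string is invented; assumes the Apify client and token from earlier in the file):

# Hypothetical usage of the new helpers.
urls = get_serp_results("best trail running shoes")  # at most 5 URLs, per "resultsPerPage"
pages = [fetch_content(u) for u in urls]             # "" for any page that failed to load
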
@@ -108,21 +144,39 @@ def generate_embeddings(text_list, model_type):
     logging.debug(f"Embeddings generated successfully for model type: {model_type}")
     return embeddings
 
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
+def calculate_relevance_score(page_content, query, co):
+    page_embedding = co.embed(texts=[page_content], model='embed-english-v3.0', input_type='search_document').embeddings[0]
+    query_embedding = co.embed(texts=[query], model='embed-english-v3.0', input_type='search_query').embeddings[0]
+    return cosine_similarity([query_embedding], [page_embedding])[0][0]
+
+def show_competitor_analysis(row, co):
+    if st.button("Check Competitors", key=f"comp_{row['page']}"):
+        with st.spinner('Analyzing competitors...'):
+            results_df = analyze_competitors(row, co)
+            st.write("Relevancy Score Comparison:")
+            st.dataframe(results_df)
+
+            our_rank = results_df.index[results_df['url'] == row['page']].tolist()[0] + 1
+            st.write(f"Our page ranks {our_rank} out of {len(results_df)} in terms of relevancy score.")
+
 
+def analyze_competitors(row, co):
+    query = row['query']
+    our_url = row['page']
+    our_score = row['relevancy_score']
+
+    competitor_urls = get_serp_results(query)
+
+    results = []
+    for url in competitor_urls:
+        content = fetch_content(url)
+        score = calculate_relevance_score(content, query, co)
+        results.append({'url': url, 'relevancy_score': score})
+
+    results.append({'url': our_url, 'relevancy_score': our_score})
+    results_df = pd.DataFrame(results).sort_values('relevancy_score', ascending=False)
+
+    return results_df
 def process_gsc_data(df):
     logging.info("Processing GSC data")
     df_sorted = df.sort_values(['impressions'], ascending=[False])
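
A note on calculate_relevance_score: scikit-learn's cosine_similarity takes 2-D arrays and returns a matrix, hence the single-element lists and the [0][0] indexing to extract the lone score. The same computation spelled out on sample texts (hypothetical strings; assumes the Cohere client `co` initialized at the top of the file):

# Equivalent standalone computation with made-up texts.
doc = co.embed(texts=["Our guide to trail running shoes"], model='embed-english-v3.0', input_type='search_document').embeddings[0]
qry = co.embed(texts=["best trail running shoes"], model='embed-english-v3.0', input_type='search_query').embeddings[0]
score = cosine_similarity([qry], [doc])[0][0]  # one float; higher means more relevant
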
@@ -466,15 +520,23 @@ def main():
 
     if st.session_state.report_data is not None and not st.session_state.report_data.empty:
         st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
-
+
         if st.button("Calculate Relevancy Scores"):
             st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
-
+
+            for index, row in st.session_state.report_data.iterrows():
+                st.write(f"Query: {row['query']}")
+                st.write(f"Page: {row['page']}")
+                st.write(f"Relevancy Score: {row['relevancy_score']:.4f}")
+                show_competitor_analysis(row, co)
+                st.divider()
+
         download_csv_link(st.session_state.report_data)
     elif st.session_state.report_data is not None:
         st.warning("No data found for the selected criteria.")
         logging.warning("No data found for the selected criteria")
 
+
 if __name__ == "__main__":
     logging.info("Running main function")
     main()
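
One caveat in the new ranking logic: pandas' sort_values keeps the original integer index, so results_df.index[...].tolist()[0] + 1 in show_competitor_analysis returns the pre-sort index label (and the appended own-page row is always last) rather than the sorted position. If a positional rank is intended, a safer variant would be something like (a sketch, not part of this commit):

# Reset the index after sorting so that label + 1 really is the rank.
results_df = results_df.reset_index(drop=True)
our_rank = results_df.index[results_df['url'] == row['page']].tolist()[0] + 1
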