poemsforaphrodite committed
Commit 2913f49 • 1 Parent(s): 858a793
Update app.py

app.py CHANGED
@@ -16,6 +16,9 @@ import requests
 from bs4 import BeautifulSoup
 from apify_client import ApifyClient
 import urllib.parse
+import openai
+from openai import OpenAI
+import re
 
 
 load_dotenv()
@@ -32,6 +35,12 @@ if not APIFY_API_TOKEN:
 client = ApifyClient(APIFY_API_TOKEN)
 # Initialize the ApifyClient with the API token
 
+# Initialize OpenAI client
+OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
+if not OPENAI_API_KEY:
+    st.error("OPENAI_API_KEY is not set in the environment variables. Please set it and restart the application.")
+client = OpenAI(api_key=OPENAI_API_KEY)
+
 # Configuration: Set to True if running locally, False if running on Streamlit Cloud
 IS_LOCAL = False
 
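Note: the last added line rebinds client, which was assigned ApifyClient(APIFY_API_TOKEN) just above it, so any later Apify call made through client would actually hit the OpenAI client. A minimal sketch of keeping the two handles distinct (the variable names here are hypothetical, not from this commit):

    import os
    from apify_client import ApifyClient
    from openai import OpenAI

    # Separate names so neither client shadows the other
    apify_client = ApifyClient(os.environ['APIFY_API_TOKEN'])
    openai_client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])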
@@ -128,9 +137,12 @@ def get_serp_results(query):
     # logger.info(f"Fetched {len(results)} results from Apify dataset")
 
     if results and 'organicResults' in results[0]:
-
-
-
+        serp_data = []
+        for item in results[0]['organicResults'][:5]:  # Limit to top 5 results
+            url = item['url']
+            content = fetch_content(url, query)
+            serp_data.append({'url': url, 'content': content})
+        return serp_data
     else:
         # logger.warning("No organic results found in the SERP data.")
         st.warning("No organic results found in the SERP data.")
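With this hunk, get_serp_results returns a list of dicts rather than bare URLs. A minimal sketch of the new return shape and how a caller iterates it (the values below are made up for illustration):

    # Example of the shape returned by get_serp_results (fabricated data)
    serp_data = [
        {'url': 'https://example.com/a', 'content': 'relevant text extracted by GPT'},
        {'url': 'https://example.com/b', 'content': 'relevant text extracted by GPT'},
    ]
    for entry in serp_data:
        print(entry['url'], len(entry['content']))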
@@ -143,14 +155,47 @@ def get_serp_results(query):
 
 
 
-def fetch_content(url):
+def extract_relevant_content(full_content, query):
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that extracts the most relevant content from web pages."},
+                {"role": "user", "content": f"Given the following web page content and search query, extract only the most relevant parts of the content that answer or relate to the query. If there's no relevant content, say 'No relevant content found.'\n\nQuery: {query}\n\nContent: {full_content[:4000]}"}  # Limit input to 4000 characters
+            ],
+            max_tokens=3000  # Adjust as needed
+        )
+        return response.choices[0].message.content.strip()
+    except Exception as e:
+        st.error(f"Error in GPT content extraction: {str(e)}")
+        return "Error in content extraction"
+
+def fetch_content(url, query):
     try:
         decoded_url = urllib.parse.unquote(url)
         response = requests.get(decoded_url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-
-
+
+        # Remove unwanted elements
+        for unwanted in soup(['nav', 'header', 'footer', 'sidebar', 'menu', 'aside']):
+            unwanted.decompose()
+
+        # Try to find the main content
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content|main|body'))
+
+        if main_content:
+            content = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Fallback to body if no main content is found
+            content = soup.body.get_text(separator=' ', strip=True)
+
+        # Clean up the content
+        content = re.sub(r'\s+', ' ', content)  # Replace multiple spaces with single space
+
+        # Use GPT to extract relevant content
+        relevant_content = extract_relevant_content(content, query)
+        return relevant_content
     except requests.RequestException:
         return ""
 
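For reference, the strip-and-select heuristic in fetch_content can be checked in isolation; a minimal sketch with sample markup (the HTML string is invented):

    import re
    from bs4 import BeautifulSoup

    # Same heuristic as the hunk above: drop chrome, prefer <main>/<article>, collapse whitespace
    html = "<html><body><nav>menu</nav><main><p>Useful   text</p></main><footer>legal</footer></body></html>"
    soup = BeautifulSoup(html, 'html.parser')
    for unwanted in soup(['nav', 'header', 'footer', 'aside']):
        unwanted.decompose()
    main = soup.find('main') or soup.find('article') or soup.body
    text = re.sub(r'\s+', ' ', main.get_text(separator=' ', strip=True))
    print(text)  # -> Useful text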
@@ -176,19 +221,21 @@ def analyze_competitors(row, co, custom_url=None):
     query = row['query']
     our_url = row['page']
 
-
-    competitor_urls = [url for url in competitor_urls if not url.startswith('/search')][:5]  # Get top 5 valid competitors
+    competitor_data = get_serp_results(query)
 
-    if custom_url and custom_url not in competitor_urls:
-
+    if custom_url and custom_url not in [data['url'] for data in competitor_data]:
+        custom_content = fetch_content(custom_url, query)
+        competitor_data.append({'url': custom_url, 'content': custom_content})
 
     results = []
-    for
-
-
-
+    for data in competitor_data:
+        score = calculate_relevance_score(data['content'], query, co)
+        results.append({'url': data['url'], 'relevancy_score': score})
+
+    our_content = fetch_content(our_url, query)
+    our_score = calculate_relevance_score(our_content, query, co)
+    results.append({'url': our_url, 'relevancy_score': our_score})
 
-    results.append({'url': our_url, 'relevancy_score': row['relevancy_score']})
     results_df = pd.DataFrame(results).sort_values('relevancy_score', ascending=False)
 
     return results_df
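After this hunk, analyze_competitors recomputes a fresh score for our own page through calculate_relevance_score rather than reusing the precomputed row['relevancy_score']. A sketch of the expected input row and the sorted DataFrame the function returns (URLs and scores below are invented):

    import pandas as pd

    # Example input row for analyze_competitors (values hypothetical)
    row = {'query': 'best hiking boots', 'page': 'https://example.com/boots'}

    # The function returns a frame like this, sorted by relevancy_score descending
    results_df = pd.DataFrame([
        {'url': 'https://competitor.example/a', 'relevancy_score': 0.82},
        {'url': 'https://example.com/boots', 'relevancy_score': 0.74},
    ]).sort_values('relevancy_score', ascending=False)
    print(results_df)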
@@ -445,7 +492,7 @@ def show_model_type_selector():
 )
 
 def calculate_single_relevancy(row):
-    page_content = fetch_content(row['page'])
+    page_content = fetch_content(row['page'], row['query'])
     query = row['query']
     score = calculate_relevance_score(page_content, query, co)
     return score