poemsforaphrodite committed
Commit 2913f49
1 Parent(s): 858a793

Update app.py

Files changed (1): app.py (+63 -16)
app.py CHANGED
@@ -16,6 +16,9 @@ import requests
 from bs4 import BeautifulSoup
 from apify_client import ApifyClient
 import urllib.parse
+import openai
+from openai import OpenAI
+import re
 
 
 load_dotenv()
@@ -32,6 +35,12 @@ if not APIFY_API_TOKEN:
 client = ApifyClient(APIFY_API_TOKEN)
 # Initialize the ApifyClient with the API token
 
+# Initialize the OpenAI client under its own name, so it does not shadow the Apify client above
+OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
+if not OPENAI_API_KEY:
+    st.error("OPENAI_API_KEY is not set in the environment variables. Please set it and restart the application.")
+openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
 # Configuration: Set to True if running locally, False if running on Streamlit Cloud
 IS_LOCAL = False
 
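Note that `st.error` only renders a message; the script keeps running, and constructing the OpenAI client without a key will typically raise on the next line anyway. A minimal hardening sketch (assumes the app's existing `streamlit` import as `st`; `st.stop()` ends the current Streamlit run early):

    import os
    import streamlit as st
    from openai import OpenAI

    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
    if not OPENAI_API_KEY:
        st.error("OPENAI_API_KEY is not set. Please set it and restart the application.")
        st.stop()  # end this run instead of constructing a client with no key
    openai_client = OpenAI(api_key=OPENAI_API_KEY)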
@@ -128,9 +137,12 @@ def get_serp_results(query):
     # logger.info(f"Fetched {len(results)} results from Apify dataset")
 
     if results and 'organicResults' in results[0]:
-        urls = [item['url'] for item in results[0]['organicResults']]
-        # logger.info(f"Extracted {len(urls)} URLs from organic results")
-        return urls
+        serp_data = []
+        for item in results[0]['organicResults'][:5]:  # Limit to top 5 results
+            url = item['url']
+            content = fetch_content(url, query)
+            serp_data.append({'url': url, 'content': content})
+        return serp_data
     else:
         # logger.warning("No organic results found in the SERP data.")
         st.warning("No organic results found in the SERP data.")
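With this change, `get_serp_results` returns a list of `{'url', 'content'}` dicts rather than a bare URL list, and each of the top five results now costs an HTTP fetch plus a GPT call before any scoring happens. Illustrative shape only (placeholder URLs):

    # What callers of get_serp_results(query) can now expect:
    serp_data = [
        {'url': 'https://example.com/a', 'content': 'query-relevant text extracted by GPT ...'},
        {'url': 'https://example.com/b', 'content': 'No relevant content found.'},
    ]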
@@ -143,14 +155,47 @@ def get_serp_results(query):
 
 
 
+def extract_relevant_content(full_content, query):
+    try:
+        response = openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that extracts the most relevant content from web pages."},
+                {"role": "user", "content": f"Given the following web page content and search query, extract only the most relevant parts of the content that answer or relate to the query. If there's no relevant content, say 'No relevant content found.'\n\nQuery: {query}\n\nContent: {full_content[:4000]}"}  # Limit input to 4000 characters
+            ],
+            max_tokens=3000  # Adjust as needed
+        )
+        return response.choices[0].message.content.strip()
+    except Exception as e:
+        st.error(f"Error in GPT content extraction: {str(e)}")
+        return "Error in content extraction"
+
-def fetch_content(url):
+def fetch_content(url, query):
     try:
         decoded_url = urllib.parse.unquote(url)
         response = requests.get(decoded_url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-        content = soup.get_text(separator=' ', strip=True)
-        return content
+
+        # Remove navigation and boilerplate elements
+        for unwanted in soup(['nav', 'header', 'footer', 'sidebar', 'menu', 'aside']):
+            unwanted.decompose()
+
+        # Try to find the main content container
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content|main|body'))
+
+        if main_content:
+            content = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Fall back to the whole body if no main content container is found
+            content = soup.body.get_text(separator=' ', strip=True)
+
+        # Collapse runs of whitespace into single spaces
+        content = re.sub(r'\s+', ' ', content)
+
+        # Use GPT to extract only the query-relevant content
+        relevant_content = extract_relevant_content(content, query)
+        return relevant_content
     except requests.RequestException:
         return ""
 
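Two caveats worth flagging here: only `requests.RequestException` is caught, so a response with no `<body>` (e.g. an XML document) would raise an uncaught `AttributeError` at `soup.body.get_text`, and every fetch now incurs a GPT round trip. A hypothetical smoke test for the new path (placeholder URL and query; assumes the OpenAI client above is configured):

    # Fetch a page, strip nav/header/footer chrome, and GPT-extract the query-relevant text
    relevant = fetch_content('https://example.com/article', 'best espresso grinder')
    print(relevant[:500])  # '' on request failure, 'Error in content extraction' on GPT failure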
@@ -176,19 +221,21 @@ def analyze_competitors(row, co, custom_url=None):
     query = row['query']
     our_url = row['page']
 
-    competitor_urls = get_serp_results(query)
-    competitor_urls = [url for url in competitor_urls if not url.startswith('/search')][:5]  # Get top 5 valid competitors
+    competitor_data = get_serp_results(query)
 
-    if custom_url and custom_url not in competitor_urls:
-        competitor_urls.append(custom_url)
+    if custom_url and custom_url not in [data['url'] for data in competitor_data]:
+        custom_content = fetch_content(custom_url, query)
+        competitor_data.append({'url': custom_url, 'content': custom_content})
 
     results = []
-    for url in competitor_urls:
-        content = fetch_content(url)
-        score = calculate_relevance_score(content, query, co)
-        results.append({'url': url, 'relevancy_score': score})
+    for data in competitor_data:
+        score = calculate_relevance_score(data['content'], query, co)
+        results.append({'url': data['url'], 'relevancy_score': score})
 
-    results.append({'url': our_url, 'relevancy_score': row['relevancy_score']})
+    our_content = fetch_content(our_url, query)
+    our_score = calculate_relevance_score(our_content, query, co)
+    results.append({'url': our_url, 'relevancy_score': our_score})
+
     results_df = pd.DataFrame(results).sort_values('relevancy_score', ascending=False)
 
     return results_df
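Behavior changes to note: the old `/search` prefix filter is gone (the top-5 cap now lives in `get_serp_results`), and `our_url` is re-fetched and re-scored through the same GPT pipeline instead of reusing `row['relevancy_score']`, which keeps the comparison consistent at the cost of one extra fetch. The returned frame looks like this (values illustrative):

    import pandas as pd

    # One row per competitor plus our own page, sorted by relevancy_score descending
    results_df = pd.DataFrame([
        {'url': 'https://example.com/b', 'relevancy_score': 0.91},
        {'url': 'https://ourdomain.com/page', 'relevancy_score': 0.77},
    ]).sort_values('relevancy_score', ascending=False)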
@@ -445,7 +492,7 @@ def show_model_type_selector():
     )
 
 def calculate_single_relevancy(row):
-    page_content = fetch_content(row['page'])
+    page_content = fetch_content(row['page'], row['query'])
     query = row['query']
     score = calculate_relevance_score(page_content, query, co)
     return score
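Since `fetch_content` now takes two positional arguments, any call site missed by this commit will fail with a `TypeError` at runtime. A hypothetical check of the updated call (assumes the module-level Cohere client `co` used by `calculate_relevance_score` is initialized):

    # row as produced by the app's query table (placeholder values)
    row = {'page': 'https://example.com/post', 'query': 'how to descale a kettle'}
    score = calculate_single_relevancy(row)  # fetches the page, then scores it against the query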
 