Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import requests
|
|
|
2 |
from xml.etree import ElementTree as ET
|
3 |
import openpyxl
|
4 |
import gradio as gr
|
5 |
from urllib.parse import urlparse, urljoin
|
6 |
-
from lxml import etree
|
7 |
|
8 |
visited_urls = set()
|
9 |
unique_urls = set()
|
@@ -30,15 +30,14 @@ def create_sitemap_from_url(home_page_url):
|
|
30 |
|
31 |
# Check if the request was successful
|
32 |
if response.status_code == 200:
|
33 |
-
# Parse the HTML content using
|
34 |
-
|
35 |
-
tree = etree.fromstring(response.content, parser)
|
36 |
|
37 |
# Add the URL to the set of unique URLs
|
38 |
unique_urls.add(url)
|
39 |
|
40 |
# Extract all the links on the page
|
41 |
-
links =
|
42 |
|
43 |
# Visit each link
|
44 |
for link in links:
|
@@ -78,14 +77,12 @@ def fetch_and_save_to_excel(home_page_url):
|
|
78 |
def fetch_page_info(url):
|
79 |
response = requests.get(url)
|
80 |
if response.status_code == 200:
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
description = tree.xpath('//meta[@name="description"]/@content')
|
88 |
-
description = description[0] if description else 'No description found'
|
89 |
return title, keywords, description
|
90 |
return None, None, None
|
91 |
|
|
|
1 |
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
from xml.etree import ElementTree as ET
|
4 |
import openpyxl
|
5 |
import gradio as gr
|
6 |
from urllib.parse import urlparse, urljoin
|
|
|
7 |
|
8 |
visited_urls = set()
|
9 |
unique_urls = set()
|
|
|
30 |
|
31 |
# Check if the request was successful
|
32 |
if response.status_code == 200:
|
33 |
+
# Parse the HTML content using BeautifulSoup
|
34 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
35 |
|
36 |
# Add the URL to the set of unique URLs
|
37 |
unique_urls.add(url)
|
38 |
|
39 |
# Extract all the links on the page
|
40 |
+
links = soup.find_all('a')
|
41 |
|
42 |
# Visit each link
|
43 |
for link in links:
|
|
|
77 |
def fetch_page_info(url):
    """Fetch a page and return its title, meta keywords and meta description.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(title, keywords, description)`` tuple of strings. Any item the
        page does not provide is replaced by a ``'No ... found'`` placeholder.
        ``(None, None, None)`` is returned when the page could not be
        retrieved (request failure, timeout, or a non-200 status code).
    """
    try:
        # Without a timeout a single unresponsive host would block forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Report network failures the same way as a non-200 response.
        return None, None, None

    if response.status_code != 200:
        return None, None, None

    soup = BeautifulSoup(response.text, 'html.parser')

    def _meta_content(name, fallback):
        # A <meta> tag may exist without a "content" attribute; use the
        # placeholder in that case too instead of leaking None.
        tag = soup.find('meta', {'name': name})
        content = tag.get('content') if tag is not None else None
        return content if content is not None else fallback

    # Look the <title> tag up once and reuse the result.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else 'No title found'
    keywords = _meta_content('keywords', 'No keywords found')
    description = _meta_content('description', 'No description found')

    return title, keywords, description
|
88 |
|