bonrix committed
Commit: e525960
Parent(s): 3573855

Update app.py

Files changed (1): app.py (+10, -13)
app.py CHANGED
@@ -1,9 +1,9 @@
 import requests
+from bs4 import BeautifulSoup
 from xml.etree import ElementTree as ET
 import openpyxl
 import gradio as gr
 from urllib.parse import urlparse, urljoin
-from lxml import etree
 
 visited_urls = set()
 unique_urls = set()
@@ -30,15 +30,14 @@ def create_sitemap_from_url(home_page_url):
 
         # Check if the request was successful
         if response.status_code == 200:
-            # Parse the HTML content using lxml
-            parser = etree.HTMLParser()
-            tree = etree.fromstring(response.content, parser)
+            # Parse the HTML content using BeautifulSoup
+            soup = BeautifulSoup(response.content, 'html.parser')
 
             # Add the URL to the set of unique URLs
             unique_urls.add(url)
 
             # Extract all the links on the page
-            links = tree.xpath('//a[@href]')
+            links = soup.find_all('a')
 
             # Visit each link
             for link in links:
@@ -78,14 +77,12 @@ def fetch_and_save_to_excel(home_page_url):
 def fetch_page_info(url):
     response = requests.get(url)
     if response.status_code == 200:
-        parser = etree.HTMLParser()
-        tree = etree.fromstring(response.content, parser)
-        title = tree.xpath('//title/text()')
-        title = title[0] if title else 'No title found'
-        keywords = tree.xpath('//meta[@name="keywords"]/@content')
-        keywords = keywords[0] if keywords else 'No keywords found'
-        description = tree.xpath('//meta[@name="description"]/@content')
-        description = description[0] if description else 'No description found'
+        soup = BeautifulSoup(response.text, 'html.parser')
+        title = soup.find('title').get_text() if soup.find('title') else 'No title found'
+        keywords = soup.find('meta', {'name': 'keywords'})
+        keywords = keywords.get('content') if keywords else 'No keywords found'
+        description = soup.find('meta', {'name': 'description'})
+        description = description.get('content') if description else 'No description found'
         return title, keywords, description
     return None, None, None
 
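A minimal standalone sketch of the new BeautifulSoup path, assuming the same function names as the diff (extract_links is a hypothetical helper added here for illustration, and the demo URL is illustrative). One behavioral note: the old XPath //a[@href] matched only anchors that carry an href attribute, while soup.find_all('a') also returns anchors without one, so link.get('href') can be None; passing href=True restores the old filter.

import requests
from bs4 import BeautifulSoup

def fetch_page_info(url):
    # Fetch a page and pull its title, keywords, and description,
    # as in the new app.py.
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('title')
        title = title.get_text() if title else 'No title found'
        keywords = soup.find('meta', {'name': 'keywords'})
        keywords = keywords.get('content') if keywords else 'No keywords found'
        description = soup.find('meta', {'name': 'description'})
        description = description.get('content') if description else 'No description found'
        return title, keywords, description
    return None, None, None

def extract_links(html):
    # Hypothetical helper: href=True mirrors the old //a[@href]; a plain
    # find_all('a') would also return anchors whose .get('href') is None.
    soup = BeautifulSoup(html, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)]

if __name__ == '__main__':
    print(fetch_page_info('https://example.com'))  # illustrative URL

The sketch caches the result of soup.find('title') instead of calling it twice as the diff does; the behavior is the same, it just avoids a second tree search.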