bonrix committed
Commit: e525960
Parent(s): 3573855

Update app.py

Files changed (1): app.py (+10, -13)
app.py CHANGED
@@ -1,9 +1,9 @@
 import requests
+from bs4 import BeautifulSoup
 from xml.etree import ElementTree as ET
 import openpyxl
 import gradio as gr
 from urllib.parse import urlparse, urljoin
-from lxml import etree
 
 visited_urls = set()
 unique_urls = set()
@@ -30,15 +30,14 @@ def create_sitemap_from_url(home_page_url):
 
         # Check if the request was successful
         if response.status_code == 200:
-            # Parse the HTML content using lxml
-            parser = etree.HTMLParser()
-            tree = etree.fromstring(response.content, parser)
+            # Parse the HTML content using BeautifulSoup
+            soup = BeautifulSoup(response.content, 'html.parser')
 
             # Add the URL to the set of unique URLs
             unique_urls.add(url)
 
             # Extract all the links on the page
-            links = tree.xpath('//a[@href]')
+            links = soup.find_all('a')
 
             # Visit each link
             for link in links:
@@ -78,14 +77,12 @@ def fetch_and_save_to_excel(home_page_url):
 def fetch_page_info(url):
     response = requests.get(url)
     if response.status_code == 200:
-        parser = etree.HTMLParser()
-        tree = etree.fromstring(response.content, parser)
-        title = tree.xpath('//title/text()')
-        title = title[0] if title else 'No title found'
-        keywords = tree.xpath('//meta[@name="keywords"]/@content')
-        keywords = keywords[0] if keywords else 'No keywords found'
-        description = tree.xpath('//meta[@name="description"]/@content')
-        description = description[0] if description else 'No description found'
+        soup = BeautifulSoup(response.text, 'html.parser')
+        title = soup.find('title').get_text() if soup.find('title') else 'No title found'
+        keywords = soup.find('meta', {'name': 'keywords'})
+        keywords = keywords.get('content') if keywords else 'No keywords found'
+        description = soup.find('meta', {'name': 'description'})
+        description = description.get('content') if description else 'No description found'
         return title, keywords, description
     return None, None, None
 
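A minimal standalone sketch of the new BeautifulSoup path, assuming the same function names as the diff (extract_links is a hypothetical helper added here for illustration, and the demo URL is illustrative). One behavioral note: the old XPath //a[@href] matched only anchors that carry an href attribute, while soup.find_all('a') also returns anchors without one, so link.get('href') can be None; passing href=True restores the old filter.

import requests
from bs4 import BeautifulSoup

def fetch_page_info(url):
    # Fetch a page and pull its title, keywords, and description,
    # as in the new app.py.
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('title')
        title = title.get_text() if title else 'No title found'
        keywords = soup.find('meta', {'name': 'keywords'})
        keywords = keywords.get('content') if keywords else 'No keywords found'
        description = soup.find('meta', {'name': 'description'})
        description = description.get('content') if description else 'No description found'
        return title, keywords, description
    return None, None, None

def extract_links(html):
    # Hypothetical helper: href=True mirrors the old //a[@href]; a plain
    # find_all('a') would also return anchors whose .get('href') is None.
    soup = BeautifulSoup(html, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)]

if __name__ == '__main__':
    print(fetch_page_info('https://example.com'))  # illustrative URL

The sketch caches the result of soup.find('title') instead of calling it twice as the diff does; the behavior is the same, it just avoids a second tree search.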