Spaces:
Build error
Build error
| from curl_cffi import requests as req | |
| from bs4 import BeautifulSoup | |
| import html2text | |
def scrape_to_markdown(url: str, *, impersonate: str = 'chrome110') -> str:
    """Fetch a webpage and return its content converted to Markdown.

    The page is downloaded with ``curl_cffi`` using a browser impersonation
    profile, non-content tags are removed with BeautifulSoup, and the
    remaining HTML is handed to ``html2text``.

    Args:
        url: The URL of the webpage to scrape.
        impersonate: Browser profile forwarded to ``curl_cffi``'s
            ``impersonate`` option. Defaults to ``'chrome110'``, the value
            that was previously hard-coded.

    Returns:
        The webpage content converted to Markdown.
    """
    # NOTE(review): the HTTP status is not checked here, so an error page
    # (404, 403, ...) is converted like any other page -- confirm intended.
    response = req.get(url, impersonate=impersonate)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Drop elements whose contents are not readable page text.
    # NOTE(review): 'css' is not a standard HTML element; it is kept in the
    # list so the set of removed tags is unchanged.
    for tag in soup(['script', 'style', 'noscript', 'svg', 'css']):
        tag.decompose()

    # Serialize the cleaned tree back to HTML and convert it to Markdown.
    return html2text.html2text(str(soup))