#1 Single page: fetch one URL and print its paragraph and body text
# import requests
# from bs4 import BeautifulSoup
#
# url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'  # Replace with the URL you intend to scrape
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')
#
# # Example of extracting all paragraph texts
# paragraphs = soup.find_all('p')
# for paragraph in paragraphs:
#     print(paragraph.text)
#
# # Extract all text from the body of the HTML document
# text = soup.body.get_text(separator=' ', strip=True)
# print(text)
#2 Multiple URLs: loop over a list of pages and print their content
# import requests
# from bs4 import BeautifulSoup
#
# # List of URLs to scrape
# urls = [
#     'https://vietnix.vn/java-la-gi/',
#     'https://200lab.io/blog/python-la-gi/',
#     # Add more URLs as needed
# ]
#
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#
#     # Extract and print all paragraph texts for each URL
#     paragraphs = soup.find_all('p')
#     print(f'Content from {url}:')
#     for paragraph in paragraphs:
#         print(paragraph.text)
#     print("\n")  # Print a blank line for readability between URLs
#
#     # Extract all text from the body of the HTML document for each URL
#     text = soup.body.get_text(separator=' ', strip=True)
#     print(f'Full text from {url}:')
#     print(text)
#     print("=" * 100)  # Print a separator line for readability between URLs
#4 Save to file: write each page's scraped text to a .txt file named after the URL
# import requests
# from bs4 import BeautifulSoup
# import os
#
# # List of URLs to scrape
# urls = [
#     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
#     # Add more URLs as needed
# ]
#
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#
#     # Use the base name of the URL as the filename
#     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
#
#     # Open a new text file for writing the scraped data
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the URL to the file
#         file.write(f'Content from {url}:\n')
#
#         # Extract and write all paragraph texts for each URL
#         paragraphs = soup.find_all('p')
#         for paragraph in paragraphs:
#             file.write(paragraph.text + '\n')
#         file.write("\n")  # Write a blank line for readability
#
#         # Extract and write all text from the body of the HTML document
#         text = soup.body.get_text(separator=' ', strip=True)
#         file.write(f'Full text from {url}:\n')
#         file.write(text + '\n')
#         file.write("=" * 100 + '\n')  # Write a separator line for readability
#
#     # Let the user know the data has been written to the file
#     print(f'Scraped data from {url} has been saved to {filename}')
#5 Internal link scraping: collect links from a main page, then scrape each linked page
# (note: this draft's link filter still targets Wikipedia-style /wiki/ paths, unlike the active version below)
# import requests
# from bs4 import BeautifulSoup
# import os
#
# # Initial list of main URLs to scan
# main_urls = [
#     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
#     # Add more main URLs as needed
# ]
#
# # Function to get all unique links from a given URL
# def get_all_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     unique_links = set()
#     for link in links:
#         href = link.get('href')
#         if href and href.startswith('/wiki/'):  # Keeps only Wikipedia-style internal links
#             complete_link = f"https://en.wikipedia.org{href}"
#             unique_links.add(complete_link)
#     return list(unique_links)
#
# # Iterate over main URLs to get all specific links and scrape data from each
# for main_url in main_urls:
#     urls = get_all_links(main_url)  # Get all sub-links from the main URL
#     for url in urls:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#
#         # Use the base name of the URL as the filename
#         filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
#         filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters
#
#         # Open a new text file for writing the scraped data
#         with open(filename, 'w', encoding='utf-8') as file:
#             # Write the URL to the file
#             file.write(f'Content from {url}:\n\n')
#
#             # Extract and write all paragraph texts for each URL
#             paragraphs = soup.find_all('p')
#             for paragraph in paragraphs:
#                 file.write(paragraph.text + '\n\n')
#             file.write("=" * 100 + '\n')  # Write a separator line for readability
#
#         # Let the user know the data has been written to the file
#         print(f'Scraped data from {url} has been saved to {filename}')
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin

# Initial list of main URLs to scan
main_urls = [
    'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
    # Add more main URLs as needed
]

# Function to get all unique links from a given URL
def get_all_links(url):
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    unique_links = set()
    for link in links:
        href = link.get('href')
        # Filter out unwanted links such as page anchors and email addresses
        if href and not href.startswith('#') and not href.startswith('mailto:'):
            # Resolve relative links against the page URL to get an absolute URL
            absolute = urljoin(url, href)
            if absolute.startswith('http'):  # Keep only http(s) links
                unique_links.add(absolute)
    return list(unique_links)

# Iterate over main URLs to get all specific links and scrape data from each
for main_url in main_urls:
    urls = get_all_links(main_url)  # Get all sub-links from the main URL
    for url in urls:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Use the base name of the URL as the filename
        filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
        filename = filename.replace('%', '_').replace('?', '_')  # Replace special characters
        if not filename:  # Fall back to a generic name when the URL ends with '/'
            filename = 'index'
        filename = filename + '.txt'

        # Open a new text file for writing the scraped data
        with open(filename, 'w', encoding='utf-8') as file:
            # Write the URL to the file
            file.write(f'Content from {url}:\n\n')

            # Extract and write all paragraph texts for each URL
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                file.write(paragraph.text + '\n\n')
            file.write("=" * 100 + '\n')  # Write a separator line for readability

        # Let the user know the data has been written to the file
        print(f'Scraped data from {url} has been saved to {filename}')
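# A possible refinement (sketch only, not wired into the script above): get_all_links
# keeps every absolute link it finds, including links to other sites, so a crawl can
# wander off-domain. One assumed approach is to compare domains with urllib.parse;
# the helper name same_domain_links below is hypothetical.
#
# from urllib.parse import urljoin, urlparse
#
# def same_domain_links(base_url, hrefs):
#     """Keep only links that resolve to the same domain as base_url."""
#     base_domain = urlparse(base_url).netloc
#     kept = set()
#     for href in hrefs:
#         absolute = urljoin(base_url, href)
#         if urlparse(absolute).netloc == base_domain:
#             kept.add(absolute)
#     return sorted(kept)
#
# # Example: same_domain_links('https://proxyway.com/guides/x', ['/blog/', 'https://example.com/'])
# # would keep only 'https://proxyway.com/blog/'.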