#1
# import requests
# from bs4 import BeautifulSoup
# url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'  # Replace with the URL you intend to scrape
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')
# # Example of extracting all paragraph texts
# paragraphs = soup.find_all('p')
# for paragraph in paragraphs:
#     print(paragraph.text)
# # Extract all text from the body of the HTML document
# text = soup.body.get_text(separator=' ', strip=True)
# print(text)
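
# A minimal sketch of narrowing the extraction above to a single container with a CSS
# selector instead of grabbing every <p> on the page; the 'div.mw-parser-output p'
# selector is an assumption about Wikipedia's article markup, not part of the original script.
# for paragraph in soup.select('div.mw-parser-output p'):
#     print(paragraph.get_text(strip=True))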
#2
# import requests
# from bs4 import BeautifulSoup
# # List of URLs to scrape
# urls = [
#     'https://vietnix.vn/java-la-gi/',
#     'https://200lab.io/blog/python-la-gi/',
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Extract and print all paragraph texts for each URL
#     paragraphs = soup.find_all('p')
#     print(f'Content from {url}:')
#     for paragraph in paragraphs:
#         print(paragraph.text)
#     print("\n")  # Print a new line for better readability between different URLs
#     # Extract all text from the body of the HTML document for each URL
#     text = soup.body.get_text(separator=' ', strip=True)
#     print(f'Full text from {url}:')
#     print(text)
#     print("=" * 100)  # Print a separator line for better readability between different URLs
#4 Add saving the scraped text to a file
# import requests
# from bs4 import BeautifulSoup
# import os
# # List of URLs to scrape
# urls = [
#     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Extract the base name of the URL to use as the filename
#     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
#     # Open a new text file for writing the scraped data
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the URL to the file
#         file.write(f'Content from {url}:\n')
#         # Extract and write all paragraph texts for each URL
#         paragraphs = soup.find_all('p')
#         for paragraph in paragraphs:
#             file.write(paragraph.text + '\n')
#         file.write("\n")  # Write a new line for better readability between different URLs
#         # Extract and write all text from the body of the HTML document for each URL
#         text = soup.body.get_text(separator=' ', strip=True)
#         file.write(f'Full text from {url}:\n')
#         file.write(text + '\n')
#         file.write("=" * 100 + '\n')  # Write a separator line for better readability between different URLs
#     # Print a message to confirm the data has been written to the file
#     print(f'Scraped data from {url} has been saved to {filename}')
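
# A minimal sketch of deriving the output filename with urllib.parse instead of
# os.path.basename, which yields an empty name when a URL ends with '/' and keeps any
# query string; the helper name url_to_filename is illustrative, not part of the original script.
# from urllib.parse import urlparse
#
# def url_to_filename(url):
#     """Build a filesystem-safe .txt filename from the last segment of a URL path."""
#     parts = urlparse(url)
#     name = parts.path.strip('/').split('/')[-1] or parts.netloc  # Fall back to the domain for bare '/'
#     safe = ''.join(c if c.isalnum() or c in '-_.' else '_' for c in name)
#     return safe + '.txt'
#
# # Example: url_to_filename('https://vietnix.vn/java-la-gi/') -> 'java-la-gi.txt'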
#5 Add internal link scraping
# import requests
# from bs4 import BeautifulSoup
# import os
# # Initial list of main URLs to scan
# main_urls = [
#     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
#     # Add more main URLs as needed
# ]
# # Function to get all unique links from a given URL
# def get_all_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     unique_links = set()
#     for link in links:
#         href = link.get('href')
#         if href and href.startswith('/wiki/'):  # Filter out unwanted links and keep Wikipedia internal links
#             complete_link = f"https://en.wikipedia.org{href}"
#             unique_links.add(complete_link)
#     return list(unique_links)
# # Iterate over main URLs to get all specific links and scrape data from each
# for main_url in main_urls:
#     urls = get_all_links(main_url)  # Get all sub-links from the main URL
#     for url in urls:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Extract the base name of the URL to use as the filename
#         filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
#         filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters
#         # Open a new text file for writing the scraped data
#         with open(filename, 'w', encoding='utf-8') as file:
#             # Write the URL to the file
#             file.write(f'Content from {url}:\n\n')
#             # Extract and write all paragraph texts for each URL
#             paragraphs = soup.find_all('p')
#             for paragraph in paragraphs:
#                 file.write(paragraph.text + '\n\n')
#             file.write("=" * 100 + '\n')  # Write a separator line for better readability
#         # Print a message to confirm the data has been written to the file
#         print(f'Scraped data from {url} has been saved to {filename}')
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin  # Used to resolve relative links against the page URL

# Initial list of main URLs to scan
main_urls = [
    'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
    # Add more main URLs as needed
]


# Function to get all unique links from a given URL
def get_all_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    unique_links = set()
    for link in links:
        href = link.get('href')
        if href and not href.startswith('#') and not href.startswith('mailto:'):  # Filter out unwanted links such as anchors and emails
            href = urljoin(url, href)  # Absolute links pass through unchanged; relative ones are resolved
            unique_links.add(href)
    return list(unique_links)


# Iterate over main URLs to get all specific links and scrape data from each
for main_url in main_urls:
    urls = get_all_links(main_url)  # Get all sub-links from the main URL
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the base name of the URL to use as the filename
        filename = os.path.basename(url.rstrip('/')).split('#')[0]  # Drop trailing slash and URL fragment
        filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters

        # Open a new text file for writing the scraped data
        with open(filename, 'w', encoding='utf-8') as file:
            # Write the URL to the file
            file.write(f'Content from {url}:\n\n')

            # Extract and write all paragraph texts for each URL
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                file.write(paragraph.text + '\n\n')
            file.write("=" * 100 + '\n')  # Write a separator line for better readability

        # Print a message to confirm the data has been written to the file
        print(f'Scraped data from {url} has been saved to {filename}')
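
# A minimal sketch of how the inner `for url in urls:` loop above could be hardened with a
# request timeout, basic error handling, and a short pause between requests; the 10-second
# timeout and 1-second delay are assumptions, not values taken from the original script.
# import time
#
# def fetch(url):
#     """Return the page HTML, or None if the request fails or times out."""
#     try:
#         response = requests.get(url, timeout=10)
#         response.raise_for_status()  # Treat 4xx/5xx responses as errors
#         return response.text
#     except requests.RequestException as error:
#         print(f'Skipping {url}: {error}')
#         return None
#
# for url in urls:
#     html = fetch(url)
#     if html is None:
#         continue
#     soup = BeautifulSoup(html, 'html.parser')
#     # ... same paragraph extraction and file writing as above ...
#     time.sleep(1)  # Be polite: pause between requests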