#1 Single-page scrape: print the paragraph texts and the full body text of one URL
# import requests
# from bs4 import BeautifulSoup
# url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'  # Replace with the URL you intend to scrape
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')
# # Example of extracting all paragraph texts
# paragraphs = soup.find_all('p')
# for paragraph in paragraphs:
#     print(paragraph.text)
# # Extract all text from the body of the HTML document
# text = soup.body.get_text(separator=' ', strip=True)
# print(text)
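# For reference, get_text(separator=' ', strip=True) above joins the text nodes with a
# single space and trims surrounding whitespace; verifiable in a REPL:
# >>> BeautifulSoup('<p> Hi <b>there</b> </p>', 'html.parser').get_text(separator=' ', strip=True)
# 'Hi there'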
#2 Scrape a list of URLs and print the paragraph and body text for each one
# import requests
# from bs4 import BeautifulSoup
# # List of URLs to scrape
# urls = [
#     'https://vietnix.vn/java-la-gi/', 'https://200lab.io/blog/python-la-gi/'
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Extract and print all paragraph texts for each URL
#     paragraphs = soup.find_all('p')
#     print(f'Content from {url}:')
#     for paragraph in paragraphs:
#         print(paragraph.text)
#     print("\n")  # Print a new line for better readability between different URLs
#     # Extract all text from the body of the HTML document for each URL
#     text = soup.body.get_text(separator=' ', strip=True)
#     print(f'Full text from {url}:')
#     print(text)
#     print("=" * 100)  # Print a separator line for better readability between different URLs
#4 Save the scraped content of each URL to its own text file
# import requests
# from bs4 import BeautifulSoup
# import os
# # List of URLs to scrape
# urls = [
#     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Use the base name of the URL as the filename
#     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
#     # Open a new text file for writing the scraped data
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the URL to the file
#         file.write(f'Content from {url}:\n')
#         # Extract and write all paragraph texts for each URL
#         paragraphs = soup.find_all('p')
#         for paragraph in paragraphs:
#             file.write(paragraph.text + '\n')
#         file.write("\n")  # Write a new line for better readability
#         # Extract and write all text from the body of the HTML document
#         text = soup.body.get_text(separator=' ', strip=True)
#         file.write(f'Full text from {url}:\n')
#         file.write(text + '\n')
#         file.write("=" * 100 + '\n')  # Write a separator line for better readability
#     # Print a message confirming the data has been written to the file
#     print(f'Scraped data from {url} has been saved to {filename}')
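# For reference, the filename logic above keeps the last path segment of the URL;
# with the standard library this gives:
# >>> import os
# >>> os.path.basename('https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue')
# 'List_of_largest_companies_by_revenue'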
#5 Add internal-link scraping: collect links from each main URL, then scrape every linked page
# import requests
# from bs4 import BeautifulSoup
# import os
# # Initial list of main URLs to scan
# main_urls = [
#     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
#     # Add more main URLs as needed
# ]
# # Function to get all unique links from a given URL
# def get_all_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     unique_links = set()
#     for link in links:
#         href = link.get('href')
#         if href and href.startswith('/wiki/'):  # Keeps only Wikipedia internal links, so this version assumes Wikipedia main URLs
#             complete_link = f"https://en.wikipedia.org{href}"
#             unique_links.add(complete_link)
#     return list(unique_links)
# # Iterate over main URLs to get all specific links and scrape data from each
# for main_url in main_urls:
#     urls = get_all_links(main_url)  # Get all sub-links from the main URL
#     for url in urls:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Use the base name of the URL as the filename
#         filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
#         filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters
#         # Open a new text file for writing the scraped data
#         with open(filename, 'w', encoding='utf-8') as file:
#             # Write the URL to the file
#             file.write(f'Content from {url}:\n\n')
#             # Extract and write all paragraph texts for each URL
#             paragraphs = soup.find_all('p')
#             for paragraph in paragraphs:
#                 file.write(paragraph.text + '\n\n')
#             file.write("=" * 100 + '\n')  # Write a separator line for better readability
#         # Print a message confirming the data has been written to the file
#         print(f'Scraped data from {url} has been saved to {filename}')
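# The hard-coded f"https://en.wikipedia.org{href}" join above only works for Wikipedia;
# the version below generalises it with urllib.parse.urljoin, e.g.:
# >>> from urllib.parse import urljoin
# >>> urljoin('https://en.wikipedia.org/wiki/Python', '/wiki/Monty_Python')
# 'https://en.wikipedia.org/wiki/Monty_Python'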
# Current version: follow all non-anchor links from each main URL and save the
# paragraph text of every linked page to its own text file.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Initial list of main URLs to scan
main_urls = [
    'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
    # Add more main URLs as needed
]


# Function to get all unique links from a given URL
def get_all_links(url):
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    unique_links = set()
    for link in links:
        href = link.get('href')
        if not href or href.startswith('#') or href.startswith('mailto:'):
            continue  # Skip empty hrefs, in-page anchors, and email links
        complete_link = urljoin(url, href)  # Resolve relative links against the page URL
        if complete_link.startswith(('http://', 'https://')):  # Keep only web links
            unique_links.add(complete_link)
    return list(unique_links)


# Iterate over main URLs to get all specific links and scrape data from each
for main_url in main_urls:
    urls = get_all_links(main_url)  # Get all sub-links from the main URL
    for url in urls:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f'Skipping {url}: {error}')
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use the last path segment of the URL as the filename
        filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
        filename = filename.replace('%', '_').replace('?', '_') or 'index'  # Replace special characters; fall back for URLs ending in '/'
        filename += '.txt'
        # Open a new text file for writing the scraped data
        with open(filename, 'w', encoding='utf-8') as file:
            # Write the URL to the file
            file.write(f'Content from {url}:\n\n')
            # Extract and write all paragraph texts for each URL
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                file.write(paragraph.text + '\n\n')
            file.write("=" * 100 + '\n')  # Write a separator line for better readability
        # Print a message confirming the data has been written to the file
        print(f'Scraped data from {url} has been saved to {filename}')
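# Optional sketch (not part of the tool above, kept commented out like the earlier
# versions): if the crawl should stay on the starting site and be polite to the
# server, links can be filtered by domain with urllib.parse.urlparse and a short
# delay added between requests. The helper name get_same_domain_links and the
# 1-second delay are illustrative choices, not project requirements.
# from urllib.parse import urlparse
# import time
#
# def get_same_domain_links(url):
#     base_domain = urlparse(url).netloc
#     return [link for link in get_all_links(url) if urlparse(link).netloc == base_domain]
#
# for main_url in main_urls:
#     for url in get_same_domain_links(main_url):
#         time.sleep(1)  # Wait between requests to avoid hammering the server
#         ...  # Fetch and save the page exactly as in the loop above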