#1 Single-page scrape: print the paragraph texts and the full body text of one URL
# import requests
# from bs4 import BeautifulSoup
# url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'  # Replace with the URL you intend to scrape
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')
# # Example of extracting all paragraph texts
# paragraphs = soup.find_all('p')
# for paragraph in paragraphs:
#     print(paragraph.text)
# # Extract all text from the body of the HTML document
# text = soup.body.get_text(separator=' ', strip=True)
# print(text)
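# For reference, get_text(separator=' ', strip=True) above joins the text nodes with a
# single space and trims surrounding whitespace; verifiable in a REPL:
# >>> BeautifulSoup('<p> Hi <b>there</b> </p>', 'html.parser').get_text(separator=' ', strip=True)
# 'Hi there'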
#2 Scrape a list of URLs and print the paragraph and body text for each one
# import requests
# from bs4 import BeautifulSoup
# # List of URLs to scrape
# urls = [
#     'https://vietnix.vn/java-la-gi/', 'https://200lab.io/blog/python-la-gi/'
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Extract and print all paragraph texts for each URL
#     paragraphs = soup.find_all('p')
#     print(f'Content from {url}:')
#     for paragraph in paragraphs:
#         print(paragraph.text)
#     print("\n")  # Print a new line for better readability between different URLs
#     # Extract all text from the body of the HTML document for each URL
#     text = soup.body.get_text(separator=' ', strip=True)
#     print(f'Full text from {url}:')
#     print(text)
#     print("=" * 100)  # Print a separator line for better readability between different URLs
#4 Save the scraped content of each URL to its own text file
# import requests
# from bs4 import BeautifulSoup
# import os
# # List of URLs to scrape
# urls = [
#     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Use the base name of the URL as the filename
#     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
#     # Open a new text file for writing the scraped data
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the URL to the file
#         file.write(f'Content from {url}:\n')
#         # Extract and write all paragraph texts for each URL
#         paragraphs = soup.find_all('p')
#         for paragraph in paragraphs:
#             file.write(paragraph.text + '\n')
#         file.write("\n")  # Write a new line for better readability
#         # Extract and write all text from the body of the HTML document
#         text = soup.body.get_text(separator=' ', strip=True)
#         file.write(f'Full text from {url}:\n')
#         file.write(text + '\n')
#         file.write("=" * 100 + '\n')  # Write a separator line for better readability
#     # Print a message confirming the data has been written to the file
#     print(f'Scraped data from {url} has been saved to {filename}')
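# For reference, the filename logic above keeps the last path segment of the URL;
# with the standard library this gives:
# >>> import os
# >>> os.path.basename('https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue')
# 'List_of_largest_companies_by_revenue'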
#5 Add internal-link scraping: collect links from each main URL, then scrape every linked page
# import requests
# from bs4 import BeautifulSoup
# import os
# # Initial list of main URLs to scan
# main_urls = [
#     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
#     # Add more main URLs as needed
# ]
# # Function to get all unique links from a given URL
# def get_all_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     unique_links = set()
#     for link in links:
#         href = link.get('href')
#         if href and href.startswith('/wiki/'):  # Keeps only Wikipedia internal links, so this version assumes Wikipedia main URLs
#             complete_link = f"https://en.wikipedia.org{href}"
#             unique_links.add(complete_link)
#     return list(unique_links)
# # Iterate over main URLs to get all specific links and scrape data from each
# for main_url in main_urls:
#     urls = get_all_links(main_url)  # Get all sub-links from the main URL
#     for url in urls:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Use the base name of the URL as the filename
#         filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
#         filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters
#         # Open a new text file for writing the scraped data
#         with open(filename, 'w', encoding='utf-8') as file:
#             # Write the URL to the file
#             file.write(f'Content from {url}:\n\n')
#             # Extract and write all paragraph texts for each URL
#             paragraphs = soup.find_all('p')
#             for paragraph in paragraphs:
#                 file.write(paragraph.text + '\n\n')
#             file.write("=" * 100 + '\n')  # Write a separator line for better readability
#         # Print a message confirming the data has been written to the file
#         print(f'Scraped data from {url} has been saved to {filename}')
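# The hard-coded f"https://en.wikipedia.org{href}" join above only works for Wikipedia;
# the version below generalises it with urllib.parse.urljoin, e.g.:
# >>> from urllib.parse import urljoin
# >>> urljoin('https://en.wikipedia.org/wiki/Python', '/wiki/Monty_Python')
# 'https://en.wikipedia.org/wiki/Monty_Python'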
# Current version: follow all non-anchor links from each main URL and save the
# paragraph text of every linked page to its own text file.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Initial list of main URLs to scan
main_urls = [
    'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
    # Add more main URLs as needed
]


# Function to get all unique links from a given URL
def get_all_links(url):
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    unique_links = set()
    for link in links:
        href = link.get('href')
        if not href or href.startswith('#') or href.startswith('mailto:'):
            continue  # Skip empty hrefs, in-page anchors, and email links
        complete_link = urljoin(url, href)  # Resolve relative links against the page URL
        if complete_link.startswith(('http://', 'https://')):  # Keep only web links
            unique_links.add(complete_link)
    return list(unique_links)


# Iterate over main URLs to get all specific links and scrape data from each
for main_url in main_urls:
    urls = get_all_links(main_url)  # Get all sub-links from the main URL
    for url in urls:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f'Skipping {url}: {error}')
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use the last path segment of the URL as the filename
        filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
        filename = filename.replace('%', '_').replace('?', '_') or 'index'  # Replace special characters; fall back for URLs ending in '/'
        filename += '.txt'
        # Open a new text file for writing the scraped data
        with open(filename, 'w', encoding='utf-8') as file:
            # Write the URL to the file
            file.write(f'Content from {url}:\n\n')
            # Extract and write all paragraph texts for each URL
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                file.write(paragraph.text + '\n\n')
            file.write("=" * 100 + '\n')  # Write a separator line for better readability
        # Print a message confirming the data has been written to the file
        print(f'Scraped data from {url} has been saved to {filename}')
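# Optional sketch (not part of the tool above, kept commented out like the earlier
# versions): if the crawl should stay on the starting site and be polite to the
# server, links can be filtered by domain with urllib.parse.urlparse and a short
# delay added between requests. The helper name get_same_domain_links and the
# 1-second delay are illustrative choices, not project requirements.
# from urllib.parse import urlparse
# import time
#
# def get_same_domain_links(url):
#     base_domain = urlparse(url).netloc
#     return [link for link in get_all_links(url) if urlparse(link).netloc == base_domain]
#
# for main_url in main_urls:
#     for url in get_same_domain_links(main_url):
#         time.sleep(1)  # Wait between requests to avoid hammering the server
#         ...  # Fetch and save the page exactly as in the loop above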