#1
# import requests
# from bs4 import BeautifulSoup
# url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'  # Replace with the URL you intend to scrape
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')
# # Example of extracting all paragraph texts
# paragraphs = soup.find_all('p')
# for paragraph in paragraphs:
#     print(paragraph.text)
# # Extract all text from the body of the HTML document
# text = soup.body.get_text(separator=' ', strip=True)
# print(text)
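
# A minimal sketch of narrowing the extraction above to a single container with a CSS
# selector instead of grabbing every <p> on the page; the 'div.mw-parser-output p'
# selector is an assumption about Wikipedia's article markup, not part of the original script.
# for paragraph in soup.select('div.mw-parser-output p'):
#     print(paragraph.get_text(strip=True))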
#2
# import requests
# from bs4 import BeautifulSoup
# # List of URLs to scrape
# urls = [
#     'https://vietnix.vn/java-la-gi/',
#     'https://200lab.io/blog/python-la-gi/',
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Extract and print all paragraph texts for each URL
#     paragraphs = soup.find_all('p')
#     print(f'Content from {url}:')
#     for paragraph in paragraphs:
#         print(paragraph.text)
#     print("\n")  # Print a new line for better readability between different URLs
#     # Extract all text from the body of the HTML document for each URL
#     text = soup.body.get_text(separator=' ', strip=True)
#     print(f'Full text from {url}:')
#     print(text)
#     print("=" * 100)  # Print a separator line for better readability between different URLs
#4 Add saving the scraped text to a file
# import requests
# from bs4 import BeautifulSoup
# import os
# # List of URLs to scrape
# urls = [
#     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
#     # Add more URLs as needed
# ]
# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     # Extract the base name of the URL to use as the filename
#     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
#     # Open a new text file for writing the scraped data
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the URL to the file
#         file.write(f'Content from {url}:\n')
#         # Extract and write all paragraph texts for each URL
#         paragraphs = soup.find_all('p')
#         for paragraph in paragraphs:
#             file.write(paragraph.text + '\n')
#         file.write("\n")  # Write a new line for better readability between different URLs
#         # Extract and write all text from the body of the HTML document for each URL
#         text = soup.body.get_text(separator=' ', strip=True)
#         file.write(f'Full text from {url}:\n')
#         file.write(text + '\n')
#         file.write("=" * 100 + '\n')  # Write a separator line for better readability between different URLs
#     # Print a message to confirm the data has been written to the file
#     print(f'Scraped data from {url} has been saved to {filename}')
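
# A minimal sketch of deriving the output filename with urllib.parse instead of
# os.path.basename, which yields an empty name when a URL ends with '/' and keeps any
# query string; the helper name url_to_filename is illustrative, not part of the original script.
# from urllib.parse import urlparse
#
# def url_to_filename(url):
#     """Build a filesystem-safe .txt filename from the last segment of a URL path."""
#     parts = urlparse(url)
#     name = parts.path.strip('/').split('/')[-1] or parts.netloc  # Fall back to the domain for bare '/'
#     safe = ''.join(c if c.isalnum() or c in '-_.' else '_' for c in name)
#     return safe + '.txt'
#
# # Example: url_to_filename('https://vietnix.vn/java-la-gi/') -> 'java-la-gi.txt'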
#5 Add internal link scraping
# import requests
# from bs4 import BeautifulSoup
# import os
# # Initial list of main URLs to scan
# main_urls = [
#     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
#     # Add more main URLs as needed
# ]
# # Function to get all unique links from a given URL
# def get_all_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     unique_links = set()
#     for link in links:
#         href = link.get('href')
#         if href and href.startswith('/wiki/'):  # Filter out unwanted links and keep Wikipedia internal links
#             complete_link = f"https://en.wikipedia.org{href}"
#             unique_links.add(complete_link)
#     return list(unique_links)
# # Iterate over main URLs to get all specific links and scrape data from each
# for main_url in main_urls:
#     urls = get_all_links(main_url)  # Get all sub-links from the main URL
#     for url in urls:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Extract the base name of the URL to use as the filename
#         filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
#         filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters
#         # Open a new text file for writing the scraped data
#         with open(filename, 'w', encoding='utf-8') as file:
#             # Write the URL to the file
#             file.write(f'Content from {url}:\n\n')
#             # Extract and write all paragraph texts for each URL
#             paragraphs = soup.find_all('p')
#             for paragraph in paragraphs:
#                 file.write(paragraph.text + '\n\n')
#             file.write("=" * 100 + '\n')  # Write a separator line for better readability
#         # Print a message to confirm the data has been written to the file
#         print(f'Scraped data from {url} has been saved to {filename}')
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin  # Used to resolve relative links against the page URL

# Initial list of main URLs to scan
main_urls = [
    'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
    # Add more main URLs as needed
]


# Function to get all unique links from a given URL
def get_all_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    unique_links = set()
    for link in links:
        href = link.get('href')
        if href and not href.startswith('#') and not href.startswith('mailto:'):  # Filter out unwanted links such as anchors and emails
            href = urljoin(url, href)  # Absolute links pass through unchanged; relative ones are resolved
            unique_links.add(href)
    return list(unique_links)


# Iterate over main URLs to get all specific links and scrape data from each
for main_url in main_urls:
    urls = get_all_links(main_url)  # Get all sub-links from the main URL
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the base name of the URL to use as the filename
        filename = os.path.basename(url.rstrip('/')).split('#')[0]  # Drop trailing slash and URL fragment
        filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters

        # Open a new text file for writing the scraped data
        with open(filename, 'w', encoding='utf-8') as file:
            # Write the URL to the file
            file.write(f'Content from {url}:\n\n')

            # Extract and write all paragraph texts for each URL
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                file.write(paragraph.text + '\n\n')
            file.write("=" * 100 + '\n')  # Write a separator line for better readability

        # Print a message to confirm the data has been written to the file
        print(f'Scraped data from {url} has been saved to {filename}')
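
# A minimal sketch of how the inner `for url in urls:` loop above could be hardened with a
# request timeout, basic error handling, and a short pause between requests; the 10-second
# timeout and 1-second delay are assumptions, not values taken from the original script.
# import time
#
# def fetch(url):
#     """Return the page HTML, or None if the request fails or times out."""
#     try:
#         response = requests.get(url, timeout=10)
#         response.raise_for_status()  # Treat 4xx/5xx responses as errors
#         return response.text
#     except requests.RequestException as error:
#         print(f'Skipping {url}: {error}')
#         return None
#
# for url in urls:
#     html = fetch(url)
#     if html is None:
#         continue
#     soup = BeautifulSoup(html, 'html.parser')
#     # ... same paragraph extraction and file writing as above ...
#     time.sleep(1)  # Be polite: pause between requests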