# 1: scrape a single URL
# import requests
# from bs4 import BeautifulSoup

# url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'  # Replace with the URL you intend to scrape
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')

# # Example of extracting all paragraph texts
# paragraphs = soup.find_all('p')
# for paragraph in paragraphs:
#     print(paragraph.text)


# # Extract all text from the body of the HTML document
# text = soup.body.get_text(separator=' ', strip=True)
# print(text)

# 2: scrape a list of URLs

# import requests
# from bs4 import BeautifulSoup

# # List of URLs to scrape
# urls = [
#     'https://vietnix.vn/java-la-gi/', 'https://200lab.io/blog/python-la-gi/'
#     # Add more URLs as needed
# ]

# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     # Extract and print all paragraph texts for each URL
#     paragraphs = soup.find_all('p')
#     print(f'Content from {url}:')
#     for paragraph in paragraphs:
#         print(paragraph.text)
#     print("\n")  # Print a new line for better readability between different URLs
    
#     # Extract all text from the body of the HTML document for each URL
#     text = soup.body.get_text(separator=' ', strip=True)
#     print(f'Full text from {url}:')
#     print(text)
#     print("="*100)  # Print a separator line for better readability between different URLs

# 4: save the scraped text to a file per URL
# import requests
# from bs4 import BeautifulSoup
# import os

# # List of URLs to scrape
# urls = [
#     'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue',
#     # Add more URLs as needed
# ]

# for url in urls:
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
    
#     # Extracting base name of the URL to use as the filename
#     filename = os.path.basename(url).replace('%', '_').replace('?', '_') + '.txt'
    
#     # Open a new text file for writing the scraped data
#     with open(filename, 'w', encoding='utf-8') as file:
#         # Write the URL to the file
#         file.write(f'Content from {url}:\n')
        
#         # Extract and write all paragraph texts for each URL
#         paragraphs = soup.find_all('p')
#         for paragraph in paragraphs:
#             file.write(paragraph.text + '\n')
#         file.write("\n")  # Write a new line for better readability between different URLs
        
#         # Extract and write all text from the body of the HTML document for each URL
#         text = soup.body.get_text(separator=' ', strip=True)
#         file.write(f'Full text from {url}:\n')
#         file.write(text + '\n')
#         file.write("="*100 + '\n')  # Write a separator line for better readability between different URLs
    
#     # Print out a message to let you know the data has been written to the file
#     print(f'Scraped data from {url} has been saved to {filename}')

# 5: also scrape internal links found on the main URLs
# import requests
# from bs4 import BeautifulSoup
# import os

# # Initial list of main URLs to scan
# main_urls = [
#     'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
#     # Add more main URLs as needed
# ]

# # Function to get all unique links from a given URL
# def get_all_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     unique_links = set()
#     for link in links:
#         href = link.get('href')
#         if href and href.startswith('/wiki/'):  # Keep only Wikipedia-style internal links (/wiki/...)
#             complete_link = f"https://en.wikipedia.org{href}"
#             unique_links.add(complete_link)
#     return list(unique_links)

# # Iterate over main URLs to get all specific links and scrape data from each
# for main_url in main_urls:
#     urls = get_all_links(main_url)  # Get all sub-links from the main URL
#     for url in urls:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'html.parser')
        
#         # Extracting base name of the URL to use as the filename
#         filename = os.path.basename(url).split('#')[0]  # Remove URL fragments
#         filename = filename.replace('%', '_').replace('?', '_') + '.txt'  # Replace special characters
        
#         # Open a new text file for writing the scraped data
#         with open(filename, 'w', encoding='utf-8') as file:
#             # Write the URL to the file
#             file.write(f'Content from {url}:\n\n')
            
#             # Extract and write all paragraph texts for each URL
#             paragraphs = soup.find_all('p')
#             for paragraph in paragraphs:
#                 file.write(paragraph.text + '\n\n')
#             file.write("="*100 + '\n')  # Write a separator line for better readability
        
#         # Print out a message to let you know the data has been written to the file
#         print(f'Scraped data from {url} has been saved to {filename}')

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

# Initial list of main URLs to scan
main_urls = [
    'https://proxyway.com/guides/best-websites-to-practice-your-web-scraping-skills',
    # Add more main URLs as needed
]

# Function to get all unique links from a given URL
def get_all_links(url):
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    unique_links = set()
    for link in links:
        href = link.get('href')
        if href and not href.startswith('#') and not href.startswith('mailto:'):  # Skip in-page anchors and email links
            # urljoin resolves relative links against the page URL correctly,
            # instead of naively concatenating the two strings
            full_url = urljoin(url, href)
            unique_links.add(full_url)
    return list(unique_links)
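
# Optional sketch (not part of the original script): get_all_links returns every
# link on the page, including links to external sites. If the crawl should stay on
# the same domain as the main URL, a helper like the hypothetical filter_same_domain
# below could be applied to its result before scraping.
from urllib.parse import urlparse

def filter_same_domain(links, base_url):
    # Keep only links whose domain matches the domain of the base URL
    base_netloc = urlparse(base_url).netloc
    return [link for link in links if urlparse(link).netloc == base_netloc]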

# Iterate over main URLs to get all specific links and scrape data from each
for main_url in main_urls:
    urls = get_all_links(main_url)  # Get all sub-links from the main URL
    for url in urls:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Derive a filename from the last path segment of the URL
        filename = os.path.basename(url.rstrip('/')).split('#')[0]  # Drop any trailing slash and URL fragment
        filename = filename.replace('%', '_').replace('?', '_') or 'page'  # Replace special characters; fall back if empty
        filename += '.txt'
        
        # Open a new text file for writing the scraped data
        with open(filename, 'w', encoding='utf-8') as file:
            # Write the URL to the file
            file.write(f'Content from {url}:\n\n')
            
            # Extract and write all paragraph texts for each URL
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                file.write(paragraph.text + '\n\n')
            file.write("="*100 + '\n')  # Write a separator line for better readability
        
        # Print out a message to let you know the data has been written to the file
        print(f'Scraped data from {url} has been saved to {filename}')
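
# Optional sketch (not part of the original script): for politeness and robustness,
# the requests could go through a shared Session with an explicit User-Agent and a
# short pause between pages. The header string and the 1-second delay below are
# illustrative assumptions, not values taken from the original code.
import time

def polite_get(session, url, delay=1.0):
    # Wait before each request so the target site is not hit in a tight loop
    time.sleep(delay)
    return session.get(url, timeout=10)

# Example usage (commented out so the script's behaviour is unchanged):
# session = requests.Session()
# session.headers.update({'User-Agent': 'simple-scraper-example/0.1'})
# response = polite_get(session, main_urls[0])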