Spaces:
Sleeping
Sleeping
import os | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import time | |
from random import randint | |
def scrape_tariffs(urls) -> None:
    """Scrape tariff rows from each URL and save them to ``data/tariffs.csv``.

    Every ``<tr>`` with at least two ``<td>`` cells is read as a
    ``(category, rate)`` pair; rows whose second cell does not parse as a
    float (after removing thousands separators) are skipped.  A URL whose
    first request fails is retried up to three times with a short random
    delay.  A random pause is inserted between consecutive URLs so the
    target servers are not hit too quickly.

    Args:
        urls: Iterable of page URLs to fetch.

    Returns:
        None.  When at least one row was collected the CSV is written and a
        success message is printed; otherwise a "no data" message is printed
        and no file is written.
    """
    # Ensure the 'data' directory exists before saving the CSV.
    os.makedirs("data", exist_ok=True)

    data = []
    for index, url in enumerate(urls):
        if index:
            # Pause only *between* requests; sleeping after the final URL
            # would just delay the function without protecting any server.
            time.sleep(randint(2, 5))
        data.extend(_fetch_tariff_rows(url))

    if not data:
        print("No tariff data found.")
        return

    # Save the scraped data to the 'data' directory.
    pd.DataFrame(data).to_csv("data/tariffs.csv", index=False)
    print("Tariff data saved successfully.")


def _fetch_tariff_rows(url, retries=3):
    """Return the rows scraped from *url*, retrying failed requests."""
    try:
        return _download_tariff_rows(url)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        print("Retrying...")

    for _ in range(retries):
        time.sleep(randint(1, 3))  # Random back-off before each retry.
        try:
            return _download_tariff_rows(url)
        except requests.exceptions.RequestException as retry_error:
            # Report the error raised by *this* retry, not the stale error
            # from the first attempt.
            print(f"Retry failed: {retry_error}")

    # Every attempt failed: contribute no rows for this URL.
    return []


def _download_tariff_rows(url):
    """Fetch *url* once and return its parsed rows; raise on request errors."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raises for 4xx / 5xx status codes.
    # Only a plain 200 is parsed; other non-error statuses yield no rows.
    if response.status_code != 200:
        return []
    return _parse_tariff_rows(response.content)


def _parse_tariff_rows(html):
    """Extract ``{"category", "rate"}`` dicts from the table rows in *html*."""
    parsed = []
    for row in BeautifulSoup(html, "html.parser").find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        try:
            rate = float(cells[1].text.strip().replace(",", ""))
        except ValueError:
            # Second cell is not numeric (header text, "N/A", ...): skip row.
            continue
        parsed.append({"category": cells[0].text.strip(), "rate": rate})
    return parsed