# instance1/hf_scrapper.py
# NOTE(review): the following lines were non-code residue scraped from the
# Hugging Face file-viewer UI and have been commented out so the module parses:
#   author: ChandimaPrabath — commit: "add cors" (c9f8e3c)
#   viewer links: raw / history / blame — file size: 10.3 kB
import os
import requests
import json
import urllib.request
import time
from requests.exceptions import RequestException
from tqdm import tqdm
# Root directory for all cached media and the JSON store files.
# NOTE(review): os.getenv may return None if CACHE_DIR is unset, which would
# make the os.path.join below raise TypeError at import time — confirm the
# environment always provides it.
CACHE_DIR = os.getenv("CACHE_DIR")
CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")
# Maps a download id (film or episode) to its progress bookkeeping dict:
# {"total", "downloaded", "status", "start_time", optionally "end_time"}.
download_progress = {}
def get_system_proxies():
    """
    Retrieve the system's HTTP and HTTPS proxy settings.

    Returns:
        dict: {"http": ..., "https": ...} proxy URLs (values may be None when
        a scheme has no proxy configured), or an empty dict if the lookup fails.
    """
    try:
        proxies = urllib.request.getproxies()
        print("System proxies:", proxies)
        return {
            "http": proxies.get("http"),
            # BUG FIX: the original returned the *http* proxy for the https key.
            "https": proxies.get("https")
        }
    except Exception as e:
        print(f"Error getting system proxies: {e}")
        return {}
def download_film(file_url, token, cache_path, proxies, film_id, title, chunk_size=100 * 1024 * 1024):
    """
    Download a film from `file_url` to `cache_path`, tracking progress in the
    module-level `download_progress` dict under `film_id`.

    Args:
        file_url (str): The URL of the file to download.
        token (str): Bearer token sent in the Authorization header.
        cache_path (str): The path to save the downloaded file.
        proxies (dict): Proxies for the request (may be empty).
        film_id (str): Unique identifier for the film download.
        title (str): The title of the film (used to update the film store).
        chunk_size (int): Size of each chunk to download, in bytes.
    """
    print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
    headers = {'Authorization': f'Bearer {token}'}
    # BUG FIX: register the progress entry *before* the request so the
    # except/finally handlers below never hit a KeyError when the request
    # itself fails (the original only created the entry after a 2xx response).
    download_progress[film_id] = {"total": 0, "downloaded": 0, "status": "Downloading", "start_time": time.time()}
    try:
        # Use the response as a context manager so the streamed connection is
        # always released, even on write errors.
        with requests.get(file_url, headers=headers, proxies=proxies, stream=True) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            download_progress[film_id]["total"] = total_size
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, 'wb') as file, tqdm(total=total_size, unit='B', unit_scale=True, desc=cache_path) as pbar:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    pbar.update(len(data))
                    download_progress[film_id]["downloaded"] += len(data)
        print(f'File cached to {cache_path} successfully.')
        update_film_store_json(title, cache_path)
        download_progress[film_id]["status"] = "Completed"
    except RequestException as e:
        print(f"Error downloading file: {e}")
        download_progress[film_id]["status"] = "Failed"
    except IOError as e:
        print(f"Error writing file {cache_path}: {e}")
        download_progress[film_id]["status"] = "Failed"
    finally:
        # Record an end time once the download has reached a terminal state.
        if download_progress[film_id]["status"] != "Downloading":
            download_progress[film_id]["end_time"] = time.time()
def get_download_progress(id):
    """
    Return progress information for a single tracked download.

    Args:
        id (str): The unique identifier for the download (film or episode).
        (Parameter name kept for backward compatibility with keyword callers.)

    Returns:
        dict: total size, downloaded bytes, progress percentage, status,
        and estimated remaining time in seconds ("eta", None when unknown).
    """
    entry = download_progress.get(id)
    if entry is None:
        # Unknown id: report a sentinel record rather than raising.
        return {"total": 0, "downloaded": 0, "progress": 0, "status": "Not Found", "eta": None}
    total = entry["total"]
    downloaded = entry["downloaded"]
    status = entry.get("status", "In Progress")
    progress = (downloaded / total) * 100 if total > 0 else 0
    eta = None
    if status == "Downloading" and downloaded > 0:
        # Extrapolate total duration from bytes/second so far.
        elapsed = time.time() - entry["start_time"]
        eta = elapsed * (total / downloaded) - elapsed
    elif status == "Completed":
        eta = 0
    return {"total": total, "downloaded": downloaded, "progress": progress, "status": status, "eta": eta}
def update_film_store_json(title, cache_path):
    """
    Record a downloaded film in the film store JSON (maps title -> local path).

    Args:
        title (str): The title of the film.
        cache_path (str): The local path where the file is saved.
    """
    FILM_STORE_JSON_PATH = os.path.join(CACHE_DIR, "film_store.json")
    film_store_data = {}
    if os.path.exists(FILM_STORE_JSON_PATH):
        # ROBUSTNESS FIX: a corrupt or unreadable store file previously
        # crashed the update (and marked the whole download Failed); rebuild
        # the store instead and keep the freshly downloaded entry.
        try:
            with open(FILM_STORE_JSON_PATH, 'r') as json_file:
                film_store_data = json.load(json_file)
        except (json.JSONDecodeError, IOError) as e:
            print(f"Error reading film store JSON, rebuilding it: {e}")
            film_store_data = {}
    film_store_data[title] = cache_path
    with open(FILM_STORE_JSON_PATH, 'w') as json_file:
        json.dump(film_store_data, json_file, indent=2)
    print(f'Film store updated with {title}.')
###############################################################################
def download_episode(file_url, token, cache_path, proxies, episode_id, title, chunk_size=100 * 1024 * 1024):
    """
    Download a TV episode from `file_url` to `cache_path`, tracking progress
    in the module-level `download_progress` dict under `episode_id`.

    Args:
        file_url (str): The URL of the file to download.
        token (str): Bearer token sent in the Authorization header.
        cache_path (str): The path to save the downloaded file.
        proxies (dict): Proxies for the request (may be empty).
        episode_id (str): Unique identifier for the episode download.
        title (str): The title of the TV show (used to update the TV store).
        chunk_size (int): Size of each chunk to download, in bytes.
    """
    print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
    headers = {'Authorization': f'Bearer {token}'}
    # BUG FIX: register the progress entry *before* the request so the
    # except/finally handlers below never hit a KeyError when the request
    # itself fails (the original only created the entry after a 2xx response).
    download_progress[episode_id] = {"total": 0, "downloaded": 0, "status": "Downloading", "start_time": time.time()}
    try:
        # Use the response as a context manager so the streamed connection is
        # always released, even on write errors.
        with requests.get(file_url, headers=headers, proxies=proxies, stream=True) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            download_progress[episode_id]["total"] = total_size
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, 'wb') as file, tqdm(total=total_size, unit='B', unit_scale=True, desc=cache_path) as pbar:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    pbar.update(len(data))
                    download_progress[episode_id]["downloaded"] += len(data)
        print(f'File cached to {cache_path} successfully.')
        update_tv_store_json(title, cache_path)
        download_progress[episode_id]["status"] = "Completed"
    except RequestException as e:
        print(f"Error downloading file: {e}")
        download_progress[episode_id]["status"] = "Failed"
    except IOError as e:
        print(f"Error writing file {cache_path}: {e}")
        download_progress[episode_id]["status"] = "Failed"
    finally:
        # Record an end time once the download has reached a terminal state.
        if download_progress[episode_id]["status"] != "Downloading":
            download_progress[episode_id]["end_time"] = time.time()
def update_tv_store_json(title, cache_path):
    """
    Record a downloaded episode in the TV store JSON, organized as
    title -> season -> episode filename -> local path.

    Args:
        title (str): The title of the TV show.
        cache_path (str): The local path where the file is saved. The parent
            directory name is used as the season key and the basename as the
            episode key (e.g. ".../Season 1/Show - S01E01.mp4").
    """
    TV_STORE_JSON_PATH = os.path.join(CACHE_DIR, "tv_store.json")
    tv_store_data = {}
    if os.path.exists(TV_STORE_JSON_PATH):
        # ROBUSTNESS FIX: a corrupt or unreadable store file previously
        # crashed the update; rebuild the store instead and keep the
        # freshly downloaded entry.
        try:
            with open(TV_STORE_JSON_PATH, 'r') as json_file:
                tv_store_data = json.load(json_file)
        except (json.JSONDecodeError, IOError) as e:
            print(f"Error reading TV store JSON, rebuilding it: {e}")
            tv_store_data = {}
    # Extract season and episode information from the cache_path
    season_part = os.path.basename(os.path.dirname(cache_path))  # e.g. 'Season 1'
    episode_part = os.path.basename(cache_path)  # e.g. 'Show - S01E01 - ....mp4'
    # Create the nested structure if not already present.
    tv_store_data.setdefault(title, {}).setdefault(season_part, {})[episode_part] = cache_path
    with open(TV_STORE_JSON_PATH, 'w') as json_file:
        json.dump(tv_store_data, json_file, indent=2)
    print(f'TV store updated with {title}, {season_part}, {episode_part}.')
###############################################################################
def get_file_structure(repo, token, path="", proxies=None):
    """
    Fetch the file listing of a Hugging Face repository path via the Hub API.

    Args:
        repo (str): The name of the repository.
        token (str): Bearer token for the Authorization header.
        path (str, optional): The specific path in the repository. Defaults to "".
        proxies (dict, optional): Proxies for the request. Defaults to None.

    Returns:
        list: File structure entries, or an empty list if the request fails.
    """
    headers = {'Authorization': f'Bearer {token}'}
    api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
    print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
    try:
        resp = requests.get(api_url, headers=headers, proxies=proxies)
        resp.raise_for_status()
        return resp.json()
    except RequestException as e:
        # Degrade gracefully on any HTTP/network error.
        print(f"Error fetching file structure: {e}")
        return []
def write_file_structure_to_json(file_structure, file_path):
    """
    Serialize the file structure to a JSON file at `file_path`.

    Args:
        file_structure (list): The file structure data.
        file_path (str): The path where the JSON file will be saved.
    """
    try:
        serialized = json.dumps(file_structure, indent=2)
        with open(file_path, 'w') as out_file:
            out_file.write(serialized)
        print(f'File structure written to {file_path}')
    except IOError as e:
        # Report write failures without propagating, matching the module style.
        print(f"Error writing file structure to JSON: {e}")
if __name__ == "__main__":
    # Example usage: download a single film into the cache.
    file_url = "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
    token = os.getenv("TOKEN")
    cache_path = os.path.join(CACHE_DIR, "films/Funky Monkey 2004/Funky Monkey (2004) Web-dl 1080p.mp4")
    proxies = get_system_proxies()
    film_id = "funky_monkey_2004"  # Unique identifier for the film download
    # BUG FIX: the original call omitted the required `title` argument,
    # which raised TypeError before the download could start.
    download_film(file_url, token, cache_path, proxies=proxies, film_id=film_id, title="Funky Monkey 2004")