Spaces:

Hans-Den
/

load-balancer

Running

App Files Files Community

ChandimaPrabath committed on Aug 19, 2024

Commit

836f75b

1 Parent(s): 15d97c1

0.0.0.1 Alpha

Browse files

Files changed (9) hide show

.gitignore +10 -0
LoadBalancer.py +302 -0
api.py +63 -0
app.py +18 -1
hf_scrapper.py +42 -0
indexer.py +33 -0
requirements.txt +1 -0
tvdb.py +91 -0
utils.py +64 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+#.env
+.env
+# cache
+tmp
+# pycache
+__pycache__
+# stream-test.py
+stream-test.py
+#test
+test.py

LoadBalancer.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import os
+import json
+from indexer import indexer
+import re
+from tvdb import fetch_and_cache_json
+from threading import Event, Thread
+import time
+import logging
+from utils import convert_to_gb
+from api import InstancesAPI
+# Cache directory read from the environment; None when CACHE_DIR is unset.
+CACHE_DIR = os.getenv("CACHE_DIR")
+# Module-level progress map; it is not referenced anywhere else in this file.
+download_progress = {}
+class LoadBalancer:
+    def __init__(self, cache_dir, token, repo, polling_interval=4, max_retries=3, initial_delay=1):
+        self.version = "0.0.0.1 Alpha"
+        self.instances = []
+        self.instances_health = {}
+        self.polling_interval = polling_interval
+        self.max_retries = max_retries
+        self.initial_delay = initial_delay
+        self.stop_event = Event()
+        self.instances_api = InstancesAPI(self.instances)
+        self.CACHE_DIR = cache_dir
+        self.TOKEN = token
+        self.REPO = repo
+        self.FILM_STORE = {}
+        self.TV_STORE = {}
+        self.file_structure = None
+        # Ensure CACHE_DIR exists
+        if not os.path.exists(self.CACHE_DIR):
+            os.makedirs(self.CACHE_DIR)
+        # Index the file structure initially
+        self.file_structure=indexer()
+        # Start polling and file checking in separate threads
+        polling_thread = Thread(target=self.start_polling)
+        polling_thread.daemon = True
+        polling_thread.start()
+    def register_instance(self, instance_url):
+        if instance_url not in self.instances:
+            self.instances.append(instance_url)
+            logging.info(f"Registered instance {instance_url}")
+        else:
+            logging.info(f"Instance {instance_url} is already registered.")
+    def remove_instance(self, instance_url):
+        if instance_url in self.instances:
+            self.instances.remove(instance_url)
+            self.instances_health.pop(instance_url, None)
+            logging.info(f"Removed instance {instance_url}")
+        else:
+            logging.info(f"Instance {instance_url} not found for removal.")
+    def get_reports(self):
+        reports = self.instances_api.fetch_reports()
+        # Initialize temporary JSON data holders
+        temp_film_store = {}
+        temp_tv_store = {}
+        for instance_url in self.instances[:]:  # Copy list to avoid modification during iteration
+            if instance_url in reports:
+                report = reports[instance_url]
+                logging.info(f"Report from {instance_url}: {report}")
+                self.process_report(instance_url, report, temp_film_store, temp_tv_store)
+            else:
+                logging.error(f"Failed to get report from {instance_url}. Removing instance.")
+                self.remove_instance(instance_url)
+        self.FILM_STORE = temp_film_store
+        self.TV_STORE = temp_tv_store
+    def process_report(self, instance_url, report, temp_film_store, temp_tv_store):
+        film_store = report.get('film_store', {})
+        tv_store = report.get('tv_store', {})
+        cache_size = report.get('cache_size')
+        logging.info(f"Processing report from {instance_url}")
+        # Update temporary film store
+        for title, path in film_store.items():
+            url = f"{instance_url}/api/film/{title.replace(' ', '%20')}"
+            temp_film_store[title] = url
+        # Update temporary TV store
+        for title, seasons in tv_store.items():
+            if title not in temp_tv_store:
+                temp_tv_store[title] = {}
+            for season, episodes in seasons.items():
+                if season not in temp_tv_store[title]:
+                    temp_tv_store[title][season] = {}
+                for episode, path in episodes.items():
+                    url = f"{instance_url}/api/tv/{title.replace(' ', '%20')}/{season.replace(' ', '%20')}/{episode.replace(' ', '%20')}"
+                    temp_tv_store[title][season][episode] = url
+        logging.info("Film and TV Stores processed successfully.")
+        self.update_instances_health(instance=instance_url, cache_size=cache_size)
+    def start_polling(self):
+        logging.info("Starting polling.")
+        while not self.stop_event.is_set():
+            self.get_reports()
+            time.sleep(self.polling_interval)
+        logging.info("Polling stopped.")
+    def stop_polling(self):
+        logging.info("Stopping polling.")
+        self.stop_event.set()
+    def start_prefetching(self):
+        """Start the metadata prefetching in a separate thread."""
+        self.prefetch_metadata()
+#################################################################
+    def update_instances_health(self, instance, cache_size):
+        self.instances_health[instance] = {"used":cache_size["cache_size"],
+                                           "total": "50 GB"}
+        logging.info(f"Updated instance {instance} with cache size {cache_size}")
+    def download_film_to_best_instance(self, title):
+        """
+        Downloads a film to the first instance that has more free space on the self.instance_health list variable.
+        The instance_health looks like this:
+        {
+            "https://unicone-studio-instance1.hf.space": {
+                "total": "50 GB",
+                "used": "3.33 GB"
+            }
+        }
+        Args:
+            title (str): The title of the film.
+        """
+        best_instance = None
+        max_free_space = -1
+        # Calculate free space for each instance
+        for instance_url, space_info in self.instances_health.items():
+            total_space = convert_to_gb(space_info['total'])
+            used_space = convert_to_gb(space_info['used'])
+            free_space = total_space - used_space
+            if free_space > max_free_space:
+                max_free_space = free_space
+                best_instance = instance_url
+        if best_instance:
+            result = self.instances_api.download_film(best_instance, title)
+            film_id = result["film_id"]
+            status = result["status"]
+            progress_url = f'{best_instance}/api/progress/{film_id}'
+            response = {
+                "film_id":film_id,
+                "status":status,
+                "progress_url":progress_url
+            }
+            return response
+        else:
+            logging.error("No suitable instance found for downloading the film.")
+            return {"error": "No suitable instance found for downloading the film."}
+    def download_episode_to_best_instance(self, title, season, episode):
+        """
+        Downloads a episode to the first instance that has more free space on the self.instance_health list variable.
+        The instance_health looks like this:
+        {
+            "https://unicone-studio-instance1.hf.space": {
+                "total": "50 GB",
+                "used": "3.33 GB"
+            }
+        }
+        Args:
+            title (str): The title of the Tv show.
+            season (str): The season of the Tv show.
+            episode (str): The title of the Tv show.
+        """
+        best_instance = None
+        max_free_space = -1
+        # Calculate free space for each instance
+        for instance_url, space_info in self.instances_health.items():
+            total_space = convert_to_gb(space_info['total'])
+            used_space = convert_to_gb(space_info['used'])
+            free_space = total_space - used_space
+            if free_space > max_free_space:
+                max_free_space = free_space
+                best_instance = instance_url
+        if best_instance:
+            result = self.instances_api.download_episode(best_instance, title, season, episode)
+            episode_id = result["episode_id"]
+            status = result["status"]
+            progress_url = f'{best_instance}/api/progress/{episode_id}'
+            response = {
+                "episode_id":episode_id,
+                "status":status,
+                "progress_url":progress_url
+            }
+            return response
+        else:
+            logging.error("No suitable instance found for downloading the film.")
+            return {"error": "No suitable instance found for downloading the film."}
+#################################################################
+    def find_movie_path(self, title):
+        """Find the path of the movie in the JSON data based on the title."""
+        for directory in self.file_structure:
+            if directory['type'] == 'directory' and directory['path'] == 'films':
+                for sub_directory in directory['contents']:
+                    if sub_directory['type'] == 'directory':
+                        for item in sub_directory['contents']:
+                            if item['type'] == 'file' and title.lower() in item['path'].lower():
+                                return item['path']
+        return None
+    def find_tv_path(self, title):
+        """Find the path of the TV show in the JSON data based on the title."""
+        for directory in self.file_structure:
+            if directory['type'] == 'directory' and directory['path'] == 'tv':
+                for sub_directory in directory['contents']:
+                    if sub_directory['type'] == 'directory' and title.lower() in sub_directory['path'].lower():
+                        return sub_directory['path']
+        return None
+    def get_tv_structure(self, title):
+        """Find the path of the TV show in the JSON data based on the title."""
+        for directory in self.file_structure:
+            if directory['type'] == 'directory' and directory['path'] == 'tv':
+                for sub_directory in directory['contents']:
+                    if sub_directory['type'] == 'directory' and title.lower() in sub_directory['path'].lower():
+                        return sub_directory
+        return None
+    def get_film_id(self, title):
+        """Generate a film ID based on the title."""
+        return title.replace(" ", "_").lower()
+    def prefetch_metadata(self):
+        """Prefetch metadata for all items in the file structure."""
+        for item in self.file_structure:
+            if 'contents' in item:
+                for sub_item in item['contents']:
+                    original_title = sub_item['path'].split('/')[-1]
+                    media_type = 'series' if item['path'].startswith('tv') else 'movie'
+                    title = original_title
+                    year = None
+                    # Extract year from the title if available
+                    match = re.search(r'\((\d{4})\)', original_title)
+                    if match:
+                        year_str = match.group(1)
+                        if year_str.isdigit() and len(year_str) == 4:
+                            title = original_title[:match.start()].strip()
+                            year = int(year_str)
+                    else:
+                        parts = original_title.rsplit(' ', 1)
+                        if len(parts) > 1 and parts[-1].isdigit() and len(parts[-1]) == 4:
+                            title = parts[0].strip()
+                            year = int(parts[-1])
+                    fetch_and_cache_json(original_title, title, media_type, year)
+    def get_all_tv_shows(self):
+        """Get all TV shows from the indexed cache structure JSON file."""
+        tv_shows = {}
+        for directory in self.file_structure:
+            if directory['type'] == 'directory' and directory['path'] == 'tv':
+                for sub_directory in directory['contents']:
+                    if sub_directory['type'] == 'directory':
+                        show_title = sub_directory['path'].split('/')[-1]
+                        tv_shows[show_title] = []
+                        for season_directory in sub_directory['contents']:
+                            if season_directory['type'] == 'directory':
+                                season = season_directory['path'].split('/')[-1]
+                                for episode in season_directory['contents']:
+                                    if episode['type'] == 'file':
+                                        tv_shows[show_title].append({
+                                            "season": season,
+                                            "episode": episode['path'].split('/')[-1],
+                                            "path": episode['path']
+                                        })
+        return tv_shows
+    def get_all_films(self):
+        """Get all films from the indexed cache structure JSON file."""
+        films = []
+        for directory in self.file_structure:
+            if directory['type'] == 'directory' and directory['path'] == 'films':
+                for sub_directory in directory['contents']:
+                    if sub_directory['type'] == 'directory':
+                        films.append(sub_directory['path'])
+        return films

api.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import requests
+import logging
+class InstancesAPI:
+    def __init__(self, instances):
+        self.instances = instances
+    def fetch_reports(self):
+        reports = {}
+        for instance_url in self.instances:
+            try:
+                response = requests.get(f"{instance_url}/api/get/report")
+                response.raise_for_status()
+                reports[instance_url] = response.json()
+            except requests.exceptions.RequestException as e:
+                logging.error(f"Error contacting instance {instance_url}: {e}")
+        return reports
+    def download_film(self, instance_url, title):
+        """
+        Download a film to an instance.
+        If the download started, it returns a JSON like this:
+        example:
+            {"film_id": "my_spy_2020",
+            "status": "Download started"}
+        If the film has already been downloaded, it will return the video file.
+        """
+        data = {}
+        try:
+            response = requests.get(f"{instance_url}/api/film/{title}")
+            response.raise_for_status()
+            data = response.json()
+        except requests.exceptions.RequestException as e:
+            logging.error(f"Error contacting instance {instance_url}: {e}")
+            data = {"error": str(e)}
+        return data
+    def download_episode(self, instance_url, title, season, episode):
+        """
+        Download a film to an instance.
+        If the download started, it returns a JSON like this:
+        example:
+            {"film_id": "my_spy_2020",
+            "status": "Download started"}
+        If the film has already been downloaded, it will return the video file.
+        """
+        data = {}
+        try:
+            response = requests.get(f"{instance_url}/api/tv/{title}/{season}/{episode}")
+            response.raise_for_status()
+            data = response.json()
+        except requests.exceptions.RequestException as e:
+            logging.error(f"Error contacting instance {instance_url}: {e}")
+            data = {"error": str(e)}
+        return data

app.py CHANGED Viewed

@@ -1,7 +1,24 @@
 from fastapi import FastAPI
 app = FastAPI()
 @app.get("/")
 def greet_json():
-    return {"Hello": "World!"}

 from fastapi import FastAPI
+from LoadBalancer import LoadBalancer
+import os
+# Runtime configuration comes from the environment; each value is None when its variable is unset.
+CACHE_DIR = os.getenv("CACHE_DIR")
+TOKEN = os.getenv("TOKEN")
+REPO = os.getenv("REPO")
+# Constructed at import time, so importing this module runs LoadBalancer.__init__.
+load_balancer = LoadBalancer(cache_dir=CACHE_DIR, token=TOKEN, repo=REPO)
 app = FastAPI()
 @app.get("/")
 def greet_json():
+    return {"Version": "0.0.1 Alpha"}
+@app.post("/api/register")
+def register_instance():
+    #register the instance to Instance Register
+    pass
+@app.get("/api/get/file_structure")
+def get_file_structure():
+    return load_balancer.file_structure

hf_scrapper.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+import requests
+import json
+from requests.exceptions import RequestException
+def get_file_structure(repo, token, path=""):
+    """
+    Fetches the file structure of a specified Hugging Face repository.
+    Args:
+        repo (str): The name of the repository.
+        token (str): The authorization token for the request.
+        path (str, optional): The specific path in the repository. Defaults to "".
+    Returns:
+        list: A list of file structure information.
+    """
+    api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
+    headers = {'Authorization': f'Bearer {token}'}
+    print(f"Fetching file structure from URL: {api_url}")
+    try:
+        response = requests.get(api_url, headers=headers)
+        response.raise_for_status()
+        return response.json()
+    except RequestException as e:
+        print(f"Error fetching file structure: {e}")
+        return []
+def write_file_structure_to_json(file_structure, file_path):
+    """
+    Writes the file structure to a JSON file.
+    Args:
+        file_structure (list): The file structure data.
+        file_path (str): The path where the JSON file will be saved.
+    """
+    try:
+        with open(file_path, 'w') as json_file:
+            json.dump(file_structure, json_file, indent=2)
+        print(f'File structure written to {file_path}')
+    except IOError as e:
+        print(f"Error writing file structure to JSON: {e}")

indexer.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from hf_scrapper import get_file_structure, write_file_structure_to_json
+from dotenv import load_dotenv
+import os
+load_dotenv()
+def index_repository(token, repo, current_path=""):
+    file_structure = get_file_structure(repo, token, current_path)
+    full_structure = []
+    for item in file_structure:
+        if item['type'] == 'directory':
+            sub_directory_structure = index_repository(token, repo, item['path'])
+            full_structure.append({
+                "type": "directory",
+                "path": item['path'],
+                "contents": sub_directory_structure
+            })
+        else:
+            # Exclude 'oid' and 'lfs' from the file item
+            file_item = {
+                "type": item['type'],
+                "size": item['size'],
+                "path": item['path']
+            }
+            full_structure.append(file_item)
+    return full_structure
+def indexer():
+    token = os.getenv("TOKEN")
+    repo = os.getenv("REPO")
+    full_structure = index_repository(token, repo, "")
+    print(f"Full file structure for repository '{repo}' has been indexed.")
+    return full_structure

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 fastapi
 uvicorn[standard]

 fastapi
 uvicorn[standard]
+python-dotenv
+requests

tvdb.py ADDED Viewed

	@@ -0,0 +1,91 @@

+# tvdb.py
+import os
+import requests
+import urllib.parse
+from datetime import datetime, timedelta
+from dotenv import load_dotenv
+import json
+load_dotenv()
+# TheTVDB API key, API base URL and cache directory, read from the environment (None when unset).
+THETVDB_API_KEY = os.getenv("THETVDB_API_KEY")
+THETVDB_API_URL = os.getenv("THETVDB_API_URL")
+CACHE_DIR = os.getenv("CACHE_DIR")
+# Cached login token and its expiry time; both are assigned by authenticate_thetvdb().
+TOKEN_EXPIRY = None
+THETVDB_TOKEN = None
+def authenticate_thetvdb():
+    global THETVDB_TOKEN, TOKEN_EXPIRY
+    auth_url = f"{THETVDB_API_URL}/login"
+    auth_data = {
+        "apikey": THETVDB_API_KEY
+    }
+    try:
+        response = requests.post(auth_url, json=auth_data)
+        response.raise_for_status()
+        response_data = response.json()
+        THETVDB_TOKEN = response_data['data']['token']
+        TOKEN_EXPIRY = datetime.now() + timedelta(days=30)
+    except requests.RequestException as e:
+        print(f"Authentication failed: {e}")
+        THETVDB_TOKEN = None
+        TOKEN_EXPIRY = None
+def get_thetvdb_token():
+    global THETVDB_TOKEN, TOKEN_EXPIRY
+    if not THETVDB_TOKEN or datetime.now() >= TOKEN_EXPIRY:
+        authenticate_thetvdb()
+    return THETVDB_TOKEN
+def fetch_and_cache_json(original_title, title, media_type, year=None):
+    if year:
+        search_url = f"{THETVDB_API_URL}/search?query={urllib.parse.quote(title)}&type={media_type}&year={year}"
+    else:
+        search_url = f"{THETVDB_API_URL}/search?query={urllib.parse.quote(title)}&type={media_type}"
+    token = get_thetvdb_token()
+    if not token:
+        print("Authentication failed")
+        return
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "accept": "application/json",
+    }
+    try:
+        # Fetch initial search results
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+        if 'data' in data and data['data']:
+            # Extract the TVDB ID and type from the first result
+            first_result = data['data'][0]
+            tvdb_id = first_result.get('tvdb_id')
+            media_type = first_result.get('type')
+            if not tvdb_id:
+                print("TVDB ID not found in the search results")
+                return
+            # Determine the correct extended URL based on media type
+            if media_type == 'movie':
+                extended_url = f"{THETVDB_API_URL}/movies/{tvdb_id}/extended?meta=translations"
+            elif media_type == 'series':
+                extended_url = f"{THETVDB_API_URL}/series/{tvdb_id}/extended?meta=translations"
+            else:
+                print(f"Unsupported media type: {media_type}")
+                return
+            # Request the extended information using the TVDB ID
+            response = requests.get(extended_url, headers=headers)
+            response.raise_for_status()
+            extended_data = response.json()
+            # Cache the extended JSON response
+            json_cache_path = os.path.join(CACHE_DIR, f"{urllib.parse.quote(original_title)}.json")
+            with open(json_cache_path, 'w') as f:
+                json.dump(extended_data, f)
+    except requests.RequestException as e:
+        print(f"Error fetching data: {e}")

utils.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import re
+def is_valid_url(url):
+    """
+    Validates the URL.
+    Args:
+        url (str): The URL to validate.
+    Returns:
+        bool: True if the URL is valid, False otherwise.
+    """
+    regex = re.compile(
+        r'^(?:http|ftp)s?://'  # http:// or https://
+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+        r'localhost|'  # localhost...
+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
+        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
+        r'(?::\d+)?'  # optional port
+        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+    return re.match(regex, url) is not None
+def convert_to_gb(space_str):
+    """
+    Converts a space string like '50 GB' or '3.33 GB' to a float representing the number of GB.
+    Args:
+        space_str (str): The space string to convert.
+    Returns:
+        float: The space in GB.
+    """
+    return float(space_str.split()[0])
+def bytes_to_human_readable(num, suffix="B"):
+    """
+    Converts bytes to a human-readable format.
+    Args:
+        num (int): The number of bytes.
+        suffix (str): The suffix to use (default is 'B').
+    Returns:
+        str: The human-readable string.
+    """
+    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
+        if abs(num) < 1024.0:
+            return f"{num:3.1f} {unit}{suffix}"
+        num /= 1024.0
+    return f"{num:.1f} Y{suffix}"
+def encode_episodeid(title, season, episode):
+    """
+    Encodes the episode ID based on title, season, and episode.
+    Args:
+        title (str): The title of the TV show.
+        season (str): The season of the TV show.
+        episode (str): The episode number.
+    Returns:
+        str: The encoded episode ID.
+    """
+    return f"{title}_{season}_{episode}"