ChandimaPrabath committed on
Commit
e0663d3
·
1 Parent(s): dbe8689
Files changed (2) hide show
  1. hf_scrapper.py +3 -3
  2. indexer.py +38 -66
hf_scrapper.py CHANGED
@@ -1,8 +1,8 @@
1
  import os
2
  import json
3
  import aiohttp
4
- import aiofiles
5
  import asyncio
 
6
  import urllib.request
7
  from aiohttp import ClientSession, ClientTimeout
8
  from aiohttp.client_exceptions import ClientError
@@ -13,7 +13,7 @@ CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")
13
 
14
  download_progress = {}
15
 
16
- def get_system_proxies():
17
  """
18
  Retrieves the system's HTTP and HTTPS proxies.
19
 
@@ -70,4 +70,4 @@ async def write_file_structure_to_json(file_structure, file_path):
70
  await json_file.write(json.dumps(file_structure, indent=2))
71
  print(f'File structure written to {file_path}')
72
  except IOError as e:
73
- print(f"Error writing file structure to JSON: {e}")
 
1
  import os
2
  import json
3
  import aiohttp
 
4
  import asyncio
5
+ import aiofiles
6
  import urllib.request
7
  from aiohttp import ClientSession, ClientTimeout
8
  from aiohttp.client_exceptions import ClientError
 
13
 
14
  download_progress = {}
15
 
16
+ async def get_system_proxies():
17
  """
18
  Retrieves the system's HTTP and HTTPS proxies.
19
 
 
70
  await json_file.write(json.dumps(file_structure, indent=2))
71
  print(f'File structure written to {file_path}')
72
  except IOError as e:
73
+ print(f"Error writing file structure to JSON: {e}")
indexer.py CHANGED
@@ -1,73 +1,45 @@
1
- import os
2
  import json
3
- import aiohttp
4
  import asyncio
5
- import aiofiles
6
- import urllib.request
7
- from aiohttp import ClientSession, ClientTimeout
8
- from aiohttp.client_exceptions import ClientError
9
- from tqdm.asyncio import tqdm
10
-
11
- CACHE_DIR = os.getenv("CACHE_DIR")
12
- CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")
13
-
14
- download_progress = {}
15
 
16
- async def get_system_proxies():
17
- """
18
- Retrieves the system's HTTP and HTTPS proxies.
19
 
20
- Returns:
21
- dict: A dictionary containing the proxies.
22
- """
23
  try:
24
- proxies = urllib.request.getproxies()
25
- print("System proxies:", proxies)
26
- return {
27
- "http": proxies.get("http"),
28
- "https": proxies.get("http")
29
- }
 
 
 
 
 
 
 
30
  except Exception as e:
31
- print(f"Error getting system proxies: {e}")
32
- return {}
33
-
34
- async def get_file_structure(repo, token, path="", proxies=None):
35
- """
36
- Fetches the file structure of a specified Hugging Face repository.
37
-
38
- Args:
39
- repo (str): The name of the repository.
40
- token (str): The authorization token for the request.
41
- path (str, optional): The specific path in the repository. Defaults to "".
42
- proxies (dict, optional): The proxies to use for the request. Defaults to None.
43
-
44
- Returns:
45
- list: A list of file structure information.
46
- """
47
- api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
48
- headers = {'Authorization': f'Bearer {token}'}
49
- timeout = ClientTimeout(total=10)
50
- async with ClientSession(timeout=timeout) as session:
51
- print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
52
- try:
53
- async with session.get(api_url, headers=headers, proxy=proxies.get("http")) as response:
54
- response.raise_for_status()
55
- return await response.json()
56
- except ClientError as e:
57
- print(f"Error fetching file structure: {e}")
58
- return []
59
-
60
- async def write_file_structure_to_json(file_structure, file_path):
61
- """
62
- Writes the file structure to a JSON file.
63
-
64
- Args:
65
- file_structure (list): The file structure data.
66
- file_path (str): The path where the JSON file will be saved.
67
- """
68
  try:
69
- async with aiofiles.open(file_path, 'w') as json_file:
70
- await json_file.write(json.dumps(file_structure, indent=2))
71
- print(f'File structure written to {file_path}')
72
- except IOError as e:
73
- print(f"Error writing file structure to JSON: {e}")
 
 
1
  import json
2
+ import logging
3
  import asyncio
4
+ from hf_scrapper import get_system_proxies, get_file_structure, write_file_structure_to_json
5
+ from dotenv import load_dotenv
6
+ import os
 
 
 
 
 
 
 
7
 
8
+ load_dotenv()
 
 
9
 
10
async def index_repository(token, repo, current_path="", proxies=None):
    """Recursively walk a repository tree and build a nested listing.

    Args:
        token (str): Authorization token forwarded to the file-structure API.
        repo (str): Repository identifier.
        current_path (str, optional): Path inside the repo to start from.
            Defaults to the repository root.
        proxies (dict, optional): Proxy mapping forwarded to the HTTP layer.

    Returns:
        list: File entries as returned by the API, with each directory
        replaced by a dict of the form
        ``{"type": "directory", "path": ..., "contents": [...]}`` whose
        contents are indexed recursively.

    Raises:
        Exception: Re-raised after logging when fetching any level fails.
    """
    try:
        entries = await get_file_structure(repo, token, current_path, proxies)
        indexed = []
        for entry in entries:
            if entry['type'] != 'directory':
                # Plain file: keep the API item unchanged.
                indexed.append(entry)
                continue
            # Directory: descend first, then wrap the children.
            children = await index_repository(token, repo, entry['path'], proxies)
            indexed.append({
                "type": "directory",
                "path": entry['path'],
                "contents": children
            })
        return indexed
    except Exception as e:
        logging.error(f"Error indexing repository: {e}")
        raise
28
+
29
async def indexer():
    """Index the configured repository and persist its structure as JSON.

    Reads TOKEN, REPO and INDEX_FILE from the environment (populated by
    ``load_dotenv()`` at import time), resolves system proxies, then walks
    the repository with ``index_repository`` and writes the result to
    INDEX_FILE. Failures during indexing are logged rather than raised.
    """
    token = os.getenv("TOKEN")
    repo = os.getenv("REPO")
    output_path = os.getenv("INDEX_FILE")

    # Guard clause: refuse to run with an incomplete configuration.
    if not (token and repo and output_path):
        logging.error("Environment variables TOKEN, REPO, or INDEX_FILE are not set.")
        return

    proxies = await get_system_proxies()

    try:
        structure = await index_repository(token, repo, "", proxies)
        await write_file_structure_to_json(structure, output_path)
        logging.info(f"Full file structure for repository '{repo}' has been indexed and saved to {output_path}")
    except Exception as e:
        logging.error(f"Error during indexing: {e}")