Spaces:
Sleeping
Sleeping
File size: 5,651 Bytes
295a9df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# environment_loader.py
'''Utility module to load .xlsx files from environment-specific folders and optional Huggingface repositories.'''
import os
from pathlib import Path
import pandas as pd
from datetime import datetime, timedelta
# Optional Huggingface support
try:
from huggingface_hub import hf_hub_download
except ImportError:
hf_hub_download = None
def get_latest_file_in_directory(directory_path: str, pattern: str = '*.xlsx') -> Path:
    '''Return the most recently modified regular file matching pattern.

    Args:
        directory_path: Directory whose direct entries are globbed.
        pattern: Glob pattern applied inside directory_path.

    Returns:
        Path of the matching file with the greatest modification time, or
        None when no usable file matches.
    '''
    # Keep regular files only. Excel drops a '~$<name>.xlsx' lock file next to
    # an open workbook; it matches '*.xlsx', is usually the newest entry, and
    # is not a readable workbook, so it must never be selected.
    workbooks = [
        entry
        for entry in Path(directory_path).glob(pattern)
        if entry.is_file() and not entry.name.startswith('~$')
    ]
    if not workbooks:
        return None
    return max(workbooks, key=lambda entry: entry.stat().st_mtime)
def get_file_by_date(directory_path: str, target_date: datetime.date, pattern: str = '*.xlsx') -> Path:
    '''Return the file for target_date, matched by file name or by modification date.

    Files whose name contains the date formatted as YYYY-MM-DD take priority.
    If there are none, files whose modification time (converted with
    datetime.fromtimestamp) falls on target_date are considered. Within the
    chosen group the most recently modified file wins.

    Args:
        directory_path: Directory whose direct entries are globbed.
        target_date: Calendar date to look for; a datetime is also accepted
            and reduced to its date.
        pattern: Glob pattern applied inside directory_path.

    Returns:
        Path of the selected file, or None when nothing matches.
    '''
    # A date never compares equal to a datetime, so a datetime argument would
    # silently disable the modification-date fallback below.
    if isinstance(target_date, datetime):
        target_date = target_date.date()
    date_str = target_date.strftime('%Y-%m-%d')

    # Regular files only; skip Excel '~$' lock files, which match '*.xlsx' but
    # are not readable workbooks.
    workbooks = [
        entry
        for entry in Path(directory_path).glob(pattern)
        if entry.is_file() and not entry.name.startswith('~$')
    ]

    # First try matching the date string in the file name.
    named = [entry for entry in workbooks if date_str in entry.name]
    if named:
        return max(named, key=lambda entry: entry.stat().st_mtime)

    # Fall back to the modification date; stat each file once and reuse the
    # timestamp for both the date filter and the final ordering.
    stamped = [(entry, entry.stat().st_mtime) for entry in workbooks]
    same_day = [
        pair for pair in stamped
        if datetime.fromtimestamp(pair[1]).date() == target_date
    ]
    if same_day:
        return max(same_day, key=lambda pair: pair[1])[0]
    return None
def load_latest_xlsx_for_env(env_code: str,
                             base_path: str = 'Q:/Selenium_Reports',
                             use_date: bool = False,
                             target_date: datetime.date = None) -> pd.DataFrame:
    '''Read one environment's workbook from <base_path>/<env_code>/XLSX.

    Args:
        env_code: Environment sub-folder name under base_path.
        base_path: Root folder holding one sub-folder per environment.
        use_date: When true, pick the file for target_date instead of the
            most recently modified one.
        target_date: Date to look up; required when use_date is true.

    Returns:
        The result of pd.read_excel on the selected file.

    Raises:
        FileNotFoundError: The XLSX folder is missing or no file was selected.
        ValueError: use_date is true but target_date is None.
    '''
    xlsx_dir = Path(base_path).joinpath(env_code, 'XLSX')
    if not xlsx_dir.exists():
        raise FileNotFoundError(f"Environment folder not found: {xlsx_dir}")
    if use_date and target_date is None:
        raise ValueError('target_date must be provided when use_date is True')

    # Pick the lookup strategy: date-specific file or newest file.
    workbook = (
        get_file_by_date(xlsx_dir, target_date)
        if use_date
        else get_latest_file_in_directory(xlsx_dir)
    )
    if workbook is None:
        raise FileNotFoundError(f'No .xlsx files found for environment {env_code} in {xlsx_dir}')
    return pd.read_excel(workbook)
def load_environments(env_codes: list,
                      base_path: str = 'Q:/Selenium_Reports',
                      by_date: bool = False,
                      days_ago: int = 1) -> dict:
    '''Build a mapping of environment code to its loaded workbook.

    Args:
        env_codes: Environment codes to load, one workbook each.
        base_path: Root folder passed through to load_latest_xlsx_for_env.
        by_date: When true, load the file dated days_ago days before today
            rather than the most recent file.
        days_ago: Day offset subtracted from the current local date.

    Returns:
        Dict keyed by environment code with the value returned by
        load_latest_xlsx_for_env for that code.
    '''
    # The reference date is computed once so every environment uses the same day.
    wanted_day = datetime.now().date() - timedelta(days=days_ago)
    return {
        code: load_latest_xlsx_for_env(
            env_code=code,
            base_path=base_path,
            use_date=by_date,
            target_date=wanted_day,
        )
        for code in env_codes
    }
def load_from_huggingface(repo_id: str,
                          filenames: list,
                          revision: str = 'main') -> dict:
    '''Fetch files from a Huggingface repo and read each one with pandas.

    Args:
        repo_id: Repository identifier forwarded to hf_hub_download.
        filenames: Names of the files to fetch from the repository.
        revision: Revision forwarded to hf_hub_download.

    Returns:
        Dict keyed by file name with the result of pd.read_excel on the
        path returned by hf_hub_download.

    Raises:
        ImportError: huggingface_hub could not be imported.
    '''
    # hf_hub_download is None when the optional import at module top failed.
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    return {
        name: pd.read_excel(
            hf_hub_download(repo_id=repo_id, filename=name, revision=revision)
        )
        for name in filenames
    }
def get_latest_xlsx_path_for_env(env_code: str,
                                 base_path: str = 'Q:/Selenium_Reports',
                                 use_date: bool = False,
                                 target_date: datetime.date = None) -> Path:
    '''Locate one environment's workbook in <base_path>/<env_code>/XLSX without reading it.

    Args:
        env_code: Environment sub-folder name under base_path.
        base_path: Root folder holding one sub-folder per environment.
        use_date: When true, pick the file for target_date instead of the
            most recently modified one.
        target_date: Date to look up; required when use_date is true.

    Returns:
        Path of the selected file.

    Raises:
        FileNotFoundError: The XLSX folder is missing or no file was selected.
        ValueError: use_date is true but target_date is None.
    '''
    env_dir = Path(base_path) / env_code / 'XLSX'
    if not env_dir.exists():
        raise FileNotFoundError(f"Environment folder not found: {env_dir}")

    if not use_date:
        selected = get_latest_file_in_directory(env_dir)
    elif target_date is None:
        raise ValueError('target_date must be provided when use_date is True')
    else:
        selected = get_file_by_date(env_dir, target_date)

    if selected is None:
        raise FileNotFoundError(f'No .xlsx files found for environment {env_code} in {env_dir}')
    return selected
def get_environments_paths(env_codes: list,
                           base_path: str = 'Q:/Selenium_Reports',
                           by_date: bool = False,
                           days_ago: int = 1) -> dict:
    '''Build a mapping of environment code to the Path of its workbook.

    Args:
        env_codes: Environment codes to resolve, one file each.
        base_path: Root folder passed through to get_latest_xlsx_path_for_env.
        by_date: When true, resolve the file dated days_ago days before today
            rather than the most recent file.
        days_ago: Day offset subtracted from the current local date.

    Returns:
        Dict keyed by environment code with the value returned by
        get_latest_xlsx_path_for_env for that code.
    '''
    # Resolve the reference day once so all environments share it.
    reference_day = datetime.now().date() - timedelta(days=days_ago)
    return {
        code: get_latest_xlsx_path_for_env(
            env_code=code,
            base_path=base_path,
            use_date=by_date,
            target_date=reference_day,
        )
        for code in env_codes
    }
def get_huggingface_paths(repo_id: str,
                          filenames: list,
                          revision: str = 'main') -> dict:
    '''Fetch files from a Huggingface repo and return where they were stored locally.

    Args:
        repo_id: Repository identifier forwarded to hf_hub_download.
        filenames: Names of the files to fetch from the repository.
        revision: Revision forwarded to hf_hub_download.

    Returns:
        Dict keyed by file name with a Path built from the value returned by
        hf_hub_download; the files are not parsed.

    Raises:
        ImportError: huggingface_hub could not be imported.
    '''
    # hf_hub_download is None when the optional import at module top failed.
    if hf_hub_download is None:
        raise ImportError('huggingface_hub is not installed. Please pip install huggingface_hub')
    return {
        name: Path(hf_hub_download(repo_id=repo_id, filename=name, revision=revision))
        for name in filenames
    }