andrewrreed's picture
andrewrreed HF staff
Add filters
167137b
raw
history blame
No virus
1.87 kB
from typing import Literal
from huggingface_hub import HfFileSystem, hf_hub_download
# Maps the short category key used in the data to the human-readable
# category name. Insertion order is kept as-is in case consumers rely on it.
KEY_TO_CATEGORY_NAME = {
    # Whole-dataset and topic/length based categories.
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    # Language based categories.
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    # Categories that exclude part of the data.
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}
# Maps a human-readable category name to a longer explanation string
# for that category.
CAT_NAME_TO_EXPLANATION = {
    # Whole-dataset and topic/length based categories.
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    # Language based categories.
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    # Categories that exclude part of the data.
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}
# License strings that are treated as proprietary.
PROPRIETARY_LICENSES = ["Proprietary"]
def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Download the latest data file of the given type from a Space repository.

    The files matching ``spaces/{repo_id}/*.{file_type}`` are listed, and the
    one whose trailing ``_``-separated token (taken from the file name up to
    its first ``.``) compares highest as a string is downloaded. That token
    is presumably a lexicographically sortable date stamp (e.g. ``YYYYMMDD``)
    -- confirm against the files actually published in the Space.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to
            download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        IndexError: If no file with the requested extension is found.
    """

    def extract_date(filename: str) -> str:
        # e.g. "spaces/<repo_id>/results_20240101.pkl" -> "20240101"
        basename = filename.rsplit("/", 1)[-1]
        stem = basename.split(".", 1)[0]
        return stem.rsplit("_", 1)[-1]

    fs = HfFileSystem()
    files = fs.glob(f"spaces/{repo_id}/*.{file_type}")

    if not files:
        # IndexError is kept (it is what indexing the empty list used to
        # raise) so existing handlers keep working, but now with a message
        # that says what is actually missing.
        raise IndexError(
            f"No '*.{file_type}' files found in space '{repo_id}'"
        )

    # max() returns the first maximal element, i.e. the same file that
    # sorting in descending order and taking index 0 would select.
    latest_file = max(files, key=extract_date)

    # hf_hub_download expects the path relative to the repo root, so strip
    # the "spaces/<repo_id>/" prefix returned by the filesystem listing.
    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.rsplit("/", 1)[-1],
        repo_type="space",
    )