import json
import os
from typing import List, Literal, Optional

from pydantic import BaseModel


# Settings taken from environment variables. Every default is the result of
# os.getenv() evaluated when this class body runs, so a field is None when its
# variable is unset.
class EnvConfig(BaseModel):
    # your token from Settings
    hf_token: str = os.getenv("HF_TOKEN")
    # NAME of TEI endpoint
    tei_name: str = os.getenv("TEI_NAME")
    # name of chunked dataset
    chunked_ds_name: str = os.getenv("CHUNKED_DS_NAME")
    # name of embeddings dataset
    embed_ds_name: str = os.getenv("EMBED_DS_NAME")
    # splits of input dataset to process, comma separated
    # NOTE: on the module-level `env_config` instance this field is replaced
    # by a list of split names right after construction (see end of file).
    input_splits: str = os.getenv("INPUT_SPLITS")
    # name of column to load from input dataset
    input_text_col: str = os.getenv("INPUT_TEXT_COL")


# Options validated from configs/chunk_config.json.
class ChunkConfig(BaseModel):
    strategy: Literal["recursive", "sequence", "constant"]
    split_seq: str
    chunk_len: int
    private: bool


# Options validated from configs/embed_config.json.
class EmbedConfig(BaseModel):
    private: bool
    semaphore_bound: int


# The `event` section of a WebhookPayload.
class WebhookPayloadEvent(BaseModel):
    action: Literal["create", "update", "delete"]
    scope: str


# The `repo` section of a WebhookPayload.
class WebhookPayloadRepo(BaseModel):
    type: Literal["dataset", "model", "space"]
    name: str
    id: str
    private: bool
    headSha: str


# Top-level webhook payload: an event description plus the repo it refers to.
class WebhookPayload(BaseModel):
    event: WebhookPayloadEvent
    repo: WebhookPayloadRepo


def _load_config(model_cls, relative_path: str):
    # Read the JSON file at `relative_path` (resolved against the current
    # working directory) and validate its contents into `model_cls`.
    # The parsed object is validated directly; re-serialising it to a JSON
    # string first would only add a redundant dump/parse round trip.
    config_path = os.path.join(os.getcwd(), relative_path)
    with open(config_path, encoding="utf-8") as fh:
        return model_cls.model_validate(json.load(fh))


def _parse_splits(raw: Optional[str]) -> List[str]:
    # Convert a comma-separated string into a list of split names.
    # Each segment is stripped BEFORE the emptiness check so whitespace-only
    # segments (e.g. "train, ,test" or a trailing ", ") are dropped instead of
    # becoming "" entries. None (variable unset) and "" both yield [].
    if not raw:
        return []
    stripped = (segment.strip() for segment in raw.split(","))
    return [name for name in stripped if name]


chunk_config = _load_config(ChunkConfig, "configs/chunk_config.json")
embed_config = _load_config(EmbedConfig, "configs/embed_config.json")

env_config = EnvConfig()
# Replace the raw comma-separated string with the parsed list of split names.
env_config.input_splits = _parse_splits(env_config.input_splits)