import json
import os
from typing import Literal

from pydantic import BaseModel


class EnvConfig(BaseModel):
	# your token from Settings
	hf_token: str = os.getenv("HF_TOKEN")
	# name of the TEI endpoint
	tei_name: str = os.getenv("TEI_NAME")
	# name of the chunked dataset
	chunked_ds_name: str = os.getenv("CHUNKED_DS_NAME")
	# name of the embeddings dataset
	embed_ds_name: str = os.getenv("EMBED_DS_NAME")
	# splits of the input dataset to process, comma separated
	input_splits: str = os.getenv("INPUT_SPLITS")
	# name of the column to load from the input dataset
	input_text_col: str = os.getenv("INPUT_TEXT_COL")
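
# An illustrative environment for this module (variable names match the
# os.getenv calls above; the values are placeholders, not real defaults):
#
#   HF_TOKEN=hf_xxxxxxxx
#   TEI_NAME=my-tei-endpoint
#   CHUNKED_DS_NAME=username/my-dataset-chunked
#   EMBED_DS_NAME=username/my-dataset-embeddings
#   INPUT_SPLITS=train,test
#   INPUT_TEXT_COL=text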


class ChunkConfig(BaseModel):
	# which chunking strategy to use
	strategy: Literal["recursive", "sequence", "constant"]
	# separator sequence used when splitting
	split_seq: str
	# target chunk length
	chunk_len: int
	# whether the chunked dataset repo is private
	private: bool


class EmbedConfig(BaseModel):
	# whether the embeddings dataset repo is private
	private: bool
	# bound on the number of concurrent embedding requests
	semaphore_bound: int


class WebhookPayloadEvent(BaseModel):
	action: Literal["create", "update", "delete"]
	scope: str


class WebhookPayloadRepo(BaseModel):
	type: Literal["dataset", "model", "space"]
	name: str
	id: str
	private: bool
	headSha: str


class WebhookPayload(BaseModel):
	event: WebhookPayloadEvent
	repo: WebhookPayloadRepo
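
# A webhook payload matching the models above would look roughly like this
# (abbreviated; the values are illustrative only):
#
#   {
#     "event": {"action": "update", "scope": "repo.content"},
#     "repo": {
#       "type": "dataset",
#       "name": "username/my-dataset",
#       "id": "...",
#       "private": false,
#       "headSha": "..."
#     }
#   }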


# load and validate the chunking / embedding configs from their JSON files
with open(os.path.join(os.getcwd(), "configs/chunk_config.json")) as c:
	chunk_config = ChunkConfig.model_validate(json.load(c))

with open(os.path.join(os.getcwd(), "configs/embed_config.json")) as c:
	embed_config = EmbedConfig.model_validate(json.load(c))
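
# Illustrative contents for the two config files (field names follow the
# models above; the values are only examples):
#
#   configs/chunk_config.json:
#     {"strategy": "recursive", "split_seq": "\n\n", "chunk_len": 512, "private": false}
#
#   configs/embed_config.json:
#     {"private": false, "semaphore_bound": 5}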


env_config = EnvConfig()
# parse the comma-separated INPUT_SPLITS string into a list of split names,
# dropping empty entries (note: this reassigns a str-typed field to a list)
env_config.input_splits = [spl.strip() for spl in env_config.input_splits.split(",") if spl.strip()]
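

if __name__ == "__main__":
	# quick sanity check when the module is run directly; assumes the
	# environment variables and config files described above are present
	print(env_config)
	print(chunk_config)
	print(embed_config)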