LukasGe committed on
Commit
14bf693
1 Parent(s): d35581a

Update load_data.py

Browse files
Files changed (1) hide show
  1. load_data.py +7 -7
load_data.py CHANGED
@@ -9,7 +9,7 @@ from datasets import load_dataset, concatenate_datasets
9
 
10
  from argilla.listeners import listener
11
 
12
- HF_TOKEN = os.environ.get("HF_TOKEN") #get HF_TOKEN
13
  HUB_DATASET_NAME = os.environ.get('HUB_DATASET_NAME') #get dataset name
14
 
15
 
@@ -24,7 +24,7 @@ def save_validated_to_hub(records, ctx):
24
  if HF_TOKEN:
25
  print("Pushing the dataset")
26
  print(ds)
27
- ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN)
28
  else:
29
  print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
30
  else:
@@ -35,17 +35,17 @@ class LoadDatasets:
35
  rg.init(api_key=api_key, workspace=workspace)
36
 
37
  @staticmethod
38
- def load_somos():
39
- # Leer el dataset del Hub
40
  try:
41
  print(f"Trying to sync with {HUB_DATASET_NAME}")
42
- old_ds = load_dataset(HUB_DATASET_NAME, split="train")
43
  except Exception as e:
44
  print(f"Not possible to sync with {HUB_DATASET_NAME}")
45
  print(e)
46
  old_ds = None
47
 
48
- dataset = load_dataset("LukasGe/JOB_TITLES", split="train")
49
 
50
 
51
  if old_ds:
@@ -85,7 +85,7 @@ if __name__ == "__main__":
85
  response = requests.get("http://0.0.0.0:6900/")
86
  if response.status_code == 200:
87
  ld = LoadDatasets(API_KEY)
88
- ld.load_somos()
89
  break
90
 
91
  except requests.exceptions.ConnectionError:
 
9
 
10
  from argilla.listeners import listener
11
 
12
+ HF_TOKEN = os.environ.get("HF_TOKEN") #get HF_TOKEN from space env variables
13
  HUB_DATASET_NAME = os.environ.get('HUB_DATASET_NAME') #get dataset name
14
 
15
 
 
24
  if HF_TOKEN:
25
  print("Pushing the dataset")
26
  print(ds)
27
+ ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN, private=True, revision = 'validated') #push annotations on the validated branch of the dataset
28
  else:
29
  print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
30
  else:
 
35
  rg.init(api_key=api_key, workspace=workspace)
36
 
37
  @staticmethod
38
+ def load_saved_data():
39
+ # load data from validated branch
40
  try:
41
  print(f"Trying to sync with {HUB_DATASET_NAME}")
42
+ old_ds = load_dataset(HUB_DATASET_NAME, split="train", use_auth_token=HF_TOKEN, revision='validated')
43
  except Exception as e:
44
  print(f"Not possible to sync with {HUB_DATASET_NAME}")
45
  print(e)
46
  old_ds = None
47
 
48
+ dataset = load_dataset({HUB_DATASET_NAME}, split="train", use_auth_token=HF_TOKEN, revision='main') # get starting file from the main branch of the dataset
49
 
50
 
51
  if old_ds:
 
85
  response = requests.get("http://0.0.0.0:6900/")
86
  if response.status_code == 200:
87
  ld = LoadDatasets(API_KEY)
88
+ ld.load_saved_data()
89
  break
90
 
91
  except requests.exceptions.ConnectionError: