LukasGe committed on
Commit
97e7095
1 Parent(s): e662ced

Create load_data.py

Browse files
Files changed (1) hide show
  1. load_data.py +99 -0
load_data.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import time
3
+ import os
4
+
5
+ import argilla as rg
6
+ import pandas as pd
7
+ import requests
8
+ from datasets import load_dataset, concatenate_datasets
9
+
10
+ from argilla.listeners import listener
11
+
12
# Hugging Face access token, read from the HF_TOKEN environment variable;
# None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# Name of the Hub dataset to sync with, read from the HUB_DATASET_NAME
# environment variable; None when the variable is not set.
HUB_DATASET_NAME = os.getenv("HUB_DATASET_NAME")
14
+
15
@listener(
    dataset="somos-alpaca-es",
    # Query syntax: https://docs.argilla.io/en/latest/guides/features/queries.html
    query="status:Validated",
    # Interval, in seconds, between executions of `save_validated_to_hub`.
    execution_interval_in_seconds=1200,
)
def save_validated_to_hub(records, ctx):
    """Push the records handed over by the listener to the Hugging Face Hub.

    The records are converted to a ``datasets`` dataset and pushed to the
    repository named by ``HUB_DATASET_NAME`` using ``HF_TOKEN``. Nothing is
    pushed when there are no records or when either setting is missing.

    Args:
        records: Records passed in by the Argilla listener (presumably those
            matching the ``status:Validated`` query above).
        ctx: Listener context passed in by Argilla; not used here.
    """
    if len(records) == 0:
        print("NO RECORDS found")
        return

    # Both settings are required: pushing with a token but without a target
    # repository name would call `push_to_hub(None, ...)`.
    if not (HF_TOKEN and HUB_DATASET_NAME):
        print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
        return

    ds = rg.DatasetForTextClassification(records=records).to_datasets()
    print("Pushing the dataset")
    print(ds)
    ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN)
31
+
32
class LoadDatasets:
    """Initialise the Argilla client and load the SomosNLP Alpaca dataset."""

    def __init__(self, api_key, workspace="team"):
        """Initialise the Argilla client.

        Args:
            api_key: API key forwarded to ``rg.init``.
            workspace: Workspace forwarded to ``rg.init``.
        """
        rg.init(api_key=api_key, workspace=workspace)

    @staticmethod
    def load_somos():
        """Log the ``somos-alpaca-es`` dataset to Argilla and start the listener.

        Loads ``somosnlp/somos-clean-alpaca-es`` and, when the dataset named
        by ``HUB_DATASET_NAME`` can be loaded, concatenates that one to it.
        The ``metrics`` column is dropped, the label schema is configured,
        the records are logged, and ``save_validated_to_hub`` is started.
        """
        target_name = "somos-alpaca-es"

        # Read the previously synced dataset from the Hub; any failure just
        # means there is nothing to merge.
        try:
            print(f"Trying to sync with {HUB_DATASET_NAME}")
            previous = load_dataset(HUB_DATASET_NAME, split="train")
        except Exception as err:
            print(f"Not possible to sync with {HUB_DATASET_NAME}")
            print(err)
            previous = None

        combined = load_dataset("somosnlp/somos-clean-alpaca-es", split="train")

        if previous:
            print("Concatenating datasets")
            combined = concatenate_datasets([combined, previous])
            print("Concatenated dataset is:")
            print(combined)

        without_metrics = combined.remove_columns("metrics")
        records = rg.DatasetForTextClassification.from_datasets(without_metrics)

        labels = [
            "BAD INSTRUCTION",
            "BAD INPUT",
            "BAD OUTPUT",
            "INAPPROPRIATE",
            "BIASED",
            "ALL GOOD",
        ]
        rg.configure_dataset(
            name=target_name,
            settings=rg.TextClassificationSettings(label_schema=labels),
            workspace="team",
        )

        # Log the dataset
        rg.log(
            records,
            name=target_name,
            tags={"description": "SomosNLP Hackathon dataset"},
            batch_size=200,
        )

        # Run the listener.
        # NOTE(review): nesting reconstructed from a whitespace-stripped
        # source; confirm this call belongs inside `load_somos`.
        save_validated_to_hub.start()
74
+
75
if __name__ == "__main__":
    # Command line: <api_key> <load_datasets>. Passing "none"
    # (case-insensitive) as the second argument skips loading.
    API_KEY = sys.argv[1]
    LOAD_DATASETS = sys.argv[2]

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # Poll the local server until it answers with 200, then load once.
        while True:
            try:
                # The timeout keeps a stalled connection from blocking the
                # polling loop forever.
                response = requests.get("http://0.0.0.0:6900/", timeout=10)
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_somos()
                    break

            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
            ):
                # Server not reachable yet; retry after the pause below.
                pass
            except Exception as e:
                # Best effort: report the problem and retry after a longer wait.
                print(e)
                time.sleep(10)

            time.sleep(5)

        # Keep the process alive after loading.
        # NOTE(review): nesting reconstructed from a whitespace-stripped
        # source; confirm this loop belongs inside the `else` branch.
        while True:
            time.sleep(60)