# (Hugging Face Space status banner captured by the page scrape, not part of
#  the script: "Spaces: / Runtime error / Runtime error")
## adapted thankfully from: somosnlp/somos-alpaca-es
import sys | |
import time | |
import os | |
import argilla as rg | |
import pandas as pd | |
import requests | |
from datasets import load_dataset, concatenate_datasets | |
from argilla.listeners import listener | |
HF_TOKEN = os.environ.get("HF_TOKEN") #get HF_TOKEN from space env variables | |
HUB_DATASET_NAME = os.environ.get('HUB_DATASET_NAME') #get dataset name | |
def save_validated_to_hub(records, ctx): | |
if len(records) > 0: | |
ds = rg.DatasetForTokenClassification(records=records).to_datasets() | |
if HF_TOKEN: | |
print("Pushing the dataset") | |
print(ds) | |
ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN, private=True) | |
else: | |
print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!") | |
else: | |
print("NO RECORDS found") | |
class LoadDatasets: | |
def __init__(self, api_key, workspace="admin"): | |
rg.init(api_key=api_key, workspace=workspace) | |
def load_saved_data(): | |
# load data from validated branch | |
try: | |
print(f"Trying to sync with {HUB_DATASET_NAME}") | |
old_ds = load_dataset(HUB_DATASET_NAME, split="train", use_auth_token=HF_TOKEN) | |
except Exception as e: | |
print(f"Not possible to sync with {HUB_DATASET_NAME}") | |
print(e) | |
old_ds = None | |
dataset = load_dataset('LukasGe/DVS-job-titles-raw', split="train", use_auth_token=HF_TOKEN) # get starting file from the main branch of the dataset | |
if old_ds: | |
print("Concatenating datasets") | |
dataset = concatenate_datasets([dataset, old_ds]) | |
print("Concatenated dataset is:") | |
print(dataset) | |
dataset = dataset.remove_columns("metrics") | |
records = rg.DatasetForTokenClassification.from_datasets(dataset) | |
settings = rg.TokenClassificationSettings( | |
label_schema=["B-FUN", "I-FUN", "B-RES", "I-RES", "B-LEVEL", "I-LEVEL"] | |
) | |
rg.configure_dataset(name="job-title-tagging", settings=settings, workspace="admin") | |
# Log the dataset | |
rg.log( | |
records, | |
name="job-title-tagging", | |
tags={"description": "Tagging of the DVS Job Titles"}, | |
batch_size=200 | |
) | |
# run listener | |
save_validated_to_hub.start() | |
if __name__ == "__main__": | |
API_KEY = os.environ.get('TEAM_API_KEY') | |
LOAD_DATASETS = sys.argv[2] | |
if LOAD_DATASETS.lower() == "none": | |
print("No datasets being loaded") | |
else: | |
while True: | |
try: | |
response = requests.get("http://0.0.0.0:6900/") | |
if response.status_code == 200: | |
ld = LoadDatasets(API_KEY) | |
ld.load_saved_data() | |
break | |
except requests.exceptions.ConnectionError: | |
pass | |
except Exception as e: | |
print(e) | |
time.sleep(10) | |
pass | |
time.sleep(5) | |
while True: | |
time.sleep(60) |