## adapted thankfully from: somosnlp/somos-alpaca-es
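"""Load the DVS job-titles dataset into Argilla and keep validated records in
sync with the Hugging Face Hub.

The script waits for the local Argilla server to come up, logs the raw records
(plus any previously validated ones) into the `job-titles-dv` dataset, and
starts a listener that pushes validated records to `HUB_DATASET_NAME`.
"""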
import os
import sys
import time

import argilla as rg
import requests
from argilla.listeners import listener
from datasets import concatenate_datasets, load_dataset

HF_TOKEN = os.environ.get("HF_TOKEN")  # read from the Space's environment variables
HUB_DATASET_NAME = os.environ.get("HUB_DATASET_NAME")  # Hub dataset holding the annotated titles

@listener(
    dataset="job-titles-dv",  # name of the dataset in the Argilla frontend
    query="status:Validated",  # https://docs.argilla.io/en/latest/guides/features/queries.html
    execution_interval_in_seconds=10,  # interval at which `save_validated_to_hub` is executed
)
def save_validated_to_hub(records, ctx):
    """Push every validated record to the Hub as a private dataset."""
    if len(records) > 0:
        ds = rg.DatasetForTokenClassification(records=records).to_datasets()
        if HF_TOKEN:
            print("Pushing the dataset")
            print(ds)
            ds.push_to_hub(HUB_DATASET_NAME, token=HF_TOKEN, private=True)
        else:
            print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
    else:
        print("No validated records found")

class LoadDatasets:
    def __init__(self, api_key, workspace="admin"):
        rg.init(api_key=api_key, workspace=workspace)

    @staticmethod
    def load_saved_data():
        # Load previously validated records from the Hub, if any exist yet.
        try:
            print(f"Trying to sync with {HUB_DATASET_NAME}")
            old_ds = load_dataset(HUB_DATASET_NAME, split="train", use_auth_token=HF_TOKEN)
        except Exception as e:
            print(f"Not possible to sync with {HUB_DATASET_NAME}")
            print(e)
            old_ds = None

        # Get the starting file from the main branch of the raw dataset.
        dataset = load_dataset("LukasGe/DVS-job-titles-raw", split="train", use_auth_token=HF_TOKEN)

        if old_ds:
            print("Concatenating datasets")
            dataset = concatenate_datasets([dataset, old_ds])
            print("Concatenated dataset is:")
            print(dataset)
            dataset = dataset.remove_columns("metrics")

        records = rg.DatasetForTokenClassification.from_datasets(dataset)

        settings = rg.TokenClassificationSettings(
            label_schema=["B-FUN", "I-FUN", "B-RES", "I-RES", "B-LEVEL", "I-LEVEL"]
        )
        rg.configure_dataset(name="job-titles-dv", settings=settings, workspace="admin")

        # Log the records into the Argilla dataset.
        rg.log(
            records,
            name="job-titles-dv",
            tags={"description": "Tagging of the DVS Job Titles"},
            batch_size=200,
        )

        # Start the background listener that syncs validated records.
        save_validated_to_hub.start()
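
# Note: `from_datasets` assumes the raw dataset is already stored in Argilla's
# token-classification format; if it is not, the record parsing above will fail.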

if __name__ == "__main__":
    API_KEY = os.environ.get("TEAM_API_KEY")
    LOAD_DATASETS = sys.argv[2]  # second CLI argument controls whether data is loaded

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # Poll until the Argilla server answers, then load the data once.
        while True:
            try:
                response = requests.get("http://0.0.0.0:6900/")
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_saved_data()
                    break
            except requests.exceptions.ConnectionError:
                pass
            except Exception as e:
                print(e)
                time.sleep(10)

            time.sleep(5)

    # Keep the process alive so the listener keeps running in the background.
    while True:
        time.sleep(60)
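
# Usage sketch (assumed; only sys.argv[2] is read, so the script expects two
# CLI arguments and ignores the first — any value other than "none" loads data):
#
#   python <this-script>.py <ignored> load   # wait for Argilla, then load data
#   python <this-script>.py <ignored> none   # skip loading entirely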