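"""Load example datasets into a running Argilla instance.

The script waits until the Argilla server at http://0.0.0.0:6900/ responds,
then logs four demo datasets: sentiment classification with explanations,
news summarization, programmatic (weak) labeling of AG News, and spaCy NER
monitoring on Gutenberg text. The Argilla API key is passed as the last
command-line argument.
"""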
import sys
import time

import pandas as pd
import requests
from datasets import load_dataset

import argilla as rg
from argilla.labeling.text_classification import Rule, add_rules


def load_datasets():
    # Code to run once the Argilla endpoint becomes available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="team")

    # load dataset from json
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
    )

    # convert pandas dataframe to DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema={"POSITIVE", "NEGATIVE"})
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations "
            "from Transformers Interpret. "
        },
    )

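    # load the first 100 records of the news summarization dataset from the Hub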
    dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
    dataset_rg = rg.read_datasets(dataset, task="Text2Text")

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-text-summarization",
        tags={
            "description": "A text summarization dataset with news pieces and their predicted summaries."
        },
    )

    # Read dataset from Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(
        label_schema={"World", "Sports", "Sci/Tech", "Business"}
    )
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
        },
    )

    # define queries and patterns for each category (using ES DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # define rules
    rules = [
        Rule(query=term, label=label) for terms, label in queries for term in terms
    ]

    # add rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)

    # load dataset from the hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # read in dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define labeling schema to avoid UI user modification
    labels = {
        "CARDINAL",
        "DATE",
        "EVENT",
        "FAC",
        "GPE",
        "LANGUAGE",
        "LAW",
        "LOC",
        "MONEY",
        "NORP",
        "ORDINAL",
        "ORG",
        "PERCENT",
        "PERSON",
        "PRODUCT",
        "QUANTITY",
        "TIME",
        "WORK_OF_ART",
    }
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        "gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained "
            "models. "
        },
    )


if __name__ == "__main__":
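    # Poll the Argilla endpoint until it responds with HTTP 200, then load the datasets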
    while True:
        try:
            response = requests.get("http://0.0.0.0:6900/")
            if response.status_code == 200:
                load_datasets()
                break
        except requests.exceptions.ConnectionError:
            pass
        except Exception as e:
            print(e)
            time.sleep(10)

        time.sleep(5)