demo-argilla / scripts /upload.py
brancengregory's picture
Add scripts
0ef555c unverified
raw
history blame contribute delete
No virus
781 Bytes
import os

import argilla as rg
import pandas as pd
import spacy
from datasets import Dataset
# Configuration
# Connect the Argilla client to the server. The URL and API key are taken
# from the ARGILLA_API_URL / ARGILLA_API_KEY environment variables when they
# are set, so credentials do not have to live in the source; the values that
# used to be hard-coded here remain as fallbacks.
rg.init(
    api_url=os.environ.get(
        "ARGILLA_API_URL", "https://brancengregory-demo-argilla.hf.space"
    ),
    api_key=os.environ.get("ARGILLA_API_KEY", "team.apikey"),
)
# Plaintiffs
# Read the plaintiff CSV, expose its "filed_by" column under the name "text",
# and log the rows to Argilla as text-classification records named
# "plaintiff_sample".
plaintiffs_csv = "data/labelled_plaintiffs.csv"
data = pd.read_csv(plaintiffs_csv).rename(columns={"filed_by": "text"})
dataset = rg.read_pandas(data, task="TextClassification")
rg.log(records=dataset, name="plaintiff_sample")
# Minutes
# Load the minutes CSV and expose its "description" column under the name
# "text", which is the field the tokenizer below reads.
minutes = Dataset.from_csv("data/minutes.csv")
dataset = minutes.rename_column("description", "text")

# spaCy pipeline used to split each text into tokens.
nlp = spacy.load("en_core_web_trf")
def tokenize(row):
    """Split one row's text into spaCy tokens.

    Args:
        row: A single example, indexable by column name, with a "text" entry.

    Returns:
        A dict with one key, "tokens", holding the token strings in order.
    """
    # Only token boundaries are needed, so run just the tokenizer via
    # make_doc instead of the whole pipeline on every row.
    # NOTE(review): assumes no pipeline component retokenizes the doc --
    # confirm if custom components are ever added to `nlp`.
    doc = nlp.make_doc(row["text"])
    return {"tokens": [token.text for token in doc]}
# Apply tokenize to every row, then log the result to Argilla as
# token-classification records named "minutes_sample".
tokenized = dataset.map(tokenize)
dataset = rg.read_datasets(tokenized, task="TokenClassification")
rg.log(records=dataset, name="minutes_sample")