# pip install "distilabel[vllm] @ git+https://github.com/argilla-io/distilabel.git@develop"
# pip install flash-attn --no-build-isolation
# huggingface-cli login
import time

from distilabel.llms import TransformersLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadHubDataset
from distilabel.steps.tasks import PrometheusEval
if __name__ == "__main__":
    # Script entry point: load a Hub dataset, run the PrometheusEval task over
    # each row, keep a fixed set of columns, report the elapsed time and push
    # the resulting distiset to the Hugging Face Hub.

    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments during a long-running pipeline.
    start_time = time.perf_counter()

    with Pipeline(name="prometheus") as pipeline:
        # The dataset's "prompt"/"completion" columns are renamed to
        # "instruction"/"generation" for the downstream steps.
        load_dataset = LoadHubDataset(
            name="load_dataset",
            repo_id="HuggingFaceH4/instruction-dataset",
            split="test",
            output_mappings={"prompt": "instruction", "completion": "generation"},
        )

        task = PrometheusEval(
            name="task",
            llm=TransformersLLM(
                model="prometheus-eval/prometheus-7b-v2.0",
                # Custom template: places the first two messages, separated by
                # a newline, inside a single [INST] ... [/INST] block.
                chat_template="[INST] {{ messages[0]['content'] }}\n{{ messages[1]['content'] }}[/INST]",
            ),
            # NOTE(review): mode/rubric/reference semantics are defined by
            # PrometheusEval - confirm against the distilabel docs when changing.
            mode="absolute",
            rubric="factual-validity",
            reference=False,
            num_generations=1,
            group_generations=False,
        )

        keep_columns = KeepColumns(
            name="keep_columns",
            columns=["instruction", "generation", "feedback", "result", "model_name"],
        )

        # Wire the steps: load -> evaluate -> prune columns.
        load_dataset >> task >> keep_columns  # type: ignore

    # Runtime parameters are keyed by step name; only the evaluation task's
    # LLM generation settings are overridden here.
    distiset = pipeline.run(
        parameters={
            task.name: {  # type: ignore
                "llm": {
                    "generation_kwargs": {
                        "max_new_tokens": 1024,
                        "temperature": 0.7,
                    },
                },
            },
        },
    )

    print(f"--- {time.perf_counter() - start_time} seconds ---")

    # Publish only when run() actually produced a result.
    if distiset is not None:
        distiset.push_to_hub("instruction-dataset-prometheus")