# Setup (run once before executing this script):
#   pip install "distilabel[vllm] @ git+https://github.com/argilla-io/distilabel.git@develop"
#   pip install flash-attn --no-build-isolation
#   huggingface-cli login

import time

from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadHubDataset
from distilabel.steps.tasks import PrometheusEval
from distilabel.llms import TransformersLLM


def main() -> None:
    """Evaluate an instruction dataset with Prometheus and push the result.

    Builds a three-step pipeline (load dataset -> PrometheusEval -> keep
    columns), runs it with the generation parameters below, prints the
    elapsed time, and pushes the resulting distiset to the Hub when the run
    returns one.
    """
    # Monotonic clock: unaffected by system clock adjustments during the run.
    start_time = time.perf_counter()

    with Pipeline(name="prometheus") as pipeline:
        load_dataset = LoadHubDataset(
            name="load_dataset",
            repo_id="HuggingFaceH4/instruction-dataset",
            split="test",
            # Map the dataset's columns onto the names used by the steps below.
            output_mappings={"prompt": "instruction", "completion": "generation"},
        )

        task = PrometheusEval(
            name="task",
            llm=TransformersLLM(
                model="prometheus-eval/prometheus-7b-v2.0",
                chat_template="[INST] {{ messages[0]['content'] }}\n{{ messages[1]['content'] }}[/INST]",
            ),
            mode="absolute",
            rubric="factual-validity",
            reference=False,
            num_generations=1,
            group_generations=False,
        )

        keep_columns = KeepColumns(
            name="keep_columns",
            columns=["instruction", "generation", "feedback", "result", "model_name"],
        )

        load_dataset >> task >> keep_columns  # type: ignore

    distiset = pipeline.run(
        parameters={
            task.name: {  # type: ignore
                "llm": {
                    "generation_kwargs": {
                        "max_new_tokens": 1024,
                        "temperature": 0.7,
                    },
                },
            },
        },
    )

    elapsed = time.perf_counter() - start_time
    print(f"--- {elapsed} seconds ---")

    # The run result is checked for None before pushing, as in the original.
    if distiset is not None:
        distiset.push_to_hub("instruction-dataset-prometheus")


if __name__ == "__main__":
    main()