disticleaner / prometheus_pipeline.py
# pip install "distilabel[hf-transformers] @ git+https://github.com/argilla-io/distilabel.git@develop"
# pip install flash-attn --no-build-isolation
# huggingface-cli login
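#
# Runs a distilabel pipeline that grades the completions in
# HuggingFaceH4/instruction-dataset with Prometheus 2
# (prometheus-eval/prometheus-7b-v2.0), loaded locally through transformers,
# and pushes the scored dataset to the Hugging Face Hub.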
import time

from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadHubDataset
from distilabel.steps.tasks import PrometheusEval
from distilabel.llms import TransformersLLM

if __name__ == "__main__":
    start_time = time.time()

    with Pipeline(name="prometheus") as pipeline:
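        # Load the prompts and completions to be judged from the Hugging Face Hub.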
        load_dataset = LoadHubDataset(
            name="load_dataset",
            repo_id="HuggingFaceH4/instruction-dataset",
            split="test",
            output_mappings={"prompt": "instruction", "completion": "generation"},
        )
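
        # Grade each (instruction, generation) pair with Prometheus 2 in absolute
        # mode, scoring against the built-in "factual-validity" rubric.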
        task = PrometheusEval(
            name="task",
            llm=TransformersLLM(
                model="prometheus-eval/prometheus-7b-v2.0",
                chat_template="[INST] {{ messages[0]['content'] }}\n{{ messages[1]['content'] }}[/INST]",
            ),
            mode="absolute",
            rubric="factual-validity",
            reference=False,
            num_generations=1,
            group_generations=False,
        )
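
        # Keep only the columns needed in the final dataset.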
        keep_columns = KeepColumns(
            name="keep_columns",
            columns=["instruction", "generation", "feedback", "result", "model_name"],
        )
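
        # Connect the steps: load data -> evaluate -> select columns.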
        load_dataset >> task >> keep_columns  # type: ignore
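
    # Run the pipeline, overriding the judge LLM's generation parameters at runtime.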
    distiset = pipeline.run(
        parameters={
            task.name: {  # type: ignore
                "llm": {
                    "generation_kwargs": {
                        "max_new_tokens": 1024,
                        "temperature": 0.7,
                    },
                },
            },
        },
    )

    print("--- %s seconds ---" % (time.time() - start_time))
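
    # Publish the evaluated dataset to the Hub (uses the `huggingface-cli login` above).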
    if distiset is not None:
        distiset.push_to_hub("instruction-dataset-prometheus")
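
    # The pushed dataset can later be loaded back for inspection; the repo name
    # below assumes your own Hub namespace (placeholder, not part of this script):
    #   from datasets import load_dataset
    #   evaluated = load_dataset("<your-username>/instruction-dataset-prometheus")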