"""Build a small question/answer sample file from the CUAD legal dataset."""

import os

import pandas as pd
from datasets import load_dataset


def _first_answer_text(answers):
    """Return the first answer string stored in an ``answers`` cell.

    Two layouts are accepted:

    * a list of records: ``[{"text": [...]}, ...]`` (the first record is used)
    * a single record:   ``{"text": [...], ...}``

    An empty or missing cell, or a record whose ``text`` is empty, yields
    ``""``. A record without a ``"text"`` key raises ``KeyError`` so that a
    schema mismatch is not silently hidden.
    """
    if not answers:
        return ""
    # NOTE(review): which layout appears depends on the dataset schema; the
    # dict form is accepted because indexing a dict with [0] would raise
    # KeyError instead of yielding the first record.
    record = answers if isinstance(answers, dict) else answers[0]
    texts = record["text"]
    if isinstance(texts, str):
        # Indexing a plain string would return only its first character.
        return texts
    return texts[0] if texts else ""


def load_legal_dataset(
    *,
    sample_size: int = 200,
    output_path: str = "datasets/legal_sample.jsonl",
) -> pd.DataFrame:
    """Load a small slice of CUAD and save it as question/answer records.

    The first ``sample_size`` rows of the ``train`` split are loaded, each
    question is prefixed with a summarisation instruction, and the first
    answer text of each row is extracted. The result is written to
    ``output_path`` as JSON Lines (one record per line) and also returned.

    Args:
        sample_size: Number of leading ``train`` rows to load.
        output_path: Destination ``.jsonl`` file; its parent directory is
            created if it does not exist.

    Returns:
        A DataFrame with two columns, ``question`` and ``answer``.

    Raises:
        KeyError: If the loaded data lacks the ``question_text`` or
            ``answers`` column, or an answer record has no ``"text"`` key.
    """
    dataset = load_dataset("cuad", "cuad_v1", split=f"train[:{sample_size}]")
    df = pd.DataFrame(dataset)

    # NOTE(review): assumes the dataset exposes ``question_text`` and
    # ``answers`` columns - confirm against the dataset card.
    data = pd.DataFrame(
        {
            "question": "Summarize the key legal clause: " + df["question_text"],
            "answer": df["answers"].apply(_first_answer_text),
        }
    )

    # A bare filename has no directory part, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data.to_json(output_path, orient="records", lines=True)
    print(f"✅ Saved sample dataset to {output_path}")
    return data


if __name__ == "__main__":
    load_legal_dataset()