Spaces:
Running
Running
import os

import pandas as pd
from datasets import load_dataset
def _first_answer_text(answers):
    """Return the first answer string stored in one ``answers`` cell.

    Two layouts are accepted so the loader works regardless of how the
    dataset rows were materialised:

    * dict of lists, e.g. ``{"text": ["..."], "answer_start": [0]}``
    * list of dicts, e.g. ``[{"text": ["..."]}]`` (the layout the
      original implementation indexed into)

    An empty string is returned when the cell is empty/``None`` or holds
    no answer text.
    """
    if not answers:
        return ""
    if isinstance(answers, dict):
        texts = answers.get("text")
    else:
        first = answers[0]
        texts = first.get("text") if isinstance(first, dict) else None
    if texts is None or len(texts) == 0:
        return ""
    # A bare string is already the answer; indexing it would yield only
    # its first character.
    if isinstance(texts, str):
        return texts
    return texts[0]


def load_legal_dataset():
    """Build a small question/answer sample from the CUAD contract dataset.

    Loads the first 200 rows of the ``train`` split of ``cuad`` /
    ``cuad_v1``, prefixes every question with a summarisation
    instruction, and keeps the first answer text of each row (``""``
    when a row has none).

    The result is written to ``datasets/legal_sample.jsonl`` (one JSON
    record per line; the directory is created if missing) and returned.

    Returns:
        A DataFrame with two columns, ``question`` and ``answer``.

    Raises:
        KeyError: if the loaded rows have neither a ``question_text``
            nor a ``question`` column, or no ``answers`` column.
    """
    dataset = load_dataset("cuad", "cuad_v1", split="train[:200]")
    df = pd.DataFrame(dataset)

    # NOTE(review): the Hugging Face CUAD card names this column
    # "question"; "question_text" is kept as the first choice so any
    # source that does provide it behaves as before. Confirm the schema
    # against the dataset version actually in use.
    question_col = next(
        (col for col in ("question_text", "question") if col in df.columns),
        None,
    )
    if question_col is None:
        raise KeyError(
            "expected a 'question_text' or 'question' column, "
            f"found: {list(df.columns)}"
        )

    data = pd.DataFrame(
        {
            "question": "Summarize the key legal clause: " + df[question_col],
            "answer": df["answers"].apply(_first_answer_text),
        }
    )

    os.makedirs("datasets", exist_ok=True)
    data.to_json("datasets/legal_sample.jsonl", orient="records", lines=True)
    print("β Saved sample dataset to datasets/legal_sample.jsonl")
    return data
# Script entry point: build and save the sample dataset when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    load_legal_dataset()