# axionx-demo/legaldoc_summarizer/dataset_loader.py
# Uploaded by deepsodha ("Upload 25 files", commit beb5479, verified); 856 bytes.
from datasets import load_dataset
import pandas as pd, os
def _first_answer_text(answers):
    """Return the first answer string held in one ``answers`` cell, or ``""``.

    Two layouts are accepted:

    * a list of dicts, each with a ``"text"`` entry (the layout the original
      code indexed as ``answers[0]["text"][0]``), and
    * a single SQuAD-style dict of parallel lists,
      ``{"text": [...], "answer_start": [...]}``.
    """
    if answers is None or len(answers) == 0:
        return ""
    entry = answers if isinstance(answers, dict) else answers[0]
    texts = entry.get("text") if isinstance(entry, dict) else None
    if texts is None:
        return ""
    if isinstance(texts, str):
        # A plain string is already the answer; indexing it would keep only
        # its first character.
        return texts
    # len() rather than truthiness so array-like containers work as well.
    return texts[0] if len(texts) > 0 else ""


def load_legal_dataset(num_samples=200, output_path="datasets/legal_sample.jsonl"):
    """Load a slice of CUAD and save it as question/answer JSON Lines.

    Loads the first ``num_samples`` rows of the ``train`` split of the
    ``cuad`` / ``cuad_v1`` dataset, prefixes every question with a
    summarization instruction, keeps the first answer text of each row
    (empty string when a row has no answer), writes the result to
    ``output_path`` and returns it.

    Args:
        num_samples: Number of leading training rows to load. Defaults to 200.
        output_path: Destination ``.jsonl`` file. Missing parent directories
            are created. Defaults to ``datasets/legal_sample.jsonl``.

    Returns:
        A ``pandas.DataFrame`` with two columns, ``question`` and ``answer``.

    Raises:
        KeyError: If the loaded data has neither a ``question_text`` nor a
            ``question`` column, or has no ``answers`` column.
    """
    dataset = load_dataset("cuad", "cuad_v1", split=f"train[:{num_samples}]")
    df = pd.DataFrame(dataset)

    # NOTE(review): the original code read ``question_text``; SQuAD-style
    # exports name this column ``question``. Confirm which one cuad_v1 uses.
    question_col = "question_text" if "question_text" in df.columns else "question"

    data = pd.DataFrame(
        {
            "question": "Summarize the key legal clause: " + df[question_col],
            "answer": df["answers"].apply(_first_answer_text),
        }
    )

    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    data.to_json(output_path, orient="records", lines=True)
    print(f"✅ Saved sample dataset to {output_path}")
    return data
# Script entry point: build and save the sample dataset only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    load_legal_dataset()