| """ |
| fetch_cuad.py |
| |
| Downloads & saves the 3 CUAD data artifacts: |
| 1. ../data/cuad/taxonomy.json -> 41 clause categories + descriptions |
| 2. ../data/cuad/contracts/<title>.txt -> 510 full contract texts |
| 3. ../data/cuad/annotations.json -> all labeled spans, organized by contract |
| |
| Dataset structure (SQuAD-format): |
| Each row: {id, title, context, question, answers: {text: [...], answer_start: [...]}} |
| 510 unique contracts x 41 clause-category questions = ~20,950 rows total (train + test). |
| The same contract appears 41 times -- once per question. |
| Question format: "Highlight the parts (if any) of this contract related to |
| '{Category Name}' that should be reviewed by a lawyer." |
| """ |
|
|
| import io |
| import json |
| import re |
| import urllib.request |
| import zipfile |
| from pathlib import Path |
|
|
| DATA_DIR = Path("../data/cuad") |
| CONTRACTS_DIR = DATA_DIR / "../contracts" |
|
|
| |
| _CUAD_URL = "https://github.com/TheAtticusProject/cuad/raw/main/data.zip" |
|
|
| def load_cuad() -> list[dict]: |
| """Download CUAD data.zip and return a flat list of SQuAD-format rows.""" |
| print("LOADING CUAD DATA FROM GITHUB") |
| with urllib.request.urlopen(_CUAD_URL) as resp: |
| data = resp.read() |
| print(f" downloaded {len(data) // 1024} KB") |
|
|
| rows = [] |
| with zipfile.ZipFile(io.BytesIO(data)) as zf: |
| json_files = [n for n in zf.namelist() if n.endswith(".json")] |
| for name in json_files: |
| squad = json.loads(zf.read(name)) |
| for article in squad["data"]: |
| title = article["title"] |
| for para in article["paragraphs"]: |
| context = para["context"] |
| for qa in para["qas"]: |
| rows.append({ |
| "id": qa["id"], |
| "title": title, |
| "context": context, |
| "question": qa["question"], |
| "answers": { |
| "text": [a["text"] for a in qa["answers"]], |
| "answer_start": [a["answer_start"] for a in qa["answers"]], |
| }, |
| }) |
|
|
| print(f" {len(rows)} rows from {len(json_files)} file(s)") |
| return rows |
|
|
| def extract_taxonomy(rows: list[dict]) -> None: |
| """ |
| Extract all 41 clause categories |
| |
| Pattern for questions: |
| "Highlight the parts (if any) of this contract related to |
| '{Category Name}' that should be reviewed by a lawyer." |
| |
| The full question text is preserved as the category description since it is |
| exactly the prompt the Classification Agent should use when labeling clauses |
| |
| Output: |
| [{"id": 1, "name": "Document Name", "question": "<full question text>"}, ...] |
| """ |
| seen: dict[str, str] = {} |
|
|
| for row in rows: |
| question = row["question"] |
| match = re.search(r'related to "(.+?)" that should be', question) |
| if match: |
| name = match.group(1) |
| if name not in seen: |
| seen[name] = question.strip() |
|
|
| taxonomy = [ |
| {"id": i + 1, "name": name, "question": question} |
| for i, (name, question) in enumerate(seen.items()) |
| ] |
|
|
| out_path = DATA_DIR / "taxonomy.json" |
| with open(out_path, "w") as f: |
| json.dump(taxonomy, f, indent=2) |
|
|
| print(f" taxonomy.json: {len(taxonomy)} categories") |
|
|
|
|
| def extract_contracts(rows: list[dict]) -> None: |
| """ |
| Extract the 510 unique contract full texts and save each as a .txt file. |
| |
| The `context` field holds the full contract text and is the same across |
| all 41 rows for a given contract, so we just need the first occurrence. |
| |
| Output: data/cuad/contracts/<sanitized_title>.txt |
| """ |
| CONTRACTS_DIR.mkdir(parents=True, exist_ok=True) |
| seen: set[str] = set() |
|
|
| for row in rows: |
| title = row["title"] |
| if title in seen: |
| continue |
| seen.add(title) |
|
|
| safe_name = re.sub(r'[<>:"/\\|?*]', "_", title) |
| out_path = CONTRACTS_DIR / f"{safe_name}.txt" |
| with open(out_path, "w", encoding="utf-8") as f: |
| f.write(row["context"]) |
|
|
| print(f" contracts/: {len(seen)} files") |
|
|
| def extract_annotations(rows: list[dict]) -> None: |
| """ |
| Extract all the labeled spans and organize by contract title |
| |
| Each contract maps to a list of 41 annotation entries (one per category) |
| An entry's `spans` list is empty if that clause type doesn't appear in the contract |
| |
| Output: |
| { |
| "<contract_title>": [ |
| { |
| "category": "Governing Law", |
| "question": "<full question text>", |
| "spans": ["...annotated text...", ...], |
| "span_starts": [482, ...] |
| }, |
| ...41 entries per contract... |
| ] |
| } |
| """ |
| annotations: dict[str, list] = {} |
|
|
| for row in rows: |
| title = row["title"] |
| if title not in annotations: |
| annotations[title] = [] |
|
|
| question = row["question"] |
| match = re.search(r'related to "(.+?)" that should be', question) |
| category_name = match.group(1) if match else "Unknown" |
|
|
| annotations[title].append({ |
| "category": category_name, |
| "question": question.strip(), |
| "spans": row["answers"]["text"], |
| "span_starts": row["answers"]["answer_start"], |
| }) |
|
|
| out_path = DATA_DIR / "annotations.json" |
| with open(out_path, "w") as f: |
| json.dump(annotations, f, indent=2) |
|
|
| total_spans = sum( |
| len(entry["spans"]) |
| for entries in annotations.values() |
| for entry in entries |
| ) |
| print(f" annotations.json: {len(annotations)} contracts, {total_spans} labeled spans") |
|
|
|
|
|
|
|
|
| def main(): |
| DATA_DIR.mkdir(parents=True, exist_ok=True) |
| rows = load_cuad() |
| extract_taxonomy(rows) |
| extract_contracts(rows) |
| extract_annotations(rows) |
| print("\nAll CUAD data saved to ../data/cuad/") |
|
|
| if __name__ == "__main__": |
| main() |
|
|