Spaces:

google
/

radextract

Running on CPU Upgrade

File size: 2,815 Bytes

fab8051

"""Processing utilities for radiology report structuring prompts.

This module provides helper functions for processing and formatting prompts
used in the LangExtract system for radiology report structuring.
"""

import dataclasses
import json
from typing import Optional

from langextract.data import ExampleData
from langextract.data_lib import enum_asdict_factory

from prompt_instruction import PROMPT_INSTRUCTION


def clean_dict(obj):
    """Removes null values and empty objects/lists from dictionary recursively.

    This function recursively traverses a dictionary or list structure
    and removes any keys with null values, empty dictionaries, or empty
    lists to create cleaner JSON output for the prompt examples.

    Args:
        obj: The object to clean (dict, list, or primitive value).

    Returns:
        The cleaned object with null/empty values removed.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned_value = clean_dict(value)
            # Only include non-null, non-empty values
            if (
                cleaned_value is not None
                and cleaned_value != {}
                and cleaned_value != []
            ):
                cleaned[key] = cleaned_value
        return cleaned
    elif isinstance(obj, list):
        return [clean_dict(item) for item in obj if clean_dict(item) is not None]
    else:
        return obj


def generate_markdown_prompt(
    examples: list[ExampleData], input_text: Optional[str] = None
) -> str:
    """Generate markdown prompt with examples using LangExtract's enum_asdict_factory.

    Args:
        examples: List of ExampleData objects for few-shot learning
        input_text: Optional input text to include in inference example

    Returns:
        Formatted markdown string containing the complete prompt
    """
    examples_list = []

    for i, example in enumerate(examples, 1):
        example_dict = dataclasses.asdict(example, dict_factory=enum_asdict_factory)

        # Clean up null values and empty objects
        cleaned_extractions = clean_dict({"extractions": example_dict["extractions"]})
        json_output = json.dumps(cleaned_extractions, indent=2)

        example_section = f"""## Example {i}

**Input Text:**
```
{example.text}
```

**Expected Output:**
```json
{json_output}
```"""
        examples_list.append(example_section)

    examples_formatted = "\n\n---\n\n".join(examples_list)

    # Format inference section if input text provided
    inference_section = ""
    if input_text:
        inference_section = f"""

## Inference Example:

**Input Text:**
```
{input_text}
```

**Expected Output:**
"""

    return PROMPT_INSTRUCTION.format(
        examples=examples_formatted, inference_section=inference_section
    )