Spaces:

cmu-lti
/

sotopia-space

Runtime error

File size: 13,303 Bytes

import re
from typing import Literal, cast

from pydantic import BaseModel, Field

from utils import format_docstring

# Closed set of action kinds an agent can take in a turn. Used below to type
# `Observation.available_actions` and `AgentAction.action_type`.
ActionType = Literal["none", "speak", "non-verbal communication", "action", "leave"]


# Common base class of every message type defined in this module.
class Message(BaseModel):
    """
    An interface for messages.
    There is only one required method: to_natural_language
    """

    def to_natural_language(self) -> str:
        """Return the message rendered as plain text.

        The base implementation always raises; each subclass in this module
        provides its own rendering.

        Raises:
            NotImplementedError: always, when called on the base class.
        """
        raise NotImplementedError


class SimpleMessage(Message):
    """
    A simple message with a single string field.
    """

    # The text carried by this message.
    message: str = Field(description="the message")

    def to_natural_language(self) -> str:
        """Return the stored `message` string unchanged."""
        return self.message


# What an agent is shown for a given turn: the text of the last turn, the
# turn counter, and the action kinds it may choose from.
class Observation(Message):
    last_turn: str = Field(description="the last turn of the conversation")
    turn_number: int = Field(description="the turn number of the conversation")
    available_actions: list[ActionType] = Field(description="the available actions")

    def to_natural_language(self) -> str:
        """Render the observation as text.

        Turn 0 is rendered as the conversation opener (`last_turn` followed by
        a "Conversation Starts:" line). Any other turn is rendered as a single
        "Turn #<turn_number - 1>: <last_turn>" line.
        """
        if self.turn_number != 0:
            # The label shows the index of the turn that has just happened.
            previous_turn = self.turn_number - 1
            return f"Turn #{previous_turn}: {self.last_turn}\n"
        return f"\n{self.last_turn}\nConversation Starts:\n"


# Static context of an episode: the scenario plus the name, background and
# goal of each of the two participants.
class ScriptBackground(Message):
    scenario: str = Field(description="scenario of the episode")
    p1_name: str = Field(description="name of participant 1")
    p2_name: str = Field(description="name of participant 2")
    p1_background: str = Field(description="background of participant 1")
    p2_background: str = Field(description="background of participant 2")
    p1_goal: str = Field(description="goal of participant 1")
    p2_goal: str = Field(description="goal of participant 2")

    def to_natural_language(self) -> str:
        """Render the episode context as text via `format_docstring`.

        When both backgrounds are empty the background lines are omitted.
        When at least one is present both lines are emitted, and an empty
        background is shown as "Unknown".
        """
        if not (self.p1_background or self.p2_background):
            # Neither background is available: scenario, names and goals only.
            return format_docstring(
                f"""Here is the context of this interaction:
            Scenario: {self.scenario}
            Participants: {self.p1_name} and {self.p2_name}
            {self.p1_name}'s goal: {self.p1_goal}
            {self.p2_name}'s goal: {self.p2_goal}
            """
            )

        # The check above is an OR rather than an AND because in a stranger
        # relation a background may not be visible; it is then shown as "Unknown".
        shown_p1_background = self.p1_background or "Unknown"
        shown_p2_background = self.p2_background or "Unknown"
        context = f"""Here is the context of this interaction:
            Scenario: {self.scenario}
            Participants: {self.p1_name} and {self.p2_name}
            {self.p1_name}'s background: {shown_p1_background}
            {self.p2_name}'s background: {shown_p2_background}
            {self.p1_name}'s goal: {self.p1_goal}
            {self.p2_name}'s goal: {self.p2_goal}
            """
        return format_docstring(context)


# The environment's verdict on a conversation: whether it has terminated, an
# optional rating per participant, and optional supporting comments.
class ScriptEnvironmentResponse(Message):
    terminated: bool = Field(
        description="whether the conversation is terminated",
        default_factory=lambda: False,
    )
    # A rating is either a bare float, a (float, per-key float dict) pair, or None.
    p1_rate: float | tuple[float, dict[str, float]] | None = Field(
        description="rating of participant 1, on the scale of 1 to 10"
    )
    p2_rate: float | tuple[float, dict[str, float]] | None = Field(
        description="rating of participant 2, on the scale of 1 to 10"
    )
    comments: str | None = Field(
        description="All of the comments supporting the termination and rating"
    )

    def to_natural_language(self) -> str:
        """Render the environment response as text.

        The output starts with an "Environment response:" line followed by one
        line per piece of information that is present (termination notice,
        each rating, comments). Blank lines are dropped and every kept line
        ends with a newline.
        """
        terminated_line = "The conversation is terminated." if self.terminated else ""
        # The label and the value are separated by ": ". Concatenating them
        # directly rendered e.g. a rating of 5.0 for participant 1 as
        # "Rating of participant 15.0".
        p1_line = (
            "Rating of participant 1: " + str(self.p1_rate)
            if self.p1_rate is not None
            else ""
        )
        p2_line = (
            "Rating of participant 2: " + str(self.p2_rate)
            if self.p2_rate is not None
            else ""
        )
        comments_line = self.comments if self.comments is not None else ""

        reason_to_stop = format_docstring(
            f"""Environment response:
        {terminated_line}
        {p1_line}
        {p2_line}
        {comments_line}
        """
        )
        # Absent pieces leave empty lines in the template; keep only lines
        # that contain something other than whitespace.
        return "".join(
            line + "\n" for line in reason_to_stop.split("\n") if line.strip()
        )


# A single action taken by an agent: the kind of action plus its free-text
# argument.
class AgentAction(Message):
    action_type: ActionType = Field(
        description="whether to speak at this turn or choose to not do anything"
    )
    argument: str = Field(
        description="the utterance if choose to speak, the expression or gesture if choose non-verbal communication, or the physical action if choose action"
    )

    def to_natural_language(self) -> str:
        """Render the action as text.

        "none" and "leave" render as fixed phrases, "speak" renders as the
        bare argument, and the two remaining kinds render as
        "[<action_type>] <argument>".
        """
        kind = self.action_type
        if kind == "none":
            return "did nothing"
        if kind == "leave":
            return "left the conversation"
        if kind == "speak":
            return f"{self.argument}"
        if kind in ("non-verbal communication", "action"):
            return f"[{kind}] {self.argument}"
        # Any other value falls through without an explicit return (yielding
        # None); the five branches above cover every member of ActionType.


# Shape of the pair produced by `ScriptInteraction.parse` and
# `ScriptInteraction.default_value_for_return_type`:
#   [0] one list per turn of (str, str, Message) triples — presumably
#       (sender, receiver, message); the code below fills them with
#       ("Environment", agent, Observation) and (agent, "Environment", AgentAction)
#   [1] a flat list of (agent name, Message) pairs.
ScriptInteractionReturnType = tuple[
    list[list[tuple[str, str, Message]]], list[tuple[str, Message]]
]


# A whole scripted conversation held as one text blob, plus the helpers that
# split that blob into turns and parse each turn into an `AgentAction`.
class ScriptInteraction(Message):
    interactions: str = Field(
        description="""The interaction between the two participants in maximum 20 turns. Each turn is separated by a newline, and should only describe one agent. Following the structure:
        Turn #x
        [participant's name] [action] {argument for some actions}

        You can use different types of actions, but only use one in each turn. You should move other information into argument part. Below shows a python code snippet of the format for each action type:
        match self.action_type:
            case "none":
                return "did nothing"
            case "speak":
                return f'said: "{self.argument}"'
            case "non-verbal communication":
                return f"[{self.action_type}] {self.argument}"
            case "action":
                return f"[{self.action_type}] {self.argument}"
            case "leave":
                return "left the conversation"

        For example, the following is acceptable:
        Turn #x
        Oliver Thompson said: "Hey Esmeralda, what's wrong? You seem upset."
        Turn #x
        Esmeralda Solis [action] moved closer
        Turn #x
        Oliver Thompson [non-verbal communication] smiled
        Turn #x
        Esmeralda Solis did nothing
        Turn #x
        Oliver Thompson left the conversation
        Turn #x
        Esmeralda Solis [action] leaned in and lowered her voice: "Sorry"

        And the following is not acceptable:
        Turn #1
        Oliver Thompson [speak] said: "Hey Esmeralda, what's wrong? You seem upset."
        Turn #1
        Esmeralda Solis non-verbal communication moved closer
        """
    )

    def to_natural_language(self) -> str:
        """Return the raw interaction text unchanged."""
        return self.interactions

    def parse(
        self, agent_names: list[str], background: str
    ) -> tuple[list[list[tuple[str, str, Message]]], list[tuple[str, Message]]]:
        """Parse `interactions` into per-turn message lists.

        Args:
            agent_names: names of the participants. The code indexes
                positions 0 and 1, so two names are expected.
            background: text placed in the turn-0 observation of every agent.

        Returns:
            A pair `(results, agent_results)`. `results[0]` holds one turn-0
            observation per agent. Each following entry holds, for one parsed
            turn, an observation per agent, the acting agent's action, and a
            "none" action for the other agent. `agent_results` lists
            `(agent name, action)` for every parsed turn. When the text
            contains no turns, only the turn-0 entry is returned and
            `agent_results` is empty.

        Raises:
            ValueError: when a turn does not match the expected format or
                describes more than one agent.
            Exception: any other error raised while parsing a turn (for
                example model validation of an unrecognised action) is
                printed and re-raised unchanged.
        """
        turns = self.split_by_turn(self.interactions)

        agent_results: list[tuple[str, Message]] = []
        results: list[list[tuple[str, str, Message]]] = [
            [
                (
                    "Environment",
                    agent_name,
                    Observation(
                        last_turn=background,
                        turn_number=0,
                        available_actions=["none"],
                    ),
                )
                for agent_name in agent_names
            ]
        ]

        for line_idx, line in enumerate(turns):
            try:
                res = self.parse_single_dialogue(line)
                # An unrecognised action comes back as None and is rejected
                # here by the model's own validation.
                parsed_action = AgentAction(
                    action_type=cast(ActionType, res["action"]),
                    argument=cast(str, res["argument"]),
                )
                name = cast(str, res["name"])
                if name not in agent_names:
                    print(
                        f"The name of the agent, {name}, is not in the list of agent names, {agent_names}"
                    )
                    # Fall back to alternating speakers by turn position.
                    name = agent_names[
                        line_idx % 2
                    ]  # TODO Not sure what name to be set here
            except Exception as e:
                print(
                    f"Error when parsing the dialogue: {line}",
                    f"The error is: {e}",
                )
                # Bare raise keeps the original traceback intact.
                raise
            inactive_agent_name = (
                agent_names[0] if name == agent_names[1] else agent_names[1]
            )
            results.append(
                [
                    (
                        "Environment",
                        agent_name,
                        Observation(
                            last_turn="environment is the agent",
                            turn_number=line_idx + 1,
                            available_actions=["none"],
                        ),
                    )
                    for agent_name in agent_names
                ]
                + [
                    (name, "Environment", parsed_action),
                    (
                        inactive_agent_name,
                        "Environment",
                        AgentAction(action_type="none", argument="did nothing"),
                    ),
                ]
            )

            agent_results.append((name, parsed_action))
        return (results, agent_results)

    def parse_single_dialogue(
        self, dialogue: str
    ) -> dict[str, str | int | AgentAction | None]:
        """Parse one turn of text into its turn number, name, action and argument.

        Args:
            dialogue: a "Turn #<n>" header line followed by a line that starts
                with the agent's name and continues with the action text.

        Returns:
            A dict with keys "turn" (int), "name" (str), "action" (one of the
            ActionType strings, or None when the action text is not
            recognised) and "argument" (str, or None when not recognised).

        Raises:
            ValueError: when the text does not start with a turn header
                followed by a capitalised name.
        """

        # Match the turn number and name. Assume every word of an agent name
        # starts with a capital letter followed by lowercase letters or apostrophes.
        match_turn_name = re.match(
            r"Turn #?(\d+):?\s*\n((?:[A-Z]['a-z]* ?)+)", dialogue
        )

        if not match_turn_name:
            raise ValueError(
                f"The dialogue does not match the expected format: {dialogue}"
            )

        turn, name = match_turn_name.groups()
        # Everything after the header and the name is the action text.
        action_content = dialogue[len(match_turn_name.group(0)) :].strip()

        action: str | None
        argument: str | None
        # Structured formats are tested before the bare phrases so that a
        # quoted utterance or an action description which merely contains
        # "did nothing" or "left the conversation" keeps its real action type.
        if match := re.match(r'(?:\[speak\] )?said: "(.*?)"', action_content):
            action, argument = "speak", match.group(1).strip()
        elif match := re.match(
            r"\[(non-verbal communication|action)\] (.*)", action_content
        ):
            action, argument = match.groups()
        elif "did nothing" in action_content:
            action, argument = "none", ""
        elif "left the conversation" in action_content:
            # TODO Make it more elegant to handle the situation of `left the conversation.`
            action, argument = "leave", ""
        else:
            action, argument = None, None

        return {
            "turn": int(turn),
            "name": name.strip(),
            "action": action,
            "argument": argument,
        }

    def split_by_turn(self, input_string: str) -> list[str]:
        """Split the input dialogue string into one string per turn.

        Text before the first turn header is discarded, and the last turn is
        cut down to its first two lines so trailing output is ignored.

        Args:
            input_string: the full interaction text.

        Returns:
            The turns in order, each starting with its "Turn" header. An empty
            list when the text contains no turn header.

        Raises:
            ValueError: when any turn spans three or more lines.
        """
        # The zero-width lookahead splits in front of every "Turn #<n>" /
        # "Turn <n>" header while keeping the header in the resulting piece.
        pieces = (piece.strip() for piece in re.split(r"(?=Turn #?\d+)", input_string))
        dialogues = [piece for piece in pieces if piece.startswith("Turn")]
        if not dialogues:
            # Nothing to split; indexing the last element would raise IndexError.
            return []

        # Discard further input in the last turn.
        dialogues[-1] = "\n".join(dialogues[-1].split("\n")[:2])

        for dialogue in dialogues:
            # TODO this is current workaround for the issue of multiple agents in one turn
            if len(dialogue.split("\n")) >= 3:
                raise ValueError("Only one agent can act per turn.")
        return dialogues

    @staticmethod
    def default_value_for_return_type() -> ScriptInteractionReturnType:
        """Return a placeholder value with the same shape as `parse`'s result.

        The first element holds a single turn-0 entry with two observations
        addressed to "none"; the second holds one "none" action with an empty
        agent name.
        """
        results_1: list[list[tuple[str, str, Message]]] = [
            [
                (
                    "Environment",
                    name,
                    Observation(
                        last_turn="Environment is the agent",
                        turn_number=0,
                        available_actions=["none"],
                    ),
                )
                for name in ["none", "none"]
            ]
        ]
        results_2: list[tuple[str, Message]] = [
            ("", AgentAction(action_type="none", argument=""))
        ]
        return (results_1, results_2)