File size: 8,738 Bytes
fcaa164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

from typing import Any, Callable, List, Literal, Type, Union

from pydantic import BaseModel

from .base import BaseConverter


class OutlinesConverter(BaseConverter):
    r"""Converter that produces constrained output through the ``outlines``
    library.

    A model is loaded once via ``outlines.models`` when the converter is
    constructed. Each ``convert_*`` method then builds the matching
    ``outlines.generate`` generator on that model and calls it with the
    given content.

    Args:
        model_type (str): The model type to be used. It is passed as the
            first argument to the outlines model loader of the platform.
        platform (str, optional): The platform to be used.
            1. transformers
            2. mamba
            3. vllm
            4. llamacpp
            5. mlx
            (default: "transformers")
        **kwargs: The keyword arguments forwarded to the outlines model
            loader. See the outlines documentation for more details. See
            https://dottxt-ai.github.io/outlines/latest/reference/models/models/

    Raises:
        ValueError: If ``platform`` is not one of the supported platforms.
    """

    def __init__(
        self,
        model_type: str,
        platform: Literal[
            "vllm", "transformers", "mamba", "llamacpp", "mlx"
        ] = "transformers",
        **kwargs: Any,
    ):
        self.model_type = model_type
        # Imported lazily so the module can be imported without outlines.
        from outlines import models

        # Platform name -> loader attribute on ``outlines.models``. The
        # loader is looked up by name only after the platform is known, so
        # only the selected backend's loader has to exist in the installed
        # outlines version.
        loader_names = {
            "vllm": "vllm",
            "transformers": "transformers",
            "mamba": "mamba",
            "llamacpp": "llamacpp",
            "mlx": "mlxlm",
        }
        loader_name = (
            loader_names.get(platform) if isinstance(platform, str) else None
        )
        if loader_name is None:
            raise ValueError(f"Unsupported platform: {platform}")

        self._outlines_model = getattr(models, loader_name)(
            model_type, **kwargs
        )

    def convert_regex(self, content: str, regex_pattern: str) -> str:
        r"""Convert the content to the specified regex pattern.

        Args:
            content (str): The content to be converted.
            regex_pattern (str): The regex pattern to be used.

        Returns:
            str: The converted content.
        """
        import outlines

        regex_generator = outlines.generate.regex(
            self._outlines_model, regex_pattern
        )
        return regex_generator(content)

    def convert_json(
        self,
        content: str,
        output_schema: Union[str, Callable],
    ) -> dict:
        r"""Convert the content to the specified JSON schema given by
        output_schema.

        Args:
            content (str): The content to be converted.
            output_schema (Union[str, Callable]): The expected format of the
                response.

        Returns:
            dict: The converted content in JSON format.
        """
        import outlines

        json_generator = outlines.generate.json(
            self._outlines_model, output_schema
        )
        return json_generator(content)

    def convert_pydantic(
        self,
        content: str,
        output_schema: Type[BaseModel],
    ) -> BaseModel:
        r"""Convert the content to the specified Pydantic schema.

        Args:
            content (str): The content to be converted.
            output_schema (Type[BaseModel]): The expected format of the
                response.

        Returns:
            BaseModel: The converted content in pydantic model format.
        """
        import outlines

        # Same outlines generator as ``convert_json``; only the schema kind
        # (a pydantic model class) differs.
        pydantic_generator = outlines.generate.json(
            self._outlines_model, output_schema
        )
        return pydantic_generator(content)

    def convert_type(self, content: str, type_name: type) -> Any:
        r"""Convert the content to the specified type.

        The following types are currently available:
            1. int
            2. float
            3. bool
            4. datetime.date
            5. datetime.time
            6. datetime.datetime
            7. custom types (https://dottxt-ai.github.io/outlines/latest/reference/generation/types/)

        Args:
            content (str): The content to be converted.
            type_name (type): The type to be used.

        Returns:
            Any: The converted content, exactly as returned by the
                ``outlines.generate.format`` generator. NOTE(review):
                presumably a value of ``type_name`` rather than a string;
                confirm against the installed outlines version.
        """
        import outlines

        type_generator = outlines.generate.format(
            self._outlines_model, type_name
        )
        return type_generator(content)

    def convert_choice(self, content: str, choices: List[str]) -> str:
        r"""Convert the content to the specified choice.

        Args:
            content (str): The content to be converted.
            choices (List[str]): The choices to be used.

        Returns:
            str: The converted content.
        """
        import outlines

        choices_generator = outlines.generate.choice(
            self._outlines_model, choices
        )
        return choices_generator(content)

    def convert_grammar(self, content: str, grammar: str) -> str:
        r"""Convert the content to the specified grammar.

        Args:
            content (str): The content to be converted.
            grammar (str): The grammar to be used.

        Returns:
            str: The converted content.
        """
        import outlines

        grammar_generator = outlines.generate.cfg(
            self._outlines_model, grammar
        )
        return grammar_generator(content)

    def convert(  # type: ignore[override]
        self,
        content: str,
        type: Literal[
            "regex", "pydantic", "json", "type", "choice", "grammar"
        ],
        **kwargs,
    ) -> Any:
        r"""Dispatch the content to the ``convert_*`` method selected by
        ``type``.

        Args:
            content (str): The content to be formatted.
            type (Literal["regex", "pydantic", "json", "type", "choice",
                "grammar"]): The type of conversion to perform. Options are:
                    - "regex": Match the content against a regex pattern.
                    - "pydantic": Convert the content into a pydantic model.
                    - "json": Convert the content into a JSON based on a
                      schema.
                    - "type": Convert the content into a specified type.
                    - "choice": Match the content against a list of valid
                      choices.
                    - "grammar": Convert the content using a specified grammar.
            **kwargs: Additional keyword arguments specific to the conversion
                type. Exactly one of them is required per conversion type.

            - For "regex":
                regex_pattern (str): The regex pattern to use for matching.

            - For "pydantic":
                output_schema (Type[BaseModel]): The schema to validate and
                    format the pydantic model.

            - For "json":
                output_schema (Union[str, Callable]): The schema to validate
                    and format the JSON object.

            - For "type":
                type_name (type): The target type for the conversion.

            - For "choice":
                choices (List[str]): A list of valid choices to match against.

            - For "grammar":
                grammar (str): The grammar definition to use for content
                    conversion.

        Returns:
            Any: Whatever the selected ``convert_*`` method returns.

        Raises:
            ValueError: If ``type`` is not a supported conversion type, or if
                the keyword argument required by that conversion type is
                missing or None.
        """
        # Conversion type -> (bound handler, name of its required kwarg).
        handlers: dict[str, tuple[Callable[..., Any], str]] = {
            "regex": (self.convert_regex, "regex_pattern"),
            "pydantic": (self.convert_pydantic, "output_schema"),
            "json": (self.convert_json, "output_schema"),
            "type": (self.convert_type, "type_name"),
            "choice": (self.convert_choice, "choices"),
            "grammar": (self.convert_grammar, "grammar"),
        }
        if not isinstance(type, str) or type not in handlers:
            raise ValueError("Unsupported output schema type")

        handler, required_kwarg = handlers[type]
        argument = kwargs.get(required_kwarg)
        if argument is None:
            # Fail here with a clear message instead of passing None into
            # outlines, where the resulting error is hard to trace back.
            raise ValueError(
                f"Conversion type '{type}' requires the keyword argument "
                f"'{required_kwarg}'"
            )
        return handler(content, argument)