Spaces:
Runtime error
Runtime error
File size: 6,246 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
"""Document transformers that use OpenAI Functions models"""
from typing import Any, Dict, Optional, Sequence, Type, Union
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain.chains.llm import LLMChain
from langchain.chains.openai_functions import create_tagging_chain
class OpenAIMetadataTagger(BaseDocumentTransformer, BaseModel):
"""Extract metadata tags from document contents using OpenAI functions.
Example:
.. code-block:: python
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers import OpenAIMetadataTagger
from langchain_core.documents import Document
schema = {
"properties": {
"movie_title": { "type": "string" },
"critic": { "type": "string" },
"tone": {
"type": "string",
"enum": ["positive", "negative"]
},
"rating": {
"type": "integer",
"description": "The number of stars the critic rated the movie"
}
},
"required": ["movie_title", "critic", "tone"]
}
# Must be an OpenAI model that supports functions
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
tagging_chain = create_tagging_chain(schema, llm)
document_transformer = OpenAIMetadataTagger(tagging_chain=tagging_chain)
original_documents = [
Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."),
Document(page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
]
enhanced_documents = document_transformer.transform_documents(original_documents)
""" # noqa: E501
tagging_chain: LLMChain
"""The chain used to extract metadata from each document."""
def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
"""Automatically extract and populate metadata
for each document according to the provided schema."""
new_documents = []
for document in documents:
extracted_metadata: Dict = self.tagging_chain.run(document.page_content) # type: ignore[assignment] # noqa: E501
new_document = Document(
page_content=document.page_content,
metadata={**extracted_metadata, **document.metadata},
)
new_documents.append(new_document)
return new_documents
async def atransform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
raise NotImplementedError
def create_metadata_tagger(
metadata_schema: Union[Dict[str, Any], Type[BaseModel]],
llm: BaseLanguageModel,
prompt: Optional[ChatPromptTemplate] = None,
*,
tagging_chain_kwargs: Optional[Dict] = None,
) -> OpenAIMetadataTagger:
"""Create a DocumentTransformer that uses an OpenAI function chain to automatically
tag documents with metadata based on their content and an input schema.
Args:
metadata_schema: Either a dictionary or pydantic.BaseModel class. If a dictionary
is passed in, it's assumed to already be a valid JsonSchema.
For best results, pydantic.BaseModels should have docstrings describing what
the schema represents and descriptions for the parameters.
llm: Language model to use, assumed to support the OpenAI function-calling API.
Defaults to use "gpt-3.5-turbo-0613"
prompt: BasePromptTemplate to pass to the model.
Returns:
An LLMChain that will pass the given function to the model.
Example:
.. code-block:: python
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers import create_metadata_tagger
from langchain_core.documents import Document
schema = {
"properties": {
"movie_title": { "type": "string" },
"critic": { "type": "string" },
"tone": {
"type": "string",
"enum": ["positive", "negative"]
},
"rating": {
"type": "integer",
"description": "The number of stars the critic rated the movie"
}
},
"required": ["movie_title", "critic", "tone"]
}
# Must be an OpenAI model that supports functions
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
document_transformer = create_metadata_tagger(schema, llm)
original_documents = [
Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."),
Document(page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
]
enhanced_documents = document_transformer.transform_documents(original_documents)
""" # noqa: E501
metadata_schema = (
metadata_schema
if isinstance(metadata_schema, dict)
else metadata_schema.schema()
)
_tagging_chain_kwargs = tagging_chain_kwargs or {}
tagging_chain = create_tagging_chain(
metadata_schema, llm, prompt=prompt, **_tagging_chain_kwargs
)
return OpenAIMetadataTagger(tagging_chain=tagging_chain)
|