cjber's picture
fix: allow chapters to dynamically update
fbb5eac
raw
history blame contribute delete
4.68 kB
import numpy as np
import spacy
from langgraph.types import Send
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from planning_ai.chains.map_chain import map_chain
from planning_ai.chains.themes_chain import create_dynamic_chain
from planning_ai.logging import logger
from planning_ai.states import DocumentState, OverallState
# Presidio engines used by remove_pii: the analyzer detects PII spans and the
# anonymizer rewrites them. Both are created once, at import time.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
# spaCy "en_core_web_lg" pipeline, also loaded at import time.
# NOTE(review): not referenced in the portion of the file reviewed here —
# confirm it is still needed before keeping the load cost.
nlp = spacy.load("en_core_web_lg")
def retrieve_chapters(state: DocumentState) -> DocumentState:
    """Score a document against the configured chapters and keep strong matches.

    A chain is built from ``state["chapters"]`` with ``create_dynamic_chain``
    and invoked on the document's page content. Every chapter in the result is
    reduced to a dict holding its ``chapter`` value, ``score`` and
    ``description``; entries with a score of 4 or more are stored under
    ``state["themes"]``. If building or invoking the chain, or converting its
    result, raises, the error is logged and the document gets no themes.

    Args:
        state (DocumentState): The document state. ``chapters`` and
            ``document`` are read; ``themes`` is written in place.

    Returns:
        DocumentState: The same state object with ``themes`` set.
    """
    try:
        chain = create_dynamic_chain(state["chapters"])
        outcome = chain.invoke({"document": state["document"].page_content})

        # Nothing matched: record an empty theme list and stop early.
        if not outcome.chapters:
            state["themes"] = []
            return state

        scored = []
        for item in outcome.chapters:
            dumped = item.model_dump()
            scored.append(
                {
                    "chapter": dumped["chapter"].value,
                    "score": dumped["score"],
                    "description": dumped["description"],
                }
            )
    except Exception as e:
        logger.error(f"Theme selection error: {e}")
        scored = []

    # Only chapters scoring at least 4 are kept as themes.
    selected = []
    for entry in scored:
        if entry["score"] >= 4:
            selected.append(entry)
    state["themes"] = selected
    return state
def remove_pii(document: str) -> str:
    """Return ``document`` with detected PII anonymized.

    The module-level Presidio analyzer is asked for ``PERSON``,
    ``PHONE_NUMBER`` and ``EMAIL_ADDRESS`` entities in English text, and the
    module-level anonymizer then rewrites the spans it reported.

    Args:
        document (str): The text to scrub.

    Returns:
        str: The anonymized text.
    """
    pii_entities = ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"]
    detections = analyzer.analyze(
        text=document,
        entities=pii_entities,
        language="en",
    )
    anonymized = anonymizer.anonymize(text=document, analyzer_results=detections)
    return anonymized.text
def generate_summary(state: DocumentState) -> dict:
    """Anonymize a document, tag its themes, and summarize it with ``map_chain``.

    The document's page content is replaced in place with its PII-scrubbed
    form, ``retrieve_chapters`` fills in ``themes``, and ``map_chain`` is then
    invoked on the scrubbed content. If the chain raises, the error is logged
    and the document is returned marked as failed and processed, with an
    empty summary.

    Args:
        state (DocumentState): The document state. ``filename`` and
            ``document`` are read; ``document.page_content`` is overwritten.

    Returns:
        dict: ``{"documents": [entry]}`` where ``entry`` is the state plus
        ``summary``, ``refinement_attempts``, ``is_hallucinated``, ``failed``
        and ``processed``.
    """
    filename = state["filename"]
    logger.info(f"Generating summary for document: {filename}")

    logger.info(f"Starting PII removal for: {filename}")
    state["document"].page_content = remove_pii(state["document"].page_content)

    logger.info(f"Retrieving themes for: {filename}")
    state = retrieve_chapters(state)

    # Fields shared by the success and failure results.
    base = {
        **state,
        "refinement_attempts": 0,
        "is_hallucinated": True,  # start true to ensure cycle begins
    }

    try:
        response = map_chain.invoke({"context": state["document"].page_content})
    except Exception as e:
        # Any chain failure lands here, not only JSON decoding errors, so the
        # message is generic. It names the file instead of interpolating the
        # whole document object, which would put the full text in the logs.
        logger.error(f"Summary generation failed for document: {filename}: {e}")
        return {
            "documents": [
                {**base, "summary": "", "processed": True, "failed": True}
            ]
        }

    logger.info(f"Summary generation completed for document: {filename}")
    return {
        "documents": [
            {**base, "summary": response, "failed": False, "processed": False}
        ]
    }
def map_documents(state: OverallState) -> list[Send]:
    """Fan out one ``generate_summary`` dispatch per document.

    Args:
        state (OverallState): The overall state; its ``documents`` entries
            are each wrapped in a ``Send``.

    Returns:
        list[Send]: One ``Send("generate_summary", document)`` per entry, in
        the same order as ``state["documents"]``.
    """
    logger.info("Mapping documents to generate summaries.")
    dispatches = []
    for doc_state in state["documents"]:
        dispatches.append(Send("generate_summary", doc_state))
    return dispatches