Spaces:
Running
Running
import numpy as np | |
import spacy | |
from langgraph.types import Send | |
from presidio_analyzer import AnalyzerEngine | |
from presidio_anonymizer import AnonymizerEngine | |
from planning_ai.chains.map_chain import map_chain | |
from planning_ai.chains.themes_chain import create_dynamic_chain | |
from planning_ai.logging import logger | |
from planning_ai.states import DocumentState, OverallState | |
# Presidio engines are built once at import time and shared by every call to
# `remove_pii` below, rather than being re-created per document.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
# spaCy pipeline loaded at import time. NOTE(review): `nlp` is not referenced
# in the code visible here — presumably used elsewhere; confirm before removing.
nlp = spacy.load("en_core_web_lg")
def retrieve_chapters(state: DocumentState) -> DocumentState:
    """Attach the relevant chapters ("themes") to a document state.

    A chain is built from ``state["chapters"]`` with `create_dynamic_chain` and
    invoked on the document's page content. Each chapter the chain returns is
    reduced to its ``chapter`` value, ``score`` and ``description``; only those
    scoring 4 or more are stored under ``state["themes"]``.

    Any exception raised while building or running the chain is logged and
    treated as "no chapters found", so the document ends up with an empty
    theme list instead of aborting the pipeline.

    Args:
        state (DocumentState): Document state providing ``"chapters"`` and a
            ``"document"`` with ``page_content``.

    Returns:
        DocumentState: The same state object, with ``"themes"`` set.
    """
    selected = []
    try:
        chain = create_dynamic_chain(state["chapters"])
        outcome = chain.invoke({"document": state["document"].page_content})
        if outcome.chapters:
            dumped = [item.model_dump() for item in outcome.chapters]
            for entry in dumped:
                selected.append(
                    {
                        "chapter": entry["chapter"].value,
                        "score": entry["score"],
                        "description": entry["description"],
                    }
                )
    except Exception as e:
        logger.error(f"Theme selection error: {e}")
        # Discard anything collected before the failure.
        selected = []

    state["themes"] = [entry for entry in selected if entry["score"] >= 4]
    return state
def remove_pii(document: str) -> str:
    """Return `document` with personal details anonymised.

    The module-level Presidio analyzer is asked to find person names, phone
    numbers and email addresses in English text; the module-level anonymizer
    then rewrites the text using those findings.

    Args:
        document (str): Raw document text.

    Returns:
        str: The anonymised text produced by the Presidio anonymizer.
    """
    pii_entities = ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"]
    detections = analyzer.analyze(
        text=document,
        entities=pii_entities,
        language="en",
    )
    scrubbed = anonymizer.anonymize(text=document, analyzer_results=detections)
    return scrubbed.text
def generate_summary(state: DocumentState) -> dict:
    """Anonymise a document, tag its themes and produce an initial summary.

    Steps, in order:

    1. The document's ``page_content`` is replaced in place with the output of
       `remove_pii`.
    2. `retrieve_chapters` sets ``state["themes"]``.
    3. `map_chain` is invoked with the anonymised content as ``"context"``.

    Args:
        state (DocumentState): Document state providing ``"document"`` (an
            object with ``page_content``) and ``"filename"``.

    Returns:
        dict: ``{"documents": [entry]}`` where ``entry`` is the state plus
        ``summary``, ``refinement_attempts``, ``is_hallucinated``,
        ``processed`` and ``failed``. On failure of `map_chain` the summary is
        an empty string and the entry is marked ``processed`` and ``failed``.
    """
    filename = state["filename"]
    logger.info(f"Generating summary for document: {filename}")

    logger.info(f"Starting PII removal for: {filename}")
    state["document"].page_content = remove_pii(state["document"].page_content)

    logger.info(f"Retrieving themes for: {filename}")
    state = retrieve_chapters(state)

    try:
        response = map_chain.invoke({"context": state["document"].page_content})
    except Exception as e:
        # Deliberate best-effort boundary: one bad document must not abort the
        # whole run. Log the filename only — the previous message interpolated
        # the whole document object (full text plus unscrubbed metadata) and
        # mislabelled every failure as a JSON decoding problem.
        logger.error(f"Summary generation failed for document {filename}: {e}")
        return {
            "documents": [
                {
                    **state,
                    "summary": "",
                    "refinement_attempts": 0,
                    "is_hallucinated": True,
                    "processed": True,
                    "failed": True,
                }
            ]
        }

    logger.info(f"Summary generation completed for document: {filename}")
    return {
        "documents": [
            {
                **state,
                "summary": response,
                "refinement_attempts": 0,
                "is_hallucinated": True,  # start true to ensure cycle begins
                "failed": False,
                "processed": False,
            }
        ]
    }
def map_documents(state: OverallState) -> list[Send]:
    """Fan out one summary-generation task per document.

    Args:
        state (OverallState): Overall state whose ``"documents"`` entry is
            iterated.

    Returns:
        list[Send]: One ``Send`` targeting the ``"generate_summary"`` node for
        each document, in the order the documents appear in the state.
    """
    logger.info("Mapping documents to generate summaries.")
    dispatches = []
    for document in state["documents"]:
        dispatches.append(Send("generate_summary", document))
    return dispatches