Spaces:
Build error
Build error
| import numpy as np | |
| import spacy | |
| from langgraph.types import Send | |
| from presidio_analyzer import AnalyzerEngine | |
| from presidio_anonymizer import AnonymizerEngine | |
| from planning_ai.chains.map_chain import map_chain | |
| from planning_ai.chains.themes_chain import create_dynamic_chain | |
| from planning_ai.logging import logger | |
| from planning_ai.states import DocumentState, OverallState | |
# Module-level Presidio engines, created once at import time and reused by
# `remove_pii` for every document.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Large English spaCy pipeline, loaded at import time.
# NOTE(review): `nlp` is not referenced in the visible part of this module —
# confirm it is used elsewhere before keeping the load cost.
nlp = spacy.load("en_core_web_lg")
def retrieve_chapters(state: DocumentState) -> DocumentState:
    """Score a document against the configured chapters and keep strong matches.

    A chain is built from ``state["chapters"]`` with ``create_dynamic_chain`` and
    invoked on the document's page content. Each chapter returned by the chain is
    reduced to a plain dict (``chapter``, ``score``, ``description``), and those
    with a score of at least 4 are stored under ``state["themes"]``.

    If the chain returns no chapters, or if building/running the chain raises,
    ``state["themes"]`` is set to an empty list (the exception is logged).

    Args:
        state (DocumentState): The current document state. ``"chapters"`` and
            ``"document"`` are read; ``"themes"`` is written.

    Returns:
        DocumentState: The same state object with ``"themes"`` populated.
    """
    scored = []
    try:
        chain = create_dynamic_chain(state["chapters"])
        result = chain.invoke({"document": state["document"].page_content})

        # Nothing matched: record an empty theme list and stop early.
        if not result.chapters:
            state["themes"] = []
            return state

        for entry in result.chapters:
            dumped = entry.model_dump()
            scored.append(
                {
                    # The dumped chapter is an enum-like object; keep its value.
                    "chapter": dumped["chapter"].value,
                    "score": dumped["score"],
                    "description": dumped["description"],
                }
            )
    except Exception as e:
        logger.error(f"Theme selection error: {e}")
        # Discard any partially collected results on failure.
        scored = []

    # Only chapters scoring 4 or higher count as themes for this document.
    state["themes"] = [item for item in scored if item["score"] >= 4]
    return state
def remove_pii(document: str) -> str:
    """Anonymize personal data in a piece of text.

    The module-level Presidio analyzer is asked to locate ``PERSON``,
    ``PHONE_NUMBER`` and ``EMAIL_ADDRESS`` entities in English text; the
    module-level anonymizer then rewrites the text using those findings.

    Args:
        document (str): The text to scrub.

    Returns:
        str: The anonymized text produced by the anonymizer.
    """
    findings = analyzer.analyze(
        text=document,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
        language="en",
    )
    anonymized = anonymizer.anonymize(text=document, analyzer_results=findings)
    return anonymized.text
def generate_summary(state: DocumentState) -> dict:
    """Anonymize a document, tag its themes, and produce its first summary.

    Steps, in order:

    1. The document's ``page_content`` is replaced in place with the output of
       ``remove_pii``.
    2. ``retrieve_chapters`` populates ``state["themes"]``.
    3. ``map_chain`` is invoked on the anonymized content.

    Args:
        state (DocumentState): The current document state; ``"filename"`` and
            ``"document"`` are read, and the document's ``page_content`` is
            mutated.

    Returns:
        dict: ``{"documents": [entry]}`` where ``entry`` is the state plus
        summary bookkeeping fields. If ``map_chain`` raises, the entry has an
        empty summary with ``failed`` and ``processed`` both ``True``;
        otherwise it carries the chain response with both flags ``False``.
        ``is_hallucinated`` is ``True`` and ``refinement_attempts`` is ``0`` in
        both cases.
    """
    filename = state["filename"]
    logger.info(f"Generating summary for document: {filename}")

    logger.info(f"Starting PII removal for: {filename}")
    document = state["document"]
    document.page_content = remove_pii(document.page_content)

    logger.info(f"Retrieving themes for: {filename}")
    state = retrieve_chapters(state)

    try:
        response = map_chain.invoke({"context": state["document"].page_content})
    except Exception as e:
        # NOTE(review): this interpolates the whole document object rather than
        # the filename used by the other log lines, and labels every failure as
        # a JSON decode error — confirm that is intended.
        logger.error(f"Failed to decode JSON {state['document']}: {e}")
        failed_entry = {
            **state,
            "summary": "",
            "refinement_attempts": 0,
            "is_hallucinated": True,
            "processed": True,
            "failed": True,
        }
        return {"documents": [failed_entry]}

    logger.info(f"Summary generation completed for document: {state['filename']}")
    summarised_entry = {
        **state,
        "summary": response,
        "refinement_attempts": 0,
        "is_hallucinated": True,  # start true to ensure cycle begins
        "failed": False,
        "processed": False,
    }
    return {"documents": [summarised_entry]}
def map_documents(state: OverallState) -> list[Send]:
    """Fan each document out to the summary-generation node.

    Args:
        state (OverallState): The overall state; its ``"documents"`` entry is
            iterated.

    Returns:
        list[Send]: One ``Send`` targeting ``"generate_summary"`` per document,
        in the same order as ``state["documents"]``.
    """
    logger.info("Mapping documents to generate summaries.")
    target_node = "generate_summary"
    documents = state["documents"]
    return [Send(target_node, doc_state) for doc_state in documents]