Spaces:

cjber
/

planning-ai

Sleeping

App Files Files Community

planning-ai / planning_ai /nodes /map_node.py

cjber

fix: allow chapters to dynamically update

fbb5eac 4 months ago

raw

history blame contribute delete

4.68 kB

	import numpy as np
	import spacy
	from langgraph.types import Send
	from presidio_analyzer import AnalyzerEngine
	from presidio_anonymizer import AnonymizerEngine

	from planning_ai.chains.map_chain import map_chain
	from planning_ai.chains.themes_chain import create_dynamic_chain
	from planning_ai.logging import logger
	from planning_ai.states import DocumentState, OverallState

	# Shared Presidio engines, created once at import time and reused by
	# `remove_pii` for every document.
	analyzer = AnalyzerEngine()
	anonymizer = AnonymizerEngine()

	# NOTE(review): `nlp` is not referenced by any function visible in this
	# module — confirm whether another module imports it before removing it.
	nlp = spacy.load("en_core_web_lg")


	def retrieve_chapters(state: DocumentState) -> DocumentState:
	    """Score the document against the available chapters and keep strong matches.

	    A chain is built from ``state["chapters"]`` with `create_dynamic_chain` and
	    invoked on the document's page content. Each chapter in the result is
	    reduced to a plain dict with ``chapter``, ``score`` and ``description``
	    keys, and only entries scoring 4 or more are stored under
	    ``state["themes"]``.

	    Args:
	        state (DocumentState): The document state; must provide ``"chapters"``
	            and a ``"document"`` exposing ``page_content``.

	    Returns:
	        DocumentState: The same state object, with ``"themes"`` set. The list
	            is empty when the chain returns no chapters or when any error
	            occurs while building or running the chain.
	    """
	    try:
	        chain = create_dynamic_chain(state["chapters"])
	        outcome = chain.invoke({"document": state["document"].page_content})
	        if not outcome.chapters:
	            state["themes"] = []
	            return state
	        scored = []
	        for item in outcome.chapters:
	            dumped = item.model_dump()
	            scored.append(
	                {
	                    # `.value` is read here — presumably the chapter is an
	                    # Enum member; verify against the chain's output schema.
	                    "chapter": dumped["chapter"].value,
	                    "score": dumped["score"],
	                    "description": dumped["description"],
	                }
	            )
	    except Exception as e:
	        # Best effort: a failed theme lookup must not abort the pipeline, so
	        # the document simply ends up with no themes.
	        logger.error(f"Theme selection error: {e}")
	        scored = []
	    state["themes"] = [entry for entry in scored if entry["score"] >= 4]
	    return state


	def remove_pii(document: str) -> str:
	    """Return a copy of the text with detected personal data anonymized.

	    The module-level Presidio analyzer scans the English text for person
	    names, phone numbers and email addresses; the module-level anonymizer then
	    rewrites those spans. No custom operators are passed, so the anonymizer's
	    defaults apply.

	    Args:
	        document (str): The raw document text to clean.

	    Returns:
	        str: The anonymized document text.
	    """
	    findings = analyzer.analyze(
	        text=document,
	        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
	        language="en",
	    )
	    anonymized = anonymizer.anonymize(text=document, analyzer_results=findings)
	    return anonymized.text


	def generate_summary(state: DocumentState) -> dict:
	    """Anonymize a document, attach its themes, and generate its summary.

	    Steps, in order:

	    1. Replace ``state["document"].page_content`` with its PII-stripped
	       version (the document object is modified in place).
	    2. Populate ``state["themes"]`` via `retrieve_chapters`.
	    3. Invoke `map_chain` on the anonymized content to produce the summary.

	    Args:
	        state (DocumentState): The document state; must provide ``"filename"``
	            and a ``"document"`` exposing ``page_content``.

	    Returns:
	        dict: ``{"documents": [entry]}`` where ``entry`` is the state extended
	            with ``summary``, ``refinement_attempts``, ``is_hallucinated``,
	            ``failed`` and ``processed``. If the chain raises, ``summary`` is
	            ``""`` and the entry is marked ``failed=True, processed=True``;
	            otherwise ``summary`` is the chain response with
	            ``failed=False, processed=False``.
	    """
	    filename = state["filename"]
	    logger.info(f"Generating summary for document: {filename}")

	    logger.info(f"Starting PII removal for: {filename}")
	    state["document"].page_content = remove_pii(state["document"].page_content)

	    logger.info(f"Retrieving themes for: {filename}")
	    state = retrieve_chapters(state)

	    try:
	        response = map_chain.invoke({"context": state["document"].page_content})
	    except Exception as e:
	        # Log the filename rather than the whole document object: the handler
	        # catches any chain failure (not only JSON decoding), and dumping the
	        # document would write its full content and metadata into the logs.
	        logger.error(f"Failed to generate summary for document {filename}: {e}")
	        return {
	            "documents": [
	                {
	                    **state,
	                    "summary": "",
	                    "refinement_attempts": 0,
	                    "is_hallucinated": True,
	                    "processed": True,
	                    "failed": True,
	                }
	            ]
	        }
	    logger.info(f"Summary generation completed for document: {filename}")

	    return {
	        "documents": [
	            {
	                **state,
	                "summary": response,
	                "refinement_attempts": 0,
	                "is_hallucinated": True,  # start true to ensure cycle begins
	                "failed": False,
	                "processed": False,
	            }
	        ]
	    }


	def map_documents(state: OverallState) -> list[Send]:
	    """Fan out one summary-generation request per document.

	    Every entry of ``state["documents"]`` is wrapped in a `Send` addressed to
	    the ``"generate_summary"`` node, with the document itself as the payload.

	    Args:
	        state (OverallState): The overall state holding the ``"documents"``
	            collection.

	    Returns:
	        list[Send]: One `Send` per document, in the same order as
	            ``state["documents"]``.
	    """
	    logger.info("Mapping documents to generate summaries.")
	    dispatches = []
	    for document in state["documents"]:
	        dispatches.append(Send("generate_summary", document))
	    return dispatches