Spaces:

pikamomo
/

Deep_Research_Agent

Sleeping

App Files Files Community

Deep_Research_Agent / backend /src /utils.py

pikamomo

Initial deployment

a60c0af 2 months ago

raw

history blame contribute delete

2.68 kB

	"""Utility helpers shared across deep researcher services."""

	from __future__ import annotations

	import logging
	from typing import Any, Dict, List, Union

	CHARS_PER_TOKEN = 4

	logger = logging.getLogger(__name__)


	def get_config_value(value: Any) -> str:
	    """Normalise a configuration entry to a plain string.

	    Strings are handed back untouched. Anything else is expected to expose
	    a ``.value`` attribute, which is returned instead (presumably an enum
	    member -- confirm against callers).
	    """
	    if isinstance(value, str):
	        return value
	    return value.value


	def strip_thinking_tokens(text: str) -> str:
	    """Remove ``<think>...</think>`` sections from model responses.

	    Every opening tag that has a closing tag somewhere after it is removed
	    together with the enclosed text. Tags that cannot be paired (an
	    unclosed ``<think>`` or a stray ``</think>`` that precedes any opening
	    tag) are left in place.

	    Args:
	        text: Raw model output.

	    Returns:
	        ``text`` with all paired thinking sections removed.
	    """
	    open_tag = "<think>"
	    close_tag = "</think>"

	    while True:
	        start = text.find(open_tag)
	        if start == -1:
	            return text
	        # Look for the closing tag only *after* the opening one. Searching
	        # from the beginning could pick up a stray earlier ``</think>``,
	        # giving end < start; the splice would then grow the string on
	        # every pass and the loop would never terminate.
	        end = text.find(close_tag, start + len(open_tag))
	        if end == -1:
	            return text
	        text = text[:start] + text[end + len(close_tag):]


	def deduplicate_and_format_sources(
	    search_response: Dict[str, Any] | List[Dict[str, Any]],
	    max_tokens_per_source: int,
	    *,
	    fetch_full_page: bool = False,
	) -> str:
	    """Format and deduplicate search results for downstream prompting.

	    Args:
	        search_response: Either a mapping whose ``"results"`` entry holds the
	            result dicts, or the list of result dicts itself.
	        max_tokens_per_source: Budget for each source's ``raw_content``,
	            converted to characters via ``CHARS_PER_TOKEN``. Only used when
	            ``fetch_full_page`` is true.
	        fetch_full_page: When true, append each source's ``raw_content``
	            (truncated to the budget) after its summary.

	    Returns:
	        All formatted sources joined into one string with surrounding
	        whitespace stripped; an empty string when no source has a URL.
	    """
	    if isinstance(search_response, dict):
	        # ``or []`` also covers an explicit ``"results": None`` payload.
	        sources_list = search_response.get("results") or []
	    else:
	        sources_list = search_response

	    # The first occurrence of each URL wins; entries without a URL are dropped.
	    unique_sources: dict[str, Dict[str, Any]] = {}
	    for source in sources_list:
	        url = source.get("url")
	        if url:
	            unique_sources.setdefault(url, source)

	    formatted_parts: List[str] = []
	    for url, source in unique_sources.items():
	        title = source.get("title") or url
	        content = source.get("content")
	        if content is None:
	            # Avoid leaking the literal text "None" into the prompt.
	            content = ""
	        formatted_parts.append(f"Source: {title}\n\n")
	        formatted_parts.append(f"URL: {url}\n\n")
	        formatted_parts.append(f"Content: {content}\n\n")

	        if fetch_full_page:
	            raw_content = source.get("raw_content")
	            if raw_content is None:
	                logger.debug("raw_content missing for %s", url)
	                raw_content = ""
	            # Clamp at zero: a negative budget would otherwise become a
	            # negative slice index and silently trim from the end instead.
	            char_limit = max(0, max_tokens_per_source) * CHARS_PER_TOKEN
	            if len(raw_content) > char_limit:
	                raw_content = f"{raw_content[:char_limit]}... [truncated]"
	            formatted_parts.append(
	                f"Full content limited to {max_tokens_per_source} tokens: {raw_content}\n\n"
	            )

	    return "".join(formatted_parts).strip()


	def format_sources(search_results: Dict[str, Any] | None) -> str:
	    """Return a bullet list summarising search sources.

	    Each entry of ``search_results["results"]`` that has a non-empty ``url``
	    becomes one line of the form ``* <title> : <url>``. When the title is
	    missing, ``None`` or empty, the URL is used as the label.

	    Args:
	        search_results: Mapping with a ``"results"`` list of result dicts,
	            or ``None``.

	    Returns:
	        The bullet lines joined with newlines; an empty string when there is
	        nothing to list.
	    """
	    if not search_results:
	        return ""

	    # ``or []`` also covers an explicit ``"results": None`` payload.
	    results = search_results.get("results") or []
	    return "\n".join(
	        # ``or`` (rather than a ``.get`` default) falls back to the URL even
	        # when the title key exists but holds None or an empty string.
	        f"* {item.get('title') or item['url']} : {item['url']}"
	        for item in results
	        if item.get("url")
	    )