from typing import Any, Dict, List
from langchain_core.messages import AnyMessage, AIMessage, HumanMessage
def get_research_topic(messages: List[AnyMessage]) -> str:
"""
Get the research topic from the messages.
"""
# check if request has a history and combine the messages into a single string
if len(messages) == 1:
research_topic = messages[-1].content
else:
research_topic = ""
for message in messages:
if isinstance(message, HumanMessage):
research_topic += f"User: {message.content}\n"
elif isinstance(message, AIMessage):
research_topic += f"Assistant: {message.content}\n"
return research_topic
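

# A quick illustration of get_research_topic with hypothetical inputs: a single
# message passes through unchanged, while a multi-turn history is flattened.
#
#   get_research_topic([HumanMessage(content="solid-state batteries")])
#   # -> "solid-state batteries"
#
#   get_research_topic([
#       HumanMessage(content="solid-state batteries"),
#       AIMessage(content="Here is an overview..."),
#   ])
#   # -> "User: solid-state batteries\nAssistant: Here is an overview...\n"
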
def resolve_urls(urls_to_resolve: List[Any], id: int) -> Dict[str, str]:
"""
Create a map of the vertex ai search urls (very long) to a short url with a unique id for each url.
Ensures each original URL gets a consistent shortened form while maintaining uniqueness.
"""
prefix = f"https://vertexaisearch.cloud.google.com/id/"
urls = [site.web.uri for site in urls_to_resolve]
# Create a dictionary that maps each unique URL to its first occurrence index
resolved_map = {}
for idx, url in enumerate(urls):
if url not in resolved_map:
resolved_map[url] = f"{prefix}{id}-{idx}"
return resolved_map
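

# A sketch of resolve_urls in use, assuming (as the code above does) that each
# item carries a `web.uri` attribute; SimpleNamespace stands in for the real
# grounding chunk objects. Duplicate URLs map to the same short form:
#
#   from types import SimpleNamespace
#   chunks = [
#       SimpleNamespace(web=SimpleNamespace(uri="https://example.com/a")),
#       SimpleNamespace(web=SimpleNamespace(uri="https://example.com/b")),
#       SimpleNamespace(web=SimpleNamespace(uri="https://example.com/a")),  # duplicate
#   ]
#   resolve_urls(chunks, 7)
#   # -> {"https://example.com/a": "https://vertexaisearch.cloud.google.com/id/7-0",
#   #     "https://example.com/b": "https://vertexaisearch.cloud.google.com/id/7-1"}
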
def insert_citation_markers(text, citations_list):
"""
Inserts citation markers into a text string based on start and end indices.
Args:
text (str): The original text string.
citations_list (list): A list of dictionaries, where each dictionary
contains 'start_index', 'end_index', and
'segment_string' (the marker to insert).
Indices are assumed to be for the original text.
Returns:
str: The text with citation markers inserted.
"""
# Sort citations by end_index in descending order.
# If end_index is the same, secondary sort by start_index descending.
# This ensures that insertions at the end of the string don't affect
# the indices of earlier parts of the string that still need to be processed.
sorted_citations = sorted(
citations_list, key=lambda c: (c["end_index"], c["start_index"]), reverse=True
)
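    # Example: with citations ending at indices 21 and 10, the marker at 21 is
    # inserted first, so index 10 still points at its original character.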
modified_text = text
for citation_info in sorted_citations:
        # These indices refer to positions in the *original* text, but because
        # we iterate from the end, each insertion lands after the still-untouched
        # prefix, so earlier indices remain valid.
end_idx = citation_info["end_index"]
marker_to_insert = ""
for segment in citation_info["segments"]:
marker_to_insert += f" [{segment['label']}]({segment['short_url']})"
# Insert the citation marker at the original end_idx position
modified_text = (
modified_text[:end_idx] + marker_to_insert + modified_text[end_idx:]
)
return modified_text
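

# A worked example of insert_citation_markers with hypothetical data; two
# citations show that inserting from the end keeps the earlier index valid:
#
#   insert_citation_markers(
#       "Cats purr. Dogs bark.",
#       [
#           {"start_index": 0, "end_index": 10, "segments": [
#               {"label": "cats", "short_url": "https://vertexaisearch.cloud.google.com/id/0-0"}]},
#           {"start_index": 11, "end_index": 21, "segments": [
#               {"label": "dogs", "short_url": "https://vertexaisearch.cloud.google.com/id/0-1"}]},
#       ],
#   )
#   # -> "Cats purr. [cats](https://vertexaisearch.cloud.google.com/id/0-0) Dogs bark. [dogs](https://vertexaisearch.cloud.google.com/id/0-1)"
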
def get_citations(response, resolved_urls_map):
"""
Extracts and formats citation information from a Gemini model's response.
This function processes the grounding metadata provided in the response to
construct a list of citation objects. Each citation object includes the
start and end indices of the text segment it refers to, and a string
containing formatted markdown links to the supporting web chunks.
Args:
response: The response object from the Gemini model, expected to have
a structure including `candidates[0].grounding_metadata`.
It also relies on a `resolved_map` being available in its
scope to map chunk URIs to resolved URLs.
Returns:
list: A list of dictionaries, where each dictionary represents a citation
and has the following keys:
- "start_index" (int): The starting character index of the cited
segment in the original text. Defaults to 0
if not specified.
- "end_index" (int): The character index immediately after the
end of the cited segment (exclusive).
- "segments" (list[str]): A list of individual markdown-formatted
links for each grounding chunk.
- "segment_string" (str): A concatenated string of all markdown-
formatted links for the citation.
Returns an empty list if no valid candidates or grounding supports
are found, or if essential data is missing.
"""
citations = []
# Ensure response and necessary nested structures are present
if not response or not response.candidates:
return citations
candidate = response.candidates[0]
if (
not hasattr(candidate, "grounding_metadata")
or not candidate.grounding_metadata
or not hasattr(candidate.grounding_metadata, "grounding_supports")
):
return citations
for support in candidate.grounding_metadata.grounding_supports:
citation = {}
# Ensure segment information is present
if not hasattr(support, "segment") or support.segment is None:
continue # Skip this support if segment info is missing
start_index = (
support.segment.start_index
if support.segment.start_index is not None
else 0
)
# Ensure end_index is present to form a valid segment
if support.segment.end_index is None:
continue # Skip if end_index is missing, as it's crucial
        # end_index is used as-is and treated as an exclusive bound for
        # downstream slicing/insertion (assuming the API provides it that way)
citation["start_index"] = start_index
citation["end_index"] = support.segment.end_index
citation["segments"] = []
if (
hasattr(support, "grounding_chunk_indices")
and support.grounding_chunk_indices
):
for ind in support.grounding_chunk_indices:
try:
chunk = candidate.grounding_metadata.grounding_chunks[ind]
resolved_url = resolved_urls_map.get(chunk.web.uri, None)
citation["segments"].append(
{
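                            # Label: drop the last dot-separated piece of the
                            # title (e.g. "example.com" -> "example") and keep
                            # the first remaining component; a title with no
                            # dot raises IndexError and is skipped below.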
"label": chunk.web.title.split(".")[:-1][0],
"short_url": resolved_url,
"value": chunk.web.uri,
}
)
except (IndexError, AttributeError, NameError):
                        # Handle cases where chunk, web, uri, or resolved_urls_map
                        # might be problematic. For simplicity, this particular
                        # segment link is skipped; a production system might want
                        # to log this instead.
pass
citations.append(citation)
return citations
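

if __name__ == "__main__":
    # A minimal smoke test with mock objects (hypothetical data, not part of the
    # original pipeline): SimpleNamespace stands in for the Gemini response and
    # grounding-chunk structures these helpers expect.
    from types import SimpleNamespace

    chunks = [
        SimpleNamespace(web=SimpleNamespace(uri="https://example.com/a")),
        SimpleNamespace(web=SimpleNamespace(uri="https://example.com/b")),
    ]
    short = resolve_urls(chunks, 0)
    print(short)

    response = SimpleNamespace(
        candidates=[
            SimpleNamespace(
                grounding_metadata=SimpleNamespace(
                    grounding_supports=[
                        SimpleNamespace(
                            segment=SimpleNamespace(start_index=0, end_index=21),
                            grounding_chunk_indices=[0],
                        )
                    ],
                    grounding_chunks=[
                        SimpleNamespace(
                            web=SimpleNamespace(
                                uri="https://example.com/a", title="example.com"
                            )
                        )
                    ],
                )
            )
        ]
    )
    citations = get_citations(response, short)
    text = "Solid-state batteries are improving."
    print(insert_citation_markers(text, citations))
    # -> Solid-state batteries [example](https://vertexaisearch.cloud.google.com/id/0-0) are improving.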