"""General prompt helper that can help deal with token limitations. | |
The helper can split text. It can also concatenate text from Node | |
structs but keeping token limitations in mind. | |
""" | |
from typing import Callable, List, Optional

from gpt_index.constants import MAX_CHUNK_OVERLAP
from gpt_index.data_structs.data_structs import Node
from gpt_index.langchain_helpers.chain_wrapper import LLMPredictor
from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
from gpt_index.prompts.base import Prompt
from gpt_index.utils import globals_helper


class PromptHelper:
    """Prompt helper.

    This utility helps us fill in the prompt, split the text,
    and fill in context information according to necessary token limitations.

    Args:
        max_input_size (int): Maximum input size for the LLM.
        num_output (int): Number of outputs for the LLM.
        max_chunk_overlap (int): Maximum chunk overlap for the LLM.
        embedding_limit (Optional[int]): Maximum number of embeddings to use.
        chunk_size_limit (Optional[int]): Maximum chunk size to use.
        tokenizer (Optional[Callable[[str], List]]): Tokenizer to use.

    """
    def __init__(
        self,
        max_input_size: int,
        num_output: int,
        max_chunk_overlap: int,
        embedding_limit: Optional[int] = None,
        chunk_size_limit: Optional[int] = None,
        tokenizer: Optional[Callable[[str], List]] = None,
        separator: str = " ",
    ) -> None:
        """Init params."""
        self.max_input_size = max_input_size
        self.num_output = num_output
        self.max_chunk_overlap = max_chunk_overlap
        self.embedding_limit = embedding_limit
        self.chunk_size_limit = chunk_size_limit
        # TODO: make configurable
        self._tokenizer = tokenizer or globals_helper.tokenizer
        self._separator = separator
        self.use_chunk_size_limit = chunk_size_limit is not None
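
    # Illustrative construction (the numbers below are example values, not
    # library defaults): a helper for a 4096-token context window that
    # reserves 256 tokens for the completion.
    #   prompt_helper = PromptHelper(
    #       max_input_size=4096,
    #       num_output=256,
    #       max_chunk_overlap=200,
    #   )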

    @classmethod
    def from_llm_predictor(
        cls,
        llm_predictor: LLMPredictor,
        max_chunk_overlap: Optional[int] = None,
        embedding_limit: Optional[int] = None,
        chunk_size_limit: Optional[int] = None,
        tokenizer: Optional[Callable[[str], List]] = None,
    ) -> "PromptHelper":
        """Create from llm predictor.

        This will autofill values like max_input_size and num_output.

        """
        llm_metadata = llm_predictor.get_llm_metadata()
        max_chunk_overlap = max_chunk_overlap or min(
            MAX_CHUNK_OVERLAP,
            llm_metadata.max_input_size // 10,
        )
        if chunk_size_limit is not None:
            max_chunk_overlap = min(max_chunk_overlap, chunk_size_limit // 10)
        return cls(
            llm_metadata.max_input_size,
            llm_metadata.num_output,
            max_chunk_overlap,
            embedding_limit=embedding_limit,
            chunk_size_limit=chunk_size_limit,
            tokenizer=tokenizer,
        )
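
    # Illustrative usage (assuming `llm_predictor` is an existing
    # LLMPredictor); max_input_size and num_output are read from the
    # predictor's LLM metadata rather than passed explicitly:
    #   prompt_helper = PromptHelper.from_llm_predictor(
    #       llm_predictor, chunk_size_limit=512
    #   )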

    def get_chunk_size_given_prompt(
        self, prompt_text: str, num_chunks: int, padding: Optional[int] = 1
    ) -> int:
        """Get chunk size, making sure we can also fit the prompt in.

        Chunk size is computed as a function of the total input size,
        the prompt length, the number of outputs, and the number of chunks.
        If padding is specified, then we subtract that from the chunk size.
        By default we assume a padding of 1 (for the newline between chunks).

        Limit by embedding_limit and chunk_size_limit if specified.

        """
        prompt_tokens = self._tokenizer(prompt_text)
        num_prompt_tokens = len(prompt_tokens)

        # NOTE: if embedding_limit is specified, then chunk_size must not be
        # larger than embedding_limit
        result = (
            self.max_input_size - num_prompt_tokens - self.num_output
        ) // num_chunks
        if padding is not None:
            result -= padding

        if self.embedding_limit is not None:
            result = min(result, self.embedding_limit)
        if self.chunk_size_limit is not None and self.use_chunk_size_limit:
            result = min(result, self.chunk_size_limit)

        return result
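
    # Worked example (illustrative numbers, with no embedding or chunk size
    # limits set): given max_input_size=4096, num_output=256, a 100-token
    # prompt, num_chunks=2, and padding=1, the chunk size is
    # (4096 - 100 - 256) // 2 - 1 = 1869 tokens.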

    def _get_empty_prompt_txt(self, prompt: Prompt) -> str:
        """Get empty prompt text.

        Substitute empty strings in parts of the prompt that have
        not yet been filled out. Skip variables that have already
        been partially formatted. This is used to compute the initial tokens.

        """
        fmt_dict = {
            v: "" for v in prompt.input_variables if v not in prompt.partial_dict
        }
        # TODO: change later from llm=None
        empty_prompt_txt = prompt.format(llm=None, **fmt_dict)
        return empty_prompt_txt

    def get_biggest_prompt(self, prompts: List[Prompt]) -> Prompt:
        """Get biggest prompt.

        Oftentimes we need to fetch the biggest prompt, in order to
        be the most conservative about chunking text. This
        is a helper utility for that.

        """
        empty_prompt_txts = [self._get_empty_prompt_txt(prompt) for prompt in prompts]
        empty_prompt_txt_lens = [len(txt) for txt in empty_prompt_txts]
        biggest_prompt = prompts[
            empty_prompt_txt_lens.index(max(empty_prompt_txt_lens))
        ]
        return biggest_prompt

    def get_text_splitter_given_prompt(
        self, prompt: Prompt, num_chunks: int, padding: Optional[int] = 1
    ) -> TokenTextSplitter:
        """Get text splitter given initial prompt.

        Allows us to get the text splitter which will split up text according
        to the desired chunk size.

        """
        # generate empty_prompt_txt to compute initial tokens
        empty_prompt_txt = self._get_empty_prompt_txt(prompt)
        chunk_size = self.get_chunk_size_given_prompt(
            empty_prompt_txt, num_chunks, padding=padding
        )
        text_splitter = TokenTextSplitter(
            separator=self._separator,
            chunk_size=chunk_size,
            chunk_overlap=self.max_chunk_overlap // num_chunks,
            tokenizer=self._tokenizer,
        )
        return text_splitter
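
    # Note: max_chunk_overlap is spread across the requested chunks
    # (overlap // num_chunks), so asking for more chunks yields both smaller
    # chunks and smaller per-chunk overlap. Illustrative call (assuming
    # `prompt` is a Prompt and `long_text` is a string):
    #   splitter = prompt_helper.get_text_splitter_given_prompt(prompt, num_chunks=3)
    #   chunks = splitter.split_text(long_text)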

    def get_text_from_nodes(
        self, node_list: List[Node], prompt: Optional[Prompt] = None
    ) -> str:
        """Get text from nodes. Used by tree-structured indices."""
        num_nodes = len(node_list)
        text_splitter = None
        if prompt is not None:
            # add padding given the newline character
            text_splitter = self.get_text_splitter_given_prompt(
                prompt,
                num_nodes,
                padding=1,
            )
        results = []
        for node in node_list:
            text = (
                text_splitter.truncate_text(node.get_text())
                if text_splitter is not None
                else node.get_text()
            )
            results.append(text)
        return "\n".join(results)

    def get_numbered_text_from_nodes(
        self, node_list: List[Node], prompt: Optional[Prompt] = None
    ) -> str:
        """Get text from nodes in the format of a numbered list.

        Used by tree-structured indices.

        """
        num_nodes = len(node_list)
        text_splitter = None
        if prompt is not None:
            # add padding given the number, and the newlines
            text_splitter = self.get_text_splitter_given_prompt(
                prompt,
                num_nodes,
                padding=5,
            )
        results = []
        for number, node in enumerate(node_list, start=1):
            node_text = " ".join(node.get_text().splitlines())
            if text_splitter is not None:
                node_text = text_splitter.truncate_text(node_text)
            results.append(f"({number}) {node_text}")
        return "\n\n".join(results)

    def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
        """Compact text chunks.

        This will combine text chunks into consolidated chunks
        that more fully "pack" the prompt template given the max_input_size.

        """
        combined_str = "\n\n".join([c.strip() for c in text_chunks if c.strip()])
        # resplit based on self.max_chunk_overlap
        text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
        return text_splitter.split_text(combined_str)
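

# Minimal end-to-end sketch (illustrative only; `qa_prompt` is assumed to be an
# existing Prompt and `retrieved_chunks` a list of strings from a retriever):
#   prompt_helper = PromptHelper(
#       max_input_size=4096, num_output=256, max_chunk_overlap=200
#   )
#   packed_chunks = prompt_helper.compact_text_chunks(qa_prompt, retrieved_chunks)
#   # packed_chunks now holds as few chunks as possible, each sized so the
#   # filled prompt template still fits within the model's context window.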