"""Streamlit app that parses an uploaded PDF and answers questions about it.

The sidebar collects the LLM provider, model, retrieval settings and API keys,
configures the global LlamaIndex ``Settings`` accordingly, and parses the
uploaded PDF with LlamaParse. The main area shows the document and runs a
retrieval-augmented query against it.
"""
import os
import tempfile

import streamlit as st
import tiktoken
from llama_index.core import (
    Settings,
    VectorStoreIndex,
)
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_parse import LlamaParse
from streamlit_pdf_viewer import pdf_viewer

# Models offered in the sidebar for each provider. Hugging Face models are
# entered as free text, so its list is empty.
_LLM_CHOICES = {
    'google': ['gemini-1.0-pro', 'gemini-1.5-flash', 'gemini-1.5-pro'],
    'huggingface': [],
    'mistralai': ["mistral-large-latest", "open-mistral-nemo-latest"],
    'openai': ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini'],
}

# (max_output_tokens, context_window) for each supported OpenAI model.
# https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4
_OPENAI_LIMITS = {
    'gpt-3.5-turbo': (4096, 16385),
    'gpt-4': (8192, 8192),
    'gpt-4-turbo': (4096, 128000),
    'gpt-4o': (4096, 128000),
    'gpt-4o-mini': (16384, 128000),
}

_INTRO_MARKDOWN = """
# Introduction

This app builds a [retrieval-augmented generation](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) model
that let's you ask "talk" to your document, ask questions, summarize, and extract data. :clap:

The workflow relies on:
* [OpenAI](https://platform.openai.com/apps)
* [LlamaParse](https://cloud.llamaindex.ai/)
* [LlamaIndex](https://cloud.llamaindex.ai/)

:warning: This tool is provided "as-is" without warranty.

# Instructions

1. Obtain an [API Key](https://cloud.llamaindex.ai/api-key) from LlamaParse to parse your document.
2. Obtain a similar API Key from your preferred LLM provider. Note, if you are using [Hugging Face](https://huggingface.co/models) you may need to request access to a model if it is gated.
3. Make selections at the left and upload a document to use as context.
4. Begin asking questions below!
"""


class MistralTokens:
    """
    Returns tokens for MistralAI models.

    See: https://docs.mistral.ai/guides/tokenization/
    """

    def __init__(self, llm_name):
        """Select the tokenizer matching ``llm_name``."""
        # Imported lazily so the Mistral packages are only needed when this
        # provider is actually selected.
        from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

        if 'open-mistral-nemo' in llm_name:
            self.tokenizer = MistralTokenizer.v3(is_tekken=True)
        else:
            # This might work for all models, but their documentation is unclear.
            self.tokenizer = MistralTokenizer.from_model(llm_name)

    def __call__(self, input):  # `input` is kept: it is the public parameter name
        """Return the token indices of ``input`` as a list.

        A list is returned since LlamaIndex seems to count tokens by calling
        `len()` on the result of the tokenizer function.
        """
        from mistral_common.protocol.instruct.messages import UserMessage
        from mistral_common.protocol.instruct.request import ChatCompletionRequest

        request = ChatCompletionRequest(
            tools=[],
            messages=[UserMessage(content=input)],
        )
        return self.tokenizer.encode_chat_completion(request).tokens


class GeminiTokens:
    """
    Returns tokens for Gemini models.

    See: https://medium.com/google-cloud/counting-gemini-text-tokens-locally-with-the-vertex-ai-sdk-78979fea6244
    """

    def __init__(self, llm_name):
        """Load the local tokenizer for ``llm_name``."""
        from vertexai.preview import tokenization

        self.tokenizer = tokenization.get_tokenizer_for_model(llm_name)

    def __call__(self, input):  # `input` is kept: it is the public parameter name
        """Return every token of ``input`` in one flat list.

        A list is returned since LlamaIndex seems to count tokens by calling
        `len()` on the result of the tokenizer function.
        """
        tokens = []
        for token_info in self.tokenizer.compute_tokens(input).token_info_list:
            tokens.extend(token_info.tokens)
        return tokens


def _configure_google(llm_name, api_key, temperature):
    """Point the global LlamaIndex settings at a Gemini model."""
    from llama_index.embeddings.gemini import GeminiEmbedding
    from llama_index.llms.gemini import Gemini

    max_output_tokens = 8192  # https://firebase.google.com/docs/vertex-ai/gemini-models
    os.environ['GOOGLE_API_KEY'] = api_key
    # NOTE(review): the `token=` keyword is kept from the original call;
    # confirm against the Gemini constructor whether `api_key=` is intended.
    Settings.llm = Gemini(
        model=f"models/{llm_name}",
        token=api_key,
        temperature=temperature,
        max_tokens=max_output_tokens,
    )
    Settings.tokenizer = GeminiTokens(llm_name)
    Settings.num_output = max_output_tokens
    Settings.embed_model = GeminiEmbedding(
        model_name="models/text-embedding-004",
        api_key=api_key,
    )
    # Integer literals keep the resulting context window an int (not a float).
    total_token_limit = 32760 if llm_name == 'gemini-1.0-pro' else 1_000_000
    Settings.context_window = total_token_limit - max_output_tokens  # Gemini counts total tokens


def _configure_huggingface(llm_name, embed_name, api_key, temperature):
    """Point the global LlamaIndex settings at a Hugging Face inference model."""
    if llm_name is None or embed_name is None:
        return

    from llama_index.embeddings.huggingface import HuggingFaceInferenceAPIEmbedding
    from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
    from transformers import AutoTokenizer

    max_output_tokens = 2048  # Just a generic value
    os.environ['HF_TOKEN'] = api_key
    Settings.llm = HuggingFaceInferenceAPI(
        model_name=llm_name,
        token=api_key,
        temperature=temperature,
        max_tokens=max_output_tokens,
    )
    Settings.tokenizer = AutoTokenizer.from_pretrained(
        llm_name,
        token=api_key,
    )
    Settings.num_output = max_output_tokens
    Settings.embed_model = HuggingFaceInferenceAPIEmbedding(
        model_name=embed_name
    )
    Settings.context_window = 4096  # Just a generic value


def _configure_mistralai(llm_name, api_key, temperature):
    """Point the global LlamaIndex settings at a MistralAI model."""
    from llama_index.embeddings.mistralai import MistralAIEmbedding
    from llama_index.llms.mistralai import MistralAI

    max_output_tokens = 8192  # Based on internet consensus since this is not well documented
    os.environ['MISTRAL_API_KEY'] = api_key
    Settings.llm = MistralAI(
        model=llm_name,
        temperature=temperature,
        max_tokens=max_output_tokens,
        random_seed=42,
        safe_mode=True,
    )
    Settings.tokenizer = MistralTokens(llm_name)
    Settings.num_output = max_output_tokens
    Settings.embed_model = MistralAIEmbedding(
        model_name="mistral-embed",
        api_key=api_key,
    )
    Settings.context_window = 128000  # 128k for flagship models - doesn't seem to count input tokens


def _configure_openai(llm_name, api_key, temperature):
    """Point the global LlamaIndex settings at an OpenAI model."""
    from llama_index.embeddings.openai import OpenAIEmbedding
    from llama_index.llms.openai import OpenAI

    try:
        max_output_tokens, context_window = _OPENAI_LIMITS[llm_name]
    except KeyError:
        # Fail with a clear message instead of an unbound-variable error.
        raise NotImplementedError(f"{llm_name} is not supported yet") from None

    os.environ["OPENAI_API_KEY"] = api_key
    Settings.llm = OpenAI(
        model=llm_name,
        temperature=temperature,
        max_tokens=max_output_tokens,
    )
    Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
    Settings.num_output = max_output_tokens
    Settings.embed_model = OpenAIEmbedding()
    Settings.context_window = context_window


def _configure_llm(provider, llm_name, embed_name, llm_key, temperature):
    """Configure the global LLM, tokenizer and embedding model for ``provider``.

    Global tokenization needs to be consistent with the LLM for token counting.
    https://docs.llamaindex.ai/en/stable/module_guides/models/llms/

    Raises:
        NotImplementedError: if ``provider`` is not one of the supported ones.
    """
    api_key = str(llm_key)
    if provider == 'google':
        _configure_google(llm_name, api_key, temperature)
    elif provider == 'huggingface':
        _configure_huggingface(llm_name, embed_name, api_key, temperature)
    elif provider == 'mistralai':
        _configure_mistralai(llm_name, api_key, temperature)
    elif provider == 'openai':
        _configure_openai(llm_name, api_key, temperature)
    else:
        raise NotImplementedError(f"{provider} is not supported yet")


def _parse_pdf(uploaded_file, parse_key):
    """Write the uploaded PDF to a temporary file and parse it with LlamaParse.

    Returns whatever ``LlamaParse.load_data`` returns for the file.
    """
    parser = LlamaParse(
        api_key=parse_key,  # Can also be set in your env as LLAMA_CLOUD_API_KEY
        result_type="text",  # "markdown" and "text" are available
    )
    # The context manager removes the directory even when parsing raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() keeps the client-supplied name inside the temp directory.
        temp_filename = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
        with open(temp_filename, "wb") as f:
            f.write(uploaded_file.getvalue())
        return parser.load_data(temp_filename)


def _render_sidebar():
    """Draw the settings form, configure the LLM and parse the upload.

    Returns:
        A tuple ``(uploaded_file, parsed_document, similarity_top_k,
        similarity_cutoff)``; the first two are ``None`` when no file has been
        uploaded.
    """
    with st.sidebar:
        st.title('Document Summarization and QA System')

        with st.form(key="model_settings"):
            # Select Provider
            provider = st.selectbox(
                label="Select LLM Provider",
                options=['google', 'huggingface', 'mistralai', 'openai'],
                index=3
            )

            # Select LLM
            embed_name = None
            if provider == 'huggingface':
                llm_name = st.text_input(
                    "Enter LLM namespace/model-name",
                    value="HuggingFaceH4/zephyr-7b-alpha",
                )

                # Also give the user the option for different embedding models, too
                embed_name = st.text_input(
                    label="Enter embedding namespace/model-name",
                    value="BAAI/bge-small-en-v1.5",
                )
            else:
                llm_name = st.selectbox(
                    label="Select LLM Model",
                    options=_LLM_CHOICES.get(provider, []),
                    index=0
                )

            # Temperature
            temperature = st.slider(
                "Temperature",
                min_value=0.0,
                max_value=1.0,
                value=0.0,
                step=0.05,
            )

            similarity_top_k = st.number_input(
                "Top k nodes to retrieve (similarity_top_k)",
                min_value=1,
                max_value=100,
                value=5,
                step=1,
            )
            similarity_cutoff = st.slider(
                "Select node similarity cutoff",
                min_value=0.0,
                max_value=1.0,
                value=0.7,
            )

            # Enter Parsing API Key
            parse_key = st.text_input(
                "Enter your LlamaParse API Key",
                value=None
            )

            # Enter LLM API Key
            llm_key = st.text_input(
                "Enter your LLM provider API Key",
                value=None,
            )

            # Create LLM
            if llm_key is not None:
                _configure_llm(provider, llm_name, embed_name, llm_key, temperature)

            uploaded_file = st.file_uploader(
                "Choose a PDF file to upload",
                type=['pdf'],
                accept_multiple_files=False
            )

            parsed_document = None
            if uploaded_file is not None:
                parsed_document = _parse_pdf(uploaded_file, parse_key)

            st.form_submit_button(
                "Construct RAG"
            )

    return uploaded_file, parsed_document, similarity_top_k, similarity_cutoff


def main():
    """Render the app: sidebar settings, document viewer and the query area."""
    uploaded_file, parsed_document, similarity_top_k, similarity_cutoff = _render_sidebar()

    col1, col2 = st.columns(2)

    with col2:
        tab1, tab2 = st.tabs(["Uploaded File", "Parsed File",])

        with tab1:
            if uploaded_file is not None:
                # Display the pdf
                bytes_data = uploaded_file.getvalue()
                pdf_viewer(input=bytes_data, width=700)

        with tab2:
            if parsed_document is not None:
                # Show the raw parsing result
                st.write(parsed_document)

    with col1:
        st.markdown(_INTRO_MARKDOWN)

        st.divider()

        prompt_txt = 'You are a trusted scientific expert that only responds truthfully to inquiries. Summarize this document in a 3-5 sentences.'
        prompt = st.text_area(
            label="Enter your query.",
            key="prompt_widget",
            value=prompt_txt
        )

        run = st.button("Answer", type="primary")

        if parsed_document is not None and run:
            index = VectorStoreIndex.from_documents(parsed_document)
            # The cutoff is applied through a node postprocessor so that the
            # sidebar slider actually filters the retrieved nodes.
            # NOTE(review): a high cutoff can now remove every retrieved node.
            query_engine = index.as_query_engine(
                similarity_top_k=similarity_top_k,
                node_postprocessors=[
                    SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)
                ],
                response_mode='compact',
            )
            response = query_engine.query(prompt)
            st.write(response.response)


if __name__ == '__main__':
    # Global configurations
    # Optional tracing: `from llama_index.core import set_global_handler` and
    # `set_global_handler("langfuse")` (also add the API key for this if using).
    st.set_page_config(layout="wide")

    main()