"""Custom LlamaIndex LLM backed by a Hugging Face text-generation pipeline.

Defines the prompt templates, a cached pipeline loader, the ``OurLLM``
wrapper that exposes the pipeline through the LlamaIndex ``CustomLLM``
interface, and ``LlamaCustom``, which answers questions over a vector index.
"""

import os
import pickle
from json import dumps, loads
import time
from typing import Any, List, Mapping, Optional

import numpy as np
import openai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Pipeline

# prompts
from assets.prompts import custom_prompts

# llama index
from llama_index.core import (
    VectorStoreIndex,
    PromptTemplate,
)
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    LLMMetadata,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage
from llama_index.core import Settings

load_dotenv()
# openai.api_key = os.getenv("OPENAI_API_KEY")
fs = HfFileSystem()

# define prompt helper
# set maximum input size
CONTEXT_WINDOW = 2048
# set number of output tokens
NUM_OUTPUT = 525
# set maximum chunk overlap
# NOTE(review): the name looks like a typo for CHUNK_OVERLAP_RATIO; it is kept
# unchanged because other modules may import it.
CHUNK_OVERLAP_RATION = 0.2

ANSWER_FORMAT = """
Provide the answer to the user question in the following format:
[FORMAT]
Your answer to the user question above.
Reference:
    The list of references (such as page number, title, chapter, section) to the specific sections of the documents that support your answer.
[END_FORMAT]
"""

# query engine templates
QUERY_ENGINE_QA_TEMPLATE = """
We have provided context information below:
[CONTEXT]
{context_str}
[END_CONTEXT]

Given this information, please answer the following question:
[QUESTION]
{query_str}
[END_QUESTION]
"""

QUERY_ENGINE_REFINE_TEMPLATE = """
The original query is as follows:
[QUESTION]
{query_str}
[END_QUESTION]

We have providec an existing answer:
[ANSWER]
{existing_answer}
[END_ANSWER]

We have the opportunity to refine the existing answer (only if needed) with some more context below.
[CONTEXT]
{context_msg}
[END_CONTEXT]

Given the new context, refine the original answer to include more details like references \
to the specific sections of the documents that support your answer.

Refined Answer:
"""

CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE = """
The following is a friendly conversation between a user and an AI assistant.
The assistant is talkative and provides lots of specific details from its context.
If the assistant does not know the answer to a question, it truthfully says it
does not know.

Here are the relevant documents for the context:

{context_str}

Instruction: Based on the above documents, provide a detailed answer for the user question below. \
Include references to the specific sections of the documents that support your answer. \
Answer "don't know" if not present in the document.
"""

CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE = """
Given the following conversation between a user and an AI assistant and a follow up question from user,
rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""


@st.cache_resource
def load_model(model_name: str):
    """Build a sampling text-generation pipeline for ``model_name``.

    The tokenizer and causal-LM weights are loaded with the ``transformers``
    auto classes. The function is decorated with ``st.cache_resource``.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The ``transformers`` pipeline object created for the
        ``"text-generation"`` task.
    """
    # llm_model_name = "bigscience/bloom-560m"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): ``config`` is given as the string "T5Config" rather than a
    # config object, and T5 is not a causal-LM architecture -- confirm intent.
    model = AutoModelForCausalLM.from_pretrained(model_name, config="T5Config")

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        # device=0, # GPU device number
        # max_length=512,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
    )

    return pipe


class OurLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` that delegates generation to a HF pipeline."""

    # NOTE(review): ``metadata`` and generation below use the module constants
    # CONTEXT_WINDOW / NUM_OUTPUT, so these two field values are not consulted
    # anywhere in this class -- confirm which source of truth is intended.
    context_window: int = 3900
    num_output: int = 256
    model_name: str = ""
    # Callable text-generation pipeline (e.g. the object from ``load_model``).
    pipeline: Pipeline = None

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=CONTEXT_WINDOW,
            num_output=NUM_OUTPUT,
            model_name=self.model_name,
        )

    def _generate(self, prompt: str) -> str:
        """Run the pipeline on ``prompt`` and return the text after the prompt.

        Raises:
            ValueError: If no pipeline has been assigned to this instance.
        """
        if self.pipeline is None:
            raise ValueError("OurLLM.pipeline must be set before generating text.")

        generated = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0][
            "generated_text"
        ]
        # only return newly generated tokens; this relies on the generated
        # text starting with the prompt itself.
        return generated[len(prompt):]

    # The decorator is optional, but provides observability via callbacks on the LLM calls.
    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Return the completion for ``prompt`` as a single response.

        Extra keyword arguments are accepted for interface compatibility and
        are not forwarded to the pipeline.
        """
        return CompletionResponse(text=self._generate(prompt))

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any):
        """Yield the completion for ``prompt`` incrementally.

        The pipeline produces the whole completion in one call, so the text
        is generated first and then emitted one character at a time; each
        yielded response carries the accumulated text and the latest delta.
        """
        # Previously this iterated ``self.dummy_response``, an attribute that
        # is not defined on this class, so streaming always failed.
        text = self._generate(prompt)
        response = ""
        for token in text:
            response += token
            yield CompletionResponse(text=response, delta=token)


class LlamaCustom:
    """Answer user questions against a ``VectorStoreIndex``."""

    def __init__(self, model_name: str, index: VectorStoreIndex):
        """Store the model name and index and set up default chat settings."""
        self.model_name = model_name
        self.index = index
        self.chat_mode = "condense_plus_context"
        self.memory = ChatMemoryBuffer.from_defaults()
        self.verbose = True

    def get_response(self, query_str: str, chat_history: List[ChatMessage]):
        """Query the index and return the answer as a string.

        Args:
            query_str: The user question.
            chat_history: Accepted for the (currently disabled) chat-engine
                path; the query-engine path below does not use it.

        Returns:
            ``str()`` of the query engine's response.
        """
        # https://docs.llamaindex.ai/en/stable/module_guides/deploying/chat_engines/
        # https://docs.llamaindex.ai/en/stable/examples/query_engine/citation_query_engine/
        # https://docs.llamaindex.ai/en/stable/examples/query_engine/knowledge_graph_rag_query_engine/

        query_engine = self.index.as_query_engine(
            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE + ANSWER_FORMAT),
            refine_template=PromptTemplate(
                QUERY_ENGINE_REFINE_TEMPLATE
            ),  # passing ANSWER_FORMAT here will not give the desired output, need to use the output parser from llama index?
            verbose=self.verbose,
        )
        # chat_engine = self.index.as_chat_engine(
        #     chat_mode=self.chat_mode,
        #     memory=self.memory,
        #     context_prompt=CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE,
        #     condense_prompt=CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE,
        #     # verbose=True,
        # )

        response = query_engine.query(query_str)
        # response = chat_engine.chat(message=query_str, chat_history=chat_history)

        return str(response)

    def get_stream_response(self, query_str: str, chat_history: List[ChatMessage]):
        """Yield the answer word by word with a short pause between words.

        The full answer is obtained from ``get_response`` first; it is then
        split on whitespace and each word is yielded followed by a space.
        """
        response = self.get_response(query_str=query_str, chat_history=chat_history)
        for word in response.split():
            yield word + " "
            # Small delay so the consumer sees the words arrive gradually.
            time.sleep(0.05)