import os
import pickle
from json import dumps, loads
import time
from typing import Any, List, Mapping, Optional

import numpy as np
import openai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Pipeline

# prompts
from assets.prompts import custom_prompts

# llama index
from llama_index.core import (
    VectorStoreIndex,
    PromptTemplate,
)
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    LLMMetadata,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage
from llama_index.core import Settings

load_dotenv()
# openai.api_key = os.getenv("OPENAI_API_KEY")

fs = HfFileSystem()

# define prompt helper
# set maximum input size
CONTEXT_WINDOW = 2048
# set number of output tokens
NUM_OUTPUT = 525
# set maximum chunk overlap
CHUNK_OVERLAP_RATIO = 0.2

ANSWER_FORMAT = """
Provide the answer to the user question in the following format:
[FORMAT]
Your answer to the user question above.
Reference:
The list of references (such as page number, title, chapter, section) to the specific sections of the documents that support your answer.
[END_FORMAT]
"""

# query engine templates
QUERY_ENGINE_QA_TEMPLATE = """
We have provided context information below:
[CONTEXT]
{context_str}
[END_CONTEXT]
Given this information, please answer the following question:
[QUESTION]
{query_str}
[END_QUESTION]
"""

QUERY_ENGINE_REFINE_TEMPLATE = """
The original query is as follows:
[QUESTION]
{query_str}
[END_QUESTION]
We have provided an existing answer:
[ANSWER]
{existing_answer}
[END_ANSWER]
We have the opportunity to refine the existing answer (only if needed) with some more
context below.
[CONTEXT]
{context_msg}
[END_CONTEXT]
Given the new context, refine the original answer to include more details like references \
to the specific sections of the documents that support your answer.
Refined Answer:
"""

CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE = """
The following is a friendly conversation between a user and an AI assistant.
The assistant is talkative and provides lots of specific details from its context.
If the assistant does not know the answer to a question, it truthfully says it
does not know.
Here are the relevant documents for the context:
{context_str}
Instruction: Based on the above documents, provide a detailed answer for the user question below. \
Include references to the specific sections of the documents that support your answer. \
Answer "don't know" if not present in the document.
"""

CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE = """
Given the following conversation between a user and an AI assistant and a follow-up question from the user,
rephrase the follow-up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""


def load_model(model_name: str):
    # llm_model_name = "bigscience/bloom-560m"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        # device=0,  # GPU device number
        # max_length=512,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
    )
    return pipe
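
# Illustrative usage (a sketch, kept as a comment so nothing runs at import time;
# "bigscience/bloom-560m" is the hypothetical checkpoint hinted at above):
#
#   text_gen = load_model("bigscience/bloom-560m")
#   print(text_gen("Hello, my name is", max_new_tokens=20)[0]["generated_text"])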


class OurLLM(CustomLLM):
    context_window: int = 3900
    num_output: int = 256
    model_name: str = ""
    pipeline: Optional[Pipeline] = None

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=CONTEXT_WINDOW,
            num_output=NUM_OUTPUT,
            model_name=self.model_name,
        )

    # The decorator is optional, but provides observability via callbacks on the LLM calls.
    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
        # only return newly generated tokens
        text = response[prompt_length:]
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any):
        # generate the full completion, then stream it back character by character
        full_text = self.complete(prompt, **kwargs).text
        response = ""
        for token in full_text:
            response += token
            yield CompletionResponse(text=response, delta=token)
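
# Quick standalone check of the custom LLM (a sketch, kept as a comment; same
# hypothetical checkpoint as above):
#
#   llm = OurLLM(model_name="bigscience/bloom-560m", pipeline=load_model("bigscience/bloom-560m"))
#   print(llm.complete("The capital of France is").text)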


class LlamaCustom:
    def __init__(self, model_name: str, index: VectorStoreIndex):
        self.model_name = model_name
        self.index = index
        self.chat_mode = "condense_plus_context"
        self.memory = ChatMemoryBuffer.from_defaults()
        self.verbose = True

    def get_response(self, query_str: str, chat_history: List[ChatMessage]):
        # https://docs.llamaindex.ai/en/stable/module_guides/deploying/chat_engines/
        # https://docs.llamaindex.ai/en/stable/examples/query_engine/citation_query_engine/
        # https://docs.llamaindex.ai/en/stable/examples/query_engine/knowledge_graph_rag_query_engine/
        query_engine = self.index.as_query_engine(
            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE + ANSWER_FORMAT),
            refine_template=PromptTemplate(
                QUERY_ENGINE_REFINE_TEMPLATE
            ),  # appending ANSWER_FORMAT here does not give the desired output; llama_index's output parsers may be needed instead
            verbose=self.verbose,
        )
        # chat_engine = self.index.as_chat_engine(
        #     chat_mode=self.chat_mode,
        #     memory=self.memory,
        #     context_prompt=CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE,
        #     condense_prompt=CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE,
        #     # verbose=True,
        # )
        response = query_engine.query(query_str)
        # response = chat_engine.chat(message=query_str, chat_history=chat_history)
        return str(response)

    def get_stream_response(self, query_str: str, chat_history: List[ChatMessage]):
        response = self.get_response(query_str=query_str, chat_history=chat_history)
        # simulate token streaming by yielding the answer word by word
        for word in response.split():
            yield word + " "
            time.sleep(0.05)
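

# Minimal end-to-end sketch (assumptions: a hypothetical "./docs" folder with source
# documents, and an embedding model available to llama_index; by default it falls
# back to OpenAI embeddings, which requires OPENAI_API_KEY).
if __name__ == "__main__":
    from llama_index.core import SimpleDirectoryReader

    demo_model = "bigscience/bloom-560m"  # hypothetical checkpoint, as hinted above
    Settings.llm = OurLLM(model_name=demo_model, pipeline=load_model(demo_model))

    documents = SimpleDirectoryReader("./docs").load_data()
    index = VectorStoreIndex.from_documents(documents)

    bot = LlamaCustom(model_name=demo_model, index=index)
    print(bot.get_response("What are these documents about?", chat_history=[]))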