#%%
from tiktoken import get_encoding, encoding_for_model
from weaviate_interface import WeaviateClient, WhereFilter
from sentence_transformers import SentenceTransformer
from prompt_templates import question_answering_prompt_series, question_answering_system
from openai_interface import GPT_Turbo
from app_features import (convert_seconds, generate_prompt_series, search_result,
                          validate_token_threshold, load_content_cache, load_data,
                          expand_content)
from retrieval_evaluation import execute_evaluation, calc_hit_rate_scores
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from openai import BadRequestError
from reranker import ReRanker
from loguru import logger
import streamlit as st
from streamlit_option_menu import option_menu
import hydralit_components as hc
import sys
import json
import os, time, requests, re
from datetime import timedelta
import pathlib
import gdown
import tempfile
import base64
import shutil


def get_base64_of_bin_file(bin_file):
    with open(bin_file, 'rb') as file:
        data = file.read()
    return base64.b64encode(data).decode()


from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv('env'), override=True)

# I use a key that I increment each time I want to change a text_input
if 'key' not in st.session_state:
    st.session_state.key = 0
# key = st.session_state['key']

if not pathlib.Path('models').exists():
    os.mkdir('models')

# I should cache these things but no time left
# I put a file local.txt in my desktop models folder to find out if it's running online
we_are_online = not pathlib.Path("models/local.txt").exists()
we_are_not_online = not we_are_online

golden_dataset = EmbeddingQAFinetuneDataset.from_json("data/golden_100.json")

# shutil.rmtree("models/models")  # remove it - I wanted to clear the space on streamlit online

## PAGE CONFIGURATION
st.set_page_config(page_title="Ask Impact Theory",
                   page_icon="assets/impact-theory-logo-only.png",
                   layout="wide",
                   initial_sidebar_state="collapsed",
                   menu_items={'Report a bug': "https://www.extremelycoolapp.com/bug"})

image = "https://is2-ssl.mzstatic.com/image/thumb/Music122/v4/bd/34/82/bd348260-314c-5898-26c0-bef2e0388ebe/source/1200x1200bb.png"


def add_bg_from_local(image_file):
    bin_str = get_base64_of_bin_file(image_file)
    page_bg_img = f''' '''
    st.markdown(page_bg_img, unsafe_allow_html=True)


# COMMENT: I tried to create a dropdown menu but it's harder than it looks, so I gave up
# https://discuss.streamlit.io/t/streamlit-option-menu-is-a-simple-streamlit-component-that-allows-users-to-select-a-single-item-from-a-list-of-options-in-a-menu/20514
# not great, but it works
# selected = option_menu("About", ["Improvements", "This"],
#                        # "Main Menu", ["Home", 'Settings'],
#                        icons=['house', 'gear'],
#                        menu_icon="cast",
#                        default_index=1)

# Custom HTML/CSS for the banner
# base64_img = get_base64_of_bin_file("assets/it_tom_bilyeu.png")
# banner_menu_html = f"""
# """
# st.components.v1.html(banner_menu_html)

# specify the primary menu definition
# it gives a vertical menu inside a navigation bar !!!
# menu_data = [
#     {'icon': "far fa-copy", 'label': "Left End"},
#     {'id': 'Copy', 'icon': "šŸ™", 'label': "Copy"},
#     {'icon': "far fa-chart-bar", 'label': "Chart"},  # no tooltip message
#     {'icon': "far fa-address-book", 'label': "Book"},
#     {'id': ' Crazy return value šŸ’€', 'icon': "šŸ’€", 'label': "Calendar"},
#     {'icon': "far fa-clone", 'label': "Component"},
#     {'icon': "fas fa-tachometer-alt", 'label': "Dashboard", 'ttip': "I'm the Dashboard tooltip!"},  # can add a tooltip message
#     {'icon': "far fa-copy", 'label': "Right End"},
# ]
# # we can override any part of the primary colors of the menu
# over_theme = {'txc_inactive': '#FFFFFF', 'menu_background': 'red', 'txc_active': 'yellow', 'option_active': 'blue'}
# # over_theme = {'txc_inactive': '#FFFFFF'}
# menu_id = hc.nav_bar(menu_definition=menu_data,
#                      home_name='Home',
#                      override_theme=over_theme)
# # get the id of the menu item clicked
# st.info(f"{menu_id=}")

## RERANKER
reranker = ReRanker('cross-encoder/ms-marco-MiniLM-L-6-v2')

## ENCODING --> tiktoken library
model_ids = ['gpt-3.5-turbo-16k', 'gpt-3.5-turbo-0613']
model_nameGPT = model_ids[1]
encoding = encoding_for_model(model_nameGPT)  # i.e. the 'cl100k_base' encoding used by gpt-3.5-turbo

##############
data_path = './data/impact_theory_data.json'
cache_path = 'data/impact_theory_cache.parquet'
data = load_data(data_path)
cache = None  # load_content_cache(cache_path)

try:
    # st.write("Loading secrets from secrets.toml")
    Wapi_key = st.secrets['secrets']['WEAVIATE_API_KEY']
    url = st.secrets['secrets']['WEAVIATE_ENDPOINT']
    openai_api_key = st.secrets['secrets']['OPENAI_API_KEY']
    hf_token = st.secrets['secrets']['LLAMA2_ENDPOINT_HF_TOKEN_chris']
    hf_endpoint = st.secrets['secrets']['LLAMA2_ENDPOINT_UPLIMIT']
    # st.write("Secrets loaded from secrets.toml")
    # st.write("HF_TOKEN", hf_token)
except Exception:
    st.write("Loading secrets from environment variables")
    Wapi_key = os.environ['WEAVIATE_API_KEY']
    url = os.environ['WEAVIATE_ENDPOINT']
    openai_api_key = os.environ['OPENAI_API_KEY']
    hf_token = os.environ['LLAMA2_ENDPOINT_HF_TOKEN_chris']
    hf_endpoint = os.environ['LLAMA2_ENDPOINT_UPLIMIT']

#%%
# model_default = 'sentence-transformers/all-mpnet-base-v2'
model_default = 'models/finetuned-all-mpnet-base-v2-300' if we_are_not_online \
    else 'sentence-transformers/all-mpnet-base-v2'
available_models = ['sentence-transformers/all-mpnet-base-v2',
                    'sentence-transformers/all-MiniLM-L6-v2',
                    'models/finetuned-all-mpnet-base-v2-300']

#%%
models_urls = {'models/finetuned-all-mpnet-base-v2-300':
               "https://drive.google.com/drive/folders/1asJ37-AUv5nytLtH6hp6_bVV3_cZOXfj"}


def download_model_from_Gdrive(model_name_or_path, model_full_path):
    print("Downloading model from Google Drive")
    st.write("Downloading model from Google Drive")
    assert model_name_or_path in models_urls, f"Model {model_name_or_path} not found in models_urls"
    url = models_urls[model_name_or_path]
    gdown.download_folder(url, output=model_full_path, quiet=False, use_cookies=False)
    print("Model downloaded and saved to models folder")
    # st.write("Model downloaded")
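
# Hedged sketch (not called anywhere): a quick sanity check that a model folder produced by
# gdown or SentenceTransformer.save() looks complete before trying to load it. The file names
# below are typical of a SentenceTransformer save but are an assumption, not something this repo guarantees.
def model_folder_looks_complete(model_full_path):
    expected = ['config.json', 'modules.json']  # assumed marker files of a saved SentenceTransformer
    return all((pathlib.Path(model_full_path) / name).exists() for name in expected)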

def download_model(model_name_or_path, model_full_path):
    if model_name_or_path.startswith("models/"):
        download_model_from_Gdrive(model_name_or_path, model_full_path)
        print(f"Model {model_full_path} downloaded")
        models_urls[model_name_or_path] = model_full_path
        # st.sidebar.write(f"Model {model_full_path} downloaded")
    elif model_name_or_path.startswith("sentence-transformers/"):
        st.sidebar.write(f"Downloading Sentence Transformer model {model_name_or_path}")
        model = SentenceTransformer(model_name_or_path)  # HF looks into its own models folder/path
        models_urls[model_name_or_path] = model_full_path
        # st.sidebar.write(f"Model {model_name_or_path} downloaded")
        model.save(model_full_path)
        # st.sidebar.write(f"Model {model_name_or_path} saved to {model_full_path}")


# if 'modelspath' not in st.session_state:
#     st.session_state['modelspath'] = None
# if st.session_state.modelspath is None:
#     # let's create a temp folder on the first run
#     persistent_dir = pathlib.Path("path/to/persistent_dir")
#     persistent_dir.mkdir(parents=True, exist_ok=True)
#     with tempfile.TemporaryDirectory() as temp_dir:
#         st.session_state.modelspath = temp_dir
#         print(f"Temporary directory created at {temp_dir}")
#     # the temp folder disappears with the context, but not the one we've created manually
# else:
#     temp_dir = st.session_state.modelspath
#     print(f"Temporary directory already exists at {temp_dir}")
# # st.write(os.listdir(temp_dir))

#%%
# for streamlit online, we must download the model from google drive
# because github LFS doesn't work on forked repos
def check_model(model_name_or_path):
    model_path = pathlib.Path(model_name_or_path)
    model_full_path = str(pathlib.Path("models") / model_path)  # this creates a models folder inside /models
    model_full_path = model_full_path.replace("sentence-transformers/", "models/")  # all are saved in the models folder

    if pathlib.Path(model_full_path).exists():
        # let's use the model that's already there
        print(f"Model {model_full_path} already exists")
        # but delete everything else if we are online because
        # streamlit online has limited space (and will shut down the app if it's full)
        if we_are_online:
            # st.sidebar.write(f"Model {model_full_path} already exists")
            # st.sidebar.write(f"Deleting other models")
            dirs = os.listdir("models/models")  # we get only the folder names, not the full paths
            dirs.remove(model_full_path.split('/')[-1])
            for p in dirs:
                dirpath = pathlib.Path("models/models") / p
                if dirpath.is_dir():
                    shutil.rmtree(dirpath)
    else:
        if we_are_online:
            # space issues on streamlit online, let's not leave anything behind
            # and redownload the model every time
            print("Deleting models/models folder")
            if pathlib.Path('models/models').exists():
                shutil.rmtree("models/models")  # make room, if other models are there
                # st.sidebar.write(f"models/models folder deleted")
        download_model(model_name_or_path, model_full_path)

    return model_full_path


#%% instantiate Weaviate client
def get_weaviate_client(api_key, url, model_name_or_path, openai_api_key):
    client = WeaviateClient(api_key, url,
                            model_name_or_path=model_name_or_path,
                            openai_api_key=openai_api_key)
    client.display_properties.append('summary')
    available_classes = sorted(client.show_classes())
    # st.write(f"Available classes: {available_classes}")
    # st.write(f"Available classes type: {type(available_classes)}")
    logger.info(available_classes)
    return client, available_classes


##############
# data = load_data(data_path)
# guests list for the sidebar
guest_list = sorted(list(set([d['guest'] for d in data])))
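
# Conceptual sketch of how the 'Alpha' slider in the sidebar is commonly interpreted for Weaviate
# hybrid search: alpha=0 is pure keyword (BM25) scoring, alpha=1 is pure vector scoring, and values
# in between blend the two. This helper is illustrative only and is not used by the app; the actual
# fusion happens server-side in Weaviate, and the exact formula depends on the fusion mode.
def blended_hybrid_score(keyword_score, vector_score, alpha):
    return (1 - alpha) * keyword_score + alpha * vector_score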

def main():
    with st.sidebar:
        # moved it to the main area
        # guest = st.selectbox('Select Guest',
        #                      options=guest_list,
        #                      index=None,
        #                      placeholder='Select Guest')
        _, center, _ = st.columns([3, 5, 3])
        with center:
            st.text("Search Lab")
        _, center, _ = st.columns([2, 5, 3])
        with center:
            if we_are_online:
                st.text("Running ONLINE")
                st.text("(UNSTABLE)")
            else:
                st.text("Running OFFLINE")

        st.write("----------")
        alpha_input = st.slider(label='Alpha', min_value=0.00, max_value=1.00, value=0.40, step=0.05)
        retrieval_limit = st.slider(label='Hybrid Search Results', min_value=10, max_value=300, value=10, step=10)
        hybrid_filter = st.toggle('Filter Guest', True)  # i.e. look only at the selected guest's data
        rerank = st.toggle('Use Reranker', True)
        if rerank:
            reranker_topk = st.slider(label='Reranker Top K', min_value=1, max_value=5, value=3, step=1)
        else:
            # needed to not fill the LLM with too many responses (> context size)
            # we could make it dependent on the model
            reranker_topk = 3

        rag_it = st.toggle('RAG it', True)
        if rag_it:
            st.sidebar.write(f"Using LLM '{model_nameGPT}'")
            llm_temperature = st.slider(label='LLM T˚', min_value=0.0, max_value=2.0, value=0.01, step=0.10)

        model_name_or_path = st.selectbox(label='Model Name:',
                                          options=available_models,
                                          index=available_models.index(model_default),
                                          placeholder='Select Model')

        st.write("Experimental and time limited 2'")
        finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
        if finetune_model:
            from finetune_backend import finetune
            if 'finetuned' in model_name_or_path:
                st.write("Model already finetuned")
            elif model_name_or_path.startswith("models/"):
                st.write("Sentence Transformers models only!")
            else:
                try:
                    if 'finetuned' in model_name_or_path:
                        st.write("Model already finetuned")
                    else:
                        model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
                        if model_path is not None:
                            if model_name_or_path.split('/')[-1] not in model_path:
                                st.write(model_path)  # a warning from finetuning in this case
                            elif model_path not in available_models:
                                # finetuning generated a model, let's add it
                                available_models.append(model_path)
                                st.write("Model saved!")
                except Exception:
                    st.write("Model not found on HF or error")

        model_name_or_path = check_model(model_name_or_path)
        client, available_classes = get_weaviate_client(Wapi_key, url, model_name_or_path, openai_api_key)

        start_class = 'Impact_theory_all_mpnet_base_v2_finetuned'
        class_name = st.selectbox(
            label='Class Name:',
            options=available_classes,
            index=available_classes.index(start_class),
            placeholder='Select Class Name'
        )

        st.write("----------")
        c1, c2 = st.columns([8, 1])
        with c1:
            show_metrics = st.toggle('Show Metrics on Golden set', False)
            if show_metrics:
                # _, center, _ = st.columns([3, 5, 3])
                # with center:
                #     st.text("Metrics")
                with c2:
                    with st.spinner(''):
                        metrics = execute_evaluation(golden_dataset, class_name, client, alpha=alpha_input)

        if show_metrics:
            kw_hit_rate = metrics['kw_hit_rate']
            kw_mrr = metrics['kw_mrr']
            hybrid_hit_rate = metrics['hybrid_hit_rate']
            vector_hit_rate = metrics['vector_hit_rate']
            vector_mrr = metrics['vector_mrr']
            total_misses = metrics['total_misses']

            st.text(f"KW hit rate: {kw_hit_rate}")
            st.text(f"Vector hit rate: {vector_hit_rate}")
            st.text(f"Hybrid hit rate: {hybrid_hit_rate}")
            st.text(f"Vector MRR: {vector_mrr}")
            st.text(f"Total misses: {total_misses}")

        st.write("----------")

    st.title("Chat with the Impact Theory podcasts!")
    # st.image('./assets/impact-theory-logo.png', width=400)
    st.image('assets/it_tom_bilyeu.png', use_column_width=True)
    # st.subheader(f"Chat with the Impact Theory podcast: ")
    st.write('\n')
    # st.stop()
    st.write("\u21D0 Open the sidebar to change Search settings \n ")  # https://home.unicode.org also 21E0, 21B0 B2 D0

    guest = st.selectbox('Select A Guest',
                         options=guest_list,
                         index=None,
                         placeholder='Select Guest')

    col1, col2 = st.columns([7, 3])
    with col1:
        if guest is None:
            msg = 'Select a guest before asking your question:'
        else:
            msg = f'Enter your question about {guest}:'
        textbox = st.empty()  # best solution I found to be able to change the text inside a text_input box afterwards, using a key
        query = textbox.text_input(msg, value="",
                                   placeholder="You can refer to the guest with pronoun or drop the question mark",
                                   key=st.session_state.key)
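        # Once a query is submitted, the flow below is:
        #   1) reword_query() asks Llama-2 (with a GPT-3.5 fallback) to inject the guest's name into the question,
        #   2) client.hybrid_search() retrieves candidates from Weaviate, filtered on the guest when enabled,
        #   3) the cross-encoder reranker optionally keeps only the top-k passages,
        #   4) expand_content() expands those passages from the content cache (when one is loaded),
        #   5) validate_token_threshold() trims the context to fit the LLM window before the RAG call.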
        # st.write(f"Guest = {guest}")
        # st.write(f"key = {st.session_state.key}")
        st.write('\n\n\n\n\n')

    reworded_query = {'changed': False, 'status': 'error'}  # at start, the query is empty
    valid_response = []  # at start, the query is empty, so prevent the search
    if query:
        if guest is None:
            st.session_state.key += 1
            query = textbox.text_input(msg, value="",
                                       placeholder="YOU MUST SELECT A GUEST BEFORE ASKING A QUESTION",
                                       key=st.session_state.key)
            # st.write(f"key = {st.session_state.key}")
            st.stop()
        else:
            # st.write(f'It looks like you selected {guest} as a filter (It is ignored for now).')
            with col2:
                # let's add a nice pulse bar while generating the response
                with hc.HyLoader('', hc.Loaders.pulse_bars, primary_color='red', height=50):  # "#0e404d" for image green
                    # with st.spinner('Generating Response...'):
                    with col1:
                        # let's use Llama2 here
                        reworded_query = reword_query(query, guest, model_name='llama2-13b-chat')
                        query = reworded_query['rewritten_question']

                        # we can arrive here only if a guest was selected
                        where_filter = WhereFilter(path=['guest'], operator='Equal', valueText=guest).todict() \
                            if hybrid_filter else None
                        hybrid_response = client.hybrid_search(query,
                                                               class_name,
                                                               # properties=['content'],  # ['title', 'summary', 'content'],
                                                               alpha=alpha_input,
                                                               display_properties=client.display_properties,
                                                               where_filter=where_filter,
                                                               limit=retrieval_limit)
                        response = hybrid_response

                        if rerank:
                            # rerank results with the cross encoder
                            ranked_response = reranker.rerank(response, query,
                                                              apply_sigmoid=True,  # score between 0 and 1
                                                              top_k=reranker_topk)
                            logger.info(ranked_response)
                            expanded_response = expand_content(ranked_response, cache,
                                                               content_key='doc_id',
                                                               create_new_list=True)
                            response = expanded_response

                        # make sure token count < threshold
                        token_threshold = 8000 if model_nameGPT == model_ids[0] else 3500
                        valid_response = validate_token_threshold(response,
                                                                  question_answering_prompt_series,
                                                                  query=query,
                                                                  tokenizer=encoding,  # variable from ENCODING above
                                                                  token_threshold=token_threshold,
                                                                  verbose=True)
                        # st.write(f"Number of results: {len(valid_response)}")

    # I jump out of col1 to get all page width, so need to retest query
    if query is not None and reworded_query['status'] != 'error':
        show_query = st.toggle('Show rewritten query', False)
        if show_query:  # or reworded_query['changed']:
            st.write(f"Rewritten query: {query}")

        # creates a container for the LLM response to position it above the search results
        chat_container, response_box = [], st.empty()
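        # The RAG answer below is streamed: each chunk's delta is appended to chat_container and the
        # joined text is re-rendered in response_box, so the answer grows in place above the search
        # results. If the context overflows the model window (BadRequestError), the context is
        # re-validated against a smaller 3500-token budget (and truncated to top-k when the reranker
        # is off) before the call is retried.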
        # RAG time !! execute chat call to LLM
        if rag_it:
            # st.subheader("Response from Impact Theory (context)")
            # ^ will appear under the answer, moved it into the response box

            # generate LLM prompt
            prompt = generate_prompt_series(query=query, results=valid_response)
            GPTllm = GPT_Turbo(model=model_nameGPT,
                               api_key=st.secrets['secrets']['OPENAI_API_KEY'])
            try:
                # inserts the chat stream from the LLM
                for resp in GPTllm.get_chat_completion(prompt=prompt,
                                                       temperature=llm_temperature,
                                                       max_tokens=350,
                                                       show_response=True,
                                                       stream=True):
                    with response_box:
                        content = resp.choices[0].delta.content
                        if content:
                            chat_container.append(content)
                            result = "".join(chat_container).strip()
                            response_box.markdown(f"### Response from Impact Theory (RAG):\n\n{result}")
            except BadRequestError:
                logger.info('Making request with smaller context')
                valid_response = validate_token_threshold(response,
                                                          question_answering_prompt_series,
                                                          query=query,
                                                          tokenizer=encoding,
                                                          token_threshold=3500,
                                                          verbose=True)
                # if the reranker is off, we may receive a LOT of responses,
                # so we must reduce the context size manually
                if not rerank:
                    valid_response = valid_response[:reranker_topk]
                prompt = generate_prompt_series(query=query, results=valid_response)
                for resp in GPTllm.get_chat_completion(prompt=prompt,
                                                       temperature=llm_temperature,
                                                       max_tokens=350,  # expand for more verbose answers
                                                       show_response=True,
                                                       stream=True):
                    try:
                        # inserts the chat stream from the LLM
                        with response_box:
                            content = resp.choices[0].delta.content
                            if content:
                                chat_container.append(content)
                                result = "".join(chat_container).strip()
                                response_box.markdown(f"### Response from Impact Theory (RAG):\n\n{result}")
                    except Exception as e:
                        print(e)

        st.markdown("----")
        st.subheader("Search Results")
        for i, hit in enumerate(valid_response):
            col1, col2 = st.columns([7, 3], gap='large')
            image = hit['thumbnail_url']      # get thumbnail_url
            episode_url = hit['episode_url']  # get episode_url
            title = hit["title"]              # get title
            show_length = hit["length"]       # get length
            time_string = str(timedelta(seconds=show_length))  # convert show_length to a readable time string
            with col1:
                st.write(search_result(i=i,
                                       url=episode_url,
                                       guest=hit['guest'],
                                       title=title,
                                       content='',
                                       length=time_string),
                         unsafe_allow_html=True)
                st.write('\n\n')
            with col2:
                # st.write(f"", unsafe_allow_html=True)
                # st.markdown(f"[![{title}]({image})]({episode_url})")
                # st.markdown(f''
                #             f''
                #             f'',
                #             unsafe_allow_html=True)
                st.image(image, caption=title.split('|')[0], width=200, use_column_width=False)
            # let's use all the width for the content
            st.write(hit['content'])


def get_answer(query, valid_response, GPTllm):
    # generate LLM prompt
    prompt = generate_prompt_series(query=query, results=valid_response)
    return GPTllm.get_chat_completion(prompt=prompt,
                                      system_message='answer this question based on the podcast material',
                                      temperature=0,
                                      max_tokens=500,
                                      stream=False,
                                      show_response=False)
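
# Expected return shape of reword_query (defined below), inferred from its parsing code;
# the guest and question in this example are hypothetical, for illustration only:
#   reword_query("What did she say about discipline", "Jane Doe")
#   # -> {'original_question': 'What did she say about discipline',
#   #     'rewritten_question': 'What did Jane Doe say about discipline?',
#   #     'changed': True, 'status': 'success'}
# On an endpoint error or unparsable output, it falls back to the OpenAI path, which returns
# at least the 'rewritten_question', 'changed' and 'status' keys.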
def reword_query(query, guest, model_name='llama2-13b-chat', response_processing=True):
    """Asks the LLM to rewrite the query when the guest name is missing.

    Args:
        query (str): user query
        guest (str): guest name
        model_name (str, optional): name of the LLM model to be used
        response_processing (bool, optional): if True, parse the raw endpoint output into a dict
    """
    # tags = {'llama2-13b-chat':    {'start': '<s>', 'end': '</s>', 'instruction': '[INST]', 'system': '<<SYS>>'},
    #         'gpt-3.5-turbo-0613': {'start': '<|startoftext|>', 'end': '', 'instruction': "```", 'system': "```"}}

    prompt_fields = {
        "you_are": "You are an expert in linguistics and semantics, analyzing the question asked by a user "
                   "to a vector search system, and making sure that the question is well formulated "
                   "and that the system can understand it.",
        "your_task": f"Your task is to detect if the name of the guest ({guest}) is mentioned in the user's question, "
                     "and if that is not the case, rewrite the question using the guest name, "
                     "without changing the meaning of the question. "
                     "Most of the time, the user will have used a pronoun to designate the guest, in which case, "
                     "simply replace the pronoun with the guest name.",
        "question": f"If the user mentions the guest name, i.e. {query}, just return his question as is. "
                    "If the user does not mention the guest name, rewrite the question using the guest name.",
        "final_instruction": "Only regenerate the requested rewritten question or the original, WITHOUT ANY COMMENT OR REPHRASING. "
                             "Your answer must be as close as possible to the original question, "
                             f"and exactly identical, word for word, if the user mentions the guest name, i.e. {guest}.",
    }

    # prompt created by chatGPT :-)
    # ... and Llama still outputs the original question and precedes the answer with 'rewritten question'
    prompt_fields2 = {
        "you_are": (
            "You are an expert in linguistics and semantics. "
            "Your role is to analyze questions asked to a vector search system."
        ),
        "your_task": (
            f"Detect if the guest's FULL name, {guest}, is mentioned in the user's question. "
            "If not, rewrite the question by replacing pronouns or indirect references with the guest's name. "
            "If yes, return the original question as is, without any change at all, not even punctuation, "
            "except a question mark that you MUST add if it's missing."
        ),
        "question": (
            f"Original question: '{query}'. "
            "Rewrite this question to include the guest's FULL name if it's not already mentioned. "
            "The only thing you can and MUST add is a question mark if it's missing."
        ),
        "final_instruction": (
            "Create a rewritten question or keep the original question as is. "
            "Do not include any labels, titles, or additional text before or after the question. "
            "The only thing you can and MUST add is a question mark if it's missing. "
            "Return a json object, with the key 'original_question' for the original question, "
            "'rewritten_question' for the rewritten question, "
            "and 'changed' being True if you changed the answer, otherwise False."
        ),
    }
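    # Note: prompt_fields2 (the stricter, JSON-returning variant) feeds the Llama-2 prompt below,
    # while the original prompt_fields is kept for the OpenAI fallback branch further down.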
    if model_name == 'llama2-13b-chat':
        # special tags are used:
        #   `<s>`                 - start prompt tag
        #   `[INST]`, `[/INST]`   - opening and closing model instruction tags
        #   `<<SYS>>`, `<</SYS>>` - opening and closing system prompt tags
        llama_prompt = """
        <s>[INST] <<SYS>>
        {you_are}
        <</SYS>>

        {your_task}\n
        ```
        \n\n
        Question: {question}\n
        {final_instruction} [/INST]

        Answer:
        """
        prompt = llama_prompt.format(**prompt_fields2)

        hf_token = st.secrets['secrets']['LLAMA2_ENDPOINT_HF_TOKEN_chris']
        # hf_token = st.secrets['secrets']['LLAMA2_ENDPOINT_HF_TOKEN']
        hf_endpoint = st.secrets['secrets']['LLAMA2_ENDPOINT_UPLIMIT']
        headers = {"Authorization": f"Bearer {hf_token}",
                   "Content-Type": "application/json"}
        json_body = {
            "inputs": prompt,
            "parameters": {"max_new_tokens": 400, "repetition_penalty": 1.0, "temperature": 0.01}
        }
        response = requests.request("POST", hf_endpoint, headers=headers, data=json.dumps(json_body))
        response = json.loads(response.content.decode("utf-8"))
        # ^ json.loads will not process the badly formatted generated text, so we do the parsing ourselves below

        if isinstance(response, dict) and 'error' in response:
            print("Found error")
            print(response)
            # return {'error': response['error'], 'rewritten_question': query, 'changed': False, 'status': 'error'}
            # I test this here otherwise it gets in col 2 or 1, which are too narrow
            # if reworded_query['status'] == 'error':
            #     st.write(f"Error in LLM response: 'error':{reworded_query['error']}")
            #     st.write("The LLM could not connect to the server. Please try again later.")
            #     st.stop()
            return reword_query(query, guest, model_name='gpt-3.5-turbo-0613')

        if response_processing:
            if isinstance(response, list) and isinstance(response[0], dict) and 'generated_text' in response[0]:
                print("Found generated text")
                response0 = response[0]['generated_text']
                pattern = r'\"(\w+)\":\s*(\".*?\"|\w+)'
                matches = re.findall(pattern, response0)
                # let's build a dictionary from the key/value pairs found in the generated text
                result = {key: json.loads(value) if value.startswith("\"") else value
                          for key, value in matches}
                return result | {'status': 'success'}
            else:
                print("Found no answer")
                return reword_query(query, guest, model_name='gpt-3.5-turbo-0613')
                # return {'original_question': query, 'rewritten_question': query, 'changed': False,
                #         'status': 'no properly formatted answer'}
        else:
            return response

        # return response
        # assert 'error' not in response, f"Error in LLM response: {response['error']}"
        # assert 'generated_text' in response[0], f"Error in LLM response: {response}, no 'generated_text' field"
        # # let's extract the rewritten question
        # return response[0]['generated_text'].split("Rewritten question: '")[-1][:-1]

    else:
        # assume an OpenAI model
        model_ids = ['gpt-3.5-turbo-16k', 'gpt-3.5-turbo-0613']
        model_name = model_ids[1]
        GPTllm = GPT_Turbo(model=model_name,
                           api_key=st.secrets['secrets']['OPENAI_API_KEY'])

        openai_prompt = """
        {your_task}\n
        ```
        \n\n
        Question: {question}\n
        {final_instruction}

        Answer:
        """
        prompt = openai_prompt.format(**prompt_fields)

        try:
            resp = GPTllm.get_chat_completion(prompt=prompt,
                                              system_message=prompt_fields['you_are'],
                                              temperature=0.01,
                                              max_tokens=1500,  # it's a question...
                                              show_response=True,
                                              stream=False)
            # non-streaming call: the generated text is in message.content
            return {'rewritten_question': resp.choices[0].message.content,
                    'changed': True, 'status': 'success'}
        except Exception:
            return {'rewritten_question': query, 'changed': False, 'status': 'not success'}


if __name__ == '__main__':
    main()

# %%