Spaces:
Sleeping
updated app
Browse files
- app.py +31 -24
- app.shell.py +143 -0
- app_features.py +179 -0
- debug.py +23 -0
- prompt_templates_luis.py +0 -63
app.py
CHANGED
@@ -316,27 +316,30 @@ def main():
 
     st.write("Experimental and time limited 2'")
     finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
+    if we_are_not_online:
+        if finetune_model:
+            from finetune_backend import finetune
+            if 'finetuned' in model_name_or_path:
+                st.write("Model already finetuned")
+            elif model_name_or_path.startswith("models/"):
+                st.write("Sentence Transformers models only!")
+            else:
+                try:
+                    if 'finetuned' in finetune_model:
+                        st.write("Model already finetuned")
+                    else:
+                        model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
+                        if model_path is not None:
+                            if finetune_model.split('/')[-1] not in model_path:
+                                st.write(model_path)  # a warning from finetuning in this case
+                            elif model_path not in available_models:
+                                # finetuning generated a model, let's add it
+                                available_models.append(model_path)
+                                st.write("Model saved!")
+                except Exception:
+                    st.write("Model not found on HF or error")
+    else:
+        st.write("Finetuning not available on Streamlit online because of space limitations")
 
     model_name_or_path = check_model(model_name_or_path)
    client, available_classes = get_weaviate_client(Wapi_key, url, model_name_or_path, openai_api_key)

@@ -404,7 +407,7 @@ def main():
     # best solution I found to be able to change the text inside a text_input box afterwards, using a key
     query = textbox.text_input(msg,
                                value="",
-                               placeholder="You can refer to the guest with
+                               placeholder="You can refer to the guest with PRONOUNS",
                                key=st.session_state.key)
 
     # st.write(f"Guest = {guest}")

@@ -438,8 +441,12 @@ def main():
     # let's use Llama2 here
     reworded_query = reword_query(query, guest,
                                   model_name='llama2-13b-chat')
+    new_query = reworded_query['rewritten_question']
+    if guest.split(' ')[1] not in new_query and guest.split(' ')[0] not in new_query:
+        # if the guest name is not in the rewritten question, we add it
+        new_query = f"About {guest}, " + new_query.lowerleft()
+    query = new_query
+
     # we can arrive here only if a guest was selected
     where_filter = WhereFilter(path=['guest'], operator='Equal', valueText=guest).todict() \
         if hybrid_filter else None
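The comment in the second hunk ("best solution I found to be able to change the text inside a text_input box afterwards, using a key") refers to the usual Streamlit workaround: store the widget key in st.session_state and swap the key whenever the text needs to change. A minimal sketch of the idea, not part of this commit and using a hypothetical Clear button:

import streamlit as st

# Keep a counter in session state and use it as the widget key.
if 'key' not in st.session_state:
    st.session_state.key = 0

# The text_input is tied to the current key.
query = st.text_input('Enter your question:', value="", key=st.session_state.key)

# Changing the key forces Streamlit to rebuild the widget on the next run,
# which is how the box can be emptied (or pre-filled) after it was drawn.
if st.button('Clear'):
    st.session_state.key += 1
    st.rerun()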
app.shell.py
ADDED
@@ -0,0 +1,143 @@
from tiktoken import get_encoding
from weaviate_interface import WeaviateClient
from prompt_templates import question_answering_prompt_series, question_answering_system
from openai_interface import GPT_Turbo
from app_features import (convert_seconds, generate_prompt_series, search_result,
                          validate_token_threshold, load_content_cache, load_data)
from reranker import ReRanker
from loguru import logger
import streamlit as st
import sys
import json
import os

# load environment variables
from dotenv import load_dotenv
load_dotenv('.env', override=True)

## PAGE CONFIGURATION
st.set_page_config(page_title="Impact Theory",
                   page_icon=None,
                   layout="wide",
                   initial_sidebar_state="auto",
                   menu_items=None)
##############
# START CODE #
##############
data_path = 'data/impact_theory_data.json'
cache_path = 'data/impact_theory_cache.parquet'
data = load_data(data_path)
cache = load_content_cache(cache_path)
## RETRIEVER
client.display_properties.append('summary')
## RERANKER

## LLM

## ENCODING

## INDEX NAME

##############
# END CODE   #
##############
data = load_data(data_path)
#creates list of guests for sidebar
guest_list = sorted(list(set([d['guest'] for d in data])))

def main():

    with st.sidebar:
        guest = st.selectbox('Select Guest', options=guest_list, index=None, placeholder='Select Guest')

    st.image('./assets/impact-theory-logo.png', width=400)
    st.subheader(f"Chat with the Impact Theory podcast: ")
    st.write('\n')
    col1, _ = st.columns([7,3])
    with col1:
        query = st.text_input('Enter your question: ')
        st.write('\n\n\n\n\n')

    if query:
        ##############
        # START CODE #
        ##############

        st.write('Hmmm...this app does not seem to be working yet. Please check back later.')
        if guest:
            st.write(f'However, it looks like you selected {guest} as a filter.')
        # make hybrid call to weaviate
        hybrid_response = None
        # rerank results
        ranked_response = None
        # validate token count is below threshold
        # valid_response = validate_token_threshold(ranked_response,
        #                                           question_answering_prompt_series,
        #                                           query=query,
        #                                           tokenizer= # variable from ENCODING,
        #                                           token_threshold=4000,
        #                                           verbose=True)
        ##############
        # END CODE #
        ##############

        # # generate LLM prompt
        # prompt = generate_prompt_series(query=query, results=valid_response)

        # # prep for streaming response
        # st.subheader("Response from Impact Theory (context)")
        # with st.spinner('Generating Response...'):
        #     st.markdown("----")
        #     #creates container for LLM response
        #     chat_container, response_box = [], st.empty()
        #
        #     # execute chat call to LLM
        #     ##############
        #     # START CODE #
        #     ##############
        #
        #     ##############
        #     # END CODE #
        #     ##############
        #     try:
        #         #inserts chat stream from LLM
        #         with response_box:
        #             content = resp.choices[0].delta.content
        #             if content:
        #                 chat_container.append(content)
        #                 result = "".join(chat_container).strip()
        #                 st.write(f'{result}')
        #     except Exception as e:
        #         print(e)
        #         continue
        # ##############
        # # START CODE #
        # ##############
        # st.subheader("Search Results")
        # for i, hit in enumerate(valid_response):
        #     col1, col2 = st.columns([7, 3], gap='large')
        #     image = # get thumbnail_url
        #     episode_url = # get episode_url
        #     title = # get title
        #     show_length = # get length
        #     time_string = # convert show_length to readable time string
        #     ##############
        #     # END CODE #
        #     ##############
        #     with col1:
        #         st.write( search_result( i=i,
        #                                  url=episode_url,
        #                                  guest=hit['guest'],
        #                                  title=title,
        #                                  content=hit['content'],
        #                                  length=time_string),
        #                   unsafe_allow_html=True)
        #         st.write('\n\n')
        #     with col2:
        #         # st.write(f"<a href={episode_url} <img src={image} width='200'></a>",
        #         #          unsafe_allow_html=True)
        #         st.image(image, caption=title.split('|')[0], width=200, use_column_width=False)

if __name__ == '__main__':
    main()
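The shell leaves the ## RETRIEVER / ## RERANKER / ## LLM / ## ENCODING blocks for the student to fill in. As one hedged illustration, the ## ENCODING slot can reuse the get_encoding import already at the top of the file; the cl100k_base choice is an assumption, not something this commit pins down:

## ENCODING (illustrative only; encoding name is an assumption)
encoding = get_encoding('cl100k_base')  # tiktoken encoding later passed to validate_token_threshold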
app_features.py
ADDED
@@ -0,0 +1,179 @@
import time
import json
from preprocessing import FileIO
from typing import List, Optional
import tiktoken
from loguru import logger
from prompt_templates import context_block, question_answering_prompt_series
import streamlit as st

@st.cache_data
def load_content_cache(data_path: str):
    data = FileIO().load_parquet(data_path)
    content_data = {d['doc_id']: d['content'] for d in data}
    return content_data

@st.cache_data
def load_data(data_path: str):
    with open(data_path, 'r') as f:
        data = json.load(f)
    return data

def convert_seconds(seconds: int):
    """
    Converts seconds to a string of format Hours:Minutes:Seconds
    """
    return time.strftime("%H:%M:%S", time.gmtime(seconds))

def expand_content(ranked_results: List[dict],
                   content_cache: Optional[dict] = None,
                   content_key: str = 'doc_id',
                   create_new_list: bool = False
                   ) -> List[dict]:
    '''
    Updates or creates a list of ranked results with content from a cache.

    This function iterates over a list of dictionaries representing ranked results.
    If a cache is provided, it adds or updates the 'content' key in each dictionary
    with the corresponding content from the cache based on the content_key.

    Args:
    - ranked_results (List[dict]): A list of dictionaries, each representing a ranked result.
    - content_cache (Optional[dict]): A dictionary that maps content_key to content.
      If None, the content of ranked results will not be updated.
    - content_key (str): The key used in both the ranked results and content cache to match
      the ranked results with their corresponding content in the cache.
    - create_new_list (bool): If True, a new list of dictionaries will be created and
      returned with the content updated. If False, the ranked_results will be updated in place.

    Returns:
    - List[dict]: A new list with updated content if create_new_list is True; otherwise,
      the original ranked_results list with updated content.

    Note:
    - If create_new_list is False, the function will mutate the original ranked_results list.
    - The function only updates content if the content_key exists in both the ranked result
      and the content cache.

    Example:
    ```
    ranked_results = [{'doc_id': '123', 'title': 'Title 1'}, {'doc_id': '456', 'title': 'Title 2'}]
    content_cache = {'123': 'Content for 123', '456': 'Content for 456'}
    updated_results = expand_content(ranked_results, content_cache, create_new_list=True)
    # updated_results is now [{'doc_id': '123', 'title': 'Title 1', 'content': 'Content for 123'},
    #                         {'doc_id': '456', 'title': 'Title 2', 'content': 'Content for 456'}]
    ```
    '''
    if create_new_list:
        expanded_response = [{k: v for k, v in resp.items()} for resp in ranked_results]
        if content_cache is not None:
            for resp in expanded_response:
                if resp[content_key] in content_cache:
                    resp['content'] = content_cache[resp[content_key]]
        return expanded_response
    else:
        for resp in ranked_results:
            if content_cache and resp[content_key] in content_cache:
                resp['content'] = content_cache[resp[content_key]]
        return ranked_results


def generate_prompt_series(query: str, results: List[dict]) -> str:
    """
    Generates a prompt for the OpenAI API by joining the context blocks of the top results.
    Provides context to the LLM by supplying the summary, guest, and retrieved content of each result.
    You MUST make it easily readable, i.e. add newlines and indentation to create well-separated paragraphs.

    Args:
    -----
    query : str
        User query
    results : List[dict]
        List of results from the Weaviate client
    """
    context_series = '\n'.join([context_block.format(summary=res['summary'],
                                                     guest=res['guest'],
                                                     transcript=res['content']) for res in results]).strip()

    prompt = question_answering_prompt_series.format(question=query, series=context_series)
    return prompt

def validate_token_threshold(ranked_results: List[dict],
                             base_prompt: str,
                             query: str,
                             tokenizer: tiktoken.Encoding,
                             token_threshold: int,
                             verbose: bool = False
                             ) -> List[dict]:
    """
    Validates that prompt is below the set token threshold by adding lengths of:
        1. Base prompt
        2. User query
        3. Context material
    If threshold is exceeded, context results are reduced incrementally until the
    combined prompt tokens are below the threshold. This function does not take into
    account every token passed to the LLM, but it is a good approximation.
    """
    overhead_len = len(tokenizer.encode(base_prompt.format(question=query, series='')))
    context_len = _get_batch_length(ranked_results, tokenizer)

    token_count = overhead_len + context_len
    if token_count > token_threshold:
        print('Token count exceeds token count threshold, reducing size of returned results below token threshold')

        while token_count > token_threshold and len(ranked_results) > 1:
            num_results = len(ranked_results)

            # remove the last ranked (most irrelevant) result
            ranked_results = ranked_results[:num_results-1]
            # recalculate new token_count
            token_count = overhead_len + _get_batch_length(ranked_results, tokenizer)

    if verbose:
        logger.info(f'Total Final Token Count: {token_count}')
    return ranked_results

def _get_batch_length(ranked_results: List[dict], tokenizer: tiktoken.Encoding) -> int:
    '''
    Convenience function to get the length in tokens of a batch of results
    '''
    contexts = tokenizer.encode_batch([r['content'] for r in ranked_results])
    context_len = sum(list(map(len, contexts)))
    return context_len

def search_result(i: int,
                  url: str,
                  title: str,
                  content: str,
                  guest: str,
                  length: str,
                  space: str = '  '
                  ) -> str:

    '''
    HTML to display search results.

    Args:
    -----
    i: int
        index of search result
    url: str
        url of YouTube video
    title: str
        title of episode
    content: str
        content chunk of episode
    '''
    return f"""
        <div style="font-size:120%;">
            {i + 1}.<a href="{url}">{title}</a>
        </div>

        <div style="font-size:95%;">
            <p>Episode Length: {length} {space}{space} Guest: {guest}</p>
            <div style="color:grey;float:left;">
                ...
            </div>
            {content}
        </div>
        """
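A hypothetical usage sketch tying the two main helpers together the way the commented-out block in app.shell.py suggests; the sample result, query, and 4000-token threshold are illustrative only and not part of this commit:

import tiktoken
from prompt_templates import question_answering_prompt_series
from app_features import validate_token_threshold, generate_prompt_series

encoding = tiktoken.get_encoding('cl100k_base')

# Pretend reranked hits; real ones would come from Weaviate plus the reranker.
ranked_response = [{'doc_id': 'nXJBccSwtB8_0',
                    'guest': 'Ian Bremmer',
                    'summary': 'Big tech as a third superpower.',
                    'content': 'You said these are dangerous times...'}]

# Trim the hits to the token budget, then build the LLM prompt.
valid_response = validate_token_threshold(ranked_response,
                                          question_answering_prompt_series,
                                          query='What does Bremmer say about AI?',
                                          tokenizer=encoding,
                                          token_threshold=4000,
                                          verbose=True)
prompt = generate_prompt_series(query='What does Bremmer say about AI?', results=valid_response)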
debug.py
ADDED
@@ -0,0 +1,23 @@
import tiktoken
from llama_index.text_splitter import SentenceSplitter

d = {'title': "THE BIG AI RESET: The Next Global SuperPower Isn't Who You Think | Ian Bremmer",
     'video_id': 'nXJBccSwtB8',
     'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
     'length': 5410,
     'thumbnail_url': 'https://i.ytimg.com/vi/nXJBccSwtB8/hq720.jpg',
     'views': 138628,
     'episode_url': 'https://www.youtube.com/watch?v=nXJBccSwtB8&list=PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
     'guest': 'Ian Bremmer',
     'summary': "In this episode, Ian Bremmer discusses the rise of big tech as a third superpower and the potential dangers and opportunities it presents. He highlights the immense power held by tech companies in shaping society, the economy, and national security, emphasizing their sovereignty over the digital world. Bremmer expresses concerns about the growing influence of AI and its potential to outstrip government regulation, leading to a reality where tech companies wield significant power over individuals. He also delves into the risks associated with AI proliferation, including the potential for non-governments to control and misuse the technology, exacerbating social inequalities and disinformation. Bremmer emphasizes the need to address negative externalities and regulate AI to mitigate its adverse impacts. Additionally, he discusses the implications of AI on job displacement and social discontent, particularly for marginalized communities. The conversation delves into the breakdown of truth in the digital age, driven by algorithmic sorting and micro-targeting, leading to fragmented echo chambers and the erosion of consensus on facts. Both Bremmer and the host explore the challenges of navigating truth in a polarized and algorithmically driven information landscape, highlighting the need for critical thinking and a focus on human flourishing as a guiding principle in the face of AI's transformative impact.",
     'content': "You said these are dangerous times. The world order is shifting before our eyes"}

chunk_size = 256
chunk_overlap = 0
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=chunk_overlap)


gpt35_txt_splitter(d['content'])
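debug.py is a scratch script for checking how a transcript chunk gets re-split at 256 tokens. A hypothetical follow-up, not in the committed file and assuming SentenceSplitter's split_text method, would print each resulting chunk with its token count:

# Inspect the splitter output (illustrative only).
for i, chunk in enumerate(gpt35_txt_splitter.split_text(d['content'])):
    print(i, len(encoding.encode(chunk)), repr(chunk[:60]))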
prompt_templates_luis.py
DELETED
@@ -1,63 +0,0 @@
question_answering_system = '''
You are the host of the show Impact Theory, and your name is Tom Bilyeu. The description of your show is as follows:
If you’re looking to thrive in uncertain times, achieve unprecedented goals, and improve the most meaningful aspects of your life, then Impact Theory is the show for you. Hosted by Tom Bilyeu, a voracious learner and hyper-successful entrepreneur, the show investigates and analyzes the most useful topics with the world’s most sought-after guests.
Bilyeu attacks each episode with a clear desire to further evolve the holistic skillset that allowed him to co-found the billion-dollar company Quest Nutrition, generate over half a billion organic views on his content, build a thriving marriage of over 20 years, and quantifiably improve the lives of over 10,000 people through his school, Impact Theory University.
Bilyeu’s insatiable hunger for knowledge gives the show urgency, relevance, and depth while leaving listeners with the knowledge, tools, and empowerment to take control of their lives and develop true personal power.
'''

question_answering_prompt_single = '''
Use the below context enclosed in triple back ticks to answer the question. If the context does not provide enough information to answer the question, then use any knowledge you have to answer the question.\n
```{context}```\n
Question:\n
{question}.\n
Answer:
'''

question_answering_prompt_series = '''
Your task is to synthesize and reason over a series of transcripts of an interview between Tom Bilyeu and his guest(s).
After your synthesis, use the series of transcripts to answer the below question. The series will be in the following format:\n
```
Show Summary: <summary>
Show Guest: <guest>
Transcript: <transcript>
```\n\n
Start Series:
```
{series}
```
Question:\n
{question}\n
Answer the question and provide reasoning if necessary to explain the answer.\n
If the context does not provide enough information to answer the question, then \n
state that you cannot answer the question with the provided context.\n

Answer:
'''

context_block = '''
Show Summary: {summary}
Show Guest: {guest}
Transcript: {transcript}
'''

qa_generation_prompt = '''
Impact Theory episode summary and episode guest are below:

---------------------
Summary: {summary}
---------------------
Guest: {guest}
---------------------
Given the Summary and Guest of the episode as context \
use the following randomly selected transcript section \
of the episode and not prior knowledge, generate questions that can \
be answered by the transcript section:

---------------------
Transcript: {transcript}
---------------------

Your task is to create {num_questions_per_chunk} questions that can \
only be answered given the previous context and transcript details. \
The question should randomly start with How, Why, or What.
'''