Spaces: Runtime error
Upload 24 files
- .gitattributes +1 -0
- Dockerfile +18 -0
- Home.py +46 -0
- data/1_IIEE_1_json_data_19_02_2024_22-17-49.json +0 -0
- pages/1_π_Busqueda_Aumentada.py +377 -0
- pages/2_π£_Busqueda_Conversacional.py +576 -0
- pages/__init__.py +0 -0
- requirements.txt +18 -0
- static/.DS_Store +0 -0
- static/images/cervezas-mahou.jpeg +0 -0
- static/images/fabrica-mahou-1200x675.jpeg +0 -0
- static/images/openai_logo.png +0 -0
- static/images/openai_logo_circle.png +0 -0
- static/images/openai_purple_logo_hres.jpeg +0 -0
- static/images/screen_recording_busqueda_final_2.gif +3 -0
- utils/.DS_Store +0 -0
- utils/__init__.py +0 -0
- utils/app_features_spa.py +177 -0
- utils/openai_interface_spa.py +95 -0
- utils/preprocessing.py +123 -0
- utils/prompt_templates_spa.py +26 -0
- utils/reranker_spa.py +89 -0
- utils/retrieval_evaluation_spa.py +332 -0
- utils/system_prompts.py +72 -0
- utils/weaviate_interface_v3_spa.py +436 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+static/images/screen_recording_busqueda_final_2.gif filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,18 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim

# Set the working directory in the container
WORKDIR /app

# Install any needed packages specified in requirements.txt
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of your application's code
COPY . /app

# Make port 8501 available to the world outside this container
EXPOSE 7860

# Run app.py when the container launches, use environment variables
CMD ["streamlit", "run", "Home.py", "--server.address=0.0.0.0", "--server.port=8501"]
Home.py
ADDED
@@ -0,0 +1,46 @@
import streamlit as st
import base64

## PAGE CONFIGURATION
st.set_page_config(page_title="Búsqueda Aumentada MSM para Impuestos Especiales",
                   page_icon="π",
                   layout="centered",
                   initial_sidebar_state="auto",
                   menu_items=None)

st.image('./static/images/cervezas-mahou.jpeg', width=700,)

# Mensaje de bienvenida
st.markdown(
    """
    # ¡Bienvenido a Búsqueda Aumentada MSM para Impuestos Especiales! ππ

    Esta aplicación es una herramienta diseñada específicamente para la exploración y análisis de datos en el ámbito de Impuestos Especiales utilizando el poder de la Inteligencia Artificial.

    **π Selecciona una opción en la barra lateral** para comenzar a explorar las diferentes funcionalidades que ofrece la aplicación.
    """)
file_ = open('./static/images/screen_recording_busqueda_final_2.gif', "rb")
contents = file_.read()
data_url = base64.b64encode(contents).decode("utf-8")
file_.close()

st.subheader("Uso de la Aplicación: π Busqueda Aumentada")
st.caption("Observa en acción cómo la busqueda aumentada con una potente IA simplifica la búsqueda de información, todo con una interfaz de usuario fácil de usar.")
st.markdown(
    f'<div style="text-align: center;"><img src="data:image/gif;base64,{data_url}" alt="demo gif" style="max-width: 100%; height: auto;"></div>',
    unsafe_allow_html=True,
)

st.markdown("""

    ### ¿Quieres aprender más?
    - Visita nuestra [página web](https://tupagina.com)
    - Sumérgete en nuestra [documentación](https://tudocumentacion.com)
    - Participa y pregunta en nuestros [foros comunitarios](https://tucomunidad.com)

    ### Explora demos más complejos
    - Descubre cómo aplicamos la IA para [analizar datasets especializados](https://tulinkdedataset.com)
    - Explora [bases de datos de acceso público](https://tulinkdedatasetpublico.com) y ve la IA en acción
""",
    unsafe_allow_html=True
)
data/1_IIEE_1_json_data_19_02_2024_22-17-49.json
ADDED
The diff for this file is too large to render.
See raw diff
pages/1_π_Busqueda_Aumentada.py
ADDED
@@ -0,0 +1,377 @@
from tiktoken import get_encoding, encoding_for_model
from utils.weaviate_interface_v3_spa import WeaviateClient, WhereFilter
from utils.prompt_templates_spa import question_answering_prompt_series_spa
from utils.openai_interface_spa import GPT_Turbo
from openai import BadRequestError
from utils.app_features_spa import (convert_seconds, generate_prompt_series, search_result,
                                    validate_token_threshold, load_content_cache, load_data, expand_content)
from utils.reranker_spa import ReRanker
from loguru import logger
import streamlit as st
import os

# load environment variables
from dotenv import load_dotenv
load_dotenv('.env', override=True)

## PAGE CONFIGURATION
st.set_page_config(page_title="Busqueda Aumentada",
                   page_icon="π",
                   layout="wide",
                   initial_sidebar_state="auto",
                   menu_items=None)

## DATA + CACHE
data_path = 'data/1_IIEE_1_json_data_19_02_2024_22-17-49.json'
cache_path = ''
data = load_data(data_path)
cache = None  # Initialize cache as None

# Check if the cache file exists before attempting to load it
if os.path.exists(cache_path):
    cache = load_content_cache(cache_path)
else:
    logger.warning(f"Cache file {cache_path} not found. Proceeding without cache.")

#creates list of guests for sidebar
guest_list = sorted(list(set([d['document_title'] for d in data])))

with st.sidebar:
    st.subheader("Selecciona tu Base de datos ποΈ")
    client_type = st.radio(
        "Selecciona el modo de acceso:",
        ('Cloud', 'Local'),
        help='Elige un repositorio para determinar el conjunto de datos sobre el cual realizarás tu búsqueda. "Cloud" te permite acceder a datos alojados en nuestros servidores seguros, mientras que "Local" es para trabajar con datos alojados localmente en tu máquina.'
    )
    if client_type == 'Cloud':
        api_key = st.secrets['WEAVIATE_CLOUD_API_KEY']
        url = st.secrets['WEAVIATE_CLOUD_ENDPOINT']

        weaviate_client = WeaviateClient(
            endpoint=url,
            api_key=api_key,
            # model_name_or_path='./models/finetuned-all-MiniLM-L6-v2-300',
            model_name_or_path="intfloat/multilingual-e5-small",
            # openai_api_key=os.environ['OPENAI_API_KEY']
        )
        available_classes = sorted(weaviate_client.show_classes())
        logger.info(available_classes)
        logger.info(f"Endpoint: {client_type} | Classes: {available_classes}")
    elif client_type == 'Local':
        url = st.secrets['WEAVIATE_LOCAL_ENDPOINT']
        weaviate_client = WeaviateClient(
            endpoint=url,
            # api_key=api_key,
            # model_name_or_path='./models/finetuned-all-MiniLM-L6-v2-300',
            model_name_or_path="intfloat/multilingual-e5-small",
            # openai_api_key=os.environ['OPENAI_API_KEY']
        )
        available_classes = sorted(weaviate_client.show_classes())
        logger.info(f"Endpoint: {client_type} | Classes: {available_classes}")

def main():

    # Define the available user selected options
    available_models = ['gpt-3.5-turbo', 'gpt-4-1106-preview']
    # Define system prompts

    # Initialize selected options in session state
    if "openai_data_model" not in st.session_state:
        st.session_state["openai_data_model"] = available_models[0]

    if 'class_name' not in st.session_state:
        st.session_state['class_name'] = None

    with st.sidebar:
        st.session_state['class_name'] = st.selectbox(
            label='Repositorio:',
            options=available_classes,
            index=None,
            placeholder='Repositorio',
            help='Elige un repositorio para determinar el conjunto de datos sobre el cual realizarás tu búsqueda. "Cloud" te permite acceder a datos alojados en nuestros servidores seguros, mientras que "Local" es para trabajar con datos alojados localmente en tu máquina.'
        )
        # Check if the collection name has been selected
        class_name = st.session_state['class_name']
        if class_name:
            st.success(f"Repositorio seleccionado ✅: {st.session_state['class_name']}")

        else:
            st.warning("ποΈ No olvides seleccionar el repositorio π a consultar ποΈ.")
            st.stop()  # Stop execution of the script

        model_choice = st.selectbox(
            label="Elige un modelo de OpenAI",
            options=available_models,
            index=available_models.index(st.session_state["openai_data_model"]),
            help='Escoge entre diferentes modelos de OpenAI para generar respuestas a tus consultas. Cada modelo tiene distintas capacidades y limitaciones.'
        )
        st.sidebar.make_llm_call = st.checkbox(
            label="Activar GPT",
            help='Marca esta casilla para activar la generación de texto con GPT. Esto te permitirá obtener respuestas automáticas a tus consultas.'
        )

        with st.expander("Filtros de Busqueda"):
            guest_input = st.selectbox(
                label='Selección de documentos',
                options=guest_list,
                index=None,
                placeholder='Documento',
                help='Elige un documento específico del repositorio para afinar tu búsqueda a datos relevantes.'
            )

        with st.expander("Parametros de Busqueda"):
            retriever_choice = st.selectbox(
                label="Selecciona un método",
                options=["Hybrid", "Vector", "Keyword"],
                help='Determina el método de recuperación de información: "Hybrid" combina búsqueda por palabras clave y por similitud semántica, "Vector" usa embeddings de texto para encontrar coincidencias semánticas, y "Keyword" realiza una búsqueda tradicional por palabras clave.'
            )

            reranker_enabled = st.checkbox(
                label="Activar Reranker",
                value=True,
                help='Activa esta opción para ordenar los resultados de la búsqueda según su relevancia, utilizando un modelo de reordenamiento adicional.'
            )

            alpha_input = st.slider(
                label='Alpha para motor hibrido',
                min_value=0.00,
                max_value=1.00,
                value=0.40,
                step=0.05,
                help='Ajusta el parámetro alfa para equilibrar los resultados entre los métodos de búsqueda por vector y por palabra clave en el motor híbrido.'
            )

            retrieval_limit = st.slider(
                label='Resultados a Reranker',
                min_value=10,
                max_value=300,
                value=100,
                step=10,
                help='Establece el número de resultados que se recuperarán antes de aplicar el reordenamiento.'
            )

            top_k_limit = st.slider(
                label='Top K Limit',
                min_value=1,
                max_value=5,
                value=3,
                step=1,
                help='Define el número máximo de resultados a mostrar después de aplicar el reordenamiento.'
            )

            temperature_input = st.slider(
                label='Temperatura',
                min_value=0.0,
                max_value=1.0,
                value=0.10,
                step=0.10,
                help='Ajusta la temperatura para la generación de texto con GPT, lo que influirá en la creatividad de las respuestas.'
            )

    logger.info(weaviate_client.display_properties)

    def perform_search(client, retriever_choice, query, class_name, search_limit, guest_filter, display_properties, alpha_input):
        if retriever_choice == "Keyword":
            return weaviate_client.keyword_search(
                request=query,
                class_name=class_name,
                limit=search_limit,
                where_filter=guest_filter,
                display_properties=display_properties
            ), "Resultados de la Busqueda - Motor: Keyword: "
        elif retriever_choice == "Vector":
            return weaviate_client.vector_search(
                request=query,
                class_name=class_name,
                limit=search_limit,
                where_filter=guest_filter,
                display_properties=display_properties
            ), "Resultados de la Busqueda - Motor: Vector"
        elif retriever_choice == "Hybrid":
            return weaviate_client.hybrid_search(
                request=query,
                class_name=class_name,
                alpha=alpha_input,
                limit=search_limit,
                properties=["content"],
                where_filter=guest_filter,
                display_properties=display_properties
            ), "Resultados de la Busqueda - Motor: Hybrid"


    ## RERANKER
    reranker = ReRanker(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2')

    ## LLM
    model_name = model_choice
    llm = GPT_Turbo(model=model_name, api_key=st.secrets['OPENAI_API_KEY'])
    encoding = encoding_for_model(model_name)


    ########################
    ## SETUP MAIN DISPLAY ##
    ########################
    st.image('./static/images/cervezas-mahou.jpeg', width=300)
    st.subheader(f"✨ππ **Búsqueda Aumentada** ππ✨ Impuestos Especiales ")
    st.caption("Descubre insights ocultos y responde a tus preguntas especializadas utilizando el poder de la IA")
    st.write('\n')

    query = st.text_input('Escribe tu pregunta aquí: ')
    st.write('\n\n\n\n\n')

    ############
    ## SEARCH ##
    ############
    if query:
        # make hybrid call to weaviate
        guest_filter = WhereFilter(
            path=['document_title'],
            operator='Equal',
            valueText=guest_input).todict() if guest_input else None


        # Determine the appropriate limit based on reranking
        search_limit = retrieval_limit if reranker_enabled else top_k_limit

        # Perform the search
        query_response, subheader_msg = perform_search(
            client=weaviate_client,
            retriever_choice=retriever_choice,
            query=query,
            class_name=class_name,
            search_limit=search_limit,
            guest_filter=guest_filter,
            display_properties=weaviate_client.display_properties,
            alpha_input=alpha_input if retriever_choice == "Hybrid" else None
        )


        # Rerank the results if enabled
        if reranker_enabled:
            search_results = reranker.rerank(
                results=query_response,
                query=query,
                apply_sigmoid=True,
                top_k=top_k_limit
            )
            subheader_msg += " Reranked"
        else:
            # Use the results directly if reranking is not enabled
            search_results = query_response

        logger.info(search_results)
        expanded_response = expand_content(search_results, cache, content_key='doc_id', create_new_list=True)

        # validate token count is below threshold
        token_threshold = 8000 if model_name == 'gpt-3.5-turbo-16k' else 3500
        valid_response = validate_token_threshold(
            ranked_results=expanded_response,
            base_prompt=question_answering_prompt_series_spa,
            query=query,
            tokenizer=encoding,
            token_threshold=token_threshold,
            verbose=True
        )
        logger.info(valid_response)
        #########
        ## LLM ##
        #########
        make_llm_call = st.sidebar.make_llm_call
        # prep for streaming response
        st.subheader("Respuesta GPT:")
        with st.spinner('Generando Respuesta...'):
            st.markdown("----")
            # Creates container for LLM response
            chat_container, response_box = [], st.empty()

            # generate LLM prompt
            prompt = generate_prompt_series(query=query, results=valid_response)
            # logger.info(prompt)
            if make_llm_call:

                try:
                    for resp in llm.get_chat_completion(
                            prompt=prompt,
                            temperature=temperature_input,
                            max_tokens=350,  # expand for more verbose answers
                            show_response=True,
                            stream=True):

                        # inserts chat stream from LLM
                        with response_box:
                            content = resp.choices[0].delta.content
                            if content:
                                chat_container.append(content)
                                result = "".join(chat_container).strip()
                                st.write(f'{result}')
                except BadRequestError:
                    logger.info('Making request with smaller context...')
                    valid_response = validate_token_threshold(
                        ranked_results=search_results,
                        base_prompt=question_answering_prompt_series_spa,
                        query=query,
                        tokenizer=encoding,
                        token_threshold=token_threshold,
                        verbose=True
                    )

                    # generate LLM prompt
                    prompt = generate_prompt_series(query=query, results=valid_response)
                    for resp in llm.get_chat_completion(
                            prompt=prompt,
                            temperature=temperature_input,
                            max_tokens=350,  # expand for more verbose answers
                            show_response=True,
                            stream=True):

                        try:
                            # inserts chat stream from LLM
                            with response_box:
                                content = resp.choices[0].delta.content
                                if content:
                                    chat_container.append(content)
                                    result = "".join(chat_container).strip()
                                    st.write(f'{result}')
                        except Exception as e:
                            print(e)

        ####################
        ## Search Results ##
        ####################
        st.subheader(subheader_msg)
        for i, hit in enumerate(search_results):
            col1, col2 = st.columns([7, 3], gap='large')
            page_url = hit['page_url']
            page_label = hit['page_label']
            document_title = hit['document_title']
            # Assuming 'page_summary' is available and you want to display it
            page_summary = hit.get('page_summary', 'Summary not available')

            with col1:
                st.markdown(f'''
                    <span style="color: #3498db; font-size: 19px; font-weight: bold;">{document_title}</span><br>
                    {page_summary}
                    [**Página:** {page_label}]({page_url})
                    ''', unsafe_allow_html=True)

                with st.expander("π Clic aquí para ver contexto:"):
                    try:
                        content = hit['content']
                        st.write(content)
                    except Exception as e:
                        st.write(f"Error displaying content: {e}")

            # with col2:
            #     # If you have an image or want to display a placeholder image
            #     image = "URL_TO_A_PLACEHOLDER_IMAGE"  # Replace with a relevant image URL if applicable
            #     st.image(image, caption=document_title, width=200, use_column_width=False)
            #     st.markdown(f'''
            #     <p style="text-align: right;">
            #         <b>Document Title:</b> {document_title}<br>
            #         <b>File Name:</b> {file_name}<br>
            #     </p>''', unsafe_allow_html=True)



if __name__ == '__main__':
    main()
pages/2_π£_Busqueda_Conversacional.py
ADDED
@@ -0,0 +1,576 @@
from tiktoken import get_encoding, encoding_for_model
from utils.weaviate_interface_v3_spa import WeaviateClient, WhereFilter
from utils.prompt_templates_spa import question_answering_prompt_series_spa
from utils.openai_interface_spa import GPT_Turbo
from openai import BadRequestError
from utils.app_features_spa import (convert_seconds, generate_prompt_series, search_result,
                                    validate_token_threshold, load_content_cache, load_data, expand_content)
from utils.reranker_spa import ReRanker
from openai import OpenAI

from loguru import logger
import streamlit as st
import os
import utils.system_prompts as system_prompts
import base64
import json

# load environment variables
from dotenv import load_dotenv
load_dotenv('.env', override=True)

## PAGE CONFIGURATION
st.set_page_config(page_title="Busqueda Conversacional",
                   page_icon="π£",
                   layout="wide",
                   initial_sidebar_state="auto",
                   menu_items=None)

def encode_image(uploaded_file):
    return base64.b64encode(uploaded_file.getvalue()).decode('utf-8')

## DATA + CACHE
data_path = 'data/1_IIEE_1_json_data_19_02_2024_22-17-49.json'
cache_path = ''
data = load_data(data_path)
cache = None  # Initialize cache as None

# Check if the cache file exists before attempting to load it
if os.path.exists(cache_path):
    cache = load_content_cache(cache_path)
else:
    logger.warning(f"Cache file {cache_path} not found. Proceeding without cache.")

#creates list of guests for sidebar
guest_list = sorted(list(set([d['document_title'] for d in data])))

with st.sidebar:
    st.subheader("Selecciona tu Base de datos ποΈ")
    client_type = st.radio(
        "Selecciona el modo de acceso:",
        ('Cloud', 'Local'),
        help='Elige un repositorio para determinar el conjunto de datos sobre el cual realizarás tu búsqueda. "Cloud" te permite acceder a datos alojados en nuestros servidores seguros, mientras que "Local" es para trabajar con datos alojados localmente en tu máquina.'
    )
    if client_type == 'Cloud':
        api_key = st.secrets['WEAVIATE_CLOUD_API_KEY']
        url = st.secrets['WEAVIATE_CLOUD_ENDPOINT']

        weaviate_client = WeaviateClient(
            endpoint=url,
            api_key=api_key,
            # model_name_or_path='./models/finetuned-all-MiniLM-L6-v2-300',
            model_name_or_path="intfloat/multilingual-e5-small",
            # openai_api_key=os.environ['OPENAI_API_KEY']
        )
        available_classes = sorted(weaviate_client.show_classes())
        logger.info(available_classes)
        logger.info(f"Endpoint: {client_type} | Classes: {available_classes}")
    elif client_type == 'Local':
        url = st.secrets['WEAVIATE_LOCAL_ENDPOINT']
        weaviate_client = WeaviateClient(
            endpoint=url,
            # api_key=api_key,
            # model_name_or_path='./models/finetuned-all-MiniLM-L6-v2-300',
            model_name_or_path="intfloat/multilingual-e5-small",
            # openai_api_key=os.environ['OPENAI_API_KEY']
        )
        available_classes = sorted(weaviate_client.show_classes())
        logger.info(f"Endpoint: {client_type} | Classes: {available_classes}")

client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])

def main():

    # Define the available user selected options
    available_models = ['gpt-3.5-turbo', 'gpt-4-1106-preview']
    # Define system prompts
    system_prompt_list = ["π€ChatGPT","π§πΎββοΈProfessor Synapse", "π©πΌβπΌMarketing Jane"]


    # Initialize selected options in session state
    if "openai_data_model" not in st.session_state:
        st.session_state["openai_data_model"] = available_models[0]
    if "system_prompt_data_list" not in st.session_state and "system_prompt_data_model" not in st.session_state:
        # This should be the emoji string the user selected
        st.session_state["system_prompt_data_list"] = system_prompt_list[0]
        # Now we get the corresponding prompt variable using the selected emoji string
        st.session_state["system_prompt_data_model"] = system_prompts.prompt_mapping[system_prompt_list[0]]

    # logger.debug(f"Assistant: {st.session_state['system_prompt_sync_list']}")
    # logger.debug(f"System Prompt: {st.session_state['system_prompt_sync_model']}")

    if 'class_name' not in st.session_state:
        st.session_state['class_name'] = None

    with st.sidebar:
        st.session_state['class_name'] = st.selectbox(
            label='Repositorio:',
            options=available_classes,
            index=None,
            placeholder='Repositorio',
            help='Elige un repositorio para determinar el conjunto de datos sobre el cual realizarás tu búsqueda. "Cloud" te permite acceder a datos alojados en nuestros servidores seguros, mientras que "Local" es para trabajar con datos alojados localmente en tu máquina.'
        )

        # Check if the collection name has been selected
        class_name = st.session_state['class_name']
        if class_name:
            st.success(f"Repositorio seleccionado ✅: {st.session_state['class_name']}")

        else:
            st.warning("ποΈ No olvides seleccionar el repositorio π a consultar ποΈ.")
            st.stop()  # Stop execution of the script

        model_choice = st.selectbox(
            label="Elige un modelo de OpenAI",
            options=available_models,
            index=available_models.index(st.session_state["openai_data_model"]),
            help='Escoge entre diferentes modelos de OpenAI para generar respuestas a tus consultas. Cada modelo tiene distintas capacidades y limitaciones.'
        )

        system_prompt = st.selectbox(
            label="Elige un asistente",
            options=system_prompt_list,
            index=system_prompt_list.index(st.session_state["system_prompt_data_list"]),
        )

        with st.expander("Filtros de Busqueda"):
            guest_input = st.selectbox(
                label='Selección de Documento',
                options=guest_list,
                index=None,
                placeholder='Documentos',
                help='Elige un documento específico del repositorio para afinar tu búsqueda a datos relevantes.'
            )
        with st.expander("Parametros de Busqueda"):
            retriever_choice = st.selectbox(
                label="Selecciona un método",
                options=["Hybrid", "Vector", "Keyword"],
                help='Determina el método de recuperación de información: "Hybrid" combina búsqueda por palabras clave y por similitud semántica, "Vector" usa embeddings de texto para encontrar coincidencias semánticas, y "Keyword" realiza una búsqueda tradicional por palabras clave.'
            )

            reranker_enabled = st.checkbox(
                label="Activar Reranker",
                value=True,
                help='Activa esta opción para ordenar los resultados de la búsqueda según su relevancia, utilizando un modelo de reordenamiento adicional.'
            )

            alpha_input = st.slider(
                label='Alpha para motor hibrido',
                min_value=0.00,
                max_value=1.00,
                value=0.40,
                step=0.05,
                help='Ajusta el parámetro alfa para equilibrar los resultados entre los métodos de búsqueda por vector y por palabra clave en el motor híbrido.'
            )

            retrieval_limit = st.slider(
                label='Resultados a Reranker',
                min_value=10,
                max_value=300,
                value=100,
                step=10,
                help='Establece el número de resultados que se recuperarán antes de aplicar el reordenamiento.'
            )

            top_k_limit = st.slider(
                label='Top K Limit',
                min_value=1,
                max_value=5,
                value=3,
                step=1,
                help='Define el número máximo de resultados a mostrar después de aplicar el reordenamiento.'
            )

            temperature_input = st.slider(
                label='Temperatura',
                min_value=0.0,
                max_value=1.0,
                value=0.20,
                step=0.10,
                help='Ajusta la temperatura para la generación de texto con GPT, lo que influirá en la creatividad de las respuestas.'
            )

        # Update the model choice in session state
        if st.session_state["openai_data_model"] != model_choice:
            st.session_state["openai_data_model"] = model_choice
            logger.info(f"Data model: {st.session_state['openai_data_model']}")

        # Update the system prompt choice in session state
        if st.session_state["system_prompt_data_list"] != system_prompt:
            # This should be the emoji string the user selected
            st.session_state["system_prompt_data_list"] = system_prompt
            # Now we get the corresponding prompt variable using the selected emoji string
            selected_prompt_variable = system_prompts.prompt_mapping[system_prompt]
            st.session_state['system_prompt_data_model'] = selected_prompt_variable
            # logger.info(f"System Prompt: {selected_prompt_variable}")
            logger.info(f"Assistant: {st.session_state['system_prompt_data_list']}")
            # logger.info(f"System Prompt: {st.session_state['system_prompt_sync_model']}")

    logger.info(weaviate_client.display_properties)

    def database_search(query):
        # Determine the appropriate limit based on reranking
        search_limit = retrieval_limit if reranker_enabled else top_k_limit

        # make hybrid call to weaviate
        guest_filter = WhereFilter(
            path=['document_title'],
            operator='Equal',
            valueText=guest_input).todict() if guest_input else None

        try:
            # Perform the search based on retriever_choice
            if retriever_choice == "Keyword":
                query_results = weaviate_client.keyword_search(
                    request=query,
                    class_name=class_name,
                    limit=search_limit,
                    where_filter=guest_filter
                )
            elif retriever_choice == "Vector":
                query_results = weaviate_client.vector_search(
                    request=query,
                    class_name=class_name,
                    limit=search_limit,
                    where_filter=guest_filter
                )
            elif retriever_choice == "Hybrid":
                query_results = weaviate_client.hybrid_search(
                    request=query,
                    class_name=class_name,
                    alpha=alpha_input,
                    limit=search_limit,
                    properties=["content"],
                    where_filter=guest_filter
                )
            else:
                return json.dumps({"error": "Invalid retriever choice"})


            ## RERANKER
            reranker = ReRanker(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2')
            model_name = model_choice
            encoding = encoding_for_model(model_name)

            # Rerank the results if enabled
            if reranker_enabled:
                search_results = reranker.rerank(
                    results=query_results,
                    query=query,
                    apply_sigmoid=True,
                    top_k=top_k_limit
                )

            else:
                # Use the results directly if reranking is not enabled
                search_results = query_results

            # logger.debug(search_results)
            # Save search results to session state for later use
            # st.session_state['search_results'] = search_results
            add_to_search_history(query=query, search_results=search_results)
            expanded_response = expand_content(search_results, cache, content_key='doc_id', create_new_list=True)

            # validate token count is below threshold
            token_threshold = 8000
            valid_response = validate_token_threshold(
                ranked_results=expanded_response,
                base_prompt=question_answering_prompt_series_spa,
                query=query,
                tokenizer=encoding,
                token_threshold=token_threshold,
                verbose=True
            )

            # generate LLM prompt
            prompt = generate_prompt_series(query=query, results=valid_response)

            # If the strings in 'prompt' are double-escaped, decode them before dumping to JSON
            # prompt_decoded = prompt.encode().decode('unicode_escape')

            # Then, when you dump to JSON, it should no longer double-escape the characters
            return json.dumps({
                "query": query,
                "Search Results": prompt,
            }, ensure_ascii=False)

        except Exception as e:
            # Handle any exceptions and return a JSON formatted error message
            return json.dumps({
                "error": "An error occurred during the search",
                "details": str(e)
            })

    # When a new message is added, include the type and content
    def add_to_search_history(query, search_results):
        st.session_state["data_search_history"].append({
            "query": query,
            "search_results": search_results,
        })

    # Function to display search results
    def display_search_results():
        # Loop through each item in the search history
        for search in st.session_state['data_search_history']:
            query = search["query"]
            search_results = search["search_results"]
            # Create an expander for each search query
            with st.expander(f"Pregunta: {query}", expanded=False):
                for i, hit in enumerate(search_results):
                    # col1, col2 = st.columns([7, 3], gap='large')
                    page_url = hit['page_url']
                    page_label = hit['page_label']
                    document_title = hit['document_title']
                    # Assuming 'page_summary' is available and you want to display it
                    page_summary = hit.get('page_summary', 'Summary not available')

                    # with col1:
                    st.markdown(f'''
                        <span style="color: #3498db; font-size: 19px; font-weight: bold;">{document_title}</span><br>
                        {page_summary}
                        [**Página:** {page_label}]({page_url})
                        ''', unsafe_allow_html=True)

                    # with st.expander("π Clic aquí para ver contexto:"):
                    #     try:
                    #         content = hit['content']
                    #         st.write(content)
                    #     except Exception as e:
                    #         st.write(f"Error displaying content: {e}")

                    # with col2:
                    #     # If you have an image or want to display a placeholder image
                    #     image = "URL_TO_A_PLACEHOLDER_IMAGE"  # Replace with a relevant image URL if applicable
                    #     st.image(image, caption=document_title, width=200, use_column_width=False)
                    #     st.markdown(f'''
                    #     <p style="text-align: right;">
                    #         <b>Document Title:</b> {document_title}<br>
                    #         <b>File Name:</b> {file_name}<br>
                    #     </p>''', unsafe_allow_html=True)

    ########################
    ## SETUP MAIN DISPLAY ##
    ########################

    st.image('./static/images/cervezas-mahou.jpeg', width=400)
    st.subheader(f"✨π£οΈπ **Búsqueda Conversacional** π‘π£οΈ✨ - Impuestos Especiales")
    st.write('\n')
    col1, col2 = st.columns([50,50])

    # Initialize chat history
    if "data_chat_history" not in st.session_state:
        st.session_state["data_chat_history"] = []

    if "data_search_history" not in st.session_state:
        st.session_state["data_search_history"] = []

    with col1:
        st.write("Chat History:")
        # Create a container for chat history
        chat_history_container = st.container(height=500, border=True)
        # Display chat messages from history on app rerun
        with chat_history_container:
            for message in st.session_state["data_chat_history"]:
                with st.chat_message(message["role"]):
                    st.markdown(message["content"])
        # Function to update chat display
        def update_chat_display():
            with chat_history_container:
                for message in st.session_state["data_chat_history"]:
                    with st.chat_message(message["role"]):
                        st.markdown(message["content"])

    if prompt := st.chat_input("What is up?"):
        # Add user message to chat history
        st.session_state["data_chat_history"].append({"role": "user", "content": prompt})
        # Initially display the chat history
        update_chat_display()
        # # Display user message in chat message container
        # with st.chat_message("user"):
        #     st.markdown(prompt)

        with st.spinner('Generando Respuesta...'):
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "database_search",
                        "description": "Takes the users query about the database and returns the results, extracting info to answer the user's question",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "query": {"type": "string", "description": "query"},

                            },
                            "required": ["query"],
                        },
                    }
                }
            ]

            # Display live assistant response in chat message container
            with st.chat_message(
                    name="assistant",
                    avatar="./static/images/openai_purple_logo_hres.jpeg"):
                message_placeholder = st.empty()

                # Building the messages payload with proper OPENAI API structure
                messages = [
                    {"role": "system", "content": st.session_state["system_prompt_data_model"]}
                ] + [
                    {"role": m["role"], "content": m["content"]} for m in st.session_state["data_chat_history"]
                ]
                logger.debug(f"Initial Messages: {messages}")
                # call the OpenAI API to get the response

                RESPONSE = client.chat.completions.create(
                    model=st.session_state["openai_data_model"],
                    temperature=0.5,
                    messages=messages,
                    tools=tools,
                    tool_choice="auto",  # auto is default, but we'll be explicit
                    stream=True
                )
                logger.debug(f"First Response: {RESPONSE}")


                FULL_RESPONSE = ""
                tool_calls = []
                # build up the response structs from the streamed response, simultaneously sending message chunks to the browser
                for chunk in RESPONSE:
                    delta = chunk.choices[0].delta
                    # logger.debug(f"chunk: {delta}")



                    if delta and delta.content:
                        text_chunk = delta.content
                        FULL_RESPONSE += str(text_chunk)
                        message_placeholder.markdown(FULL_RESPONSE + "▌")

                    elif delta and delta.tool_calls:
                        tcchunklist = delta.tool_calls
                        for tcchunk in tcchunklist:
                            if len(tool_calls) <= tcchunk.index:
                                tool_calls.append({"id": "", "type": "function", "function": { "name": "", "arguments": "" } })
                            tc = tool_calls[tcchunk.index]

                            if tcchunk.id:
                                tc["id"] += tcchunk.id
                            if tcchunk.function.name:
                                tc["function"]["name"] += tcchunk.function.name
                            if tcchunk.function.arguments:
                                tc["function"]["arguments"] += tcchunk.function.arguments
                if tool_calls:
                    logger.debug(f"tool_calls: {tool_calls}")
                    # Define a dictionary mapping function names to actual functions
                    available_functions = {
                        "database_search": database_search,
                        # Add other functions as necessary
                    }
                    available_functions = {
                        "database_search": database_search,
                    }  # only one function in this example, but you can have multiple
                    logger.debug(f"FuncCall Before messages: {messages}")
                    # Process each tool call
                    for tool_call in tool_calls:
                        # Get the function name and arguments from the tool call
                        function_name = tool_call['function']['name']
                        function_args = json.loads(tool_call['function']['arguments'])

                        # Get the actual function to call
                        function_to_call = available_functions[function_name]

                        # Call the function and get the response
                        function_response = function_to_call(**function_args)

                        # Append the function response to the messages list
                        messages.append({
                            "role": "assistant",
                            "tool_call_id": tool_call['id'],
                            "name": function_name,
                            "content": function_response,
                        })
                    logger.debug(f"FuncCall After messages: {messages}")

                    RESPONSE = client.chat.completions.create(
                        model=st.session_state["openai_data_model"],
                        temperature=0.1,
                        messages=messages,
                        stream=True
                    )
                    logger.debug(f"Second Response: {RESPONSE}")

                    # build up the response structs from the streamed response, simultaneously sending message chunks to the browser
                    for chunk in RESPONSE:
                        delta = chunk.choices[0].delta
                        # logger.debug(f"chunk: {delta}")

                        if delta and delta.content:
                            text_chunk = delta.content
                            FULL_RESPONSE += str(text_chunk)
                            message_placeholder.markdown(FULL_RESPONSE + "▌")
                # Add assistant response to chat history
                st.session_state["data_chat_history"].append({"role": "assistant", "content": FULL_RESPONSE})
                logger.debug(f"chat_history: {st.session_state['data_chat_history']}")

    # Next block of code...


    ####################
    ## Search Results ##
    ####################
    # st.subheader(subheader_msg)
    with col2:
        st.write("Search Results:")
        with st.container(height=500, border=True):
            # Check if 'data_search_history' is in the session state and not empty
            if 'data_search_history' in st.session_state and st.session_state['data_search_history']:
                display_search_results()
                # # Extract the latest message from the search history
                # latest_search = st.session_state['data_search_history'][-1]
                # query = latest_search["query"]
                # with st.expander(query, expanded=False):
                #     # Extract the latest message from the search history
                #     latest_search = st.session_state['data_search_history'][-1]
                #     query = latest_search["query"]
                #     for i, hit in enumerate(latest_search["search_results"]):
                #         col1, col2 = st.columns([7, 3], gap='large')
                #         episode_url = hit['episode_url']
                #         title = hit['title']
                #         guest=hit['guest']
                #         show_length = hit['length']
                #         time_string = convert_seconds(show_length)
                #         # content = ranked_response[i]['content']  # Get 'content' from the same index in ranked_response
                #         content = hit['content']

                #         with col1:
                #             st.write( search_result(i=i,
                #                                     url=episode_url,
                #                                     guest=guest,
                #                                     title=title,
                #                                     content=content,
                #                                     length=time_string),
                #                      unsafe_allow_html=True)
                #             st.write('\n\n')

                #             # with st.container("Episode Summary:"):
                #             #     try:
                #             #         ep_summary = hit['summary']
                #             #         st.write(ep_summary)
                #             #     except Exception as e:
                #             #         st.error(f"Error displaying summary: {e}")

                #         with col2:
                #             image = hit['thumbnail_url']
                #             st.image(image, caption=title.split('|')[0], width=200, use_column_width=False)
                #             st.markdown(f'''
                #             <p style="text-align: right;">
                #                 <b>Episode:</b> {title.split('|')[0]}<br>
                #                 <b>Guest:</b> {hit['guest']}<br>
                #                 <b>Length:</b> {time_string}
                #             </p>''', unsafe_allow_html=True)


if __name__ == '__main__':
    main()
pages/__init__.py
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,18 @@
loguru==0.7.0
numpy==1.24.4
openai==1.10.0
pandas==2.0.3
protobuf==4.23.4
pyarrow==12.0.1
python-dotenv==1.0.0
rank-bm25==0.2.2
requests==2.31.0
requests-oauthlib==1.3.1
rich==13.7.0
sentence-transformers==2.2.2
streamlit==1.31.1
tiktoken==0.5.1
tokenizers==0.13.3
torch==2.0.1
transformers==4.33.1
weaviate-client==3.25.3
static/.DS_Store
ADDED
Binary file (6.15 kB)
static/images/cervezas-mahou.jpeg
ADDED
static/images/fabrica-mahou-1200x675.jpeg
ADDED
static/images/openai_logo.png
ADDED
static/images/openai_logo_circle.png
ADDED
static/images/openai_purple_logo_hres.jpeg
ADDED
static/images/screen_recording_busqueda_final_2.gif
ADDED
Git LFS Details
utils/.DS_Store
ADDED
Binary file (6.15 kB)
utils/__init__.py
ADDED
File without changes
utils/app_features_spa.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import json
|
3 |
+
from utils.preprocessing import FileIO
|
4 |
+
from typing import List, Optional
|
5 |
+
import tiktoken
|
6 |
+
from loguru import logger
|
7 |
+
from templates.prompt_templates_spa import context_block_spa, question_answering_prompt_series_spa
|
8 |
+
import streamlit as st
|
9 |
+
|
10 |
+
@st.cache_data
|
11 |
+
def load_content_cache(data_path: str):
|
12 |
+
data = FileIO().load_parquet(data_path)
|
13 |
+
content_data = {d['doc_id']: d['content'] for d in data}
|
14 |
+
return content_data
|
15 |
+
|
16 |
+
@st.cache_data
|
17 |
+
def load_data(data_path: str):
|
18 |
+
with open(data_path, 'r') as f:
|
19 |
+
data = json.load(f)
|
20 |
+
return data
|
21 |
+
|
22 |
+
def convert_seconds(seconds: int):
|
23 |
+
"""
|
24 |
+
Converts seconds to a string of format Hours:Minutes:Seconds
|
25 |
+
"""
|
26 |
+
return time.strftime("%H:%M:%S", time.gmtime(seconds))
|
27 |
+
|
28 |
+
def generate_prompt_series(query: str, results: List[dict]) -> str:
|
29 |
+
"""
|
30 |
+
Generates a prompt for the OpenAI API by joining the context blocks of the top results.
|
31 |
+
Provides context to the LLM by supplying the summary, document name, and retrieved content of each result.
|
32 |
+
|
33 |
+
Args:
|
34 |
+
-----
|
35 |
+
query : str
|
36 |
+
User query
|
37 |
+
results : List[dict]
|
38 |
+
List of results from the Weaviate client
|
39 |
+
"""
|
40 |
+
context_series = '\n'.join([context_block_spa.format(summary=res['page_summary'],
|
41 |
+
document=res['document_title'],
|
42 |
+
transcript=res['content']
|
43 |
+
)for res in results]).strip()
|
44 |
+
prompt = question_answering_prompt_series_spa.format(question=query, series=context_series)
|
45 |
+
return prompt
|
46 |
+
|
47 |
+
def expand_content(ranked_results: List[dict],
|
48 |
+
content_cache: Optional[dict] = None,
|
49 |
+
content_key: str = 'doc_id',
|
50 |
+
create_new_list: bool = False
|
51 |
+
) -> List[dict]:
|
52 |
+
'''
|
53 |
+
Updates or creates a list of ranked results with content from a cache.
|
54 |
+
|
55 |
+
This function iterates over a list of dictionaries representing ranked results.
|
56 |
+
If a cache is provided, it adds or updates the 'content' key in each dictionary
|
57 |
+
with the corresponding content from the cache based on the content_key.
|
58 |
+
|
59 |
+
Args:
|
60 |
+
- ranked_results (List[dict]): A list of dictionaries, each representing a ranked result.
|
61 |
+
    - content_cache (Optional[dict]): A dictionary that maps content_key to content.
      If None, the content of ranked results will not be updated.
    - content_key (str): The key used in both the ranked results and content cache to match
      the ranked results with their corresponding content in the cache.
    - create_new_list (bool): If True, a new list of dictionaries will be created and
      returned with the content updated. If False, the ranked_results will be updated in place.

    Returns:
    - List[dict]: A new list with updated content if create_new_list is True; otherwise,
      the original ranked_results list with updated content.

    Note:
    - If create_new_list is False, the function will mutate the original ranked_results list.
    - The function only updates content if the content_key exists in both the ranked result
      and the content cache.

    Example:
    ```
    ranked_results = [{'doc_id': '123', 'title': 'Title 1'}, {'doc_id': '456', 'title': 'Title 2'}]
    content_cache = {'123': 'Content for 123', '456': 'Content for 456'}
    updated_results = expand_content(ranked_results, content_cache, create_new_list=True)
    # updated_results is now [{'doc_id': '123', 'title': 'Title 1', 'content': 'Content for 123'},
    #                         {'doc_id': '456', 'title': 'Title 2', 'content': 'Content for 456'}]
    ```
    '''
    if create_new_list:
        expanded_response = [{k: v for k, v in resp.items()} for resp in ranked_results]
        if content_cache is not None:
            for resp in expanded_response:
                if resp[content_key] in content_cache:
                    resp['content'] = content_cache[resp[content_key]]
        return expanded_response
    else:
        for resp in ranked_results:
            if content_cache and resp[content_key] in content_cache:
                resp['content'] = content_cache[resp[content_key]]
        return ranked_results

def validate_token_threshold(ranked_results: List[dict],
                             base_prompt: str,
                             query: str,
                             tokenizer: tiktoken.Encoding,
                             token_threshold: int,
                             verbose: bool = False
                             ) -> List[dict]:
    """
    Validates that the prompt is below the set token threshold by adding the lengths of:
    1. Base prompt
    2. User query
    3. Context material
    If the threshold is exceeded, context results are reduced incrementally until the
    combined prompt tokens are below the threshold. This function does not take into
    account every token passed to the LLM, but it is a good approximation.
    """
    overhead_len = len(tokenizer.encode(base_prompt.format(question=query, series='')))
    context_len = _get_batch_length(ranked_results, tokenizer)

    token_count = overhead_len + context_len
    if token_count > token_threshold:
        print('Token count exceeds token count threshold, reducing size of returned results below token threshold')

        while token_count > token_threshold and len(ranked_results) > 1:
            num_results = len(ranked_results)

            # remove the last ranked (least relevant) result
            ranked_results = ranked_results[:num_results-1]
            # recalculate new token_count
            token_count = overhead_len + _get_batch_length(ranked_results, tokenizer)

    if verbose:
        logger.info(f'Total Final Token Count: {token_count}')
    return ranked_results

def _get_batch_length(ranked_results: List[dict], tokenizer: tiktoken.Encoding) -> int:
    '''
    Convenience function to get the length in tokens of a batch of results.
    '''
    contexts = tokenizer.encode_batch([r['content'] for r in ranked_results])
    context_len = sum(list(map(len, contexts)))
    return context_len

def search_result(i: int,
                  url: str,
                  title: str,
                  content: str,
                  guest: str,
                  length: str,
                  space: str=' '
                  ) -> str:

    '''
    HTML to display search results.

    Args:
    -----
    i: int
        index of search result
    url: str
        url of YouTube video
    title: str
        title of episode
    content: str
        content chunk of episode
    '''
    return f"""
        <div style="font-size:120%;">
            {i + 1}.<a href="{url}">{title}</a>
        </div>

        <div style="font-size:95%;">
            <p>Episode Length: {length} {space}{space} Guest: {guest}</p>
            <div style="color:grey;float:left;">
                ...
            </div>
            {content}
        </div>
    """
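For orientation, a minimal sketch of how these helpers are typically chained before prompting the LLM; the import path, placeholder documents, query text, and the 4,000-token budget are illustrative assumptions, not values taken from this repo:

```
import tiktoken
from utils.prompt_templates_spa import question_answering_prompt_series_spa  # assumed import path

encoding = tiktoken.get_encoding('cl100k_base')

# placeholder retrieval output and cache (normally produced by the retriever / reranker)
ranked_results = [{'doc_id': '123', 'content': 'fragmento recuperado...'}]
content_cache = {'123': 'texto completo del fragmento...'}

# swap truncated chunks for their full cached text, then trim to an assumed 4,000-token budget
results = expand_content(ranked_results, content_cache, content_key='doc_id', create_new_list=True)
valid_results = validate_token_threshold(results,
                                         base_prompt=question_answering_prompt_series_spa,
                                         query='¿Qué es un impuesto especial?',
                                         tokenizer=encoding,
                                         token_threshold=4000,
                                         verbose=True)
```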
utils/openai_interface_spa.py
ADDED
@@ -0,0 +1,95 @@
import os
from openai import OpenAI
from typing import List, Any, Tuple
from dotenv import load_dotenv
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
_ = load_dotenv('./.env', override=True) # read local .env file


class GPT_Turbo:

    def __init__(self, model: str="gpt-3.5-turbo-0613", api_key: str=os.environ['OPENAI_API_KEY']):
        self.model = model
        self.client = OpenAI(api_key=api_key)

    def get_chat_completion(self,
                            prompt: str,
                            system_message: str='You are a helpful assistant.',
                            temperature: int=0,
                            max_tokens: int=500,
                            stream: bool=False,
                            show_response: bool=False
                            ) -> str:
        messages = [
            {'role': 'system', 'content': system_message},
            {'role': 'assistant', 'content': prompt}
        ]

        response = self.client.chat.completions.create(model=self.model,
                                                        messages=messages,
                                                        temperature=temperature,
                                                        max_tokens=max_tokens,
                                                        stream=stream)
        if show_response:
            return response
        return response.choices[0].message.content

    def multi_thread_request(self,
                             filepath: str,
                             prompt: str,
                             content: List[str],
                             temperature: int=0
                             ) -> List[Any]:

        data = []
        with ThreadPoolExecutor(max_workers=2*os.cpu_count()) as exec:
            # each chunk is wrapped in triple backticks and sent through get_chat_completion
            futures = [exec.submit(self.get_chat_completion, f'{prompt} ```{c}```', 'You are a helpful assistant.', temperature, 500, False) for c in content]
            with open(filepath, 'a') as f:
                for future in as_completed(futures):
                    result = future.result()
                    if len(data) % 10 == 0:
                        print(f'{len(data)} of {len(content)} completed.')
                    if result:
                        data.append(result)
                        self.write_to_file(file_handle=f, data=result)
        return [res for res in data if res]

    def generate_question_context_pairs(self,
                                        context_tuple: Tuple[str, str],
                                        num_questions_per_chunk: int=2,
                                        max_words_per_question: int=10
                                        ) -> List[str]:

        doc_id, context = context_tuple
        prompt = f'Context information is included below enclosed in triple backticks. Given the context information and not prior knowledge, generate questions based on the below query.\n\nYou are an end user querying for information about your favorite podcast. \
        Your task is to setup {num_questions_per_chunk} questions that can be answered using only the given context. The questions should be diverse in nature across the document and be no longer than {max_words_per_question} words. \
        Restrict the questions to the context information provided.\n\
        ```{context}```\n\n'

        response = self.get_chat_completion(prompt=prompt, temperature=0, max_tokens=500, show_response=True)
        questions = response.choices[0].message.content
        return (doc_id, questions)

    def batch_generate_question_context_pairs(self,
                                              context_tuple_list: List[Tuple[str, str]],
                                              num_questions_per_chunk: int=2,
                                              max_words_per_question: int=10
                                              ) -> List[Tuple[str, str]]:
        data = []
        progress = tqdm(unit="Generated Questions", total=len(context_tuple_list))
        with ThreadPoolExecutor(max_workers=2*os.cpu_count()) as exec:
            futures = [exec.submit(self.generate_question_context_pairs, context_tuple, num_questions_per_chunk, max_words_per_question) for context_tuple in context_tuple_list]
            for future in as_completed(futures):
                result = future.result()
                if result:
                    data.append(result)
                    progress.update(1)
        return data

    def get_embedding(self):
        pass

    def write_to_file(self, file_handle, data: str) -> None:
        file_handle.write(data)
        file_handle.write('\n')
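A short usage sketch for the class above, assuming OPENAI_API_KEY is available via the .env file; the model id and prompt text are only illustrative:

```
# minimal sketch: single chat completion through the wrapper
gpt = GPT_Turbo(model='gpt-3.5-turbo-0613')
answer = gpt.get_chat_completion(prompt='Resume en una frase qué son los Impuestos Especiales.',
                                 temperature=0,
                                 max_tokens=100)
print(answer)
```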
utils/preprocessing.py
ADDED
@@ -0,0 +1,123 @@
import os
import json
import pandas as pd
from typing import List, Union, Dict
from loguru import logger
import pathlib


## Set of helper functions that support data preprocessing
class FileIO:
    '''
    Convenience class for saving and loading data in parquet and
    json formats to/from disk.
    '''

    def save_as_parquet(self,
                        file_path: str,
                        data: Union[List[dict], pd.DataFrame],
                        overwrite: bool=False) -> None:
        '''
        Saves DataFrame to disk as a parquet file. Removes the index.

        Args:
        -----
        file_path : str
            Output path to save file; if not included, "parquet" will be appended
            as the file extension.
        data : Union[List[dict], pd.DataFrame]
            Data to save as parquet file. If data is a list of dicts, it will be
            converted to a DataFrame before saving.
        overwrite : bool
            Overwrite existing file if True, otherwise raise FileExistsError.
        '''
        if isinstance(data, list):
            data = self._convert_toDataFrame(data)
        if not file_path.endswith('parquet'):
            file_path = self._rename_file_extension(file_path, 'parquet')
        self._check_file_path(file_path, overwrite=overwrite)
        data.to_parquet(file_path, index=False)
        logger.info(f'DataFrame saved as parquet file here: {file_path}')

    def _convert_toDataFrame(self, data: List[dict]) -> pd.DataFrame:
        return pd.DataFrame().from_dict(data)

    def _rename_file_extension(self, file_path: str, extension: str):
        '''
        Renames file with appropriate extension if file_path
        does not already have the correct extension.
        '''
        prefix = os.path.splitext(file_path)[0]
        file_path = prefix + '.' + extension
        return file_path

    def _check_file_path(self, file_path: str, overwrite: bool) -> None:
        '''
        Checks for existence of file and overwrite permissions.
        '''
        if os.path.exists(file_path) and overwrite == False:
            raise FileExistsError(f'File by name {file_path} already exists, try using another file name or set overwrite to True.')
        elif os.path.exists(file_path):
            os.remove(file_path)
        else:
            file_name = os.path.basename(file_path)
            dir_structure = file_path.replace(file_name, '')
            pathlib.Path(dir_structure).mkdir(parents=True, exist_ok=True)

    def load_parquet(self, file_path: str, verbose: bool=True) -> List[dict]:
        '''
        Loads parquet from disk, converts to pandas DataFrame as intermediate
        step and outputs a list of dicts (docs).
        '''
        df = pd.read_parquet(file_path)
        vector_labels = ['content_vector', 'image_vector', 'content_embedding']
        for label in vector_labels:
            if label in df.columns:
                df[label] = df[label].apply(lambda x: x.tolist())
        if verbose:
            memory_usage = round(df.memory_usage().sum()/(1024*1024),2)
            print(f'Shape of data: {df.values.shape}')
            print(f'Memory Usage: {memory_usage}+ MB')
        list_of_dicts = df.to_dict('records')
        return list_of_dicts

    def load_json(self, file_path: str):
        '''
        Loads json file from disk.
        '''
        with open(file_path) as f:
            data = json.load(f)
        return data

    def save_as_json(self,
                     file_path: str,
                     data: Union[List[dict], dict],
                     indent: int=4,
                     overwrite: bool=False
                     ) -> None:
        '''
        Saves data to disk as a json file. Data can be a list of dicts or a single dict.
        '''
        if not file_path.endswith('json'):
            file_path = self._rename_file_extension(file_path, 'json')
        self._check_file_path(file_path, overwrite=overwrite)
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=indent)
        logger.info(f'Data saved as json file here: {file_path}')

class Utilities:

    def create_video_url(self, video_id: str, playlist_id: str):
        '''
        Creates a hyperlink to a video episode given a video_id and playlist_id.

        Args:
        -----
        video_id : str
            Video id of the episode from YouTube
        playlist_id : str
            Playlist id of the episode from YouTube
        '''
        return f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'
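A minimal sketch of a save/load round trip with FileIO; the file path and document fields are invented for illustration:

```
# sketch: persist preprocessed chunks, then reload them as a list of dicts
io = FileIO()
docs = [{'doc_id': '123', 'content': 'texto de ejemplo', 'content_embedding': [0.1, 0.2, 0.3]}]
io.save_as_parquet('./data/impuestos_chunks.parquet', docs, overwrite=True)
reloaded = io.load_parquet('./data/impuestos_chunks.parquet')  # embeddings come back as plain lists
```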
utils/prompt_templates_spa.py
ADDED
@@ -0,0 +1,26 @@
question_answering_prompt_series_spa = '''
Su tarea es sintetizar y razonar sobre una serie de contenidos proporcionados.
Después de su síntesis, utilice estos contenidos para responder a la pregunta a continuación. La serie estará en el siguiente formato:\n
```
RESUMEN: <summary>
DOCUMENTO: <document>
CONTENIDO: <transcript>
```\n\n
Inicio de la Serie:
```
{series}
```
Pregunta:\n
{question}\n
Responda a la pregunta y proporcione razonamientos si es necesario para explicar la respuesta.
Si el contexto no proporciona suficiente información para responder a la pregunta, entonces
indique que no puede responder a la pregunta con el contexto proporcionado.

Respuesta:
'''

context_block_spa = '''
RESUMEN: {summary}
DOCUMENTO: {document}
CONTENIDO: {transcript}
'''
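A sketch of how these two templates fit together: each retrieved chunk is rendered with context_block_spa and the concatenation fills the {series} slot of the QA prompt. The mapping from result fields (page_summary, document_title, content) is an assumption for illustration:

```
# sketch: build the {series} block from ranked results (field values are invented)
results = [{'page_summary': 'Resumen del capítulo...', 'document_title': 'Ley 38/1992', 'content': 'Texto del fragmento...'}]
series = ''.join(context_block_spa.format(summary=r['page_summary'],
                                          document=r['document_title'],
                                          transcript=r['content'])
                 for r in results)
prompt = question_answering_prompt_series_spa.format(series=series,
                                                     question='¿Qué productos están sujetos al impuesto?')
```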
utils/reranker_spa.py
ADDED
@@ -0,0 +1,89 @@
from sentence_transformers import CrossEncoder
from torch.nn import Sigmoid
from typing import List, Union
import numpy as np
from loguru import logger

class ReRanker(CrossEncoder):
    '''
    Cross-Encoder models achieve higher performance than Bi-Encoders,
    however, they do not scale well to large datasets. The lack of scalability
    is due to the underlying cross-attention mechanism, which is computationally
    expensive. Thus a Bi-Encoder is best used for 1st-stage document retrieval and
    a Cross-Encoder is used to re-rank the retrieved documents.

    https://www.sbert.net/examples/applications/cross-encoder/README.html
    '''

    def __init__(self,
                 model_name: str='cross-encoder/ms-marco-MiniLM-L-6-v2',
                 **kwargs
                 ):
        super().__init__(model_name=model_name,
                         **kwargs)
        self.model_name = model_name
        self.score_field = 'cross_score'
        self.activation_fct = Sigmoid()

    def _cross_encoder_score(self,
                             results: List[dict],
                             query: str,
                             hit_field: str='content',
                             apply_sigmoid: bool=True,
                             return_scores: bool=False
                             ) -> Union[np.array, None]:
        '''
        Given a list of hits from a Retriever:
        1. Scores hits by passing query and results through CrossEncoder model.
        2. Adds cross-score key to results dictionary.
        3. If desired, returns np.array of Cross Encoder scores.
        '''
        activation_fct = self.activation_fct if apply_sigmoid else None
        #build query/content list
        cross_inp = [[query, hit[hit_field]] for hit in results]
        #get scores
        cross_scores = self.predict(cross_inp, activation_fct=activation_fct)
        for i, result in enumerate(results):
            result[self.score_field] = cross_scores[i]

        if return_scores:
            return cross_scores

    def rerank(self,
               results: List[dict],
               query: str,
               top_k: int=10,
               apply_sigmoid: bool=True,
               threshold: float=None
               ) -> List[dict]:
        '''
        Given a list of hits from a Retriever:
        1. Scores hits by passing query and results through CrossEncoder model.
        2. Adds cross_score key to results dictionary.
        3. Returns reranked results limited by either a threshold value or top_k.

        Args:
        -----
        results : List[dict]
            List of results from the Weaviate client
        query : str
            User query
        top_k : int=10
            Number of results to return
        apply_sigmoid : bool=True
            Whether to apply sigmoid activation to cross-encoder scores. If False,
            returns raw cross-encoder scores (logits).
        threshold : float=None
            Minimum cross-encoder score to return. If no hits are above threshold,
            returns top_k hits.
        '''
        # Sort results by the cross-encoder scores
        self._cross_encoder_score(results=results, query=query, apply_sigmoid=apply_sigmoid)

        sorted_hits = sorted(results, key=lambda x: x[self.score_field], reverse=True)
        if threshold or threshold == 0:
            filtered_hits = [hit for hit in sorted_hits if hit[self.score_field] >= threshold]
            if not any(filtered_hits):
                logger.warning(f'No hits above threshold {threshold}. Returning top {top_k} hits.')
                return sorted_hits[:top_k]
            return filtered_hits
        return sorted_hits[:top_k]
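A self-contained sketch of reranking a handful of hits with the class above; the hit contents, query, and threshold are invented for illustration:

```
# sketch: rerank two candidate chunks against a Spanish query
reranker = ReRanker()  # defaults to cross-encoder/ms-marco-MiniLM-L-6-v2
hits = [{'doc_id': 'a', 'content': 'El impuesto sobre la cerveza grava la fabricación de cerveza...'},
        {'doc_id': 'b', 'content': 'Texto no relacionado con impuestos.'}]
top_hits = reranker.rerank(hits,
                           query='¿Qué grava el impuesto sobre la cerveza?',
                           top_k=2,
                           threshold=0.5)
# each hit now carries a 'cross_score' key, sorted descending
```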
utils/retrieval_evaluation_spa.py
ADDED
@@ -0,0 +1,332 @@
#external files
from openai_interface_spa import GPT_Turbo
from weaviate_interface_v3_spa import WeaviateClient
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from templates.prompt_templates_spa import qa_generation_prompt
from reranker_spa import ReRanker

#standard library imports
import json
import time
import uuid
import os
import re
import random
from datetime import datetime
from typing import List, Dict, Tuple, Union, Literal

#misc
from tqdm import tqdm


class QueryContextGenerator:
    '''
    Class designed for the generation of query/context pairs using a
    Generative LLM. The LLM is used to generate questions from a given
    corpus of text. The query/context pairs can be used to fine-tune
    an embedding model using a MultipleNegativesRankingLoss loss function
    or can be used to create evaluation datasets for retrieval models.
    '''
    def __init__(self, openai_key: str, model_id: str='gpt-3.5-turbo-0613'):
        self.llm = GPT_Turbo(model=model_id, api_key=openai_key)

    def clean_validate_data(self,
                            data: List[dict],
                            valid_fields: List[str]=['content', 'summary', 'guest', 'doc_id'],
                            total_chars: int=950
                            ) -> List[dict]:
        '''
        Strip original data chunks so they only contain valid_fields.
        Remove any chunks less than total_chars in size. Prevents the LLM
        from asking questions about sparse content.
        '''
        clean_docs = [{k:v for k,v in d.items() if k in valid_fields} for d in data]
        valid_docs = [d for d in clean_docs if len(d['content']) > total_chars]
        return valid_docs

    def train_val_split(self,
                        data: List[dict],
                        n_train_questions: int,
                        n_val_questions: int,
                        n_questions_per_chunk: int=2,
                        total_chars: int=950):
        '''
        Splits corpus into training and validation sets. Training and
        validation samples are randomly selected from the corpus. The total_chars
        parameter is set based on pre-analysis of average doc length in the
        training corpus.
        '''
        clean_data = self.clean_validate_data(data, total_chars=total_chars)
        random.shuffle(clean_data)
        train_index = n_train_questions//n_questions_per_chunk
        valid_index = n_val_questions//n_questions_per_chunk
        end_index = valid_index + train_index
        if end_index > len(clean_data):
            raise ValueError('Cannot create dataset with desired number of questions, try using a larger dataset')
        train_data = clean_data[:train_index]
        valid_data = clean_data[train_index:end_index]
        print(f'Length Training Data: {len(train_data)}')
        print(f'Length Validation Data: {len(valid_data)}')
        return train_data, valid_data

    def generate_qa_embedding_pairs(
        self,
        data: List[dict],
        generate_prompt_tmpl: str=None,
        num_questions_per_chunk: int = 2,
    ) -> EmbeddingQAFinetuneDataset:
        """
        Generate query/context pairs from a list of documents. The query/context pairs
        can be used for fine-tuning an embedding model using a MultipleNegativesRankingLoss
        or can be used to create an evaluation dataset for retrieval models.

        This function was adapted for this course from the llama_index.finetuning.common module:
        https://github.com/run-llama/llama_index/blob/main/llama_index/finetuning/embeddings/common.py
        """
        generate_prompt_tmpl = qa_generation_prompt if not generate_prompt_tmpl else generate_prompt_tmpl
        queries = {}
        relevant_docs = {}
        corpus = {chunk['doc_id'] : chunk['content'] for chunk in data}
        for chunk in tqdm(data):
            page_summary = chunk['page_summary']
            # guest = chunk['guest']
            context_str = chunk['content']
            node_id = chunk['doc_id']
            query = generate_prompt_tmpl.format(page_summary=page_summary,
                                                # guest=guest,
                                                context_str=context_str,
                                                num_questions_per_chunk=num_questions_per_chunk)
            try:
                response = self.llm.get_chat_completion(prompt=query, temperature=0.1, max_tokens=100)
            except Exception as e:
                print(e)
                continue
            result = str(response).strip().split("\n")
            questions = [
                re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
            ]
            questions = [question for question in questions if len(question) > 0]

            for question in questions:
                question_id = str(uuid.uuid4())
                queries[question_id] = question
                relevant_docs[question_id] = [node_id]

        # construct dataset
        return EmbeddingQAFinetuneDataset(
            queries=queries, corpus=corpus, relevant_docs=relevant_docs
        )

def execute_evaluation(dataset: EmbeddingQAFinetuneDataset,
                       class_name: str,
                       retriever: WeaviateClient,
                       reranker: ReRanker=None,
                       alpha: float=0.5,
                       retrieve_limit: int=100,
                       top_k: int=5,
                       chunk_size: int=256,
                       hnsw_config_keys: List[str]=['maxConnections', 'efConstruction', 'ef'],
                       search_type: Literal['kw', 'vector', 'hybrid', 'all']='all',
                       display_properties: List[str]=['doc_id', 'content'],
                       dir_outpath: str='./eval_results',
                       include_miss_info: bool=False,
                       user_def_params: dict=None
                       ) -> Union[dict, Tuple[dict, List[dict]]]:
    '''
    Given a dataset, a retriever, and a reranker, evaluate the performance of the retriever and reranker.
    Returns a dict of kw, vector, and hybrid hit rates and mrr scores. If include_miss_info is True, will
    also return a list of kw and vector responses and their associated queries that did not return a hit.

    Args:
    -----
    dataset: EmbeddingQAFinetuneDataset
        Dataset to be used for evaluation
    class_name: str
        Name of Class on Weaviate host to be used for retrieval
    retriever: WeaviateClient
        WeaviateClient object to be used for retrieval
    reranker: ReRanker
        ReRanker model to be used for results reranking
    alpha: float=0.5
        Weighting factor for BM25 and Vector search.
        alpha can be any number from 0 to 1, defaulting to 0.5:
            alpha = 0 executes a pure keyword search method (BM25)
            alpha = 0.5 weighs the BM25 and vector methods evenly
            alpha = 1 executes a pure vector search method
    retrieve_limit: int=100
        Number of documents to retrieve from Weaviate host
    top_k: int=5
        Number of top results to evaluate
    chunk_size: int=256
        Number of tokens used to chunk text
    hnsw_config_keys: List[str]=['maxConnections', 'efConstruction', 'ef']
        List of keys to be used for retrieving HNSW Index parameters from Weaviate host
    search_type: Literal['kw', 'vector', 'hybrid', 'all']='all'
        Type of search to be evaluated. Options are 'kw', 'vector', 'hybrid', or 'all'
    display_properties: List[str]=['doc_id', 'content']
        List of properties to be returned from Weaviate host for display in response
    dir_outpath: str='./eval_results'
        Directory path for saving results. Directory will be created if it does not
        already exist.
    include_miss_info: bool=False
        Option to include queries and their associated search response values
        for queries that are "total misses"
    user_def_params : dict=None
        Option for user to pass in a dictionary of user-defined parameters and their values.
        Will be automatically added to the results_dict if correct type is passed.
    '''

    reranker_name = reranker.model_name if reranker else "None"

    results_dict = {'n':retrieve_limit,
                    'top_k': top_k,
                    'alpha': alpha,
                    'Retriever': retriever.model_name_or_path,
                    'Ranker': reranker_name,
                    'chunk_size': chunk_size,
                    'kw_hit_rate': 0,
                    'kw_mrr': 0,
                    'vector_hit_rate': 0,
                    'vector_mrr': 0,
                    'hybrid_hit_rate':0,
                    'hybrid_mrr': 0,
                    'total_misses': 0,
                    'total_questions':0
                    }
    #add extra params to results_dict
    results_dict = add_params(retriever, class_name, results_dict, user_def_params, hnsw_config_keys)

    start = time.perf_counter()
    miss_info = []
    for query_id, q in tqdm(dataset.queries.items(), 'Queries'):
        results_dict['total_questions'] += 1
        hit = False
        #make Keyword, Vector, and Hybrid calls to Weaviate host
        try:
            kw_response = retriever.keyword_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
            vector_response = retriever.vector_search(request=q, class_name=class_name, limit=retrieve_limit, display_properties=display_properties)
            hybrid_response = retriever.hybrid_search(request=q, class_name=class_name, alpha=alpha, limit=retrieve_limit, display_properties=display_properties)
            #rerank returned responses if reranker is provided
            if reranker:
                kw_response = reranker.rerank(kw_response, q, top_k=top_k)
                vector_response = reranker.rerank(vector_response, q, top_k=top_k)
                hybrid_response = reranker.rerank(hybrid_response, q, top_k=top_k)

            #collect doc_ids to check for document matches (include only results_top_k)
            kw_doc_ids = {result['doc_id']:i for i, result in enumerate(kw_response[:top_k], 1)}
            vector_doc_ids = {result['doc_id']:i for i, result in enumerate(vector_response[:top_k], 1)}
            hybrid_doc_ids = {result['doc_id']:i for i, result in enumerate(hybrid_response[:top_k], 1)}

            #extract doc_id for scoring purposes
            doc_id = dataset.relevant_docs[query_id][0]

            #increment hit_rate counters and mrr scores
            if doc_id in kw_doc_ids:
                results_dict['kw_hit_rate'] += 1
                results_dict['kw_mrr'] += 1/kw_doc_ids[doc_id]
                hit = True
            if doc_id in vector_doc_ids:
                results_dict['vector_hit_rate'] += 1
                results_dict['vector_mrr'] += 1/vector_doc_ids[doc_id]
                hit = True
            if doc_id in hybrid_doc_ids:
                results_dict['hybrid_hit_rate'] += 1
                results_dict['hybrid_mrr'] += 1/hybrid_doc_ids[doc_id]
                hit = True
            # if no hits, let's capture that
            if not hit:
                results_dict['total_misses'] += 1
                miss_info.append({'query': q,
                                  'answer': dataset.corpus[doc_id],
                                  'doc_id': doc_id,
                                  'kw_response': kw_response,
                                  'vector_response': vector_response,
                                  'hybrid_response': hybrid_response})
        except Exception as e:
            print(e)
            continue

    #use raw counts to calculate final scores
    calc_hit_rate_scores(results_dict, search_type=search_type)
    calc_mrr_scores(results_dict, search_type=search_type)

    end = time.perf_counter() - start
    print(f'Total Processing Time: {round(end/60, 2)} minutes')
    record_results(results_dict, chunk_size, dir_outpath=dir_outpath, as_text=True)

    if include_miss_info:
        return results_dict, miss_info
    return results_dict

def calc_hit_rate_scores(results_dict: Dict[str, Union[str, int]],
                         search_type: Literal['kw', 'vector', 'hybrid', 'all']=['kw', 'vector']
                         ) -> None:
    if search_type == 'all':
        search_type = ['kw', 'vector', 'hybrid']
    for prefix in search_type:
        results_dict[f'{prefix}_hit_rate'] = round(results_dict[f'{prefix}_hit_rate']/results_dict['total_questions'],2)

def calc_mrr_scores(results_dict: Dict[str, Union[str, int]],
                    search_type: Literal['kw', 'vector', 'hybrid', 'all']=['kw', 'vector']
                    ) -> None:
    if search_type == 'all':
        search_type = ['kw', 'vector', 'hybrid']
    for prefix in search_type:
        results_dict[f'{prefix}_mrr'] = round(results_dict[f'{prefix}_mrr']/results_dict['total_questions'],2)

def create_dir(dir_path: str) -> None:
    '''
    Checks if directory exists, and creates new directory
    if it does not exist.
    '''
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def record_results(results_dict: Dict[str, Union[str, int]],
                   chunk_size: int,
                   dir_outpath: str='./eval_results',
                   as_text: bool=False
                   ) -> None:
    '''
    Write results to output file in either txt or json format

    Args:
    -----
    results_dict: Dict[str, Union[str, int]]
        Dictionary containing results of evaluation
    chunk_size: int
        Size of text chunks in tokens
    dir_outpath: str
        Path to output directory. Directory only, filename is hardcoded
        as part of this function.
    as_text: bool
        If True, write results as text file. If False, write as json file.
    '''
    create_dir(dir_outpath)
    time_marker = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    ext = 'txt' if as_text else 'json'
    path = os.path.join(dir_outpath, f'retrieval_eval_{chunk_size}_{time_marker}.{ext}')
    if as_text:
        with open(path, 'a') as f:
            f.write(f"{results_dict}\n")
    else:
        with open(path, 'w') as f:
            json.dump(results_dict, f, indent=4)

def add_params(client: WeaviateClient,
               class_name: str,
               results_dict: dict,
               param_options: dict,
               hnsw_config_keys: List[str]
               ) -> dict:
    hnsw_params = {k:v for k,v in client.show_class_config(class_name)['vectorIndexConfig'].items() if k in hnsw_config_keys}
    if hnsw_params:
        results_dict = {**results_dict, **hnsw_params}
    if param_options and isinstance(param_options, dict):
        results_dict = {**results_dict, **param_options}
    return results_dict
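A hedged end-to-end sketch of running this evaluation; the endpoint, class name, dataset path, and the use of EmbeddingQAFinetuneDataset.from_json are assumptions for illustration, not values taken from this repo:

```
# sketch: evaluate keyword / vector / hybrid retrieval on a golden dataset (assumed paths and names)
client = WeaviateClient(endpoint='http://localhost:8080')
reranker = ReRanker()
dataset = EmbeddingQAFinetuneDataset.from_json('./data/golden_dataset.json')  # assumed llama_index loader
results = execute_evaluation(dataset,
                             class_name='Impuestos',
                             retriever=client,
                             reranker=reranker,
                             alpha=0.5,
                             retrieve_limit=100,
                             top_k=5)
print(results['hybrid_hit_rate'], results['hybrid_mrr'])
```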
utils/system_prompts.py
ADDED
@@ -0,0 +1,72 @@
chatgpt = '''
You are a helpful assistant.
'''

professor_synapse = '''
Act as Professor Synapse 🧙🏾‍♂️, a conductor of expert agents. Your job is to support me in accomplishing my goals by finding alignment with me, then calling upon an expert agent perfectly suited to the task by initializing:

Synapse_CoR = "[emoji]: I am an expert in [role&domain]. I know [context]. I will reason step-by-step to determine the best course of action to achieve [goal]. I can use [tools] and [relevant frameworks] to help in this process.

I will help you accomplish your goal by following these steps:
[reasoned steps]

My task ends when [completion].

[first step, question]"

Instructions:

1. 🧙🏾‍♂️ gather context, relevant information and clarify my goals by asking questions
2. Once confirmed, initialize Synapse_CoR
3. 🧙🏾‍♂️ and [emoji] support me until goal is complete

Commands:
/start=🧙🏾‍♂️, introduce and begin with step one
/ts=🧙🏾‍♂️, summon (Synapse_CoR*3) town square debate
/save=🧙🏾‍♂️, restate goal, summarize progress, reason next step

Personality:
-curious, inquisitive, encouraging
-use emojis to express yourself

Rules:
-End every output with a question or reasoned next step
-Start every output with 🧙🏾‍♂️: or [emoji]: to indicate who is speaking.
-Organize every output "🧙🏾‍♂️: [aligning on my goal], [emoji]: [actionable response]"
-🧙🏾‍♂️, recommend save after each task is completed
'''

marketing_jane = '''
Act as Marketing Jane 👩🏼‍💼, a strategist adept at melding analytics with creative zest. With mastery over data-driven marketing and an innate knack for storytelling, your mission is to carve out distinctive marketing strategies, from fledgling startups to seasoned giants.

Your strategy formulation entails:
- Understanding the business's narrative, competitive landscape, and audience psyche.
- Crafting a data-informed marketing roadmap, encompassing various channels, and innovative tactics.
- Leveraging storytelling to forge brand engagement and pioneering avant-garde campaigns.

Your endeavor culminates when the user possesses a dynamic, data-enriched marketing strategy, resonating with their business ethos.

Steps:
1. 👩🏼‍💼, Grasp the business's ethos, objectives, and challenges
2. Design a data-backed marketing strategy, resonating with audience sentiments and business goals
3. Engage in feedback loops, iteratively refining the strategy

Commands:
/explore - Modify the strategic focus or delve deeper into specific marketing nuances
/save - Chronicle progress, dissect strategy elements, and chart future endeavors
/critic - 👩🏼‍💼 seeks insights from fellow marketing aficionados
/reason - 👩🏼‍💼 and user collaboratively weave the marketing narrative
/new - Ignite a fresh strategic quest for a new venture or campaign

Rules:
- Culminate with an evocative campaign concept or the next strategic juncture
- Preface with 👩🏼‍💼: for clarity
- Integrate data insights with creative innovation
'''

# Define a dictionary to map the emojis to the variables
prompt_mapping = {
    "🤖ChatGPT": chatgpt,
    "🧙🏾‍♂️Professor Synapse": professor_synapse,
    "👩🏼‍💼Marketing Jane": marketing_jane,
}
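A small sketch of how prompt_mapping might drive a persona picker in the Streamlit pages; the widget label and sidebar placement are assumptions:

```
# sketch: let the user pick a system prompt by persona name
import streamlit as st

choice = st.sidebar.selectbox('Asistente', options=list(prompt_mapping.keys()))
system_prompt = prompt_mapping[choice]  # pass this as the system message to the LLM call
```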
utils/weaviate_interface_v3_spa.py
ADDED
@@ -0,0 +1,436 @@
from weaviate import Client, AuthApiKey
from dataclasses import dataclass
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from typing import List, Union, Callable
from torch import cuda
from tqdm import tqdm
import time

class WeaviateClient(Client):
    '''
    A python native Weaviate Client class that encapsulates Weaviate functionalities
    in one object. Several convenience methods are added for ease of use.

    Args
    ----
    api_key: str
        The API key for the Weaviate Cloud Service (WCS) instance.
        https://console.weaviate.cloud/dashboard

    endpoint: str
        The url endpoint for the Weaviate Cloud Service instance.

    model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2'
        The name or path of the SentenceTransformer model to use for vector search.
        Will also support OpenAI text-embedding-ada-002 model. This param enables
        the use of most leading models on MTEB Leaderboard:
        https://huggingface.co/spaces/mteb/leaderboard
    openai_api_key: str=None
        The API key for the OpenAI API. Only required if using OpenAI text-embedding-ada-002 model.
    '''
    def __init__(self,
                 endpoint: str,
                 api_key: str = None,  # Make the api_key optional
                 model_name_or_path: str = 'sentence-transformers/all-MiniLM-L6-v2',
                 openai_api_key: str = None,
                 **kwargs
                 ):
        if api_key:  # Only use AuthApiKey if api_key is provided
            auth_config = AuthApiKey(api_key=api_key)
            super().__init__(auth_client_secret=auth_config, url=endpoint, **kwargs)
        else:
            super().__init__(url=endpoint, **kwargs)

        self.model_name_or_path = model_name_or_path
        self.openai_model = False
        if self.model_name_or_path == 'text-embedding-ada-002':
            if not openai_api_key:
                raise ValueError(f'OpenAI API key must be provided to use this model: {self.model_name_or_path}')
            self.model = OpenAI(api_key=openai_api_key)
            self.openai_model = True
        else:
            self.model = SentenceTransformer(self.model_name_or_path) if self.model_name_or_path else None

        self.display_properties = ['file_name', 'page_label', 'document_title', 'page_summary', 'page_url', 'doc_id', \
                                   'content']

    def show_classes(self) -> Union[List[str], str]:
        '''
        Shows all available classes (indexes) on the Weaviate instance.
        '''
        schema = self.schema.get()
        if 'classes' in schema:
            return [cls['class'] for cls in schema['classes']]
        else:
            return "No classes found on cluster."

    def show_class_info(self) -> Union[List[dict], str]:
        '''
        Shows all information related to the classes (indexes) on the Weaviate instance.
        '''
        schema = self.schema.get()
        if 'classes' in schema:
            return schema['classes']
        else:
            return "No classes found on cluster."

    def show_class_properties(self, class_name: str) -> Union[dict, str]:
        '''
        Shows all properties of a class (index) on the Weaviate instance.
        '''
        classes = self.schema.get()
        if classes:
            all_classes = classes['classes']
            for d in all_classes:
                if d['class'] == class_name:
                    return d['properties']
            return f'Class "{class_name}" not found on host'
        return 'No Classes found on host'

    def show_class_config(self, class_name: str) -> Union[dict, str]:
        '''
        Shows all configuration of a class (index) on the Weaviate instance.
        '''
        classes = self.schema.get()
        if classes:
            all_classes = classes['classes']
            for d in all_classes:
                if d['class'] == class_name:
                    return d
            return f'Class "{class_name}" not found on host'
        return 'No Classes found on host'

    def delete_class(self, class_name: str) -> str:
        '''
        Deletes a class (index) on the Weaviate instance, if it exists.
        '''
        available = self._check_class_availability(class_name)
        if isinstance(available, bool):
            if available:
                self.schema.delete_class(class_name)
                not_deleted = self._check_class_availability(class_name)
                if isinstance(not_deleted, bool):
                    if not_deleted:
                        return f'Class "{class_name}" was not deleted. Try again.'
                    else:
                        return f'Class "{class_name}" deleted'
                return f'Class "{class_name}" deleted and there are no longer any classes on host'
            return f'Class "{class_name}" not found on host'
        return available

    def _check_class_availability(self, class_name: str) -> Union[bool, str]:
        '''
        Checks if a class (index) exists on the Weaviate instance.
        '''
        classes = self.schema.get()
        if classes:
            all_classes = classes['classes']
            for d in all_classes:
                if d['class'] == class_name:
                    return True
            return False
        else:
            return 'No Classes found on host'

    def format_response(self,
                        response: dict,
                        class_name: str
                        ) -> List[dict]:
        '''
        Formats json response from Weaviate into a list of dictionaries.
        Expands _additional fields if present into top-level dictionary.
        '''
        if response.get('errors'):
            return response['errors'][0]['message']
        results = []
        hits = response['data']['Get'][class_name]
        for d in hits:
            temp = {k:v for k,v in d.items() if k != '_additional'}
            if d.get('_additional'):
                for key in d['_additional']:
                    temp[key] = d['_additional'][key]
            results.append(temp)
        return results

    def update_ef_value(self, class_name: str, ef_value: int) -> str:
        '''
        Updates ef_value for a class (index) on the Weaviate instance.
        '''
        self.schema.update_config(class_name=class_name, config={'vectorIndexConfig': {'ef': ef_value}})
        print(f'ef_value updated to {ef_value} for class {class_name}')
        return self.show_class_config(class_name)['vectorIndexConfig']

    def keyword_search(self,
                       request: str,
                       class_name: str,
                       properties: List[str]=['content'],
                       limit: int=10,
                       where_filter: dict=None,
                       display_properties: List[str]=None,
                       return_raw: bool=False) -> Union[dict, List[dict]]:
        '''
        Executes Keyword (BM25) search.

        Args
        ----
        request: str
            User query.
        class_name: str
            Class (index) to search.
        properties: List[str]
            List of properties to search across.
        limit: int=10
            Number of results to return.
        display_properties: List[str]=None
            List of properties to return in response.
            If None, returns all properties.
        return_raw: bool=False
            If True, returns raw response from Weaviate.
        '''
        display_properties = display_properties if display_properties else self.display_properties
        response = (self.query
                    .get(class_name, display_properties)
                    .with_bm25(query=request, properties=properties)
                    .with_additional(['score', "id"])
                    .with_limit(limit)
                    )
        response = response.with_where(where_filter).do() if where_filter else response.do()
        if return_raw:
            return response
        else:
            return self.format_response(response, class_name)

    def vector_search(self,
                      request: str,
                      class_name: str,
                      limit: int=10,
                      where_filter: dict=None,
                      display_properties: List[str]=None,
                      return_raw: bool=False,
                      device: str='cuda:0' if cuda.is_available() else 'cpu'
                      ) -> Union[dict, List[dict]]:
        '''
        Executes vector search using the embedding model defined on instantiation
        of the WeaviateClient instance.

        Args
        ----
        request: str
            User query.
        class_name: str
            Class (index) to search.
        limit: int=10
            Number of results to return.
        display_properties: List[str]=None
            List of properties to return in response.
            If None, returns all properties.
        return_raw: bool=False
            If True, returns raw response from Weaviate.
        '''
        display_properties = display_properties if display_properties else self.display_properties
        query_vector = self._create_query_vector(request, device=device)
        response = (
            self.query
            .get(class_name, display_properties)
            .with_near_vector({"vector": query_vector})
            .with_limit(limit)
            .with_additional(['distance'])
            )
        response = response.with_where(where_filter).do() if where_filter else response.do()
        if return_raw:
            return response
        else:
            return self.format_response(response, class_name)

    def _create_query_vector(self, query: str, device: str) -> List[float]:
        '''
        Creates embedding vector from text query.
        '''
        return self.get_openai_embedding(query) if self.openai_model else self.model.encode(query, device=device).tolist()

    def get_openai_embedding(self, query: str) -> List[float]:
        '''
        Gets embedding from OpenAI API for query.
        '''
        embedding = self.model.embeddings.create(input=query, model='text-embedding-ada-002').model_dump()
        if embedding:
            return embedding['data'][0]['embedding']
        else:
            raise ValueError(f'No embedding found for query: {query}')

    def hybrid_search(self,
                      request: str,
                      class_name: str,
                      properties: List[str]=['content'],
                      alpha: float=0.5,
                      limit: int=10,
                      where_filter: dict=None,
                      display_properties: List[str]=None,
                      return_raw: bool=False,
                      device: str='cuda:0' if cuda.is_available() else 'cpu'
                      ) -> Union[dict, List[dict]]:
        '''
        Executes Hybrid (BM25 + Vector) search.

        Args
        ----
        request: str
            User query.
        class_name: str
            Class (index) to search.
        properties: List[str]
            List of properties to search across (using BM25)
        alpha: float=0.5
            Weighting factor for BM25 and Vector search.
            alpha can be any number from 0 to 1, defaulting to 0.5:
                alpha = 0 executes a pure keyword search method (BM25)
                alpha = 0.5 weighs the BM25 and vector methods evenly
                alpha = 1 executes a pure vector search method
        limit: int=10
            Number of results to return.
        display_properties: List[str]=None
            List of properties to return in response.
            If None, returns all properties.
        return_raw: bool=False
            If True, returns raw response from Weaviate.
        '''
        display_properties = display_properties if display_properties else self.display_properties
        query_vector = self._create_query_vector(request, device=device)
        response = (
            self.query
            .get(class_name, display_properties)
            .with_hybrid(query=request,
                         alpha=alpha,
                         vector=query_vector,
                         properties=properties,
                         fusion_type='relativeScoreFusion')  # hard coded option for now
            .with_additional(["score", "explainScore"])
            .with_limit(limit)
            )

        response = response.with_where(where_filter).do() if where_filter else response.do()
        if return_raw:
            return response
        else:
            return self.format_response(response, class_name)


class WeaviateIndexer:

    def __init__(self,
                 client: WeaviateClient,
                 batch_size: int=150,
                 num_workers: int=4,
                 dynamic: bool=True,
                 creation_time: int=5,
                 timeout_retries: int=3,
                 connection_error_retries: int=3,
                 callback: Callable=None,
                 ):
        '''
        Class designed to batch index documents into Weaviate. Instantiating
        this class will automatically configure the Weaviate batch client.
        '''
        self._client = client
        self._callback = callback if callback else self._default_callback

        self._client.batch.configure(batch_size=batch_size,
                                     num_workers=num_workers,
                                     dynamic=dynamic,
                                     creation_time=creation_time,
                                     timeout_retries=timeout_retries,
                                     connection_error_retries=connection_error_retries,
                                     callback=self._callback
                                     )

    def _default_callback(self, results: dict):
        """
        Check batch results for errors.

        Parameters
        ----------
        results : dict
            The Weaviate batch creation return value.
        """

        if results is not None:
            for result in results:
                if "result" in result and "errors" in result["result"]:
                    if "error" in result["result"]["errors"]:
                        print(result["result"])

    def batch_index_data(self,
                         data: List[dict],
                         class_name: str,
                         vector_property: str='content_embedding'
                         ) -> None:
        '''
        Batch function for fast indexing of data onto Weaviate cluster.
        This method assumes that self._client.batch is already configured.
        '''
        start = time.perf_counter()
        with self._client.batch as batch:
            for d in tqdm(data):

                #define single document
                properties = {k:v for k,v in d.items() if k != vector_property}
                try:
                    #add data object to batch
                    batch.add_data_object(
                        data_object=properties,
                        class_name=class_name,
                        vector=d[vector_property]
                        )
                except Exception as e:
                    print(e)
                    continue

        end = time.perf_counter() - start

        print(f'Batch job completed in {round(end/60, 2)} minutes.')
        # class_info = self._client.show_class_info()
        # for i, c in enumerate(class_info):
        #     if c['class'] == class_name:
        #         print(class_info[i])
        self._client.batch.shutdown()

@dataclass
class WhereFilter:

    '''
    Simplified interface for constructing a WhereFilter object.

    Args
    ----
    path: List[str]
        List of properties to filter on.
    operator: str
        Operator to use for filtering. Options: ['And', 'Or', 'Equal', 'NotEqual',
        'GreaterThan', 'GreaterThanEqual', 'LessThan', 'LessThanEqual', 'Like',
        'WithinGeoRange', 'IsNull', 'ContainsAny', 'ContainsAll']
    value[dataType]: Union[int, bool, str, float, datetime]
        Value to filter on. The dataType suffix must match the data type of the
        property being filtered on. At least and only one value type must be provided.
    '''
    path: List[str]
    operator: str
    valueInt: int=None
    valueBoolean: bool=None
    valueText: str=None
    valueNumber: float=None
    valueDate = None

    def __post_init__(self):
        operators = ['And', 'Or', 'Equal', 'NotEqual','GreaterThan', 'GreaterThanEqual', 'LessThan',\
                     'LessThanEqual', 'Like', 'WithinGeoRange', 'IsNull', 'ContainsAny', 'ContainsAll']
        if self.operator not in operators:
            raise ValueError(f'operator must be one of: {operators}, got {self.operator}')
        # count only the value fields that were actually provided
        values = [v for v in [self.valueInt, self.valueBoolean, self.valueText, self.valueNumber, self.valueDate] if v is not None]
        if len(values) == 0:
            raise ValueError('At least one value must be provided.')
        if len(values) > 1:
            raise ValueError('At most one value can be provided.')

    def todict(self):
        return {k:v for k,v in self.__dict__.items() if v is not None}
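A sketch tying the pieces of this module together: hybrid search with a metadata filter built from WhereFilter. The endpoint, class name, property name, and query are assumptions for illustration:

```
# sketch: filtered hybrid search against a local Weaviate instance (names are assumed)
client = WeaviateClient(endpoint='http://localhost:8080',
                        model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')

where = WhereFilter(path=['document_title'], operator='Equal', valueText='Ley 38/1992').todict()
hits = client.hybrid_search(request='exenciones del impuesto sobre la cerveza',
                            class_name='Impuestos',
                            alpha=0.5,
                            limit=10,
                            where_filter=where)
```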