petrojm committed
Commit a6c26b1 · 1 Parent(s): 08cf07c

add EKR files
app.py ADDED
@@ -0,0 +1,128 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import yaml
5
+ import gradio as gr
6
+ import time
7
+
8
+ current_dir = os.path.dirname(os.path.abspath(__file__))
9
+ print(current_dir)
10
+
11
+ from src.document_retrieval import DocumentRetrieval
12
+ from utils.visual.env_utils import env_input_fields, initialize_env_variables, are_credentials_set, save_credentials
13
+ from utils.parsing.sambaparse import parse_doc_universal # added Petro
14
+ from utils.vectordb.vector_db import VectorDb
15
+
16
+ CONFIG_PATH = os.path.join(current_dir,'config.yaml')
17
+ PERSIST_DIRECTORY = os.path.join(current_dir, "data/my-vector-db") # changed to current_dir
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logging.info("Gradio app is running")
21
+
22
+ class ChatState:
23
+ def __init__(self):
24
+ self.conversation = None
25
+ self.chat_history = []
26
+ self.show_sources = True
27
+ self.sources_history = []
28
+ self.vectorstore = None
29
+ self.input_disabled = True
30
+ self.document_retrieval = None
31
+
32
+ chat_state = ChatState()
33
+
34
+ chat_state.document_retrieval = DocumentRetrieval()
35
+
36
+ def handle_userinput(user_question):
37
+ if user_question:
38
+ try:
39
+ response_time = time.time()
40
+ response = chat_state.conversation.invoke({"question": user_question})
41
+ response_time = time.time() - response_time
42
+ chat_state.chat_history.append((user_question, response["answer"]))
43
+
44
+ #sources = set([f'{sd.metadata["filename"]}' for sd in response["source_documents"]])
45
+ #sources_text = "\n".join([f"{i+1}. {source}" for i, source in enumerate(sources)])
46
+ #state.sources_history.append(sources_text)
47
+
48
+ return chat_state.chat_history, "" #, state.sources_history
49
+ except Exception as e:
50
+ return chat_state.chat_history + [(user_question, f"An error occurred: {str(e)}")], "" #, state.sources_history
51
+ return chat_state.chat_history, "" #, state.sources_history
52
+
53
+ def process_documents(files, save_location=None):
54
+ try:
55
+ #for doc in files:
56
+ _, _, text_chunks = parse_doc_universal(doc=files)
57
+ print(text_chunks)
58
+ #text_chunks = chat_state.document_retrieval.parse_doc(files)
59
+ embeddings = chat_state.document_retrieval.load_embedding_model()
60
+ collection_name = 'ekr_default_collection' if not config['prod_mode'] else None
61
+ vectorstore = chat_state.document_retrieval.create_vector_store(text_chunks, embeddings, output_db=save_location, collection_name=collection_name)
62
+ chat_state.vectorstore = vectorstore
63
+ chat_state.document_retrieval.init_retriever(vectorstore)
64
+ chat_state.conversation = chat_state.document_retrieval.get_qa_retrieval_chain()
65
+ chat_state.input_disabled = False
66
+ return "Documents processed successfully. You can now ask questions."
67
+ except Exception as e:
68
+ return f"An error occurred while processing: {str(e)}"
69
+
70
+ def reset_conversation():
71
+ chat_state.chat_history = []
72
+ #chat_state.sources_history = []
73
+ return chat_state.chat_history, ""
74
+
75
+ def show_selection(model):
76
+ return f"You selected: {model}"
77
+
78
+ # Read config file
79
+ with open(CONFIG_PATH, 'r') as yaml_file:
80
+ config = yaml.safe_load(yaml_file)
81
+
82
+ prod_mode = config.get('prod_mode', False)
83
+ default_collection = 'ekr_default_collection'
84
+
85
+ # Load env variables
86
+ initialize_env_variables(prod_mode)
87
+
88
+ caution_text = """⚠️ Note: depending on the size of your document, this could take several minutes.
89
+ """
90
+
91
+ with gr.Blocks() as demo:
92
+ #gr.Markdown("# SambaNova Analyst Assistant") # title
93
+ gr.Markdown("# 🟠 SambaNova Analyst Assistant",
94
+ elem_id="title")
95
+
96
+ gr.Markdown("Powered by SambaNova Cloud. Get your API key [here](https://cloud.sambanova.ai/apis).")
97
+
98
+ api_key = gr.Textbox(label="API Key", type="password", placeholder="(Optional) Enter your API key here for more availability")
99
+
100
+ # Step 1: Add PDF file
101
+ gr.Markdown("## 1️⃣ Pick a datasource")
102
+ docs = gr.File(label="Add PDF file", file_types=["pdf"], file_count="single")
103
+
104
+ # Step 2: Process PDF file
105
+ gr.Markdown("## 2️⃣ Process your documents and create vector store")
106
+ process_btn = gr.Button("🔄 Process")
107
+ gr.Markdown(caution_text)
108
+ setup_output = gr.Textbox(label="Setup Output", visible=True)
109
+
110
+ process_btn.click(process_documents, inputs=[docs], outputs=setup_output, concurrency_limit=10)
111
+ #process_save_btn.click(process_documents, inputs=[file_upload, save_location], outputs=setup_output)
112
+ #load_db_btn.click(load_existing_db, inputs=[db_path], outputs=setup_output)
113
+
114
+ # Step 3: Chat with your data
115
+ gr.Markdown("## 3️⃣ Chat")
116
+ chatbot = gr.Chatbot(label="Chatbot", show_label=True, show_share_button=False, show_copy_button=True, likeable=True)
117
+ msg = gr.Textbox(label="Ask questions about your data", placeholder="Enter your message...")
118
+ clear = gr.Button("Clear chat")
119
+ #show_sources = gr.Checkbox(label="Show sources", value=True)
120
+ sources_output = gr.Textbox(label="Sources", visible=False)
121
+
122
+ #msg.submit(handle_userinput, inputs=[msg], outputs=[chatbot, sources_output])
123
+ msg.submit(handle_userinput, inputs=[msg], outputs=[chatbot, msg])
124
+ clear.click(reset_conversation, outputs=[chatbot,msg])
125
+ #show_sources.change(lambda x: gr.update(visible=x), show_sources, sources_output)
126
+
127
+ if __name__ == "__main__":
128
+ demo.launch()
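
Reviewer note: the Gradio callbacks above are a thin wrapper around DocumentRetrieval. A minimal sketch of the same pipeline without the UI, assuming it is run from the kit root and that data/sample.pdf is a placeholder input file:

from src.document_retrieval import DocumentRetrieval
from utils.parsing.sambaparse import parse_doc_universal

document_retrieval = DocumentRetrieval()
# parse a document into text chunks (path is a placeholder)
_, _, text_chunks = parse_doc_universal(doc="data/sample.pdf")
embeddings = document_retrieval.load_embedding_model()
vectorstore = document_retrieval.create_vector_store(
    text_chunks, embeddings, output_db=None, collection_name="ekr_default_collection"
)
document_retrieval.init_retriever(vectorstore)
conversation = document_retrieval.get_qa_retrieval_chain()
print(conversation.invoke({"question": "What is this document about?"})["answer"])
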
config.yaml ADDED
@@ -0,0 +1,29 @@
1
+ api: "sncloud" # set either sambastudio or sncloud
2
+
3
+ embedding_model:
4
+ "type": "cpu" # set either sambastudio or cpu
5
+ "batch_size": 1 #set depending on your endpoint configuration (1 if CoE embedding expert)
6
+ "coe": True #set true if using Sambastudio embeddings in a CoE endpoint
7
+ "select_expert": "e5-mistral-7b-instruct" #set if using SambaStudio CoE embedding expert
8
+
9
+ llm:
10
+ "temperature": 0.0
11
+ "do_sample": False
12
+ "max_tokens_to_generate": 1200
13
+ "coe": True #set as true if using Sambastudio CoE endpoint
14
+ "select_expert": "llama3-8b" #set if using sncloud, SambaStudio CoE llm expert
15
+ #sncloud CoE expert name -> "llama3-8b"
16
+
17
+ retrieval:
18
+ "k_retrieved_documents": 15 #set if rerank enabled
19
+ "score_threshold": 0.2
20
+ "rerank": False # set if you want to rerank retriever results
21
+ "reranker": 'BAAI/bge-reranker-large' # set if rerank is enabled
22
+ "final_k_retrieved_documents": 5
23
+
24
+ pdf_only_mode: True # Set to true for PDF-only mode, false for all file types
25
+ prod_mode: False
26
+
27
+ prompts:
28
+ "qa_prompt": "prompts/qa_prompt.yaml"
29
+ "final_chain_prompt": "prompts/final_chain_prompt.yaml"
prompts/final_chain_prompt.yaml ADDED
@@ -0,0 +1,17 @@
1
+ _type: prompt
2
+ input_types: {}
3
+ input_variables:
4
+ - question
5
+ - answers
6
+ name: null
7
+ output_parser: null
8
+ partial_variables: {}
9
+ template: |
10
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks.
11
+ Use the following intermediate answers, provide a final answer to the original question. If you cannot answer based on the intermediate answers provided to you, say "Whoops! I don't know!". <|eot_id|><|start_header_id|>user<|end_header_id|>
12
+ Original Question: {question}
13
+ Intermediate Answers: {answers}
14
+ \n ------- \n
15
+ Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>
16
+ template_format: f-string
17
+ validate_template: false
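
Note: the prompt files are serialized LangChain prompt templates, loaded with load_prompt (as src/document_retrieval.py and src/bulkQA.py do). A small sketch of loading and formatting this one; the question and answers values are placeholders:

from langchain.prompts import load_prompt

prompt = load_prompt("prompts/final_chain_prompt.yaml")  # path relative to the kit root
print(prompt.format(
    question="What was total revenue last quarter?",          # placeholder question
    answers="Doc A says $1.2M; Doc B says revenue grew 8%.",  # placeholder intermediate answers
))
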
prompts/llama7b-knowledge_retriever-custom_qa_prompt.yaml ADDED
@@ -0,0 +1,19 @@
1
+ _type: prompt
2
+ input_types: {}
3
+ input_variables:
4
+ - context
5
+ - question
6
+ name: null
7
+ output_parser: null
8
+ partial_variables: {}
9
+ template: "[INST]<<SYS>> You are a helpful assistant for question-answering tasks.\
10
+ \ Use the following pieces of retrieved context to answer the question.\n \
11
+ \ each piece of context includes the Source for reference\n if the question \
12
+ \ references a specific source then filter out that source and give a response based on that source\n If\
13
+ \ the answer is not in the context, say that you don't know. Cross check if the\
14
+ \ answer is contained in provided context. If not than say \"I do not have information\
15
+ \ regarding this.\n Do not use images or emojis in your answer. Keep the answer\
16
+ \ conversational and professional.<</SYS>>\n\n {context} \n \n Question:\
17
+ \ {question} \n Helpful answer: [/INST]"
18
+ template_format: f-string
19
+ validate_template: false
prompts/qa_prompt.yaml ADDED
@@ -0,0 +1,21 @@
1
+ _type: prompt
2
+ input_types: {}
3
+ input_variables:
4
+ - context
5
+ - question
6
+ name: null
7
+ output_parser: null
8
+ partial_variables: {}
9
+ template: |
10
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a knowledge base assistant chatbot powered by Sambanova's AI chip accelerator, designed to answer questions based on user-uploaded documents.
11
+ Use the following pieces of retrieved context to answer the question. Each piece of context includes the Source for reference. If the question references a specific source, then filter out that source and give a response based on that source.
12
+ If the answer is not in the context, say: "This information isn't in my current knowledge base." Then, suggest a related topic you can discuss based on the available context.
13
+ Maintain a professional yet conversational tone. Do not use images or emojis in your answer.
14
+ Prioritize accuracy and only provide information directly supported by the context. <|eot_id|><|start_header_id|>user<|end_header_id|>
15
+ Question: {question}
16
+ Context: {context}
17
+ \n ------- \n
18
+ Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>
19
+
20
+ template_format: f-string
21
+ validate_template: false
requirements.txt ADDED
@@ -0,0 +1,30 @@
1
+ streamlit==1.36.0
2
+ pydantic==2.7.0
3
+ pydantic_core==2.18.1
4
+
5
+ langchain==0.2.16
6
+ langchain-core==0.2.38
7
+ langchain-community==0.2.16
8
+
9
+ sentence_transformers==2.2.2
10
+ instructorembedding==1.0.1
11
+ faiss-cpu==1.7.4
12
+ python-dotenv==1.0.0
13
+ streamlit-extras==0.4.3
14
+ pillow==10.4.0
15
+ sseclient-py==1.8.0
16
+ # unstructured==0.14.9
17
+ # unstructured_inference==0.7.36
18
+ # unstructured_pytesseract==0.3.12
19
+ # pytesseract==0.3.10
20
+ chromadb==0.5.3
21
+ langgraph==0.0.55
22
+ openpyxl==3.1.4
23
+ psutil==6.0.0
24
+ pillow_heif==0.16.0
25
+ ipython==8.26.0
26
+ PyMuPDF==1.23.4
27
+ PyMuPDFb==1.23.3
28
+
29
+ #LLM Eval
30
+ weave==0.51.1
src/bulkQA.py ADDED
@@ -0,0 +1,132 @@
1
+ import os
2
+ import sys
3
+ import argparse
4
+ import pandas as pd
5
+ import time
6
+ from typing import Any, Dict, Optional
7
+ from langchain_core.callbacks import CallbackManagerForChainRun
8
+ from langchain.prompts import load_prompt
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from transformers import AutoTokenizer
11
+
12
+ current_dir = os.path.dirname(os.path.abspath(__file__))
13
+ kit_dir = os.path.abspath(os.path.join(current_dir, ".."))
14
+ repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))
15
+
16
+ sys.path.append(kit_dir)
17
+ sys.path.append(repo_dir)
18
+
19
+ from enterprise_knowledge_retriever.src.document_retrieval import DocumentRetrieval, RetrievalQAChain
20
+
21
+ class TimedRetrievalQAChain(RetrievalQAChain):
22
+ #override call method to return times
23
+ def _call(self,
24
+ inputs: Dict[str, Any],
25
+ run_manager: Optional[CallbackManagerForChainRun] = None,
26
+ ) -> Dict[str, Any]:
27
+ qa_chain = self.qa_prompt | self.llm | StrOutputParser()
28
+ response = {}
29
+ start_time = time.time()
30
+ documents = self.retriever.invoke(inputs["question"])
31
+ if self.rerank:
32
+ documents = self.rerank_docs(inputs["question"], documents, self.final_k_retrieved_documents)
33
+ docs = self._format_docs(documents)
34
+ end_preprocessing_time=time.time()
35
+ response["answer"] = qa_chain.invoke({"question": inputs["question"], "context": docs})
36
+ end_llm_time=time.time()
37
+ response["source_documents"] = documents
38
+ response["start_time"] = start_time
39
+ response["end_preprocessing_time"] = end_preprocessing_time
40
+ response["end_llm_time"] = end_llm_time
41
+ return response
42
+
43
+ def analyze_times(answer, start_time, end_preprocessing_time, end_llm_time, tokenizer):
44
+ preprocessing_time=end_preprocessing_time-start_time
45
+ llm_time=end_llm_time-end_preprocessing_time
46
+ token_count=len(tokenizer.encode(answer))
47
+ tokens_per_second = token_count / llm_time
48
+ perf = {"preprocessing_time": preprocessing_time,
49
+ "llm_time": llm_time,
50
+ "token_count": token_count,
51
+ "tokens_per_second": tokens_per_second}
52
+ return perf
53
+
54
+ def generate(qa_chain, question, tokenizer):
55
+ response = qa_chain.invoke({"question": question})
56
+ answer = response.get('answer')
57
+ sources = set([
58
+ f'{sd.metadata["filename"]}'
59
+ for sd in response["source_documents"]
60
+ ])
61
+ times = analyze_times(
62
+ answer,
63
+ response.get("start_time"),
64
+ response.get("end_preprocessing_time"),
65
+ response.get("end_llm_time"),
66
+ tokenizer
67
+ )
68
+ return answer, sources, times
69
+
70
+ def process_bulk_QA(vectordb_path, questions_file_path):
71
+ documentRetrieval = DocumentRetrieval()
72
+ tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
73
+ if os.path.exists(vectordb_path):
74
+ # load the vectorstore
75
+ embeddings = documentRetrieval.load_embedding_model()
76
+ vectorstore = documentRetrieval.load_vdb(vectordb_path, embeddings)
77
+ print("Database loaded")
78
+ documentRetrieval.init_retriever(vectorstore)
79
+ print("retriever initialized")
80
+ #get qa chain
81
+ qa_chain = TimedRetrievalQAChain(
82
+ retriever=documentRetrieval.retriever,
83
+ llm=documentRetrieval.llm,
84
+ qa_prompt = load_prompt(os.path.join(kit_dir, documentRetrieval.prompts["qa_prompt"])),
85
+ rerank = documentRetrieval.retrieval_info["rerank"],
86
+ final_k_retrieved_documents = documentRetrieval.retrieval_info["final_k_retrieved_documents"]
87
+
88
+ )
89
+ else:
90
+ raise FileNotFoundError(f"vector db path {vectordb_path} does not exist")
91
+ if os.path.exists(questions_file_path):
92
+ df = pd.read_excel(questions_file_path)
93
+ print(df)
94
+ output_file_path = questions_file_path.replace('.xlsx', '_output.xlsx')
95
+ if 'Answer' not in df.columns:
96
+ df['Answer'] = ''
97
+ df['Sources'] = ''
98
+ df['preprocessing_time'] = ''
99
+ df['llm_time'] = ''
100
+ df['token_count'] = ''
101
+ df['tokens_per_second'] = ''
102
+ for index, row in df.iterrows():
103
+ if row['Answer'].strip()=='': # Only process if 'Answer' is empty
104
+ try:
105
+ # Generate the answer
106
+ print(f"Generating answer for row {index}")
107
+ answer, sources, times = generate(qa_chain, row['Questions'], tokenizer)
108
+ df.at[index, 'Answer'] = answer
109
+ df.at[index, 'Sources'] = sources
110
+ df.at[index, 'preprocessing_time'] = times.get("preprocessing_time")
111
+ df.at[index, 'llm_time'] = times.get("llm_time")
112
+ df.at[index, 'token_count'] = times.get("token_count")
113
+ df.at[index, 'tokens_per_second'] = times.get("tokens_per_second")
114
+ except Exception as e:
115
+ print(f"Error processing row {index}: {e}")
116
+ # Save the file after each iteration to avoid data loss
117
+ df.to_excel(output_file_path, index=False)
118
+ else:
119
+ print(f"Skipping row {index} because 'Answer' is already in the document")
120
+ return output_file_path
121
+ else:
122
+ raise FileNotFoundError(f"questions file path {questions_file_path} does not exist")
123
+
124
+ if __name__ == "__main__":
125
+ # Parse the arguments
126
+ parser = argparse.ArgumentParser(description='use a vectordb and an excel file with questions in the first column and generate answers for all the questions')
127
+ parser.add_argument('vectordb_path', type=str, help='vector db path with stored documents for RAG')
128
+ parser.add_argument('questions_path', type=str, help='xlsx file containing questions in a column named Questions')
129
+ args = parser.parse_args()
130
+ # process in bulk
131
+ out_file = process_bulk_QA(args.vectordb_path, args.questions_path)
132
+ print(f"Finished, responses in: {out_file}")
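
Note: the script expects an .xlsx workbook with a column named Questions. A minimal sketch of producing one with pandas (file name and questions are placeholders; openpyxl from requirements.txt handles the xlsx writing):

import pandas as pd

questions = ["What is the refund policy?", "Who signed the agreement?"]  # placeholder questions
pd.DataFrame({"Questions": questions}).to_excel("questions.xlsx", index=False)

# then, for example:
#   python src/bulkQA.py data/my-vector-db questions.xlsx
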
src/document_retrieval.py ADDED
@@ -0,0 +1,311 @@
1
+ import os
2
+ import shutil
3
+ import sys
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import torch
7
+ import yaml
8
+ from dotenv import load_dotenv
9
+ from langchain.chains.base import Chain
10
+ from langchain.docstore.document import Document
11
+ from langchain.prompts import BasePromptTemplate, load_prompt
12
+ from langchain_core.callbacks import CallbackManagerForChainRun
13
+ from langchain_core.language_models import BaseLanguageModel
14
+ from langchain_core.output_parsers import StrOutputParser
15
+ from langchain_core.retrievers import BaseRetriever
16
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
17
+
18
+ current_dir = os.path.dirname(os.path.abspath(__file__)) # src/ directory
19
+ kit_dir = os.path.abspath(os.path.join(current_dir, '..')) # EKR/ directory
20
+ repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))
21
+ sys.path.append(kit_dir)
22
+ sys.path.append(repo_dir)
23
+
24
+ import streamlit as st
25
+
26
+ from utils.model_wrappers.api_gateway import APIGateway
27
+ from utils.vectordb.vector_db import VectorDb
28
+ from utils.visual.env_utils import get_wandb_key
29
+
30
+ CONFIG_PATH = os.path.join(kit_dir, 'config.yaml')
31
+ PERSIST_DIRECTORY = os.path.join(kit_dir, 'data/my-vector-db')
32
+
33
+ load_dotenv(os.path.join(kit_dir, '.env'))
34
+
35
+
36
+ from utils.parsing.sambaparse import parse_doc_universal
37
+
38
+ # Handle the WANDB_API_KEY resolution before importing weave
39
+ #wandb_api_key = get_wandb_key()
40
+
41
+ # If WANDB_API_KEY is set, proceed with weave initialization
42
+ #if wandb_api_key:
43
+ # import weave
44
+
45
+ # Initialize Weave with your project name
46
+ # weave.init('sambanova_ekr')
47
+ #else:
48
+ # print('WANDB_API_KEY is not set. Weave initialization skipped.')
49
+
50
+
51
+ class RetrievalQAChain(Chain):
52
+ """class for question-answering."""
53
+
54
+ retriever: BaseRetriever
55
+ rerank: bool = True
56
+ llm: BaseLanguageModel
57
+ qa_prompt: BasePromptTemplate
58
+ final_k_retrieved_documents: int = 3
59
+
60
+ @property
61
+ def input_keys(self) -> List[str]:
62
+ """Input keys.
63
+ :meta private:
64
+ """
65
+ return ['question']
66
+
67
+ @property
68
+ def output_keys(self) -> List[str]:
69
+ """Output keys.
70
+ :meta private:
71
+ """
72
+ return ['answer', 'source_documents']
73
+
74
+ def _format_docs(self, docs):
75
+ return '\n\n'.join(doc.page_content for doc in docs)
76
+
77
+ def rerank_docs(self, query, docs, final_k):
78
+ # Lazy hardcoding for now
79
+ tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
80
+ reranker = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
81
+ pairs = []
82
+ for d in docs:
83
+ pairs.append([query, d.page_content])
84
+
85
+ with torch.no_grad():
86
+ inputs = tokenizer(
87
+ pairs,
88
+ padding=True,
89
+ truncation=True,
90
+ return_tensors='pt',
91
+ max_length=512,
92
+ )
93
+ scores = (
94
+ reranker(**inputs, return_dict=True)
95
+ .logits.view(
96
+ -1,
97
+ )
98
+ .float()
99
+ )
100
+
101
+ scores_list = scores.tolist()
102
+ scores_sorted_idx = sorted(range(len(scores_list)), key=lambda k: scores_list[k], reverse=True)
103
+
104
+ docs_sorted = [docs[k] for k in scores_sorted_idx]
105
+ # docs_sorted = [docs[k] for k in scores_sorted_idx if scores_list[k]>0]
106
+ docs_sorted = docs_sorted[:final_k]
107
+
108
+ return docs_sorted
109
+
110
+ def _call(
111
+ self,
112
+ inputs: Dict[str, Any],
113
+ run_manager: Optional[CallbackManagerForChainRun] = None,
114
+ ) -> Dict[str, Any]:
115
+ qa_chain = self.qa_prompt | self.llm | StrOutputParser()
116
+ response = {}
117
+ documents = self.retriever.invoke(inputs['question'])
118
+ if self.rerank:
119
+ documents = self.rerank_docs(inputs['question'], documents, self.final_k_retrieved_documents)
120
+ docs = self._format_docs(documents)
121
+ response['answer'] = qa_chain.invoke({'question': inputs['question'], 'context': docs})
122
+ response['source_documents'] = documents
123
+ return response
124
+
125
+
126
+ class DocumentRetrieval:
127
+ def __init__(self):
128
+ self.vectordb = VectorDb()
129
+ config_info = self.get_config_info()
130
+ self.api_info = config_info[0]
131
+ self.llm_info = config_info[1]
132
+ self.embedding_model_info = config_info[2]
133
+ self.retrieval_info = config_info[3]
134
+ self.prompts = config_info[4]
135
+ self.prod_mode = config_info[5]
136
+ self.retriever = None
137
+ self.llm = self.set_llm()
138
+
139
+ def get_config_info(self):
140
+ """
141
+ Loads the YAML config file
142
+ """
143
+ # Read config file
144
+ with open(CONFIG_PATH, 'r') as yaml_file:
145
+ config = yaml.safe_load(yaml_file)
146
+ api_info = config['api']
147
+ llm_info = config['llm']
148
+ embedding_model_info = config['embedding_model']
149
+ retrieval_info = config['retrieval']
150
+ prompts = config['prompts']
151
+ prod_mode = config['prod_mode']
152
+
153
+ return api_info, llm_info, embedding_model_info, retrieval_info, prompts, prod_mode
154
+
155
+ def set_llm(self):
156
+ if self.prod_mode:
157
+ sambanova_api_key = st.session_state.SAMBANOVA_API_KEY
158
+ else:
159
+ if 'SAMBANOVA_API_KEY' in st.session_state:
160
+ sambanova_api_key = os.environ.get('SAMBANOVA_API_KEY') or st.session_state.SAMBANOVA_API_KEY
161
+ else:
162
+ sambanova_api_key = os.environ.get('SAMBANOVA_API_KEY')
163
+
164
+ llm = APIGateway.load_llm(
165
+ type=self.api_info,
166
+ streaming=True,
167
+ coe=self.llm_info['coe'],
168
+ do_sample=self.llm_info['do_sample'],
169
+ max_tokens_to_generate=self.llm_info['max_tokens_to_generate'],
170
+ temperature=self.llm_info['temperature'],
171
+ select_expert=self.llm_info['select_expert'],
172
+ process_prompt=False,
173
+ sambanova_api_key=sambanova_api_key,
174
+ )
175
+ return llm
176
+
177
+ def parse_doc(self, docs: List, additional_metadata: Optional[Dict] = None) -> List[Document]:
178
+ """
179
+ Parse the uploaded documents and return a list of LangChain documents.
180
+
181
+ Args:
182
+ docs (List[UploadFile]): A list of uploaded files.
183
+ additional_metadata (Optional[Dict], optional): Additional metadata to include in the processed documents.
184
+ Defaults to an empty dictionary.
185
+
186
+ Returns:
187
+ List[Document]: A list of LangChain documents.
188
+ """
189
+ if additional_metadata is None:
190
+ additional_metadata = {}
191
+
192
+ # Create the data/tmp folder if it doesn't exist
193
+ temp_folder = os.path.join(kit_dir, 'data/tmp')
194
+ if not os.path.exists(temp_folder):
195
+ os.makedirs(temp_folder)
196
+ else:
197
+ # If there are already files there, delete them
198
+ for filename in os.listdir(temp_folder):
199
+ file_path = os.path.join(temp_folder, filename)
200
+ try:
201
+ if os.path.isfile(file_path) or os.path.islink(file_path):
202
+ os.unlink(file_path)
203
+ elif os.path.isdir(file_path):
204
+ shutil.rmtree(file_path)
205
+ except Exception as e:
206
+ print(f'Failed to delete {file_path}. Reason: {e}')
207
+
208
+ # Save all selected files to the tmp dir with their file names
209
+ #for doc in docs:
210
+ # temp_file = os.path.join(temp_folder, doc.name)
211
+ # with open(temp_file, 'wb') as f:
212
+ # f.write(doc.getvalue())
213
+
214
+ for doc_info in docs:
215
+ file_name, file_obj = doc_info
216
+ temp_file = os.path.join(temp_folder, file_name)
217
+ with open(temp_file, 'wb') as f:
218
+ f.write(file_obj.read())
219
+
220
+ # Pass in the temp folder for processing into the parse_doc_universal function
221
+ _, _, langchain_docs = parse_doc_universal(doc=temp_folder, additional_metadata=additional_metadata)
222
+ return langchain_docs
223
+
224
+ def load_embedding_model(self):
225
+ embeddings = APIGateway.load_embedding_model(
226
+ type=self.embedding_model_info['type'],
227
+ batch_size=self.embedding_model_info['batch_size'],
228
+ coe=self.embedding_model_info['coe'],
229
+ select_expert=self.embedding_model_info['select_expert'],
230
+ )
231
+ return embeddings
232
+
233
+ def create_vector_store(self, text_chunks, embeddings, output_db=None, collection_name=None):
234
+ print(f'Collection name is {collection_name}')
235
+ vectorstore = self.vectordb.create_vector_store(
236
+ text_chunks, embeddings, output_db=output_db, collection_name=collection_name, db_type='chroma'
237
+ )
238
+ return vectorstore
239
+
240
+ def load_vdb(self, db_path, embeddings, collection_name=None):
241
+ print(f'Loading collection name is {collection_name}')
242
+ vectorstore = self.vectordb.load_vdb(db_path, embeddings, db_type='chroma', collection_name=collection_name)
243
+ return vectorstore
244
+
245
+ def init_retriever(self, vectorstore):
246
+ if self.retrieval_info['rerank']:
247
+ self.retriever = vectorstore.as_retriever(
248
+ search_type='similarity_score_threshold',
249
+ search_kwargs={
250
+ 'score_threshold': self.retrieval_info['score_threshold'],
251
+ 'k': self.retrieval_info['k_retrieved_documents'],
252
+ },
253
+ )
254
+ else:
255
+ self.retriever = vectorstore.as_retriever(
256
+ search_type='similarity_score_threshold',
257
+ search_kwargs={
258
+ 'score_threshold': self.retrieval_info['score_threshold'],
259
+ 'k': self.retrieval_info['final_k_retrieved_documents'],
260
+ },
261
+ )
262
+
263
+ def get_qa_retrieval_chain(self):
264
+ """
265
+ Generate a qa_retrieval chain using a language model.
266
+
267
+ This function uses a language model, specifically a SambaNova LLM, to generate a qa_retrieval chain
268
+ based on the input vector store of text chunks.
269
+
270
+ Note: uses the retriever previously set with init_retriever; this method takes no parameters.
273
+
274
+ Returns:
275
+ RetrievalQA: A chain ready for QA without memory
276
+ """
277
+ # customprompt = load_prompt(os.path.join(kit_dir, self.prompts["qa_prompt"]))
278
+ # qa_chain = customprompt | self.llm | StrOutputParser()
279
+
280
+ # response = {}
281
+ # documents = self.retriever.invoke(question)
282
+ # if self.retrieval_info["rerank"]:
283
+ # documents = self.rerank_docs(question, documents, self.retrieval_info["final_k_retrieved_documents"])
284
+ # docs = self._format_docs(documents)
285
+
286
+ # response["answer"] = qa_chain.invoke({"question": question, "context": docs})
287
+ # response["source_documents"] = documents
288
+
289
+ retrievalQAChain = RetrievalQAChain(
290
+ retriever=self.retriever,
291
+ llm=self.llm,
292
+ qa_prompt=load_prompt(os.path.join(kit_dir, self.prompts['qa_prompt'])),
293
+ rerank=self.retrieval_info['rerank'],
294
+ final_k_retrieved_documents=self.retrieval_info['final_k_retrieved_documents'],
295
+ )
296
+ return retrievalQAChain
297
+
298
+ def get_conversational_qa_retrieval_chain(self):
299
+ """
300
+ Generate a conversational retrieval qa chain using a language model.
301
+
302
+ This function uses a language model, specifically a SambaNova LLM, to generate a conversational_qa_retrieval chain
303
+ based on the chat history and the relevant retrieved content from the input vector store of text chunks.
304
+
305
+ Note: uses the retriever previously set with init_retriever; this method takes no parameters.
308
+
309
+ Returns:
310
+ RetrievalQA: A chain ready for QA with memory
311
+ """
utils/model_wrappers/api_gateway.py ADDED
@@ -0,0 +1,260 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from typing import Optional, Dict
5
+
6
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
7
+ from langchain_core.embeddings import Embeddings
8
+ from langchain_core.language_models.llms import LLM
9
+ from langchain_core.language_models.chat_models import BaseChatModel
10
+
11
+ current_dir = os.path.dirname(os.path.abspath(__file__))
12
+ utils_dir = os.path.abspath(os.path.join(current_dir, '..'))
13
+ repo_dir = os.path.abspath(os.path.join(utils_dir, '..'))
14
+ sys.path.append(utils_dir)
15
+ sys.path.append(repo_dir)
16
+
17
+ from utils.model_wrappers.langchain_embeddings import SambaStudioEmbeddings
18
+ from utils.model_wrappers.langchain_llms import SambaStudio
19
+ from utils.model_wrappers.langchain_llms import SambaNovaCloud
20
+ from utils.model_wrappers.langchain_chat_models import ChatSambaNovaCloud
21
+
22
+ EMBEDDING_MODEL = 'intfloat/e5-large-v2'
23
+ NORMALIZE_EMBEDDINGS = True
24
+
25
+ # Configure the logger
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format='%(asctime)s [%(levelname)s] - %(message)s',
29
+ handlers=[
30
+ logging.StreamHandler(),
31
+ ],
32
+ )
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ class APIGateway:
37
+ @staticmethod
38
+ def load_embedding_model(
39
+ type: str = 'cpu',
40
+ batch_size: Optional[int] = None,
41
+ coe: bool = False,
42
+ select_expert: Optional[str] = None,
43
+ sambastudio_embeddings_base_url: Optional[str] = None,
44
+ sambastudio_embeddings_base_uri: Optional[str] = None,
45
+ sambastudio_embeddings_project_id: Optional[str] = None,
46
+ sambastudio_embeddings_endpoint_id: Optional[str] = None,
47
+ sambastudio_embeddings_api_key: Optional[str] = None,
48
+ ) -> Embeddings:
49
+ """Loads a langchain embedding model given a type and parameters
50
+ Args:
51
+ type (str): whether to use the sambastudio embedding model or a local cpu model
52
+ batch_size (int, optional): batch size for sambastudio model. Defaults to None.
53
+ coe (bool, optional): whether to use coe model. Defaults to False. only for sambastudio models
54
+ select_expert (str, optional): expert model to be used when coe selected. Defaults to None.
55
+ only for sambastudio models.
56
+ sambastudio_embeddings_base_url (str, optional): base url for sambastudio model. Defaults to None.
57
+ sambastudio_embeddings_base_uri (str, optional): endpoint base uri for sambastudio model. Defaults to None.
58
+ sambastudio_embeddings_project_id (str, optional): project id for sambastudio model. Defaults to None.
59
+ sambastudio_embeddings_endpoint_id (str, optional): endpoint id for sambastudio model. Defaults to None.
60
+ sambastudio_embeddings_api_key (str, optional): api key for sambastudio model. Defaults to None.
61
+ Returns:
62
+ langchain embedding model
63
+ """
64
+
65
+ if type == 'sambastudio':
66
+ envs = {
67
+ 'sambastudio_embeddings_base_url': sambastudio_embeddings_base_url,
68
+ 'sambastudio_embeddings_base_uri': sambastudio_embeddings_base_uri,
69
+ 'sambastudio_embeddings_project_id': sambastudio_embeddings_project_id,
70
+ 'sambastudio_embeddings_endpoint_id': sambastudio_embeddings_endpoint_id,
71
+ 'sambastudio_embeddings_api_key': sambastudio_embeddings_api_key,
72
+ }
73
+ envs = {k: v for k, v in envs.items() if v is not None}
74
+
75
+ if coe:
76
+ if batch_size is None:
77
+ batch_size = 1
78
+ embeddings = SambaStudioEmbeddings(
79
+ **envs, batch_size=batch_size, model_kwargs={'select_expert': select_expert}
80
+ )
81
+ else:
82
+ if batch_size is None:
83
+ batch_size = 32
84
+ embeddings = SambaStudioEmbeddings(**envs, batch_size=batch_size)
85
+ elif type == 'cpu':
86
+ encode_kwargs = {'normalize_embeddings': NORMALIZE_EMBEDDINGS}
87
+ embedding_model = EMBEDDING_MODEL
88
+ embeddings = HuggingFaceInstructEmbeddings(
89
+ model_name=embedding_model,
90
+ embed_instruction='', # no instruction is needed for candidate passages
91
+ query_instruction='Represent this sentence for searching relevant passages: ',
92
+ encode_kwargs=encode_kwargs,
93
+ )
94
+ else:
95
+ raise ValueError(f'{type} is not a valid embedding model type')
96
+
97
+ return embeddings
98
+
99
+ @staticmethod
100
+ def load_llm(
101
+ type: str,
102
+ streaming: bool = False,
103
+ coe: bool = False,
104
+ do_sample: Optional[bool] = None,
105
+ max_tokens_to_generate: Optional[int] = None,
106
+ temperature: Optional[float] = None,
107
+ select_expert: Optional[str] = None,
108
+ top_p: Optional[float] = None,
109
+ top_k: Optional[int] = None,
110
+ repetition_penalty: Optional[float] = None,
111
+ stop_sequences: Optional[str] = None,
112
+ process_prompt: Optional[bool] = False,
113
+ sambastudio_base_url: Optional[str] = None,
114
+ sambastudio_base_uri: Optional[str] = None,
115
+ sambastudio_project_id: Optional[str] = None,
116
+ sambastudio_endpoint_id: Optional[str] = None,
117
+ sambastudio_api_key: Optional[str] = None,
118
+ sambanova_url: Optional[str] = None,
119
+ sambanova_api_key: Optional[str] = None,
120
+ ) -> LLM:
121
+ """Loads a langchain Sambanova llm model given a type and parameters
122
+ Args:
123
+ type (str): whether to use "sambastudio" or the SambaNova Cloud model "sncloud"
124
+ streaming (bool): whether to use the streaming method. Defaults to False.
125
+ coe (bool): whether to use coe model. Defaults to False.
126
+
127
+ do_sample (bool) : Optional whether to sample.
128
+ max_tokens_to_generate (int) : Optional max number of tokens to generate.
129
+ temperature (float) : Optional model temperature.
130
+ select_expert (str) : Optional expert to use when using CoE models.
131
+ top_p (float) : Optional model top_p.
132
+ top_k (int) : Optional model top_k.
133
+ repetition_penalty (float) : Optional model repetition penalty.
134
+ stop_sequences (str) : Optional model stop sequences.
135
+ process_prompt (bool) : Optional default to false.
136
+
137
+ sambastudio_base_url (str): Optional SambaStudio environment URL.
138
+ sambastudio_base_uri (str): Optional SambaStudio base URI.
139
+ sambastudio_project_id (str): Optional SambaStudio project ID.
140
+ sambastudio_endpoint_id (str): Optional SambaStudio endpoint ID.
141
+ sambastudio_api_key (str): Optional SambaStudio endpoint API key.
142
+
143
+ sambanova_url (str): Optional SambaNova Cloud URL.
144
+ sambanova_api_key (str): Optional SambaNovaCloud API key.
145
+
146
+ Returns:
147
+ langchain llm model
148
+ """
149
+
150
+ if type == 'sambastudio':
151
+ envs = {
152
+ 'sambastudio_base_url': sambastudio_base_url,
153
+ 'sambastudio_base_uri': sambastudio_base_uri,
154
+ 'sambastudio_project_id': sambastudio_project_id,
155
+ 'sambastudio_endpoint_id': sambastudio_endpoint_id,
156
+ 'sambastudio_api_key': sambastudio_api_key,
157
+ }
158
+ envs = {k: v for k, v in envs.items() if v is not None}
159
+ if coe:
160
+ model_kwargs = {
161
+ 'do_sample': do_sample,
162
+ 'max_tokens_to_generate': max_tokens_to_generate,
163
+ 'temperature': temperature,
164
+ 'select_expert': select_expert,
165
+ 'top_p': top_p,
166
+ 'top_k': top_k,
167
+ 'repetition_penalty': repetition_penalty,
168
+ 'stop_sequences': stop_sequences,
169
+ 'process_prompt': process_prompt,
170
+ }
171
+ model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None}
172
+
173
+ llm = SambaStudio(
174
+ **envs,
175
+ streaming=streaming,
176
+ model_kwargs=model_kwargs,
177
+ )
178
+ else:
179
+ model_kwargs = {
180
+ 'do_sample': do_sample,
181
+ 'max_tokens_to_generate': max_tokens_to_generate,
182
+ 'temperature': temperature,
183
+ 'top_p': top_p,
184
+ 'top_k': top_k,
185
+ 'repetition_penalty': repetition_penalty,
186
+ 'stop_sequences': stop_sequences,
187
+ }
188
+ model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None}
189
+ llm = SambaStudio(
190
+ **envs,
191
+ streaming=streaming,
192
+ model_kwargs=model_kwargs,
193
+ )
194
+
195
+ elif type == 'sncloud':
196
+ envs = {
197
+ 'sambanova_url': sambanova_url,
198
+ 'sambanova_api_key': sambanova_api_key,
199
+ }
200
+ envs = {k: v for k, v in envs.items() if v is not None}
201
+ llm = SambaNovaCloud(
202
+ **envs,
203
+ max_tokens=max_tokens_to_generate,
204
+ model=select_expert,
205
+ temperature=temperature,
206
+ top_k=top_k,
207
+ top_p=top_p,
208
+ )
209
+
210
+ else:
211
+ raise ValueError(f"Invalid LLM API: {type}, only 'sncloud' and 'sambastudio' are supported.")
212
+
213
+ return llm
214
+
215
+ @staticmethod
216
+ def load_chat(
217
+ model: str,
218
+ streaming: bool = False,
219
+ max_tokens: int = 1024,
220
+ temperature: Optional[float] = 0.0,
221
+ top_p: Optional[float] = None,
222
+ top_k: Optional[int] = None,
223
+ stream_options: Optional[Dict[str, bool]] = {"include_usage": True},
224
+ sambanova_url: Optional[str] = None,
225
+ sambanova_api_key: Optional[str] = None,
226
+ ) -> BaseChatModel:
227
+ """
228
+ Loads a langchain SambanovaCloud chat model given some parameters
229
+ Args:
230
+ model (str): The name of the model to use, e.g., llama3-8b.
231
+ streaming (bool): whether to use streaming method. Defaults to False.
232
+ max_tokens (int) : Optional max number of tokens to generate.
233
+ temperature (float) : Optional model temperature.
234
+ top_p (float) : Optional model top_p.
235
+ top_k (int) : Optional model top_k.
236
+ stream_options (dict) : stream options, include usage to get generation metrics
237
+
238
+ sambanova_url (str): Optional SambaNova Cloud URL.
239
+ sambanova_api_key (str): Optional SambaNovaCloud API key.
240
+
241
+ Returns:
242
+ langchain BaseChatModel
243
+ """
244
+
245
+ envs = {
246
+ 'sambanova_url': sambanova_url,
247
+ 'sambanova_api_key': sambanova_api_key,
248
+ }
249
+ envs = {k: v for k, v in envs.items() if v is not None}
250
+ model = ChatSambaNovaCloud(
251
+ **envs,
252
+ model= model,
253
+ streaming=streaming,
254
+ max_tokens=max_tokens,
255
+ temperature=temperature,
256
+ top_k=top_k,
257
+ top_p=top_p,
258
+ stream_options=stream_options
259
+ )
260
+ return model
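
Note: DocumentRetrieval wires the config.yaml values into these factory methods roughly as follows; a sketch with the shipped defaults (passing None for the API key assumes SAMBANOVA_API_KEY is set in the environment and picked up downstream):

from utils.model_wrappers.api_gateway import APIGateway

llm = APIGateway.load_llm(
    type="sncloud",              # config["api"]
    streaming=True,
    coe=True,
    do_sample=False,
    max_tokens_to_generate=1200,
    temperature=0.0,
    select_expert="llama3-8b",
    process_prompt=False,
    sambanova_api_key=None,      # assumption: falls back to the SAMBANOVA_API_KEY env var
)
embeddings = APIGateway.load_embedding_model(type="cpu", batch_size=1)
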
utils/model_wrappers/langchain_chat_models.py ADDED
@@ -0,0 +1,465 @@
1
+ import json
2
+ from typing import Any, Dict, Iterator, List, Optional
3
+
4
+ import requests
5
+ from langchain_core.callbacks import (
6
+ CallbackManagerForLLMRun,
7
+ )
8
+ from langchain_core.language_models.chat_models import (
9
+ BaseChatModel,
10
+ generate_from_stream,
11
+ )
12
+ from langchain_core.messages import (
13
+ AIMessage,
14
+ AIMessageChunk,
15
+ BaseMessage,
16
+ ChatMessage,
17
+ HumanMessage,
18
+ SystemMessage,
19
+ ToolMessage,
20
+ )
21
+ from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
22
+ from langchain_core.pydantic_v1 import Field, SecretStr
23
+ from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
24
+
25
+
26
+ class ChatSambaNovaCloud(BaseChatModel):
27
+ """
28
+ SambaNova Cloud chat model.
29
+
30
+ Setup:
31
+ To use, you should have the environment variables
32
+ ``SAMBANOVA_URL`` set with your SambaNova Cloud URL.
33
+ ``SAMBANOVA_API_KEY`` set with your SambaNova Cloud API Key.
34
+ http://cloud.sambanova.ai/
35
+ Example:
36
+ .. code-block:: python
37
+ ChatSambaNovaCloud(
38
+ sambanova_url = SambaNova cloud endpoint URL,
39
+ sambanova_api_key = set with your SambaNova cloud API key,
40
+ model = model name,
41
+ streaming = set True for use streaming API
42
+ max_tokens = max number of tokens to generate,
43
+ temperature = model temperature,
44
+ top_p = model top p,
45
+ top_k = model top k,
46
+ stream_options = include usage to get generation metrics
47
+ )
48
+
49
+ Key init args — completion params:
50
+ model: str
51
+ The name of the model to use, e.g., llama3-8b.
52
+ streaming: bool
53
+ Whether to use streaming or not
54
+ max_tokens: int
55
+ max tokens to generate
56
+ temperature: float
57
+ model temperature
58
+ top_p: float
59
+ model top p
60
+ top_k: int
61
+ model top k
62
+ stream_options: dict
63
+ stream options, include usage to get generation metrics
64
+
65
+ Key init args — client params:
66
+ sambanova_url: str
67
+ SambaNova Cloud Url
68
+ sambanova_api_key: str
69
+ SambaNova Cloud api key
70
+
71
+ Instantiate:
72
+ .. code-block:: python
73
+
74
+ from langchain_community.chat_models import ChatSambaNovaCloud
75
+
76
+ chat = ChatSambaNovaCloud(
77
+ sambanova_url = SambaNova cloud endpoint URL,
78
+ sambanova_api_key = set with your SambaNova cloud API key,
79
+ model = model name,
80
+ streaming = set True for streaming
81
+ max_tokens = max number of tokens to generate,
82
+ temperature = model temperature,
83
+ top_p = model top p,
84
+ top_k = model top k,
85
+ stream_options = include usage to get generation metrics
86
+ )
87
+ Invoke:
88
+ .. code-block:: python
89
+ messages = [
90
+ SystemMessage(content="your are an AI assistant."),
91
+ HumanMessage(content="tell me a joke."),
92
+ ]
93
+ response = chat.invoke(messages)
94
+
95
+ Stream:
96
+ .. code-block:: python
97
+
98
+ for chunk in chat.stream(messages):
99
+ print(chunk.content, end="", flush=True)
100
+
101
+ Async:
102
+ .. code-block:: python
103
+
104
+ response = await chat.ainvoke(messages)
106
+
107
+ Token usage:
108
+ .. code-block:: python
109
+ response = chat.invoke(messages)
110
+ print(response.response_metadata["usage"]["prompt_tokens"])
111
+ print(response.response_metadata["usage"]["total_tokens"])
112
+
113
+ Response metadata
114
+ .. code-block:: python
115
+
116
+ response = chat.invoke(messages)
117
+ print(response.response_metadata)
118
+ """
119
+
120
+ sambanova_url: str = Field(default="")
121
+ """SambaNova Cloud Url"""
122
+
123
+ sambanova_api_key: SecretStr = Field(default="")
124
+ """SambaNova Cloud api key"""
125
+
126
+ model: str = Field(default="llama3-8b")
127
+ """The name of the model"""
128
+
129
+ streaming: bool = Field(default=False)
130
+ """Whether to use streaming or not"""
131
+
132
+ max_tokens: int = Field(default=1024)
133
+ """max tokens to generate"""
134
+
135
+ temperature: float = Field(default=0.7)
136
+ """model temperature"""
137
+
138
+ top_p: float = Field(default=0.0)
139
+ """model top p"""
140
+
141
+ top_k: int = Field(default=1)
142
+ """model top k"""
143
+
144
+ stream_options: dict = Field(default={"include_usage": True})
145
+ """stream options, include usage to get generation metrics"""
146
+
147
+ class Config:
148
+ allow_population_by_field_name = True
149
+
150
+ @classmethod
151
+ def is_lc_serializable(cls) -> bool:
152
+ """Return whether this model can be serialized by Langchain."""
153
+ return False
154
+
155
+ @property
156
+ def lc_secrets(self) -> Dict[str, str]:
157
+ return {"sambanova_api_key": "sambanova_api_key"}
158
+
159
+ @property
160
+ def _identifying_params(self) -> Dict[str, Any]:
161
+ """Return a dictionary of identifying parameters.
162
+
163
+ This information is used by the LangChain callback system, which
164
+ is used for tracing purposes to make it possible to monitor LLMs.
165
+ """
166
+ return {
167
+ "model": self.model,
168
+ "streaming": self.streaming,
169
+ "max_tokens": self.max_tokens,
170
+ "temperature": self.temperature,
171
+ "top_p": self.top_p,
172
+ "top_k": self.top_k,
173
+ "stream_options": self.stream_options,
174
+ }
175
+
176
+ @property
177
+ def _llm_type(self) -> str:
178
+ """Get the type of language model used by this chat model."""
179
+ return "sambanovacloud-chatmodel"
180
+
181
+ def __init__(self, **kwargs: Any) -> None:
182
+ """init and validate environment variables"""
183
+ kwargs["sambanova_url"] = get_from_dict_or_env(
184
+ kwargs,
185
+ "sambanova_url",
186
+ "SAMBANOVA_URL",
187
+ default="https://api.sambanova.ai/v1/chat/completions",
188
+ )
189
+ kwargs["sambanova_api_key"] = convert_to_secret_str(
190
+ get_from_dict_or_env(kwargs, "sambanova_api_key", "SAMBANOVA_API_KEY")
191
+ )
192
+ super().__init__(**kwargs)
193
+
194
+ def _handle_request(
195
+ self, messages_dicts: List[Dict], stop: Optional[List[str]] = None
196
+ ) -> Dict[str, Any]:
197
+ """
198
+ Performs a post request to the LLM API.
199
+
200
+ Args:
201
+ messages_dicts: List of role / content dicts to use as input.
202
+ stop: list of stop tokens
203
+
204
+ Returns:
205
+ The response dict.
206
+ """
207
+ data = {
208
+ "messages": messages_dicts,
209
+ "max_tokens": self.max_tokens,
210
+ "stop": stop,
211
+ "model": self.model,
212
+ "temperature": self.temperature,
213
+ "top_p": self.top_p,
214
+ "top_k": self.top_k,
215
+ }
216
+ http_session = requests.Session()
217
+ response = http_session.post(
218
+ self.sambanova_url,
219
+ headers={
220
+ "Authorization": f"Bearer {self.sambanova_api_key.get_secret_value()}",
221
+ "Content-Type": "application/json",
222
+ },
223
+ json=data,
224
+ )
225
+ if response.status_code != 200:
226
+ raise RuntimeError(
227
+ f"Sambanova /complete call failed with status code "
228
+ f"{response.status_code}."
229
+ f"{response.text}."
230
+ )
231
+ response_dict = response.json()
232
+ if response_dict.get("error"):
233
+ raise RuntimeError(
234
+ f"Sambanova /complete call failed with status code "
235
+ f"{response.status_code}."
236
+ f"{response_dict}."
237
+ )
238
+ return response_dict
239
+
240
+ def _handle_streaming_request(
241
+ self, messages_dicts: List[Dict], stop: Optional[List[str]] = None
242
+ ) -> Iterator[Dict]:
243
+ """
244
+ Performs a streaming post request to the LLM API.
245
+
246
+ Args:
247
+ messages_dicts: List of role / content dicts to use as input.
248
+ stop: list of stop tokens
249
+
250
+ Returns:
251
+ An iterator of response dicts.
252
+ """
253
+ try:
254
+ import sseclient
255
+ except ImportError:
256
+ raise ImportError(
257
+ "could not import sseclient library. "
258
+ "Please install it with `pip install sseclient-py`."
259
+ )
260
+ data = {
261
+ "messages": messages_dicts,
262
+ "max_tokens": self.max_tokens,
263
+ "stop": stop,
264
+ "model": self.model,
265
+ "temperature": self.temperature,
266
+ "top_p": self.top_p,
267
+ "top_k": self.top_k,
268
+ "stream": True,
269
+ "stream_options": self.stream_options,
270
+ }
271
+ http_session = requests.Session()
272
+ response = http_session.post(
273
+ self.sambanova_url,
274
+ headers={
275
+ "Authorization": f"Bearer {self.sambanova_api_key.get_secret_value()}",
276
+ "Content-Type": "application/json",
277
+ },
278
+ json=data,
279
+ stream=True,
280
+ )
281
+
282
+ client = sseclient.SSEClient(response)
283
+
284
+ if response.status_code != 200:
285
+ raise RuntimeError(
286
+ f"Sambanova /complete call failed with status code "
287
+ f"{response.status_code}."
288
+ f"{response.text}."
289
+ )
290
+
291
+ for event in client.events():
292
+ chunk = {
293
+ "event": event.event,
294
+ "data": event.data,
295
+ "status_code": response.status_code,
296
+ }
297
+
298
+ if chunk["event"] == "error_event" or chunk["status_code"] != 200:
299
+ raise RuntimeError(
300
+ f"Sambanova /complete call failed with status code "
301
+ f"{chunk['status_code']}."
302
+ f"{chunk}."
303
+ )
304
+
305
+ try:
306
+ # check if the response is a final event
307
+ # in that case event data response is '[DONE]'
308
+ if chunk["data"] != "[DONE]":
309
+ if isinstance(chunk["data"], str):
310
+ data = json.loads(chunk["data"])
311
+ else:
312
+ raise RuntimeError(
313
+ f"Sambanova /complete call failed with status code "
314
+ f"{chunk['status_code']}."
315
+ f"{chunk}."
316
+ )
317
+ if data.get("error"):
318
+ raise RuntimeError(
319
+ f"Sambanova /complete call failed with status code "
320
+ f"{chunk['status_code']}."
321
+ f"{chunk}."
322
+ )
323
+ yield data
324
+ except Exception:
325
+ raise Exception(
326
+ f"Error getting content chunk raw streamed response: {chunk}"
327
+ )
328
+
329
+ def _convert_message_to_dict(self, message: BaseMessage) -> Dict[str, Any]:
330
+ """
331
+ convert a BaseMessage to a dictionary with Role / content
332
+
333
+ Args:
334
+ message: BaseMessage
335
+
336
+ Returns:
337
+ messages_dict: role / content dict
338
+ """
339
+ if isinstance(message, ChatMessage):
340
+ message_dict = {"role": message.role, "content": message.content}
341
+ elif isinstance(message, SystemMessage):
342
+ message_dict = {"role": "system", "content": message.content}
343
+ elif isinstance(message, HumanMessage):
344
+ message_dict = {"role": "user", "content": message.content}
345
+ elif isinstance(message, AIMessage):
346
+ message_dict = {"role": "assistant", "content": message.content}
347
+ elif isinstance(message, ToolMessage):
348
+ message_dict = {"role": "tool", "content": message.content}
349
+ else:
350
+ raise TypeError(f"Got unknown type {message}")
351
+ return message_dict
352
+
353
+ def _create_message_dicts(
354
+ self, messages: List[BaseMessage]
355
+ ) -> List[Dict[str, Any]]:
356
+ """
357
+ convert a list of BaseMessages to a list of dictionaries with role / content
358
+
359
+ Args:
360
+ messages: list of BaseMessages
361
+
362
+ Returns:
363
+ messages_dicts: list of role / content dicts
364
+ """
365
+ message_dicts = [self._convert_message_to_dict(m) for m in messages]
366
+ return message_dicts
367
+
368
+ def _generate(
369
+ self,
370
+ messages: List[BaseMessage],
371
+ stop: Optional[List[str]] = None,
372
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
373
+ **kwargs: Any,
374
+ ) -> ChatResult:
375
+ """
376
+ SambaNovaCloud chat model logic.
377
+
378
+ Call SambaNovaCloud API.
379
+
380
+ Args:
381
+ messages: the prompt composed of a list of messages.
382
+ stop: a list of strings on which the model should stop generating.
383
+ If generation stops due to a stop token, the stop token itself
384
+ SHOULD BE INCLUDED as part of the output. This is not enforced
385
+ across models right now, but it's a good practice to follow since
386
+ it makes it much easier to parse the output of the model
387
+ downstream and understand why generation stopped.
388
+ run_manager: A run manager with callbacks for the LLM.
389
+ """
390
+ if self.streaming:
391
+ stream_iter = self._stream(
392
+ messages, stop=stop, run_manager=run_manager, **kwargs
393
+ )
394
+ if stream_iter:
395
+ return generate_from_stream(stream_iter)
396
+ messages_dicts = self._create_message_dicts(messages)
397
+ response = self._handle_request(messages_dicts, stop)
398
+ message = AIMessage(
399
+ content=response["choices"][0]["message"]["content"],
400
+ additional_kwargs={},
401
+ response_metadata={
402
+ "finish_reason": response["choices"][0]["finish_reason"],
403
+ "usage": response.get("usage"),
404
+ "model_name": response["model"],
405
+ "system_fingerprint": response["system_fingerprint"],
406
+ "created": response["created"],
407
+ },
408
+ id=response["id"],
409
+ )
410
+
411
+ generation = ChatGeneration(message=message)
412
+ return ChatResult(generations=[generation])
413
+
414
+ def _stream(
415
+ self,
416
+ messages: List[BaseMessage],
417
+ stop: Optional[List[str]] = None,
418
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
419
+ **kwargs: Any,
420
+ ) -> Iterator[ChatGenerationChunk]:
421
+ """
422
+ Stream the output of the SambaNovaCloud chat model.
423
+
424
+ Args:
425
+ messages: the prompt composed of a list of messages.
426
+ stop: a list of strings on which the model should stop generating.
427
+ If generation stops due to a stop token, the stop token itself
428
+ SHOULD BE INCLUDED as part of the output. This is not enforced
429
+ across models right now, but it's a good practice to follow since
430
+ it makes it much easier to parse the output of the model
431
+ downstream and understand why generation stopped.
432
+ run_manager: A run manager with callbacks for the LLM.
433
+ """
434
+ messages_dicts = self._create_message_dicts(messages)
435
+ finish_reason = None
436
+ for partial_response in self._handle_streaming_request(messages_dicts, stop):
437
+ if len(partial_response["choices"]) > 0:
438
+ finish_reason = partial_response["choices"][0].get("finish_reason")
439
+ content = partial_response["choices"][0]["delta"]["content"]
440
+ id = partial_response["id"]
441
+ chunk = ChatGenerationChunk(
442
+ message=AIMessageChunk(content=content, id=id, additional_kwargs={})
443
+ )
444
+ else:
445
+ content = ""
446
+ id = partial_response["id"]
447
+ metadata = {
448
+ "finish_reason": finish_reason,
449
+ "usage": partial_response.get("usage"),
450
+ "model_name": partial_response["model"],
451
+ "system_fingerprint": partial_response["system_fingerprint"],
452
+ "created": partial_response["created"],
453
+ }
454
+ chunk = ChatGenerationChunk(
455
+ message=AIMessageChunk(
456
+ content=content,
457
+ id=id,
458
+ response_metadata=metadata,
459
+ additional_kwargs={},
460
+ )
461
+ )
462
+
463
+ if run_manager:
464
+ run_manager.on_llm_new_token(chunk.text, chunk=chunk)
465
+ yield chunk
utils/model_wrappers/langchain_embeddings.py ADDED
@@ -0,0 +1,309 @@
1
+ """Langchain Wrapper around Sambanova embedding APIs."""
2
+
3
+ import json
4
+ from typing import Dict, Generator, List, Optional
5
+
6
+ import requests
7
+ from langchain_core.embeddings import Embeddings
8
+ from langchain_core.pydantic_v1 import BaseModel
9
+ from langchain_core.utils import get_from_dict_or_env, pre_init
10
+
11
+
12
+ class SambaStudioEmbeddings(BaseModel, Embeddings):
13
+ """SambaNova embedding models.
14
+
15
+ To use, you should have the environment variables
16
+ ``SAMBASTUDIO_EMBEDDINGS_BASE_URL``, ``SAMBASTUDIO_EMBEDDINGS_BASE_URI``
17
+ ``SAMBASTUDIO_EMBEDDINGS_PROJECT_ID``, ``SAMBASTUDIO_EMBEDDINGS_ENDPOINT_ID``,
18
+ ``SAMBASTUDIO_EMBEDDINGS_API_KEY``
19
+ set with your personal sambastudio variable or pass it as a named parameter
20
+ to the constructor.
21
+
22
+ Example:
23
+ .. code-block:: python
24
+
25
+ from langchain_community.embeddings import SambaStudioEmbeddings
26
+
27
+ embeddings = SambaStudioEmbeddings(sambastudio_embeddings_base_url=base_url,
28
+ sambastudio_embeddings_base_uri=base_uri,
29
+ sambastudio_embeddings_project_id=project_id,
30
+ sambastudio_embeddings_endpoint_id=endpoint_id,
31
+ sambastudio_embeddings_api_key=api_key,
32
+ batch_size=32)
33
+ (or)
34
+
35
+ embeddings = SambaStudioEmbeddings(batch_size=32)
36
+
37
+ (or)
38
+
39
+ # CoE example
40
+ embeddings = SambaStudioEmbeddings(
41
+ batch_size=1,
42
+ model_kwargs={
43
+ 'select_expert':'e5-mistral-7b-instruct'
44
+ }
45
+ )
46
+ """
47
+
48
+ sambastudio_embeddings_base_url: str = ''
49
+ """Base url to use"""
50
+
51
+ sambastudio_embeddings_base_uri: str = ''
52
+ """endpoint base uri"""
53
+
54
+ sambastudio_embeddings_project_id: str = ''
55
+ """Project id on sambastudio for model"""
56
+
57
+ sambastudio_embeddings_endpoint_id: str = ''
58
+ """endpoint id on sambastudio for model"""
59
+
60
+ sambastudio_embeddings_api_key: str = ''
61
+ """sambastudio api key"""
62
+
63
+ model_kwargs: dict = {}
64
+ """Key word arguments to pass to the model."""
65
+
66
+ batch_size: int = 32
67
+ """Batch size for the embedding models"""
68
+
69
+ @pre_init
70
+ def validate_environment(cls, values: Dict) -> Dict:
71
+ """Validate that api key and python package exists in environment."""
72
+ values['sambastudio_embeddings_base_url'] = get_from_dict_or_env(
73
+ values, 'sambastudio_embeddings_base_url', 'SAMBASTUDIO_EMBEDDINGS_BASE_URL'
74
+ )
75
+ values['sambastudio_embeddings_base_uri'] = get_from_dict_or_env(
76
+ values,
77
+ 'sambastudio_embeddings_base_uri',
78
+ 'SAMBASTUDIO_EMBEDDINGS_BASE_URI',
79
+ default='api/predict/generic',
80
+ )
81
+ values['sambastudio_embeddings_project_id'] = get_from_dict_or_env(
82
+ values,
83
+ 'sambastudio_embeddings_project_id',
84
+ 'SAMBASTUDIO_EMBEDDINGS_PROJECT_ID',
85
+ )
86
+ values['sambastudio_embeddings_endpoint_id'] = get_from_dict_or_env(
87
+ values,
88
+ 'sambastudio_embeddings_endpoint_id',
89
+ 'SAMBASTUDIO_EMBEDDINGS_ENDPOINT_ID',
90
+ )
91
+ values['sambastudio_embeddings_api_key'] = get_from_dict_or_env(
92
+ values, 'sambastudio_embeddings_api_key', 'SAMBASTUDIO_EMBEDDINGS_API_KEY'
93
+ )
94
+ return values
95
+
96
+ def _get_tuning_params(self) -> str:
97
+ """
98
+ Get the tuning parameters to use when calling the model
99
+
100
+ Returns:
101
+ The tuning parameters as a JSON string.
102
+ """
103
+ if 'api/v2/predict/generic' in self.sambastudio_embeddings_base_uri:
104
+ tuning_params_dict = self.model_kwargs
105
+ else:
106
+ tuning_params_dict = {
107
+ k: {'type': type(v).__name__, 'value': str(v)} for k, v in (self.model_kwargs.items())
108
+ }
109
+ tuning_params = json.dumps(tuning_params_dict)
110
+ return tuning_params
111
+
112
+ def _get_full_url(self, path: str) -> str:
113
+ """
114
+ Return the full API URL for a given path.
115
+
116
+ :param str path: the sub-path
117
+ :returns: the full API URL for the sub-path
118
+ :rtype: str
119
+ """
120
+ return f'{self.sambastudio_embeddings_base_url}/{self.sambastudio_embeddings_base_uri}/{path}' # noqa: E501
121
+
122
+ def _iterate_over_batches(self, texts: List[str], batch_size: int) -> Generator:
123
+ """Generator for creating batches in the embed documents method
124
+ Args:
125
+ texts (List[str]): list of strings to embed
126
+ batch_size (int, optional): batch size to be used for the embedding model.
127
+ Will depend on the RDU endpoint used.
128
+ Yields:
129
+ List[str]: list (batch) of strings of size batch size
130
+ """
131
+ for i in range(0, len(texts), batch_size):
132
+ yield texts[i : i + batch_size]
133
+
134
+ def embed_documents(self, texts: List[str], batch_size: Optional[int] = None) -> List[List[float]]:
135
+ """Returns a list of embeddings for the given sentences.
136
+ Args:
137
+ texts (`List[str]`): List of texts to encode
138
+ batch_size (`int`): Batch size for the encoding
139
+
140
+ Returns:
141
+ `List[List[float]]`: List of embeddings
142
+ for the given texts
143
+ """
144
+ if batch_size is None:
145
+ batch_size = self.batch_size
146
+ http_session = requests.Session()
147
+ url = self._get_full_url(f'{self.sambastudio_embeddings_project_id}/{self.sambastudio_embeddings_endpoint_id}')
148
+ params = json.loads(self._get_tuning_params())
149
+ embeddings = []
150
+
151
+ if 'api/predict/nlp' in self.sambastudio_embeddings_base_uri:
152
+ for batch in self._iterate_over_batches(texts, batch_size):
153
+ data = {'inputs': batch, 'params': params}
154
+ response = http_session.post(
155
+ url,
156
+ headers={'key': self.sambastudio_embeddings_api_key},
157
+ json=data,
158
+ )
159
+ if response.status_code != 200:
160
+ raise RuntimeError(
161
+ f'Sambanova /complete call failed with status code '
162
+ f'{response.status_code}.\n Details: {response.text}'
163
+ )
164
+ try:
165
+ embedding = response.json()['data']
166
+ embeddings.extend(embedding)
167
+ except KeyError:
168
+ raise KeyError(
169
+ "'data' not found in endpoint response",
170
+ response.json(),
171
+ )
172
+
173
+ elif 'api/v2/predict/generic' in self.sambastudio_embeddings_base_uri:
174
+ for batch in self._iterate_over_batches(texts, batch_size):
175
+ items = [{'id': f'item{i}', 'value': item} for i, item in enumerate(batch)]
176
+ data = {'items': items, 'params': params}
177
+ response = http_session.post(
178
+ url,
179
+ headers={'key': self.sambastudio_embeddings_api_key},
180
+ json=data,
181
+ )
182
+ if response.status_code != 200:
183
+ raise RuntimeError(
184
+ f'Sambanova /complete call failed with status code '
185
+ f'{response.status_code}.\n Details: {response.text}'
186
+ )
187
+ try:
188
+ embedding = [item['value'] for item in response.json()['items']]
189
+ embeddings.extend(embedding)
190
+ except KeyError:
191
+ raise KeyError(
192
+ "'items' not found in endpoint response",
193
+ response.json(),
194
+ )
195
+
196
+ elif 'api/predict/generic' in self.sambastudio_embeddings_base_uri:
197
+ for batch in self._iterate_over_batches(texts, batch_size):
198
+ data = {'instances': batch, 'params': params}
199
+ response = http_session.post(
200
+ url,
201
+ headers={'key': self.sambastudio_embeddings_api_key},
202
+ json=data,
203
+ )
204
+ if response.status_code != 200:
205
+ raise RuntimeError(
206
+ f'Sambanova /complete call failed with status code '
207
+ f'{response.status_code}.\n Details: {response.text}'
208
+ )
209
+ try:
210
+ if params.get('select_expert'):
211
+ embedding = response.json()['predictions']
212
+ else:
213
+ embedding = response.json()['predictions']
214
+ embeddings.extend(embedding)
215
+ except KeyError:
216
+ raise KeyError(
217
+ "'predictions' not found in endpoint response",
218
+ response.json(),
219
+ )
220
+
221
+ else:
222
+ raise ValueError(
223
+ f'handling of endpoint uri: {self.sambastudio_embeddings_base_uri} not implemented' # noqa: E501
224
+ )
225
+
226
+ return embeddings
227
+
228
+ def embed_query(self, text: str) -> List[float]:
229
+ """Returns a list of embeddings for the given sentences.
230
+ Args:
231
+ sentences (`List[str]`): List of sentences to encode
232
+
233
+ Returns:
234
+ `List[np.ndarray]` or `List[tensor]`: List of embeddings
235
+ for the given sentences
236
+ """
237
+ http_session = requests.Session()
238
+ url = self._get_full_url(f'{self.sambastudio_embeddings_project_id}/{self.sambastudio_embeddings_endpoint_id}')
239
+ params = json.loads(self._get_tuning_params())
240
+
241
+ if 'api/predict/nlp' in self.sambastudio_embeddings_base_uri:
242
+ data = {'inputs': [text], 'params': params}
243
+ response = http_session.post(
244
+ url,
245
+ headers={'key': self.sambastudio_embeddings_api_key},
246
+ json=data,
247
+ )
248
+ if response.status_code != 200:
249
+ raise RuntimeError(
250
+ f'Sambanova /complete call failed with status code '
251
+ f'{response.status_code}.\n Details: {response.text}'
252
+ )
253
+ try:
254
+ embedding = response.json()['data'][0]
255
+ except KeyError:
256
+ raise KeyError(
257
+ "'data' not found in endpoint response",
258
+ response.json(),
259
+ )
260
+
261
+ elif 'api/v2/predict/generic' in self.sambastudio_embeddings_base_uri:
262
+ data = {'items': [{'id': 'item0', 'value': text}], 'params': params}
263
+ response = http_session.post(
264
+ url,
265
+ headers={'key': self.sambastudio_embeddings_api_key},
266
+ json=data,
267
+ )
268
+ if response.status_code != 200:
269
+ raise RuntimeError(
270
+ f'Sambanova /complete call failed with status code '
271
+ f'{response.status_code}.\n Details: {response.text}'
272
+ )
273
+ try:
274
+ embedding = response.json()['items'][0]['value']
275
+ except KeyError:
276
+ raise KeyError(
277
+ "'items' not found in endpoint response",
278
+ response.json(),
279
+ )
280
+
281
+ elif 'api/predict/generic' in self.sambastudio_embeddings_base_uri:
282
+ data = {'instances': [text], 'params': params}
283
+ response = http_session.post(
284
+ url,
285
+ headers={'key': self.sambastudio_embeddings_api_key},
286
+ json=data,
287
+ )
288
+ if response.status_code != 200:
289
+ raise RuntimeError(
290
+ f'Sambanova /complete call failed with status code '
291
+ f'{response.status_code}.\n Details: {response.text}'
292
+ )
293
+ try:
294
+ if params.get('select_expert'):
295
+ embedding = response.json()['predictions'][0]
296
+ else:
297
+ embedding = response.json()['predictions'][0]
298
+ except KeyError:
299
+ raise KeyError(
300
+ "'predictions' not found in endpoint response",
301
+ response.json(),
302
+ )
303
+
304
+ else:
305
+ raise ValueError(
306
+ f'handling of endpoint uri: {self.sambastudio_embeddings_base_uri} not implemented' # noqa: E501
307
+ )
308
+
309
+ return embedding
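The embeddings wrapper above selects its payload shape from the endpoint base URI (api/predict/nlp, api/v2/predict/generic, or api/predict/generic) and batches embed_documents requests through _iterate_over_batches. A minimal usage sketch (illustrative only, assuming the SAMBASTUDIO_EMBEDDINGS_* variables listed in the class docstring are exported and the module is importable as langchain_embeddings, as in the usage notebook below):

# Hedged sketch: relies on SAMBASTUDIO_EMBEDDINGS_* being set in the environment.
from langchain_embeddings import SambaStudioEmbeddings

embeddings = SambaStudioEmbeddings(batch_size=32)

# one vector per input text, requested in batches of batch_size
doc_vectors = embeddings.embed_documents(['first document', 'second document'])

# a single vector for a query string
query_vector = embeddings.embed_query('what do the documents say?')

print(len(doc_vectors), len(query_vector))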
utils/model_wrappers/langchain_llms.py ADDED
@@ -0,0 +1,770 @@
1
+ """Langchain Wrapper around Sambanova LLM APIs."""
2
+
3
+ import json
4
+ from typing import Any, Dict, Generator, Iterator, List, Optional, Union
5
+
6
+ import requests
7
+ from langchain_core.callbacks.manager import CallbackManagerForLLMRun
8
+ from langchain_core.language_models.llms import LLM
9
+ from langchain_core.outputs import GenerationChunk
10
+ from langchain_core.pydantic_v1 import Extra
11
+ from langchain_core.utils import get_from_dict_or_env, pre_init
12
+
13
+
14
+ class SSEndpointHandler:
15
+ """
16
+ SambaNova Systems Interface for SambaStudio model endpoints.
17
+
18
+ :param str host_url: Base URL of the DaaS API service
19
+ """
20
+
21
+ def __init__(self, host_url: str, api_base_uri: str):
22
+ """
23
+ Initialize the SSEndpointHandler.
24
+
25
+ :param str host_url: Base URL of the DaaS API service
26
+ :param str api_base_uri: Base URI of the DaaS API service
27
+ """
28
+ self.host_url = host_url
29
+ self.api_base_uri = api_base_uri
30
+ self.http_session = requests.Session()
31
+
32
+ def _process_response(self, response: requests.Response) -> Dict:
33
+ """
34
+ Processes the API response and returns the resulting dict.
35
+
36
+ All resulting dicts, regardless of success or failure, will contain the
37
+ `status_code` key with the API response status code.
38
+
39
+ If the API returned an error, the resulting dict will contain the key
40
+ `detail` with the error message.
41
+
42
+ If the API call was successful, the resulting dict will contain the key
43
+ `data` with the response data.
44
+
45
+ :param requests.Response response: the response object to process
46
+ :return: the response dict
47
+ :type: dict
48
+ """
49
+ result: Dict[str, Any] = {}
50
+ try:
51
+ result = response.json()
52
+ except Exception as e:
53
+ result['detail'] = str(e)
54
+ if 'status_code' not in result:
55
+ result['status_code'] = response.status_code
56
+ return result
57
+
58
+ def _process_streaming_response(
59
+ self,
60
+ response: requests.Response,
61
+ ) -> Generator[Dict, None, None]:
62
+ """Process the streaming response"""
63
+ if 'api/predict/nlp' in self.api_base_uri:
64
+ try:
65
+ import sseclient
66
+ except ImportError:
67
+ raise ImportError(
68
+ 'could not import sseclient library. ' 'Please install it with `pip install sseclient-py`.'
69
+ )
70
+ client = sseclient.SSEClient(response)
71
+ close_conn = False
72
+ for event in client.events():
73
+ if event.event == 'error_event':
74
+ close_conn = True
75
+ chunk = {
76
+ 'event': event.event,
77
+ 'data': event.data,
78
+ 'status_code': response.status_code,
79
+ }
80
+ yield chunk
81
+ if close_conn:
82
+ client.close()
83
+ elif 'api/v2/predict/generic' in self.api_base_uri or 'api/predict/generic' in self.api_base_uri:
84
+ try:
85
+ for line in response.iter_lines():
86
+ chunk = json.loads(line)
87
+ if 'status_code' not in chunk:
88
+ chunk['status_code'] = response.status_code
89
+ yield chunk
90
+ except Exception as e:
91
+ raise RuntimeError(f'Error processing streaming response: {e}')
92
+ else:
93
+ raise ValueError(f'handling of endpoint uri: {self.api_base_uri} not implemented')
94
+
95
+ def _get_full_url(self, path: str) -> str:
96
+ """
97
+ Return the full API URL for a given path.
98
+
99
+ :param str path: the sub-path
100
+ :returns: the full API URL for the sub-path
101
+ :type: str
102
+ """
103
+ return f'{self.host_url}/{self.api_base_uri}/{path}'
104
+
105
+ def nlp_predict(
106
+ self,
107
+ project: str,
108
+ endpoint: str,
109
+ key: str,
110
+ input: Union[List[str], str],
111
+ params: Optional[str] = '',
112
+ stream: bool = False,
113
+ ) -> Dict:
114
+ """
115
+ NLP predict using inline input string.
116
+
117
+ :param str project: Project ID in which the endpoint exists
118
+ :param str endpoint: Endpoint ID
119
+ :param str key: API Key
120
+ :param input: Input string or list of input strings
121
+ :param str params: Input params string
122
+ :returns: Prediction results
123
+ :type: dict
124
+ """
125
+ if isinstance(input, str):
126
+ input = [input]
127
+ if 'api/predict/nlp' in self.api_base_uri:
128
+ if params:
129
+ data = {'inputs': input, 'params': json.loads(params)}
130
+ else:
131
+ data = {'inputs': input}
132
+ elif 'api/v2/predict/generic' in self.api_base_uri:
133
+ items = [{'id': f'item{i}', 'value': item} for i, item in enumerate(input)]
134
+ if params:
135
+ data = {'items': items, 'params': json.loads(params)}
136
+ else:
137
+ data = {'items': items}
138
+ elif 'api/predict/generic' in self.api_base_uri:
139
+ if params:
140
+ data = {'instances': input, 'params': json.loads(params)}
141
+ else:
142
+ data = {'instances': input}
143
+ else:
144
+ raise ValueError(f'handling of endpoint uri: {self.api_base_uri} not implemented')
145
+ response = self.http_session.post(
146
+ self._get_full_url(f'{project}/{endpoint}'),
147
+ headers={'key': key},
148
+ json=data,
149
+ )
150
+ return self._process_response(response)
151
+
152
+ def nlp_predict_stream(
153
+ self,
154
+ project: str,
155
+ endpoint: str,
156
+ key: str,
157
+ input: Union[List[str], str],
158
+ params: Optional[str] = '',
159
+ ) -> Iterator[Dict]:
160
+ """
161
+ NLP predict using inline input string.
162
+
163
+ :param str project: Project ID in which the endpoint exists
164
+ :param str endpoint: Endpoint ID
165
+ :param str key: API Key
166
+ :param input: Input string or list of input strings
167
+ :param str params: Input params string
168
+ :returns: Prediction results
169
+ :type: dict
170
+ """
171
+ if 'api/predict/nlp' in self.api_base_uri:
172
+ if isinstance(input, str):
173
+ input = [input]
174
+ if params:
175
+ data = {'inputs': input, 'params': json.loads(params)}
176
+ else:
177
+ data = {'inputs': input}
178
+ elif 'api/v2/predict/generic' in self.api_base_uri:
179
+ if isinstance(input, str):
180
+ input = [input]
181
+ items = [{'id': f'item{i}', 'value': item} for i, item in enumerate(input)]
182
+ if params:
183
+ data = {'items': items, 'params': json.loads(params)}
184
+ else:
185
+ data = {'items': items}
186
+ elif 'api/predict/generic' in self.api_base_uri:
187
+ if isinstance(input, list):
188
+ input = input[0]
189
+ if params:
190
+ data = {'instance': input, 'params': json.loads(params)}
191
+ else:
192
+ data = {'instance': input}
193
+ else:
194
+ raise ValueError(f'handling of endpoint uri: {self.api_base_uri} not implemented')
195
+ # Streaming output
196
+ response = self.http_session.post(
197
+ self._get_full_url(f'stream/{project}/{endpoint}'),
198
+ headers={'key': key},
199
+ json=data,
200
+ stream=True,
201
+ )
202
+ for chunk in self._process_streaming_response(response):
203
+ yield chunk
204
+
205
+
206
+ class SambaStudio(LLM):
207
+ """
208
+ SambaStudio large language models.
209
+
210
+ To use, you should have the environment variables
211
+ ``SAMBASTUDIO_BASE_URL`` set with your SambaStudio environment URL.
212
+ ``SAMBASTUDIO_BASE_URI`` set with your SambaStudio api base URI.
213
+ ``SAMBASTUDIO_PROJECT_ID`` set with your SambaStudio project ID.
214
+ ``SAMBASTUDIO_ENDPOINT_ID`` set with your SambaStudio endpoint ID.
215
+ ``SAMBASTUDIO_API_KEY`` set with your SambaStudio endpoint API key.
216
+
217
+ https://sambanova.ai/products/enterprise-ai-platform-sambanova-suite
218
+
219
+ See additional documentation at https://docs.sambanova.ai/sambastudio/latest/index.html
220
+
221
+ Example:
222
+ .. code-block:: python
223
+
224
+ from langchain_community.llms.sambanova import SambaStudio
225
+ SambaStudio(
226
+ sambastudio_base_url="your-SambaStudio-environment-URL",
227
+ sambastudio_base_uri="your-SambaStudio-base-URI",
228
+ sambastudio_project_id="your-SambaStudio-project-ID",
229
+ sambastudio_endpoint_id="your-SambaStudio-endpoint-ID",
230
+ sambastudio_api_key="your-SambaStudio-endpoint-API-key",
231
+ streaming=False,
232
+ model_kwargs={
233
+ "do_sample": False,
234
+ "max_tokens_to_generate": 1000,
235
+ "temperature": 0.7,
236
+ "top_p": 1.0,
237
+ "repetition_penalty": 1,
238
+ "top_k": 50,
239
+ #"process_prompt": False,
240
+ #"select_expert": "Meta-Llama-3-8B-Instruct"
241
+ },
242
+ )
243
+ """
244
+
245
+ sambastudio_base_url: str = ''
246
+ """Base url to use"""
247
+
248
+ sambastudio_base_uri: str = ''
249
+ """endpoint base uri"""
250
+
251
+ sambastudio_project_id: str = ''
252
+ """Project id on sambastudio for model"""
253
+
254
+ sambastudio_endpoint_id: str = ''
255
+ """endpoint id on sambastudio for model"""
256
+
257
+ sambastudio_api_key: str = ''
258
+ """sambastudio api key"""
259
+
260
+ model_kwargs: Optional[dict] = None
261
+ """Key word arguments to pass to the model."""
262
+
263
+ streaming: Optional[bool] = False
264
+ """Streaming flag to get streamed response."""
265
+
266
+ class Config:
267
+ """Configuration for this pydantic object."""
268
+
269
+ extra = Extra.forbid
270
+
271
+ @classmethod
272
+ def is_lc_serializable(cls) -> bool:
273
+ return True
274
+
275
+ @property
276
+ def _identifying_params(self) -> Dict[str, Any]:
277
+ """Get the identifying parameters."""
278
+ return {**{'model_kwargs': self.model_kwargs}}
279
+
280
+ @property
281
+ def _llm_type(self) -> str:
282
+ """Return type of llm."""
283
+ return 'Sambastudio LLM'
284
+
285
+ @pre_init
286
+ def validate_environment(cls, values: Dict) -> Dict:
287
+ """Validate that api key and python package exists in environment."""
288
+ values['sambastudio_base_url'] = get_from_dict_or_env(values, 'sambastudio_base_url', 'SAMBASTUDIO_BASE_URL')
289
+ values['sambastudio_base_uri'] = get_from_dict_or_env(
290
+ values,
291
+ 'sambastudio_base_uri',
292
+ 'SAMBASTUDIO_BASE_URI',
293
+ default='api/predict/generic',
294
+ )
295
+ values['sambastudio_project_id'] = get_from_dict_or_env(
296
+ values, 'sambastudio_project_id', 'SAMBASTUDIO_PROJECT_ID'
297
+ )
298
+ values['sambastudio_endpoint_id'] = get_from_dict_or_env(
299
+ values, 'sambastudio_endpoint_id', 'SAMBASTUDIO_ENDPOINT_ID'
300
+ )
301
+ values['sambastudio_api_key'] = get_from_dict_or_env(values, 'sambastudio_api_key', 'SAMBASTUDIO_API_KEY')
302
+ return values
303
+
304
+ def _get_tuning_params(self, stop: Optional[List[str]]) -> str:
305
+ """
306
+ Get the tuning parameters to use when calling the LLM.
307
+
308
+ Args:
309
+ stop: Stop words to use when generating. Model output is cut off at the
310
+ first occurrence of any of the stop substrings.
311
+
312
+ Returns:
313
+ The tuning parameters as a JSON string.
314
+ """
315
+ _model_kwargs = self.model_kwargs or {}
316
+ _kwarg_stop_sequences = _model_kwargs.get('stop_sequences', [])
317
+ _stop_sequences = stop or _kwarg_stop_sequences
318
+ # if not _kwarg_stop_sequences:
319
+ # _model_kwargs["stop_sequences"] = ",".join(
320
+ # f'"{x}"' for x in _stop_sequences
321
+ # )
322
+ if 'api/v2/predict/generic' in self.sambastudio_base_uri:
323
+ tuning_params_dict = _model_kwargs
324
+ else:
325
+ tuning_params_dict = {k: {'type': type(v).__name__, 'value': str(v)} for k, v in (_model_kwargs.items())}
326
+ # _model_kwargs["stop_sequences"] = _kwarg_stop_sequences
327
+ tuning_params = json.dumps(tuning_params_dict)
328
+ return tuning_params
329
+
330
+ def _handle_nlp_predict(self, sdk: SSEndpointHandler, prompt: Union[List[str], str], tuning_params: str) -> str:
331
+ """
332
+ Perform an NLP prediction using the SambaStudio endpoint handler.
333
+
334
+ Args:
335
+ sdk: The SSEndpointHandler to use for the prediction.
336
+ prompt: The prompt to use for the prediction.
337
+ tuning_params: The tuning parameters to use for the prediction.
338
+
339
+ Returns:
340
+ The prediction result.
341
+
342
+ Raises:
343
+ ValueError: If the prediction fails.
344
+ """
345
+ response = sdk.nlp_predict(
346
+ self.sambastudio_project_id,
347
+ self.sambastudio_endpoint_id,
348
+ self.sambastudio_api_key,
349
+ prompt,
350
+ tuning_params,
351
+ )
352
+ if response['status_code'] != 200:
353
+ optional_detail = response.get('detail')
354
+ if optional_detail:
355
+ raise RuntimeError(
356
+ f"Sambanova /complete call failed with status code "
357
+ f"{response['status_code']}.\n Details: {optional_detail}"
358
+ )
359
+ else:
360
+ raise RuntimeError(
361
+ f"Sambanova /complete call failed with status code "
362
+ f"{response['status_code']}.\n response {response}"
363
+ )
364
+ if 'api/predict/nlp' in self.sambastudio_base_uri:
365
+ return response['data'][0]['completion']
366
+ elif 'api/v2/predict/generic' in self.sambastudio_base_uri:
367
+ return response['items'][0]['value']['completion']
368
+ elif 'api/predict/generic' in self.sambastudio_base_uri:
369
+ return response['predictions'][0]['completion']
370
+ else:
371
+ raise ValueError(f'handling of endpoint uri: {self.sambastudio_base_uri} not implemented')
372
+
373
+ def _handle_completion_requests(self, prompt: Union[List[str], str], stop: Optional[List[str]]) -> str:
374
+ """
375
+ Perform a prediction using the SambaStudio endpoint handler.
376
+
377
+ Args:
378
+ prompt: The prompt to use for the prediction.
379
+ stop: stop sequences.
380
+
381
+ Returns:
382
+ The prediction result.
383
+
384
+ Raises:
385
+ ValueError: If the prediction fails.
386
+ """
387
+ ss_endpoint = SSEndpointHandler(self.sambastudio_base_url, self.sambastudio_base_uri)
388
+ tuning_params = self._get_tuning_params(stop)
389
+ return self._handle_nlp_predict(ss_endpoint, prompt, tuning_params)
390
+
391
+ def _handle_nlp_predict_stream(
392
+ self, sdk: SSEndpointHandler, prompt: Union[List[str], str], tuning_params: str
393
+ ) -> Iterator[GenerationChunk]:
394
+ """
395
+ Perform a streaming request to the LLM.
396
+
397
+ Args:
398
+ sdk: The SSEndpointHandler to use for the prediction.
399
+ prompt: The prompt to use for the prediction.
400
+ tuning_params: The tuning parameters to use for the prediction.
401
+
402
+ Returns:
403
+ An iterator of GenerationChunks.
404
+ """
405
+ for chunk in sdk.nlp_predict_stream(
406
+ self.sambastudio_project_id,
407
+ self.sambastudio_endpoint_id,
408
+ self.sambastudio_api_key,
409
+ prompt,
410
+ tuning_params,
411
+ ):
412
+ if chunk['status_code'] != 200:
413
+ error = chunk.get('error')
414
+ if error:
415
+ optional_code = error.get('code')
416
+ optional_details = error.get('details')
417
+ optional_message = error.get('message')
418
+ raise ValueError(
419
+ f"Sambanova /complete call failed with status code "
420
+ f"{chunk['status_code']}.\n"
421
+ f"Message: {optional_message}\n"
422
+ f"Details: {optional_details}\n"
423
+ f"Code: {optional_code}\n"
424
+ )
425
+ else:
426
+ raise RuntimeError(
427
+ f"Sambanova /complete call failed with status code " f"{chunk['status_code']}." f"{chunk}."
428
+ )
429
+ if 'api/predict/nlp' in self.sambastudio_base_uri:
430
+ text = json.loads(chunk['data'])['stream_token']
431
+ elif 'api/v2/predict/generic' in self.sambastudio_base_uri:
432
+ text = chunk['result']['items'][0]['value']['stream_token']
433
+ elif 'api/predict/generic' in self.sambastudio_base_uri:
434
+ if len(chunk['result']['responses']) > 0:
435
+ text = chunk['result']['responses'][0]['stream_token']
436
+ else:
437
+ text = ''
438
+ else:
439
+ raise ValueError(f'handling of endpoint uri: {self.sambastudio_base_uri} ' f'not implemented')
440
+ generated_chunk = GenerationChunk(text=text)
441
+ yield generated_chunk
442
+
443
+ def _stream(
444
+ self,
445
+ prompt: Union[List[str], str],
446
+ stop: Optional[List[str]] = None,
447
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
448
+ **kwargs: Any,
449
+ ) -> Iterator[GenerationChunk]:
450
+ """Call out to Sambanova's complete endpoint.
451
+
452
+ Args:
453
+ prompt: The prompt to pass into the model.
454
+ stop: Optional list of stop words to use when generating.
455
+
456
+ Returns:
457
+ The string generated by the model.
458
+ """
459
+ ss_endpoint = SSEndpointHandler(self.sambastudio_base_url, self.sambastudio_base_uri)
460
+ tuning_params = self._get_tuning_params(stop)
461
+ try:
462
+ if self.streaming:
463
+ for chunk in self._handle_nlp_predict_stream(ss_endpoint, prompt, tuning_params):
464
+ if run_manager:
465
+ run_manager.on_llm_new_token(chunk.text)
466
+ yield chunk
467
+ else:
468
+ return
469
+ except Exception as e:
470
+ # Handle any errors raised by the inference endpoint
471
+ raise ValueError(f'Error raised by the inference endpoint: {e}') from e
472
+
473
+ def _handle_stream_request(
474
+ self,
475
+ prompt: Union[List[str], str],
476
+ stop: Optional[List[str]],
477
+ run_manager: Optional[CallbackManagerForLLMRun],
478
+ kwargs: Dict[str, Any],
479
+ ) -> str:
480
+ """
481
+ Perform a streaming request to the LLM.
482
+
483
+ Args:
484
+ prompt: The prompt to generate from.
485
+ stop: Stop words to use when generating. Model output is cut off at the
486
+ first occurrence of any of the stop substrings.
487
+ run_manager: Callback manager for the run.
488
+ **kwargs: Additional keyword arguments. directly passed
489
+ to the sambastudio model in API call.
490
+
491
+ Returns:
492
+ The model output as a string.
493
+ """
494
+ completion = ''
495
+ for chunk in self._stream(prompt=prompt, stop=stop, run_manager=run_manager, **kwargs):
496
+ completion += chunk.text
497
+ return completion
498
+
499
+ def _call(
500
+ self,
501
+ prompt: Union[List[str], str],
502
+ stop: Optional[List[str]] = None,
503
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
504
+ **kwargs: Any,
505
+ ) -> str:
506
+ """Call out to Sambanova's complete endpoint.
507
+
508
+ Args:
509
+ prompt: The prompt to pass into the model.
510
+ stop: Optional list of stop words to use when generating.
511
+
512
+ Returns:
513
+ The string generated by the model.
514
+ """
515
+ if stop is not None:
516
+ raise Exception('stop not implemented')
517
+ try:
518
+ if self.streaming:
519
+ return self._handle_stream_request(prompt, stop, run_manager, kwargs)
520
+ return self._handle_completion_requests(prompt, stop)
521
+ except Exception as e:
522
+ # Handle any errors raised by the inference endpoint
523
+ raise ValueError(f'Error raised by the inference endpoint: {e}') from e
524
+
525
+
526
+ class SambaNovaCloud(LLM):
527
+ """
528
+ SambaNova Cloud large language models.
529
+
530
+ To use, you should have the environment variables
531
+ ``SAMBANOVA_URL`` set with your SambaNova Cloud URL.
532
+ ``SAMBANOVA_API_KEY`` set with your SambaNova Cloud API Key.
533
+
534
+ http://cloud.sambanova.ai/
535
+
536
+ Example:
537
+ .. code-block:: python
538
+
539
+ SambaNovaCloud(
540
+ sambanova_url = SambaNova cloud endpoint URL,
541
+ sambanova_api_key = set with your SambaNova cloud API key,
542
+ max_tokens = max number of tokens to generate
543
+ stop_tokens = list of stop tokens
544
+ model = model name
545
+ )
546
+ """
547
+
548
+ sambanova_url: str = ''
549
+ """SambaNova Cloud Url"""
550
+
551
+ sambanova_api_key: str = ''
552
+ """SambaNova Cloud api key"""
553
+
554
+ max_tokens: int = 1024
555
+ """max tokens to generate"""
556
+
557
+ stop_tokens: list = ['<|eot_id|>']
558
+ """Stop tokens"""
559
+
560
+ model: str = 'llama3-8b'
561
+ """LLM model expert to use"""
562
+
563
+ temperature: float = 0.0
564
+ """model temperature"""
565
+
566
+ top_p: float = 0.0
567
+ """model top p"""
568
+
569
+ top_k: int = 1
570
+ """model top k"""
571
+
572
+ stream_api: bool = True
573
+ """use stream api"""
574
+
575
+ stream_options: dict = {'include_usage': True}
576
+ """stream options, include usage to get generation metrics"""
577
+
578
+ class Config:
579
+ """Configuration for this pydantic object."""
580
+
581
+ extra = Extra.forbid
582
+
583
+ @classmethod
584
+ def is_lc_serializable(cls) -> bool:
585
+ return True
586
+
587
+ @property
588
+ def _identifying_params(self) -> Dict[str, Any]:
589
+ """Get the identifying parameters."""
590
+ return {
591
+ 'model': self.model,
592
+ 'max_tokens': self.max_tokens,
593
+ 'stop': self.stop_tokens,
594
+ 'temperature': self.temperature,
595
+ 'top_p': self.top_p,
596
+ 'top_k': self.top_k,
597
+ }
598
+
599
+ @property
600
+ def _llm_type(self) -> str:
601
+ """Return type of llm."""
602
+ return 'SambaNova Cloud'
603
+
604
+ @pre_init
605
+ def validate_environment(cls, values: Dict) -> Dict:
606
+ """Validate that api key and python package exists in environment."""
607
+ values['sambanova_url'] = get_from_dict_or_env(
608
+ values, 'sambanova_url', 'SAMBANOVA_URL', default='https://api.sambanova.ai/v1/chat/completions'
609
+ )
610
+ values['sambanova_api_key'] = get_from_dict_or_env(values, 'sambanova_api_key', 'SAMBANOVA_API_KEY')
611
+ return values
612
+
613
+ def _handle_nlp_predict_stream(
614
+ self,
615
+ prompt: Union[List[str], str],
616
+ stop: List[str],
617
+ ) -> Iterator[GenerationChunk]:
618
+ """
619
+ Perform a streaming request to the LLM.
620
+
621
+ Args:
622
+ prompt: The prompt to use for the prediction.
623
+ stop: list of stop tokens
624
+
625
+ Returns:
626
+ An iterator of GenerationChunks.
627
+ """
628
+ try:
629
+ import sseclient
630
+ except ImportError:
631
+ raise ImportError('could not import sseclient library. ' 'Please install it with `pip install sseclient-py`.')
632
+ try:
633
+ formatted_prompt = json.loads(prompt)
634
+ except Exception:
635
+ formatted_prompt = [{'role': 'user', 'content': prompt}]
636
+
637
+ http_session = requests.Session()
638
+ if not stop:
639
+ stop = self.stop_tokens
640
+ data = {
641
+ 'messages': formatted_prompt,
642
+ 'max_tokens': self.max_tokens,
643
+ 'stop': stop,
644
+ 'model': self.model,
645
+ 'temperature': self.temperature,
646
+ 'top_p': self.top_p,
647
+ 'top_k': self.top_k,
648
+ 'stream': self.stream_api,
649
+ 'stream_options': self.stream_options,
650
+ }
651
+ # Streaming output
652
+ response = http_session.post(
653
+ self.sambanova_url,
654
+ headers={'Authorization': f'Bearer {self.sambanova_api_key}', 'Content-Type': 'application/json'},
655
+ json=data,
656
+ stream=True,
657
+ )
658
+
659
+ client = sseclient.SSEClient(response)
660
+ close_conn = False
661
+
662
+ if response.status_code != 200:
663
+ raise RuntimeError(
664
+ f'Sambanova /complete call failed with status code ' f'{response.status_code}.' f'{response.text}.'
665
+ )
666
+
667
+ for event in client.events():
668
+ if event.event == 'error_event':
669
+ close_conn = True
670
+ chunk = {
671
+ 'event': event.event,
672
+ 'data': event.data,
673
+ 'status_code': response.status_code,
674
+ }
675
+
676
+ if chunk.get('error'):
677
+ raise RuntimeError(
678
+ f"Sambanova /complete call failed with status code " f"{chunk['status_code']}." f"{chunk}."
679
+ )
680
+
681
+ try:
682
+ # check if the response is a final event in that case event data response is '[DONE]'
683
+ if chunk['data'] != '[DONE]':
684
+ data = json.loads(chunk['data'])
685
+ if data.get('error'):
686
+ raise RuntimeError(
687
+ f"Sambanova /complete call failed with status code " f"{chunk['status_code']}." f"{chunk}."
688
+ )
689
+ # check if the response is a final response with usage stats (not includes content)
690
+ if data.get('usage') is None:
691
+ # check is not "end of text" response
692
+ if data['choices'][0]['finish_reason'] is None:
693
+ text = data['choices'][0]['delta']['content']
694
+ generated_chunk = GenerationChunk(text=text)
695
+ yield generated_chunk
696
+ except Exception as e:
697
+ raise Exception(f'Error getting content chunk from raw streamed response: {chunk}') from e
698
+
699
+ def _stream(
700
+ self,
701
+ prompt: Union[List[str], str],
702
+ stop: Optional[List[str]] = None,
703
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
704
+ **kwargs: Any,
705
+ ) -> Iterator[GenerationChunk]:
706
+ """Call out to Sambanova's complete endpoint.
707
+
708
+ Args:
709
+ prompt: The prompt to pass into the model.
710
+ stop: Optional list of stop words to use when generating.
711
+
712
+ Returns:
713
+ The string generated by the model.
714
+ """
715
+ try:
716
+ for chunk in self._handle_nlp_predict_stream(prompt, stop):
717
+ if run_manager:
718
+ run_manager.on_llm_new_token(chunk.text)
719
+ yield chunk
720
+ except Exception as e:
721
+ # Handle any errors raised by the inference endpoint
722
+ raise ValueError(f'Error raised by the inference endpoint: {e}') from e
723
+
724
+ def _handle_stream_request(
725
+ self,
726
+ prompt: Union[List[str], str],
727
+ stop: Optional[List[str]],
728
+ run_manager: Optional[CallbackManagerForLLMRun],
729
+ kwargs: Dict[str, Any],
730
+ ) -> str:
731
+ """
732
+ Perform a streaming request to the LLM.
733
+
734
+ Args:
735
+ prompt: The prompt to generate from.
736
+ stop: Stop words to use when generating. Model output is cut off at the
737
+ first occurrence of any of the stop substrings.
738
+ run_manager: Callback manager for the run.
739
+ **kwargs: Additional keyword arguments. directly passed
740
+ to the Sambanova Cloud model in API call.
741
+
742
+ Returns:
743
+ The model output as a string.
744
+ """
745
+ completion = ''
746
+ for chunk in self._stream(prompt=prompt, stop=stop, run_manager=run_manager, **kwargs):
747
+ completion += chunk.text
748
+ return completion
749
+
750
+ def _call(
751
+ self,
752
+ prompt: Union[List[str], str],
753
+ stop: Optional[List[str]] = None,
754
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
755
+ **kwargs: Any,
756
+ ) -> str:
757
+ """Call out to Sambanova's complete endpoint.
758
+
759
+ Args:
760
+ prompt: The prompt to pass into the model.
761
+ stop: Optional list of stop words to use when generating.
762
+
763
+ Returns:
764
+ The string generated by the model.
765
+ """
766
+ try:
767
+ return self._handle_stream_request(prompt, stop, run_manager, kwargs)
768
+ except Exception as e:
769
+ # Handle any errors raised by the inference endpoint
770
+ raise ValueError(f'Error raised by the inference endpoint: {e}') from e
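Both LLM wrappers above expose the standard LangChain LLM interface. SambaNovaCloud in particular wraps a plain string prompt into a single user message, passes a JSON-encoded message list through as the chat history, and always generates via the streaming endpoint. A minimal usage sketch (illustrative only, assuming SAMBANOVA_URL / SAMBANOVA_API_KEY are exported and the module is importable as langchain_llms, as in the usage notebook below):

# Hedged sketch: module path, model name, and credentials are assumptions.
import json

from langchain_llms import SambaNovaCloud

llm = SambaNovaCloud(model='llama3-70b', max_tokens=256)

# a plain string prompt is wrapped into a single user message internally
print(llm.invoke('hello'))

# a JSON-encoded message list is passed through as the chat history
print(llm.invoke(json.dumps([{'role': 'user', 'content': 'tell me a joke'}])))

# streaming yields the generated text piece by piece
for piece in llm.stream('tell me a 50 word tale'):
    print(piece, end='', flush=True)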
utils/model_wrappers/usage.ipynb ADDED
@@ -0,0 +1,878 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# SambaNova Langchain Wrappers Usage"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/plain": [
18
+ "True"
19
+ ]
20
+ },
21
+ "execution_count": 2,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "import os\n",
28
+ "\n",
29
+ "from dotenv import load_dotenv\n",
30
+ "from langchain_embeddings import SambaStudioEmbeddings\n",
31
+ "from langchain_llms import SambaStudio, SambaNovaCloud\n",
32
+ "from langchain_chat_models import ChatSambaNovaCloud\n",
33
+ "from langchain_core.messages import SystemMessage, HumanMessage\n",
34
+ "\n",
35
+ "current_dir = os.getcwd()\n",
36
+ "utils_dir = os.path.abspath(os.path.join(current_dir, '..'))\n",
37
+ "repo_dir = os.path.abspath(os.path.join(utils_dir, '..'))\n",
38
+ "\n",
39
+ "load_dotenv(os.path.join(repo_dir, '.env'), override=True)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "metadata": {},
45
+ "source": [
46
+ "# SambaStudio LLM"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "metadata": {},
52
+ "source": [
53
+ "## Non streaming"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 9,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "llm = SambaStudio(\n",
63
+ " streaming=False,\n",
64
+ " # base_uri=\"api/predict/generic\",\n",
65
+ " model_kwargs={\n",
66
+ " 'do_sample': False,\n",
67
+ " 'temperature': 0.01,\n",
68
+ " 'max_tokens_to_generate': 256,\n",
69
+ " 'process_prompt': False,\n",
70
+ " 'select_expert': 'Meta-Llama-3-70B-Instruct-4096',\n",
71
+ " },\n",
72
+ ")"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 11,
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "data": {
82
+ "text/plain": [
83
+ "' of a brave knight\\nSir Valoric, the fearless knight, charged into the dark forest, his armor shining like the sun. He battled the dragon, its fiery breath singeing his beard, but he stood tall, his sword flashing in the moonlight, until the beast lay defeated at his feet, its treasure his noble reward.'"
84
+ ]
85
+ },
86
+ "execution_count": 11,
87
+ "metadata": {},
88
+ "output_type": "execute_result"
89
+ }
90
+ ],
91
+ "source": [
92
+ "llm.invoke('tell me a 50 word tale')"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "markdown",
97
+ "metadata": {},
98
+ "source": [
99
+ "## Streaming"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "llm = SambaStudio(\n",
109
+ " streaming=True,\n",
110
+ " model_kwargs={\n",
111
+ " 'do_sample': False,\n",
112
+ " 'max_tokens_to_generate': 256,\n",
113
+ " 'temperature': 0.01,\n",
114
+ " 'process_prompt': False,\n",
115
+ " 'select_expert': 'Meta-Llama-3-70B-Instruct-4096',\n",
116
+ " },\n",
117
+ ")"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "metadata": {},
124
+ "outputs": [
125
+ {
126
+ "name": "stdout",
127
+ "output_type": "stream",
128
+ "text": [
129
+ " of a character who is a master of disguise\n",
130
+ "\n",
131
+ "Sure! Here is a 50-word tale of a character who is a master of disguise:\n",
132
+ "\n",
133
+ "\"Araxys, the skilled disguise artist, transformed into a stunning mermaid to infiltrate a pirate's lair. With a flick of her tail, she charmed the pirates and stole their treasure.\""
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "for chunk in llm.stream('tell me a 50 word tale'):\n",
139
+ " print(chunk, end='', flush=True)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "metadata": {},
145
+ "source": [
146
+ "# SambaNovaCloud LLM"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "metadata": {},
152
+ "source": [
153
+ "## Non Streaming"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 4,
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "llm = SambaNovaCloud(model='llama3-70b')"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 5,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "data": {
172
+ "text/plain": [
173
+ "'Hello. How can I assist you today?'"
174
+ ]
175
+ },
176
+ "execution_count": 5,
177
+ "metadata": {},
178
+ "output_type": "execute_result"
179
+ }
180
+ ],
181
+ "source": [
182
+ "import json\n",
183
+ "\n",
184
+ "llm.invoke(json.dumps([{'role': 'user', 'content': 'hello'}]))"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 6,
190
+ "metadata": {},
191
+ "outputs": [
192
+ {
193
+ "data": {
194
+ "text/plain": [
195
+ "'Hello. How can I assist you today?'"
196
+ ]
197
+ },
198
+ "execution_count": 6,
199
+ "metadata": {},
200
+ "output_type": "execute_result"
201
+ }
202
+ ],
203
+ "source": [
204
+ "llm.invoke('hello')"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "markdown",
209
+ "metadata": {},
210
+ "source": [
211
+ "## Streaming"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 7,
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "name": "stdout",
221
+ "output_type": "stream",
222
+ "text": [
223
+ "\n",
224
+ "Here's a long story \n",
225
+ "for you:\n",
226
+ "\n",
227
+ "Once upon \n",
228
+ "a time, in a small village \n",
229
+ "nestled in the rolling hills of \n",
230
+ "rural France, there lived a \n",
231
+ "young girl named Sophie. Sophie \n",
232
+ "was a curious and adventurous \n",
233
+ "child, with a mop of curly \n",
234
+ "brown hair and a smile that \n",
235
+ "could light up the darkest \n",
236
+ "of rooms. She lived with \n",
237
+ "her parents, Pierre and \n",
238
+ "Colette, in a small stone cottage \n",
239
+ "on the outskirts of \n",
240
+ "the village.\n",
241
+ "\n",
242
+ "Sophie's village was \n",
243
+ "a charming \n",
244
+ "place, filled with narrow \n",
245
+ "cobblestone streets, quaint shops, \n",
246
+ "and \n",
247
+ "bustling cafes. The villagers \n",
248
+ "were a tight-knit \n",
249
+ "community, and everyone knew each \n",
250
+ "other's names and stories. Sophie \n",
251
+ "loved listening to the villagers' \n",
252
+ "tales of \n",
253
+ "old, which \n",
254
+ "often featured brave knights, \n",
255
+ "beautiful princesses, and \n",
256
+ "magical creatures.\n",
257
+ "\n",
258
+ "One day, while exploring \n",
259
+ "the village, Sophie stumbled upon \n",
260
+ "a small, mysterious shop tucked \n",
261
+ "away on a quiet street. \n",
262
+ "The sign above the door \n",
263
+ "read \"Curios \n",
264
+ "and Wonders,\" and the \n",
265
+ "windows were filled \n",
266
+ "with a dazzling array of strange \n",
267
+ "and exotic objects. Sophie's \n",
268
+ "curiosity was piqued, \n",
269
+ "and she pushed open the door \n",
270
+ "to venture inside.\n",
271
+ "\n",
272
+ "The shop \n",
273
+ "was dimly lit, and \n",
274
+ "the air was thick with the \n",
275
+ "scent of old books and \n",
276
+ "dust. Sophie's eyes \n",
277
+ "adjusted slowly, and she \n",
278
+ "saw that the shop was filled \n",
279
+ "with all manner of curious \n",
280
+ "objects: vintage \n",
281
+ "clocks, rare coins, \n",
282
+ "and even a \n",
283
+ "taxidermied owl perched on \n",
284
+ "a shelf. Behind the counter stood \n",
285
+ "an old man with a kind \n",
286
+ "face \n",
287
+ "and a twinkle in his eye.\n",
288
+ "\n",
289
+ "\n",
290
+ "\n",
291
+ "\"Bonjour, mademoiselle,\" he \n",
292
+ "said, his voice low and \n",
293
+ "soothing. \"Welcome to Curios \n",
294
+ "and Wonders. I \n",
295
+ "am Monsieur LaFleur, \n",
296
+ "the proprietor. How may I \n",
297
+ "assist you \n",
298
+ "today?\"\n",
299
+ "\n",
300
+ "Sophie wandered the aisles, \n",
301
+ "running her fingers over \n",
302
+ "the strange objects on \n",
303
+ "display. She picked up \n",
304
+ "a small, delicate music \n",
305
+ "box and wound \n",
306
+ "it up, listening \n",
307
+ "as it played \n",
308
+ "a soft, melancholy \n",
309
+ "tune. Monsieur LaFleur \n",
310
+ "smiled and nodded \n",
311
+ "in approval.\n",
312
+ "\n",
313
+ "\"Ah, you have a \n",
314
+ "good ear for \n",
315
+ "music, mademoiselle,\" he \n",
316
+ "said. \"That music box \n",
317
+ "is a \n",
318
+ "rare and precious item. It \n",
319
+ "was crafted by a skilled artisan \n",
320
+ "in the 18th century.\"\n",
321
+ "\n",
322
+ "\n",
323
+ "As Sophie continued to \n",
324
+ "explore the shop, \n",
325
+ "she stumbled upon \n",
326
+ "a large, leather-bound book \n",
327
+ "with strange symbols etched into \n",
328
+ "the cover. \n",
329
+ "Monsieur LaFleur noticed her interest and \n",
330
+ "approached \n",
331
+ "her.\n",
332
+ "\n",
333
+ "\"Ah, you've found \n",
334
+ "the infamous 'Livre \n",
335
+ "\n",
336
+ "des Secrets,'\" \n",
337
+ "he said, his \n",
338
+ "voice low and mysterious. \n",
339
+ "\"That book is said to contain \n",
340
+ "the secrets of the universe, \n",
341
+ "hidden within its pages. But \n",
342
+ "be \n",
343
+ "warned, mademoiselle, \n",
344
+ "the book is said to \n",
345
+ "be cursed. Many have attempted \n",
346
+ "to unlock its secrets, but \n",
347
+ "none have \n",
348
+ "succeeded.\"\n",
349
+ "\n",
350
+ "Sophie's eyes widened with \n",
351
+ "excitement as she carefully opened \n",
352
+ "the book. The pages \n",
353
+ "were yellowed and \n",
354
+ "crackling, and \n",
355
+ "the text was written in a \n",
356
+ "language she couldn't understand. \n",
357
+ "But as she turned the \n",
358
+ "pages, \n",
359
+ "she felt a strange sensation, \n",
360
+ "as if the book \n",
361
+ "was calling \n",
362
+ "to her.\n",
363
+ "\n",
364
+ "Monsieur \n",
365
+ "LaFleur smiled \n",
366
+ "and \n",
367
+ "nodded. \"I see you have a \n",
368
+ "connection to the \n",
369
+ "book, mademoiselle. Perhaps you \n",
370
+ "are the one who can unlock \n",
371
+ "its secrets.\"\n",
372
+ "\n",
373
+ "Over the next \n",
374
+ "few weeks, Sophie returned to \n",
375
+ "the shop again and again, \n",
376
+ "pouring over \n",
377
+ "the pages of the Livre \n",
378
+ "des Secrets. She spent hours \n",
379
+ "studying \n",
380
+ "the symbols and trying to decipher \n",
381
+ "the text. \n",
382
+ "Monsieur \n",
383
+ "LaFleur watched her with a \n",
384
+ "keen eye, offering guidance and encouragement \n",
385
+ "whenever she needed it.\n",
386
+ "\n",
387
+ "As \n",
388
+ "the days turned into weeks, \n",
389
+ "Sophie began to notice strange occurrences \n",
390
+ "happening around her. She would \n",
391
+ "find objects moved from their \n",
392
+ "usual places, and she would hear \n",
393
+ "whispers in the night. She \n",
394
+ "began \n",
395
+ "to feel as though the book \n",
396
+ "was exerting some kind of \n",
397
+ "influence over her, drawing her \n",
398
+ "deeper into \n",
399
+ "its secrets.\n",
400
+ "\n",
401
+ "One \n",
402
+ "night, Sophie had a vivid dream \n",
403
+ "in which \n",
404
+ "she saw herself standing in \n",
405
+ "a \n",
406
+ "grand, \n",
407
+ "candlelit hall. \n",
408
+ "The walls were lined with \n",
409
+ "ancient tapestries, and the \n",
410
+ "air was thick with the scent \n",
411
+ "of \n",
412
+ "incense. At the far end of \n",
413
+ "the hall, she saw a \n",
414
+ "figure cloaked in shadows.\n",
415
+ "\n",
416
+ "\n",
417
+ "As she approached \n",
418
+ "the figure, it stepped forward, \n",
419
+ "revealing a woman \n",
420
+ "with long, flowing hair and \n",
421
+ "piercing green eyes. The woman \n",
422
+ "spoke in a voice that was \n",
423
+ "both familiar and yet \n",
424
+ "completely alien.\n",
425
+ "\n",
426
+ "\"Sophie, you \n",
427
+ "have been chosen to unlock the \n",
428
+ "secrets of the Livre \n",
429
+ "des Secrets,\" she \n",
430
+ "said. \"But be warned, \n",
431
+ "the \n",
432
+ "journey will \n",
433
+ "be difficult, and the cost \n",
434
+ "will be high. Are you \n",
435
+ "prepared to pay \n",
436
+ "the price?\"\n",
437
+ "\n",
438
+ "Sophie woke up with \n",
439
+ "a start, her heart racing and \n",
440
+ "her mind reeling. She \n",
441
+ "knew that she had \n",
442
+ "to return to the shop and \n",
443
+ "confront Monsieur LaFleur \n",
444
+ "about the \n",
445
+ "strange \n",
446
+ "occurrences. But when she \n",
447
+ "arrived at the shop, she \n",
448
+ "found that it \n",
449
+ "was closed, \n",
450
+ "and \n",
451
+ "a sign on the door \n",
452
+ "read \"Gone on \n",
453
+ "a \n",
454
+ "journey. Will return \n",
455
+ "soon.\"\n",
456
+ "\n",
457
+ "Sophie \n",
458
+ "was devastated. \n",
459
+ "She felt as though she had \n",
460
+ "been abandoned, left \n",
461
+ "to navigate the mysteries of \n",
462
+ "the Livre des Secrets on \n",
463
+ "her own. But as \n",
464
+ "she turned to leave, she \n",
465
+ "noticed a\n"
466
+ ]
467
+ }
468
+ ],
469
+ "source": [
470
+ "for i in llm.stream('hello tell me a long story'):\n",
471
+ " print(i)"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "markdown",
476
+ "metadata": {},
477
+ "source": [
478
+ "# SambaNova Cloud Chat Model"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "markdown",
483
+ "metadata": {},
484
+ "source": [
485
+ "## Non Streaming"
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": 4,
491
+ "metadata": {},
492
+ "outputs": [],
493
+ "source": [
494
+ "llm = ChatSambaNovaCloud(\n",
495
+ " model= \"llama3-405b\",\n",
496
+ " max_tokens=1024,\n",
497
+ " temperature=0.7,\n",
498
+ " top_k=1,\n",
499
+ " top_p=0.01,\n",
500
+ " stream_options={'include_usage':True}\n",
501
+ " )"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": 5,
507
+ "metadata": {},
508
+ "outputs": [
509
+ {
510
+ "data": {
511
+ "text/plain": [
512
+ "AIMessage(content='A man walked into a library and asked the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replied, \"It rings a bell, but I\\'m not sure if it\\'s here or not.\"', response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 146.48573712341215, 'completion_tokens_after_first_per_sec_first_ten': 172.9005798161617, 'completion_tokens_per_sec': 81.99632208428116, 'end_time': 1726178488.071125, 'is_last_response': True, 'prompt_tokens': 40, 'start_time': 1726178487.3630672, 'time_to_first_token': 0.34624791145324707, 'total_latency': 0.658566123789007, 'total_tokens': 94, 'total_tokens_per_sec': 142.73433844300794}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726178487}, id='a5590b89-4853-4bd9-9fd8-83276b369278')"
513
+ ]
514
+ },
515
+ "execution_count": 5,
516
+ "metadata": {},
517
+ "output_type": "execute_result"
518
+ }
519
+ ],
520
+ "source": [
521
+ "llm.invoke(\"tell me a joke\")"
522
+ ]
523
+ },
524
+ {
525
+ "cell_type": "code",
526
+ "execution_count": 7,
527
+ "metadata": {},
528
+ "outputs": [
529
+ {
530
+ "data": {
531
+ "text/plain": [
532
+ "AIMessage(content=\"Yer lookin' fer a joke, eh? Alright then, matey! Here be one fer ye:\\n\\nWhy did the pirate quit his job?\\n\\n(pause fer dramatic effect)\\n\\nBecause he was sick o' all the arrrr-guments!\\n\\nYarrr, hope that made ye laugh, me hearty!\", response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 5.583333333333333, 'completion_tokens': 64, 'completion_tokens_after_first_per_sec': 120.91573778458478, 'completion_tokens_after_first_per_sec_first_ten': 140.3985499426452, 'completion_tokens_per_sec': 79.98855768735817, 'end_time': 1726065701.9732044, 'is_last_response': True, 'prompt_tokens': 48, 'start_time': 1726065701.107911, 'time_to_first_token': 0.3442692756652832, 'total_latency': 0.8001144394945743, 'total_tokens': 112, 'total_tokens_per_sec': 139.9799759528768}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726065701}, id='7b0748bb-c5f7-4696-ae56-03b734b60fb9')"
533
+ ]
534
+ },
535
+ "execution_count": 7,
536
+ "metadata": {},
537
+ "output_type": "execute_result"
538
+ }
539
+ ],
540
+ "source": [
541
+ "messages = [\n",
542
+ " SystemMessage(content=\"You are a helpful assistant with pirate accent\"),\n",
543
+ " HumanMessage(content=\"tell me a joke\")\n",
544
+ " ]\n",
545
+ "llm.invoke(messages)"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": 8,
551
+ "metadata": {},
552
+ "outputs": [
553
+ {
554
+ "data": {
555
+ "text/plain": [
556
+ "AIMessage(content='A man walked into a library and asked the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replied, \"It rings a bell, but I\\'m not sure if it\\'s here or not.\"', response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 146.72813415408498, 'completion_tokens_after_first_per_sec_first_ten': 172.71830994351703, 'completion_tokens_per_sec': 82.34884281970663, 'end_time': 1726065746.6364844, 'is_last_response': True, 'prompt_tokens': 40, 'start_time': 1726065745.932173, 'time_to_first_token': 0.34309911727905273, 'total_latency': 0.6557469194585627, 'total_tokens': 94, 'total_tokens_per_sec': 143.34798564911895}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726065745}, id='27e7d4fe-8e24-419a-b75b-51ea2519781b')"
557
+ ]
558
+ },
559
+ "execution_count": 8,
560
+ "metadata": {},
561
+ "output_type": "execute_result"
562
+ }
563
+ ],
564
+ "source": [
565
+ "future_response = llm.ainvoke(\"tell me a joke\")\n",
566
+ "await(future_response) "
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "markdown",
571
+ "metadata": {},
572
+ "source": [
573
+ "## Batching"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": 9,
579
+ "metadata": {},
580
+ "outputs": [],
581
+ "source": [
582
+ "llm = ChatSambaNovaCloud(\n",
583
+ " model= \"llama3-405b\",\n",
584
+ " streaming=False,\n",
585
+ " max_tokens=1024,\n",
586
+ " temperature=0.7,\n",
587
+ " top_k=1,\n",
588
+ " top_p=0.01,\n",
589
+ " stream_options={'include_usage':True}\n",
590
+ " )"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": 11,
596
+ "metadata": {},
597
+ "outputs": [
598
+ {
599
+ "data": {
600
+ "text/plain": [
601
+ "[AIMessage(content='A man walked into a library and asked the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replied, \"It rings a bell, but I\\'m not sure if it\\'s here or not.\"', response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 146.72232349940003, 'completion_tokens_after_first_per_sec_first_ten': 173.01988455676758, 'completion_tokens_per_sec': 82.21649876350362, 'end_time': 1726065879.4066722, 'is_last_response': True, 'prompt_tokens': 40, 'start_time': 1726065878.700746, 'time_to_first_token': 0.3446996212005615, 'total_latency': 0.656802476536144, 'total_tokens': 94, 'total_tokens_per_sec': 143.1176089586915}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726065878}, id='28d3a38b-5dae-4d62-bf6c-cface081df34'),\n",
602
+ " AIMessage(content='The capital of the United Kingdom is London.', response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 13, 'completion_tokens': 10, 'completion_tokens_after_first_per_sec': 110.21174794386165, 'completion_tokens_after_first_per_sec_first_ten': 327.0275172132524, 'completion_tokens_per_sec': 26.88555788272027, 'end_time': 1726065879.138034, 'is_last_response': True, 'prompt_tokens': 43, 'start_time': 1726065878.7150047, 'time_to_first_token': 0.3413684368133545, 'total_latency': 0.37194690337547887, 'total_tokens': 53, 'total_tokens_per_sec': 142.49345677841742}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726065878}, id='859a9e45-c0a5-44ec-bd53-686877c2cf89')]"
603
+ ]
604
+ },
605
+ "execution_count": 11,
606
+ "metadata": {},
607
+ "output_type": "execute_result"
608
+ }
609
+ ],
610
+ "source": [
611
+ "llm.batch([\"tell me a joke\",\"which is the capital of UK?\"])"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "code",
616
+ "execution_count": 13,
617
+ "metadata": {},
618
+ "outputs": [
619
+ {
620
+ "name": "stderr",
621
+ "output_type": "stream",
622
+ "text": [
623
+ "/var/folders/p4/y0q2kh796nx_k_yzfhxs57f00000gp/T/ipykernel_33601/1543848179.py:1: RuntimeWarning: coroutine 'Runnable.abatch' was never awaited\n",
624
+ " future_responses = llm.abatch([\"tell me a joke\",\"which is the capital of UK?\"])\n",
625
+ "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
626
+ ]
627
+ },
628
+ {
629
+ "data": {
630
+ "text/plain": [
631
+ "[AIMessage(content='A man walked into a library and asked the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replied, \"It rings a bell, but I\\'m not sure if it\\'s here or not.\"', response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 120.34699641554552, 'completion_tokens_after_first_per_sec_first_ten': 141.51170437257693, 'completion_tokens_per_sec': 36.223157123884754, 'end_time': 1726065914.8678048, 'is_last_response': True, 'prompt_tokens': 40, 'start_time': 1726065913.3182464, 'time_to_first_token': 1.1091651916503906, 'total_latency': 1.4907590692693538, 'total_tokens': 94, 'total_tokens_per_sec': 63.05512536379939}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726065913}, id='f279d0fb-70b5-428c-9283-457b9831b559'),\n",
632
+ " AIMessage(content='The capital of the United Kingdom is London.', response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 9.5, 'completion_tokens': 10, 'completion_tokens_after_first_per_sec': 60.73429985889864, 'completion_tokens_after_first_per_sec_first_ten': 195.5434460421063, 'completion_tokens_per_sec': 8.61842566880045, 'end_time': 1726065914.575598, 'is_last_response': True, 'prompt_tokens': 43, 'start_time': 1726065913.3182464, 'time_to_first_token': 1.1091651916503906, 'total_latency': 1.160304722033049, 'total_tokens': 53, 'total_tokens_per_sec': 45.67765604464238}, 'model_name': 'Meta-Llama-3.1-405B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1726065913}, id='f279d0fb-70b5-428c-9283-457b9831b559')]"
633
+ ]
634
+ },
635
+ "execution_count": 13,
636
+ "metadata": {},
637
+ "output_type": "execute_result"
638
+ }
639
+ ],
640
+ "source": [
641
+ "future_responses = llm.abatch([\"tell me a joke\",\"which is the capital of UK?\"])\n",
642
+ "await(future_responses)"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "markdown",
647
+ "metadata": {},
648
+ "source": [
649
+ "## Streaming"
650
+ ]
651
+ },
652
+ {
653
+ "cell_type": "code",
654
+ "execution_count": 14,
655
+ "metadata": {},
656
+ "outputs": [],
657
+ "source": [
658
+ "llm = ChatSambaNovaCloud(\n",
659
+ " model= \"llama3-405b\",\n",
660
+ " streaming=True,\n",
661
+ " max_tokens=1024,\n",
662
+ " temperature=0.7,\n",
663
+ " top_k=1,\n",
664
+ " top_p=0.01,\n",
665
+ " stream_options={'include_usage':True}\n",
666
+ " )"
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "execution_count": 15,
672
+ "metadata": {},
673
+ "outputs": [
674
+ {
675
+ "name": "stdout",
676
+ "output_type": "stream",
677
+ "text": [
678
+ "\n",
679
+ "A man walked into a \n",
680
+ "library and asked the \n",
681
+ "librarian, \"Do you have any books \n",
682
+ "on Pavlov's dogs \n",
683
+ "and Schrödinger's cat?\"\n",
684
+ "\n",
685
+ "\n",
686
+ "The librarian \n",
687
+ "replied, \"It rings a bell, \n",
688
+ "but I'm not sure \n",
689
+ "if it's here \n",
690
+ "or not.\"\n",
691
+ "\n",
692
+ "\n",
693
+ "\n"
694
+ ]
695
+ }
696
+ ],
697
+ "source": [
698
+ "for chunk in llm.stream(\"tell me a joke\"):\n",
699
+ " print(chunk.content)"
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "code",
704
+ "execution_count": 16,
705
+ "metadata": {},
706
+ "outputs": [
707
+ {
708
+ "name": "stdout",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "\n",
712
+ "Yer lookin' \n",
713
+ "fer a joke, eh? \n",
714
+ "Alright then, matey! \n",
715
+ "Here be one fer \n",
716
+ "ye:\n",
717
+ "\n",
718
+ "Why did the pirate quit his job?\n",
719
+ "\n",
720
+ "\n",
721
+ "\n",
722
+ "(pause fer \n",
723
+ "dramatic effect)\n",
724
+ "\n",
725
+ "Because he was sick \n",
726
+ "o' all the arrrr-guments!\n",
727
+ "\n",
728
+ "\n",
729
+ "\n",
730
+ "\n",
731
+ "Yarrr, hope that made ye \n",
732
+ "laugh, \n",
733
+ "me hearty!\n",
734
+ "\n",
735
+ "\n",
736
+ "\n"
737
+ ]
738
+ }
739
+ ],
740
+ "source": [
741
+ "messages = [\n",
742
+ " SystemMessage(content=\"You are a helpful assistant with pirate accent\"),\n",
743
+ " HumanMessage(content=\"tell me a joke\")\n",
744
+ " ]\n",
745
+ "for chunk in llm.stream(messages):\n",
746
+ " print(chunk.content)"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": 17,
752
+ "metadata": {},
753
+ "outputs": [
754
+ {
755
+ "name": "stdout",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "\n",
759
+ "A man walked into a \n",
760
+ "library and asked the \n",
761
+ "librarian, \"Do you have any books \n",
762
+ "on Pavlov's dogs \n",
763
+ "and Schrödinger's cat?\"\n",
764
+ "\n",
765
+ "\n",
766
+ "The librarian \n",
767
+ "replied, \"It rings a bell, \n",
768
+ "but I'm not sure \n",
769
+ "if it's here \n",
770
+ "or not.\"\n",
771
+ "\n",
772
+ "\n",
773
+ "\n"
774
+ ]
775
+ }
776
+ ],
777
+ "source": [
778
+ "async for chunk in llm.astream(\"tell me a joke\"):\n",
779
+ " print(chunk.content)"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "markdown",
784
+ "metadata": {},
785
+ "source": [
786
+ "# Sambastudio Embeddings"
787
+ ]
788
+ },
789
+ {
790
+ "cell_type": "code",
791
+ "execution_count": null,
792
+ "metadata": {},
793
+ "outputs": [],
794
+ "source": [
795
+ "embedding = SambaStudioEmbeddings(batch_size=1, model_kwargs={'select_expert': 'e5-mistral-7b-instruct'})\n",
796
+ "embedding.embed_documents(['tell me a 50 word tale', 'tell me a joke'])\n",
797
+ "embedding.embed_query('tell me a 50 word tale')"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "code",
802
+ "execution_count": 13,
803
+ "metadata": {},
804
+ "outputs": [
805
+ {
806
+ "name": "stderr",
807
+ "output_type": "stream",
808
+ "text": [
809
+ "/Users/jorgep/Documents/ask_public_own/finetuning_env/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.\n",
810
+ " warn_deprecated(\n"
811
+ ]
812
+ },
813
+ {
814
+ "data": {
815
+ "text/plain": [
816
+ "[Document(page_content='tell me a 50 word tale'),\n",
817
+ " Document(page_content='tell me a joke'),\n",
818
+ " Document(page_content='give me 3 party activities'),\n",
819
+ " Document(page_content='give me three healty dishes')]"
820
+ ]
821
+ },
822
+ "execution_count": 13,
823
+ "metadata": {},
824
+ "output_type": "execute_result"
825
+ }
826
+ ],
827
+ "source": [
828
+ "from langchain.schema import Document\n",
829
+ "from langchain.vectorstores import Chroma\n",
830
+ "\n",
831
+ "docs = [\n",
832
+ " 'tell me a 50 word tale',\n",
833
+ " 'tell me a joke',\n",
834
+ " 'when was America discoverd?',\n",
835
+ " 'how to build an engine?',\n",
836
+ " 'give me 3 party activities',\n",
837
+ " 'give me three healty dishes',\n",
838
+ "]\n",
839
+ "docs = [Document(doc) for doc in docs]\n",
840
+ "\n",
841
+ "query = 'prompt for generating something fun'\n",
842
+ "\n",
843
+ "vectordb = Chroma.from_documents(docs, embedding)\n",
844
+ "retriever = vectordb.as_retriever()\n",
845
+ "\n",
846
+ "retriever.get_relevant_documents(query)"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": null,
852
+ "metadata": {},
853
+ "outputs": [],
854
+ "source": []
855
+ }
856
+ ],
857
+ "metadata": {
858
+ "kernelspec": {
859
+ "display_name": "peenv",
860
+ "language": "python",
861
+ "name": "python3"
862
+ },
863
+ "language_info": {
864
+ "codemirror_mode": {
865
+ "name": "ipython",
866
+ "version": 3
867
+ },
868
+ "file_extension": ".py",
869
+ "mimetype": "text/x-python",
870
+ "name": "python",
871
+ "nbconvert_exporter": "python",
872
+ "pygments_lexer": "ipython3",
873
+ "version": "3.10.11"
874
+ }
875
+ },
876
+ "nbformat": 4,
877
+ "nbformat_minor": 2
878
+ }
utils/parsing/README.md ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SambaParse
2
+
3
+ SambaParse is a Python library that simplifies the process of extracting and processing unstructured data using the Unstructured.io API. It provides a convenient wrapper around the Unstructured.io CLI tool, allowing you to ingest data from various sources, perform partitioning, chunking, embedding, and load the processed data into a vector database. It's designed to be used within AI Starter kits and SN Apps, unifying our data ingestion and document intelligence platform. This allows us to keep our code base centralized for data ingestion kits.
4
+
5
+ ## Prerequisites
6
+
7
+ Before using SambaParse, make sure you have the following:
8
+
9
+ - Docker installed on your machine (or access to another API server)
10
+ - An Unstructured.io API key
11
+
12
+ Then, configure your API key:
13
+
14
+ - Create a `.env` file in the ai-starter-kit root directory (not in the parsing folder root):
15
+
16
+ ```bash
17
+ UNSTRUCTURED_API_KEY=your_api_key_here
18
+ ```
19
+
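+ A quick way to confirm the key is picked up is shown below. This is only a minimal sketch, assuming python-dotenv is installed and that you run it from the ai-starter-kit root:
+
+ ```python
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv('.env')  # the .env file created above, in the ai-starter-kit root
+ assert os.getenv('UNSTRUCTURED_API_KEY'), 'UNSTRUCTURED_API_KEY not found in .env'
+ ```
+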
20
+ ## Setup
21
+
22
+ ### Pre Reqs
23
+
24
+ Using pyenv to manage virtual environments is recommended.
25
+ The macOS install command is shown below; see the pyenv-virtualenv repo for more detailed instructions.
26
+
27
+ ```bash
28
+ brew install pyenv-virtualenv
29
+ ```
30
+
31
+ - Create a Python virtual environment using Python 3.10.12:
32
+
33
+ ```bash
34
+ pyenv install 3.10.12
35
+ pyenv virtualenv 3.10.12 sambaparse
36
+ pyenv activate sambaparse
37
+ ```
38
+
39
+ - Clone the ai-starter-kit repo and cd into it:
40
+
41
+ ```bash
42
+ git clone https://github.com/sambanova/ai-starter-kit
+ cd ai-starter-kit
43
+ ```
44
+
45
+ - cd into `utils/parsing` and install the requirements:
46
+
47
+ ```bash
48
+ cd utils/parsing
+ pip install -r requirements.txt
49
+ ```
50
+
51
+ - cd into the unstructured-api folder:
52
+
53
+ ```bash
54
+ cd unstructured-api
55
+ ```
56
+
57
+ - Run the Makefile install target:
58
+
59
+ ```bash
60
+ make install
61
+ ```
62
+
63
+ - Run the web server:
64
+
65
+ ```bash
66
+ make run-web-app
67
+ ```
68
+
69
+ This command starts the Unstructured API server using the specified API key and exposes it on port 8005.
70
+
71
+ - Alternatively, if you have another Unstructured API server running on a different instance, make sure to update the `partition_endpoint` and `unstructured_port` values in the YAML configuration file accordingly.
72
+
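+ Before running an ingest, it can help to confirm the server is reachable. The sketch below is a minimal check under two assumptions: the server listens on localhost:8005 and exposes the Unstructured API's `/healthcheck` route; adjust the host and port if your `partition_endpoint` and `unstructured_port` settings differ.
+
+ ```python
+ import requests
+
+ UNSTRUCTURED_URL = 'http://localhost:8005'  # assumed host/port; see config.yaml
+
+ resp = requests.get(f'{UNSTRUCTURED_URL}/healthcheck', timeout=10)
+ resp.raise_for_status()
+ print(resp.text)  # a small "everything OK" style payload is expected
+ ```
+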
73
+ ## Usage
74
+
75
+ 1. Import the `SambaParse` class from the `ai-starter-kit` library:
76
+
77
+ ```python
78
+ from utils.parsing.sambaparse import SambaParse
79
+ ```
80
+
81
+ 2. Create a YAML configuration file (e.g., `config.yaml`) to specify the desired settings for the ingestion process. Here's the configuration for use cases 1 and 2, i.e., local files and folders:
82
+
83
+ ```yaml
84
+ processor:
85
+ verbose: True
86
+ output_dir: './output'
87
+ num_processes: 2
88
+
89
+ sources:
90
+ local:
91
+ recursive: True
92
+ confluence:
93
+ api_token: 'your_confluence_api_token'
94
+ user_email: 'your_email@example.com'
95
+ url: 'https://your-confluence-url.atlassian.net'
96
+ github:
97
+ url: 'owner/repo'
98
+ branch: 'main'
99
+ google_drive:
100
+ service_account_key: 'path/to/service_account_key.json'
101
+ recursive: True
102
+ drive_id: 'your_drive_id'
103
+
104
+ partitioning:
105
+ pdf_infer_table_structure: True
106
+ skip_infer_table_types: []
107
+ strategy: 'auto'
108
+ hi_res_model_name: 'yolox'
109
+ ocr_languages: ['eng']
110
+ encoding: 'utf-8'
111
+ fields_include: ['element_id', 'text', 'type', 'metadata', 'embeddings']
112
+ flatten_metadata: False
113
+ metadata_exclude: []
114
+ metadata_include: []
115
+ partition_endpoint: 'http://localhost'
116
+ unstructured_port: 8005
117
+ partition_by_api: True
118
+
119
+ chunking:
120
+ enabled: True
121
+ strategy: 'basic'
122
+ chunk_max_characters: 1500
123
+ chunk_overlap: 300
124
+
125
+ embedding:
126
+ enabled: False
127
+ provider: 'langchain-huggingface'
128
+ model_name: 'intfloat/e5-large-v2'
129
+
130
+ destination_connectors:
131
+ enabled: False
132
+ type: 'chroma'
133
+ batch_size: 80
134
+ chroma:
135
+ host: 'localhost'
136
+ port: 8004
137
+ collection_name: 'snconf'
138
+ tenant: 'default_tenant'
139
+ database: 'default_database'
140
+ qdrant:
141
+ location: 'http://localhost:6333'
142
+ collection_name: 'test'
143
+
144
+ additional_processing:
145
+ enabled: True
146
+ extend_metadata: True
147
+ replace_table_text: True
148
+ table_text_key: 'text_as_html'
149
+ return_langchain_docs: True
150
+ convert_metadata_keys_to_string: True
151
+ ```
152
+
153
+ Make sure to place the `config.yaml` file in the desired folder.
154
+
155
+ 3. Create an instance of the `SambaParse` class, passing the path to the YAML configuration file:
156
+
157
+ ```python
158
+ sambaparse = SambaParse('path/to/config.yaml')
159
+ ```
160
+
161
+ 4. Use the `run_ingest` method to process your data:
162
+
163
+ - For a single file:
164
+
165
+ ```python
166
+ source_type = 'local'
167
+ input_path = 'path/to/your/file.pdf'
168
+ additional_metadata = {'key': 'value'}
169
+ texts, metadata_list, langchain_docs = sambaparse.run_ingest(source_type, input_path=input_path, additional_metadata=additional_metadata)
170
+ ```
171
+
172
+ - For a folder:
173
+
174
+ ```python
175
+ source_type = 'local'
176
+ input_path = 'path/to/your/folder'
177
+ additional_metadata = {'key': 'value'}
178
+ texts, metadata_list, langchain_docs = sambaparse.run_ingest(source_type, input_path=input_path, additional_metadata=additional_metadata)
179
+ ```
180
+
181
+ - For Confluence:
182
+
183
+ ```python
184
+ source_type = 'confluence'
185
+ additional_metadata = {'key': 'value'}
186
+ texts, metadata_list, langchain_docs = sambaparse.run_ingest(source_type, additional_metadata=additional_metadata)
187
+ ```
188
+
189
+ Note that for Confluence you must enable embedding and the destination connector (i.e., Chroma) and turn off additional processing (i.e., the LangChain post-processing); an example YAML that does this is shown below.
190
+
191
+ ```yaml
192
+ processor:
193
+ verbose: True
194
+ output_dir: './output'
195
+ num_processes: 2
196
+
197
+ sources:
198
+ local:
199
+ recursive: True
200
+ confluence:
201
+ api_token: 'your_confluence_api_token'
202
+ user_email: 'your_email@example.com'
203
+ url: 'https://your-confluence-url.atlassian.net'
204
+ github:
205
+ url: 'owner/repo'
206
+ branch: 'main'
207
+ google_drive:
208
+ service_account_key: 'path/to/service_account_key.json'
209
+ recursive: True
210
+ drive_id: 'your_drive_id'
211
+
212
+ partitioning:
213
+ pdf_infer_table_structure: True
214
+ skip_infer_table_types: []
215
+ strategy: 'auto'
216
+ hi_res_model_name: 'yolox'
217
+ ocr_languages: ['eng']
218
+ encoding: 'utf-8'
219
+ fields_include: ['element_id', 'text', 'type', 'metadata', 'embeddings']
220
+ flatten_metadata: False
221
+ metadata_exclude: []
222
+ metadata_include: []
223
+ partition_endpoint: 'http://localhost'
224
+ unstructured_port: 8005
225
+ partition_by_api: True
226
+
227
+ chunking:
228
+ enabled: True
229
+ strategy: 'basic'
230
+ chunk_max_characters: 1500
231
+ chunk_overlap: 300
232
+
233
+ embedding:
234
+ enabled: True
235
+ provider: 'langchain-huggingface'
236
+ model_name: 'intfloat/e5-large-v2'
237
+
238
+ destination_connectors:
239
+ enabled: True
240
+ type: 'chroma'
241
+ batch_size: 80
242
+ chroma:
243
+ host: 'localhost'
244
+ port: 8004
245
+ collection_name: 'snconf'
246
+ tenant: 'default_tenant'
247
+ database: 'default_database'
248
+ qdrant:
249
+ location: 'http://localhost:6333'
250
+ collection_name: 'test'
251
+
252
+ additional_processing:
253
+ enabled: False
254
+ extend_metadata: True
255
+ replace_table_text: True
256
+ table_text_key: 'text_as_html'
257
+ return_langchain_docs: True
258
+ convert_metadata_keys_to_string: True
259
+ ```
260
+
261
+ In addition, for Confluence you will need a Chroma server running on port 8004. You can start one with the Docker command below; a short sketch for checking the resulting collection follows the command.
262
+
263
+ ```bash
264
+ docker run -d --rm --name chromadb -v ./chroma:/chroma/chroma -e IS_PERSISTENT=TRUE -e ANONYMIZED_TELEMETRY=TRUE -p 8004:8000 chromadb/chroma:latest
265
+ ```
266
+
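+ The sketch below is one way to check the ingested collection. It assumes the chromadb Python client is installed and that the host, port, and collection name match the YAML above (localhost:8004, collection `snconf`):
+
+ ```python
+ import chromadb
+
+ # Assumes the Chroma server started with the docker command above.
+ client = chromadb.HttpClient(host='localhost', port=8004)
+ collection = client.get_collection('snconf')
+
+ print(f'Documents in collection: {collection.count()}')
+ print(collection.peek(2))  # sample a couple of stored records
+ ```
+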
267
+ The `run_ingest` method returns a tuple containing the extracted texts, metadata, and LangChain documents (if `return_langchain_docs` is set to `True` in the configuration).
268
+
269
+ 5. Process the returned data as needed (a minimal usage sketch follows this list):
270
+ - `texts`: A list of extracted text elements from the documents.
271
+ - `metadata_list`: A list of metadata dictionaries for each text element.
272
+ - `langchain_docs`: A list of LangChain `Document` objects, which combine the text and metadata.
273
+
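+ A minimal sketch of how the three return values line up (the folder path and metadata key here are placeholders, and `sambaparse` is the instance created in step 3):
+
+ ```python
+ texts, metadata_list, langchain_docs = sambaparse.run_ingest(
+     'local', input_path='path/to/your/folder', additional_metadata={'key': 'value'}
+ )
+
+ # Each text chunk lines up with one metadata dict.
+ for text, metadata in zip(texts[:3], metadata_list[:3]):
+     print(metadata.get('filename'), '->', text[:80])
+
+ # The LangChain documents carry the same content plus metadata, ready for a vector store.
+ print(langchain_docs[0].page_content[:80])
+ print(langchain_docs[0].metadata)
+ ```
+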
274
+ #### Configuration Options
275
+
276
+ The YAML configuration file allows you to customize various aspects of the ingestion process. Here are some of the key options:
277
+
278
+ - `processor`: Settings related to the processing of documents, such as the output directory and the number of processes to use.
279
+ - `sources`: Configuration for different data sources, including local files, Confluence, GitHub, and Google Drive.
280
+ - `partitioning`: Options for partitioning the documents, including the strategy, OCR languages, and API settings.
281
+ - `chunking`: Settings for chunking the documents, such as enabling chunking, specifying the chunking strategy, and setting the maximum chunk size and overlap.
282
+ - `embedding`: Options for embedding the documents, including enabling embedding, specifying the embedding provider, and setting the model name.
283
+ - `additional_processing`: Configuration for additional processing steps, such as extending metadata, replacing table text, and returning LangChain documents.
284
+
285
+ Make sure to review and modify the configuration file according to your specific requirements.
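+
+ If you need to vary settings per run, one option is to load the YAML, tweak it in code, and write a temporary copy before instantiating `SambaParse`. This is only a sketch, assuming PyYAML is available (SambaParse itself reads the file with `yaml.safe_load`); the keys shown mirror the example configuration above.
+
+ ```python
+ import tempfile
+
+ import yaml
+
+ from utils.parsing.sambaparse import SambaParse
+
+ with open('config.yaml') as f:
+     config = yaml.safe_load(f)
+
+ # Example tweak: switch to the hi_res strategy and smaller chunks for one run.
+ config['partitioning']['strategy'] = 'hi_res'
+ config['chunking']['chunk_max_characters'] = 800
+
+ # Write the modified config to a temporary file and point SambaParse at it.
+ with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as tmp:
+     yaml.safe_dump(config, tmp)
+     tmp_config_path = tmp.name
+
+ sambaparse = SambaParse(tmp_config_path)
+ ```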
utils/parsing/config.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ processor:
2
+ verbose: True
3
+ output_dir: './output'
4
+ num_processes: 2
5
+ reprocess: False
6
+
7
+ sources:
8
+ local:
9
+ recursive: True
10
+ confluence:
11
+ api_token: 'your_confluence_api_token'
12
+ user_email: 'your_email@example.com'
13
+ url: 'https://your-confluence-url.atlassian.net'
14
+ github:
15
+ url: 'owner/repo'
16
+ branch: 'main'
17
+ google_drive:
18
+ service_account_key: 'path/to/service_account_key.json'
19
+ recursive: True
20
+ drive_id: 'your_drive_id'
21
+
22
+ partitioning:
23
+ skip_infer_table_types: []
24
+ strategy: 'auto'
25
+ hi_res_model_name: 'yolox'
26
+ ocr_languages: ['eng']
27
+ encoding: 'utf-8'
28
+ fields_include: ['element_id', 'text', 'type', 'metadata', 'embeddings']
29
+ flatten_metadata: False
30
+ metadata_exclude: []
31
+ metadata_include: []
32
+ partition_endpoint: 'http://localhost'
33
+ unstructured_port: 8005
34
+ partition_by_api: False # set as true if using API server
35
+ default_unstructured_api_key: 123456789abcde
36
+
37
+ chunking:
38
+ enabled: True
39
+ strategy: 'by_title'
40
+ chunk_max_characters: 1500
41
+ chunk_overlap: 300
42
+ combine_under_n_chars: 1500
43
+
44
+ embedding:
45
+ enabled: False
46
+ provider: 'langchain-huggingface'
47
+ model_name: 'intfloat/e5-large-v2'
48
+
49
+ destination_connectors:
50
+ enabled: False
51
+ type: 'chroma'
52
+ batch_size: 80
53
+ chroma:
54
+ host: 'localhost'
55
+ port: 8004
56
+ collection_name: 'snconf'
57
+ tenant: 'default_tenant'
58
+ database: 'default_database'
59
+ qdrant:
60
+ location: 'http://localhost:6333'
61
+ collection_name: 'test'
62
+
63
+ additional_processing:
64
+ enabled: True
65
+ extend_metadata: True
66
+ replace_table_text: True
67
+ table_text_key: 'text_as_html'
68
+ return_langchain_docs: True
69
+ convert_metadata_keys_to_string: True
utils/parsing/docker-compose.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.9'
2
+
3
+ networks:
4
+ net:
5
+ driver: bridge
6
+
7
+ services:
8
+ unstructured-api:
9
+ image: downloads.unstructured.io/unstructured-io/unstructured-api:latest
10
+ command: --port 8000 --host 0.0.0.0
11
+ ports:
12
+ - "${UNSTRUCTURED_PORT:-8005}:8000"
13
+ env_file:
14
+ - ../../.env
15
+
16
+ networks:
17
+ - net
18
+
19
+ chromadb:
20
+ image: chromadb/chroma:latest
21
+ volumes:
22
+ - ./chromadb:/chroma/chroma
23
+ environment:
24
+ - IS_PERSISTENT=TRUE
25
+ - PERSIST_DIRECTORY=/chroma/chroma
26
+ - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-TRUE}
27
+ ports:
28
+ - "${CHROMA_PORT:-8004}:8000"
29
+ networks:
30
+ - net
utils/parsing/parse_usage.ipynb ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "This is the repo dir /Users/kwasia/Documents/Projects/ai-starter-kit\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "import os\n",
18
+ "import sys\n",
19
+ "\n",
20
+ "current_dir = os.getcwd()\n",
21
+ "kit_dir = os.path.abspath(os.path.join(current_dir, '..'))\n",
22
+ "repo_dir = os.path.abspath(os.path.join(kit_dir, '..'))\n",
23
+ "\n",
24
+ "sys.path.append(kit_dir)\n",
25
+ "sys.path.append(repo_dir)\n",
26
+ "\n",
27
+ "print(f'This is the repo dir {repo_dir}')"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 16,
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "data": {
37
+ "text/plain": [
38
+ "True"
39
+ ]
40
+ },
41
+ "execution_count": 16,
42
+ "metadata": {},
43
+ "output_type": "execute_result"
44
+ }
45
+ ],
46
+ "source": [
47
+ "# Load DotEnv\n",
48
+ "\n",
49
+ "from dotenv import load_dotenv\n",
50
+ "\n",
51
+ "load_dotenv('../../.env')"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 17,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "from utils.parsing.sambaparse import SambaParse"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {},
66
+ "source": [
67
+ "# Use Case 1 - Process a Single File"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 19,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stderr",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "2024-06-20 16:15:20,971 - INFO - Deleting contents of output directory: ./output\n"
80
+ ]
81
+ },
82
+ {
83
+ "name": "stderr",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "2024-06-20 16:15:20,995 - INFO - Running command: unstructured-ingest local --output-dir ./output --num-processes 2 --strategy auto --ocr-languages eng --encoding utf-8 --fields-include element_id,text,type,metadata,embeddings --metadata-exclude --metadata-include --pdf-infer-table-structure --input-path \"./test_docs/samba_turbo.pdf\" --recursive --verbose --partition-by-api --api-key EA6ZX3037WEZUV8THwco --partition-endpoint http://localhost:8005 --pdf-infer-table-structure --chunking-strategy basic --chunk-max-characters 1500 --chunk-overlap 300\n",
87
+ "2024-06-20 16:15:20,996 - INFO - This may take some time depending on the size of your data. Please be patient...\n",
88
+ "2024-06-20 16:15:20,996 - INFO - This may take some time depending on the size of your data. Please be patient...\n",
89
+ "/Users/kwasia/.pyenv/versions/sambaparse/lib/python3.10/site-packages/dataclasses_json/core.py:201: RuntimeWarning: 'NoneType' object value of non-optional type additional_partition_args detected when decoding CliPartitionConfig.\n",
90
+ " warnings.warn(\n",
91
+ "2024-06-20 16:15:22,908 MainProcess INFO running pipeline: DocFactory -> Reader -> Partitioner -> Chunker -> Copier with config: {\"reprocess\": false, \"verbose\": true, \"work_dir\": \"/Users/kwasia/.cache/unstructured/ingest/pipeline\", \"output_dir\": \"./output\", \"num_processes\": 2, \"raise_on_error\": false}\n",
92
+ "2024-06-20 16:15:24,658 MainProcess INFO Running doc factory to generate ingest docs. Source connector: {\"processor_config\": {\"reprocess\": false, \"verbose\": true, \"work_dir\": \"/Users/kwasia/.cache/unstructured/ingest/pipeline\", \"output_dir\": \"./output\", \"num_processes\": 2, \"raise_on_error\": false}, \"read_config\": {\"download_dir\": null, \"re_download\": false, \"preserve_downloads\": false, \"download_only\": false, \"max_docs\": null}, \"connector_config\": {\"input_path\": \"./test_docs/samba_turbo.pdf\", \"recursive\": true, \"file_glob\": null}}\n",
93
+ "2024-06-20 16:15:24,661 MainProcess INFO processing 1 docs via 2 processes\n",
94
+ "2024-06-20 16:15:24,661 MainProcess INFO Calling Reader with 1 docs\n",
95
+ "2024-06-20 16:15:24,661 MainProcess INFO Running source node to download data associated with ingest docs\n",
96
+ "2024-06-20 16:15:26,511 SpawnPoolWorker-3 INFO File exists: test_docs/samba_turbo.pdf, skipping download\n",
97
+ "2024-06-20 16:15:26,522 MainProcess INFO Calling Partitioner with 1 docs\n",
98
+ "2024-06-20 16:15:26,523 MainProcess INFO Running partition node to extract content from json files. Config: {\"pdf_infer_table_structure\": true, \"strategy\": \"auto\", \"ocr_languages\": [\"eng\"], \"encoding\": \"utf-8\", \"additional_partition_args\": null, \"skip_infer_table_types\": null, \"fields_include\": [\"element_id\", \"text\", \"type\", \"metadata\", \"embeddings\"], \"flatten_metadata\": false, \"metadata_exclude\": [\"--metadata-include\"], \"metadata_include\": [], \"partition_endpoint\": \"http://localhost:8005\", \"partition_by_api\": true, \"api_key\": \"*******\", \"hi_res_model_name\": null}, partition kwargs: {}]\n",
99
+ "2024-06-20 16:15:26,523 MainProcess INFO Creating /Users/kwasia/.cache/unstructured/ingest/pipeline/partitioned\n",
100
+ "2024-06-20 16:15:28,387 SpawnPoolWorker-4 INFO Processing test_docs/samba_turbo.pdf\n",
101
+ "2024-06-20 16:15:29,836 SpawnPoolWorker-4 DEBUG Using remote partition (http://localhost:8005)\n",
102
+ "2024-06-20 16:15:40,244 SpawnPoolWorker-4 INFO writing partitioned content to /Users/kwasia/.cache/unstructured/ingest/pipeline/partitioned/eb87c25354d57b8c7434994ca9c3f796.json\n",
103
+ "2024-06-20 16:15:40,254 MainProcess INFO Calling Chunker with 1 docs\n",
104
+ "2024-06-20 16:15:40,255 MainProcess INFO Running chunking node. Chunking config: {\"chunking_strategy\": \"basic\", \"combine_text_under_n_chars\": null, \"include_orig_elements\": true, \"max_characters\": 1500, \"multipage_sections\": true, \"new_after_n_chars\": null, \"overlap\": 300, \"overlap_all\": false}]\n",
105
+ "2024-06-20 16:15:40,255 MainProcess INFO Creating /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked\n",
106
+ "2024-06-20 16:15:42,318 SpawnPoolWorker-6 INFO writing chunking content to /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked/df2636b5a36c11e91958dfd7ae81ddb1.json\n",
107
+ "2024-06-20 16:15:42,323 MainProcess INFO Calling Copier with 1 docs\n",
108
+ "2024-06-20 16:15:42,323 MainProcess INFO Running copy node to move content to desired output location\n",
109
+ "2024-06-20 16:15:44,114 SpawnPoolWorker-9 INFO Copying /Users/kwasia/.cache/unstructured/ingest/pipeline/chunked/df2636b5a36c11e91958dfd7ae81ddb1.json -> output/samba_turbo.pdf.json\n",
110
+ "2024-06-20 16:15:44,320 - INFO - Ingest process completed successfully!\n",
111
+ "2024-06-20 16:15:44,321 - INFO - Performing additional processing...\n",
112
+ "2024-06-20 16:15:44,324 - INFO - Additional processing completed.\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "config_yaml = './config.yaml'\n",
118
+ "sambaparse = SambaParse(config_yaml)\n",
119
+ "\n",
120
+ "source_type = 'local'\n",
121
+ "input_path = './test_docs/samba_turbo.pdf'\n",
122
+ "additional_metadata = {'key': 'value'}\n",
123
+ "\n",
124
+ "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
125
+ " source_type, input_path=input_path, additional_metadata=additional_metadata\n",
126
+ ")"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 20,
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "name": "stdout",
136
+ "output_type": "stream",
137
+ "text": [
138
+ "This is the length of the lanchain docs 5\n",
139
+ "This is an example langcahin doc \n",
140
+ "\n",
141
+ " page_content=\"6/20/24, 3:23 PM\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nG\\\\SambaNovar\\n\\nEN\\n\\nBACK TO RESOURCES\\n\\n<\\n\\nPREVIOUS | NEXT\\n\\n>\\n\\nMay 29, 2024\\n\\njn\\n\\nNX\\n\\nfF\\n\\nBS\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nSambaNova is the clear winner of the latest large language model LLM benchmark by Artificial Analysis. Topping the Leaderboad at over 1000 tokens per second (t/s), Samba-1 Turbo sets a new record for Llama 3 8B performance on a single SN40L node and with full precision.\\n\\nWith speeds like this, enterprises can expect to accelerate an array of use cases and will enable innovation around unblocking agentic workflow, copilot, and synthetic data, to name a few. This breakthrough in AI technology is possible because the purpose-built SambaNova SN40L Reconfigurable Dataflow Unit RDU can hold hundreds of models at the same time and can switch between them in microseconds.\\n\\nSpeed for today and tomorrow\" metadata={'filename': 'samba_turbo.pdf', 'filetype': 'application/pdf', 'languages': 'eng', 'page_number': '1', 'orig_elements': 'eJzVl21v2zYQx7/KwW+2AV7DJ1FUMQxI22wrlqZFHrYCbVHw4WhzkSVBkut63b77jvYejCJF7BdDkleCyBN597v/Hak3nyZY4wKb8X0Kk8cwKWzl0TimbZBSVEWprZZRSx20YbJwkylMFjjaYEdL9p8mMdXY2AXmjwe7cPb9uOxd+6gLMdvm6XHdbaZt19XJ2zG1zdHf07VtZks7w4Hm30ywmU3e0WhHI++b5cJhT+P8Txoa8eOY19BHgh0JNQX5WEh49SIv8s/6P6EN9AWZfx5VjIWpKvRWRFlZ6dEJxr3SElEFr8xdR3WRtzhrP1iY2wFc315jA+McgTPGYDyiMdv3CfvHsJqvIY1fDWDBpRkEtDXEtgcKFvuuTwPC8fP9qJQlE8YV0ToeXamKqiAuKI3RJTNFFe6CymakPyBzuxh/fPv2X5L9LoPLNNZ4EwIUpQqxRKYL4RwtaQxHdMYrFSup7lzuu0E8X5DdTUGEGENRVbIqfCU1s0wFGazTVMGFJnZ3HcTJ2X56VFgq6jWoKMOlM0FykmMMEjk6LoW4D3q8lfVu4E+On/4Mly/h/OTi5dX505OLvTQZjUWunZeclwKDFJ4JpqMzUflKlXgfMNyaql0M3+2GfdXQ9jhr+/Q7hstscQMCxlUUUikUSjkjnePCCGFtYJE77x+eEl6dn/zy/OXVBfwBZyevL/fSQSWo+3tqzRELaniyUpoTFKkq5JWLd9KbPodwa6J2IXx/sA5Ky6pYeMZZITPlUAmnOTFBI6lC2IND8MKuQVRTEEyoXRpndLyTKx/wSyQsL5UpiAcLwUWLRisTSsmCcVYUgd8HEgdVxG/NXkVQBCtj0JJjLA1TvhROBaF9QR3IBV48uLjPXu8VNzeVQDR05SF5Oead08JjENFLr6R4ePmOP+wVty+rUvsouLHcM7qHlbzAwAXngU4Zey8Ov8PuAPsd+nQniq5iOieawi4dD5F5Q1WvqkKZ+ODi/j9/ab58dQqs0oXWBT0tp0opQwx0I6GyQS+re3Fe3JrpmymmYUPP12h7WKWmwR7auBmr6SAdRnr0s/yy9QkWbcAaTk9fgMPGzxe2vwa3huN+TDH5RIyPG1uvhzQ8gsu261Iz26x2urmfu9YGsCO0H2ifbcpyBgfo6H1A3zYBvqYsfjOFjY/fcrjM5GhuzIlscAU9mfVhk8rT2i4sSDBP8gI0srCNR2gbMh1o5xrh4kyxU2jIa7C0+CqNc4jLuoaO1kkDYX900ImpSAUycG1N6Zig25NkvIraMvqLlE7ci4o6SAu/ZiJDhxgGqNM1UrbSMN2pkgG8bQA/duhHShdY7wkJUcpEIfNaZ8ksqZ68zeZbzoSY4naUAlIVSS1HRdbtkmaXjatbf521QZE0Y/KwavvrWLerKfi2S3U7TjfrDOuG1JMNMtBp3j/DpPxGXJHCyFfqAmivxzktPZvTZlTVMKKfN23dztZZ4V07DCl74uiPP/uZBdktexrHb90y1SP8VxFbwZxnLcY0W/abEJ7R5tk7uGrSCOfPrjZM5m0dYE4B9RkeMdgUx5AFnncYsqNjWmyVlz8YSH5+Tm6MK9z2rUV2eJF8327VPxymRsOYDFHooL1WwmjDCxc8liW6KnJu74MaD+vvWYib2h7bQMLK5MZ20fZ9u7qhV7/7C2wCbXA=', 'key': 'value', 'type': 'CompositeElement', 'element_id': '34922f62e3c3e7600d32eb0627b79202', 'page': '1'}\n"
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "# Inspect the Output\n",
147
+ "\n",
148
+ "# 1. Number of Chunks\n",
149
+ "print(f'This is the length of the lanchain docs {len(langchain_docs)}')\n",
150
+ "\n",
151
+ "# 2. Example Chunk\n",
152
+ "print(f'This is an example langcahin doc \\n\\n {langchain_docs[0]}')"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "metadata": {},
158
+ "source": [
159
+ "# Use Case 2 - Process Whole Directory "
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "config_yaml = './config.yaml'\n",
169
+ "sambaparse = SambaParse(config_yaml)\n",
170
+ "\n",
171
+ "source_type = 'local'\n",
172
+ "input_path = './test_docs'\n",
173
+ "additional_metadata = {'key': 'value'}\n",
174
+ "\n",
175
+ "texts, metadata_list, langchain_docs = sambaparse.run_ingest(\n",
176
+ " source_type, input_path=input_path, additional_metadata=additional_metadata\n",
177
+ ")"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 22,
183
+ "metadata": {},
184
+ "outputs": [
185
+ {
186
+ "name": "stdout",
187
+ "output_type": "stream",
188
+ "text": [
189
+ "This is the length of the lanchain docs 44\n",
190
+ "This is an example langcahin doc \n",
191
+ "\n",
192
+ " page_content=\"6/20/24, 3:23 PM\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nG\\\\SambaNovar\\n\\nEN\\n\\nBACK TO RESOURCES\\n\\n<\\n\\nPREVIOUS | NEXT\\n\\n>\\n\\nMay 29, 2024\\n\\njn\\n\\nNX\\n\\nfF\\n\\nBS\\n\\nSambaNova has broken the 1000 t/s barrier: why it's a big deal for enterprise AI\\n\\nSambaNova is the clear winner of the latest large language model LLM benchmark by Artificial Analysis. Topping the Leaderboad at over 1000 tokens per second (t/s), Samba-1 Turbo sets a new record for Llama 3 8B performance on a single SN40L node and with full precision.\\n\\nWith speeds like this, enterprises can expect to accelerate an array of use cases and will enable innovation around unblocking agentic workflow, copilot, and synthetic data, to name a few. This breakthrough in AI technology is possible because the purpose-built SambaNova SN40L Reconfigurable Dataflow Unit RDU can hold hundreds of models at the same time and can switch between them in microseconds.\\n\\nSpeed for today and tomorrow\" metadata={'filename': 'samba_turbo.pdf', 'filetype': 'application/pdf', 'languages': 'eng', 'page_number': '1', 'orig_elements': 'eJzVl21v2zYQx7/KwW+2AV7DJ1FUMQxI22wrlqZFHrYCbVHw4WhzkSVBkut63b77jvYejCJF7BdDkleCyBN597v/Hak3nyZY4wKb8X0Kk8cwKWzl0TimbZBSVEWprZZRSx20YbJwkylMFjjaYEdL9p8mMdXY2AXmjwe7cPb9uOxd+6gLMdvm6XHdbaZt19XJ2zG1zdHf07VtZks7w4Hm30ywmU3e0WhHI++b5cJhT+P8Txoa8eOY19BHgh0JNQX5WEh49SIv8s/6P6EN9AWZfx5VjIWpKvRWRFlZ6dEJxr3SElEFr8xdR3WRtzhrP1iY2wFc315jA+McgTPGYDyiMdv3CfvHsJqvIY1fDWDBpRkEtDXEtgcKFvuuTwPC8fP9qJQlE8YV0ToeXamKqiAuKI3RJTNFFe6CymakPyBzuxh/fPv2X5L9LoPLNNZ4EwIUpQqxRKYL4RwtaQxHdMYrFSup7lzuu0E8X5DdTUGEGENRVbIqfCU1s0wFGazTVMGFJnZ3HcTJ2X56VFgq6jWoKMOlM0FykmMMEjk6LoW4D3q8lfVu4E+On/4Mly/h/OTi5dX505OLvTQZjUWunZeclwKDFJ4JpqMzUflKlXgfMNyaql0M3+2GfdXQ9jhr+/Q7hstscQMCxlUUUikUSjkjnePCCGFtYJE77x+eEl6dn/zy/OXVBfwBZyevL/fSQSWo+3tqzRELaniyUpoTFKkq5JWLd9KbPodwa6J2IXx/sA5Ky6pYeMZZITPlUAmnOTFBI6lC2IND8MKuQVRTEEyoXRpndLyTKx/wSyQsL5UpiAcLwUWLRisTSsmCcVYUgd8HEgdVxG/NXkVQBCtj0JJjLA1TvhROBaF9QR3IBV48uLjPXu8VNzeVQDR05SF5Oead08JjENFLr6R4ePmOP+wVty+rUvsouLHcM7qHlbzAwAXngU4Zey8Ov8PuAPsd+nQniq5iOieawi4dD5F5Q1WvqkKZ+ODi/j9/ab58dQqs0oXWBT0tp0opQwx0I6GyQS+re3Fe3JrpmymmYUPP12h7WKWmwR7auBmr6SAdRnr0s/yy9QkWbcAaTk9fgMPGzxe2vwa3huN+TDH5RIyPG1uvhzQ8gsu261Iz26x2urmfu9YGsCO0H2ifbcpyBgfo6H1A3zYBvqYsfjOFjY/fcrjM5GhuzIlscAU9mfVhk8rT2i4sSDBP8gI0srCNR2gbMh1o5xrh4kyxU2jIa7C0+CqNc4jLuoaO1kkDYX900ImpSAUycG1N6Zig25NkvIraMvqLlE7ci4o6SAu/ZiJDhxgGqNM1UrbSMN2pkgG8bQA/duhHShdY7wkJUcpEIfNaZ8ksqZ68zeZbzoSY4naUAlIVSS1HRdbtkmaXjatbf521QZE0Y/KwavvrWLerKfi2S3U7TjfrDOuG1JMNMtBp3j/DpPxGXJHCyFfqAmivxzktPZvTZlTVMKKfN23dztZZ4V07DCl74uiPP/uZBdktexrHb90y1SP8VxFbwZxnLcY0W/abEJ7R5tk7uGrSCOfPrjZM5m0dYE4B9RkeMdgUx5AFnncYsqNjWmyVlz8YSH5+Tm6MK9z2rUV2eJF8327VPxymRsOYDFHooL1WwmjDCxc8liW6KnJu74MaD+vvWYib2h7bQMLK5MZ20fZ9u7qhV7/7C2wCbXA=', 'key': 'value', 'type': 'CompositeElement', 'element_id': '34922f62e3c3e7600d32eb0627b79202', 'page': '1'}\n"
193
+ ]
194
+ }
195
+ ],
196
+ "source": [
197
+ "# Inspect the Output\n",
198
+ "\n",
199
+ "# 1. Number of Chunks\n",
200
+ "print(f'This is the length of the lanchain docs {len(langchain_docs)}')\n",
201
+ "\n",
202
+ "# 2. Example Chunk\n",
203
+ "print(f'This is an example langcahin doc \\n\\n {langchain_docs[0]}')"
204
+ ]
205
+ }
206
+ ],
207
+ "metadata": {
208
+ "kernelspec": {
209
+ "display_name": "aisk-fine-tune-embeddings",
210
+ "language": "python",
211
+ "name": "python3"
212
+ },
213
+ "language_info": {
214
+ "codemirror_mode": {
215
+ "name": "ipython",
216
+ "version": 3
217
+ },
218
+ "file_extension": ".py",
219
+ "mimetype": "text/x-python",
220
+ "name": "python",
221
+ "nbconvert_exporter": "python",
222
+ "pygments_lexer": "ipython3",
223
+ "version": "3.10.12"
224
+ }
225
+ },
226
+ "nbformat": 4,
227
+ "nbformat_minor": 2
228
+ }
utils/parsing/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ unstructured==0.13.6
2
+ unstructured-client==0.18.0
3
+ unstructured-inference==0.7.29
4
+ langchain==0.1.16
5
+ PyMuPDF==1.23.4
6
+ PyMuPDFb==1.23.3
utils/parsing/sambaparse.py ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import subprocess
4
+ import json
5
+ import logging
6
+ from typing import Dict, Optional, List, Tuple, Union, Any
7
+ from dotenv import load_dotenv
8
+ from langchain.docstore.document import Document
9
+ import shutil
10
+ from langchain_community.document_loaders import PyMuPDFLoader
11
+
12
+ load_dotenv()
13
+
14
+ logging.basicConfig(
15
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
16
+ )
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class SambaParse:
21
+ def __init__(self, config_path: str):
22
+ with open(config_path, "r") as file:
23
+ self.config = yaml.safe_load(file)
24
+
25
+ # Set the default Unstructured API key as an environment variable if not already set
26
+ if "UNSTRUCTURED_API_KEY" not in os.environ:
27
+ default_api_key = self.config.get("partitioning", {}).get("default_unstructured_api_key")
28
+ if default_api_key:
29
+ os.environ["UNSTRUCTURED_API_KEY"] = default_api_key
30
+
31
+
32
+ def run_ingest(
33
+ self,
34
+ source_type: str,
35
+ input_path: Optional[str] = None,
36
+ additional_metadata: Optional[Dict] = None,
37
+ ) -> Tuple[List[str], List[Dict], List[Document]]:
38
+ """
39
+ Runs the ingest process for the specified source type and input path.
40
+
41
+ Args:
42
+ source_type (str): The type of source to ingest (e.g., 'local', 'confluence', 'github', 'google-drive').
43
+ input_path (Optional[str]): The input path for the source (only required for 'local' source type).
44
+ additional_metadata (Optional[Dict]): Additional metadata to include in the processed documents.
45
+
46
+ Returns:
47
+ Tuple[List[str], List[Dict], List[Document]]: A tuple containing the extracted texts, metadata, and LangChain documents.
48
+ """
49
+ if not self.config["partitioning"]["partition_by_api"]:
50
+ return self._run_ingest_pymupdf(input_path, additional_metadata)
51
+
52
+ output_dir = self.config["processor"]["output_dir"]
53
+
54
+ # Create the output directory if it doesn't exist
55
+ os.makedirs(output_dir, exist_ok=True)
56
+
57
+ # Delete contents of the output directory using shell command
58
+ del_command = f"rm -rf {output_dir}/*"
59
+ logger.info(f"Deleting contents of output directory: {output_dir}")
60
+ subprocess.run(del_command, shell=True, check=True)
61
+
62
+ command = [
63
+ "unstructured-ingest",
64
+ source_type,
65
+ "--output-dir",
66
+ output_dir,
67
+ "--num-processes",
68
+ str(self.config["processor"]["num_processes"]),
69
+ ]
70
+
71
+ if self.config["processor"]["reprocess"] == True:
72
+ command.extend(["--reprocess"])
73
+
74
+ # Add partition arguments
75
+ command.extend(
76
+ [
77
+ "--strategy",
78
+ self.config["partitioning"]["strategy"],
79
+ "--ocr-languages",
80
+ ",".join(self.config["partitioning"]["ocr_languages"]),
81
+ "--encoding",
82
+ self.config["partitioning"]["encoding"],
83
+ "--fields-include",
84
+ ",".join(self.config["partitioning"]["fields_include"]),
85
+ "--metadata-exclude",
86
+ ",".join(self.config["partitioning"]["metadata_exclude"]),
87
+ "--metadata-include",
88
+ ",".join(self.config["partitioning"]["metadata_include"]),
89
+ ]
90
+ )
91
+
92
+ if self.config["partitioning"]["skip_infer_table_types"]:
93
+ command.extend(
94
+ [
95
+ "--skip-infer-table-types",
96
+ ",".join(self.config["partitioning"]["skip_infer_table_types"]),
97
+ ]
98
+ )
99
+
100
+ if self.config["partitioning"]["flatten_metadata"]:
101
+ command.append("--flatten-metadata")
102
+
103
+ if source_type == "local":
104
+ if input_path is None:
105
+ raise ValueError("Input path is required for local source type.")
106
+ command.extend(["--input-path", f'"{input_path}"'])
107
+
108
+ if self.config["sources"]["local"]["recursive"]:
109
+ command.append("--recursive")
110
+ elif source_type == "confluence":
111
+ command.extend(
112
+ [
113
+ "--url",
114
+ self.config["sources"]["confluence"]["url"],
115
+ "--user-email",
116
+ self.config["sources"]["confluence"]["user_email"],
117
+ "--api-token",
118
+ self.config["sources"]["confluence"]["api_token"],
119
+ ]
120
+ )
121
+ elif source_type == "github":
122
+ command.extend(
123
+ [
124
+ "--url",
125
+ self.config["sources"]["github"]["url"],
126
+ "--git-branch",
127
+ self.config["sources"]["github"]["branch"],
128
+ ]
129
+ )
130
+ elif source_type == "google-drive":
131
+ command.extend(
132
+ [
133
+ "--drive-id",
134
+ self.config["sources"]["google_drive"]["drive_id"],
135
+ "--service-account-key",
136
+ self.config["sources"]["google_drive"]["service_account_key"],
137
+ ]
138
+ )
139
+ if self.config["sources"]["google_drive"]["recursive"]:
140
+ command.append("--recursive")
141
+ else:
142
+ raise ValueError(f"Unsupported source type: {source_type}")
143
+
144
+ if self.config["processor"]["verbose"]:
145
+ command.append("--verbose")
146
+
147
+ if self.config["partitioning"]["partition_by_api"]:
148
+ api_key = os.getenv("UNSTRUCTURED_API_KEY")
149
+ partition_endpoint_url = f"{self.config['partitioning']['partition_endpoint']}:{self.config['partitioning']['unstructured_port']}"
150
+ if api_key:
151
+ command.extend(["--partition-by-api", "--api-key", api_key])
152
+ command.extend(["--partition-endpoint", partition_endpoint_url])
153
+ else:
154
+ logger.warning("No Unstructured API key available. Partitioning by API will be skipped.")
155
+
156
+ if self.config["partitioning"]["strategy"] == "hi_res":
157
+ if (
158
+ "hi_res_model_name" in self.config["partitioning"]
159
+ and self.config["partitioning"]["hi_res_model_name"]
160
+ ):
161
+ command.extend(
162
+ [
163
+ "--hi-res-model-name",
164
+ self.config["partitioning"]["hi_res_model_name"],
165
+ ]
166
+ )
167
+ logger.warning(
168
+ "You've chosen the high-resolution partitioning strategy. Grab a cup of coffee or tea while you wait, as this may take some time due to OCR and table detection."
169
+ )
170
+
171
+ if self.config["chunking"]["enabled"]:
172
+ command.extend(
173
+ [
174
+ "--chunking-strategy",
175
+ self.config["chunking"]["strategy"],
176
+ "--chunk-max-characters",
177
+ str(self.config["chunking"]["chunk_max_characters"]),
178
+ "--chunk-overlap",
179
+ str(self.config["chunking"]["chunk_overlap"]),
180
+ ]
181
+ )
182
+
183
+ if self.config["chunking"]["strategy"] == "by_title":
184
+ command.extend(
185
+ [
186
+ "--chunk-combine-text-under-n-chars",
187
+ str(self.config["chunking"]["combine_under_n_chars"]),
188
+ ]
189
+ )
190
+
191
+ if self.config["embedding"]["enabled"]:
192
+ command.extend(
193
+ [
194
+ "--embedding-provider",
195
+ self.config["embedding"]["provider"],
196
+ "--embedding-model-name",
197
+ self.config["embedding"]["model_name"],
198
+ ]
199
+ )
200
+
201
+ if self.config["destination_connectors"]["enabled"]:
202
+ destination_type = self.config["destination_connectors"]["type"]
203
+ if destination_type == "chroma":
204
+ command.extend(
205
+ [
206
+ "chroma",
207
+ "--host",
208
+ self.config["destination_connectors"]["chroma"]["host"],
209
+ "--port",
210
+ str(self.config["destination_connectors"]["chroma"]["port"]),
211
+ "--collection-name",
212
+ self.config["destination_connectors"]["chroma"][
213
+ "collection_name"
214
+ ],
215
+ "--tenant",
216
+ self.config["destination_connectors"]["chroma"]["tenant"],
217
+ "--database",
218
+ self.config["destination_connectors"]["chroma"]["database"],
219
+ "--batch-size",
220
+ str(self.config["destination_connectors"]["batch_size"]),
221
+ ]
222
+ )
223
+ elif destination_type == "qdrant":
224
+ command.extend(
225
+ [
226
+ "qdrant",
227
+ "--location",
228
+ self.config["destination_connectors"]["qdrant"]["location"],
229
+ "--collection-name",
230
+ self.config["destination_connectors"]["qdrant"][
231
+ "collection_name"
232
+ ],
233
+ "--batch-size",
234
+ str(self.config["destination_connectors"]["batch_size"]),
235
+ ]
236
+ )
237
+ else:
238
+ raise ValueError(
239
+ f"Unsupported destination connector type: {destination_type}"
240
+ )
241
+
242
+ command_str = " ".join(command)
243
+ logger.info(f"Running command: {command_str}")
244
+ logger.info(
245
+ "This may take some time depending on the size of your data. Please be patient..."
246
+ )
247
+
248
+ subprocess.run(command_str, shell=True, check=True)
249
+
250
+ logger.info("Ingest process completed successfully!")
251
+
252
+ # Call the additional processing function if enabled
253
+ if self.config["additional_processing"]["enabled"]:
254
+ logger.info("Performing additional processing...")
255
+ texts, metadata_list, langchain_docs = additional_processing(
256
+ directory=output_dir,
257
+ extend_metadata=self.config["additional_processing"]["extend_metadata"],
258
+ additional_metadata=additional_metadata,
259
+ replace_table_text=self.config["additional_processing"][
260
+ "replace_table_text"
261
+ ],
262
+ table_text_key=self.config["additional_processing"]["table_text_key"],
263
+ return_langchain_docs=self.config["additional_processing"][
264
+ "return_langchain_docs"
265
+ ],
266
+ convert_metadata_keys_to_string=self.config["additional_processing"][
267
+ "convert_metadata_keys_to_string"
268
+ ],
269
+ )
270
+ logger.info("Additional processing completed.")
271
+ return texts, metadata_list, langchain_docs
272
+
273
+ def _run_ingest_pymupdf(
274
+ self, input_path: str, additional_metadata: Optional[Dict] = None
275
+ ) -> Tuple[List[str], List[Dict], List[Document]]:
276
+ """
277
+ Runs the ingest process using PyMuPDF via LangChain.
278
+
279
+ Args:
280
+ input_path (str): The input path for the source.
281
+ additional_metadata (Optional[Dict]): Additional metadata to include in the processed documents.
282
+
283
+ Returns:
284
+ Tuple[List[str], List[Dict], List[Document]]: A tuple containing the extracted texts, metadata, and LangChain documents.
285
+ """
286
+ if not input_path:
287
+ raise ValueError("Input path is required for PyMuPDF processing.")
288
+
289
+ texts = []
290
+ metadata_list = []
291
+ langchain_docs = []
292
+
293
+ if os.path.isfile(input_path):
294
+ file_paths = [input_path]
295
+ else:
296
+ file_paths = [
297
+ os.path.join(input_path, f)
298
+ for f in os.listdir(input_path)
299
+ if f.lower().endswith('.pdf')
300
+ ]
301
+
302
+ for file_path in file_paths:
303
+ loader = PyMuPDFLoader(file_path)
304
+ docs = loader.load()
305
+
306
+ for doc in docs:
307
+ text = doc.page_content
308
+ metadata = doc.metadata
309
+
310
+ # Add 'filename' key to metadata
311
+ metadata['filename'] = os.path.basename(metadata['source'])
312
+
313
+ if additional_metadata:
314
+ metadata.update(additional_metadata)
315
+
316
+ texts.append(text)
317
+ metadata_list.append(metadata)
318
+ langchain_docs.append(doc)
319
+
320
+ return texts, metadata_list, langchain_docs
321
+
322
+
323
+ def convert_to_string(value: Union[List, Tuple, Dict, Any]) -> str:
324
+ """
325
+ Convert a value to its string representation.
326
+
327
+ Args:
328
+ value (Union[List, Tuple, Dict, Any]): The value to be converted to a string.
329
+
330
+ Returns:
331
+ str: The string representation of the value.
332
+ """
333
+ if isinstance(value, (list, tuple)):
334
+ return ", ".join(map(str, value))
335
+ elif isinstance(value, dict):
336
+ return json.dumps(value)
337
+ else:
338
+ return str(value)
339
+
340
+
341
+ def additional_processing(
342
+ directory: str,
343
+ extend_metadata: bool,
344
+ additional_metadata: Optional[Dict],
345
+ replace_table_text: bool,
346
+ table_text_key: str,
347
+ return_langchain_docs: bool,
348
+ convert_metadata_keys_to_string: bool,
349
+ ):
350
+ """
351
+ Performs additional processing on the extracted documents.
352
+
353
+ Args:
354
+ directory (str): The directory containing the extracted JSON files.
355
+ extend_metadata (bool): Whether to extend the metadata with additional metadata.
356
+ additional_metadata (Optional[Dict]): Additional metadata to include in the processed documents.
357
+ replace_table_text (bool): Whether to replace table text with the specified table text key.
358
+ table_text_key (str): The key to use for replacing table text.
359
+ return_langchain_docs (bool): Whether to return LangChain documents.
360
+ convert_metadata_keys_to_string (bool): Whether to convert non-string metadata keys to string.
361
+
362
+ Returns:
363
+ Tuple[List[str], List[Dict], List[Document]]: A tuple containing the extracted texts, metadata, and LangChain documents.
364
+ """
365
+ if os.path.isfile(directory):
366
+ file_paths = [directory]
367
+ else:
368
+ file_paths = [
369
+ os.path.join(directory, f)
370
+ for f in os.listdir(directory)
371
+ if f.endswith(".json")
372
+ ]
373
+
374
+ texts = []
375
+ metadata_list = []
376
+ langchain_docs = []
377
+
378
+ for file_path in file_paths:
379
+ with open(file_path, "r") as file:
380
+ data = json.load(file)
381
+
382
+ for element in data:
383
+ if extend_metadata and additional_metadata:
384
+ element["metadata"].update(additional_metadata)
385
+
386
+ if replace_table_text and element["type"] == "Table":
387
+ element["text"] = element["metadata"][table_text_key]
388
+
389
+ metadata = element["metadata"].copy()
390
+ if convert_metadata_keys_to_string:
391
+ metadata = {
392
+ str(key): convert_to_string(value)
393
+ for key, value in metadata.items()
394
+ }
395
+ for key in element:
396
+ if key not in ["text", "metadata", "embeddings"]:
397
+ metadata[key] = element[key]
398
+ if "page_number" in metadata:
399
+ metadata["page"] = metadata["page_number"]
400
+ else:
401
+ metadata["page"] = 1
402
+
403
+ metadata_list.append(metadata)
404
+ texts.append(element["text"])
405
+
406
+ if return_langchain_docs:
407
+ langchain_docs.extend(get_langchain_docs(texts, metadata_list))
408
+
409
+ with open(file_path, "w") as file:
410
+ json.dump(data, file, indent=2)
411
+
412
+ return texts, metadata_list, langchain_docs
413
+
414
+
415
+ def get_langchain_docs(texts: List[str], metadata_list: List[Dict]) -> List[Document]:
416
+ """
417
+ Creates LangChain documents from the extracted texts and metadata.
418
+
419
+ Args:
420
+ texts (List[str]): The extracted texts.
421
+ metadata_list (List[Dict]): The metadata associated with each text.
422
+
423
+ Returns:
424
+ List[Document]: A list of LangChain documents.
425
+ """
426
+ return [
427
+ Document(page_content=content, metadata=metadata)
428
+ for content, metadata in zip(texts, metadata_list)
429
+ ]
430
+
431
+
432
+ def parse_doc_universal(
433
+ doc: str, additional_metadata: Optional[Dict] = None, source_type: str = "local"
434
+ ) -> Tuple[List[str], List[Dict], List[Document]]:
435
+ """
436
+ Extract text, tables, images, and metadata from a document or a folder of documents.
437
+
438
+ Args:
439
+ doc (str): Path to the document or folder of documents.
440
+ additional_metadata (Optional[Dict], optional): Additional metadata to include in the processed documents.
441
+ Defaults to an empty dictionary.
442
+ source_type (str, optional): The type of source to ingest. Defaults to 'local'.
443
+
444
+ Returns:
445
+ Tuple[List[str], List[Dict], List[Document]]: A tuple containing:
446
+ - A list of extracted text per page.
447
+ - A list of extracted metadata per page.
448
+ - A list of LangChain documents.
449
+ """
450
+ if additional_metadata is None:
451
+ additional_metadata = {}
452
+
453
+ # Get the directory of the current file
454
+ current_dir = os.path.dirname(os.path.abspath(__file__))
455
+
456
+ # Join the current directory with the relative path of the config file
457
+ config_path = os.path.join(current_dir, "config.yaml")
458
+
459
+ wrapper = SambaParse(config_path)
460
+
461
+ def process_file(file_path):
462
+ if file_path.lower().endswith('.pdf'):
463
+ return wrapper._run_ingest_pymupdf(file_path, additional_metadata)
464
+ else:
465
+ # Use the original method for non-PDF files
466
+ return wrapper.run_ingest(source_type, input_path=file_path, additional_metadata=additional_metadata)
467
+
468
+ if os.path.isfile(doc):
469
+ return process_file(doc)
470
+ else:
471
+ all_texts, all_metadata, all_docs = [], [], []
472
+ for root, _, files in os.walk(doc):
473
+ for file in files:
474
+ file_path = os.path.join(root, file)
475
+ texts, metadata_list, langchain_docs = process_file(file_path)
476
+ all_texts.extend(texts)
477
+ all_metadata.extend(metadata_list)
478
+ all_docs.extend(langchain_docs)
479
+ return all_texts, all_metadata, all_docs
480
+
481
+
482
+ def parse_doc_streamlit(docs: List,
483
+ kit_dir: str,
484
+ additional_metadata: Optional[Dict] = None,
485
+ ) -> List[Document]:
486
+ """
487
+ Parse the uploaded documents and return a list of LangChain documents.
488
+
489
+ Args:
490
+ docs (List[UploadFile]): A list of uploaded files.
491
+ kit_dir (str): The directory of the current kit.
492
+ additional_metadata (Optional[Dict], optional): Additional metadata to include in the processed documents.
493
+ Defaults to an empty dictionary.
494
+
495
+ Returns:
496
+ List[Document]: A list of LangChain documents.
497
+ """
498
+ if additional_metadata is None:
499
+ additional_metadata = {}
500
+
501
+ # Create the data/tmp folder if it doesn't exist
502
+ temp_folder = os.path.join(kit_dir, "data/tmp")
503
+ if not os.path.exists(temp_folder):
504
+ os.makedirs(temp_folder)
505
+ else:
506
+ # If there are already files there, delete them
507
+ for filename in os.listdir(temp_folder):
508
+ file_path = os.path.join(temp_folder, filename)
509
+ try:
510
+ if os.path.isfile(file_path) or os.path.islink(file_path):
511
+ os.unlink(file_path)
512
+ elif os.path.isdir(file_path):
513
+ shutil.rmtree(file_path)
514
+ except Exception as e:
515
+ print(f'Failed to delete {file_path}. Reason: {e}')
516
+
517
+ # Save all selected files to the tmp dir with their file names
518
+ for doc in docs:
519
+ temp_file = os.path.join(temp_folder, doc.name)
520
+ with open(temp_file, "wb") as f:
521
+ f.write(doc.getvalue())
522
+
523
+ # Pass in the temp folder for processing into the parse_doc_universal function
524
+ _, _, langchain_docs = parse_doc_universal(doc=temp_folder, additional_metadata=additional_metadata)
525
+ return langchain_docs
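For orientation, here is a minimal usage sketch of parse_doc_universal as defined above. It assumes the module is importable as utils.parsing.sambaparse and that a valid config.yaml sits next to it; the file path and metadata values below are illustrative, not part of the kit.

    from utils.parsing.sambaparse import parse_doc_universal

    # Parse a single document (or a folder); returns texts, metadata, and LangChain docs.
    texts, metadata, docs = parse_doc_universal(
        doc="data/sample.pdf",                    # illustrative path
        additional_metadata={"source": "demo"},   # merged into each element's metadata
    )
    print(f"{len(texts)} text elements extracted")
    if docs:
        print(docs[0].metadata)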
utils/vectordb/create_vector_db.py ADDED
@@ -0,0 +1,141 @@
1
+ # Define the script's usage example
2
+ USAGE_EXAMPLE = """
3
+ Example usage:
4
+
5
+ To process input *.txt files at input_path and save the vector db output at output_db:
6
+ python create_vector_db.py input_path output_db --chunk_size 100 --chunk_overlap 10
7
+
8
+ Required arguments:
9
+ - input_path: Path to the input dir containing the .txt files
10
+ - output_db: Path to the output vector db.
11
+
12
+ Optional arguments:
13
+ - --chunk_size: Size of the chunks (default: 1000).
14
+ - --chunk_overlap: Overlap between chunks (default: 200).
15
+ """
16
+
17
+ import argparse
18
+ import logging
19
+ import os
20
+
21
+ from langchain.document_loaders import DirectoryLoader
22
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
24
+ from langchain.vectorstores import FAISS, Chroma, Qdrant
25
+
26
+ # Configure the logger
27
+ logging.basicConfig(
28
+ level=logging.INFO, # Set the logging level (e.g., INFO, DEBUG)
29
+ format="%(asctime)s [%(levelname)s] - %(message)s", # Define the log message format
30
+ handlers=[
31
+ logging.StreamHandler(), # Output logs to the console
32
+ logging.FileHandler("create_vector_db.log"),
33
+ ],
34
+ )
35
+
36
+ # Create a logger object
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ # Parse the arguments
41
+ def parse_arguments():
42
+ parser = argparse.ArgumentParser(description="Process command line arguments.")
43
+ parser.add_argument("-input_path", type=dir_path, help="path to input directory")
44
+ parser.add_argument("--chunk_size", type=int, help="chunk size for splitting")
45
+ parser.add_argument("--chunk_overlap", type=int, help="chunk overlap for splitting")
46
+ parser.add_argument("-output_path", type=dir_path, help="path to input directory")
47
+
48
+ return parser.parse_args()
49
+
50
+
51
+ # Check valid path
52
+ def dir_path(path):
53
+ if os.path.isdir(path):
54
+ return path
55
+ else:
56
+ raise argparse.ArgumentTypeError(f"readable_dir:{path} is not a valid path")
57
+
58
+
59
+ def main(input_path, output_db, chunk_size, chunk_overlap, db_type):
60
+ # Load files from input_location
61
+ loader = DirectoryLoader(input_path, glob="*.txt")
62
+ docs = loader.load()
63
+ logger.info(f"Total {len(docs)} files loaded")
64
+
65
+ # get the text chunks
66
+ text_splitter = RecursiveCharacterTextSplitter(
67
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
68
+ )
69
+ chunks = text_splitter.split_documents(docs)
70
+ logger.info(f"Total {len(chunks)} chunks created")
71
+
72
+ # create vector store
73
+ encode_kwargs = {"normalize_embeddings": True}
74
+ embedding_model = "BAAI/bge-large-en"
75
+ embeddings = HuggingFaceInstructEmbeddings(
76
+ model_name=embedding_model,
77
+ embed_instruction="", # no instruction is needed for candidate passages
78
+ query_instruction="Represent this sentence for searching relevant passages: ",
79
+ encode_kwargs=encode_kwargs,
80
+ )
81
+ logger.info(
82
+ f"Processing embeddings using {embedding_model}. This could take time depending on the number of chunks ..."
83
+ )
84
+
85
+ if db_type == "faiss":
86
+ vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)
87
+ # save vectorstore
88
+ vectorstore.save_local(output_db)
89
+ elif db_type == "chromadb":
90
+ vectorstore = Chroma.from_documents(
91
+ documents=chunks, embedding=embeddings, persist_directory=output_db
92
+ )
93
+ elif db_type == "qdrant":
94
+ vectorstore = Qdrant.from_documents(
95
+ documents=chunks,
96
+ embedding=embeddings,
97
+ path=output_db,
98
+ collection_name="test_collection",
99
+ )
100
+ elif db_type == "qdrant-server":
101
+ url = "http://localhost:6333/"
102
+ vectorstore = Qdrant.from_documents(
103
+ documents=chunks,
104
+ embedding=embeddings,
105
+ url=url,
106
+ prefer_grpc=True,
107
+ collection_name="anaconda",
108
+ )
109
+
110
+ logger.info(f"Vector store saved to {output_db}")
111
+
112
+
113
+ if __name__ == "__main__":
114
+ parser = argparse.ArgumentParser(description="Process data with optional chunking")
115
+
116
+ # Required arguments
117
+ parser.add_argument("input_path", type=str, help="Path to the input directory")
118
+ parser.add_argument("output_db", type=str, help="Path to the output vectordb")
119
+
120
+ # Optional arguments
121
+ parser.add_argument(
122
+ "--chunk_size", type=int, default=1000, help="Chunk size (default: 1000)"
123
+ )
124
+ parser.add_argument(
125
+ "--chunk_overlap", type=int, default=200, help="Chunk overlap (default: 200)"
126
+ )
127
+ parser.add_argument(
128
+ "--db_type",
129
+ type=str,
130
+ default="faiss",
131
+ help="Type of vectorstore (default: faiss)",
132
+ )
133
+
134
+ args = parser.parse_args()
135
+ main(
136
+ args.input_path,
137
+ args.output_db,
138
+ args.chunk_size,
139
+ args.chunk_overlap,
140
+ args.db_type,
141
+ )
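Putting the arguments above together, an invocation consistent with the script's argparse setup would look like this (paths are illustrative):

    # Build a Chroma store from the .txt files in ./docs using 1000-character chunks.
    python utils/vectordb/create_vector_db.py ./docs ./data/my-vector-db \
        --chunk_size 1000 --chunk_overlap 200 --db_type chromadb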
utils/vectordb/vector_db.py ADDED
@@ -0,0 +1,353 @@
1
+ # Define the script's usage example
2
+ USAGE_EXAMPLE = """
3
+ Example usage:
4
+
5
+ To process input *.txt files at input_path and save the vector db output at output_db:
6
+ python vector_db.py --input_path input_path --output_db output_db --chunk_size 100 --chunk_overlap 10
7
+
8
+ Required arguments:
9
+ - input_path: Path to the input dir containing the .txt files
10
+ - output_db: Path to the output vector db.
11
+
12
+ Optional arguments:
13
+ - --chunk_size: Size of the chunks (default: 1000).
14
+ - --chunk_overlap: Overlap between chunks (default: 200).
15
+ """
16
+
17
+ import os
18
+ import sys
19
+ import argparse
20
+ import logging
21
+
22
+ from langchain_community.document_loaders import DirectoryLoader, UnstructuredURLLoader
23
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
24
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
25
+ from langchain_community.vectorstores import FAISS, Chroma, Qdrant
26
+
27
+ vectordb_dir = os.path.dirname(os.path.abspath(__file__))
28
+ utils_dir = os.path.abspath(os.path.join(vectordb_dir, ".."))
29
+ repo_dir = os.path.abspath(os.path.join(utils_dir, ".."))
30
+
31
+ sys.path.append(repo_dir)
32
+ sys.path.append(utils_dir)
33
+
34
+ from utils.model_wrappers.api_gateway import APIGateway
35
+ import uuid
36
+ import streamlit as st
37
+
38
+ EMBEDDING_MODEL = "intfloat/e5-large-v2"
39
+ NORMALIZE_EMBEDDINGS = True
40
+ VECTORDB_LOG_FILE_NAME = "vector_db.log"
41
+
42
+ # Configure the logger
43
+ logging.basicConfig(
44
+ level=logging.INFO, # Set the logging level (e.g., INFO, DEBUG)
45
+ format="%(asctime)s [%(levelname)s] - %(message)s", # Define the log message format
46
+ handlers=[
47
+ logging.StreamHandler(), # Output logs to the console
48
+ logging.FileHandler(VECTORDB_LOG_FILE_NAME),
49
+ ],
50
+ )
51
+
52
+ # Create a logger object
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
+ class VectorDb():
57
+ """
58
+ A class for creating, updating, and loading FAISS or Chroma vector databases
59
+ for use in retrieval-augmented generation (RAG) tasks with LangChain.
60
+
61
+ Args:
62
+ None
63
+
64
+ Attributes:
65
+ None
66
+
67
+ Methods:
68
+ load_files: Load files from an input directory as langchain documents
69
+ get_text_chunks: Get text chunks from a list of documents
70
+ get_token_chunks: Get token chunks from a list of documents
71
+ create_vector_store: Create a vector store from chunks and an embedding model
72
+ load_vdb: load a previous stored vector database
73
+ update_vdb: Update an existing vector store with new chunks
74
+ create_vdb: Create a vector database from the raw files in a specific input directory
75
+ """
76
+ def __init__(self) -> None:
77
+ self.collection_id = str(uuid.uuid4())
78
+ self.vector_collections = set()
79
+
80
+ def load_files(self, input_path, recursive=False, load_txt=True, load_pdf=False, urls = None) -> list:
81
+ """Load files from input location
82
+
83
+ Args:
84
+ input_path : input location of files
85
+ recursive (bool, optional): flag to load files recursively. Defaults to False.
86
+ load_txt (bool, optional): flag to load txt files. Defaults to True.
87
+ load_pdf (bool, optional): flag to load pdf files. Defaults to False.
88
+ urls (list, optional): list of urls to load. Defaults to None.
89
+
90
+ Returns:
91
+ list: list of documents
92
+ """
93
+ docs=[]
94
+ text_loader_kwargs={'autodetect_encoding': True}
95
+ if input_path is not None:
96
+ if load_txt:
97
+ loader = DirectoryLoader(input_path, glob="*.txt", recursive=recursive, show_progress=True, loader_kwargs=text_loader_kwargs)
98
+ docs.extend(loader.load())
99
+ if load_pdf:
100
+ loader = DirectoryLoader(input_path, glob="*.pdf", recursive=recursive, show_progress=True, loader_kwargs=text_loader_kwargs)
101
+ docs.extend(loader.load())
102
+ if urls:
103
+ loader = UnstructuredURLLoader(urls=urls)
104
+ docs.extend(loader.load())
105
+
106
+ logger.info(f"Total {len(docs)} files loaded")
107
+
108
+ return docs
109
+
110
+ def get_text_chunks(self, docs: list, chunk_size: int, chunk_overlap: int, meta_data: list = None) -> list:
111
+ """Gets text chunks. If metadata is not None, it will create chunks with metadata elements.
112
+
113
+ Args:
114
+ docs (list): list of documents or texts. If no metadata is passed, this parameter is a list of documents.
115
+ If metadata is passed, this parameter is a list of texts.
116
+ chunk_size (int): chunk size in number of characters
117
+ chunk_overlap (int): chunk overlap in number of characters
118
+ meta_data (list, optional): list of metadata dictionaries. Defaults to None.
119
+
120
+ Returns:
121
+ list: list of documents
122
+ """
123
+
124
+ text_splitter = RecursiveCharacterTextSplitter(
125
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
126
+ )
127
+
128
+ if meta_data is None:
129
+ logger.info(f"Splitter: splitting documents")
130
+ chunks = text_splitter.split_documents(docs)
131
+ else:
132
+ logger.info(f"Splitter: creating documents with metadata")
133
+ chunks = text_splitter.create_documents(docs, meta_data)
134
+
135
+ logger.info(f"Total {len(chunks)} chunks created")
136
+
137
+ return chunks
138
+
139
+ def get_token_chunks(self, docs: list, chunk_size: int, chunk_overlap: int, tokenizer) -> list:
140
+ """Gets token chunks. If metadata is not None, it will create chunks with metadata elements.
141
+
142
+ Args:
143
+ docs (list): list of documents or texts. If no metadata is passed, this parameter is a list of documents.
144
+ If metadata is passed, this parameter is a list of texts.
145
+ chunk_size (int): chunk size in number of tokens
146
+ chunk_overlap (int): chunk overlap in number of tokens
147
+
148
+ Returns:
149
+ list: list of documents
150
+ """
151
+
152
+ text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
153
+ tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
154
+ )
155
+
156
+ logger.info(f"Splitter: splitting documents")
157
+ chunks = text_splitter.split_documents(docs)
158
+
159
+ logger.info(f"Total {len(chunks)} chunks created")
160
+
161
+ return chunks
162
+
163
+
164
+
165
+ def create_vector_store(self, chunks: list, embeddings: HuggingFaceInstructEmbeddings, db_type: str,
166
+ output_db: str = None, collection_name: str = None):
167
+ """Creates a vector store
168
+
169
+ Args:
170
+ chunks (list): list of chunks
171
+ embeddings (HuggingFaceInstructEmbeddings): embedding model
172
+ db_type (str): vector db type
173
+ output_db (str, optional): output path to save the vector db. Defaults to None.
+ collection_name (str, optional): name of the collection to create. Defaults to None.
174
+ """
175
+ if collection_name is None:
176
+ collection_name = f"collection_{self.collection_id}"
177
+ logger.info(f'This is the collection name: {collection_name}')
178
+
179
+ if db_type == "faiss":
180
+ vector_store = FAISS.from_documents(
181
+ documents=chunks,
182
+ embedding=embeddings
183
+ )
184
+ if output_db:
185
+ vector_store.save_local(output_db)
186
+
187
+ elif db_type == "chroma":
188
+ if output_db:
189
+ vector_store = Chroma()
190
+ vector_store.delete_collection()
191
+ vector_store = Chroma.from_documents(
192
+ documents=chunks,
193
+ embedding=embeddings,
194
+ persist_directory=output_db,
195
+ collection_name=collection_name
196
+ )
197
+ else:
198
+ vector_store = Chroma()
199
+ vector_store.delete_collection()
200
+ vector_store = Chroma.from_documents(
201
+ documents=chunks,
202
+ embedding=embeddings,
203
+ collection_name=collection_name
204
+ )
205
+ self.vector_collections.add(collection_name)
206
+
207
+ elif db_type == "qdrant":
208
+ if output_db:
209
+ vector_store = Qdrant.from_documents(
210
+ documents=chunks,
211
+ embedding=embeddings,
212
+ path=output_db,
213
+ collection_name="test_collection",
214
+ )
215
+ else:
216
+ vector_store = Qdrant.from_documents(
217
+ documents=chunks,
218
+ embedding=embeddings,
219
+ collection_name="test_collection",
220
+ )
221
+
222
+ logger.info(f"Vector store saved to {output_db}")
223
+
224
+ return vector_store
225
+
226
+ def load_vdb(self, persist_directory, embedding_model, db_type="chroma", collection_name=None):
227
+ if db_type == "faiss":
228
+ vector_store = FAISS.load_local(persist_directory, embedding_model, allow_dangerous_deserialization=True)
229
+ elif db_type == "chroma":
230
+ if collection_name:
231
+ vector_store = Chroma(
232
+ persist_directory=persist_directory,
233
+ embedding_function=embedding_model,
234
+ collection_name=collection_name
235
+ )
236
+ else:
237
+ vector_store = Chroma(
238
+ persist_directory=persist_directory,
239
+ embedding_function=embedding_model
240
+ )
241
+ elif db_type == "qdrant":
242
+ # TODO: Implement Qdrant loading
243
+ pass
244
+ else:
245
+ raise ValueError(f"Unsupported database type: {db_type}")
246
+
247
+ return vector_store
248
+
249
+ def update_vdb(self, chunks: list, embeddings, db_type: str, input_db: str = None,
250
+ output_db: str = None):
251
+
252
+ if db_type == "faiss":
253
+ vector_store = FAISS.load_local(input_db, embeddings, allow_dangerous_deserialization=True)
254
+ new_vector_store = self.create_vector_store(chunks, embeddings, db_type, None)
255
+ vector_store.merge_from(new_vector_store)
256
+ if output_db:
257
+ vector_store.save_local(output_db)
258
+
259
+ elif db_type == "chroma":
260
+ # TODO implement update method for chroma
261
+ pass
262
+ elif db_type == "qdrant":
263
+ # TODO implement update method for qdrant
264
+ pass
265
+
266
+ return vector_store
267
+
268
+ def create_vdb(
269
+ self,
270
+ input_path,
271
+ chunk_size,
272
+ chunk_overlap,
273
+ db_type,
274
+ output_db=None,
275
+ recursive=False,
276
+ tokenizer=None,
277
+ load_txt=True,
278
+ load_pdf=False,
279
+ urls=None,
280
+ embedding_type="cpu",
281
+ batch_size= None,
282
+ coe = None,
283
+ select_expert = None
284
+ ):
285
+
286
+ docs = self.load_files(input_path, recursive=recursive, load_txt=load_txt, load_pdf=load_pdf, urls=urls)
287
+
288
+ if tokenizer is None:
289
+ chunks = self.get_text_chunks(docs, chunk_size, chunk_overlap)
290
+ else:
291
+ chunks = self.get_token_chunks(docs, chunk_size, chunk_overlap, tokenizer)
292
+
293
+ embeddings = APIGateway.load_embedding_model(
294
+ type=embedding_type,
295
+ batch_size=batch_size,
296
+ coe=coe,
297
+ select_expert=select_expert
298
+ )
299
+
300
+ vector_store = self.create_vector_store(chunks, embeddings, db_type, output_db)
301
+
302
+ return vector_store
303
+
304
+
305
+ def dir_path(path):
306
+ if os.path.isdir(path):
307
+ return path
308
+ else:
309
+ raise argparse.ArgumentTypeError(f"readable_dir:{path} is not a valid path")
310
+
311
+
312
+ # Parse the arguments
313
+ def parse_arguments():
314
+ parser = argparse.ArgumentParser(description="Process command line arguments.")
315
+ parser.add_argument("-input_path", type=dir_path, help="path to input directory")
316
+ parser.add_argument("--chunk_size", type=int, help="chunk size for splitting")
317
+ parser.add_argument("--chunk_overlap", type=int, help="chunk overlap for splitting")
318
+ parser.add_argument("-output_path", type=dir_path, help="path to input directory")
319
+
320
+ return parser.parse_args()
321
+
322
+
323
+ if __name__ == "__main__":
324
+ parser = argparse.ArgumentParser(description="Process data with optional chunking")
325
+
326
+ # Required arguments
327
+ parser.add_argument("--input_path", type=str, help="Path to the input directory")
328
+ parser.add_argument("--output_db", type=str, help="Path to the output vectordb")
329
+
330
+ # Optional arguments
331
+ parser.add_argument(
332
+ "--chunk_size", type=int, default=1000, help="Chunk size (default: 1000)"
333
+ )
334
+ parser.add_argument(
335
+ "--chunk_overlap", type=int, default=200, help="Chunk overlap (default: 200)"
336
+ )
337
+ parser.add_argument(
338
+ "--db_type",
339
+ type=str,
340
+ default="faiss",
341
+ help="Type of vector store (default: faiss)",
342
+ )
343
+ args = parser.parse_args()
344
+
345
+ vectordb = VectorDb()
346
+
347
+ vectordb.create_vdb(
348
+ args.input_path,
349
+ args.chunk_size,
350
+ args.chunk_overlap,
351
+ args.db_type,
352
+ args.output_db,
353
+ )
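As a rough programmatic sketch of the VectorDb class above (it assumes the embedding backend behind APIGateway.load_embedding_model is configured; paths and keyword values are illustrative):

    from utils.vectordb.vector_db import VectorDb

    vectordb = VectorDb()
    # Build a persisted Chroma store from local .txt and .pdf files.
    vector_store = vectordb.create_vdb(
        input_path="./data/docs",
        chunk_size=1000,
        chunk_overlap=200,
        db_type="chroma",
        output_db="./data/my-vector-db",
        load_pdf=True,
    )
    # Expose the store as a retriever for a RAG chain.
    retriever = vector_store.as_retriever(search_kwargs={"k": 4})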
utils/visual/env_utils.py ADDED
@@ -0,0 +1,95 @@
1
+ import netrc
2
+ import os
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ import streamlit as st
6
+
7
+
8
+ def initialize_env_variables(prod_mode: bool = False, additional_env_vars: Optional[List[str]] = None) -> None:
9
+ if additional_env_vars is None:
10
+ additional_env_vars = []
11
+
12
+ if not prod_mode:
13
+ # In non-prod mode, prioritize environment variables
14
+ st.session_state.SAMBANOVA_API_KEY = os.environ.get(
15
+ 'SAMBANOVA_API_KEY', st.session_state.get('SAMBANOVA_API_KEY', '')
16
+ )
17
+ for var in additional_env_vars:
18
+ st.session_state[var] = os.environ.get(var, st.session_state.get(var, ''))
19
+ else:
20
+ # In prod mode, only use session state
21
+ if 'SAMBANOVA_API_KEY' not in st.session_state:
22
+ st.session_state.SAMBANOVA_API_KEY = ''
23
+ for var in additional_env_vars:
24
+ if var not in st.session_state:
25
+ st.session_state[var] = ''
26
+
27
+
28
+ def set_env_variables(api_key, additional_vars=None, prod_mode=False):
29
+ st.session_state.SAMBANOVA_API_KEY = api_key
30
+ if additional_vars:
31
+ for key, value in additional_vars.items():
32
+ st.session_state[key] = value
33
+ if not prod_mode:
34
+ # In non-prod mode, also set environment variables
35
+ os.environ['SAMBANOVA_API_KEY'] = api_key
36
+ if additional_vars:
37
+ for key, value in additional_vars.items():
38
+ os.environ[key] = value
39
+
40
+
41
+ def env_input_fields(additional_env_vars=None) -> Tuple[str, Dict[str, str]]:
42
+ if additional_env_vars is None:
43
+ additional_env_vars = []
44
+
45
+ api_key = st.text_input('Sambanova API Key', value=st.session_state.SAMBANOVA_API_KEY, type='password')
46
+
47
+ additional_vars = {}
48
+ for var in additional_env_vars:
49
+ additional_vars[var] = st.text_input(f'{var}', value=st.session_state.get(var, ''), type='password')
50
+
51
+ return api_key, additional_vars
52
+
53
+
54
+ def are_credentials_set(additional_env_vars=None) -> bool:
55
+ if additional_env_vars is None:
56
+ additional_env_vars = []
57
+
58
+ base_creds_set = bool(st.session_state.SAMBANOVA_API_KEY)
59
+ additional_creds_set = all(bool(st.session_state.get(var, '')) for var in additional_env_vars)
60
+
61
+ return base_creds_set and additional_creds_set
62
+
63
+
64
+ def save_credentials(api_key, additional_vars=None, prod_mode=False) -> str:
65
+ set_env_variables(api_key, additional_vars, prod_mode)
66
+ return 'Credentials saved successfully!'
67
+
68
+
69
+ def get_wandb_key():
70
+ # Check for WANDB_API_KEY in environment variables
71
+ env_wandb_api_key = os.getenv('WANDB_API_KEY')
72
+
73
+ # Check for WANDB_API_KEY in ~/.netrc
74
+ try:
75
+ netrc_path = os.path.expanduser('~/.netrc')
76
+ netrc_data = netrc.netrc(netrc_path)
77
+ netrc_wandb_api_key = netrc_data.authenticators('api.wandb.ai')
78
+ except (FileNotFoundError, netrc.NetrcParseError):
79
+ netrc_wandb_api_key = None
80
+
81
+ # If both are set, handle the conflict
82
+ if env_wandb_api_key and netrc_wandb_api_key:
83
+ print('WANDB_API_KEY is set in both the environment and ~/.netrc. Prioritizing the ~/.netrc entry.')
84
+ # Optionally, you can choose to remove one of them, here we remove the env variable
85
+ del os.environ['WANDB_API_KEY'] # Remove from environment to prioritize ~/.netrc
86
+ return netrc_wandb_api_key[2] if netrc_wandb_api_key else None # Return the key from .netrc
87
+
88
+ # Return the key from environment if available, otherwise from .netrc
89
+ if env_wandb_api_key:
90
+ return env_wandb_api_key
91
+ elif netrc_wandb_api_key:
92
+ return netrc_wandb_api_key[2] if netrc_wandb_api_key else None
93
+
94
+ # If neither is set, return None
95
+ return None
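A short sketch of how these helpers are typically wired into a Streamlit page (a hypothetical snippet; EXTRA_VARS and the widget layout are illustrative):

    import streamlit as st
    from utils.visual.env_utils import (
        initialize_env_variables,
        env_input_fields,
        are_credentials_set,
        save_credentials,
    )

    EXTRA_VARS = []  # names of any additional provider keys
    initialize_env_variables(prod_mode=False, additional_env_vars=EXTRA_VARS)

    if not are_credentials_set(EXTRA_VARS):
        api_key, additional_vars = env_input_fields(EXTRA_VARS)
        if st.button("Save credentials"):
            st.success(save_credentials(api_key, additional_vars, prod_mode=False))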