# LLM
# Ollama for local tests
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Ref.: https://mistral.ai/news/mixtral-of-experts/#instructed-models
# Q5_K_M quantization flavor for best quality/recommended tradeoff (memory is no problem here)
# Ref.: https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF#provided-files
MISTRAL = "mistral:7b-instruct-v0.2-q5_K_M"
# Q4_K quantization flavor for best memory/quality/recommended tradeoff
# Ref.: https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF#provided-files
# mixtral:8x7b-instruct-v0.1-q4_K_M was sadly still too big for my Mac
MIXTRAL = "mixtral:8x7b-instruct-v0.1-q3_K_L"
# Llama2 13B
# Ref.: https://huggingface.co/TheBloke/Llama-2-13B-GGUF
LLAMA2 = "llama2:13b-chat-q5_K_M"

mistral = Ollama(
    model=MISTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    # Ref.: https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.Ollama.html#langchain_community.llms.ollama.Ollama.format
    # format="json"
)
mixtral = Ollama(
    model=MIXTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
llama2 = Ollama(
    model=LLAMA2,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)

# LOAD
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader

FILES = {
    'md': [
        # "Présentation modes dégradés-20230120_112423-Enregistrement de la réunion.md",
        "YouTube - Mode secours telephonie.md"
    ],
    'pdf': [
        # "SI-Samu_Fiche procédure_Mode dégradé_Perte de CRRA.pdf",
        # "[SI-Samu] Fiche mémo - Procédure Mode dégradé.pdf",
        "SI-Samu_Documentation_produit_SF4_J18HF2_20231219 - mode secours seul.pdf",
        # "SI-Samu_Documentation_produit_SF4_J18HF2_20231219.pdf"
    ]
}

def load_data(files):
    data = {'md': [], 'pdf': []}
    for pdf in files['pdf']:
        data['pdf'].extend(PyPDFLoader('resources/' + pdf).load())
    for md in files['md']:
        data['md'].extend(TextLoader('resources/' + md).load())
    return data

def to_full_data(data):
    return [
        *data['md'],
        *data['pdf']
    ]

# SPLIT
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter

def split_MD_then_recursive(data):
    # - First use MarkdownHeaderTextSplitter on .md files, then RecursiveCharacterTextSplitter on all
    # MD splits
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
        ("#", "Titre 1"),
        ("##", "Titre 2"),
        ("###", "Titre 3"),
    ], strip_headers=False)
    md_header_splits = data['pdf'].copy()
    for md in data['md']:
        md_header_splits.extend(markdown_splitter.split_text(md.page_content))

    # Char-level splits
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50  # to improve results quality
    )

    # Split
    return text_splitter.split_documents(md_header_splits)

# EMBED
# Directly done in the different scripts
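# A minimal sketch of what those scripts might pass in as `embeddings`, assuming
# they use Ollama's embedding endpoint; the model name below is a hypothetical
# choice, not taken from the original scripts.
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text")  # assumption: any Ollama embedding model works here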
# RETRIEVE
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever, BM25Retriever, EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Ensemble is based on weight fusion (Reciprocal Rank Fusion) | Ref.: https://safjan.com/implementing-rank-fusion-in-python/
def get_parent_ensemble_retriever(embeddings, full_data, all_splits, k=4, parent_chunk_size=2000,
                                  child_chunk_size=400, collection_name="store"):
    # - ParentDocumentRetriever: embed small chunks but retrieve with bigger context
    # This text splitter is used to create the parent documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    # This text splitter is used to create the child documents
    # It should create documents smaller than the parent (don't go above 512 as most embedding models truncate after that)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)
    # The vectorstore to use to index the child chunks
    parent_vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings
    )
    # The storage layer for the parent documents
    parent_store = InMemoryStore()
    parent_retriever = ParentDocumentRetriever(
        vectorstore=parent_vectorstore,
        docstore=parent_store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={
            "k": k,
            # "score_threshold": 0.5
        },
        # search_type="mmr"
    )
    parent_retriever.add_documents(full_data)

    # - EnsembleRetriever
    # BM25 logic
    bm25_retriever = BM25Retriever.from_texts(
        list(map(lambda s: s.page_content, all_splits)),
        metadatas=list(map(lambda s: {"retriever": "BM25 sparse similarity", **s.metadata}, all_splits))
    )
    bm25_retriever.k = k

    # Ensemble of BM25 + vectorstore on parent retriever
    return EnsembleRetriever(
        retrievers=[parent_retriever, bm25_retriever],
        weights=[0.5, 0.5]
    )

# PROMPT
# Add more context to the query + update the system prompt to make it answer in French
# Ref.: https://stackoverflow.com/questions/76554411/unable-to-pass-prompt-template-to-retrievalqa-in-langchain
# Ref.: https://community.openai.com/t/how-to-prevent-chatgpt-from-answering-questions-that-are-outside-the-scope-of-the-provided-context-in-the-system-role-message/112027/7
from langchain import PromptTemplate

template = """
System: You are helping a user of "bandeau téléphonique SI-SAMU" (a CTI - Computer Telephony Integration - system) during a system failure, as they need to use their local backup phone.
Context information is below. Given the context information and not prior knowledge, answer the query.
Language: Answer in French and using "vous".
---
Context: {context}
---
Question: {question}
---
Réponse :
"""
PROMPT = PromptTemplate(template=template, input_variables=['question', 'context'])

# RESULTS
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

def parse_answer(answer):
    print(f">> {answer['query']}")
    print(f">> {answer['result']}")
    print(">> Sources :")
    for doc in answer['source_documents']:
        page = ''
        if 'page' in doc.metadata:
            page = f" (page {doc.metadata['page']})"
        source = ''
        if 'source' in doc.metadata:
            source = doc.metadata['source']
        titles = ['Titre 1', 'Titre 2', 'Titre 3']
        for title in titles:
            if title in doc.metadata:
                source += f" > {doc.metadata[title]}"
        retriever = "BM25" if 'retriever' in doc.metadata else "vectorstore"
        print(f">>> {color.BOLD}{source}{page} [{retriever}]{color.END}: {doc.page_content}\n---")
    print("--------\n\n")
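# USAGE (sketch): one way to wire the pieces above into a RetrievalQA chain.
# This is an assumption about how the calling scripts use these helpers, not
# taken from them: `embeddings` comes from the sketch under # EMBED, and the
# sample question is purely illustrative.
from langchain.chains import RetrievalQA

data = load_data(FILES)
all_splits = split_MD_then_recursive(data)
retriever = get_parent_ensemble_retriever(embeddings, to_full_data(data), all_splits)

qa_chain = RetrievalQA.from_chain_type(
    llm=mistral,
    retriever=retriever,
    return_source_documents=True,  # parse_answer expects 'source_documents'
    chain_type_kwargs={"prompt": PROMPT}
)
parse_answer(qa_chain({"query": "Comment utiliser le téléphone de secours ?"}))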