import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"  # allow HF tokenizers to use parallelism

from dotenv import load_dotenv

load_dotenv()  # load API keys from .env
mistral_api_key = os.getenv("MISTRAL_API_KEY")
print("MISTRAL_API_KEY loaded:", mistral_api_key is not None)  # report presence only; never print the key itself

from typing import Literal

from huggingface_hub import login
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough
from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

login(token=os.getenv("HUGGING_FACE_TOKEN"))  # Hugging Face Hub auth


def load_chunk_persist_pdf() -> Chroma:
    """Load every PDF in data/pdf/, split it into chunks, and persist them to Chroma."""
    pdf_folder_path = os.path.join(os.getcwd(), "data", "pdf")
    documents = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)
    os.makedirs("data/chroma_store/", exist_ok=True)
    vectorstore = Chroma.from_documents(
        documents=chunked_documents,
        embedding=MistralAIEmbeddings(mistral_api_key=mistral_api_key),
        persist_directory=os.path.join(os.getcwd(), "data", "chroma_store"),
    )
    vectorstore.persist()
    return vectorstore


vectorstore = load_chunk_persist_pdf()
retriever = vectorstore.as_retriever()


# Data model used to route a user query to the most relevant datasource.
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    datasource: Literal["vectorstore", "websearch"] = Field(
        ...,
        description="Given a user question, choose to route it to web search or a vectorstore.",
    )


# LLM (mistral-large supports function calling, which drives the routing sketch below)
llm = ChatMistralAI(model="mistral-large-latest", mistral_api_key=mistral_api_key, temperature=0)

# A generic RAG prompt is also available on the hub, but it is superseded by the
# custom coaching prompt below:
# from langchain import hub
# prompt = hub.pull("rlm/rag-prompt")
prompt = ChatPromptTemplate.from_template(
    """
    You are a professional AI coach specialized in fitness, bodybuilding, and nutrition.
    You must adapt to the user: if they are a beginner, use simple words. You are gentle and motivating.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know and refer the user to a nutritionist or a doctor.
    Use three sentences maximum and keep the answer concise.
    Question: {question}
    Context: {context}
    Answer:
    """,
)


def format_docs(docs):
    """Concatenate retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# print(rag_chain.invoke("Build a fitness program for me. Be precise in terms of exercises"))
# print(rag_chain.invoke("I am a 45-year-old woman and I have to lose weight for the summer. Provide me with a fitness program"))
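
# --- Routing sketch (assumption-labelled) ------------------------------------
# RouteQuery is defined above but never wired into the pipeline. Below is a
# minimal sketch of how it could drive routing, assuming ChatMistralAI's
# with_structured_output() (backed by function calling) and the DuckDuckGo
# search tool imported above. The names `route_prompt`, `question_router`,
# and `answer_question` are illustrative, not part of the original script.
route_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You route user questions either to a vectorstore of fitness and "
            "nutrition documents or to web search. Choose 'vectorstore' for "
            "questions about training, bodybuilding, and nutrition; otherwise "
            "choose 'websearch'.",
        ),
        ("human", "{question}"),
    ]
)
question_router = route_prompt | llm.with_structured_output(RouteQuery)

web_search = DuckDuckGoSearchRun()


def answer_question(question: str) -> str:
    """Route the question, then answer with the RAG chain or a web search."""
    route = question_router.invoke({"question": question})
    if route.datasource == "vectorstore":
        return rag_chain.invoke(question)
    return web_search.invoke(question)


# print(answer_question("How much protein should a beginner eat per day?"))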