import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import gradio as gr
############################## OLD CODE FOR 1 PDF ####################################
# Load and split documents
FILE_PATH = ""  # set to the PDF to index before running (left blank here)
loader = PyPDFLoader(FILE_PATH)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Set up embeddings and vector store
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
vectorstore = FAISS.from_documents(splits, embeddings)
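# Re-embedding every chunk on each Space restart is slow for large PDFs. A
# minimal persistence sketch using FAISS's save_local/load_local from
# langchain_community (the "faiss_index" directory name is an assumption,
# not part of the original app):
#
# vectorstore.save_local("faiss_index")
# vectorstore = FAISS.load_local(
#     "faiss_index",
#     embeddings,
#     allow_dangerous_deserialization=True,  # recent versions require opting in to pickle loading
# )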
############################## NEW CODE FOR MULTIPLE PDFs ####################################
# Load and split documents
# FILE_PATHS = ["vol1.pdf", "vol2.pdf"]
#
# # Collect the chunks from every PDF in one list
# all_docs = []
#
# # The splitter is identical for every file, so create it once
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
# )
#
# # Iterate through each file path in FILE_PATHS
# for file_path in FILE_PATHS:
#     # Load the current PDF, split it into chunks, and append to all_docs
#     loader = PyPDFLoader(file_path)
#     documents = loader.load_and_split(text_splitter)
#     all_docs.extend(documents)
#
# # Set up embeddings and vector store
# HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
# embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)
# vectorstore = FAISS.from_documents(all_docs, embeddings)
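# If the PDFs all live in one folder, DirectoryLoader can replace the manual
# loop. A hedged sketch (the "pdfs/" directory is an assumption, not a path
# from the original app):
#
# from langchain_community.document_loaders import DirectoryLoader
#
# loader = DirectoryLoader("pdfs/", glob="**/*.pdf", loader_cls=PyPDFLoader)
# all_docs = loader.load_and_split(text_splitter)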
# Set up LLM
HF_API_KEY = os.environ.get("HF_API_KEY")  # Ensure this is set in Hugging Face Space settings
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)
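# With no generation parameters, HuggingFaceEndpoint falls back to the
# endpoint's defaults. A sketch of an explicit configuration (the values are
# illustrative assumptions, not tuned for this app):
#
# llm = HuggingFaceEndpoint(
#     repo_id=HF_LLM_MODEL_ID,
#     huggingfacehub_api_token=HF_API_KEY,
#     max_new_tokens=512,   # cap the answer length
#     temperature=0.1,      # keep answers close to the retrieved context
# )
#
# A one-line smoke test before wiring up the chain:
# print(llm.invoke("Reply with the word OK."))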
# Create RAG pipeline
retriever = vectorstore.as_retriever()
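# as_retriever() defaults to similarity search over the top 4 chunks; the
# count can be tuned via search_kwargs. A sketch (k=6 is an illustrative
# value, not from the original app):
#
# retriever = vectorstore.as_retriever(search_kwargs={"k": 6})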
prompt = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {question}\nAnswer:\n"
)
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
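# Quick sanity check of the chain outside Gradio; the question is a
# placeholder, not one drawn from the source PDFs:
#
# print(rag_chain.invoke("What topics does the document cover?"))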
# Gradio interface
def ask_question(question):
    return rag_chain.invoke(question)

iface = gr.Interface(fn=ask_question, inputs="text", outputs="text")
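# gr.Interface also accepts labeled components and a title if the Space needs
# a header; a sketch (the wording is an assumption):
#
# iface = gr.Interface(
#     fn=ask_question,
#     inputs=gr.Textbox(label="Question"),
#     outputs=gr.Textbox(label="Answer"),
#     title="PDF Q&A",
# )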
iface.launch()