"""Gradio demo: retrieval-augmented question answering over a single paper.

A small Hugging Face causal LM is wrapped in a LangChain ``RetrievalQA``
chain backed by a FAISS index loaded from local disk.  Every question and
its answer are appended to a CSV interaction log.
"""

import os
from operator import itemgetter

import gradio as gr
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

## models tried
## TinyLlama/TinyLlama-1.1B-Chat-v1.0
## meta-llama/Meta-Llama-3-8B
## google/gemma-1.1-7b-it

# Read from the environment; not passed explicitly to any call below.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

model_id = "google/gemma-1.1-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
embeddings = HuggingFaceEmbeddings()

pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100
)
hf = HuggingFacePipeline(pipeline=pipe)

# The PDF is loaded and split here, but the resulting ``docs`` are only needed
# by the (commented-out) index-building path below.
pdfLoader = PyPDFLoader("./LangchainPaper/RAGInputPaper.pdf")
documents = pdfLoader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=30)
docs = text_splitter.split_documents(documents)

## creating vector embeddings during run using FAISS
# vectorstore = FAISS.from_documents(
#     docs, embedding=embeddings
# )
# retriever = vectorstore.as_retriever()

## loading previously saved vector embeddings from local space
# NOTE(review): allow_dangerous_deserialization=True unpickles the index file;
# only safe as long as ./fi_LangchainPaper is produced by a trusted source.
vectorstore = FAISS.load_local(
    "./fi_LangchainPaper", embeddings, allow_dangerous_deserialization=True
)
retriever = vectorstore.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)

# Interaction log; rebound (not mutated) each time a new row is appended.
queries = pd.read_csv('./interactions/queries.csv')


def _extract_answer(raw_text):
    """Return the answer portion of the chain's raw text output.

    The first line containing "Helpful Answer" is taken and everything after
    the last "Helpful Answer: " marker on that line is returned.  When no such
    line exists, the stripped raw text is returned instead of failing.
    """
    for line in raw_text.split('\n'):
        if "Helpful Answer" in line:
            return line.split("Helpful Answer: ")[-1]
    return raw_text.strip()


def _log_interaction(question, response):
    """Append one question/response pair to the log and persist it to CSV."""
    global queries
    # Values are wrapped in lists: a DataFrame built from scalars only raises
    # ValueError because pandas cannot infer an index.
    new_row = pd.DataFrame({'query': [question], 'response': [response]})
    # pd.concat returns a new frame, so the module-level name must be rebound.
    queries = pd.concat([queries, new_row], ignore_index=True)
    # index=False keeps the file from gaining an extra unnamed index column
    # on every read/write cycle.
    queries.to_csv('./interactions/queries.csv', index=False)


def greet(Question):
    """Answer ``Question`` with the RAG chain and record the interaction.

    Args:
        Question: The user's question as plain text.

    Returns:
        The extracted answer text shown in the Gradio output box.
    """
    answer = qa({"query": Question})
    response = _extract_answer(answer.get('result') or "")
    _log_interaction(Question, response)
    return response


if __name__ == "__main__":
    title = "RAG with LLMs"
    description = """

Demo using Vector store-backed retriever. This space demonstrate application of RAG on a small model and its effectiveness, I used small model because of the space constraint. The current space runs on mere 2GB of RAM, hence there is some delay in generating output. Test this to your hearts content and let me know your thoughts, I will keep updating this space with tiny improvements on architecture and design

model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
update1: This space now does not create a faiss index on build, it uses a locally saved faiss index
update2: This space now uses google/gemma-1.1-2b-it model to generate output, reduces the response time to 1/3rd

"""
    article = """

What is langchain framework?
What is Action Agent?
What are forms of memory implementation in langchain
What is question answering from documents

Go through this paper here to find more about langchain and then test how this solution performs. This paper is the data source for this solution Have you already used RAG? feel free to suggest improvements Feel excited about the implementation? You know where to find me! I would love to connect and have a chat.

"""
    iface = gr.Interface(
        fn=greet,
        inputs="text",
        outputs=gr.Textbox(lines=5, label="Answer"),
        title=title,
        description=description,
        article=article,
    )
    iface.launch(share=True)