Update app.py
app.py
CHANGED
@@ -1,14 +1,18 @@
+import os
 import requests
 import numpy as np
 import faiss
 from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModel
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+from langchain.chat_models import ChatOpenAI
 from groq import Groq
 import streamlit as st
-import torch
-import os
 
-# Initialize Groq client
+# Initialize Groq client
 client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 # Function to download and extract content from a public Google Drive PDF link
@@ -31,28 +35,8 @@ def extract_pdf_content(drive_url):
         text += page.extract_text()
     return text
 
-# Function to chunk and tokenize text
-def chunk_and_tokenize(text, tokenizer, chunk_size=512):
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
-    return chunks
-
-# Function to compute embeddings and build FAISS index
-def build_faiss_index(chunks, model):
-    embeddings = []
-    for chunk in chunks:
-        input_ids = torch.tensor([chunk])
-        with torch.no_grad():
-            embedding = model(input_ids).last_hidden_state.mean(dim=1).detach().numpy()
-        embeddings.append(embedding)
-    embeddings = np.vstack(embeddings)
-
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-    return index
-
 # Streamlit app
-st.title("RAG
+st.title("Enhanced RAG with LangChain and Groq API")
 
 # Predefined Google Drive link
 drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
@@ -63,46 +47,40 @@ text = extract_pdf_content(drive_url)
 if text:
     st.write("Document extracted successfully!")
 
-    # Load the tokenizer and embedding model, chunk the text, and build the FAISS index
-    tokenizer = AutoTokenizer.from_pretrained(…)
-    model = AutoModel.from_pretrained(…)
-    chunks = chunk_and_tokenize(text, tokenizer)
-    index = build_faiss_index(chunks, model)
+    # LangChain embeddings and FAISS index setup
+    st.write("Building embeddings and FAISS index...")
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    faiss_index = FAISS.from_texts([text], embeddings)
+
+    # LangChain retriever
+    retriever = faiss_index.as_retriever(search_kwargs={"k": 3})
+
+    # LangChain QA chain
+    prompt_template = """
+    Use the following document excerpts to answer the user's question.
+    If the answer is not directly found in the document, say "The answer is not in the provided document.".
+
+    Document Excerpts:
+    {context}
+
+    Question:
+    {question}
+
+    Answer:
+    """
+    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
+        retriever=retriever,
+        chain_type_kwargs={"prompt": PROMPT},
+    )
 
 # Query input
 query = st.text_input("Enter your query:")
 if query:
-    st.write("Searching
-    query_tokens = tokenizer.encode(query, add_special_tokens=False)
-    query_embedding = (
-        model(torch.tensor([query_tokens]))
-        .last_hidden_state.mean(dim=1)
-        .detach().numpy()
-    )
-    _, indices = index.search(query_embedding, k=1)
-
-    # Retrieve the most relevant chunk
-    relevant_chunk = chunks[indices[0][0]]
-    relevant_text = tokenizer.decode(relevant_chunk)
-    st.write("Relevant chunk found:", relevant_text)
-
-    # Interact with Groq API
-    st.write("Querying the Groq API...")
-    chat_completion = client.chat.completions.create(
-        messages=[
-            {
-                "role": "user",
-                "content": relevant_text,
-            }
-        ],
-        model="llama-3.3-70b-versatile",
-    )
-    st.write("Model Response:", chat_completion.choices[0].message.content)
+    st.write("Searching the document and generating a response...")
+    result = qa_chain.run(query)
+    st.write("Response:", result)
 else:
     st.error("Failed to extract content from the document.")
 
-
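Note on the hidden hunk: the diff shows only the tail of extract_pdf_content (the per-page loop and return). For readers without the full file, a helper consistent with those visible lines might look like the sketch below; the share-link parsing and the uc?export=download rewrite are common Google Drive conventions, assumed here rather than taken from the commit.

import io
import re
import requests
from PyPDF2 import PdfReader

def extract_pdf_content(drive_url):
    # Pull the file ID out of a ".../file/d/<id>/view" share link (assumed format)
    match = re.search(r"/d/([^/]+)", drive_url)
    if not match:
        return None
    # Rewrite the share link to Drive's direct-download endpoint
    response = requests.get(f"https://drive.google.com/uc?export=download&id={match.group(1)}")
    if response.status_code != 200:
        return None
    # Extract text page by page, matching the context lines visible in the hunk
    reader = PdfReader(io.BytesIO(response.content))
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text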
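One behavioral regression worth flagging: the removed code split the document into 512-token chunks before indexing, while the new FAISS.from_texts([text], embeddings) indexes the entire document as a single entry, so the retriever's k=3 can only ever return that one text. A minimal sketch of restoring chunking with LangChain's own splitter (chunk size and overlap are illustrative assumptions):

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Split the extracted text so each FAISS entry is one independently retrievable chunk
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_text(text)  # `text` as returned by extract_pdf_content

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_index = FAISS.from_texts(chunks, embeddings)
retriever = faiss_index.as_retriever(search_kwargs={"k": 3})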
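Also note that the updated file still initializes the Groq client but never calls it: the QA chain runs on ChatOpenAI(model_name="gpt-3.5-turbo"), which requires an OPENAI_API_KEY and bypasses Groq entirely. If the intent is to keep answering with Groq's llama-3.3-70b-versatile, as the removed code did, one option is the langchain-groq integration; a sketch, assuming that separate package is installed:

from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq  # from the separate langchain-groq package

# ChatGroq reads GROQ_API_KEY from the environment by default
llm = ChatGroq(model="llama-3.3-70b-versatile")  # model name taken from the removed code's Groq call

# Reuses the retriever and PROMPT defined in app.py
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
)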