Commit b63ad23 · "new commit"
Parent(s): 10e8a0c

Files changed:
- Dockerfile (+6 -17)
- README.md (+23 -2)
- rag.py (+8 -25)
- requirements.txt (+2 -1)
- vector_rag.py (+35 -24)
Dockerfile
CHANGED

```diff
@@ -1,31 +1,20 @@
-#
+# Use an official Python runtime as a parent image
 FROM python:3.10-slim
 
-# Set
-ENV PYTHONDONTWRITEBYTECODE=1
-ENV PYTHONUNBUFFERED=1
-
-# Set work directory inside the container
+# Set the working directory in the container
 WORKDIR /app
 
-#
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install pipenv/venv if needed, but here we use pip
-# Copy dependency list first for caching
+# Copy the requirements file into the container
 COPY requirements.txt .
 
 # Install dependencies
-RUN pip install --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy
+# Copy the rest of the application code into the container
 COPY . .
 
 # Expose the port your app runs on
 EXPOSE 8000
 
-# Command to run the application
-CMD
+# Command to run the application; Hugging Face Spaces sets PORT env
+CMD sh -c "uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000}"
```
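The CMD above expects a FastAPI application exposed as `main:app`, but `main.py` is not part of this commit. As a rough, hypothetical sketch of the entry point it implies (the `/ask` route, the request model, and the wiring to `get_smart_rag_response` are assumptions, not taken from the repo):

```python
# Hypothetical main.py sketch: only the module/attribute pair main:app comes
# from the Dockerfile's CMD; the route name and request schema are assumed.
from fastapi import FastAPI
from pydantic import BaseModel

from rag import get_smart_rag_response  # defined in this commit's rag.py

app = FastAPI(title="RAG System with LangChain and FastAPI")

class Question(BaseModel):
    query: str

@app.post("/ask")  # assumed endpoint; the real main.py is not shown in this diff
async def ask(question: Question) -> dict:
    answer = await get_smart_rag_response(question.query)
    return {"answer": answer}
```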
README.md
CHANGED

````diff
@@ -1,4 +1,12 @@
-
+---
+title: RAG Project
+emoji: 🧠
+colorFrom: blue
+colorTo: purple
+sdk: docker
+app_port: 8000
+python_version: 3.10
+---
 
 # 🚀 RAG System with LangChain and FastAPI 🌐
 
@@ -78,7 +86,20 @@ This project uses Ollama to run local large language models.
 
 ## 🚀 Deployment
 
-### Docker Deployment
+### Hugging Face Spaces (Docker) Deployment
+This project is configured for a Hugging Face Space using the Docker runtime.
+
+1. Push this repository to GitHub (or connect a local repo).
+2. Create a new Space on Hugging Face → choose the "Docker" SDK.
+3. Point it at this repo. Spaces will build from the `Dockerfile` and run `uvicorn`, binding to the provided `PORT`.
+4. Ensure the file `data/sample.pdf` exists (or replace it) so the FAISS index can be created on startup.
+
+Notes:
+- The models `Qwen/Qwen2-0.5B-Instruct` and `all-MiniLM-L6-v2` are downloaded on first run; the initial cold start may take several minutes.
+- Dependencies are CPU-friendly; no GPU is required.
+- If you see out-of-memory (OOM) errors, consider reducing `max_new_tokens` in `vector_rag.py` or swapping in an even smaller instruct model.
+
+### Docker Deployment (Local)
 If you want to deploy your RAG system using Docker, simply build the Docker image and run the container:
 
 ```bash
````
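The OOM note refers to the generation pipeline defined in `vector_rag.py` (shown further down). A minimal sketch of the kind of tweak it suggests, assuming you edit that pipeline in place; the value 128 is only an example, the commit itself uses 256:

```python
# Illustrative only: the same pipeline construction as vector_rag.py,
# with a smaller generation budget to ease memory pressure on a CPU Space.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", trust_remote_code=True)

llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,  # example value; the commit sets 256
    do_sample=True,
    temperature=0.5,
    top_p=0.9,
)
```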
rag.py
CHANGED

```diff
@@ -1,25 +1,8 @@
-from vector_rag import query_vector_store
+from vector_rag import query_vector_store, llm  # <--- FIX: Import llm here!
 import wikipedia
-
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-model_name = "Qwen/Qwen2-1.5B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
-llm_pipeline = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_new_tokens=512,
-    do_sample=True,
-    temperature=0.7,
-    top_p=0.9,
-)
-llm = HuggingFacePipeline(pipeline=llm_pipeline)
+# REMOVED: All duplicate model/pipeline/tokenizer imports and initialization code
 
+# The 'llm' instance is now imported from vector_rag.py and is ready to use.
 wikipedia.set_lang("en")
 
 async def get_smart_rag_response(query: str) -> str:
@@ -27,7 +10,7 @@ async def get_smart_rag_response(query: str) -> str:
 
     # First: Try Wikipedia
     try:
-        summary = wikipedia.summary(query, sentences=5)
+        summary = wikipedia.summary(query, sentences=5)
         print("Wikipedia summary found.")
 
         prompt = f"""Use the following Wikipedia information to answer the question as clearly as possible.
@@ -38,20 +21,20 @@ Wikipedia Context:
 Question: {query}
 Answer:"""
         result = llm.predict(prompt)
-        answer = result.replace(prompt, "").strip()
+        answer = result.replace(prompt, "").strip()
         return f"[Wikipedia]\n{answer}"
     except wikipedia.exceptions.PageError:
-        print("Wikipedia page not found.")
+        print("Wikipedia page not found.")
     except wikipedia.exceptions.DisambiguationError as e:
         return f"The query is ambiguous. Did you mean: {', '.join(e.options[:5])}?"
 
     # Second: Fallback to LLM (no context)
     try:
         print("Fallback: LLM with no context")
-
+
         fallback_prompt = f"You are a knowledgeable assistant. Please answer the following question clearly:\n\n{query}"
         llm_answer = llm.predict(fallback_prompt)
-        answer = llm_answer.replace(fallback_prompt, "").strip()
+        answer = llm_answer.replace(fallback_prompt, "").strip()
         if answer and "not sure" not in answer.lower():
             return f"[LLM Fallback]\n{answer.strip()}"
     except Exception as e:
```
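A quick way to exercise the Wikipedia-then-LLM fallback above is to drive the coroutine directly; a minimal sketch, assuming `data/sample.pdf` is present so that importing `rag` (and, through it, `vector_rag`) succeeds:

```python
# Minimal smoke test for get_smart_rag_response. Importing rag triggers the
# model download and FAISS index build in vector_rag.py, so the first run is slow.
import asyncio

from rag import get_smart_rag_response

async def main() -> None:
    answer = await get_smart_rag_response("What is retrieval-augmented generation?")
    print(answer)

if __name__ == "__main__":
    asyncio.run(main())
```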
requirements.txt
CHANGED

```diff
@@ -3,6 +3,7 @@ uvicorn
 langchain
 langchain-community
 python-dotenv
+langchain-huggingface
 faiss-cpu
 jinja2
 wikipedia
@@ -10,4 +11,4 @@ pypdf
 sentence-transformers
 torch
 transformers
-accelerate
+accelerate
```
vector_rag.py
CHANGED

```diff
@@ -1,54 +1,65 @@
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-
-from
+# Use the generic HuggingFaceEmbeddings for the smaller model
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFacePipeline
+# Remove BitsAndBytesConfig import
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import os
 from dotenv import load_dotenv
 
 load_dotenv()
 
-
-
-
-chunks = text_splitter.split_documents(documents)
+# --- MODEL INITIALIZATION (Minimal Footprint) ---
+print("Loading Qwen2-0.5B-Instruct...")
+model_name = "Qwen/Qwen2-0.5B-Instruct"
 
-
-    raise ValueError("No document chunks found. Ensure 'sample.pdf' exists and is readable.")
-
-# Embed & store (HuggingFace Embeddings are free and fast)
-embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
-vectorstore = FAISS.from_documents(chunks, embeddings)
+# Removed: quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
-
-
-
-model_name
-
-
-
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Removed: quantization_config parameter from from_pretrained
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="cpu",
+    trust_remote_code=True
+)
 
-# 2. Use the pipeline for text generation
 llm_pipeline = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
-    max_new_tokens=
+    max_new_tokens=256,
     do_sample=True,
-    temperature=0.
+    temperature=0.5,
     top_p=0.9,
 )
 llm = HuggingFacePipeline(pipeline=llm_pipeline)
 
+# Use the lighter all-MiniLM-L6-v2 embeddings model
+embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# --- DOCUMENT LOADING & CHUNKING ---
+loader = PyPDFLoader("data/sample.pdf")  # Correct path for Docker: data/sample.pdf
+documents = loader.load()
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+chunks = text_splitter.split_documents(documents)
+
+if not chunks:
+    raise ValueError("No document chunks found.")
+
+# Initialize FAISS and retriever
+vectorstore = FAISS.from_documents(chunks, embeddings)
+retriever = vectorstore.as_retriever()
+
+# Expose the necessary components for rag.py to import
 def query_vector_store(query: str) -> str:
     docs = retriever.get_relevant_documents(query)
     if docs:
         context = "\n\n".join([doc.page_content for doc in docs])
         prompt = f"""Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"""
 
-        raw_output = llm.
+        raw_output = llm.invoke(prompt)
         answer = raw_output.replace(prompt, "").strip()
         return answer
     return None
```
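The retrieval path can also be checked in isolation; a minimal sketch, again assuming `data/sample.pdf` exists so the module-level FAISS index builds on import:

```python
# Query the FAISS-backed retriever directly, bypassing the Wikipedia/LLM
# fallback logic in rag.py. query_vector_store returns None when no chunks match.
from vector_rag import query_vector_store

answer = query_vector_store("Summarize the main topic of the sample document.")
print(answer if answer is not None else "No relevant context found in the index.")
```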