Commit b63ad23 · "new commit"
Parent(s): 10e8a0c

Files changed:
- Dockerfile (+6 -17)
- README.md (+23 -2)
- rag.py (+8 -25)
- requirements.txt (+2 -1)
- vector_rag.py (+35 -24)
Dockerfile
CHANGED

```diff
@@ -1,31 +1,20 @@
-#
+# Use an official Python runtime as a parent image
 FROM python:3.10-slim
 
-# Set
-ENV PYTHONDONTWRITEBYTECODE=1
-ENV PYTHONUNBUFFERED=1
-
-# Set work directory inside the container
+# Set the working directory in the container
 WORKDIR /app
 
-#
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install pipenv/venv if needed, but here we use pip
-# Copy dependency list first for caching
+# Copy the requirements file into the container
 COPY requirements.txt .
 
 # Install dependencies
-RUN pip install --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy
+# Copy the rest of the application code into the container
 COPY . .
 
 # Expose the port your app runs on
 EXPOSE 8000
 
-# Command to run the application
-CMD
+# Command to run the application; Hugging Face Spaces sets PORT env
+CMD sh -c "uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000}"
```
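The CMD above expects a FastAPI application exposed as `main:app`, but `main.py` is not part of this commit. As a rough, hypothetical sketch of the entry point it implies (the `/ask` route, the request model, and the wiring to `get_smart_rag_response` are assumptions, not taken from the repo):

```python
# Hypothetical main.py sketch: only the module/attribute pair main:app comes
# from the Dockerfile's CMD; the route name and request schema are assumed.
from fastapi import FastAPI
from pydantic import BaseModel

from rag import get_smart_rag_response  # defined in this commit's rag.py

app = FastAPI(title="RAG System with LangChain and FastAPI")

class Question(BaseModel):
    query: str

@app.post("/ask")  # assumed endpoint; the real main.py is not shown in this diff
async def ask(question: Question) -> dict:
    answer = await get_smart_rag_response(question.query)
    return {"answer": answer}
```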
README.md
CHANGED

````diff
@@ -1,4 +1,12 @@
-
+---
+title: RAG Project
+emoji: 🧠
+colorFrom: blue
+colorTo: purple
+sdk: docker
+app_port: 8000
+python_version: 3.10
+---
 
 # 🚀 RAG System with LangChain and FastAPI 🌐
 
@@ -78,7 +86,20 @@ This project uses Ollama to run local large language models.
 
 ## 🚀 Deployment
 
-### Docker Deployment
+### Hugging Face Spaces (Docker) Deployment
+This project is configured for a Hugging Face Space using the Docker runtime.
+
+1. Push this repository to GitHub (or connect a local repo).
+2. Create a new Space on Hugging Face → choose the "Docker" SDK.
+3. Point it at this repo. Spaces will build from the `Dockerfile` and run `uvicorn`, binding to the provided `PORT`.
+4. Ensure the file `data/sample.pdf` exists (or replace it) so the FAISS index can be created on startup.
+
+Notes:
+- The models `Qwen/Qwen2-0.5B-Instruct` and `all-MiniLM-L6-v2` are downloaded on first run; the initial cold start may take several minutes.
+- Dependencies are CPU-friendly; no GPU is required.
+- If you see out-of-memory (OOM) errors, consider reducing `max_new_tokens` in `vector_rag.py` or swapping in an even smaller instruct model.
+
+### Docker Deployment (Local)
 If you want to deploy your RAG system using Docker, simply build the Docker image and run the container:
 
 ```bash
````
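The OOM note refers to the generation pipeline defined in `vector_rag.py` (shown further down). A minimal sketch of the kind of tweak it suggests, assuming you edit that pipeline in place; the value 128 is only an example, the commit itself uses 256:

```python
# Illustrative only: the same pipeline construction as vector_rag.py,
# with a smaller generation budget to ease memory pressure on a CPU Space.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", trust_remote_code=True)

llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,  # example value; the commit sets 256
    do_sample=True,
    temperature=0.5,
    top_p=0.9,
)
```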
rag.py
CHANGED

```diff
@@ -1,25 +1,8 @@
-from vector_rag import query_vector_store
+from vector_rag import query_vector_store, llm  # <--- FIX: Import llm here!
 import wikipedia
-
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-model_name = "Qwen/Qwen2-1.5B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
-llm_pipeline = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_new_tokens=512,
-    do_sample=True,
-    temperature=0.7,
-    top_p=0.9,
-)
-llm = HuggingFacePipeline(pipeline=llm_pipeline)
+# REMOVED: All duplicate model/pipeline/tokenizer imports and initialization code
 
+# The 'llm' instance is now imported from vector_rag.py and is ready to use.
 wikipedia.set_lang("en")
 
 async def get_smart_rag_response(query: str) -> str:
@@ -27,7 +10,7 @@ async def get_smart_rag_response(query: str) -> str:
 
     # First: Try Wikipedia
     try:
-        summary = wikipedia.summary(query, sentences=5)
+        summary = wikipedia.summary(query, sentences=5)
         print("Wikipedia summary found.")
 
         prompt = f"""Use the following Wikipedia information to answer the question as clearly as possible.
@@ -38,20 +21,20 @@ Wikipedia Context:
 Question: {query}
 Answer:"""
         result = llm.predict(prompt)
-        answer = result.replace(prompt, "").strip()
+        answer = result.replace(prompt, "").strip()
         return f"[Wikipedia]\n{answer}"
     except wikipedia.exceptions.PageError:
-        print("Wikipedia page not found.")
+        print("Wikipedia page not found.")
     except wikipedia.exceptions.DisambiguationError as e:
         return f"The query is ambiguous. Did you mean: {', '.join(e.options[:5])}?"
 
     # Second: Fallback to LLM (no context)
     try:
         print("Fallback: LLM with no context")
-
+
         fallback_prompt = f"You are a knowledgeable assistant. Please answer the following question clearly:\n\n{query}"
         llm_answer = llm.predict(fallback_prompt)
-        answer = llm_answer.replace(fallback_prompt, "").strip()
+        answer = llm_answer.replace(fallback_prompt, "").strip()
         if answer and "not sure" not in answer.lower():
             return f"[LLM Fallback]\n{answer.strip()}"
     except Exception as e:
```
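A quick way to exercise the Wikipedia-then-LLM fallback above is to drive the coroutine directly; a minimal sketch, assuming `data/sample.pdf` is present so that importing `rag` (and, through it, `vector_rag`) succeeds:

```python
# Minimal smoke test for get_smart_rag_response. Importing rag triggers the
# model download and FAISS index build in vector_rag.py, so the first run is slow.
import asyncio

from rag import get_smart_rag_response

async def main() -> None:
    answer = await get_smart_rag_response("What is retrieval-augmented generation?")
    print(answer)

if __name__ == "__main__":
    asyncio.run(main())
```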
requirements.txt
CHANGED

```diff
@@ -3,6 +3,7 @@ uvicorn
 langchain
 langchain-community
 python-dotenv
+langchain-huggingface
 faiss-cpu
 jinja2
 wikipedia
@@ -10,4 +11,4 @@ pypdf
 sentence-transformers
 torch
 transformers
-accelerate
+accelerate
```
vector_rag.py
CHANGED

```diff
@@ -1,54 +1,65 @@
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-
-from
+# Use the generic HuggingFaceEmbeddings for the smaller model
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFacePipeline
+# Remove BitsAndBytesConfig import
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import os
 from dotenv import load_dotenv
 
 load_dotenv()
 
-
-
-
-chunks = text_splitter.split_documents(documents)
+# --- MODEL INITIALIZATION (Minimal Footprint) ---
+print("Loading Qwen2-0.5B-Instruct...")
+model_name = "Qwen/Qwen2-0.5B-Instruct"
 
-
-    raise ValueError("No document chunks found. Ensure 'sample.pdf' exists and is readable.")
-
-# Embed & store (HuggingFace Embeddings are free and fast)
-embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
-vectorstore = FAISS.from_documents(chunks, embeddings)
+# Removed: quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
-
-
-
-model_name
-
-
-
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Removed: quantization_config parameter from from_pretrained
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="cpu",
+    trust_remote_code=True
+)
 
-# 2. Use the pipeline for text generation
 llm_pipeline = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
-    max_new_tokens=
+    max_new_tokens=256,
     do_sample=True,
-    temperature=0.
+    temperature=0.5,
     top_p=0.9,
 )
 llm = HuggingFacePipeline(pipeline=llm_pipeline)
 
+# Use the lighter all-MiniLM-L6-v2 embeddings model
+embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+# --- DOCUMENT LOADING & CHUNKING ---
+loader = PyPDFLoader("data/sample.pdf")  # Correct path for Docker: data/sample.pdf
+documents = loader.load()
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+chunks = text_splitter.split_documents(documents)
+
+if not chunks:
+    raise ValueError("No document chunks found.")
+
+# Initialize FAISS and retriever
+vectorstore = FAISS.from_documents(chunks, embeddings)
+retriever = vectorstore.as_retriever()
+
+# Expose the necessary components for rag.py to import
 def query_vector_store(query: str) -> str:
     docs = retriever.get_relevant_documents(query)
     if docs:
         context = "\n\n".join([doc.page_content for doc in docs])
         prompt = f"""Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"""
 
-        raw_output = llm.
+        raw_output = llm.invoke(prompt)
         answer = raw_output.replace(prompt, "").strip()
         return answer
     return None
```
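The retrieval path can also be checked in isolation; a minimal sketch, again assuming `data/sample.pdf` exists so the module-level FAISS index builds on import:

```python
# Query the FAISS-backed retriever directly, bypassing the Wikipedia/LLM
# fallback logic in rag.py. query_vector_store returns None when no chunks match.
from vector_rag import query_vector_store

answer = query_vector_store("Summarize the main topic of the sample document.")
print(answer if answer is not None else "No relevant context found in the index.")
```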