yadavkapil23 committed
Commit b63ad23 · 1 Parent(s): 10e8a0c

new commit

Files changed (5)
  1. Dockerfile +6 -17
  2. README.md +23 -2
  3. rag.py +8 -25
  4. requirements.txt +2 -1
  5. vector_rag.py +35 -24
Dockerfile CHANGED
@@ -1,31 +1,20 @@
- # Base image
+ # Use an official Python runtime as a parent image
  FROM python:3.10-slim

- # Set environment variables
- ENV PYTHONDONTWRITEBYTECODE=1
- ENV PYTHONUNBUFFERED=1
-
- # Set work directory inside the container
+ # Set the working directory in the container
  WORKDIR /app

- # Install system dependencies
- RUN apt-get update && apt-get install -y \
-     build-essential \
-     && rm -rf /var/lib/apt/lists/*
-
- # Install pipenv/venv if needed, but here we use pip
- # Copy dependency list first for caching
+ # Copy the requirements file into the container
  COPY requirements.txt .

  # Install dependencies
- RUN pip install --upgrade pip
  RUN pip install --no-cache-dir -r requirements.txt

- # Copy entire project into the container
+ # Copy the rest of the application code into the container
  COPY . .

  # Expose the port your app runs on
  EXPOSE 8000

- # Command to run the application using uvicorn
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
+ # Command to run the application; Hugging Face Spaces sets PORT env
+ CMD sh -c "uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000}"
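Review note on the CMD change: the old exec-form CMD pins the port to 8000, while Hugging Face Spaces tells the container which port to bind via the `PORT` environment variable; the new shell-form CMD lets `${PORT:-8000}` expand at container start. The same behaviour could instead live in the app entrypoint. A minimal sketch, assuming `main.py` exposes a FastAPI `app` (that file is not part of this diff, so the names below are illustrative):

```python
# Hypothetical main.py entrypoint; main.py is not shown in this commit,
# so the app name and this __main__ block are assumptions.
import os

import uvicorn
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
def health() -> dict:
    # Minimal liveness endpoint so the Space reports as running.
    return {"status": "ok"}


if __name__ == "__main__":
    # Mirror the Dockerfile CMD: bind 0.0.0.0 and honour the PORT injected
    # by Hugging Face Spaces, falling back to 8000 locally.
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "8000")))
```

With that in place, a plain `CMD ["python", "main.py"]` would behave the same as the shell-form uvicorn command; the Dockerfile as committed works as-is.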
README.md CHANGED
@@ -1,4 +1,12 @@
-
+ ---
+ title: RAG Project
+ emoji: 🧠
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ app_port: 8000
+ python_version: 3.10
+ ---

  # 🚀 RAG System with LangChain and FastAPI 🌐

@@ -78,7 +86,20 @@ This project uses Ollama to run local large language models.

  ## 🚀 Deployment

- ### Docker Deployment
+ ### Hugging Face Spaces (Docker) Deployment
+ This project is configured to run as a Hugging Face Space using the Docker runtime.
+
+ 1. Push this repository to GitHub (or connect a local clone).
+ 2. Create a new Space on Hugging Face and choose the "Docker" SDK.
+ 3. Point it at this repo. Spaces will build the `Dockerfile` and run `uvicorn` bound to the provided `PORT`.
+ 4. Ensure `data/sample.pdf` exists (or replace it) so the FAISS index can be created on startup.
+
+ Notes:
+ - The models `Qwen/Qwen2-0.5B-Instruct` and `all-MiniLM-L6-v2` are downloaded on first run; the initial cold start may take several minutes.
+ - Dependencies are CPU-friendly; no GPU is required.
+ - If you hit an OOM error, reduce `max_new_tokens` in `vector_rag.py` or swap in an even smaller instruct model.
+
+ ### Docker Deployment (Local)
  If you want to deploy your RAG system using Docker, simply build the Docker image and run the container:

  ```bash
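Review note on the cold-start caveat above: both models are fetched from the Hub on first run, which is what makes the initial start slow. One optional mitigation (not part of this commit) is to pre-download the weights during the Docker build so they are baked into the image layer. A rough sketch using `huggingface_hub`, assuming the default cache location is left unchanged:

```python
# prefetch_models.py -- hypothetical helper, not part of this commit.
# Run it from a Dockerfile RUN step so the weights are baked into the image
# and the Space does not re-download them on every cold start.
from huggingface_hub import snapshot_download

for repo_id in ("Qwen/Qwen2-0.5B-Instruct", "sentence-transformers/all-MiniLM-L6-v2"):
    # Downloads into the default HF cache (~/.cache/huggingface), which
    # transformers and sentence-transformers reuse at runtime.
    snapshot_download(repo_id)
```

It could be invoked from an extra `RUN python prefetch_models.py` step after the `pip install` line; the trade-off is a larger image in exchange for a faster cold start.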
rag.py CHANGED
@@ -1,25 +1,8 @@
- from vector_rag import query_vector_store
+ from vector_rag import query_vector_store, llm  # <--- FIX: Import llm here!
  import wikipedia
- from langchain_community.llms import HuggingFacePipeline
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- import os
- from dotenv import load_dotenv
-
- load_dotenv()
- model_name = "Qwen/Qwen2-1.5B-Instruct"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
- llm_pipeline = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_new_tokens=512,
-     do_sample=True,
-     temperature=0.7,
-     top_p=0.9,
- )
- llm = HuggingFacePipeline(pipeline=llm_pipeline)
+ # REMOVED: All duplicate model/pipeline/tokenizer imports and initialization code

+ # The 'llm' instance is now imported from vector_rag.py and is ready to use.
  wikipedia.set_lang("en")

  async def get_smart_rag_response(query: str) -> str:
@@ -27,7 +10,7 @@ async def get_smart_rag_response(query: str) -> str:

      # First: Try Wikipedia
      try:
-         summary = wikipedia.summary(query, sentences=5) # Dynamically gets summary
+         summary = wikipedia.summary(query, sentences=5)
          print("Wikipedia summary found.")

          prompt = f"""Use the following Wikipedia information to answer the question as clearly as possible.
@@ -38,20 +21,20 @@ Wikipedia Context:
  Question: {query}
  Answer:"""
          result = llm.predict(prompt)
-         answer = result.replace(prompt, "").strip() # Cleanup
+         answer = result.replace(prompt, "").strip()
          return f"[Wikipedia]\n{answer}"
      except wikipedia.exceptions.PageError:
-         print("Wikipedia page not found.") # Corrected simple handling
+         print("Wikipedia page not found.")
      except wikipedia.exceptions.DisambiguationError as e:
          return f"The query is ambiguous. Did you mean: {', '.join(e.options[:5])}?"

      # Second: Fallback to LLM (no context)
      try:
          print("Fallback: LLM with no context")
-         # FALLBACK PROMPT LOGIC RESTORED
+
          fallback_prompt = f"You are a knowledgeable assistant. Please answer the following question clearly:\n\n{query}"
          llm_answer = llm.predict(fallback_prompt)
-         answer = llm_answer.replace(fallback_prompt, "").strip() # Cleanup
+         answer = llm_answer.replace(fallback_prompt, "").strip()
          if answer and "not sure" not in answer.lower():
              return f"[LLM Fallback]\n{answer.strip()}"
      except Exception as e:
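Review note: the effect of this change is that `rag.py` now reuses the single `llm` instance built in `vector_rag.py` instead of loading a second copy of the model, which roughly halves memory use at startup. A minimal smoke test for the shared instance and the Wikipedia-then-fallback flow, assuming it is run from the repository root with the dependencies installed (the file name is illustrative, not part of this commit):

```python
# smoke_test.py -- illustrative only; assumes rag.py and vector_rag.py from
# this commit are importable from the repository root.
import asyncio

from rag import get_smart_rag_response      # importing rag triggers model + FAISS setup via vector_rag
from vector_rag import query_vector_store   # direct retrieval against data/sample.pdf


async def main() -> None:
    # Wikipedia-backed (or LLM-fallback) answer:
    print(await get_smart_rag_response("What is retrieval-augmented generation?"))
    # PDF-backed answer; returns None when no chunks match:
    print(query_vector_store("What does the sample document describe?"))


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `rag.py` still calls `llm.predict`, which newer LangChain releases mark as deprecated in favour of `invoke` (the same switch already made in `vector_rag.py`'s `query_vector_store`).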
requirements.txt CHANGED
@@ -3,6 +3,7 @@ uvicorn
  langchain
  langchain-community
  python-dotenv
+ langchain-huggingface
  faiss-cpu
  jinja2
  wikipedia
@@ -10,4 +11,4 @@ pypdf
  sentence-transformers
  torch
  transformers
- accelerate
+ accelerate
vector_rag.py CHANGED
@@ -1,54 +1,65 @@
  from langchain_community.document_loaders import PyPDFLoader
  from langchain_community.vectorstores import FAISS
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceInstructEmbeddings
- from langchain_community.llms import HuggingFacePipeline
+ # Use the generic HuggingFaceEmbeddings for the smaller model
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_huggingface import HuggingFacePipeline
+ # Remove BitsAndBytesConfig import
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  import os
  from dotenv import load_dotenv

  load_dotenv()

- loader = PyPDFLoader("data/sample.pdf")
- documents = loader.load()
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
- chunks = text_splitter.split_documents(documents)
+ # --- MODEL INITIALIZATION (Minimal Footprint) ---
+ print("Loading Qwen2-0.5B-Instruct...")
+ model_name = "Qwen/Qwen2-0.5B-Instruct"

- if not chunks:
-     raise ValueError("No document chunks found. Ensure 'sample.pdf' exists and is readable.")
-
- # Embed & store (HuggingFace Embeddings are free and fast)
- embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
- vectorstore = FAISS.from_documents(chunks, embeddings)
+ # Removed: quantization_config = BitsAndBytesConfig(load_in_8bit=True)

- retriever = vectorstore.as_retriever()
-
- # 1. NEW MODEL NAME
- model_name = "Qwen/Qwen2-1.5B-Instruct"
-
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ # Removed: quantization_config parameter from from_pretrained
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     device_map="cpu",
+     trust_remote_code=True
+ )

- # 2. Use the pipeline for text generation
  llm_pipeline = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
-     max_new_tokens=512,
+     max_new_tokens=256,
      do_sample=True,
-     temperature=0.7,
+     temperature=0.5,
      top_p=0.9,
  )
  llm = HuggingFacePipeline(pipeline=llm_pipeline)

+ # Use the lighter all-MiniLM-L6-v2 embeddings model
+ embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ # --- DOCUMENT LOADING & CHUNKING ---
+ loader = PyPDFLoader("data/sample.pdf") # Correct path for Docker: data/sample.pdf
+ documents = loader.load()
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ chunks = text_splitter.split_documents(documents)
+
+ if not chunks:
+     raise ValueError("No document chunks found.")
+
+ # Initialize FAISS and retriever
+ vectorstore = FAISS.from_documents(chunks, embeddings)
+ retriever = vectorstore.as_retriever()
+
+ # Expose the necessary components for rag.py to import
  def query_vector_store(query: str) -> str:
      docs = retriever.get_relevant_documents(query)
      if docs:
          context = "\n\n".join([doc.page_content for doc in docs])
          prompt = f"""Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"""

-         raw_output = llm.predict(prompt)
+         raw_output = llm.invoke(prompt)
          answer = raw_output.replace(prompt, "").strip()
          return answer
      return None
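Review note: `vector_rag.py` now builds the pipeline, embeddings, and FAISS index at import time and exposes `llm`, `retriever`, and `query_vector_store` for `rag.py`; `query_vector_store` returns `None` when retrieval finds nothing, and the generation call switched from `llm.predict` to `llm.invoke`. A sketch of how a caller such as a FastAPI route in `main.py` might consume it, handling the `None` case explicitly (that file is not shown in this diff, so the route path and response shape are assumptions):

```python
# Illustrative caller; the /ask-pdf route and response shape are assumptions,
# since main.py is not part of this commit.
from fastapi import FastAPI
from pydantic import BaseModel

from vector_rag import query_vector_store


class Query(BaseModel):
    question: str


app = FastAPI()


@app.post("/ask-pdf")
def ask_pdf(payload: Query) -> dict:
    answer = query_vector_store(payload.question)
    if answer is None:
        # query_vector_store returns None when the retriever finds no matching chunks.
        return {"answer": None, "source": "none"}
    return {"answer": answer, "source": "data/sample.pdf"}
```

Also worth noting: recent LangChain versions flag `retriever.get_relevant_documents(query)` as deprecated in favour of `retriever.invoke(query)`; since `requirements.txt` is unpinned, the Space may log a deprecation warning at this call site.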