Spaces:
Sleeping
Sleeping
Copied GitHub project to Hugging Face Space
Browse files- .gitignore +73 -0
- Dockerfile +10 -0
- backend/__init__.py +0 -0
- backend/api/__init__.py +0 -0
- backend/api/main.py +69 -0
- backend/h.py +9 -0
- backend/models/__init__.py +0 -0
- backend/models/app.py +77 -0
- backend/models/embed.py +62 -0
- requirements.txt +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python cache
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Jupyter Notebook
|
| 7 |
+
.ipynb_checkpoints/
|
| 8 |
+
|
| 9 |
+
# Virtual environment
|
| 10 |
+
env/
|
| 11 |
+
venv/
|
| 12 |
+
.venv/
|
| 13 |
+
ENV/
|
| 14 |
+
env.bak/
|
| 15 |
+
venv.bak/
|
| 16 |
+
|
| 17 |
+
# VS Code
|
| 18 |
+
.vscode/
|
| 19 |
+
|
| 20 |
+
# PyCharm
|
| 21 |
+
.idea/
|
| 22 |
+
|
| 23 |
+
# OS files
|
| 24 |
+
.DS_Store
|
| 25 |
+
Thumbs.db
|
| 26 |
+
|
| 27 |
+
# Logs and outputs
|
| 28 |
+
*.log
|
| 29 |
+
*.out
|
| 30 |
+
*.err
|
| 31 |
+
|
| 32 |
+
# Python packages
|
| 33 |
+
*.egg
|
| 34 |
+
*.egg-info/
|
| 35 |
+
dist/
|
| 36 |
+
build/
|
| 37 |
+
*.whl
|
| 38 |
+
|
| 39 |
+
# Test and coverage results
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.cache
|
| 45 |
+
nosetests.xml
|
| 46 |
+
coverage.xml
|
| 47 |
+
*.cover
|
| 48 |
+
*.py,cover
|
| 49 |
+
|
| 50 |
+
# Environment variables
|
| 51 |
+
.env
|
| 52 |
+
.env.*
|
| 53 |
+
|
| 54 |
+
# Local notebooks
|
| 55 |
+
*.ipynb
|
| 56 |
+
notebooks/
|
| 57 |
+
*.checkpoint.ipynb
|
| 58 |
+
|
| 59 |
+
# FastAPI/Streamlit uploads or temporary files
|
| 60 |
+
uploads/
|
| 61 |
+
tmp/
|
| 62 |
+
*.bak
|
| 63 |
+
*.swp
|
| 64 |
+
|
| 65 |
+
# MyPy, Pyre, Pytype
|
| 66 |
+
.mypy_cache/
|
| 67 |
+
.pytype/
|
| 68 |
+
.pyre/
|
| 69 |
+
|
| 70 |
+
# FAISS index or data
|
| 71 |
+
*.faiss
|
| 72 |
+
*.index
|
| 73 |
+
|
Dockerfile
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10

# pytesseract is only a wrapper around the tesseract executable; the OCR step
# in backend/models/embed.py fails at runtime unless the binary is installed.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /code

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# The FastAPI instance is defined in backend/api/main.py; there is no
# top-level app.py module for "app:app" to resolve to.
CMD ["uvicorn", "backend.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
backend/__init__.py
ADDED
|
File without changes
|
backend/api/__init__.py
ADDED
|
File without changes
|
backend/api/main.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, UploadFile, File, Form
|
| 2 |
+
from fastapi.responses import JSONResponse
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
import os
|
| 5 |
+
import shutil
|
| 6 |
+
from typing import List
|
| 7 |
+
from backend.models.embed import embed_and_store # Import from embed.py
|
| 8 |
+
from backend.models.app import chat_with_user
|
| 9 |
+
|
| 10 |
+
app = FastAPI()

# CORS setup so the Streamlit frontend can call this API.
# Wildcard origins/methods/headers must not be combined with
# allow_credentials=True, and these endpoints take plain form fields rather
# than cookies, so credentials are disabled. To support credentialed requests
# later, list the allowed origins explicitly and re-enable the flag.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 20 |
+
|
| 21 |
+
# PDF Upload endpoint
|
| 22 |
+
def _safe_name(raw):
    """Return the final path component of *raw*, or None if it is unusable.

    Client-supplied names may carry directory parts ("../../x", "C:\\a\\b.pdf").
    Only the last component is kept, so the result can never point outside
    the directory it is later joined to.
    """
    if not raw:
        return None
    name = os.path.basename(raw.replace("\\", "/"))
    if name in ("", ".", "..") or "\x00" in name:
        return None
    return name


@app.post("/upload/")
async def upload_pdfs(user_id: str = Form(...), files: List[UploadFile] = File(...)):
    """Save uploaded PDFs/images under ``docs/<user_id>/`` and embed them.

    Args:
        user_id: Form field used as the per-user directory name under ``docs``.
        files: Uploaded files. ``.pdf`` files (or ``application/pdf`` content)
            go to ``pdfs/``; ``.png``/``.jpg``/``.jpeg`` files (or ``image/*``
            content) go to ``images/``.

    Returns:
        A success message dict; a 400 JSON response for an invalid ``user_id``
        or an unsupported file; a 500 JSON response if embedding fails.
    """
    # user_id becomes a directory name, so it must be exactly one path component.
    if _safe_name(user_id) != user_id:
        return JSONResponse(status_code=400, content={"error": f"Invalid user_id: {user_id!r}"})

    base_dir = os.path.join("docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # Resolve every destination before touching the disk, so one unsupported
    # file rejects the whole request instead of leaving a partial upload.
    destinations = []
    for file in files:
        filename = _safe_name(file.filename)
        lowered = filename.lower() if filename else ""
        content_type = file.content_type or ""

        # Extension first, then fall back to the declared content type.
        if lowered.endswith(".pdf"):
            target_dir = pdf_dir
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            target_dir = image_dir
        elif filename and content_type == "application/pdf":
            target_dir = pdf_dir
        elif filename and content_type.startswith("image/"):
            target_dir = image_dir
        else:
            return JSONResponse(
                status_code=400,
                content={"error": f"Unsupported file type: {file.filename} ({file.content_type})"},
            )
        destinations.append((file, os.path.join(target_dir, filename)))

    for directory in (pdf_dir, image_dir, faiss_dir):
        os.makedirs(directory, exist_ok=True)

    for file, file_path in destinations:
        with open(file_path, "wb") as f:
            f.write(await file.read())

    # Process the stored documents and embed them into FAISS.
    try:
        embed_and_store(user_id=user_id)
        return {"message": f"Documents uploaded and embedded successfully for user {user_id}"}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
|
| 60 |
+
|
| 61 |
+
# Chat endpoint
|
| 62 |
+
@app.post("/chat/")
async def chat(user_id: str = Form(...), query: str = Form(...)):
    """Answer ``query`` against the documents indexed for ``user_id``.

    Returns ``{"response": ...}`` on success, or a 500 JSON response carrying
    the exception text if anything raised while answering.
    """
    try:
        answer = chat_with_user(user_id, query)
    except Exception as exc:
        return JSONResponse(status_code=500, content={"error": str(exc)})
    else:
        return {"response": answer}
|
backend/h.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys

from PIL import Image
import pytesseract


def main(argv=None):
    """Run OCR on one image and print the extracted text.

    Args:
        argv: Argument list holding a single image path; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 2 on incorrect usage.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        print("Usage: python h.py <image-path>", file=sys.stderr)
        return 2

    # Load the image and run OCR; the context manager releases the file handle.
    with Image.open(args[0]) as image:
        text = pytesseract.image_to_string(image)

    print("Extracted Text:")
    print(text)
    return 0


# Guarded so that importing backend.h no longer triggers OCR as a side effect.
if __name__ == "__main__":
    sys.exit(main())
|
backend/models/__init__.py
ADDED
|
File without changes
|
backend/models/app.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.vectorstores import FAISS
|
| 2 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 3 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 4 |
+
from langchain.chains import create_retrieval_chain
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_groq import ChatGroq
|
| 7 |
+
import os
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# Load the LLM. ChatGroq picks up GROQ_API_KEY from the environment (which
# load_dotenv() above may have populated from a .env file), so re-assigning it
# into os.environ is unnecessary; instead fail early with a clear message,
# since assigning a missing (None) value raised an opaque TypeError.
if not os.getenv('GROQ_API_KEY'):
    raise RuntimeError("GROQ_API_KEY is not set; define it in the environment or in a .env file")
llm = ChatGroq(model='llama-3.3-70b-versatile')
|
| 14 |
+
|
| 15 |
+
# Shared embedding model, created on first use so it is not reloaded per query.
_EMBEDDINGS = None


def _get_embeddings():
    """Return the shared HuggingFace embedding model, loading it once."""
    global _EMBEDDINGS
    if _EMBEDDINGS is None:
        _EMBEDDINGS = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return _EMBEDDINGS


# Function to load a user's vector store
def load_user_vectorstore(user_id: str):
    """Load the FAISS vector store saved under ``docs/<user_id>/faiss_index``.

    Args:
        user_id: Name of the per-user directory under ``docs``.

    Returns:
        The FAISS vector store loaded from disk.

    Raises:
        ValueError: If ``user_id`` is not a single path component, or no
            index has been saved for that user.
    """
    # user_id is joined into a filesystem path, so refuse anything that could
    # step outside docs/<user_id>/.
    if not user_id or user_id in (".", "..") or any(ch in user_id for ch in ("/", "\\", "\x00")):
        raise ValueError(f"Invalid user id: {user_id!r}")

    faiss_path = os.path.join("docs", user_id, "faiss_index")
    # Check for the index file itself: the upload endpoint creates the
    # directory before embedding, so the directory can exist while empty.
    if not os.path.exists(os.path.join(faiss_path, "index.faiss")):
        raise ValueError(f"No FAISS index found for user '{user_id}'")

    # NOTE(security): load_local unpickles data stored next to the index.
    # This is only acceptable because the files are written by this service;
    # never point it at an index obtained from an untrusted source.
    return FAISS.load_local(faiss_path, _get_embeddings(), allow_dangerous_deserialization=True)
|
| 23 |
+
|
| 24 |
+
prompt = ChatPromptTemplate.from_template("""
|
| 25 |
+
You are a highly skilled document research assistant.
|
| 26 |
+
|
| 27 |
+
Your task is to read the extracted document snippets provided in <context> and respond to the user's question using the following structure:
|
| 28 |
+
|
| 29 |
+
Document-Level Answers:
|
| 30 |
+
Identify relevant content from each document that helps answer the question. Present the findings in a markdown table with **three columns**:
|
| 31 |
+
- `Document ID`: A unique identifier (e.g., DOC001, DOC002)
|
| 32 |
+
- `Extracted Answer`: A short but meaningful excerpt from the document (max 2–3 lines)
|
| 33 |
+
- `Citation`: Include "Page X, Paragraph Y" or "Page X, Sentence Y" based on metadata
|
| 34 |
+
|
| 35 |
+
Format:
|
| 36 |
+
Extract the document id and citation from documents and show in this format below:
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
| Document ID | Extracted Answer | Citation |
|
| 40 |
+
|-------------|------------------|----------|
|
| 41 |
+
| DOC001 | The company was fined under section 15A for non-compliance… | Page 3, Para 2 |
|
| 42 |
+
| DOC002 | The delay in reporting was noted as a violation of Clause 49… | Page 5, Para 1 |
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
Synthesized Summary:
|
| 46 |
+
Next, synthesize key **themes or insights** found across the extracted answers. Group the responses by theme (e.g., "Regulatory Non-Compliance", "Disclosure Failures"). For each theme, follow this format:
|
| 47 |
+
|
| 48 |
+
**Theme Name – Short Description:**
|
| 49 |
+
DOC IDs involved: Summarized insight based on their content.
|
| 50 |
+
|
| 51 |
+
Example:
|
| 52 |
+
**Theme 1 – Regulatory Non-Compliance:**
|
| 53 |
+
DOC001, DOC002: Highlighted breaches of SEBI Act and LODR regulations.
|
| 54 |
+
|
| 55 |
+
Return the final response **in markdown format** so it can be rendered on-screen or exported to PDF.
|
| 56 |
+
|
| 57 |
+
<context>
|
| 58 |
+
{context}
|
| 59 |
+
</context>
|
| 60 |
+
|
| 61 |
+
User Question:
|
| 62 |
+
{input}
|
| 63 |
+
""")
|
| 64 |
+
|
| 65 |
+
# Function for asking a question about the user's uploaded documents
def chat_with_user(user_id: str, query: str):
    """Answer ``query`` using the documents indexed for ``user_id``.

    Args:
        user_id: Identifies whose vector store to load.
        query: The user's question, passed to the chain as ``input``.

    Returns:
        The ``answer`` entry of the retrieval chain's response.

    Raises:
        ValueError: Propagated from ``load_user_vectorstore`` when the user
            has no index.
    """
    vectors = load_user_vectorstore(user_id)
    retriever = vectors.as_retriever()

    # The retrieval chain performs the document lookup itself, so no separate
    # retriever call is needed beforehand.
    stuff_documents = create_stuff_documents_chain(llm, prompt)
    retrieval_chain = create_retrieval_chain(retriever, stuff_documents)
    response = retrieval_chain.invoke({'input': query})
    return response['answer']
|
| 77 |
+
|
backend/models/embed.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
| 2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_community.vectorstores import FAISS
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
import os
|
| 7 |
+
import shutil
|
| 8 |
+
from PIL import Image
|
| 9 |
+
import pytesseract
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Function to load a user's documents and save them into the vector store
def embed_and_store(user_id: str):
    """Build the FAISS index for ``user_id`` from their stored PDFs and images.

    Reads every PDF in ``docs/<user_id>/pdfs`` and OCRs every PNG/JPEG in
    ``docs/<user_id>/images``, tags each chunk with ``doc_id``, ``chunk_id``
    and ``citation`` metadata, and saves the index to
    ``docs/<user_id>/faiss_index``.

    The index is rebuilt from scratch on each call. All stored files are
    re-read every time, so appending to the previous index would add earlier
    documents again on every upload.

    Args:
        user_id: Name of the per-user directory under ``docs``.

    Raises:
        ValueError: If no text could be extracted from any stored file.
    """
    # Set up user directories
    base_dir = os.path.join("docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # Use pytesseract to extract text from images
    doc_images = []
    if os.path.isdir(image_dir):
        # Sorted so chunk ids are stable between runs.
        for filename in sorted(os.listdir(image_dir)):
            if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
                continue
            with Image.open(os.path.join(image_dir, filename)) as image:
                text = pytesseract.image_to_string(image)
            # Images with no recognisable text add nothing to the index.
            if text.strip():
                doc_images.append(Document(page_content=text, metadata={"source": filename}))

    # Load PDFs using PyPDFDirectoryLoader and split them into chunks
    loader = PyPDFDirectoryLoader(pdf_dir)
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    pdf_chunks = splitter.split_documents(loader.load())

    updated_documents = []
    for i, doc in enumerate(pdf_chunks + doc_images):
        meta = doc.metadata.copy()

        # Use the source filename, or a positional fallback, everywhere below
        # so a missing "source" key cannot raise KeyError.
        doc_id = meta.get("source", f"doc_{i}")
        meta["doc_id"] = doc_id
        meta["chunk_id"] = i

        # Page number is available for PDF chunks
        if "page" in meta:
            meta["citation"] = f"{doc_id} - page {meta['page']}, chunk {i}"
        else:
            meta["citation"] = f"{doc_id} - chunk {i}"

        updated_documents.append(Document(page_content=doc.page_content, metadata=meta))

    if not updated_documents:
        # FAISS.from_documents fails on an empty list; report the real cause.
        raise ValueError(f"No text could be extracted from the documents of user '{user_id}'")

    # Load the HuggingFace embedding model
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    vectorstore = FAISS.from_documents(updated_documents, embeddings)
    vectorstore.save_local(faiss_dir)
    print(f"✅ FAISS updated for user: {user_id}")
|
| 62 |
+
|
requirements.txt
ADDED
|
Binary file (7.57 kB). View file
|
|
|