tech5 committed on
Commit
7bdc074
·
1 Parent(s): a55fa62

Copied GitHub project to Hugging Face Space

Browse files
.gitignore ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Jupyter Notebook
7
+ .ipynb_checkpoints/
8
+
9
+ # Virtual environment
10
+ env/
11
+ venv/
12
+ .venv/
13
+ ENV/
14
+ env.bak/
15
+ venv.bak/
16
+
17
+ # VS Code
18
+ .vscode/
19
+
20
+ # PyCharm
21
+ .idea/
22
+
23
+ # OS files
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Logs and outputs
28
+ *.log
29
+ *.out
30
+ *.err
31
+
32
+ # Python packages
33
+ *.egg
34
+ *.egg-info/
35
+ dist/
36
+ build/
37
+ *.whl
38
+
39
+ # Test and coverage results
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+
50
+ # Environment variables
51
+ .env
52
+ .env.*
53
+
54
+ # Local notebooks
55
+ *.ipynb
56
+ notebooks/
57
+ *.checkpoint.ipynb
58
+
59
+ # FastAPI/Streamlit uploads or temporary files
60
+ uploads/
61
+ tmp/
62
+ *.bak
63
+ *.swp
64
+
65
+ # MyPy, Pyre, Pytype
66
+ .mypy_cache/
67
+ .pytype/
68
+ .pyre/
69
+
70
+ # FAISS index or data
71
+ *.faiss
72
+ *.index
73
+
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Base image with Python 3.10 preinstalled
FROM python:3.10

WORKDIR /code

# Install dependencies first so this layer is cached independently
# of source-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source into the image.
COPY . .

# Hugging Face Spaces serves on port 7860.
# NOTE(review): "app:app" assumes a top-level app.py exposing `app`;
# the FastAPI instance in this commit lives in backend/api/main.py — confirm.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
backend/__init__.py ADDED
File without changes
backend/api/__init__.py ADDED
File without changes
backend/api/main.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ import os
5
+ import shutil
6
+ from typing import List
7
+ from backend.models.embed import embed_and_store # Import from embed.py
8
+ from backend.models.app import chat_with_user
9
+
10
# FastAPI application instance (run via uvicorn, see Dockerfile).
app = FastAPI()

# CORS setup for frontend communication (Streamlit).
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# fully permissive — presumably acceptable for a demo Space; confirm before
# any production deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
20
+
21
# PDF Upload endpoint
@app.post("/upload/")
async def upload_pdfs(user_id: str = Form(...), files: List[UploadFile] = File(...)):
    """Save uploaded PDFs/images under docs/<user_id>/ and embed them into FAISS.

    Returns a success message on completion, a 400 JSON error for unsupported
    file types, or a 500 JSON error when embedding fails.
    """
    # Create per-user directories.
    base_dir = os.path.join("docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(faiss_dir, exist_ok=True)

    # Save uploaded files to the user's directory.
    for file in files:
        # SECURITY FIX: strip client-supplied directory components so a
        # filename like "../../x" cannot escape the user's directory.
        safe_name = os.path.basename(file.filename or "")
        if not safe_name:
            return JSONResponse(status_code=400, content={"error": "Missing filename"})
        filename_lower = safe_name.lower()

        # First check the filename extension, then fall back to content-type.
        if filename_lower.endswith(".pdf"):
            file_path = os.path.join(pdf_dir, safe_name)
        elif filename_lower.endswith((".png", ".jpg", ".jpeg")):
            file_path = os.path.join(image_dir, safe_name)
        elif file.content_type == "application/pdf":
            file_path = os.path.join(pdf_dir, safe_name)
        elif file.content_type and file.content_type.startswith("image/"):
            file_path = os.path.join(image_dir, safe_name)
        else:
            # BUG FIX: previously returned a plain dict (HTTP 200) for
            # unsupported input; report it as a client error instead.
            return JSONResponse(
                status_code=400,
                content={"error": f"Unsupported file type: {safe_name} ({file.content_type})"},
            )

        with open(file_path, "wb") as f:
            f.write(await file.read())

    # Process the documents and embed them into FAISS.
    try:
        embed_and_store(user_id=user_id)
        return {"message": f"Documents uploaded and embedded successfully for user {user_id}"}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
60
+
61
# Chat endpoint
@app.post("/chat/")
async def chat(user_id: str = Form(...), query: str = Form(...)):
    """Answer *query* against the documents embedded for *user_id*."""
    try:
        # Delegate to the RAG chain built over the user's vectorstore.
        answer = chat_with_user(user_id, query)
    except Exception as exc:
        # Surface any failure (missing index, LLM error) as a 500 payload.
        return JSONResponse(status_code=500, content={"error": str(exc)})
    return {"response": answer}
backend/h.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from PIL import Image
import pytesseract


def extract_text(image_path: str) -> str:
    """Run OCR on the image at *image_path* and return the extracted text."""
    image = Image.open(image_path)
    return pytesseract.image_to_string(image)


if __name__ == "__main__":
    # BUG FIX: the OCR ran unconditionally at import time on a hard-coded
    # absolute Windows path that only exists on the author's machine.
    # Take the path from the command line instead, keeping the old path
    # as the default for backward compatibility.
    import sys

    default_path = "C:\\Users\\Acer\\Downloads\\ChatGPT Image May 3, 2025, 01_11_31 PM.png"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    print("Extracted Text:")
    print(extract_text(path))
backend/models/__init__.py ADDED
File without changes
backend/models/app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import FAISS
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from langchain.chains.combine_documents import create_stuff_documents_chain
4
+ from langchain.chains import create_retrieval_chain
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_groq import ChatGroq
7
+ import os
8
+ from dotenv import load_dotenv
9
load_dotenv()

# Load LLM.
# BUG FIX: the original did `os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')`,
# which raises TypeError at import time when the variable is unset
# (os.environ values must be strings). Only re-export the key when present;
# ChatGroq reads GROQ_API_KEY from the environment itself.
groq_api_key = os.getenv('GROQ_API_KEY')
if groq_api_key:
    os.environ['GROQ_API_KEY'] = groq_api_key
llm = ChatGroq(model='llama-3.3-70b-versatile')
14
+
15
+ #Funtion for load Vector data
16
# Function to load a user's persisted vector store from disk.
def load_user_vectorstore(user_id: str):
    """Load the FAISS index previously saved for *user_id*.

    Raises:
        ValueError: when the user has no index on disk.
    """
    index_dir = os.path.join("docs", user_id, "faiss_index")
    if not os.path.exists(index_dir):
        raise ValueError(f"No FAISS index found for user '{user_id}'")

    # Must be the same embedding model used when the index was built.
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    # allow_dangerous_deserialization: the index is produced locally by
    # embed_and_store, not loaded from untrusted input.
    return FAISS.load_local(index_dir, embedding_model, allow_dangerous_deserialization=True)
23
+
24
# Prompt template for the retrieval chain: instructs the LLM to produce a
# per-document markdown answer table (with citations) followed by a
# synthesized theme summary. `{context}` receives the retrieved chunks and
# `{input}` the user's question (see chat_with_user below).
prompt = ChatPromptTemplate.from_template("""
You are a highly skilled document research assistant.

Your task is to read the extracted document snippets provided in <context> and respond to the user's question using the following structure:

Document-Level Answers:
Identify relevant content from each document that helps answer the question. Present the findings in a markdown table with **three columns**:
- `Document ID`: A unique identifier (e.g., DOC001, DOC002)
- `Extracted Answer`: A short but meaningful excerpt from the document (max 2–3 lines)
- `Citation`: Include "Page X, Paragraph Y" or "Page X, Sentence Y" based on metadata

Format:
Extract the document id and citation from documents and show in this format below:


| Document ID | Extracted Answer | Citation |
|-------------|------------------|----------|
| DOC001 | The company was fined under section 15A for non-compliance… | Page 3, Para 2 |
| DOC002 | The delay in reporting was noted as a violation of Clause 49… | Page 5, Para 1 |
---

Synthesized Summary:
Next, synthesize key **themes or insights** found across the extracted answers. Group the responses by theme (e.g., "Regulatory Non-Compliance", "Disclosure Failures"). For each theme, follow this format:

**Theme Name – Short Description:**
DOC IDs involved: Summarized insight based on their content.

Example:
**Theme 1 – Regulatory Non-Compliance:**
DOC001, DOC002: Highlighted breaches of SEBI Act and LODR regulations.

Return the final response **in markdown format** so it can be rendered on-screen or exported to PDF.

<context>
{context}
</context>

User Question:
{input}
""")
64
+
65
# Function for asking any query related to the uploaded documents.
def chat_with_user(user_id: str, query: str) -> str:
    """Answer *query* with a retrieval chain over the user's FAISS index.

    Raises:
        ValueError: (via load_user_vectorstore) when the user has no index.
    """
    vectors = load_user_vectorstore(user_id)
    retriever = vectors.as_retriever()

    # BUG FIX: the original also called the deprecated
    # retriever.get_relevant_documents(query) here and discarded the result —
    # a dead extra retrieval; create_retrieval_chain retrieves internally.

    # Chain the retriever with the stuff-documents LLM chain and answer.
    stuff_documents = create_stuff_documents_chain(llm, prompt)
    retrieval_chain = create_retrieval_chain(retriever, stuff_documents)
    response = retrieval_chain.invoke({'input': query})
    return response['answer']
backend/models/embed.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+ import os
7
+ import shutil
8
+ from PIL import Image
9
+ import pytesseract
10
+
11
+
12
# Function to load documents and save them into the user's vector store.
def embed_and_store(user_id: str):
    """Embed the user's PDFs and OCR'd images into a FAISS index on disk.

    Reads docs/<user_id>/pdfs and docs/<user_id>/images, attaches citation
    metadata to every chunk, and appends to (or creates) the FAISS index in
    docs/<user_id>/faiss_index.

    Raises:
        ValueError: when no supported documents are found for the user.
    """
    # Setup user directories.
    base_dir = os.path.join("docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # Extract image text with pytesseract.
    # BUG FIX: guard against a missing directory — os.listdir raised
    # FileNotFoundError when a user had uploaded only PDFs (or only images).
    doc_images = []
    if os.path.isdir(image_dir):
        for filename in os.listdir(image_dir):
            if filename.lower().endswith((".png", ".jpg", ".jpeg")):
                image = Image.open(os.path.join(image_dir, filename))
                text = pytesseract.image_to_string(image)
                doc_images.append(Document(page_content=text, metadata={"source": filename}))

    # Load PDFs using PyPDFDirectoryLoader and split into overlapping chunks.
    docs = []
    if os.path.isdir(pdf_dir):
        loader = PyPDFDirectoryLoader(pdf_dir)
        docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = splitter.split_documents(docs) + doc_images

    # BUG FIX: FAISS.from_documents fails with an opaque error on empty input;
    # fail early with an actionable message instead.
    if not documents:
        raise ValueError(f"No supported documents found for user '{user_id}'")

    # Attach traceable ids/citations to each chunk's metadata.
    updated_documents = []
    for i, doc in enumerate(documents):
        meta = doc.metadata.copy()
        meta["doc_id"] = meta.get("source", f"doc_{i}")  # filename or fallback
        meta["chunk_id"] = i
        # Page number is only present for PDF-derived chunks.
        if "page" in meta:
            meta["citation"] = f"{meta['source']} - page {meta['page']}, chunk {i}"
        else:
            meta["citation"] = f"{meta['source']} - chunk {i}"
        updated_documents.append(Document(page_content=doc.page_content, metadata=meta))

    # Load the HuggingFace embedding model (must match load_user_vectorstore).
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    # Append to the existing FAISS index if one exists, otherwise create it.
    if os.path.exists(os.path.join(faiss_dir, "index.faiss")):
        vectorstore = FAISS.load_local(faiss_dir, embeddings, allow_dangerous_deserialization=True)
        vectorstore.add_documents(updated_documents)
    else:
        vectorstore = FAISS.from_documents(updated_documents, embeddings)

    vectorstore.save_local(faiss_dir)
    print(f"✅ FAISS updated for user: {user_id}")
+
requirements.txt ADDED
Binary file (7.57 kB). View file