tech5 committed on
Commit
7bdc074
·
1 Parent(s): a55fa62

Copied GitHub project to Hugging Face Space

Browse files
.gitignore ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Jupyter Notebook
7
+ .ipynb_checkpoints/
8
+
9
+ # Virtual environment
10
+ env/
11
+ venv/
12
+ .venv/
13
+ ENV/
14
+ env.bak/
15
+ venv.bak/
16
+
17
+ # VS Code
18
+ .vscode/
19
+
20
+ # PyCharm
21
+ .idea/
22
+
23
+ # OS files
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Logs and outputs
28
+ *.log
29
+ *.out
30
+ *.err
31
+
32
+ # Python packages
33
+ *.egg
34
+ *.egg-info/
35
+ dist/
36
+ build/
37
+ *.whl
38
+
39
+ # Test and coverage results
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+
50
+ # Environment variables
51
+ .env
52
+ .env.*
53
+
54
+ # Local notebooks
55
+ *.ipynb
56
+ notebooks/
57
+ *.checkpoint.ipynb
58
+
59
+ # FastAPI/Streamlit uploads or temporary files
60
+ uploads/
61
+ tmp/
62
+ *.bak
63
+ *.swp
64
+
65
+ # MyPy, Pyre, Pytype
66
+ .mypy_cache/
67
+ .pytype/
68
+ .pyre/
69
+
70
+ # FAISS index or data
71
+ *.faiss
72
+ *.index
73
+
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Base image with Python 3.10 preinstalled
FROM python:3.10

WORKDIR /code

# Install dependencies first so this layer is cached independently
# of source-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source into the image.
COPY . .

# Hugging Face Spaces serves on port 7860.
# NOTE(review): "app:app" assumes a top-level app.py exposing `app`;
# the FastAPI instance in this commit lives in backend/api/main.py — confirm.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
backend/__init__.py ADDED
File without changes
backend/api/__init__.py ADDED
File without changes
backend/api/main.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ import os
5
+ import shutil
6
+ from typing import List
7
+ from backend.models.embed import embed_and_store # Import from embed.py
8
+ from backend.models.app import chat_with_user
9
+
10
# FastAPI application instance (run via uvicorn, see Dockerfile).
app = FastAPI()

# CORS setup for frontend communication (Streamlit).
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# fully permissive — presumably acceptable for a demo Space; confirm before
# any production deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
20
+
21
# PDF Upload endpoint
@app.post("/upload/")
async def upload_pdfs(user_id: str = Form(...), files: List[UploadFile] = File(...)):
    """Save uploaded PDFs/images under docs/<user_id>/ and embed them into FAISS.

    Returns a success message on completion, a 400 JSON error for unsupported
    file types, or a 500 JSON error when embedding fails.
    """
    # Create per-user directories.
    base_dir = os.path.join("docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(faiss_dir, exist_ok=True)

    # Save uploaded files to the user's directory.
    for file in files:
        # SECURITY FIX: strip client-supplied directory components so a
        # filename like "../../x" cannot escape the user's directory.
        safe_name = os.path.basename(file.filename or "")
        if not safe_name:
            return JSONResponse(status_code=400, content={"error": "Missing filename"})
        filename_lower = safe_name.lower()

        # First check the filename extension, then fall back to content-type.
        if filename_lower.endswith(".pdf"):
            file_path = os.path.join(pdf_dir, safe_name)
        elif filename_lower.endswith((".png", ".jpg", ".jpeg")):
            file_path = os.path.join(image_dir, safe_name)
        elif file.content_type == "application/pdf":
            file_path = os.path.join(pdf_dir, safe_name)
        elif file.content_type and file.content_type.startswith("image/"):
            file_path = os.path.join(image_dir, safe_name)
        else:
            # BUG FIX: previously returned a plain dict (HTTP 200) for
            # unsupported input; report it as a client error instead.
            return JSONResponse(
                status_code=400,
                content={"error": f"Unsupported file type: {safe_name} ({file.content_type})"},
            )

        with open(file_path, "wb") as f:
            f.write(await file.read())

    # Process the documents and embed them into FAISS.
    try:
        embed_and_store(user_id=user_id)
        return {"message": f"Documents uploaded and embedded successfully for user {user_id}"}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
60
+
61
# Chat endpoint
@app.post("/chat/")
async def chat(user_id: str = Form(...), query: str = Form(...)):
    """Answer *query* against the documents embedded for *user_id*."""
    try:
        # Delegate to the RAG chain built over the user's vectorstore.
        answer = chat_with_user(user_id, query)
    except Exception as exc:
        # Surface any failure (missing index, LLM error) as a 500 payload.
        return JSONResponse(status_code=500, content={"error": str(exc)})
    return {"response": answer}
backend/h.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from PIL import Image
import pytesseract


def extract_text(image_path: str) -> str:
    """Run OCR on the image at *image_path* and return the extracted text."""
    image = Image.open(image_path)
    return pytesseract.image_to_string(image)


if __name__ == "__main__":
    # BUG FIX: the OCR ran unconditionally at import time on a hard-coded
    # absolute Windows path that only exists on the author's machine.
    # Take the path from the command line instead, keeping the old path
    # as the default for backward compatibility.
    import sys

    default_path = "C:\\Users\\Acer\\Downloads\\ChatGPT Image May 3, 2025, 01_11_31 PM.png"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    print("Extracted Text:")
    print(extract_text(path))
backend/models/__init__.py ADDED
File without changes
backend/models/app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import FAISS
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from langchain.chains.combine_documents import create_stuff_documents_chain
4
+ from langchain.chains import create_retrieval_chain
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_groq import ChatGroq
7
+ import os
8
+ from dotenv import load_dotenv
9
load_dotenv()

# Load LLM.
# BUG FIX: the original did `os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')`,
# which raises TypeError at import time when the variable is unset
# (os.environ values must be strings). Only re-export the key when present;
# ChatGroq reads GROQ_API_KEY from the environment itself.
groq_api_key = os.getenv('GROQ_API_KEY')
if groq_api_key:
    os.environ['GROQ_API_KEY'] = groq_api_key
llm = ChatGroq(model='llama-3.3-70b-versatile')
14
+
15
+ #Funtion for load Vector data
16
# Function to load a user's persisted vector store from disk.
def load_user_vectorstore(user_id: str):
    """Load the FAISS index previously saved for *user_id*.

    Raises:
        ValueError: when the user has no index on disk.
    """
    index_dir = os.path.join("docs", user_id, "faiss_index")
    if not os.path.exists(index_dir):
        raise ValueError(f"No FAISS index found for user '{user_id}'")

    # Must be the same embedding model used when the index was built.
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    # allow_dangerous_deserialization: the index is produced locally by
    # embed_and_store, not loaded from untrusted input.
    return FAISS.load_local(index_dir, embedding_model, allow_dangerous_deserialization=True)
23
+
24
# Prompt template for the retrieval chain: instructs the LLM to produce a
# per-document markdown answer table (with citations) followed by a
# synthesized theme summary. `{context}` receives the retrieved chunks and
# `{input}` the user's question (see chat_with_user below).
prompt = ChatPromptTemplate.from_template("""
You are a highly skilled document research assistant.

Your task is to read the extracted document snippets provided in <context> and respond to the user's question using the following structure:

Document-Level Answers:
Identify relevant content from each document that helps answer the question. Present the findings in a markdown table with **three columns**:
- `Document ID`: A unique identifier (e.g., DOC001, DOC002)
- `Extracted Answer`: A short but meaningful excerpt from the document (max 2–3 lines)
- `Citation`: Include "Page X, Paragraph Y" or "Page X, Sentence Y" based on metadata

Format:
Extract the document id and citation from documents and show in this format below:


| Document ID | Extracted Answer | Citation |
|-------------|------------------|----------|
| DOC001 | The company was fined under section 15A for non-compliance… | Page 3, Para 2 |
| DOC002 | The delay in reporting was noted as a violation of Clause 49… | Page 5, Para 1 |
---

Synthesized Summary:
Next, synthesize key **themes or insights** found across the extracted answers. Group the responses by theme (e.g., "Regulatory Non-Compliance", "Disclosure Failures"). For each theme, follow this format:

**Theme Name – Short Description:**
DOC IDs involved: Summarized insight based on their content.

Example:
**Theme 1 – Regulatory Non-Compliance:**
DOC001, DOC002: Highlighted breaches of SEBI Act and LODR regulations.

Return the final response **in markdown format** so it can be rendered on-screen or exported to PDF.

<context>
{context}
</context>

User Question:
{input}
""")
64
+
65
# Function for asking any query related to the uploaded documents.
def chat_with_user(user_id: str, query: str) -> str:
    """Answer *query* with a retrieval chain over the user's FAISS index.

    Raises:
        ValueError: (via load_user_vectorstore) when the user has no index.
    """
    vectors = load_user_vectorstore(user_id)
    retriever = vectors.as_retriever()

    # BUG FIX: the original also called the deprecated
    # retriever.get_relevant_documents(query) here and discarded the result —
    # a dead extra retrieval; create_retrieval_chain retrieves internally.

    # Chain the retriever with the stuff-documents LLM chain and answer.
    stuff_documents = create_stuff_documents_chain(llm, prompt)
    retrieval_chain = create_retrieval_chain(retriever, stuff_documents)
    response = retrieval_chain.invoke({'input': query})
    return response['answer']
backend/models/embed.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+ import os
7
+ import shutil
8
+ from PIL import Image
9
+ import pytesseract
10
+
11
+
12
# Function to load documents and save them into the user's vector store.
def embed_and_store(user_id: str):
    """Embed the user's PDFs and OCR'd images into a FAISS index on disk.

    Reads docs/<user_id>/pdfs and docs/<user_id>/images, attaches citation
    metadata to every chunk, and appends to (or creates) the FAISS index in
    docs/<user_id>/faiss_index.

    Raises:
        ValueError: when no supported documents are found for the user.
    """
    # Setup user directories.
    base_dir = os.path.join("docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # Extract image text with pytesseract.
    # BUG FIX: guard against a missing directory — os.listdir raised
    # FileNotFoundError when a user had uploaded only PDFs (or only images).
    doc_images = []
    if os.path.isdir(image_dir):
        for filename in os.listdir(image_dir):
            if filename.lower().endswith((".png", ".jpg", ".jpeg")):
                image = Image.open(os.path.join(image_dir, filename))
                text = pytesseract.image_to_string(image)
                doc_images.append(Document(page_content=text, metadata={"source": filename}))

    # Load PDFs using PyPDFDirectoryLoader and split into overlapping chunks.
    docs = []
    if os.path.isdir(pdf_dir):
        loader = PyPDFDirectoryLoader(pdf_dir)
        docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = splitter.split_documents(docs) + doc_images

    # BUG FIX: FAISS.from_documents fails with an opaque error on empty input;
    # fail early with an actionable message instead.
    if not documents:
        raise ValueError(f"No supported documents found for user '{user_id}'")

    # Attach traceable ids/citations to each chunk's metadata.
    updated_documents = []
    for i, doc in enumerate(documents):
        meta = doc.metadata.copy()
        meta["doc_id"] = meta.get("source", f"doc_{i}")  # filename or fallback
        meta["chunk_id"] = i
        # Page number is only present for PDF-derived chunks.
        if "page" in meta:
            meta["citation"] = f"{meta['source']} - page {meta['page']}, chunk {i}"
        else:
            meta["citation"] = f"{meta['source']} - chunk {i}"
        updated_documents.append(Document(page_content=doc.page_content, metadata=meta))

    # Load the HuggingFace embedding model (must match load_user_vectorstore).
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    # Append to the existing FAISS index if one exists, otherwise create it.
    if os.path.exists(os.path.join(faiss_dir, "index.faiss")):
        vectorstore = FAISS.load_local(faiss_dir, embeddings, allow_dangerous_deserialization=True)
        vectorstore.add_documents(updated_documents)
    else:
        vectorstore = FAISS.from_documents(updated_documents, embeddings)

    vectorstore.save_local(faiss_dir)
    print(f"✅ FAISS updated for user: {user_id}")
+
requirements.txt ADDED
Binary file (7.57 kB). View file