first commit
Browse files- Dockerfile +19 -0
- app/core/config.py +1 -0
- app/main.py +19 -0
- app/models/embeddings.py +10 -0
- app/routes/qa.py +47 -0
- app/utils/chunking.py +12 -0
- app/utils/file_parser.py +12 -0
- requirements.txt +9 -0
Dockerfile
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Slim Python 3.10 base image for the FastAPI RAG service.
FROM python:3.10-slim

WORKDIR /app

# Native libraries — presumably required by the PDF tooling (poppler-utils)
# and compiled Python wheels; build-essential covers packages that must
# compile C extensions. apt list cache is removed to keep the layer small.
RUN apt-get update && apt-get install -y \
    build-essential \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext6 \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the pip layer is cached across source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Serve app.main:app on 0.0.0.0:7860 (7860 is the conventional
# Hugging Face Spaces port — presumably the deployment target).
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/core/config.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
import os

# Allowed CORS origin consumed by app/main.py.
# Generalized: configurable via FRONTEND_URL, defaulting to the original
# wildcard so existing deployments keep working.
# NOTE(review): "*" combined with allow_credentials=True is rejected by
# browsers per the CORS spec — set FRONTEND_URL to the real frontend
# origin in production.
URL = os.getenv("FRONTEND_URL", "*")
app/main.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""FastAPI application entry point: CORS setup and route registration."""

from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from app.core.config import URL
from app.routes.qa import router as qa_router

# Populate os.environ from .env before any request handling begins.
load_dotenv()

app = FastAPI()

# NOTE(review): URL is "*" by default; browsers reject a wildcard origin
# when allow_credentials=True — confirm a concrete frontend origin is
# configured for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[URL],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# All QA endpoints are served under /api (e.g. POST /api/ask).
app.include_router(qa_router, prefix="/api")
|
app/models/embeddings.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer
import faiss

# Loaded once at import time so every request reuses the same weights.
model = SentenceTransformer('all-MiniLM-L6-v2')


def embed_and_index_chunks(chunks):
    """Embed text chunks and build an exact-L2 FAISS index over them.

    Args:
        chunks: non-empty list of text strings.

    Returns:
        (index, embeddings) where index is a faiss.IndexFlatL2 containing
        one vector per chunk, and embeddings is the (n_chunks, dim)
        numpy array produced by the sentence-transformer.

    Raises:
        ValueError: if chunks is empty (previously this crashed with a
            bare IndexError on embeddings.shape[1]).
    """
    if not chunks:
        raise ValueError("chunks must be a non-empty list of strings")
    embeddings = model.encode(chunks, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
|
app/routes/qa.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, UploadFile, File, Form
import tempfile
import shutil
from app.models.embeddings import embed_and_index_chunks, model
from app.utils.file_parser import extract_text
from app.utils.chunking import chunk_text
import numpy as np
import os
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()

router = APIRouter()

# Gemini client is configured once at import time from the environment.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini = genai.GenerativeModel("gemini-2.5-pro")


@router.post("/ask")
async def ask_doc(file: UploadFile = File(...), query: str = Form(...)):
    """Answer *query* against an uploaded PDF/DOCX document.

    The document is parsed, chunked, embedded and indexed per request;
    the nearest chunks to the query embedding are passed to Gemini,
    which is prompted to return a JSON decision.

    Raises:
        ValueError: propagated from extract_text for unsupported types.
    """
    # Preserve the real extension so extract_text can dispatch on it.
    # fix: the old f".{filename.split('.')[-1]}" produced a bogus
    # ".wholefilename" suffix when the filename had no dot.
    suffix = os.path.splitext(file.filename or "")[1] or ".bin"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    try:
        text = extract_text(tmp_path)
    finally:
        os.unlink(tmp_path)  # fix: temp file was leaked on every request

    chunks = chunk_text(text)
    if not chunks:
        # Nothing to index — avoid crashing embed_and_index_chunks.
        return {"result": "No extractable text found in the document."}

    index, _ = embed_and_index_chunks(chunks)

    query_embedding = model.encode([query], convert_to_numpy=True)
    # fix: k > len(chunks) makes FAISS pad result ids with -1, which the
    # list indexing below silently mapped to the *last* chunk.
    k = min(3, len(chunks))
    _, neighbor_ids = index.search(query_embedding, k=k)
    retrieved_chunks = [chunks[i] for i in neighbor_ids[0]]
    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
    User Query: {query}

    Relevant Clauses from Document:
    {context}

    Based on the above, return a JSON with:
    - decision (approved/rejected)
    - amount (if applicable)
    - justification with referenced clauses
    """

    response = gemini.generate_content(prompt)
    return {"result": response.text}
|
app/utils/chunking.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def chunk_text(text, chunk_size=300):
    """Split *text* into sentence-aligned chunks of roughly chunk_size chars.

    Sentences are delimited naively by ".". They are greedily packed into
    the current chunk until adding the next sentence would reach
    chunk_size; a single sentence longer than chunk_size becomes its own
    chunk.

    Args:
        text: raw document text.
        chunk_size: soft upper bound on chunk length, in characters.

    Returns:
        List of non-empty, stripped chunk strings. (The previous version
        emitted an empty-string chunk when the first sentence exceeded
        chunk_size, and appended a stray extra "." from the empty
        fragment after a trailing period — both fixed here.)
    """
    chunks, current = [], ""
    for sentence in text.split("."):
        if not sentence.strip():
            continue  # drop empty fragments (e.g. after a trailing ".")
        piece = sentence + "."
        if current and len(current) + len(piece) >= chunk_size:
            # Current chunk is full: flush it and start a new one.
            chunks.append(current.strip())
            current = piece
        else:
            current += piece
    if current.strip():
        chunks.append(current.strip())
    return chunks
|
app/utils/file_parser.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fitz  # PyMuPDF
from docx import Document


def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX file.

    Args:
        file_path: path to the document; dispatch is by file extension,
            matched case-insensitively (generalized: ".PDF" now works).

    Returns:
        The document text, pages/paragraphs joined with newlines.

    Raises:
        ValueError: for any extension other than .pdf/.docx.
    """
    lower_path = file_path.lower()
    if lower_path.endswith(".pdf"):
        doc = fitz.open(file_path)
        try:
            return "\n".join(page.get_text() for page in doc)
        finally:
            doc.close()  # fix: document handle was never closed before
    elif lower_path.endswith(".docx"):
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        raise ValueError("Unsupported file type")
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn
|
3 |
+
python-multipart
|
4 |
+
sentence-transformers==2.2.2
|
5 |
+
faiss-cpu
|
6 |
+
PyMuPDF
|
7 |
+
python-docx
|
8 |
+
python-dotenv
|
9 |
+
google-generativeai==0.8.5
|