ah707
committed on
Commit
·
e539f46
0
Parent(s):
Initial deploy
Browse files- README.md +0 -0
- app.py +0 -0
- main.py +127 -0
- parser.py +28 -0
- rag.py +50 -0
- requirement.txt +6 -0
- static/index.html +177 -0
README.md
ADDED
File without changes
|
app.py
ADDED
File without changes
|
main.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
3 |
+
from pydantic import BaseModel
|
4 |
+
import os
|
5 |
+
import logging
|
6 |
+
from parser import parse_pdf, parse_text
|
7 |
+
from rag import RAG
|
8 |
+
import requests
|
9 |
+
|
10 |
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Simple NotebookLM-like AI Tool")

# Allow a browser frontend served from another origin (e.g. http://localhost:5500)
# to call this backend.
# Credentials are disabled while origins are wildcarded: combining "*" with
# credentialed requests would let any website issue authenticated calls, and the
# bundled frontend sends no cookies or auth headers anyway.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Replace with ["http://localhost:5500"] for tighter security
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Shared retrieval helper used by the upload and query endpoints.
rag = RAG()
# In-memory registry of uploaded files; it is not persisted across restarts.
documents = {}  # {file_id: {"file_path": str, "file_name": str}}
|
26 |
+
|
27 |
+
class QueryRequest(BaseModel):
    """JSON body accepted by the ``/query`` endpoint."""

    # Natural-language question to answer from the document.
    question: str
    # Identifier returned by ``/upload`` for the document to search.
    file_id: str
|
30 |
+
|
31 |
+
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Store an uploaded PDF or text file, extract its text and index it.

    Args:
        file: The multipart upload. Only ``application/pdf``, ``text/plain``
            and ``application/octet-stream`` content types are accepted.

    Returns:
        ``{"file_id": ..., "file_name": ...}`` for the stored document.

    Raises:
        HTTPException: 400 for an unsupported content type, 500 when the
            file cannot be processed or embedded.
    """
    # Local import keeps this endpoint self-contained; uuid is stdlib.
    import uuid

    try:
        # The client may omit the filename; fall back to a neutral one.
        filename = file.filename or "upload"
        logger.info(f"Received file: {filename}, type: {file.content_type}")

        if file.content_type not in ["application/pdf", "text/plain", "application/octet-stream"]:
            raise HTTPException(status_code=400, detail="Only PDF or text files allowed")

        # A random id cannot collide with chunks persisted by an earlier run.
        # A counter based on len(documents) restarts at 1 after every restart,
        # while the vector-store metadata survives on disk.
        file_id = uuid.uuid4().hex
        # Strip path separators so the name cannot escape the uploads folder.
        safe_name = filename.replace("/", "_").replace("\\", "_")
        file_path = f"uploads/{file_id}_{safe_name}"

        os.makedirs("uploads", exist_ok=True)
        file_bytes = await file.read()

        with open(file_path, "wb") as f:
            f.write(file_bytes)

        # Case-insensitive so "REPORT.PDF" is parsed as a PDF too.
        is_pdf = filename.lower().endswith(".pdf")
        text = parse_pdf(file_bytes) if is_pdf else parse_text(file_bytes)
        logger.info(f"Extracted text length: {len(text.strip())}")

        if not text.strip():
            logger.warning("No text extracted. Assigning default placeholder text.")
            text = "No extractable text found in this file."

        try:
            await rag.embed_document(file_id, text)
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            raise HTTPException(status_code=500, detail="Embedding failed")

        # Register only after indexing succeeded, so a failed upload does not
        # leave a queryable-but-unindexed entry behind.
        documents[file_id] = {"file_path": file_path, "file_name": filename}

        return {"file_id": file_id, "file_name": filename}

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"Upload failed: {e}")
        raise HTTPException(status_code=500, detail="Failed to process file")
|
71 |
+
|
72 |
+
@app.post("/query")
async def query_file(request: QueryRequest):
    """Answer a question about a previously uploaded document.

    Retrieves matching chunks for ``request.file_id`` and asks the local
    generation service at ``http://localhost:11434`` to answer from them.

    Args:
        request: The question and the id of the document to search.

    Returns:
        ``{"answer": ...}``; a fixed message when no chunk matched.

    Raises:
        HTTPException: 404 for an unknown ``file_id``, 500 on any failure
            while retrieving context or generating the answer.
    """
    # Local import: fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if request.file_id not in documents:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        context = await rag.query_document(request.question, request.file_id)
        if not context:
            return {"answer": "No relevant information found in the document."}

        context_text = "\n".join(context)
        # The context is capped at 5000 characters to bound the prompt size.
        prompt = f"Using the following context, answer the question: {request.question}\n\nContext: {context_text[:5000]}"

        # requests is blocking: run it in a worker thread so the event loop
        # keeps serving other requests, and bound it with a
        # (connect, read) timeout so a stuck model cannot hang forever.
        response = await run_in_threadpool(
            requests.post,
            "http://localhost:11434/api/generate",
            json={"model": "qwen:7b", "prompt": prompt, "stream": False},
            timeout=(5, 300),
        )
        response.raise_for_status()
        data = response.json()
        answer = data.get("response", "Error generating answer")

        logger.info(f"Question: {request.question}, Answer: {answer[:50]}...")
        return {"answer": answer}

    except Exception as e:
        logger.error(f"Query failed: {e}")
        raise HTTPException(status_code=500, detail="Error answering question")
|
99 |
+
|
100 |
+
@app.post("/summarize")
async def summarize_file(file_id: str):
    """Summarize a previously uploaded document in 100 words or less.

    Re-reads the stored file, extracts its text and asks the local
    generation service at ``http://localhost:11434`` for a summary of the
    first 5000 characters.

    Args:
        file_id: Id returned by ``/upload``. Declared as a plain ``str``
            parameter, so FastAPI reads it from the query string
            (``POST /summarize?file_id=...``), not from a JSON body.

    Returns:
        ``{"summary": ...}``.

    Raises:
        HTTPException: 404 for an unknown ``file_id``, 500 when the file
            has no text or the summary cannot be generated.
    """
    # Local import: fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if file_id not in documents:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        entry = documents[file_id]
        with open(entry["file_path"], "rb") as f:
            content = f.read()

        # Case-insensitive so "REPORT.PDF" is parsed as a PDF too.
        is_pdf = (entry["file_name"] or "").lower().endswith(".pdf")
        text = parse_pdf(content) if is_pdf else parse_text(content)

        if not text.strip():
            raise ValueError("No text found to summarize.")

        prompt = f"Summarize this text in 100 words or less: {text[:5000]}"
        # requests is blocking: run it in a worker thread so the event loop
        # keeps serving other requests, and bound it with a
        # (connect, read) timeout so a stuck model cannot hang forever.
        response = await run_in_threadpool(
            requests.post,
            "http://localhost:11434/api/generate",
            json={"model": "qwen:7b", "prompt": prompt, "stream": False},
            timeout=(5, 300),
        )
        response.raise_for_status()
        data = response.json()
        summary = data.get("response", "Error generating summary")

        logger.info(f"Summary for file {file_id}: {summary[:50]}...")
        return {"summary": summary}

    except Exception as e:
        logger.error(f"Summarization failed: {e}")
        raise HTTPException(status_code=500, detail="Error generating summary")
|
parser.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from PyPDF2 import PdfReader
|
2 |
+
from io import BytesIO
|
3 |
+
import logging
|
4 |
+
|
5 |
+
logger = logging.getLogger(__name__)
|
6 |
+
|
7 |
+
def parse_pdf(file_content):
    """Extract the text of every page of a PDF given as raw bytes.

    Args:
        file_content: The PDF file contents as ``bytes``.

    Returns:
        The text of all pages that yielded text, joined with newlines;
        an empty string when the PDF cannot be parsed or has no text.
    """
    try:
        reader = PdfReader(BytesIO(file_content))
        page_texts = (page.extract_text() for page in reader.pages)
        # Join with a newline so the last word of one page does not fuse
        # with the first word of the next; pages without text are skipped.
        return "\n".join(text for text in page_texts if text)
    except Exception as e:
        # Best-effort: a broken PDF is reported as "no text", not an error.
        logger.error(f"PDF parse error: {e}")
        return ""
|
19 |
+
|
20 |
+
def parse_text(file_content):
    """Decode raw file bytes into text.

    Tries UTF-8 first (dropping a leading byte-order mark if present) and
    falls back to Latin-1, which maps every byte to a character and so
    cannot fail.

    Args:
        file_content: The file contents as ``bytes``.

    Returns:
        The decoded text.
    """
    try:
        # "utf-8-sig" behaves like UTF-8 but strips a leading BOM, which
        # would otherwise end up as a stray "\ufeff" in the text.
        return file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        return file_content.decode("latin-1")
|
rag.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer
|
2 |
+
import faiss
|
3 |
+
import numpy as np
|
4 |
+
import textwrap
|
5 |
+
import pickle
|
6 |
+
import os
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
class RAG:
    """Chunk, embed and retrieve document text with a FAISS index.

    Vectors live in a flat L2 index of dimension 384; ``metadata`` maps each
    vector's position in the index to ``{"file_id": ..., "text": ...}``.
    Both are persisted under ``faiss_index/`` after every added document.
    """

    def __init__(self):
        """Load the embedding model and any previously persisted index."""
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.index = faiss.IndexFlatL2(384)
        self.metadata = {}
        self.index_file = "faiss_index/index.faiss"
        self.metadata_file = "faiss_index/metadata.pkl"
        os.makedirs("faiss_index", exist_ok=True)

        # Restore only when both halves exist: an index without its metadata
        # (or the reverse) is unusable, so start empty instead of crashing.
        if os.path.exists(self.index_file) and os.path.exists(self.metadata_file):
            self.index = faiss.read_index(self.index_file)
            # NOTE(security): pickle executes code on load. This is acceptable
            # only because the file is written by this class; never point
            # metadata_file at data from an untrusted source.
            with open(self.metadata_file, "rb") as f:
                self.metadata = pickle.load(f)

    async def embed_document(self, file_id: str, text: str):
        """Split ``text`` into chunks, embed them and persist the index.

        Args:
            file_id: Identifier stored with every chunk of this document.
            text: Full document text; wrapped into chunks of up to 500
                characters.

        Raises:
            ValueError: If ``text`` yields no chunks.
        """
        chunks = textwrap.wrap(text, 500)
        if not chunks:
            logger.warning(f"Empty document provided for file_id {file_id}")
            raise ValueError("Document contains no text to embed.")

        embeddings = self.embedder.encode(chunks)
        # FAISS assigns sequential positions starting at ntotal; using the
        # same start keeps metadata keys aligned with search results.
        first_id = self.index.ntotal
        self.index.add(np.asarray(embeddings, dtype="float32"))

        for offset, chunk in enumerate(chunks):
            self.metadata[first_id + offset] = {"file_id": file_id, "text": chunk}

        faiss.write_index(self.index, self.index_file)
        with open(self.metadata_file, "wb") as f:
            pickle.dump(self.metadata, f)

    async def query_document(self, question: str, file_id: str, top_k: int = 3) -> list:
        """Return the chunks of one document closest to ``question``.

        Args:
            question: The query text to embed.
            file_id: Only chunks stored under this id are returned.
            top_k: Maximum number of chunks to return.

        Returns:
            Up to ``top_k`` chunk texts, nearest first; empty when the index
            is empty or holds no chunk for ``file_id``.
        """
        total = self.index.ntotal
        if total == 0:
            return []

        query = np.asarray(self.embedder.encode([question]), dtype="float32")
        # Rank every stored vector and filter afterwards. Taking only the
        # global top 3 first would return nothing whenever chunks of other
        # documents happen to rank higher than this document's chunks.
        _, indices = self.index.search(query, total)

        matches = []
        for idx in indices[0]:
            # FAISS pads missing results with -1, which has no metadata.
            entry = self.metadata.get(int(idx))
            if entry is None or entry["file_id"] != file_id:
                continue
            matches.append(entry["text"])
            if len(matches) >= top_k:
                break
        return matches
|
requirement.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.103.0
|
2 |
+
uvicorn==0.23.2
|
3 |
+
PyPDF2==3.0.1
|
4 |
+
sentence-transformers==2.2.2
|
5 |
+
faiss-cpu-windows==1.7.4
|
6 |
+
requests==2.31.0
|
static/index.html
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8" />
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
6 |
+
<title>Simple NotebookLM-like Tool</title>
|
7 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
8 |
+
<style>
|
9 |
+
body {
|
10 |
+
background-color: #f8f9fa;
|
11 |
+
display: flex;
|
12 |
+
justify-content: center;
|
13 |
+
align-items: center;
|
14 |
+
min-height: 100vh;
|
15 |
+
margin: 0;
|
16 |
+
font-family: Arial, sans-serif;
|
17 |
+
}
|
18 |
+
.container {
|
19 |
+
max-width: 600px;
|
20 |
+
background: white;
|
21 |
+
padding: 20px;
|
22 |
+
border-radius: 8px;
|
23 |
+
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
24 |
+
}
|
25 |
+
h1 {
|
26 |
+
font-size: 1.8rem;
|
27 |
+
text-align: center;
|
28 |
+
margin-bottom: 20px;
|
29 |
+
}
|
30 |
+
.form-section {
|
31 |
+
margin-bottom: 20px;
|
32 |
+
}
|
33 |
+
#result, #summary {
|
34 |
+
white-space: pre-wrap;
|
35 |
+
background: #f1f1f1;
|
36 |
+
padding: 10px;
|
37 |
+
border-radius: 4px;
|
38 |
+
min-height: 50px;
|
39 |
+
}
|
40 |
+
.btn {
|
41 |
+
width: 100%;
|
42 |
+
}
|
43 |
+
input[type="file"], input[type="text"] {
|
44 |
+
margin-bottom: 10px;
|
45 |
+
}
|
46 |
+
</style>
|
47 |
+
</head>
|
48 |
+
<body>
|
49 |
+
<div class="container">
|
50 |
+
<h1>NotebookLM-like Tool</h1>
|
51 |
+
|
52 |
+
<!-- File Upload Section -->
|
53 |
+
<div class="form-section">
|
54 |
+
<h3>Upload File</h3>
|
55 |
+
<input type="file" id="fileInput" accept=".pdf,.txt" class="form-control">
|
56 |
+
<button onclick="uploadFile()" class="btn btn-primary mt-2">Upload</button>
|
57 |
+
<div id="uploadStatus" class="mt-2"></div>
|
58 |
+
</div>
|
59 |
+
|
60 |
+
<!-- Question Input Section -->
|
61 |
+
<div class="form-section">
|
62 |
+
<h3>Ask a Question</h3>
|
63 |
+
<input type="text" id="questionInput" placeholder="Enter your question" class="form-control">
|
64 |
+
<button onclick="askQuestion()" class="btn btn-primary mt-2">Ask</button>
|
65 |
+
<div id="result" class="mt-2"></div>
|
66 |
+
</div>
|
67 |
+
|
68 |
+
<!-- Summary Section -->
|
69 |
+
<div class="form-section">
|
70 |
+
<h3>Summary</h3>
|
71 |
+
<button onclick="getSummary()" class="btn btn-secondary">Get Summary</button>
|
72 |
+
<div id="summary" class="mt-2"></div>
|
73 |
+
</div>
|
74 |
+
</div>
|
75 |
+
|
76 |
+
<!-- Bootstrap JS Bundle -->
|
77 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
78 |
+
<script>
|
79 |
+
let fileId = null;
|
80 |
+
|
81 |
+
// Upload the selected file to the backend and remember the returned file id
// (in the shared `fileId` variable) for later question and summary requests.
async function uploadFile() {
  const fileInput = document.getElementById('fileInput');
  const uploadStatus = document.getElementById('uploadStatus');
  const resultDiv = document.getElementById('result');
  const summaryDiv = document.getElementById('summary');

  // Reset old states
  resultDiv.innerText = '';
  summaryDiv.innerText = '';
  uploadStatus.className = "mt-2 text-info";

  if (!fileInput.files[0]) {
    uploadStatus.innerText = 'Please select a file.';
    uploadStatus.className = "mt-2 text-danger";
    return;
  }

  const formData = new FormData();
  formData.append('file', fileInput.files[0]);

  try {
    uploadStatus.innerText = 'Uploading...';
    const response = await fetch('http://localhost:8000/upload', {
      method: 'POST',
      body: formData
    });
    const data = await response.json();

    if (response.ok) {
      fileId = data.file_id;
      uploadStatus.innerText = `✅ File uploaded: ${data.file_name} (ID: ${fileId})`;
      uploadStatus.className = "mt-2 text-success";
    } else {
      // Validation errors carry a structured `detail`; stringify it so the
      // user does not see "[object Object]".
      const detail = typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail);
      uploadStatus.innerText = `❌ Error: ${detail}`;
      uploadStatus.className = "mt-2 text-danger";
    }
  } catch (error) {
    uploadStatus.innerText = '❌ Upload failed due to server/network error.';
    uploadStatus.className = "mt-2 text-danger";
    console.error(error);
  }
}
|
123 |
+
|
124 |
+
// Send the typed question, together with the id of the uploaded file, to the
// backend and show the answer (or the error) in the result box.
async function askQuestion() {
  const questionInput = document.getElementById('questionInput');
  const resultDiv = document.getElementById('result');
  const question = questionInput.value.trim();

  if (!fileId) {
    resultDiv.innerText = '⚠️ Please upload a file first.';
    return;
  }
  if (!question) {
    resultDiv.innerText = '⚠️ Please enter a question.';
    return;
  }

  try {
    resultDiv.innerText = 'Fetching answer...';
    const response = await fetch('http://localhost:8000/query', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ question, file_id: fileId })
    });
    const data = await response.json();

    if (response.ok) {
      resultDiv.innerText = data.answer;
    } else {
      // Validation errors carry a structured `detail`; stringify it so the
      // user does not see "[object Object]".
      const detail = typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail);
      resultDiv.innerText = `❌ Error: ${detail}`;
    }
  } catch (error) {
    resultDiv.innerText = '❌ Error fetching answer.';
    console.error(error);
  }
}
|
152 |
+
|
153 |
+
// Request a summary of the uploaded file and show it (or the error) in the
// summary box.
async function getSummary() {
  const summaryDiv = document.getElementById('summary');

  if (!fileId) {
    summaryDiv.innerText = '⚠️ Please upload a file first.';
    return;
  }

  try {
    summaryDiv.innerText = 'Fetching summary...';
    // The backend declares `file_id` as a plain function parameter, which it
    // reads from the query string. Sending it in a JSON body is rejected as a
    // missing parameter, so pass it in the URL.
    const url = `http://localhost:8000/summarize?file_id=${encodeURIComponent(fileId)}`;
    const response = await fetch(url, { method: 'POST' });
    const data = await response.json();

    if (response.ok) {
      summaryDiv.innerText = data.summary;
    } else {
      // Validation errors carry a structured `detail`; stringify it so the
      // user does not see "[object Object]".
      const detail = typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail);
      summaryDiv.innerText = `❌ Error: ${detail}`;
    }
  } catch (error) {
    summaryDiv.innerText = '❌ Error fetching summary.';
    console.error(error);
  }
}
|
175 |
+
</script>
|
176 |
+
</body>
|
177 |
+
</html>
|