ah707 committed on
Commit
e539f46
·
0 Parent(s):

Initial deploy

Browse files
Files changed (7) hide show
  1. README.md +0 -0
  2. app.py +0 -0
  3. main.py +127 -0
  4. parser.py +28 -0
  5. rag.py +50 -0
  6. requirement.txt +6 -0
  7. static/index.html +177 -0
README.md ADDED
File without changes
app.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ import os
5
+ import logging
6
+ from parser import parse_pdf, parse_text
7
+ from rag import RAG
8
+ import requests
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+ app = FastAPI(title="Simple NotebookLM-like AI Tool")
14
+
15
# Allow the browser frontend (e.g. served from http://localhost:5500) to call
# this backend from a different origin.
# Credentials are disabled on purpose: a wildcard origin combined with
# credentialed requests would let any website act with a user's cookies, and
# the bundled frontend never sends credentials with its fetch() calls.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Replace with ["http://localhost:5500"] for tighter security
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
23
+
24
+ rag = RAG()
25
+ documents = {} # {file_id: {"file_path": str, "file_name": str}}
26
+
27
class QueryRequest(BaseModel):
    """JSON body accepted by the ``/query`` endpoint."""

    # Question to answer using the document's indexed text.
    question: str
    # Identifier of the target document, as returned by ``/upload``.
    file_id: str
30
+
31
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Store an uploaded PDF or text file and index its text for retrieval.

    The raw bytes are written under ``uploads/``, the text is extracted with
    ``parse_pdf`` or ``parse_text`` and handed to ``rag.embed_document``.
    The document is only registered in ``documents`` once embedding succeeded,
    so a failed upload never leaves a queryable-but-unindexed entry behind.

    Returns:
        ``{"file_id": ..., "file_name": ...}`` for the stored document.

    Raises:
        HTTPException: 400 for an unsupported content type, 500 when
            embedding or any other processing step fails.
    """
    import uuid  # local import: only this handler needs it

    try:
        logger.info(f"Received file: {file.filename}, type: {file.content_type}")

        if file.content_type not in ["application/pdf", "text/plain", "application/octet-stream"]:
            raise HTTPException(status_code=400, detail="Only PDF or text files allowed")

        # The multipart part may carry no filename at all.
        original_name = file.filename or "upload"

        # A counter based on len(documents) restarts at 1 whenever the process
        # restarts, while the RAG metadata is persisted on disk; a random id
        # cannot collide with chunks indexed by an earlier run.
        file_id = uuid.uuid4().hex
        safe_name = original_name.replace("/", "_").replace("\\", "_")
        file_path = f"uploads/{file_id}_{safe_name}"

        os.makedirs("uploads", exist_ok=True)
        file_bytes = await file.read()

        with open(file_path, "wb") as f:
            f.write(file_bytes)

        # Compare the extension case-insensitively so "REPORT.PDF" is not
        # mistaken for a plain-text file.
        is_pdf = original_name.lower().endswith(".pdf")
        text = parse_pdf(file_bytes) if is_pdf else parse_text(file_bytes)
        logger.info(f"Extracted text length: {len(text.strip())}")

        if not text.strip():
            logger.warning("No text extracted. Assigning default placeholder text.")
            text = "No extractable text found in this file."

        try:
            await rag.embed_document(file_id, text)
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            raise HTTPException(status_code=500, detail="Embedding failed")

        documents[file_id] = {"file_path": file_path, "file_name": original_name}
        return {"file_id": file_id, "file_name": original_name}

    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as e:
        logger.error(f"Upload failed: {e}")
        raise HTTPException(status_code=500, detail="Failed to process file")
71
+
72
@app.post("/query")
async def query_file(request: QueryRequest):
    """Answer a question about an uploaded document.

    Retrieves the matching text chunks through ``rag.query_document`` and
    sends them, together with the question, to the local generation service
    at ``http://localhost:11434/api/generate``.

    Returns:
        ``{"answer": ...}``; a fixed "no relevant information" answer when
        retrieval yields nothing.

    Raises:
        HTTPException: 404 if ``file_id`` is unknown, 500 if retrieval or
            generation fails.
    """
    from fastapi.concurrency import run_in_threadpool

    if request.file_id not in documents:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        context = await rag.query_document(request.question, request.file_id)
        if not context:
            return {"answer": "No relevant information found in the document."}

        context_text = "\n".join(context)
        prompt = f"Using the following context, answer the question: {request.question}\n\nContext: {context_text[:5000]}"

        # requests is synchronous: run it in a worker thread so the event loop
        # keeps serving other requests, and bound the wait with a
        # (connect, read) timeout because requests never times out by default.
        response = await run_in_threadpool(
            requests.post,
            "http://localhost:11434/api/generate",
            json={"model": "qwen:7b", "prompt": prompt, "stream": False},
            timeout=(5, 300),
        )
        response.raise_for_status()
        data = response.json()
        answer = data.get("response", "Error generating answer")

        logger.info(f"Question: {request.question}, Answer: {answer[:50]}...")
        return {"answer": answer}

    except Exception as e:
        logger.error(f"Query failed: {e}")
        raise HTTPException(status_code=500, detail="Error answering question")
99
+
100
@app.post("/summarize")
async def summarize_file(file_id: str):
    """Produce a short summary of an uploaded document.

    Re-reads the stored file from disk, extracts its text and asks the local
    generation service at ``http://localhost:11434/api/generate`` for a
    summary of the first 5000 characters.

    Args:
        file_id: Identifier returned by ``/upload``. Declared as a plain
            ``str`` parameter, so it is not taken from a JSON request body.

    Returns:
        ``{"summary": ...}``.

    Raises:
        HTTPException: 404 if ``file_id`` is unknown, 500 if the file has no
            text or generation fails.
    """
    from fastapi.concurrency import run_in_threadpool

    if file_id not in documents:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        entry = documents[file_id]
        with open(entry["file_path"], "rb") as f:
            content = f.read()

        # Compare the extension case-insensitively so "REPORT.PDF" is parsed
        # as a PDF rather than decoded as text.
        is_pdf = entry["file_name"].lower().endswith(".pdf")
        text = parse_pdf(content) if is_pdf else parse_text(content)

        if not text.strip():
            raise ValueError("No text found to summarize.")

        prompt = f"Summarize this text in 100 words or less: {text[:5000]}"

        # requests is synchronous: run it in a worker thread so the event loop
        # keeps serving other requests, and bound the wait with a
        # (connect, read) timeout because requests never times out by default.
        response = await run_in_threadpool(
            requests.post,
            "http://localhost:11434/api/generate",
            json={"model": "qwen:7b", "prompt": prompt, "stream": False},
            timeout=(5, 300),
        )
        response.raise_for_status()
        data = response.json()
        summary = data.get("response", "Error generating summary")

        logger.info(f"Summary for file {file_id}: {summary[:50]}...")
        return {"summary": summary}

    except Exception as e:
        logger.error(f"Summarization failed: {e}")
        raise HTTPException(status_code=500, detail="Error generating summary")
parser.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from io import BytesIO
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
def parse_pdf(file_content):
    """Extract the text of every page of a PDF given as raw bytes.

    Pages that yield no text are skipped. The remaining pages are joined with
    a newline so the last word of one page is not fused with the first word
    of the next.

    Returns:
        The extracted text, or ``""`` if the PDF cannot be read (the error is
        logged rather than raised).
    """
    try:
        reader = PdfReader(BytesIO(file_content))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(extracted for extracted in page_texts if extracted)
    except Exception as e:
        # Best effort by design: callers treat "" as "nothing extractable".
        logger.error(f"PDF parse error: {e}")
        return ""
19
+
20
def parse_text(file_content):
    """Decode the raw bytes of a text file into a string.

    UTF-8 is tried first; a leading byte-order mark is dropped instead of
    leaking into the text. Bytes that are not valid UTF-8 are decoded as
    Latin-1.

    Args:
        file_content: The file's raw bytes.

    Returns:
        The decoded text.
    """
    try:
        # "utf-8-sig" is plain UTF-8 that also strips an optional leading BOM.
        return file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every byte value to a code point, so this decode cannot
        # fail and needs no further error handling.
        return file_content.decode("latin1")
rag.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import faiss
3
+ import numpy as np
4
+ import textwrap
5
+ import pickle
6
+ import os
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class RAG:
    """Chunk store for retrieval: sentence embeddings in a FAISS L2 index.

    Each indexed chunk has an entry in ``self.metadata`` keyed by its position
    in the index and holding the owning ``file_id`` and the chunk text. Index
    and metadata are persisted under ``faiss_index/`` after every insert and
    restored on construction.
    """

    def __init__(self):
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        # 384 must match the dimension of the vectors the embedder produces.
        self.index = faiss.IndexFlatL2(384)
        self.metadata = {}
        self.index_file = "faiss_index/index.faiss"
        self.metadata_file = "faiss_index/metadata.pkl"
        os.makedirs("faiss_index", exist_ok=True)

        # Restore only when both halves exist: an index without its metadata
        # cannot be mapped back to text, and opening a missing metadata file
        # would crash start-up.
        if os.path.exists(self.index_file) and os.path.exists(self.metadata_file):
            self.index = faiss.read_index(self.index_file)
            # NOTE(security): pickle can execute arbitrary code while loading;
            # this file must only ever be one written by this application.
            with open(self.metadata_file, "rb") as f:
                self.metadata = pickle.load(f)

    async def embed_document(self, file_id: str, text: str):
        """Split ``text`` into chunks, index them and persist the store.

        Args:
            file_id: Identifier recorded with every chunk of this document.
            text: Full document text; wrapped into chunks of at most 500
                characters.

        Raises:
            ValueError: If ``text`` yields no chunks.
        """
        chunks = textwrap.wrap(text, 500)
        if not chunks:
            logger.warning(f"Empty document provided for file_id {file_id}")
            raise ValueError("Document contains no text to embed.")

        embeddings = np.asarray(self.embedder.encode(chunks), dtype="float32")

        # Metadata keys must equal the positions the index assigns to the new
        # vectors; taking the index's own size as the base keeps both aligned
        # even if the metadata dict has drifted.
        first_id = int(self.index.ntotal)
        self.index.add(embeddings)
        for offset, chunk in enumerate(chunks):
            self.metadata[first_id + offset] = {"file_id": file_id, "text": chunk}

        faiss.write_index(self.index, self.index_file)
        with open(self.metadata_file, "wb") as f:
            pickle.dump(self.metadata, f)

    async def query_document(self, question: str, file_id: str, top_k: int = 3) -> list:
        """Return the chunks of one document closest to ``question``.

        Args:
            question: Text to embed and search with.
            file_id: Only chunks recorded for this document are returned.
            top_k: Maximum number of chunks to return.

        Returns:
            Up to ``top_k`` chunk texts in the order the index ranked them;
            an empty list if the index is empty or nothing matches.
        """
        total = int(self.index.ntotal)
        if total == 0 or top_k <= 0:
            return []

        query = np.asarray(self.embedder.encode([question]), dtype="float32")

        # Rank every stored chunk and filter afterwards: taking only the
        # global top_k first would drop this document's chunks whenever other
        # documents rank higher.
        _, indices = self.index.search(query, k=total)

        matches = []
        for idx in indices[0]:
            entry = self.metadata.get(int(idx))
            if entry is not None and entry["file_id"] == file_id:
                matches.append(entry["text"])
                if len(matches) == top_k:
                    break
        return matches
requirement.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi==0.103.0
2
+ uvicorn==0.23.2
3
+ PyPDF2==3.0.1
4
+ sentence-transformers==2.2.2
5
+ faiss-cpu-windows==1.7.4
6
+ requests==2.31.0
static/index.html ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>Simple NotebookLM-like Tool</title>
7
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
8
+ <style>
9
+ body {
10
+ background-color: #f8f9fa;
11
+ display: flex;
12
+ justify-content: center;
13
+ align-items: center;
14
+ min-height: 100vh;
15
+ margin: 0;
16
+ font-family: Arial, sans-serif;
17
+ }
18
+ .container {
19
+ max-width: 600px;
20
+ background: white;
21
+ padding: 20px;
22
+ border-radius: 8px;
23
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
24
+ }
25
+ h1 {
26
+ font-size: 1.8rem;
27
+ text-align: center;
28
+ margin-bottom: 20px;
29
+ }
30
+ .form-section {
31
+ margin-bottom: 20px;
32
+ }
33
+ #result, #summary {
34
+ white-space: pre-wrap;
35
+ background: #f1f1f1;
36
+ padding: 10px;
37
+ border-radius: 4px;
38
+ min-height: 50px;
39
+ }
40
+ .btn {
41
+ width: 100%;
42
+ }
43
+ input[type="file"], input[type="text"] {
44
+ margin-bottom: 10px;
45
+ }
46
+ </style>
47
+ </head>
48
+ <body>
49
+ <div class="container">
50
+ <h1>NotebookLM-like Tool</h1>
51
+
52
+ <!-- File Upload Section -->
53
+ <div class="form-section">
54
+ <h3>Upload File</h3>
55
+ <input type="file" id="fileInput" accept=".pdf,.txt" class="form-control">
56
+ <button onclick="uploadFile()" class="btn btn-primary mt-2">Upload</button>
57
+ <div id="uploadStatus" class="mt-2"></div>
58
+ </div>
59
+
60
+ <!-- Question Input Section -->
61
+ <div class="form-section">
62
+ <h3>Ask a Question</h3>
63
+ <input type="text" id="questionInput" placeholder="Enter your question" class="form-control">
64
+ <button onclick="askQuestion()" class="btn btn-primary mt-2">Ask</button>
65
+ <div id="result" class="mt-2"></div>
66
+ </div>
67
+
68
+ <!-- Summary Section -->
69
+ <div class="form-section">
70
+ <h3>Summary</h3>
71
+ <button onclick="getSummary()" class="btn btn-secondary">Get Summary</button>
72
+ <div id="summary" class="mt-2"></div>
73
+ </div>
74
+ </div>
75
+
76
+ <!-- Bootstrap JS Bundle -->
77
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
78
+ <script>
79
+ let fileId = null;
80
+
81
/**
 * Upload the selected file to the backend and remember the returned id.
 *
 * Clears any previous answer and summary, posts the file as multipart form
 * data to /upload and reports the outcome in the #uploadStatus element.
 * On success the module-level `fileId` is set so later questions and
 * summaries target this document.
 */
async function uploadFile() {
    const fileInput = document.getElementById('fileInput');
    const uploadStatus = document.getElementById('uploadStatus');
    const resultDiv = document.getElementById('result');
    const summaryDiv = document.getElementById('summary');

    // Reset old states
    resultDiv.innerText = '';
    summaryDiv.innerText = '';
    uploadStatus.className = "mt-2 text-info";

    if (!fileInput.files[0]) {
        uploadStatus.innerText = 'Please select a file.';
        uploadStatus.className = "mt-2 text-danger";
        return;
    }

    const formData = new FormData();
    formData.append('file', fileInput.files[0]);

    try {
        uploadStatus.innerText = 'Uploading...';
        const response = await fetch('http://localhost:8000/upload', {
            method: 'POST',
            body: formData
        });
        const data = await response.json();

        if (response.ok) {
            fileId = data.file_id;
            // The success marker was mis-encoded; restored to the check mark.
            uploadStatus.innerText = `✅ File uploaded: ${data.file_name} (ID: ${fileId})`;
            uploadStatus.className = "mt-2 text-success";
        } else {
            uploadStatus.innerText = `❌ Error: ${data.detail}`;
            uploadStatus.className = "mt-2 text-danger";
        }
    } catch (error) {
        uploadStatus.innerText = '❌ Upload failed due to server/network error.';
        uploadStatus.className = "mt-2 text-danger";
        console.error(error);
    }
}
123
+
124
/**
 * Send the typed question about the uploaded document to /query and show
 * the answer (or the error detail) in the #result element.
 */
async function askQuestion() {
    const inputEl = document.getElementById('questionInput');
    const outputEl = document.getElementById('result');
    const question = inputEl.value.trim();

    // Guard clauses: a document and a non-empty question are both required.
    if (!fileId) {
        outputEl.innerText = '⚠️ Please upload a file first.';
        return;
    }
    if (!question) {
        outputEl.innerText = '⚠️ Please enter a question.';
        return;
    }

    try {
        outputEl.innerText = 'Fetching answer...';

        const payload = JSON.stringify({ question: question, file_id: fileId });
        const response = await fetch('http://localhost:8000/query', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: payload
        });
        const data = await response.json();

        if (response.ok) {
            outputEl.innerText = data.answer;
        } else {
            outputEl.innerText = `❌ Error: ${data.detail}`;
        }
    } catch (error) {
        outputEl.innerText = '❌ Error fetching answer.';
        console.error(error);
    }
}
152
+
153
/**
 * Request a summary of the uploaded document from /summarize and show it
 * (or the error detail) in the #summary element.
 */
async function getSummary() {
    const summaryDiv = document.getElementById('summary');

    if (!fileId) {
        summaryDiv.innerText = '⚠️ Please upload a file first.';
        return;
    }

    try {
        summaryDiv.innerText = 'Fetching summary...';

        // The backend declares `file_id` as a plain string parameter, which
        // is read from the query string; sending it in a JSON body leaves
        // the parameter missing and the request is rejected.
        const url = `http://localhost:8000/summarize?file_id=${encodeURIComponent(fileId)}`;
        const response = await fetch(url, { method: 'POST' });
        const data = await response.json();

        if (response.ok) {
            summaryDiv.innerText = data.summary;
        } else {
            // Validation errors carry a structured `detail`; stringify it so
            // it is not rendered as "[object Object]".
            const detail = typeof data.detail === 'string'
                ? data.detail
                : JSON.stringify(data.detail);
            summaryDiv.innerText = `❌ Error: ${detail}`;
        }
    } catch (error) {
        summaryDiv.innerText = '❌ Error fetching summary.';
        console.error(error);
    }
}
175
+ </script>
176
+ </body>
177
+ </html>