Spaces:
Sleeping
Sleeping
Commit ·
f6bb754
1
Parent(s): b063f48
Updated 5th chapter of OS
Browse files
- rag/ingest.py +89 -58
- static/js/design.js +22 -16
rag/ingest.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
# rag/ingest.py
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
import torch
|
| 5 |
from dotenv import load_dotenv
|
|
@@ -11,25 +9,25 @@ from tqdm import tqdm
|
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
| 14 |
-
# ββ Config ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
-
PINECONE_API_KEY
|
| 16 |
-
PINECONE_INDEX
|
| 17 |
-
EMBEDDING_MODEL
|
| 18 |
-
DATA_DIR
|
| 19 |
-
CHUNK_SIZE
|
| 20 |
-
CHUNK_OVERLAP
|
| 21 |
-
BATCH_SIZE
|
| 22 |
-
DIMENSION
|
| 23 |
-
|
| 24 |
-
# ββ Device ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 26 |
print(f"[INFO] Using device: {device}")
|
| 27 |
|
| 28 |
-
# ββ Load Embedding Model βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
print("[INFO] Loading embedding model...")
|
| 30 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=device)
|
| 31 |
|
| 32 |
-
# ββ Pinecone Setup βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
pc = Pinecone(api_key=PINECONE_API_KEY)
|
| 34 |
|
| 35 |
if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
|
|
@@ -43,29 +41,46 @@ if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
|
|
| 43 |
|
| 44 |
index = pc.Index(PINECONE_INDEX)
|
| 45 |
|
| 46 |
-
# ββ
|
| 47 |
-
def
|
| 48 |
-
|
| 49 |
-
for
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def chunk_documents(docs: list) -> list:
|
| 70 |
splitter = RecursiveCharacterTextSplitter(
|
| 71 |
chunk_size=CHUNK_SIZE,
|
|
@@ -75,26 +90,23 @@ def chunk_documents(docs: list) -> list:
|
|
| 75 |
print(f"[INFO] Total chunks: {len(chunks)}")
|
| 76 |
return chunks
|
| 77 |
|
| 78 |
-
# ββ Embed & Upsert βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
-
def embed_and_upsert(chunks: list):
|
| 80 |
-
texts = [
|
| 81 |
-
f"passage: {chunk.page_content}" for chunk in chunks # e5 prefix
|
| 82 |
-
]
|
| 83 |
print("[INFO] Generating embeddings...")
|
| 84 |
all_vectors = []
|
| 85 |
|
| 86 |
for i in tqdm(range(0, len(texts), BATCH_SIZE)):
|
| 87 |
-
batch_texts
|
| 88 |
-
batch_chunks
|
| 89 |
-
embeddings
|
| 90 |
batch_texts,
|
| 91 |
normalize_embeddings=True,
|
| 92 |
show_progress_bar=False
|
| 93 |
)
|
| 94 |
-
vectors = []
|
| 95 |
for j, (emb, chunk) in enumerate(zip(embeddings, batch_chunks)):
|
| 96 |
-
|
| 97 |
-
"id": f"chunk-{i + j}",
|
| 98 |
"values": emb.tolist(),
|
| 99 |
"metadata": {
|
| 100 |
"text": chunk.page_content,
|
|
@@ -102,20 +114,39 @@ def embed_and_upsert(chunks: list):
|
|
| 102 |
"source": chunk.metadata.get("source", "unknown"),
|
| 103 |
}
|
| 104 |
})
|
| 105 |
-
all_vectors.extend(vectors)
|
| 106 |
|
| 107 |
-
# upsert in batches of 100 (Pinecone limit)
|
| 108 |
print("[INFO] Upserting to Pinecone...")
|
| 109 |
for i in tqdm(range(0, len(all_vectors), 100)):
|
| 110 |
index.upsert(vectors=all_vectors[i: i + 100])
|
| 111 |
|
| 112 |
-
print(f"[DONE] Upserted {len(all_vectors)} chunks
|
| 113 |
|
| 114 |
-
# ββ Main βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 115 |
if __name__ == "__main__":
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
| 119 |
exit(1)
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import torch
|
| 3 |
from dotenv import load_dotenv
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
| 12 |
+
# ── Config ────────────────────────────────────────────────────────────────────
|
| 13 |
+
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
| 14 |
+
PINECONE_INDEX = os.getenv("PINECONE_INDEX", "study-saathi")
|
| 15 |
+
EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
|
| 16 |
+
DATA_DIR = "data/os_notes"
|
| 17 |
+
CHUNK_SIZE = 512
|
| 18 |
+
CHUNK_OVERLAP = 64
|
| 19 |
+
BATCH_SIZE = 32
|
| 20 |
+
DIMENSION = 1024
|
| 21 |
+
|
| 22 |
+
# ββ Device βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 24 |
print(f"[INFO] Using device: {device}")
|
| 25 |
|
| 26 |
+
# ββ Load Embedding Model ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
print("[INFO] Loading embedding model...")
|
| 28 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=device)
|
| 29 |
|
| 30 |
+
# ββ Pinecone Setup ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
pc = Pinecone(api_key=PINECONE_API_KEY)
|
| 32 |
|
| 33 |
if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
|
|
|
|
| 41 |
|
| 42 |
index = pc.Index(PINECONE_INDEX)
|
| 43 |
|
| 44 |
+
# ββ Check if file already ingested βββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
def is_already_ingested(filename: str) -> bool:
|
| 46 |
+
"""
|
| 47 |
+
Query Pinecone for any vector whose metadata source == filename.
|
| 48 |
+
If found, the file was already ingested — skip it.
|
| 49 |
+
"""
|
| 50 |
+
topic = os.path.splitext(filename)[0]
|
| 51 |
+
|
| 52 |
+
# use a dummy zero vector just to run a metadata filter query
|
| 53 |
+
dummy_vector = [0.0] * DIMENSION
|
| 54 |
+
|
| 55 |
+
results = index.query(
|
| 56 |
+
vector=dummy_vector,
|
| 57 |
+
top_k=1,
|
| 58 |
+
include_metadata=True,
|
| 59 |
+
filter={"source": {"$eq": filename}}
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
return len(results["matches"]) > 0
|
| 63 |
+
|
| 64 |
+
# ββ Load Documents ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 65 |
+
def load_documents(filepath: str, filename: str) -> list:
|
| 66 |
+
if filename.endswith(".pdf"):
|
| 67 |
+
loader = PyPDFLoader(filepath)
|
| 68 |
+
elif filename.endswith(".txt"):
|
| 69 |
+
loader = TextLoader(filepath, encoding="utf-8")
|
| 70 |
+
else:
|
| 71 |
+
return []
|
| 72 |
+
|
| 73 |
+
loaded = loader.load()
|
| 74 |
+
topic = os.path.splitext(filename)[0]
|
| 75 |
+
|
| 76 |
+
for doc in loaded:
|
| 77 |
+
doc.metadata["topic"] = topic
|
| 78 |
+
doc.metadata["source"] = filename
|
| 79 |
+
|
| 80 |
+
print(f"[LOADED] {filename} → {len(loaded)} page(s)")
|
| 81 |
+
return loaded
|
| 82 |
+
|
| 83 |
+
# ββ Chunk Documents βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
def chunk_documents(docs: list) -> list:
|
| 85 |
splitter = RecursiveCharacterTextSplitter(
|
| 86 |
chunk_size=CHUNK_SIZE,
|
|
|
|
| 90 |
print(f"[INFO] Total chunks: {len(chunks)}")
|
| 91 |
return chunks
|
| 92 |
|
| 93 |
+
# ββ Embed & Upsert ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 94 |
+
def embed_and_upsert(chunks: list, filename: str):
|
| 95 |
+
texts = [f"passage: {chunk.page_content}" for chunk in chunks]
|
|
|
|
|
|
|
| 96 |
print("[INFO] Generating embeddings...")
|
| 97 |
all_vectors = []
|
| 98 |
|
| 99 |
for i in tqdm(range(0, len(texts), BATCH_SIZE)):
|
| 100 |
+
batch_texts = texts[i: i + BATCH_SIZE]
|
| 101 |
+
batch_chunks = chunks[i: i + BATCH_SIZE]
|
| 102 |
+
embeddings = embedder.encode(
|
| 103 |
batch_texts,
|
| 104 |
normalize_embeddings=True,
|
| 105 |
show_progress_bar=False
|
| 106 |
)
|
|
|
|
| 107 |
for j, (emb, chunk) in enumerate(zip(embeddings, batch_chunks)):
|
| 108 |
+
all_vectors.append({
|
| 109 |
+
"id": f"{os.path.splitext(filename)[0]}-chunk-{i + j}",
|
| 110 |
"values": emb.tolist(),
|
| 111 |
"metadata": {
|
| 112 |
"text": chunk.page_content,
|
|
|
|
| 114 |
"source": chunk.metadata.get("source", "unknown"),
|
| 115 |
}
|
| 116 |
})
|
|
|
|
| 117 |
|
|
|
|
| 118 |
print("[INFO] Upserting to Pinecone...")
|
| 119 |
for i in tqdm(range(0, len(all_vectors), 100)):
|
| 120 |
index.upsert(vectors=all_vectors[i: i + 100])
|
| 121 |
|
| 122 |
+
print(f"[DONE] Upserted {len(all_vectors)} chunks for '{filename}'.")
|
| 123 |
|
| 124 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 125 |
if __name__ == "__main__":
|
| 126 |
+
files = [f for f in os.listdir(DATA_DIR) if f.endswith((".pdf", ".txt"))]
|
| 127 |
+
|
| 128 |
+
if not files:
|
| 129 |
+
print("[ERROR] No files found in data/os_notes/")
|
| 130 |
exit(1)
|
| 131 |
+
|
| 132 |
+
print(f"[INFO] Found {len(files)} file(s): {files}\n")
|
| 133 |
+
|
| 134 |
+
for filename in files:
|
| 135 |
+
filepath = os.path.join(DATA_DIR, filename)
|
| 136 |
+
|
| 137 |
+
# ββ SKIP CHECK ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
if is_already_ingested(filename):
|
| 139 |
+
print(f"[SKIP] '{filename}' already in Pinecone. Skipping...\n")
|
| 140 |
+
continue
|
| 141 |
+
|
| 142 |
+
print(f"[NEW] Processing '{filename}'...")
|
| 143 |
+
docs = load_documents(filepath, filename)
|
| 144 |
+
if not docs:
|
| 145 |
+
print(f"[WARN] Could not load '{filename}'. Skipping.\n")
|
| 146 |
+
continue
|
| 147 |
+
|
| 148 |
+
chunks = chunk_documents(docs)
|
| 149 |
+
embed_and_upsert(chunks, filename)
|
| 150 |
+
print()
|
| 151 |
+
|
| 152 |
+
print("[ALL DONE] Ingestion complete. Existing embeddings are untouched.")
|
static/js/design.js
CHANGED
|
@@ -25,7 +25,6 @@ const sidebarClose = document.getElementById("sidebarClose");
|
|
| 25 |
const quickBtns = document.querySelectorAll(".quick-btn");
|
| 26 |
|
| 27 |
// ββ Conversation history ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
-
// Each entry: { role: "user" | "assistant", content: "..." }
|
| 29 |
let history = [];
|
| 30 |
|
| 31 |
// ββ Sidebar toggle ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -44,16 +43,18 @@ sidebarClose.addEventListener("click", closeSidebar);
|
|
| 44 |
overlay.addEventListener("click", closeSidebar);
|
| 45 |
|
| 46 |
// ββ Auto-resize textarea ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
-
|
| 48 |
queryInput.style.height = "auto";
|
| 49 |
-
queryInput.style.height = Math.min(queryInput.scrollHeight,
|
| 50 |
-
}
|
|
|
|
|
|
|
| 51 |
|
| 52 |
// ββ Quick prompt buttons ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
quickBtns.forEach(btn => {
|
| 54 |
btn.addEventListener("click", () => {
|
| 55 |
queryInput.value = btn.dataset.prompt;
|
| 56 |
-
|
| 57 |
queryInput.focus();
|
| 58 |
closeSidebar();
|
| 59 |
});
|
|
@@ -61,7 +62,7 @@ quickBtns.forEach(btn => {
|
|
| 61 |
|
| 62 |
// ββ Clear chat ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
clearBtn.addEventListener("click", () => {
|
| 64 |
-
history = [];
|
| 65 |
const welcome = document.getElementById("welcomeMsg");
|
| 66 |
chatWindow.innerHTML = "";
|
| 67 |
if (welcome) chatWindow.appendChild(welcome);
|
|
@@ -88,6 +89,8 @@ function addMessage(role, content) {
|
|
| 88 |
bubble.innerHTML = marked.parse(content);
|
| 89 |
bubble.querySelectorAll("pre code").forEach(el => hljs.highlightElement(el));
|
| 90 |
} else {
|
|
|
|
|
|
|
| 91 |
bubble.textContent = content;
|
| 92 |
}
|
| 93 |
|
|
@@ -132,10 +135,7 @@ async function sendMessage() {
|
|
| 132 |
|
| 133 |
const topic = topicInput.value.trim() || null;
|
| 134 |
|
| 135 |
-
// show user bubble
|
| 136 |
addMessage("user", query);
|
| 137 |
-
|
| 138 |
-
// push to history BEFORE sending
|
| 139 |
history.push({ role: "user", content: query });
|
| 140 |
|
| 141 |
// reset input
|
|
@@ -159,7 +159,6 @@ async function sendMessage() {
|
|
| 159 |
addMessage("assistant", `⚠️ **Error:** ${data.error}`);
|
| 160 |
} else {
|
| 161 |
addMessage("assistant", data.response);
|
| 162 |
-
// push assistant reply into history
|
| 163 |
history.push({ role: "assistant", content: data.response });
|
| 164 |
}
|
| 165 |
|
|
@@ -172,13 +171,20 @@ async function sendMessage() {
|
|
| 172 |
}
|
| 173 |
}
|
| 174 |
|
| 175 |
-
// ββ
|
| 176 |
-
sendBtn.addEventListener("click", sendMessage);
|
| 177 |
-
|
| 178 |
-
// ββ Send on Enter (Shift+Enter = new line) ββββββββββββββββββββββββββββββββββββ
|
| 179 |
queryInput.addEventListener("keydown", (e) => {
|
| 180 |
-
if (e.key === "Enter"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
e.preventDefault();
|
| 182 |
sendMessage();
|
| 183 |
}
|
| 184 |
-
});
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
const quickBtns = document.querySelectorAll(".quick-btn");
|
| 26 |
|
| 27 |
// ββ Conversation history ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 28 |
let history = [];
|
| 29 |
|
| 30 |
// ββ Sidebar toggle ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 43 |
overlay.addEventListener("click", closeSidebar);
|
| 44 |
|
| 45 |
// ββ Auto-resize textarea ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 46 |
+
function autoResize() {
|
| 47 |
queryInput.style.height = "auto";
|
| 48 |
+
queryInput.style.height = Math.min(queryInput.scrollHeight, 200) + "px";
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
queryInput.addEventListener("input", autoResize);
|
| 52 |
|
| 53 |
// ββ Quick prompt buttons ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 54 |
quickBtns.forEach(btn => {
|
| 55 |
btn.addEventListener("click", () => {
|
| 56 |
queryInput.value = btn.dataset.prompt;
|
| 57 |
+
autoResize();
|
| 58 |
queryInput.focus();
|
| 59 |
closeSidebar();
|
| 60 |
});
|
|
|
|
| 62 |
|
| 63 |
// ββ Clear chat ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
clearBtn.addEventListener("click", () => {
|
| 65 |
+
history = [];
|
| 66 |
const welcome = document.getElementById("welcomeMsg");
|
| 67 |
chatWindow.innerHTML = "";
|
| 68 |
if (welcome) chatWindow.appendChild(welcome);
|
|
|
|
| 89 |
bubble.innerHTML = marked.parse(content);
|
| 90 |
bubble.querySelectorAll("pre code").forEach(el => hljs.highlightElement(el));
|
| 91 |
} else {
|
| 92 |
+
// user bubble: preserve newlines and show code syntax as plain text
|
| 93 |
+
bubble.style.whiteSpace = "pre-wrap";
|
| 94 |
bubble.textContent = content;
|
| 95 |
}
|
| 96 |
|
|
|
|
| 135 |
|
| 136 |
const topic = topicInput.value.trim() || null;
|
| 137 |
|
|
|
|
| 138 |
addMessage("user", query);
|
|
|
|
|
|
|
| 139 |
history.push({ role: "user", content: query });
|
| 140 |
|
| 141 |
// reset input
|
|
|
|
| 159 |
addMessage("assistant", `⚠️ **Error:** ${data.error}`);
|
| 160 |
} else {
|
| 161 |
addMessage("assistant", data.response);
|
|
|
|
| 162 |
history.push({ role: "assistant", content: data.response });
|
| 163 |
}
|
| 164 |
|
|
|
|
| 171 |
}
|
| 172 |
}
|
| 173 |
|
| 174 |
+
// ββ Keyboard handling βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
| 175 |
queryInput.addEventListener("keydown", (e) => {
|
| 176 |
+
if (e.key === "Enter") {
|
| 177 |
+
if (e.shiftKey) {
|
| 178 |
+
// Shift+Enter → insert a real newline and resize
|
| 179 |
+
// let the browser insert \n naturally, then resize
|
| 180 |
+
setTimeout(autoResize, 0);
|
| 181 |
+
return; // do NOT send
|
| 182 |
+
}
|
| 183 |
+
// plain Enter → send
|
| 184 |
e.preventDefault();
|
| 185 |
sendMessage();
|
| 186 |
}
|
| 187 |
+
});
|
| 188 |
+
|
| 189 |
+
// ββ Send button βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 190 |
+
sendBtn.addEventListener("click", sendMessage);
|