YousifCreates committed on
Commit
f6bb754
·
1 Parent(s): b063f48

Updated 5th chapter of OS

Browse files
Files changed (2) hide show
  1. rag/ingest.py +89 -58
  2. static/js/design.js +22 -16
rag/ingest.py CHANGED
@@ -1,5 +1,3 @@
1
- # rag/ingest.py
2
-
3
  import os
4
  import torch
5
  from dotenv import load_dotenv
@@ -11,25 +9,25 @@ from tqdm import tqdm
11
 
12
  load_dotenv()
13
 
14
- # ── Config ──────────────────────────────────────────────────────────────────
15
- PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
16
- PINECONE_INDEX = os.getenv("PINECONE_INDEX", "study-saathi")
17
- EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
18
- DATA_DIR = "data/os_notes"
19
- CHUNK_SIZE = 512
20
- CHUNK_OVERLAP = 64
21
- BATCH_SIZE = 32
22
- DIMENSION = 1024 # multilingual-e5-large output dim
23
-
24
- # ── Device ──────────────────────────────────────────────────────────────────
25
  device = "cuda" if torch.cuda.is_available() else "cpu"
26
  print(f"[INFO] Using device: {device}")
27
 
28
- # ── Load Embedding Model ─────────────────────────────────────────────────────
29
  print("[INFO] Loading embedding model...")
30
  embedder = SentenceTransformer(EMBEDDING_MODEL, device=device)
31
 
32
- # ── Pinecone Setup ───────────────────────────────────────────────────────────
33
  pc = Pinecone(api_key=PINECONE_API_KEY)
34
 
35
  if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
@@ -43,29 +41,46 @@ if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
43
 
44
  index = pc.Index(PINECONE_INDEX)
45
 
46
- # ── Load Documents ───────────────────────────────────────────────────────────
47
- def load_documents(data_dir: str) -> list:
48
- docs = []
49
- for filename in os.listdir(data_dir):
50
- filepath = os.path.join(data_dir, filename)
51
- if filename.endswith(".pdf"):
52
- loader = PyPDFLoader(filepath)
53
- elif filename.endswith(".txt"):
54
- loader = TextLoader(filepath, encoding="utf-8")
55
- else:
56
- print(f"[SKIP] Unsupported file: {filename}")
57
- continue
58
- loaded = loader.load()
59
- # attach filename as topic metadata
60
- topic = os.path.splitext(filename)[0]
61
- for doc in loaded:
62
- doc.metadata["topic"] = topic
63
- doc.metadata["source"] = filename
64
- docs.extend(loaded)
65
- print(f"[LOADED] {filename} — {len(loaded)} page(s)")
66
- return docs
67
-
68
- # ── Chunk Documents ──────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def chunk_documents(docs: list) -> list:
70
  splitter = RecursiveCharacterTextSplitter(
71
  chunk_size=CHUNK_SIZE,
@@ -75,26 +90,23 @@ def chunk_documents(docs: list) -> list:
75
  print(f"[INFO] Total chunks: {len(chunks)}")
76
  return chunks
77
 
78
- # ── Embed & Upsert ───────────────────────────────────────────────────────────
79
- def embed_and_upsert(chunks: list):
80
- texts = [
81
- f"passage: {chunk.page_content}" for chunk in chunks # e5 prefix
82
- ]
83
  print("[INFO] Generating embeddings...")
84
  all_vectors = []
85
 
86
  for i in tqdm(range(0, len(texts), BATCH_SIZE)):
87
- batch_texts = texts[i: i + BATCH_SIZE]
88
- batch_chunks = chunks[i: i + BATCH_SIZE]
89
- embeddings = embedder.encode(
90
  batch_texts,
91
  normalize_embeddings=True,
92
  show_progress_bar=False
93
  )
94
- vectors = []
95
  for j, (emb, chunk) in enumerate(zip(embeddings, batch_chunks)):
96
- vectors.append({
97
- "id": f"chunk-{i + j}",
98
  "values": emb.tolist(),
99
  "metadata": {
100
  "text": chunk.page_content,
@@ -102,20 +114,39 @@ def embed_and_upsert(chunks: list):
102
  "source": chunk.metadata.get("source", "unknown"),
103
  }
104
  })
105
- all_vectors.extend(vectors)
106
 
107
- # upsert in batches of 100 (Pinecone limit)
108
  print("[INFO] Upserting to Pinecone...")
109
  for i in tqdm(range(0, len(all_vectors), 100)):
110
  index.upsert(vectors=all_vectors[i: i + 100])
111
 
112
- print(f"[DONE] Upserted {len(all_vectors)} chunks to Pinecone.")
113
 
114
- # ── Main ─────────────────────────────────────────────────────────────────────
115
  if __name__ == "__main__":
116
- docs = load_documents(DATA_DIR)
117
- if not docs:
118
- print("[ERROR] No documents found in data/os_notes/")
 
119
  exit(1)
120
- chunks = chunk_documents(docs)
121
- embed_and_upsert(chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import torch
3
  from dotenv import load_dotenv
 
9
 
10
  load_dotenv()
11
 
12
+ # ── Config ───────────────────────────────────────────────────────────────────
13
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
14
+ PINECONE_INDEX = os.getenv("PINECONE_INDEX", "study-saathi")
15
+ EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
16
+ DATA_DIR = "data/os_notes"
17
+ CHUNK_SIZE = 512
18
+ CHUNK_OVERLAP = 64
19
+ BATCH_SIZE = 32
20
+ DIMENSION = 1024
21
+
22
+ # ── Device ───────────────────────────────────────────────────────────────────
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"[INFO] Using device: {device}")
25
 
26
+ # ── Load Embedding Model ──────────────────────────────────────────────────────
27
  print("[INFO] Loading embedding model...")
28
  embedder = SentenceTransformer(EMBEDDING_MODEL, device=device)
29
 
30
+ # ── Pinecone Setup ────────────────────────────────────────────────────────────
31
  pc = Pinecone(api_key=PINECONE_API_KEY)
32
 
33
  if PINECONE_INDEX not in [i.name for i in pc.list_indexes()]:
 
41
 
42
  index = pc.Index(PINECONE_INDEX)
43
 
44
+ # ── Check if file already ingested ───────────────────────────────────────────
45
+ def is_already_ingested(filename: str) -> bool:
46
+ """
47
+ Query Pinecone for any vector whose metadata source == filename.
48
+ If found, the file was already ingested — skip it.
49
+ """
50
+ topic = os.path.splitext(filename)[0]
51
+
52
+ # use a dummy zero vector just to run a metadata filter query
53
+ dummy_vector = [0.0] * DIMENSION
54
+
55
+ results = index.query(
56
+ vector=dummy_vector,
57
+ top_k=1,
58
+ include_metadata=True,
59
+ filter={"source": {"$eq": filename}}
60
+ )
61
+
62
+ return len(results["matches"]) > 0
63
+
64
+ # ── Load Documents ────────────────────────────────────────────────────────────
65
+ def load_documents(filepath: str, filename: str) -> list:
66
+ if filename.endswith(".pdf"):
67
+ loader = PyPDFLoader(filepath)
68
+ elif filename.endswith(".txt"):
69
+ loader = TextLoader(filepath, encoding="utf-8")
70
+ else:
71
+ return []
72
+
73
+ loaded = loader.load()
74
+ topic = os.path.splitext(filename)[0]
75
+
76
+ for doc in loaded:
77
+ doc.metadata["topic"] = topic
78
+ doc.metadata["source"] = filename
79
+
80
+ print(f"[LOADED] {filename} — {len(loaded)} page(s)")
81
+ return loaded
82
+
83
+ # ── Chunk Documents ───────────────────────────────────────────────────────────
84
  def chunk_documents(docs: list) -> list:
85
  splitter = RecursiveCharacterTextSplitter(
86
  chunk_size=CHUNK_SIZE,
 
90
  print(f"[INFO] Total chunks: {len(chunks)}")
91
  return chunks
92
 
93
+ # ── Embed & Upsert ────────────────────────────────────────────────────────────
94
+ def embed_and_upsert(chunks: list, filename: str):
95
+ texts = [f"passage: {chunk.page_content}" for chunk in chunks]
 
 
96
  print("[INFO] Generating embeddings...")
97
  all_vectors = []
98
 
99
  for i in tqdm(range(0, len(texts), BATCH_SIZE)):
100
+ batch_texts = texts[i: i + BATCH_SIZE]
101
+ batch_chunks = chunks[i: i + BATCH_SIZE]
102
+ embeddings = embedder.encode(
103
  batch_texts,
104
  normalize_embeddings=True,
105
  show_progress_bar=False
106
  )
 
107
  for j, (emb, chunk) in enumerate(zip(embeddings, batch_chunks)):
108
+ all_vectors.append({
109
+ "id": f"{os.path.splitext(filename)[0]}-chunk-{i + j}",
110
  "values": emb.tolist(),
111
  "metadata": {
112
  "text": chunk.page_content,
 
114
  "source": chunk.metadata.get("source", "unknown"),
115
  }
116
  })
 
117
 
 
118
  print("[INFO] Upserting to Pinecone...")
119
  for i in tqdm(range(0, len(all_vectors), 100)):
120
  index.upsert(vectors=all_vectors[i: i + 100])
121
 
122
+ print(f"[DONE] Upserted {len(all_vectors)} chunks for '{filename}'.")
123
 
124
+ # ── Main ──────────────────────────────────────────────────────────────────────
125
  if __name__ == "__main__":
126
+ files = [f for f in os.listdir(DATA_DIR) if f.endswith((".pdf", ".txt"))]
127
+
128
+ if not files:
129
+ print("[ERROR] No files found in data/os_notes/")
130
  exit(1)
131
+
132
+ print(f"[INFO] Found {len(files)} file(s): {files}\n")
133
+
134
+ for filename in files:
135
+ filepath = os.path.join(DATA_DIR, filename)
136
+
137
+ # ── SKIP CHECK ────────────────────────────────────────────────────
138
+ if is_already_ingested(filename):
139
+ print(f"[SKIP] '{filename}' already in Pinecone. Skipping...\n")
140
+ continue
141
+
142
+ print(f"[NEW] Processing '{filename}'...")
143
+ docs = load_documents(filepath, filename)
144
+ if not docs:
145
+ print(f"[WARN] Could not load '{filename}'. Skipping.\n")
146
+ continue
147
+
148
+ chunks = chunk_documents(docs)
149
+ embed_and_upsert(chunks, filename)
150
+ print()
151
+
152
+ print("[ALL DONE] Ingestion complete. Existing embeddings are untouched.")
static/js/design.js CHANGED
@@ -25,7 +25,6 @@ const sidebarClose = document.getElementById("sidebarClose");
25
  const quickBtns = document.querySelectorAll(".quick-btn");
26
 
27
  // ── Conversation history ──────────────────────────────────────────────────────
28
- // Each entry: { role: "user" | "assistant", content: "..." }
29
  let history = [];
30
 
31
  // ── Sidebar toggle ────────────────────────────────────────────────────────────
@@ -44,16 +43,18 @@ sidebarClose.addEventListener("click", closeSidebar);
44
  overlay.addEventListener("click", closeSidebar);
45
 
46
  // ── Auto-resize textarea ──────────────────────────────────────────────────────
47
- queryInput.addEventListener("input", () => {
48
  queryInput.style.height = "auto";
49
- queryInput.style.height = Math.min(queryInput.scrollHeight, 140) + "px";
50
- });
 
 
51
 
52
  // ── Quick prompt buttons ──────────────────────────────────────────────────────
53
  quickBtns.forEach(btn => {
54
  btn.addEventListener("click", () => {
55
  queryInput.value = btn.dataset.prompt;
56
- queryInput.dispatchEvent(new Event("input"));
57
  queryInput.focus();
58
  closeSidebar();
59
  });
@@ -61,7 +62,7 @@ quickBtns.forEach(btn => {
61
 
62
  // ── Clear chat ────────────────────────────────────────────────────────────────
63
  clearBtn.addEventListener("click", () => {
64
- history = []; // reset memory
65
  const welcome = document.getElementById("welcomeMsg");
66
  chatWindow.innerHTML = "";
67
  if (welcome) chatWindow.appendChild(welcome);
@@ -88,6 +89,8 @@ function addMessage(role, content) {
88
  bubble.innerHTML = marked.parse(content);
89
  bubble.querySelectorAll("pre code").forEach(el => hljs.highlightElement(el));
90
  } else {
 
 
91
  bubble.textContent = content;
92
  }
93
 
@@ -132,10 +135,7 @@ async function sendMessage() {
132
 
133
  const topic = topicInput.value.trim() || null;
134
 
135
- // show user bubble
136
  addMessage("user", query);
137
-
138
- // push to history BEFORE sending
139
  history.push({ role: "user", content: query });
140
 
141
  // reset input
@@ -159,7 +159,6 @@ async function sendMessage() {
159
  addMessage("assistant", `⚠️ **Error:** ${data.error}`);
160
  } else {
161
  addMessage("assistant", data.response);
162
- // push assistant reply into history
163
  history.push({ role: "assistant", content: data.response });
164
  }
165
 
@@ -172,13 +171,20 @@ async function sendMessage() {
172
  }
173
  }
174
 
175
- // ── Send on button click ──────────────────────────────────────────────────────
176
- sendBtn.addEventListener("click", sendMessage);
177
-
178
- // ── Send on Enter (Shift+Enter = new line) ────────────────────────────────────
179
  queryInput.addEventListener("keydown", (e) => {
180
- if (e.key === "Enter" && !e.shiftKey) {
 
 
 
 
 
 
 
181
  e.preventDefault();
182
  sendMessage();
183
  }
184
- });
 
 
 
 
25
  const quickBtns = document.querySelectorAll(".quick-btn");
26
 
27
  // ── Conversation history ──────────────────────────────────────────────────────
 
28
  let history = [];
29
 
30
  // ── Sidebar toggle ────────────────────────────────────────────────────────────
 
43
  overlay.addEventListener("click", closeSidebar);
44
 
45
  // ── Auto-resize textarea ──────────────────────────────────────────────────────
46
+ function autoResize() {
47
  queryInput.style.height = "auto";
48
+ queryInput.style.height = Math.min(queryInput.scrollHeight, 200) + "px";
49
+ }
50
+
51
+ queryInput.addEventListener("input", autoResize);
52
 
53
  // ── Quick prompt buttons ──────────────────────────────────────────────────────
54
  quickBtns.forEach(btn => {
55
  btn.addEventListener("click", () => {
56
  queryInput.value = btn.dataset.prompt;
57
+ autoResize();
58
  queryInput.focus();
59
  closeSidebar();
60
  });
 
62
 
63
  // ── Clear chat ────────────────────────────────────────────────────────────────
64
  clearBtn.addEventListener("click", () => {
65
+ history = [];
66
  const welcome = document.getElementById("welcomeMsg");
67
  chatWindow.innerHTML = "";
68
  if (welcome) chatWindow.appendChild(welcome);
 
89
  bubble.innerHTML = marked.parse(content);
90
  bubble.querySelectorAll("pre code").forEach(el => hljs.highlightElement(el));
91
  } else {
92
+ // user bubble: preserve newlines and show code syntax as plain text
93
+ bubble.style.whiteSpace = "pre-wrap";
94
  bubble.textContent = content;
95
  }
96
 
 
135
 
136
  const topic = topicInput.value.trim() || null;
137
 
 
138
  addMessage("user", query);
 
 
139
  history.push({ role: "user", content: query });
140
 
141
  // reset input
 
159
  addMessage("assistant", `⚠️ **Error:** ${data.error}`);
160
  } else {
161
  addMessage("assistant", data.response);
 
162
  history.push({ role: "assistant", content: data.response });
163
  }
164
 
 
171
  }
172
  }
173
 
174
+ // ── Keyboard handling ─────────────────────────────────────────────────────────
 
 
 
175
  queryInput.addEventListener("keydown", (e) => {
176
+ if (e.key === "Enter") {
177
+ if (e.shiftKey) {
178
+ // Shift+Enter — insert a real newline and resize
179
+ // let the browser insert \n naturally, then resize
180
+ setTimeout(autoResize, 0);
181
+ return; // do NOT send
182
+ }
183
+ // plain Enter — send
184
  e.preventDefault();
185
  sendMessage();
186
  }
187
+ });
188
+
189
+ // ── Send button ───────────────────────────────────────────────────────────────
190
+ sendBtn.addEventListener("click", sendMessage);