OnlyTheTruth03 committed
Commit 0b4e744 · 1 Parent(s): f6e6c81

ingest fix

Files changed (1):
  1. src/ingest.py +49 -23
src/ingest.py CHANGED
@@ -1,50 +1,76 @@
 # src/ingest.py
 import pickle
-from io import BytesIO
 from pathlib import Path
 
 import faiss
+import pdfplumber
 from datasets import load_dataset
-from pypdf import PdfReader
 from sentence_transformers import SentenceTransformer
 
-from config import INDEX_DIR, FAISS_INDEX_PATH, DOCS_PATH, DATASET_NAME
+from config import (
+    DATASET_NAME,
+    FAISS_INDEX_PATH,
+    DOCS_PATH,
+    CHUNK_SIZE,
+    CHUNK_OVERLAP,
+)
 
 
-def build_index():
-    INDEX_DIR.mkdir(parents=True, exist_ok=True)
+def chunk_text(text: str):
+    chunks = []
+    start = 0
 
+    while start < len(text):
+        end = start + CHUNK_SIZE
+        chunks.append(text[start:end])
+        start = end - CHUNK_OVERLAP
+
+    return chunks
+
+
+def build_index():
+    print("📥 Loading HF dataset...")
     dataset = load_dataset(DATASET_NAME, split="train")
 
-    texts = []
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
 
-    for row in dataset:
-        pdf_obj = row["pdf"]  # HF auto column name
+    documents = []
+    embeddings = []
 
-        # HF PDF object → bytes
-        pdf_bytes = pdf_obj["bytes"]
-        reader = PdfReader(BytesIO(pdf_bytes))
+    print(f"📄 Dataset rows: {len(dataset)}")
 
-        for page in reader.pages:
-            text = page.extract_text()
-            if text:
-                texts.append(text.strip())
+    for row_idx, row in enumerate(dataset):
+        pdf_obj = row["pdf"]
 
-    if not texts:
+        # ✅ THIS IS THE KEY
+        pdf_path = pdf_obj.path
+
+        print(f"➡️ Processing PDF {row_idx + 1}: {pdf_path}")
+
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text = page.extract_text()
+                if not text:
+                    continue
+
+                for chunk in chunk_text(text):
+                    documents.append(chunk)
+                    embeddings.append(embedder.encode(chunk))
+
+    if not documents:
         raise RuntimeError("❌ No text extracted from PDFs")
 
-    # Embeddings
-    model = SentenceTransformer("all-MiniLM-L6-v2")
-    embeddings = model.encode(texts, show_progress_bar=True)
+    print(f"🧠 Creating FAISS index with {len(documents)} chunks")
 
-    # FAISS
-    dim = embeddings.shape[1]
+    dim = len(embeddings[0])
     index = faiss.IndexFlatL2(dim)
     index.add(embeddings)
 
+    FAISS_INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
+
     faiss.write_index(index, str(FAISS_INDEX_PATH))
 
     with open(DOCS_PATH, "wb") as f:
-        pickle.dump(texts, f)
+        pickle.dump(documents, f)
 
-    return index, texts
+    print("✅ FAISS index built successfully")