Dyraa18 committed
Commit fb2123c · verified
1 Parent(s): 70b40f1

Upload 8 files

.gitattributes CHANGED
@@ -51,3 +51,6 @@ Dataset/Penjas/PJOK_BS_KLS_III.pdf filter=lfs diff=lfs merge=lfs -text
  Dataset/Penjas/PJOK_BS_KLS_IV.pdf filter=lfs diff=lfs merge=lfs -text
  Dataset/Penjas/PJOK_BS_KLS_V.pdf filter=lfs diff=lfs merge=lfs -text
  Dataset/Penjas/PJOK_BS_KLS_VI.pdf filter=lfs diff=lfs merge=lfs -text
+ Rag-Pipeline/Vektor[[:space:]]Database/Ipas/IPA_index.index filter=lfs diff=lfs merge=lfs -text
+ Rag-Pipeline/Vektor[[:space:]]Database/Pancasila/PANCASILA_index.index filter=lfs diff=lfs merge=lfs -text
+ Rag-Pipeline/Vektor[[:space:]]Database/Penjas/PENJAS_index.index filter=lfs diff=lfs merge=lfs -text
Rag-Pipeline/Vektor Database/Ipas/IPA_index.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45fdf20d00c4cbe679a6d00584f35b95942552ea7b67137779fdcd48c65b5403
+ size 15818797
Rag-Pipeline/Vektor Database/Pancasila/PANCASILA_index.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:157e84a8342195a6e39eed3be5f244745fbf8221e92aba797c440067443e8afc
+ size 12943405
Rag-Pipeline/Vektor Database/Penjas/PENJAS_index.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:295a1bef4823cf5d42f24aa8be1259962640e8f6beba94a69cd542ef84e231f9
+ size 17891373
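
The three .index files above are FAISS indexes shipped through Git LFS (the Penjas one is produced by faissdb.py further down). A minimal sanity check after git lfs pull, assuming faiss-cpu is installed and the repository root is the working directory:

import faiss

index = faiss.read_index("Rag-Pipeline/Vektor Database/Penjas/PENJAS_index.index")
print(index.ntotal, index.d)  # stored vector count and dimensionality (1024 for multilingual-e5-large)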
Rag-Pipeline/chunk.py ADDED
@@ -0,0 +1,45 @@
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_core.documents import Document
+ import glob
+ import json
+ import os
+
+ # Read every cleaned .txt file as one Document.
+ folder_path = r"D:\Webchatbot\Dataset\Penjas\Clean"
+ file_paths = glob.glob(os.path.join(folder_path, "*.txt"))
+
+ pages = []
+ for path in file_paths:
+     with open(path, "r", encoding="utf-8") as f:
+         text = f.read()
+     pages.append(Document(page_content=text, metadata={"source": path}))
+
+ print(f"Total files read: {len(file_paths)}")
+
+ # Split into ~300-character chunks with 50 characters of overlap.
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=300,
+     chunk_overlap=50,
+     separators=["\n\n", "\n", ".", " "]
+ )
+
+ documents = text_splitter.split_documents(pages)
+ all_texts = [doc.page_content for doc in documents]
+
+ # Save the chunks as a JSON list of {id, text} records.
+ output_dir = r"D:\Webchatbot\Dataset\Penjas\Chunk"
+ os.makedirs(output_dir, exist_ok=True)
+
+ output_path = os.path.join(output_dir, "penjas_chunks.json")
+
+ data_to_save = [
+     {"id": i + 1, "text": chunk}
+     for i, chunk in enumerate(all_texts)
+ ]
+
+ with open(output_path, "w", encoding="utf-8") as f:
+     json.dump(data_to_save, f, ensure_ascii=False, indent=2)
+
+ print(f"Chunks saved to: {os.path.abspath(output_path)}")
+
+ for i, chunk in enumerate(all_texts[:3]):
+     print(f"\n--- Chunk {i+1} ---\n{chunk}")
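
The splitter tries the separators in order (blank line, newline, sentence-ending period, space) and only falls back to the next one when a piece still exceeds chunk_size; chunk_overlap=50 repeats the tail of one chunk at the start of the next so sentences cut at a boundary remain retrievable. A small illustration with the same settings (the sample string is made up):

from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, chunk_overlap=50, separators=["\n\n", "\n", ".", " "]
)
sample = "Pemanasan dilakukan sebelum berolahraga agar otot tidak cedera. " * 12
for i, piece in enumerate(splitter.split_text(sample), start=1):
    print(i, len(piece), repr(piece[:40]))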
Rag-Pipeline/cleans.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import re
+ from pdfminer.high_level import extract_pages
+ from pdfminer.layout import LTTextContainer
+
+
+ def extract_and_clean_pdf(path: str, skip_pages: list[int] | None = None) -> list[str]:
+     skip_pages = skip_pages or []
+     cleaned_pages = []
+
+     # Walk the PDF page by page, keeping only text containers.
+     for i, page_layout in enumerate(extract_pages(path), start=1):
+         if i in skip_pages:
+             print(f"Page {i} skipped.")
+             continue
+
+         page_text = ""
+         for element in page_layout:
+             if isinstance(element, LTTextContainer):
+                 page_text += element.get_text()
+
+         cleaned_text = clean_text(page_text)
+         cleaned_pages.append(cleaned_text)
+
+     print(f"\nTotal pages extracted: {len(cleaned_pages)} (out of {i} pages).")
+     return cleaned_pages
+
+
+ def clean_text(text: str) -> str:
+     text = text.replace("\n", " ").replace("\t", " ")
+     text = re.sub(r'[^\x20-\x7EÀ-ÿ]', '', text)       # drop characters outside printable ASCII / Latin-1
+     text = re.sub(r'\s+', ' ', text)                   # collapse whitespace
+     text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)   # insert a space at case boundaries
+     text = re.sub(r'([a-z])([0-9])', r'\1 \2', text)   # and at letter/digit boundaries
+     text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)
+     text = re.sub(r'\s+([.,!?;:])', r'\1', text)       # no space before punctuation
+     return text.strip()
+
+
+ def save_cleaned_text(cleaned_pages: list[str], output_path: str):
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+     with open(output_path, "w", encoding="utf-8") as f:
+         for page in cleaned_pages:
+             f.write(page + "\n\n")
+
+     print(f"Text file saved to:\n{output_path}")
+
+
+ if __name__ == "__main__":
+     pdf_path = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_V.pdf"
+     output_txt = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas V.txt"
+
+     # Pages to skip during extraction.
+     halaman_dihapus = list(range(1, 15)) + list(range(188, 208))
+     hasil = extract_and_clean_pdf(pdf_path, skip_pages=halaman_dihapus)
+
+     if hasil:
+         save_cleaned_text(hasil, output_txt)
+     else:
+         print("No pages were extracted.")
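
Because the script section is guarded by if __name__ == "__main__", clean_text can be imported and spot-checked on its own. A quick check of the normalisation rules (run from the Rag-Pipeline directory; the sample input is made up):

from cleans import clean_text

raw = "Gerak dasar lokomotor3dilakukan denganBerpindah tempat .\nLihat gambar 2a."
print(clean_text(raw))
# Whitespace is collapsed, letter/digit and lower/upper boundaries get a space,
# and the stray space before the period is removed.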
Rag-Pipeline/cleans2.py ADDED
@@ -0,0 +1,123 @@
+
+ import os, re, sys
+ from typing import List, Set
+ import fitz  # PyMuPDF
+ import pytesseract
+ from PIL import Image
+ from io import BytesIO
+
+ PDF_PATH = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_VI.pdf"
+ OUTPUT_TXT = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas VI.txt"
+ SKIP_PAGES = list(range(1, 22)) + list(range(200, 211)) + list(range(213, 226))
+ TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+ OCR_LANG = "ind+eng"
+ DPI = 300
+
+
+ if TESSERACT_CMD:
+     pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
+
+
+ # Strip URLs and "BAB <number>" chapter headings from the OCR output.
+ URL_RE = re.compile(
+     r"(https?://\S+|www\.\S+|\b\S+\.(?:com|org|net|edu|gov|go|id|co)\S*)",
+     flags=re.IGNORECASE,
+ )
+ BAB_LINE_RE = re.compile(
+     r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*.*)?\s*$"
+ )
+ BAB_PREFIX_RE = re.compile(
+     r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*)?",
+     flags=re.IGNORECASE,
+ )
+
+ def clean_text(text: str) -> str:
+     text = URL_RE.sub("", text or "")
+
+     text = text.replace("\t", " ")
+     text = re.sub(r"[^\x09\x0A\x0D\x20-\x7EÀ-ÿ]", "", text)
+
+     cleaned_lines: List[str] = []
+     for raw_ln in text.splitlines():
+         ln = re.sub(r"\s+", " ", raw_ln).strip()
+         if not ln:
+             continue
+
+         if BAB_LINE_RE.match(ln):
+             continue
+
+         ln = BAB_PREFIX_RE.sub("", ln).strip()
+
+         if not ln:
+             continue
+
+         cleaned_lines.append(ln)
+
+     text_out = "\n".join(cleaned_lines).strip()
+     return text_out
+
+ def pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
+     if pix.alpha:
+         pix = fitz.Pixmap(fitz.csRGB, pix)
+     img_bytes = pix.tobytes("png")
+     return Image.open(BytesIO(img_bytes))
+
+ def ocr_page(img: Image.Image, lang: str) -> str:
+     return clean_text(pytesseract.image_to_string(img, lang=lang))
+
+ def main():
+     if not os.path.exists(PDF_PATH):
+         print(f"PDF not found: {PDF_PATH}")
+         sys.exit(1)
+
+     doc = fitz.open(PDF_PATH)
+     total = doc.page_count
+     skip: Set[int] = set(SKIP_PAGES or [])
+
+     # Render each page at the target DPI before OCR.
+     zoom = DPI / 72.0
+     mat = fitz.Matrix(zoom, zoom)
+
+     results: List[str] = []
+     skipped = 0
+     kept = 0
+
+     print(f"[*] Total pages: {total} | render DPI: {DPI}")
+     for page_num in range(1, total + 1):
+         if page_num in skip:
+             skipped += 1
+             print(f"Page {page_num} skipped.")
+             continue
+
+         page = doc.load_page(page_num - 1)
+         pix = page.get_pixmap(matrix=mat, alpha=False)
+         img = pixmap_to_pil(pix)
+
+         print(f"Page {page_num}: OCR …")
+         try:
+             txt = ocr_page(img, OCR_LANG)
+         except Exception as e:
+             print(f"[!] OCR failed on page {page_num}: {e}")
+             txt = ""
+
+         if txt.strip():
+             results.append(txt.strip())
+             kept += 1
+         else:
+             print(f"Page {page_num}: empty/short result.")
+
+     doc.close()
+
+     os.makedirs(os.path.dirname(OUTPUT_TXT), exist_ok=True)
+     with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
+         for t in results:
+             if not t.strip():
+                 continue
+             f.write(t + "\n\n")
+
+     print("\nSummary:")
+     print(f"- Total pages         : {total}")
+     print(f"- Skipped             : {skipped}")
+     print(f"- Saved (non-skip)    : {kept}")
+     print(f"[*] Output: {OUTPUT_TXT}")
+
+ if __name__ == "__main__":
+     main()
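
Unlike cleans.py, which reads the PDF's embedded text layer with pdfminer, this script rasterises each page with PyMuPDF and runs Tesseract OCR, which helps when a volume's text layer is missing or badly encoded. Before a long run it is worth confirming that the configured Tesseract binary and language packs are reachable; a small check, assuming the same TESSERACT_CMD path:

import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
print(pytesseract.get_tesseract_version())  # raises TesseractNotFoundError if the binary is missing
print(pytesseract.get_languages())          # should list "ind" and "eng" for OCR_LANG = "ind+eng"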
Rag-Pipeline/embed.py ADDED
@@ -0,0 +1,54 @@
+ import os, json
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModel
+
+ # ===== PATHS =====
+ JSON_PATH = r"D:\Webchatbot\Dataset\Penjas\Chunk\penjas_chunks.json"
+ OUTPUT_DIR = r"D:\Webchatbot\Dataset\Penjas\Embedd"
+ OUTPUT_NAME = "penjas_embeddings.npy"
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # ===== DEVICE =====
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Running on device: {device}")
+
+ # ===== LOAD CHUNKS =====
+ with open(JSON_PATH, "r", encoding="utf-8") as f:
+     data = json.load(f)
+ texts = [item["text"] for item in data]
+ print(f"Total chunks: {len(texts)}")
+
+ # ===== MODEL =====
+ MODEL_NAME = "intfloat/multilingual-e5-large"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()
+
+ def mean_pooling(last_hidden_state, attention_mask):
+     # Average token embeddings, ignoring padding positions.
+     mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+     summed = (last_hidden_state * mask).sum(dim=1)
+     counts = mask.sum(dim=1).clamp(min=1e-9)
+     return summed / counts
+
+ @torch.no_grad()
+ def get_embeddings(texts, batch_size=32, max_length=512):
+     embs = []
+     for i in range(0, len(texts), batch_size):
+         # E5 expects indexed documents to carry the "passage: " prefix.
+         batch = [f"passage: {t}" for t in texts[i:i+batch_size]]
+         inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
+         outputs = model(**inputs)
+         pooled = mean_pooling(outputs.last_hidden_state, inputs["attention_mask"])
+         pooled = F.normalize(pooled, p=2, dim=1)  # unit-length vectors
+         embs.append(pooled.cpu())
+         if (i // batch_size) % 10 == 0:
+             print(f"Processed: {i+len(batch)}/{len(texts)}")
+     return torch.cat(embs, dim=0)
+
+ embeddings = get_embeddings(texts, batch_size=32)
+ print(f"Embeddings shape: {embeddings.shape}")
+
+ # ===== SAVE NPY =====
+ output_path = os.path.join(OUTPUT_DIR, OUTPUT_NAME)
+ np.save(output_path, embeddings.numpy())
+ print(f"Embeddings saved to: {output_path}")
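
The chunks are embedded with the "passage: " prefix; the multilingual-e5 models are trained asymmetrically, so search queries should be embedded with a "query: " prefix before being compared against this matrix. A minimal query-side sketch under that assumption (embed_query is a hypothetical helper, not part of this commit):

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "intfloat/multilingual-e5-large"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

@torch.no_grad()
def embed_query(text: str) -> torch.Tensor:
    # Same mean pooling and L2 normalisation as embed.py, but with the "query: " prefix.
    inputs = tokenizer(f"query: {text}", truncation=True, max_length=512, return_tensors="pt").to(device)
    out = model(**inputs)
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (out.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return F.normalize(pooled, p=2, dim=1).cpu()

print(embed_query("Apa itu gerak lokomotor?").shape)  # torch.Size([1, 1024])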
Rag-Pipeline/faissdb.py ADDED
@@ -0,0 +1,19 @@
+ import numpy as np
+ import faiss
+ import os
+
+ embeddings_path = r"D:\Webchatbot\Dataset\Penjas\Embedd\penjas_embeddings.npy"
+
+ output_dir = r"D:\Webchatbot\Rag-Pipeline\Vektor Database\Penjas"
+ os.makedirs(output_dir, exist_ok=True)
+
+ embeddings_np = np.load(embeddings_path).astype(np.float32)  # FAISS expects float32
+ print(f"Embeddings shape: {embeddings_np.shape}")
+
+ # Exact (brute-force) L2 index; the vectors are already L2-normalised,
+ # so L2 ranking matches cosine-similarity ranking.
+ dimension = embeddings_np.shape[1]
+ index = faiss.IndexFlatL2(dimension)
+ index.add(embeddings_np)
+ print(f"Total vectors in FAISS: {index.ntotal}")
+
+ faiss_index_path = os.path.join(output_dir, "PENJAS_index.index")
+ faiss.write_index(index, faiss_index_path)
+ print(f"FAISS index saved to: {faiss_index_path}")
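
Nothing in this commit reads the index back, so here is a small retrieval smoke test that closes the loop: it reloads the index, the chunk JSON, and the embedding matrix, then queries with one stored vector (whose nearest neighbour should be itself at distance ≈ 0). A real query vector would come from an embed_query-style function as sketched after embed.py; paths are the same as above:

import json
import numpy as np
import faiss

INDEX_PATH = r"D:\Webchatbot\Rag-Pipeline\Vektor Database\Penjas\PENJAS_index.index"
CHUNKS_PATH = r"D:\Webchatbot\Dataset\Penjas\Chunk\penjas_chunks.json"
EMB_PATH = r"D:\Webchatbot\Dataset\Penjas\Embedd\penjas_embeddings.npy"

index = faiss.read_index(INDEX_PATH)
with open(CHUNKS_PATH, encoding="utf-8") as f:
    chunks = json.load(f)
embs = np.load(EMB_PATH).astype(np.float32)

# Query with the first stored vector: rank 1 should be chunk 0 itself.
distances, ids = index.search(embs[:1], k=3)
for rank, (dist, idx) in enumerate(zip(distances[0], ids[0]), start=1):
    print(f"#{rank} id={chunks[idx]['id']} dist={dist:.4f} text={chunks[idx]['text'][:80]}")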