|
import json
import os
import pickle

import faiss
import gradio as gr
import groq
import numpy as np
import tiktoken
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
|
|
|
|
|
os.makedirs("models", exist_ok=True) |
|
|
|
|
|
def load_api_key():
    # Read the Groq API key from config.json so it is not hard-coded in the script.
    with open("config.json", "r") as f:
        config = json.load(f)
    return config["GROQ_API_KEY"]


GROQ_API_KEY = load_api_key()
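# Assumed config.json layout (not shown in this file): a single JSON object,
# e.g. {"GROQ_API_KEY": "<your Groq API key>"}.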
|
|
|
|
|
def extract_text_from_pdf(pdf_file: str) -> str:
    # Join the text of every page; extract_text() may return None for
    # image-only pages, hence the `or ""` fallback.
    with open(pdf_file, "rb") as pdf:
        reader = PdfReader(pdf)
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text
|
|
|
|
|
def chunk_text(text: str, max_tokens: int = 512) -> list:
    # Split the document into chunks of at most max_tokens tokens each,
    # using the cl100k_base encoding from tiktoken.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        chunks.append(tokenizer.decode(chunk_tokens))

    return chunks
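# Usage sketch: chunk_text(some_long_string, max_tokens=256) returns a list of
# strings, each decoding to at most 256 cl100k_base tokens; adjacent chunks do
# not overlap.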
|
|
|
|
|
# Embedding model: all-MiniLM-L6-v2 produces 384-dimensional sentence vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')


def get_embedding(text: str):
    return np.array(model.encode(text), dtype=np.float32)
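# Optional sanity check (sketch): the embedding dimension should match the
# FAISS index dimension d defined below.
# assert get_embedding("test").shape == (384,)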
|
|
|
|
|
# FAISS setup: a flat L2 index over 384-dimensional MiniLM embeddings,
# plus an in-memory list of the chunk texts in insertion order.
d = 384
index = faiss.IndexFlatL2(d)
text_chunks = []


def add_to_db(text_chunks_local):
    global text_chunks
    text_chunks = text_chunks_local
    embeddings = np.array([get_embedding(text) for text in text_chunks], dtype=np.float32).reshape(-1, d)
    index.add(embeddings)
|
|
|
def search_db(query, k=5):
    if index.ntotal == 0:
        return ["The database is still empty, please add data first."]

    query_embedding = np.array([get_embedding(query)], dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 when fewer than k vectors exist, so filter those out.
    return [text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)]
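# Usage sketch: search_db("what is this document about?", k=3) returns up to
# three stored chunk strings, ordered by ascending L2 distance to the query.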
|
|
|
def save_to_faiss(index_path="vector_index.faiss"):
    faiss.write_index(index, index_path)


def load_faiss(index_path="vector_index.faiss"):
    global index
    index = faiss.read_index(index_path)
|
|
|
def save_embeddings(embeddings_path="models/embeddings.pkl"):
    # A raw FAISS index is a SWIG object and cannot be pickled directly,
    # so serialize it to a byte array first.
    with open(embeddings_path, "wb") as f:
        pickle.dump(faiss.serialize_index(index), f)


def load_embeddings(embeddings_path="models/embeddings.pkl"):
    global index
    with open(embeddings_path, "rb") as f:
        index = faiss.deserialize_index(pickle.load(f))
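# Note: neither persistence helper above saves text_chunks, so search_db cannot
# map indices back to text after a restart. A minimal sketch for that, using
# hypothetical helpers with the same pickle approach:
#
# def save_chunks(path="models/chunks.pkl"):
#     with open(path, "wb") as f:
#         pickle.dump(text_chunks, f)
#
# def load_chunks(path="models/chunks.pkl"):
#     global text_chunks
#     with open(path, "rb") as f:
#         text_chunks = pickle.load(f)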
|
|
|
|
|
client = groq.Client(api_key=GROQ_API_KEY)


def query_llama(prompt):
    # Single-turn chat completion against Groq's hosted Llama 3 8B model.
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512
    )
    return response.choices[0].message.content.strip()
|
|
|
|
|
if __name__ == '__main__':
    # Build the index from a sample PDF, persist it, then run one test query.
    pdf_text = extract_text_from_pdf('dini_anggriyani_synthetic_data.pdf')
    text_chunks = chunk_text(pdf_text, max_tokens=1024)

    add_to_db(text_chunks)
    save_to_faiss()
    save_embeddings()

    retrieved_chunks = search_db("What is this document about?")
    context = "\n".join(retrieved_chunks)

    prompt = f"Use the following information to answer:\n{context}\n\nQuestion: What is this document about?"
    answer = query_llama(prompt)
    print(answer)
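    # On a later run, re-indexing could be skipped by restoring the saved state
    # instead, e.g. (sketch): load_faiss() or load_embeddings(), plus reloading
    # the chunk texts, before calling search_db().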
|
|
|
|
|
def chatbot_interface(user_query):
    # Retrieve the most similar chunks and let the LLM answer from that context.
    retrieved_chunks = search_db(user_query)
    context = "\n".join(retrieved_chunks)

    prompt = f"Use the following information to answer:\n{context}\n\nQuestion: {user_query}"
    answer = query_llama(prompt)

    return answer


iface = gr.Interface(fn=chatbot_interface, inputs="text", outputs="text", title="RAG Chatbot")
iface.launch()
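# Gradio note (optional): iface.launch(share=True) would additionally expose a
# temporary public URL in front of the local server.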
|
|