import os
import re
import argparse
import time

from tqdm import tqdm
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# PyMuPDF library (optional PDF parsing backend)
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
    print("✅ PyMuPDF library available")
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
# --------------------------------
# Log Output
# --------------------------------
def log(msg):
    """Print a message prefixed with an HH:MM:SS timestamp."""
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")
# --------------------------------
# Text Cleaning Functions
# --------------------------------
def clean_text(text):
    """Strip characters outside Hangul (syllables and jamo), word characters,
    whitespace, and common punctuation."""
    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)

def apply_corrections(text):
    """Replace known mojibake sequences produced by mis-decoded PDF text."""
    corrections = {
        'º©': 'info', 'Ì': 'of', '½': 'operation', 'Ã': '', '©': '',
        '’': "'", '“': '"', 'â€': '"'
    }
    for k, v in corrections.items():
        text = text.replace(k, v)
    return text
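
# A quick sketch of the cleaners on made-up sample strings (inputs are
# illustrative, not from the dataset):
#
#   apply_corrections('He said “hello’')    ->  'He said "hello\''
#   clean_text('Report ■ 5,208 bytes: ok')  ->  'Report  5,208 bytes: ok'
#
# Note that apply_corrections should run before clean_text, since clean_text
# would strip some of the mojibake characters (e.g. '€', '©') that the
# corrections table is keyed on.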
# --------------------------------
# Load the embedding model
# --------------------------------
def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    """Return a HuggingFace embedding model configured to emit normalized embeddings."""
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )
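
# Minimal usage sketch (assumes sentence-transformers is installed; pass
# device="cpu" on machines without a GPU):
#
#   embeddings = get_embeddings(device="cpu")
#   vec = embeddings.embed_query("hello world")  # all-MiniLM-L6-v2 -> 384-dim vector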
def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=16):
    """Embed documents in batches into a FAISS vector store, checkpointing
    periodically so a long run can be recovered after a failure."""
    if not documents:
        raise ValueError("No documents found. Check if documents are loaded correctly.")
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]
    # Split into batches
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
    print(f"Processing {len(batches)} batches with size {batch_size}")
    print(f"Initializing vector store with batch 1/{len(batches)}")
    # Use from_documents instead of from_texts (to prevent length issues)
    first_docs = [
        Document(page_content=text, metadata=meta)
        for text, meta in zip(batches[0], metadata_batches[0])
    ]
    vectorstore = FAISS.from_documents(first_docs, embeddings)
    # Add remaining batches
    for i in tqdm(range(1, len(batches)), desc="Processing batches"):
        try:
            docs_batch = [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(batches[i], metadata_batches[i])
            ]
            vectorstore.add_documents(docs_batch)
            # Checkpoint every 10 batches so progress survives a crash
            if i % 10 == 0:
                temp_save_path = f"{save_path}_temp"
                os.makedirs(os.path.dirname(temp_save_path) or '.', exist_ok=True)
                vectorstore.save_local(temp_save_path)
                print(f"Temporary vector store saved to {temp_save_path} after batch {i}")
        except Exception as e:
            print(f"Error processing batch {i}: {e}")
            # Save whatever has been indexed so far before re-raising
            error_save_path = f"{save_path}_error_at_batch_{i}"
            os.makedirs(os.path.dirname(error_save_path) or '.', exist_ok=True)
            vectorstore.save_local(error_save_path)
            print(f"Partial vector store saved to {error_save_path}")
            raise
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    vectorstore.save_local(save_path)
    print(f"Vector store saved to {save_path}")
    return vectorstore
def load_vector_store(embeddings, load_path="vector_db"):
    """Load a previously saved FAISS vector store from disk."""
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
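
# Retrieval sketch (the query text is illustrative; assumes a store was
# previously built with build_vector_store_batch above):
#
#   embeddings = get_embeddings(device="cpu")
#   vectorstore = load_vector_store(embeddings, "vector_db")
#   hits = vectorstore.similarity_search("refund policy", k=4)
#   for doc in hits:
#       print(doc.metadata.get("source"), doc.page_content[:80])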
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Builds a vector store")
    parser.add_argument("--folder", type=str, default="dataset", help="Path to the folder containing the documents")
    parser.add_argument("--save_path", type=str, default="vector_db", help="Path to save the vector store")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
    parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Name of the embedding model")
    parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use ('cuda' or 'cpu')")
    args = parser.parse_args()

    # Import the document processing module
    from document_processor import load_documents, split_documents

    # Load and split documents
    documents = load_documents(args.folder)
    chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)

    # Load the embedding model
    embeddings = get_embeddings(model_name=args.model_name, device=args.device)

    # Build the vector store
    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
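
# Example invocation (the script filename is assumed here; requires
# document_processor.py alongside this script):
#
#   python build_vector_store.py --folder dataset --save_path vector_db --device cpu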