File size: 5,208 Bytes
5f3b20a
 
 
2abe6e2
 
5f3b20a
2abe6e2
 
 
 
5f3b20a
2abe6e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757d2c1
5f3b20a
 
 
 
 
 
 
 
2abe6e2
5f3b20a
 
 
 
2abe6e2
5f3b20a
 
 
 
 
 
2abe6e2
5f3b20a
 
 
 
 
 
2abe6e2
5f3b20a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2abe6e2
5f3b20a
 
 
2abe6e2
 
 
 
757d2c1
2abe6e2
5f3b20a
 
 
2abe6e2
5f3b20a
 
2abe6e2
5f3b20a
 
 
2abe6e2
5f3b20a
 
2abe6e2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import argparse
import logging
import os
import re
import time
from collections import defaultdict

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# PyMuPDF library
# Optional dependency probe: try to import PyMuPDF and record the outcome in
# PYMUPDF_AVAILABLE, printing a status line either way.
PYMUPDF_AVAILABLE = True
try:
    import fitz  # PyMuPDF
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
else:
    print("✅ PyMuPDF library available")


# --------------------------------
# Log Output
# --------------------------------

def log(msg):
    """Print *msg* to stdout, prefixed with the current local time as ``[HH:MM:SS]``."""
    timestamp = time.strftime("%H:%M:%S")
    print(f"[{timestamp}] {msg}")

# --------------------------------
# Text Cleaning Function
# --------------------------------

# Matches every character that is NOT one of: Hangul syllables (U+AC00-U+D7A3),
# Hangul Jamo (U+1100-U+11FF), Hangul Compatibility Jamo (U+3130-U+318F),
# word characters, whitespace, or the punctuation . , ! ? " ' ( ) $ : -
# Compiled once at import time so repeated calls do not re-resolve the pattern.
_DISALLOWED_CHARS = re.compile(
    r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]"
)


def clean_text(text: str) -> str:
    """Return *text* with every character outside the allowed set removed.

    Allowed characters are Hangul, word characters (``\\w``), whitespace and
    the punctuation ``. , ! ? " ' ( ) $ : -``. Everything else (for example
    ``@``, ``#``, ``<``, ``>``) is deleted, not replaced.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    return _DISALLOWED_CHARS.sub("", text)

def apply_corrections(text):
    """Replace a fixed list of mis-encoded character sequences in *text*.

    The substitutions run one after another in the order listed, so the
    two-character sequence 'º©' is rewritten before the bare '©' is stripped.
    Returns the corrected string.
    """
    substitutions = (
        ('º©', 'info'),
        ('Ì', 'of'),
        ('½', 'operation'),
        ('Ã', ''),
        ('©', ''),
        ('’', "'"),
        ('“', '"'),
        ('â€', '"'),
    )
    fixed = text
    for garbled, replacement in substitutions:
        fixed = fixed.replace(garbled, replacement)
    return fixed

# --------------------------------
# Load the embedding model
# --------------------------------

def get_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
    """Create a ``HuggingFaceEmbeddings`` instance for the given model.

    Args:
        model_name: Name of the embedding model to load.
        device: Device string forwarded to the model via ``model_kwargs``
            (defaults to ``"cuda"``).

    Returns:
        The configured ``HuggingFaceEmbeddings`` object. Encoding is requested
        with ``normalize_embeddings=True``.
    """
    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

def _ensure_parent_dir(path):
    """Create the parent directory of *path* (the current directory if it has none)."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)


def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=16):
    """Embed *documents* in batches into a FAISS vector store and save it.

    The first batch initializes the store through ``FAISS.from_documents``;
    each remaining batch is appended with ``add_documents``. After every 10th
    batch a checkpoint is written to ``f"{save_path}_temp"``. If a batch fails,
    the store built so far is written to ``f"{save_path}_error_at_batch_{i}"``
    (best effort) and the original exception is re-raised.

    Args:
        documents: Objects exposing ``page_content`` and ``metadata``.
        embeddings: Embedding model handed to FAISS.
        save_path: Directory path where the finished store is saved.
        batch_size: Number of documents per batch; must be at least 1.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If *documents* is empty or *batch_size* is smaller than 1.
    """
    documents = list(documents) if documents else []
    if not documents:
        raise ValueError("No documents found. Check if documents are loaded correctly.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # The file never imports tqdm, so the bare `tqdm(...)` call used to raise
    # NameError on every run. Import it here and degrade to plain iteration
    # when the package is not installed.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    batch_starts = range(0, len(documents), batch_size)
    total_batches = len(batch_starts)

    def plain_batch(start):
        # Rebuild plain Document objects from content + metadata only.
        return [
            Document(page_content=doc.page_content, metadata=doc.metadata)
            for doc in documents[start:start + batch_size]
        ]

    print(f"Processing {total_batches} batches with size {batch_size}")
    print(f"Initializing vector store with batch 1/{total_batches}")

    # Use from_documents instead of from_texts (to prevent length issues)
    vectorstore = FAISS.from_documents(plain_batch(0), embeddings)

    # Add remaining batches
    for i in progress(range(1, total_batches), desc="Processing batches"):
        try:
            vectorstore.add_documents(plain_batch(batch_starts[i]))

            if i % 10 == 0:
                temp_save_path = f"{save_path}_temp"
                _ensure_parent_dir(temp_save_path)
                vectorstore.save_local(temp_save_path)
                print(f"Temporary vector store saved to {temp_save_path} after batch {i}")

        except Exception as e:
            print(f"Error processing batch {i}: {e}")
            error_save_path = f"{save_path}_error_at_batch_{i}"
            # Saving the partial store is best effort: a failure here must not
            # hide the batch error that brought us into this handler.
            try:
                _ensure_parent_dir(error_save_path)
                vectorstore.save_local(error_save_path)
                print(f"Partial vector store saved to {error_save_path}")
            except Exception as save_error:
                print(f"Could not save partial vector store to {error_save_path}: {save_error}")
            raise

    _ensure_parent_dir(save_path)
    vectorstore.save_local(save_path)
    print(f"Vector store saved to {save_path}")

    return vectorstore

def load_vector_store(embeddings, load_path="vector_db"):
    """Load a previously saved FAISS vector store from *load_path*.

    Args:
        embeddings: Embedding model passed to ``FAISS.load_local``.
        load_path: Path of the saved store.

    Returns:
        The loaded FAISS vector store.

    Raises:
        FileNotFoundError: If *load_path* does not exist.

    NOTE(review): ``allow_dangerous_deserialization=True`` opts in to
    pickle-based loading; only point this at stores from a trusted source.
    """
    if os.path.exists(load_path):
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")

if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs.
    cli_options = (
        ("--folder", dict(type=str, default="dataset", help="Path to the folder containing the documents")),
        ("--save_path", dict(type=str, default="vector_db", help="Path to save the vector store")),
        ("--batch_size", dict(type=int, default=16, help="Batch size")),
        ("--model_name", dict(type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Name of the embedding model")),
        ("--device", dict(type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use ('cuda' or 'cpu')")),
    )
    parser = argparse.ArgumentParser(description="Builds a vector store")
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # The document processing helpers are imported only when run as a script.
    from document_processor import load_documents, split_documents

    # Pipeline: load documents -> split into chunks -> load embedding model
    # -> build and save the vector store.
    chunks = split_documents(
        load_documents(args.folder),
        chunk_size=800,
        chunk_overlap=100,
    )
    embeddings = get_embeddings(model_name=args.model_name, device=args.device)
    build_vector_store_batch(
        chunks,
        embeddings,
        save_path=args.save_path,
        batch_size=args.batch_size,
    )