File size: 2,793 Bytes
07524cb
 
d883f65
07524cb
 
 
 
 
5402334
 
07524cb
5402334
07524cb
 
5402334
d883f65
a1f5731
 
d883f65
5402334
a1f5731
d883f65
a1f5731
5402334
 
a1f5731
 
5402334
 
 
 
 
 
 
 
a1f5731
 
 
5402334
 
a1f5731
d883f65
5402334
a1f5731
5402334
a1f5731
d883f65
5402334
d883f65
 
5402334
a1f5731
 
5402334
 
 
 
 
 
 
d883f65
5402334
 
 
d883f65
5402334
a1f5731
a965de4
5402334
d883f65
07524cb
5402334
07524cb
5402334
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import ray
import logging
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Initialize Ray (idempotent thanks to ignore_reinit_error).
# NOTE(review): Ray is initialized here but never used for parallel work in
# this script — confirm it is actually required before keeping the dependency.
ray.init(ignore_reinit_error=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Paths of the two files that FAISS.save_local()/load_local() read and write.
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')

# Ensure index directory exists
os.makedirs(index_directory, exist_ok=True)

# Load documents
logging.info("πŸ“ Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
documents = loader.load()

# Abort early when there is nothing to index.
# Fix: use `raise SystemExit(1)` instead of bare `exit()` — exit() is injected
# by the `site` module (not guaranteed in every runtime) and exits with status
# 0, which would hide this failure from shell scripts and CI.
if not documents:
    logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    raise SystemExit(1)

# Split documents into chunks.
# NOTE(review): splitting and the embedding-model load below also run when an
# existing index is later loaded from disk, even though they are only needed
# on the create path — consider deferring them into create_faiss_index().
logging.info("βœ‚οΈ Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Load the InLegalBERT embedding model
logging.info("πŸ“¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
# Build and persist a fresh FAISS index.
def create_faiss_index():
    """Embed the pre-split `texts` with the module-level `embeddings`,
    persist the resulting FAISS index under `index_directory`, and return it.
    """
    logging.info("βš™οΈ Creating new FAISS index...")
    new_index = FAISS.from_documents(texts, embeddings)
    new_index.save_local(index_directory)
    logging.info("βœ… FAISS index saved in '%s'.", index_directory)
    return new_index

# Load the persisted index, falling back to a rebuild when missing or corrupt.
def load_or_create_faiss_index():
    """Return the FAISS index.

    Loads it from disk when both index files exist; if they are absent or
    deserialization fails for any reason, rebuilds via create_faiss_index().
    """
    files_present = os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl)
    if not files_present:
        logging.info("❌ FAISS index files not found. Creating new index...")
        return create_faiss_index()

    logging.info("πŸ“‚ Loading existing FAISS index...")
    try:
        # allow_dangerous_deserialization: the pickle was written by this
        # script, so it is trusted local data.
        loaded = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        # Best-effort fallback: a corrupt/incompatible index is recreated
        # rather than crashing the pipeline.
        logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", str(e))
        return create_faiss_index()

    logging.info("βœ… FAISS index loaded successfully.")
    return loaded

# Entry point of the indexing flow: load the persisted index or build it anew.
faiss_db = load_or_create_faiss_index()

# Optional: if you want to use the retriever later
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Release Ray resources now that indexing is done.
# NOTE(review): Ray was initialized but never used for parallel work above —
# confirm whether the ray dependency can be dropped entirely.
ray.shutdown()
logging.info("βœ… Indexing process completed successfully.")