File size: 2,793 Bytes
07524cb
 
d883f65
07524cb
 
 
 
 
5402334
 
07524cb
5402334
07524cb
 
5402334
d883f65
a1f5731
 
d883f65
5402334
a1f5731
d883f65
a1f5731
5402334
 
a1f5731
 
5402334
 
 
 
 
 
 
 
a1f5731
 
 
5402334
 
a1f5731
d883f65
5402334
a1f5731
5402334
a1f5731
d883f65
5402334
d883f65
 
5402334
a1f5731
 
5402334
 
 
 
 
 
 
d883f65
5402334
 
 
d883f65
5402334
a1f5731
a965de4
5402334
d883f65
07524cb
5402334
07524cb
5402334
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import ray
import logging
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Initialize Ray (idempotent thanks to ignore_reinit_error).
# NOTE(review): Ray is initialized here but never used for parallel work in
# this script — confirm it is actually required before keeping the dependency.
ray.init(ignore_reinit_error=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Paths of the two files that FAISS.save_local()/load_local() read and write.
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')

# Ensure index directory exists
os.makedirs(index_directory, exist_ok=True)

# Load documents
logging.info("πŸ“ Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
documents = loader.load()

# Abort early when there is nothing to index.
# Fix: use `raise SystemExit(1)` instead of bare `exit()` — exit() is injected
# by the `site` module (not guaranteed in every runtime) and exits with status
# 0, which would hide this failure from shell scripts and CI.
if not documents:
    logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    raise SystemExit(1)

# Split documents into chunks.
# NOTE(review): splitting and the embedding-model load below also run when an
# existing index is later loaded from disk, even though they are only needed
# on the create path — consider deferring them into create_faiss_index().
logging.info("βœ‚οΈ Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Load the InLegalBERT embedding model
logging.info("πŸ“¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
# Build and persist a fresh FAISS index.
def create_faiss_index():
    """Embed the pre-split `texts` with the module-level `embeddings`,
    persist the resulting FAISS index under `index_directory`, and return it.
    """
    logging.info("βš™οΈ Creating new FAISS index...")
    new_index = FAISS.from_documents(texts, embeddings)
    new_index.save_local(index_directory)
    logging.info("βœ… FAISS index saved in '%s'.", index_directory)
    return new_index

# Load the persisted index, falling back to a rebuild when missing or corrupt.
def load_or_create_faiss_index():
    """Return the FAISS index.

    Loads it from disk when both index files exist; if they are absent or
    deserialization fails for any reason, rebuilds via create_faiss_index().
    """
    files_present = os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl)
    if not files_present:
        logging.info("❌ FAISS index files not found. Creating new index...")
        return create_faiss_index()

    logging.info("πŸ“‚ Loading existing FAISS index...")
    try:
        # allow_dangerous_deserialization: the pickle was written by this
        # script, so it is trusted local data.
        loaded = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        # Best-effort fallback: a corrupt/incompatible index is recreated
        # rather than crashing the pipeline.
        logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", str(e))
        return create_faiss_index()

    logging.info("βœ… FAISS index loaded successfully.")
    return loaded

# Entry point of the indexing flow: load the persisted index or build it anew.
faiss_db = load_or_create_faiss_index()

# Optional: if you want to use the retriever later
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Release Ray resources now that indexing is done.
# NOTE(review): Ray was initialized but never used for parallel work above —
# confirm whether the ray dependency can be dropped entirely.
ray.shutdown()
logging.info("βœ… Indexing process completed successfully.")