File size: 3,111 Bytes
d07fe0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
from dotenv import load_dotenv
import sys

from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Load API Key
# load_dotenv() reads a local .env file (if any) into the process environment,
# so the key can come from .env or from an already-exported variable.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

# Fail fast on a missing or empty key instead of failing later in the run.
if not api_key:
    raise ValueError("Google API Key not found. Please set it in your .env file.")

# Path to Constitution documents
data_path = "data"  # Put Constitution of India and related PDFs here
# isdir rather than exists: a regular file that happens to be called "data"
# must be rejected here, otherwise listing the directory afterwards fails
# with an unrelated NotADirectoryError.
if not os.path.isdir(data_path):
    raise FileNotFoundError(f"Directory '{data_path}' not found. Place your documents there.")

# Echo where we are looking; data_path is relative to the working directory.
print(f"Current working directory: {os.getcwd()}")
print(f"Looking for Constitution documents in: {os.path.abspath(data_path)}")

# Load PDFs
# Only entries directly inside data_path are considered (no recursion); the
# extension match is case-insensitive.  The listing is sorted because
# os.listdir() returns names in arbitrary order, and a stable order keeps the
# sequence of loaded pages reproducible from run to run.
documents = []
pdf_files = sorted(f for f in os.listdir(data_path) if f.lower().endswith('.pdf'))
print(f"Found {len(pdf_files)} PDF files")

for pdf_file in pdf_files:
    pdf_path = os.path.join(data_path, pdf_file)
    print(f"Loading PDF: {pdf_path}")
    try:
        # Keep the try body to the one step that can realistically fail.
        pdf_docs = PyPDFLoader(pdf_path).load()
    except Exception as e:
        # Best effort: a single unreadable PDF must not abort the whole run.
        print(f"Error loading {pdf_file}: {e}")
        continue
    print(f"  - Loaded {len(pdf_docs)} pages from {pdf_file}")
    documents.extend(pdf_docs)

# Fallback if no documents loaded
# When the per-file pass produced nothing, retry once with DirectoryLoader;
# if that also yields nothing, stop with a non-zero exit status.
if not documents:
    print("No documents were loaded. Trying DirectoryLoader as fallback...")
    try:
        # The "**/*.pdf" pattern also matches PDFs in subdirectories of data_path.
        documents = DirectoryLoader(data_path, glob="**/*.pdf").load()
        print(f"Loaded {len(documents)} documents with DirectoryLoader")
    except Exception as fallback_error:
        # Leave `documents` empty; the check below turns that into an exit.
        print(f"DirectoryLoader failed: {fallback_error}")

    # Still empty after the fallback: nothing to index.
    if not documents:
        print("ERROR: No Constitution documents loaded. Exiting.")
        sys.exit(1)

# Split documents
# Break every loaded document into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (left at its default here).
print(f"Splitting {len(documents)} documents into chunks...")
docs = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
).split_documents(documents)
print(f"Split into {len(docs)} chunks")

# No chunks means there is nothing to embed; exit with a failure status.
if not docs:
    print("ERROR: No text chunks created after splitting.")
    sys.exit(1)

# Create embeddings & FAISS index
# Model setup, a one-off probe embedding, index construction and saving all
# run inside a single try so that any failure is reported with a traceback
# and ends the script with exit status 1.
try:
    print("Initializing embedding model...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Embed one short query first so problems with the embedding call show up
    # before every chunk is submitted.
    print("Testing embedding generation...")
    probe_length = len(embeddings.embed_query("Constitution of India test"))
    print(f"Test embedding successful, vector length: {probe_length}")

    chunk_total = len(docs)
    print(f"Creating FAISS index for {chunk_total} chunks...")
    faiss_index = FAISS.from_documents(docs, embeddings)

    # Make sure the target folder exists, then write the index into it.
    index_path = "vector_store/faiss_index_constitution"
    os.makedirs(index_path, exist_ok=True)
    faiss_index.save_local(index_path)

    print("✅ Constitution FAISS Index successfully created and saved!")
except Exception as index_error:
    # Imported lazily: only needed on the failure path.
    import traceback

    print(f"ERROR during embedding or indexing: {index_error}")
    traceback.print_exc()
    sys.exit(1)