File size: 2,253 Bytes
e306edb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import ray
import logging
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity

# Pipeline: load the *.txt files under ./data, split them into overlapping
# chunks, embed every chunk and persist the resulting FAISS vector store to
# ./ipc_embed_db.
from functools import lru_cache

from langchain_community.docstore.in_memory import InMemoryDocstore

# Name of the Hugging Face model used for every embedding in this script.
EMBEDDING_MODEL_NAME = "law-ai/InLegalBERT"


@lru_cache(maxsize=1)
def _get_embeddings_model():
    """Return the shared HuggingFaceEmbeddings instance, creating it on first use.

    Cached so the model is constructed once for the whole run instead of once
    per embedded text.
    """
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


# Define embedding function
def embedding_function(text):
    """Embed a single string with the shared embedding model.

    Args:
        text: The string to embed.

    Returns:
        The result of ``HuggingFaceEmbeddings.embed_query(text)``, i.e. the
        embedding vector for ``text``.
    """
    return _get_embeddings_model().embed_query(text)


# Initialize Ray
# NOTE(review): nothing in this script submits work to Ray; init/shutdown are
# kept only so the script's existing side effects stay the same.
ray.init()

try:
    # Set up basic configuration for logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Load documents with logging
    logging.info("Loading documents...")
    loader = DirectoryLoader('data', glob="./*.txt")
    documents = loader.load()
    logging.info("Loaded %d document(s).", len(documents))

    # Extract text from documents and split into manageable texts with logging
    logging.info("Extracting and splitting texts from documents...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    # LangChain documents carry their text in `page_content`; split_documents
    # reads it and copies each source document's metadata onto its chunks.
    chunks = text_splitter.split_documents(documents)
    texts = [chunk.page_content for chunk in chunks]
    logging.info("Produced %d chunk(s).", len(chunks))

    # Create FAISS index for embeddings. The dimension is taken from an actual
    # embedding so it always matches the configured model.
    embedding_dimension = len(embedding_function("dimension probe"))
    index = IndexFlatL2(embedding_dimension)

    # Start with an empty docstore and id mapping; the vector store fills both
    # as chunks are added, keeping them in sync with the index.
    docstore = InMemoryDocstore({})
    index_to_docstore_id = {}

    # Initialize FAISS
    faiss_db = FAISS(
        embedding_function=_get_embeddings_model(),
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
    )

    # Process and store embeddings
    logging.info("Storing embeddings in FAISS...")
    if chunks:
        # add_documents embeds the chunk texts itself and stores the chunks
        # (text + metadata) alongside their vectors.
        faiss_db.add_documents(chunks)
    else:
        logging.warning("No text chunks were produced; the exported database will be empty.")

    # Exporting the vector embeddings database with logging
    logging.info("Exporting the vector embeddings database...")
    faiss_db.save_local("ipc_embed_db")

    # Log a message to indicate the completion of the process
    logging.info("Process completed successfully.")
finally:
    # Shutdown Ray after the process, even if a step above failed
    ray.shutdown()