File size: 5,721 Bytes
38eb958 08df214 e96a966 a5a31ff e96a966 3a757d8 e96a966 3a757d8 e96a966 08df214 e96a966 08df214 e96a966 08df214 ac89d45 e96a966 ac89d45 e96a966 ac89d45 e96a966 3a757d8 bd4e7fa e96a966 3a757d8 a5a31ff 3a757d8 bd4e7fa 3a757d8 a5a31ff 3a757d8 e96a966 a5a31ff e96a966 bd4e7fa 38eb958 e96a966 3a757d8 bd4e7fa a5a31ff 3a757d8 a5a31ff bd4e7fa 3a757d8 bd4e7fa a5a31ff 3a757d8 bd4e7fa e96a966 3a757d8 e96a966 a5a31ff e96a966 bd4e7fa 2b2ba3d 3a757d8 2b2ba3d 3a757d8 2b2ba3d 38eb958 2b2ba3d 3a757d8 38eb958 3a757d8 2b2ba3d 3a757d8 2b2ba3d 7bf713e 2b2ba3d 3a757d8 2b2ba3d 62169f2 3a757d8 e96a966 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# vector_store.py - Vector store integration with Pinecone
"""
Vector store integration for legal document embeddings using InLegalBERT and Pinecone
"""
import os
import numpy as np
from typing import List, Dict, Any
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
class InLegalBERTEmbeddings(Embeddings):
    """Adapter that exposes an InLegalBERT encoder through LangChain's Embeddings API."""

    def __init__(self, model):
        # `model` is used only through `encode(list_of_texts)`, whose result
        # must support indexing and `.tolist()`.
        self.model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per text in ``texts``."""
        encoded_batch = self.model.encode(texts)
        return encoded_batch.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for a single query string."""
        # The encoder is called with a one-element batch; only its first row is kept.
        encoded_batch = self.model.encode([text])
        query_vector = encoded_batch[0]
        return query_vector.tolist()
class LegalDocumentVectorStore:
    """Manage Pinecone-backed vector storage for legal document chunks.

    The Pinecone client is created lazily by ``_initialize_pinecone``, so
    constructing an instance performs no network I/O and does not need the
    API key to be set.
    """

    # Number of records sent per upsert request. Pinecone limits the size of
    # a single upsert request, and every record here also carries its chunk
    # text as metadata, so large documents are written in modest batches.
    UPSERT_BATCH_SIZE = 100

    def __init__(self):
        self.index_name = 'legal-documents'
        self.dimension = 768  # InLegalBERT dimension
        self._initialized = False
        self.clause_tagger = None
        self.pc = None

    def _initialize_pinecone(self):
        """Connect to Pinecone and create the index if it does not exist yet.

        Returns immediately when a previous call has already succeeded.

        Raises:
            ValueError: If ``PINECONE_API_KEY`` is not set in the environment.
        """
        if self._initialized:
            return

        api_key = os.getenv('PINECONE_API_KEY')
        if not api_key:
            raise ValueError("PINECONE_API_KEY environment variable not set")

        # Imported here (not at module level) and used through the
        # client-object API: Pinecone(...) / ServerlessSpec(...).
        from pinecone import Pinecone, ServerlessSpec

        self.pc = Pinecone(api_key=api_key)

        # Create the index only if it doesn't exist
        existing_indexes = [index_info["name"] for index_info in self.pc.list_indexes()]
        if self.index_name not in existing_indexes:
            self.pc.create_index(
                name=self.index_name,
                dimension=self.dimension,
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1'),
            )
            print(f"✅ Created Pinecone index: {self.index_name}")

        self._initialized = True

    def _normalize_embedding(self, embedding):
        """Return ``embedding`` as a flat list of built-in Python floats.

        Args:
            embedding: ``None``, a list/tuple of numbers, a NumPy array, or any
                object offering ``tolist()`` (e.g. a tensor) or iteration.

        Returns:
            ``None`` when ``embedding`` is ``None``; otherwise a new list in
            which every element has been converted with ``float()``.

        Raises:
            TypeError, ValueError: If an element cannot be converted to a
                float (for example when the input is nested / 2-D).
        """
        if embedding is None:
            return None

        if isinstance(embedding, (list, tuple)):
            values = embedding
        else:
            # NumPy arrays, tensors and similar expose tolist(); anything else
            # is treated as a plain iterable.
            try:
                values = embedding.tolist()
            except (AttributeError, TypeError):
                values = list(embedding)

        # A list may still hold NumPy scalars or ints; coerce every element so
        # the result really is a list of plain floats.
        return [float(value) for value in values]

    def _build_vectors(self, chunk_data, document_id, analysis_results):
        """Build the upsert records for every chunk that has an embedding.

        Pure helper: performs no I/O, so it can run before Pinecone is touched.

        Args:
            chunk_data: Chunk dicts; each is read for ``"embedding"`` and
                ``"text"``. Chunks whose embedding is ``None`` are skipped.
            document_id: Identifier used in record ids and metadata.
            analysis_results: Dict read for ``"key_clauses"`` and
                ``"risky_terms"``; ``None`` is treated as empty.

        Returns:
            A list of ``{"id", "values", "metadata"}`` dicts.
        """
        chunk_data = chunk_data or []
        analysis_results = analysis_results or {}

        # Document-level metadata is identical for every chunk: compute it
        # once so all records of one save share the same values and timestamp.
        has_key_clauses = len(analysis_results.get('key_clauses') or []) > 0
        risk_count = len(analysis_results.get('risky_terms') or [])
        timestamp = str(np.datetime64('now'))
        total_chunks = len(chunk_data)

        valid_chunks = [c for c in chunk_data if c.get("embedding") is not None]

        vectors = []
        # chunk_index counts only chunks that have an embedding, which keeps
        # record ids identical to those produced by earlier saves.
        for i, chunk_info in enumerate(valid_chunks):
            metadata = {
                'document_id': document_id,
                'chunk_index': i,
                'total_chunks': total_chunks,
                'source': 'legal_document',
                'has_key_clauses': has_key_clauses,
                'risk_count': risk_count,
                'embedding_model': 'InLegalBERT',
                'timestamp': timestamp,
                # Text is stored in metadata for retrieval; a chunk without
                # text gets an empty string instead of aborting the whole save.
                'text': chunk_info.get("text") or "",
            }
            vectors.append({
                "id": f"{document_id}_chunk_{i}",
                "values": self._normalize_embedding(chunk_info["embedding"]),
                "metadata": metadata,
            })
        return vectors

    def save_document_embeddings_optimized(
        self,
        chunk_data: List[Dict],
        document_id: str,
        analysis_results: Dict[str, Any]
    ) -> bool:
        """Save pre-computed chunk embeddings, with their text, to Pinecone.

        Args:
            chunk_data: Chunk dicts carrying ``"embedding"`` and ``"text"``.
            document_id: Identifier of the document the chunks belong to.
            analysis_results: Analysis output used for document-level metadata.

        Returns:
            ``True`` when every batch was upserted. ``False`` when there was
            nothing to save or any error occurred (the error is printed, not
            raised). Record ids are deterministic, so a retry after a partial
            failure overwrites the records that were already written.
        """
        try:
            # Build records first: empty or invalid input is rejected without
            # opening a Pinecone connection.
            vectors = self._build_vectors(chunk_data, document_id, analysis_results)
            if not vectors:
                print("⚠️ No embeddings found in chunk_data")
                return False

            self._initialize_pinecone()
            index = self.pc.Index(self.index_name)

            # Upsert in batches so large documents stay within Pinecone's
            # per-request limits.
            batch_size = self.UPSERT_BATCH_SIZE
            for start in range(0, len(vectors), batch_size):
                index.upsert(vectors=vectors[start:start + batch_size])

            print(f"✅ Saved {len(vectors)} pre-computed embeddings with text to Pinecone")
            return True
        except Exception as e:
            print(f"❌ Error saving pre-computed embeddings: {e}")
            return False

    def get_retriever(self, clause_tagger, document_id: str = None, k: int = 10):
        """Build a similarity retriever over the index for chat functionality.

        Args:
            clause_tagger: Object whose ``embedding_model`` attribute is the
                encoder used to embed queries.
            document_id: When given, results are restricted to vectors whose
                ``document_id`` metadata equals this value.
            k: Number of results requested per query (default 10).

        Returns:
            The retriever, or ``None`` if it could not be created (the error
            is printed, not raised).
        """
        try:
            self._initialize_pinecone()

            legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
            index = self.pc.Index(self.index_name)
            vectorstore = PineconeVectorStore(
                index=index,
                embedding=legal_embeddings,
                text_key="text",  # Use text stored in metadata
            )

            # Configure search parameters
            search_kwargs = {'k': k}
            if document_id:
                search_kwargs['filter'] = {'document_id': document_id}

            return vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs=search_kwargs,
            )
        except Exception as e:
            print(f"❌ Error creating retriever: {e}")
            return None
# Global instance, created when this module is imported. Construction only
# sets attributes; the Pinecone connection is opened later, on first use.
vector_store = LegalDocumentVectorStore()
|