Spaces:

HF-Pawan
/

Automated-Signature-Analysis-Docker

Running

Automated-Signature-Analysis-Docker / common /rag /embeddings.py

anyonehomep1mane

Initial Changes

5637ddb 4 months ago

3.58 kB

	# Suppress every warning before the imports below run, so that warnings
	# emitted while those libraries are being imported are silenced as well.
	# The ordering (filter first, heavy imports second) is therefore deliberate.
	import warnings
	warnings.filterwarnings(action='ignore')
	import torch
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings
	from common.rag.document_loader import fetch_document_chunks
	from dotenv import load_dotenv
	# Load environment variables from a .env file at import time.
	# NOTE(review): nothing in this module reads those variables directly;
	# presumably they are consumed by the imported libraries or by other modules.
	load_dotenv()


	def fetch_vectorstore_retriever():
	    """
	    Build a FAISS similarity retriever over the chunks from fetch_document_chunks().

	    Steps performed on every call:
	    - Instantiate the sentence-transformers/all-MiniLM-L6-v2 embedding model,
	      on CUDA when torch reports it as available, otherwise on CPU.
	    - Index the chunks returned by fetch_document_chunks() in a new FAISS store.
	    - Wrap the store in a retriever that returns the 10 most similar chunks.

	    Returns
	    -------
	    retriever
	        The object produced by ``vectorstore.as_retriever(...)``; it can be
	        queried with ``.invoke(query)``.

	    Notes
	    -----
	    - Nothing is cached or persisted: the model is loaded and the index is
	      rebuilt from scratch on each call, which can be slow for large corpora.
	    - Embeddings are requested with normalize_embeddings=True.
	    """
	    # Pick the compute device once, up front.
	    if torch.cuda.is_available():
	        target_device = 'cuda'
	    else:
	        target_device = 'cpu'

	    # The embedding model is created before the documents are fetched,
	    # matching the original evaluation order.
	    embedding_model = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2",
	        model_kwargs={'device': target_device},
	        encode_kwargs={'normalize_embeddings': True},
	    )

	    chunks = fetch_document_chunks()
	    index = FAISS.from_documents(documents=chunks, embedding=embedding_model)

	    return index.as_retriever(search_type="similarity", search_kwargs={"k": 10})


	def fetch_relevant_document(topic="None"):
	    """
	    Retrieve relevant document chunks for graphological analysis of a topic/trait.

	    Builds a structured query around ``topic``, runs it through the retriever
	    returned by fetch_vectorstore_retriever(), and formats the retrieved chunks
	    into a single context string.

	    Parameters
	    ----------
	    topic : str, default="None"
	        Personality trait, psychological characteristic, writing style aspect or
	        any topic for which handwriting analysis information is requested.
	        Examples: "ambition", "emotional stability", "aggressiveness", "introversion"
	        The string "None" (the default), ``None`` itself, and blank strings all
	        mean "no topic".

	    Returns
	    -------
	    str
	        The retrieved chunks joined by blank lines, each prefixed with
	        "[Document N]" (N starting at 1). Returns an empty string when no topic
	        is given or when the retriever yields no chunks.

	    Notes
	    -----
	    - When no topic is given the function returns before the retriever is
	      built, so no embedding model is loaded and no index is created.
	    - The number of chunks is whatever the retriever is configured to return.
	    - The returned context is meant to be passed into a RAG prompt.
	    """
	    # Honor the documented contract: without a real topic there is nothing to
	    # search for. Returning here also skips the expensive model load and index
	    # build that the retriever factory performs.
	    normalized_topic = "" if topic is None else str(topic).strip()
	    if not normalized_topic or normalized_topic == "None":
	        return ""

	    retriever = fetch_vectorstore_retriever()
	    query = (
	        f"Handwriting sample analysis for: {topic}\n"
	        "Extract and summarize: \n"
	        "- Observed writing style characteristics (slant, pressure, size, speed, spacing, margins, baseline, letter forms, connections, etc.)\n"
	        "- Graphological interpretations of personality traits linked to those features\n"
	        "- Overall psychological or personality impression"
	    )
	    docs = retriever.invoke(query)

	    # An empty result list joins to "", which is the "no relevant chunks" case.
	    return "\n\n".join(
	        f"[Document {position}]\n{doc.page_content}\n"
	        for position, doc in enumerate(docs, start=1)
	    )