Help decoding retrieved vector values to "see" what a chunk looks like in natural language
Hi, I am playing around with this embedding model so I can get a better feel for how to tweak retrieval. My plan was to embed some cleaned HTML from a SQLite database into a Pinecone index and then query against that index. I saved a "window" of the text preceding and following each chunk in the metadata because I plan to use this metadata for integration with an LLM in the future.
But for now, I also want to see the retrieved chunk itself so I can see how it matches my query. Yes, I can deduce this from my chunk size and metadata values, but I still want to "see" my retrieved chunk. To do so, I would have to decode the retrieved vector values. Has anybody had much success decoding vector values independently of passing them along to an LLM for processing?
For reference, here is my code below. It is only for querying pinecone - it does not include my initial data upserting code. It is working well but lacking a decoding function for me to see my retrieved vectors in natural language.
!pip install transformers pinecone-client
!pip install flash-attn --no-build-isolation
import os
import getpass
import sqlite3
from transformers import AutoModel
from pinecone import Pinecone
from typing import List, Dict
# Model identifier handed to AutoModel.from_pretrained below.
model_DIR = 'jinaai/jina-embeddings-v3'
# Load the embedding model once at module level so encode_query() can reuse it.
print("Initializing model...")
# trust_remote_code=True allows transformers to run code shipped with the model
# repository. .cuda() moves the model to the GPU (a CUDA device is required) and
# .eval() switches it to evaluation mode.
# NOTE(review): use_flash_attn=False is presumably consumed by the model's own
# remote code to disable flash attention -- confirm against the model card.
model = AutoModel.from_pretrained(model_DIR, trust_remote_code=True, use_flash_attn=False).cuda().eval()
print("Model initialized.")
# Prompt for the Pinecone API key with getpass so it is not echoed to the screen.
pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ")
# NOTE(review): pinecone_environment is not referenced anywhere else in the
# visible code -- confirm whether it is still needed.
pinecone_environment = "us-east-1"
# Create the Pinecone client from the API key.
pc = Pinecone(api_key=pinecone_api_key)
# Handle to the Pinecone index that query_pinecone() searches.
# NOTE(review): "name" looks like a placeholder for the real index name.
index_name = "name"
index = pc.Index(index_name)
# SQLite settings only: the database file and the table queried for URLs.
# The connection itself is opened further down, using db_name.
db_name = 'db_sample.sqlite'
sqlite_table_name = "table_name"
def encode_query(query: str) -> List[float]:
    """Embed a single query string with the module-level embedding model.

    The query is wrapped in a one-element list and encoded with the
    ``"retrieval.query"`` task; the first (only) embedding is returned as a
    plain Python list via ``.tolist()``.
    """
    batch = model.encode([query], task="retrieval.query")
    query_embedding = batch[0]
    return query_embedding.tolist()
def query_pinecone(query_vector: List[float], top_k: int = 5) -> List[Dict]:
    """Search the module-level Pinecone index with ``query_vector``.

    Requests the ``top_k`` nearest matches with their metadata included and
    returns the ``matches`` attribute of the query response.
    """
    return index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    ).matches
# Open the SQLite connection used for URL lookups. Only the connect call sits
# in the try body so that unrelated failures are not reported as connection
# errors.
try:
    conn = sqlite3.connect(db_name)
except sqlite3.Error as e:
    print(f"Error connecting to SQLite database: {e}")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the site module for interactive sessions and is not
    # guaranteed to exist (e.g. under ``python -S``).
    raise SystemExit(1)
else:
    print(f"Successfully connected to SQLite database: {db_name}")
def get_url_from_sqlite(parent_hash: str, conn: sqlite3.Connection, table_name: str) -> str:
    """Look up the URL stored for ``parent_hash`` in a SQLite table.

    Args:
        parent_hash: Value matched against the table's ``hash`` column.
        conn: SQLite connection to run the query on.
        table_name: Table to read from. It is interpolated into the SQL text
            because identifiers cannot be bound as parameters, so it must come
            from trusted configuration and never from user input.

    Returns:
        The ``url`` value of the first matching row. ``"URL not found"`` is
        returned when no row matches or the stored URL is NULL, and
        ``"Error retrieving URL"`` when SQLite raises an error (the error is
        also printed).
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(f"SELECT url FROM {table_name} WHERE hash = ?", (parent_hash,))
        row = cursor.fetchone()
    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        return "Error retrieving URL"
    finally:
        # Release the cursor on every path; it stays None when conn.cursor()
        # itself failed (for example on a closed connection).
        if cursor is not None:
            cursor.close()
    # A NULL url would otherwise leak None through a function annotated -> str.
    if row is None or row[0] is None:
        return "URL not found"
    return row[0]
def display_results(results: List[Dict], conn: sqlite3.Connection, table_name: str):
    """Print every match as a numbered entry with score, content and URL.

    For each match the ``'window'`` metadata value is printed as the content,
    and the ``'parent_hash'`` metadata value is resolved to a URL through
    ``get_url_from_sqlite`` using ``conn`` and ``table_name``.
    """
    divider = "-" * 50
    print("\nQuery Results:", "=" * 50, sep="\n")
    for rank, match in enumerate(results, start=1):
        print(f"Result {rank}:")
        print(f"Score: {match['score']:.4f}")
        metadata = match['metadata']
        # The stored text window is what gets shown as the match's content.
        print(f"Content: {metadata.get('window', 'No content available')}")
        # A missing parent hash falls back to '' before the SQLite lookup.
        source_url = get_url_from_sqlite(metadata.get('parent_hash', ''), conn, table_name)
        print(f"URL: {source_url}")
        print(divider)
def main(conn: sqlite3.Connection, table_name: str):
    """Run the interactive query loop until the user types 'quit'.

    Each query is embedded, searched in Pinecone, and the matches are printed
    together with URLs looked up through ``conn`` in ``table_name``.
    """
    prompt = "Enter your query (or 'quit' to exit): "
    user_query = input(prompt)
    # The comparison is case-insensitive, so 'QUIT' and 'Quit' also end the loop.
    while user_query.lower() != 'quit':
        matches = query_pinecone(encode_query(user_query))
        display_results(matches, conn, table_name)
        user_query = input(prompt)
if __name__ == "__main__":
    # Script entry point: run the interactive loop on the module-level
    # connection, and close that connection however the loop ends.
    try:
        main(conn, sqlite_table_name)
    finally:
        conn.close()
        print("SQLite connection closed.")
I am not sure if I understand correctly what the content of your database is. I think you need to store the chunks together with their embedding representations in the vector database. Decoding the vectors into text is not easily possible and very inefficient.