# LyrGen2 / src/utils/test_semantic.py
# Author: James Edmunds — "Initial commit for cleanstart2" (commit 4994b71)
"""Test semantic understanding of the lyrics database."""
import numpy as np
import time
from typing import Dict, List, Tuple
from pathlib import Path
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
def initialize_vector_store(lyrics_dir: Path) -> Chroma:
    """Build and persist a Chroma vector store from lyric text files.

    Expects ``lyrics_dir`` to contain one subdirectory per artist, each
    holding ``*.txt`` files (one song per file).

    Args:
        lyrics_dir: Root directory of the lyrics corpus.

    Returns:
        A Chroma vector store persisted to ``./lyrics_db``.

    Raises:
        ValueError: If no lyric files are found under ``lyrics_dir``.
    """
    print("Initializing vector store...")
    embeddings = OpenAIEmbeddings()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]
    )
    documents = _load_lyric_documents(lyrics_dir)
    if not documents:
        raise ValueError("No lyrics found in directory")
    print(f"Found {len(documents)} lyrics documents")
    # Split whole songs into overlapping chunks before embedding.
    texts = text_splitter.split_documents(documents)
    vector_store = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory="./lyrics_db"
    )
    return vector_store


def _load_lyric_documents(lyrics_dir: Path) -> List[Document]:
    """Read every <artist>/<song>.txt under lyrics_dir into Documents."""
    documents: List[Document] = []
    for artist_dir in lyrics_dir.iterdir():
        if not artist_dir.is_dir():
            continue
        for lyric_file in artist_dir.glob('*.txt'):
            # pathlib handles open/close and decoding in one call.
            text = lyric_file.read_text(encoding='utf-8')
            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        'artist': artist_dir.name,
                        'song_title': lyric_file.stem,
                        'source': str(lyric_file),
                    },
                )
            )
    return documents
def calculate_metrics(similarity_scores: List[float]) -> Dict[str, float]:
    """Calculate summary statistics for a list of similarity scores.

    Args:
        similarity_scores: Similarity values (typically in [0, 1]).

    Returns:
        Dict with ``mean``, ``median``, ``std``, ``min`` and ``max`` as
        plain floats (JSON-serializable, not numpy scalars).

    Raises:
        ValueError: If ``similarity_scores`` is empty (previously this
            surfaced as an opaque numpy error from ``np.min``).
    """
    if not similarity_scores:
        raise ValueError("similarity_scores must not be empty")
    scores = np.asarray(similarity_scores, dtype=float)
    return {
        "mean": float(np.mean(scores)),
        "median": float(np.median(scores)),
        "std": float(np.std(scores)),
        "min": float(np.min(scores)),
        "max": float(np.max(scores)),
    }
def test_k_parameters(
vector_store: Chroma,
query: str,
k_values: List[int],
search_k_values: List[int]
) -> Dict[Tuple[int, int], Dict]:
"""Test different combinations of k and search_k parameters."""
results = {}
for k in k_values:
for search_k in search_k_values:
if search_k < k:
continue
start_time = time.time()
try:
# Get documents with error handling
docs = vector_store.similarity_search_with_score(
query,
k=k
)
elapsed_time = time.time() - start_time
if not docs:
print(f"No results found for k={k}, search_k={search_k}")
continue
similarities = [1 - score for _, score in docs]
avg_sim = (
np.mean(similarities) if similarities else 0
)
max_sim = (
np.max(similarities) if similarities else 0
)
results[(k, search_k)] = {
"time": elapsed_time,
"avg_similarity": avg_sim,
"max_similarity": max_sim,
"result_count": len(docs)
}
except Exception as e:
print(f"Error with k={k}, search_k={search_k}: {str(e)}")
continue
if not results:
raise ValueError(
"No valid results found for any parameter combination"
)
return results
def test_semantic_understanding(
    lyrics_dir: str = "./app/lyrics",
    lyrics_db_path: str = None,
    optimize_params: bool = True
) -> None:
    """Verify that a persisted lyrics vector store exists and is non-empty.

    Prints diagnostics about the on-disk store (size, files), opens it with
    Chroma, and reports whether the underlying collection contains any
    embeddings. Returns early (None) on any problem.

    Args:
        lyrics_dir: Root of the lyrics corpus.
            NOTE(review): not referenced anywhere in this body — confirm
            whether it is dead or consumed by the testing code that follows.
        lyrics_db_path: Directory of the persisted Chroma store. Defaults to
            ``<this file's parent>/../lyrics_db`` when None.
        optimize_params: NOTE(review): also not referenced in this body;
            presumably used by the follow-on testing (see final comment).
    """
    print("\n=== Testing Semantic Understanding ===\n")
    # Get absolute path to lyrics_db
    if lyrics_db_path is None:
        lyrics_db_path = Path(__file__).parent.parent / "lyrics_db"
    else:
        lyrics_db_path = Path(lyrics_db_path)
    print(f"Looking for vector store at: {lyrics_db_path}")
    # Detailed vector store check
    if not lyrics_db_path.exists():
        print(f"Error: Vector store not found at {lyrics_db_path}")
        return
    # Check vector store contents
    print("\nChecking vector store contents:")
    # Calculate directory size (recursive: includes Chroma's internal files)
    total_size = sum(
        f.stat().st_size
        for f in lyrics_db_path.rglob('*')
        if f.is_file()
    )
    dir_size_mb = total_size / 1024 / 1024
    print(f"Directory size: {dir_size_mb:.2f} MB")
    print("Files found:")
    # Top-level listing only (iterdir is not recursive, unlike the size scan)
    for file in lyrics_db_path.iterdir():
        size_mb = file.stat().st_size / 1024 / 1024
        print(f"- {file.name} ({size_mb:.2f} MB)")
    try:
        embeddings = OpenAIEmbeddings()
        vector_store = Chroma(
            persist_directory=str(lyrics_db_path),
            embedding_function=embeddings,
            client_settings=Settings(
                anonymized_telemetry=False
            )
        )
        # Detailed collection check
        # NOTE(review): _collection is a private Chroma attribute and may
        # break across langchain/chromadb versions — confirm a public API.
        collection = vector_store._collection
        collection_info = collection.get()
        print("\nCollection details:")
        if collection_info is None:
            print("Error: Collection info is None. Database may be corrupted.")
            print("Recommendation: Delete the lyrics_db folder and rebuild.")
            return
        # Safely get counts with default values
        ids = collection_info.get('ids', []) or []
        # NOTE(review): this rebinding shadows the OpenAIEmbeddings instance
        # above; harmless today (the instance is not used again) but fragile.
        embeddings = collection_info.get('embeddings', []) or []
        metadatas = collection_info.get('metadatas', []) or []
        print(f"IDs: {len(ids)} items")
        print(f"Embeddings: {len(embeddings)} items")
        print(f"Metadatas: {len(metadatas)} items")
        if not ids:
            print("\nError: Vector store exists but appears to be empty.")
            print("Collection structure exists but contains no embeddings.")
            print(
                "Try removing the lyrics_db folder and recreating embeddings."
            )
            return
        msg = f"\nFound valid vector store with {len(ids)} documents"
        print(msg)
    except Exception as e:
        # Boundary handler: report and dump the traceback, then bail out.
        print(f"\nError accessing vector store: {str(e)}")
        print("Detailed error information:")
        import traceback
        traceback.print_exc()
        return
    # Continue with testing if vector store is found and not empty...
def _main() -> None:
    """Script entry point: run the semantic test with optimization enabled."""
    try:
        test_semantic_understanding(optimize_params=True)
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"Error during semantic testing: {e}")


if __name__ == "__main__":
    _main()