chat-with-avd-doc / explore_metadata.py
rogerscuall's picture
Upload folder using huggingface_hub
890d952 verified
#!/usr/bin/env python3
"""
Test script to explore all metadata fields available in the FAISS database chunks.
"""
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Configuration
# Path of the saved FAISS index; explore_metadata() checks that it exists and
# passes it to FAISS.load_local().
FAISS_INDEX_PATH = "faiss_index"
# Model name handed to HuggingFaceEmbeddings when the index is loaded.
# NOTE(review): presumably this must match the model used to build the index - confirm.
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def _record_metadata(metadata, examples):
    """Record every not-yet-seen value of each field in ``metadata``.

    ``examples`` maps a field name to the list of distinct values observed so
    far and is updated in place. A list (rather than a set) keeps first-seen
    order and also works for unhashable metadata values.
    """
    for key, value in metadata.items():
        seen = examples.setdefault(key, [])
        if value not in seen:
            seen.append(value)


def _source_extension(source):
    """Return the extension of ``source`` without the dot, or ``'unknown'``.

    ``os.path.splitext`` only looks at the final path component, so dots in
    directory names (``./docs/README``) are not mistaken for an extension.
    ``str()`` guards against non-string ``source`` values such as ``None``.
    """
    extension = os.path.splitext(str(source))[1]
    return extension[1:] or "unknown"


def explore_metadata() -> bool:
    """Explore all metadata fields available in the database chunks.

    Loads the FAISS index at ``FAISS_INDEX_PATH``, runs a handful of sample
    similarity searches, and prints a report of the metadata keys found,
    example values per key, a few complete sample documents, and a summary.

    Returns:
        False when the index path does not exist or the index cannot be
        loaded; True otherwise (individual failing queries are reported and
        skipped, they do not fail the run).
    """
    print("EXPLORING METADATA IN FAISS DATABASE")
    print("=" * 60)

    if not os.path.exists(FAISS_INDEX_PATH):
        print(f"❌ Error: FAISS index not found at {FAISS_INDEX_PATH}")
        return False

    try:
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
        # SECURITY: allow_dangerous_deserialization=True opts in to loading
        # pickled data stored alongside the index. Only point
        # FAISS_INDEX_PATH at an index you built yourself or fully trust.
        vector_db = FAISS.load_local(
            FAISS_INDEX_PATH,
            embeddings,
            allow_dangerous_deserialization=True,
        )
        print(f"✅ Successfully loaded FAISS index from {FAISS_INDEX_PATH}")
    except Exception as e:
        print(f"❌ Error loading FAISS index: {e}")
        return False

    # Queries used to pull a sample of documents whose metadata is analysed.
    sample_queries = (
        "Ethernet Interfaces Summary",
        "UNUSED",
        "interface configuration",
        "device information",
        "fabric",
    )

    # Field name -> distinct values seen for that field. Its keys are exactly
    # the set of metadata keys encountered, so no separate key set is kept.
    metadata_examples = {}

    print("\nSampling documents to analyze metadata...")
    print("-" * 40)

    for query in sample_queries:
        try:
            results = vector_db.similarity_search_with_score(query, k=3)
            for doc, _score in results:
                if doc.metadata:
                    _record_metadata(doc.metadata, metadata_examples)
        except Exception as e:
            # Best effort: report the failing query and keep sampling.
            print(f"Error with query '{query}': {e}")

    metadata_keys = sorted(metadata_examples)

    # Display metadata analysis
    print("\n🔍 METADATA ANALYSIS")
    print("=" * 60)
    print(f"Total unique metadata keys found: {len(metadata_keys)}")
    print(f"Metadata keys: {metadata_keys}")

    print("\n📋 DETAILED METADATA FIELDS:")
    print("-" * 40)

    for key in metadata_keys:
        examples = metadata_examples[key]
        print(f"\n🔑 Field: '{key}'")
        print(f" Unique values found: {len(examples)}")
        print(" Example values:")
        # Show at most 5 examples per field.
        for position, example in enumerate(examples[:5], start=1):
            print(f" {position}: {example!r}")
        if len(examples) > 5:
            print(f" ... and {len(examples) - 5} more")

    # Show a few documents with their complete metadata
    print("\n📄 SAMPLE DOCUMENTS WITH FULL METADATA:")
    print("-" * 40)

    try:
        sample_results = vector_db.similarity_search_with_score("Ethernet", k=3)
    except Exception as e:
        # Same best-effort policy as the sampling loop above: a failed search
        # must not abort the report before the summary is printed.
        print(f"Error with query 'Ethernet': {e}")
        sample_results = []

    for number, (doc, score) in enumerate(sample_results, start=1):
        # Newlines are flattened so the preview stays on one line.
        preview = doc.page_content[:100].replace("\n", " ")
        print(f"\n[SAMPLE {number}]")
        print(f"Score: {score:.4f}")
        print(f"Content Length: {len(doc.page_content)} characters")
        print(f"Content Preview: {preview}...")
        print("Complete Metadata:")
        if doc.metadata:
            for key, value in sorted(doc.metadata.items()):
                print(f" {key}: {value!r}")
        else:
            print(" No metadata found")
        print("-" * 30)

    # Analysis summary
    print("\n📊 SUMMARY:")
    print("=" * 60)

    device_names = metadata_examples.get("device_name", [])
    sources = metadata_examples.get("source", [])

    # Only truthy device names are counted (empty/None values are ignored).
    device_docs = sum(1 for name in device_names if name)

    print(f"• Device documents found: {device_docs}")
    print(f"• Source files found: {len(sources)}")

    if "device_name" in metadata_examples:
        print(f"• Device names: {device_names}")

    if "source" in metadata_examples:
        source_types = {_source_extension(source) for source in sources}
        print(f"• Source file types: {source_types}")

    return True
def main() -> int:
    """Run the metadata exploration and report the outcome.

    Returns:
        0 when explore_metadata() returns a truthy result, 1 otherwise, so
        the value can be used directly as a process exit status.
    """
    if explore_metadata():
        print("\n✅ Metadata exploration completed successfully!")
        return 0

    print("\n❌ Metadata exploration failed")
    return 1
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling exit(): the exit() helper
    # is injected by the site module for interactive use and is not available
    # when Python runs with -S or in some embedded/frozen builds.
    raise SystemExit(main())