Spaces:

rogerscuall
/

chat-with-avd-doc

Sleeping

App Files Files Community

chat-with-avd-doc / explore_metadata.py

rogerscuall

Upload folder using huggingface_hub

890d952 verified 30 days ago

raw

history blame contribute delete

4.75 kB

	#!/usr/bin/env python3
	"""
	Test script to explore all metadata fields available in the FAISS database chunks.
	"""

	import os
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS

	# Configuration
	# Relative path of the saved FAISS index. explore_metadata() checks that it
	# exists and then hands it to FAISS.load_local().
	FAISS_INDEX_PATH = "faiss_index"
	# Model name passed to HuggingFaceEmbeddings(model_name=...) when the index
	# is loaded.
	EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

	def explore_metadata(index_path=None, model_name=None) -> bool:
	    """Print a report of the metadata fields stored in a FAISS index.

	    Loads the index, runs a fixed set of sample similarity queries, and
	    prints every metadata key seen on the returned documents together with
	    example values, a few complete sample documents, and a short summary.

	    Args:
	        index_path: Location of the saved FAISS index. Falls back to the
	            module-level ``FAISS_INDEX_PATH`` when omitted.
	        model_name: Embedding model name given to ``HuggingFaceEmbeddings``.
	            Falls back to the module-level ``EMBEDDINGS_MODEL_NAME`` when
	            omitted.

	    Returns:
	        True when the index was loaded and the report was printed; False
	        when the index path does not exist or loading it raised.
	    """
	    # Resolve the defaults lazily so the module constants are only needed
	    # when the caller did not supply a value.
	    if index_path is None:
	        index_path = FAISS_INDEX_PATH

	    print("EXPLORING METADATA IN FAISS DATABASE")
	    print("=" * 60)

	    if not os.path.exists(index_path):
	        print(f"❌ Error: FAISS index not found at {index_path}")
	        return False

	    if model_name is None:
	        model_name = EMBEDDINGS_MODEL_NAME

	    try:
	        embeddings = HuggingFaceEmbeddings(model_name=model_name)
	        # NOTE(security): allow_dangerous_deserialization=True opts in to
	        # pickle-based loading of the stored index. Only point this at an
	        # index you created yourself or otherwise trust.
	        vector_db = FAISS.load_local(
	            index_path,
	            embeddings,
	            allow_dangerous_deserialization=True,
	        )
	        print(f"✅ Successfully loaded FAISS index from {index_path}")
	    except Exception as e:
	        # Top-level boundary of a diagnostic script: report and bail out.
	        print(f"❌ Error loading FAISS index: {e}")
	        return False

	    # Queries used to pull a sample of documents whose metadata is analysed.
	    sample_queries = (
	        "Ethernet Interfaces Summary",
	        "UNUSED",
	        "interface configuration",
	        "device information",
	        "fabric",
	    )

	    print("\nSampling documents to analyze metadata...")
	    print("-" * 40)

	    metadata_examples = _collect_metadata_examples(vector_db, sample_queries)
	    _print_field_report(metadata_examples)
	    _print_sample_documents(vector_db)
	    _print_summary(metadata_examples)

	    return True


	def _collect_metadata_examples(vector_db, queries, k=3):
	    """Return ``{metadata key: [unique values]}`` seen in the query results."""
	    examples_by_key = {}

	    for query in queries:
	        try:
	            results = vector_db.similarity_search_with_score(query, k=k)

	            for doc, _score in results:
	                for key, value in (doc.metadata or {}).items():
	                    seen = examples_by_key.setdefault(key, [])
	                    # List membership (not a set) because metadata values
	                    # are not guaranteed to be hashable.
	                    if value not in seen:
	                        seen.append(value)
	        except Exception as e:
	            # Best effort: one failing query must not abort the exploration.
	            print(f"Error with query '{query}': {e}")

	    return examples_by_key


	def _print_field_report(examples_by_key, max_examples=5):
	    """Print every metadata key with up to ``max_examples`` example values."""
	    keys = sorted(examples_by_key)

	    print("\n🔍 METADATA ANALYSIS")
	    print("=" * 60)
	    print(f"Total unique metadata keys found: {len(keys)}")
	    print(f"Metadata keys: {keys}")

	    print("\n📋 DETAILED METADATA FIELDS:")
	    print("-" * 40)

	    for key in keys:
	        examples = examples_by_key[key]
	        print(f"\n🔑 Field: '{key}'")
	        print(f" Unique values found: {len(examples)}")
	        print(" Example values:")
	        for position, example in enumerate(examples[:max_examples], start=1):
	            print(f" {position}: {example!r}")
	        if len(examples) > max_examples:
	            print(f" ... and {len(examples) - max_examples} more")


	def _print_sample_documents(vector_db, query="Ethernet", k=3):
	    """Print score, content preview and full metadata for a few documents."""
	    print("\n📄 SAMPLE DOCUMENTS WITH FULL METADATA:")
	    print("-" * 40)

	    try:
	        sample_results = vector_db.similarity_search_with_score(query, k=k)
	    except Exception as e:
	        # Handled like the sampling queries: report and skip this section
	        # instead of crashing after the analysis has already been printed.
	        print(f"Error with query '{query}': {e}")
	        return

	    for position, (doc, score) in enumerate(sample_results, start=1):
	        # Flatten newlines so the preview stays on a single line.
	        preview = doc.page_content[:100].replace("\n", " ")

	        print(f"\n[SAMPLE {position}]")
	        print(f"Score: {score:.4f}")
	        print(f"Content Length: {len(doc.page_content)} characters")
	        print(f"Content Preview: {preview}...")
	        print("Complete Metadata:")
	        if doc.metadata:
	            for key, value in sorted(doc.metadata.items()):
	                print(f" {key}: {value!r}")
	        else:
	            print(" No metadata found")
	        print("-" * 30)


	def _source_extension(source):
	    """Return the file extension of ``source`` without the dot, or 'unknown'."""
	    # str() tolerates non-string sources (None, Path objects); splitext only
	    # looks at the final path component, so dots in directory names are
	    # not mistaken for an extension.
	    extension = os.path.splitext(str(source))[1]
	    return extension[1:] if len(extension) > 1 else 'unknown'


	def _print_summary(examples_by_key):
	    """Print counts of device names and source files plus source file types."""
	    print("\n📊 SUMMARY:")
	    print("=" * 60)

	    device_names = examples_by_key.get('device_name', [])
	    sources = examples_by_key.get('source', [])

	    # Counts distinct non-empty device_name values seen in the sample.
	    device_docs = sum(1 for name in device_names if name)

	    print(f"• Device documents found: {device_docs}")
	    print(f"• Source files found: {len(sources)}")

	    if 'device_name' in examples_by_key:
	        print(f"• Device names: {device_names}")

	    if 'source' in examples_by_key:
	        # Sorted so the output is stable from run to run.
	        file_types = sorted({_source_extension(source) for source in sources})
	        print(f"• Source file types: {file_types}")

	def main():
	    """Run the metadata exploration and return an exit status.

	    Returns 0 when explore_metadata() reports success and 1 otherwise.
	    """
	    if explore_metadata():
	        print("\n✅ Metadata exploration completed successfully!")
	        return 0

	    print("\n❌ Metadata exploration failed")
	    return 1

	if __name__ == "__main__":
	    # The exit() builtin is installed by the site module for interactive use
	    # and can be absent (e.g. under ``python -S``). Raising SystemExit gives
	    # the same exit status without depending on it.
	    raise SystemExit(main())