DeadPool1236 committed on
Commit
bf68c06
·
verified ·
1 Parent(s): bd796d5

Create cancer_index_store/inspect.py

Browse files
Files changed (1) hide show
  1. cancer_index_store/inspect.py +73 -0
cancer_index_store/inspect.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
def analyze_vector_store_simple(
    vector_file="cancer_index_store/default__vector_store.json",
):
    """Print a plain-Python summary of a JSON vector store (no numpy needed).

    The JSON document is inspected for two optional top-level keys:

    * ``embedding_dict`` - mapping of document id to embedding vector. The
      total count, up to five sample ids with their leading values, and
      min / max / average over every stored value are printed.
    * ``text_id_to_ref_doc_id`` - mapping of text id to document id. The
      entry count and up to three sample pairs are printed.

    A short explanation of semantic search is always printed last.

    Args:
        vector_file: Location of the JSON file (``str`` or ``Path``). The
            default is the path this script has always used, resolved
            relative to the current working directory.

    Returns:
        None. Everything is written to stdout; a missing file is reported
        with a message rather than an exception.
    """
    vector_file = Path(vector_file)

    if not vector_file.exists():
        print("❌ Vector store file not found!")
        return

    # Load the vector store data
    with open(vector_file, 'r', encoding='utf-8') as f:
        vector_data = json.load(f)

    print("🔍 Vector Store Analysis")
    print("=" * 50)

    # Dimension quoted in the closing explanation. It stays None when the
    # store holds no embeddings, so the explanation falls back to '384'
    # instead of reading a variable that was never assigned.
    dimensions = None

    # Check the structure
    if 'embedding_dict' in vector_data:
        embeddings = vector_data['embedding_dict']
        print(f"📊 Total embeddings: {len(embeddings)}")

        # Show sample embeddings
        print("\n📋 Sample Document IDs:")
        for position, doc_id in enumerate(list(embeddings)[:5], start=1):
            vector = embeddings[doc_id]
            print(f" {position}. {doc_id}")
            print(f" Vector dimensions: {len(vector)}")
            print(f" First 5 values: {vector[:5]}")

        # Basic statistics without numpy
        if embeddings:
            first_vector = next(iter(embeddings.values()))
            dimensions = len(first_vector)
            all_values = [val for vec in embeddings.values() for val in vec]

            print("\n📈 Basic Statistics:")
            print(f" Vector dimensions: {dimensions}")
            # min()/max() raise and the average divides by zero on an empty
            # sequence, so skip them when every stored vector is empty.
            if all_values:
                print(f" Min value: {min(all_values):.6f}")
                print(f" Max value: {max(all_values):.6f}")
                print(f" Avg value: {sum(all_values)/len(all_values):.6f}")

    if 'text_id_to_ref_doc_id' in vector_data:
        mapping = vector_data['text_id_to_ref_doc_id']
        print(f"\n🔗 Text to Document Mapping: {len(mapping)} entries")
        print(" Sample mappings:")
        for text_id, doc_id in list(mapping.items())[:3]:
            print(f" {text_id} → {doc_id}")

    # Show what semantic search does
    shown_dimensions = dimensions if dimensions is not None else '384'
    print("\n🎯 How Semantic Search Works:")
    print(f" 1. Your question gets converted to a {shown_dimensions}-dimensional vector")
    print(" 2. System finds the most similar vectors in this file")
    print(" 3. Returns documents with highest similarity scores")
    print(" 4. Similar medical content = Closer vectors in this space")
58
+
59
def show_search_example(dimensions=384, num_documents=320):
    """Print an illustrative walkthrough of one semantic-search query.

    The query text, the sample vector values and the three example document
    titles are fixed illustrations; nothing is read from disk.

    Args:
        dimensions: Embedding size quoted in the walkthrough. Defaults to
            384, the figure the text previously hard-coded.
        num_documents: Number of stored document vectors quoted in the
            walkthrough. Defaults to 320, the figure previously hard-coded.

    Returns:
        None. Output goes to stdout.
    """
    print("\n🔍 Concrete Search Example:")
    print(" Query: 'What are breast cancer symptoms?'")
    print(f" → Converted to vector: [0.123, -0.456, 0.789, ...] ({dimensions} numbers)")
    print(f" → Compared against {num_documents} document vectors in default__vector_store.json")
    print(" → Finds vectors for documents about:")
    print(" ‒ 'Common symptoms of breast cancer'")
    print(" ‒ 'Early warning signs and detection'")
    print(" ‒ 'Patient symptom reporting guidelines'")
70
+
71
if __name__ == "__main__":
    # Script entry point: print the store report first, then the worked
    # search example, in that order.
    for step in (analyze_vector_store_simple, show_search_example):
        step()