More_Advanced_Embeddings_Comparator

Running

App Files Files Community

Chris4K committed on Oct 18, 2024

Commit

1078648

verified ·

1 Parent(s): e0a87f8

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -10

app.py CHANGED Viewed

@@ -325,15 +325,15 @@ def visualize_results(results_df, stats_df):
     axs[1, 0].set_title('Distribution of Result Content Lengths')
     axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
-    embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
-    if len(embeddings) > 1:
-        tsne = TSNE(n_components=2, random_state=42)
-        embeddings_2d = tsne.fit_transform(embeddings)
-        sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'][:len(embeddings)], ax=axs[1, 1])
-        axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
-    else:
-        axs[1, 1].text(0.5, 0.5, "Not enough data for t-SNE visualization", ha='center', va='center')
     plt.tight_layout()
     return fig
@@ -357,6 +357,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
     }
     for model_type, model_name in zip(model_types, model_names):
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
@@ -369,14 +370,17 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             custom_tokenizer_file
         )
         if use_custom_embedding:
             custom_model = create_custom_embedding(chunks)
             embedding_model = CustomEmbeddings(custom_model)
         if optimize_vocab:
             tokenizer, optimized_chunks = optimize_vocabulary(chunks)
             chunks = optimized_chunks
         results, search_time, vector_store, results_raw = search_embeddings(
             chunks,
             embedding_model,
@@ -388,18 +392,26 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             phonetic_weight
         )
         stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
         formatted_results = format_results(results_raw, stats)
         all_results.extend(formatted_results)
         all_stats.append(stats)
     results_df = pd.DataFrame(all_results)
     stats_df = pd.DataFrame(all_stats)
-    # Generate visualizations
     fig = visualize_results(results_df, stats_df)
     return results_df, stats_df, fig

     axs[1, 0].set_title('Distribution of Result Content Lengths')
     axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
+    #embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
+    #if len(embeddings) > 1:
+    #    tsne = TSNE(n_components=2, random_state=42)
+    #    embeddings_2d = tsne.fit_transform(embeddings)
+    #
+    #    sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'][:len(embeddings)], ax=axs[1, 1])
+    #    axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
+    #else:
+    #    axs[1, 1].text(0.5, 0.5, "Not enough data for t-SNE visualization", ha='center', va='center')
     plt.tight_layout()
     return fig
     }
     for model_type, model_name in zip(model_types, model_names):
+        # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
             custom_tokenizer_file
         )
+        # Custom embedding handling
         if use_custom_embedding:
             custom_model = create_custom_embedding(chunks)
             embedding_model = CustomEmbeddings(custom_model)
+        # Optimizing vocabulary if required
         if optimize_vocab:
             tokenizer, optimized_chunks = optimize_vocabulary(chunks)
             chunks = optimized_chunks
+        # Searching embeddings
         results, search_time, vector_store, results_raw = search_embeddings(
             chunks,
             embedding_model,
             phonetic_weight
         )
+        # Storing embeddings into the results for future use
+        result_embeddings = [chunk['embedding'] for chunk in results_raw]  # Assuming each result has an embedding
         stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
+        # Formatting results and attaching embeddings
         formatted_results = format_results(results_raw, stats)
+        for i, result in enumerate(formatted_results):
+            result['embedding'] = result_embeddings[i]  # Add the embedding to each result
         all_results.extend(formatted_results)
         all_stats.append(stats)
+    # Create DataFrames with embeddings now included
     results_df = pd.DataFrame(all_results)
     stats_df = pd.DataFrame(all_stats)
+    # Visualization of the results
     fig = visualize_results(results_df, stats_df)
     return results_df, stats_df, fig