Update app.py
Browse files
app.py
CHANGED
@@ -325,15 +325,15 @@ def visualize_results(results_df, stats_df):
|
|
325 |
axs[1, 0].set_title('Distribution of Result Content Lengths')
|
326 |
axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
|
327 |
|
328 |
-
embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
|
329 |
-
if len(embeddings) > 1:
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
else:
|
336 |
-
|
337 |
|
338 |
plt.tight_layout()
|
339 |
return fig
|
@@ -357,6 +357,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
357 |
}
|
358 |
|
359 |
for model_type, model_name in zip(model_types, model_names):
|
|
|
360 |
chunks, embedding_model, num_tokens = process_files(
|
361 |
file.name if file else None,
|
362 |
model_type,
|
@@ -369,14 +370,17 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
369 |
custom_tokenizer_file
|
370 |
)
|
371 |
|
|
|
372 |
if use_custom_embedding:
|
373 |
custom_model = create_custom_embedding(chunks)
|
374 |
embedding_model = CustomEmbeddings(custom_model)
|
375 |
|
|
|
376 |
if optimize_vocab:
|
377 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
378 |
chunks = optimized_chunks
|
379 |
|
|
|
380 |
results, search_time, vector_store, results_raw = search_embeddings(
|
381 |
chunks,
|
382 |
embedding_model,
|
@@ -388,18 +392,26 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
388 |
phonetic_weight
|
389 |
)
|
390 |
|
|
|
|
|
|
|
391 |
stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
|
392 |
stats["model"] = f"{model_type} - {model_name}"
|
393 |
stats.update(settings)
|
394 |
|
|
|
395 |
formatted_results = format_results(results_raw, stats)
|
|
|
|
|
|
|
396 |
all_results.extend(formatted_results)
|
397 |
all_stats.append(stats)
|
398 |
|
|
|
399 |
results_df = pd.DataFrame(all_results)
|
400 |
stats_df = pd.DataFrame(all_stats)
|
401 |
|
402 |
-
#
|
403 |
fig = visualize_results(results_df, stats_df)
|
404 |
|
405 |
return results_df, stats_df, fig
|
|
|
325 |
axs[1, 0].set_title('Distribution of Result Content Lengths')
|
326 |
axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
|
327 |
|
328 |
+
#embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
|
329 |
+
#if len(embeddings) > 1:
|
330 |
+
# tsne = TSNE(n_components=2, random_state=42)
|
331 |
+
# embeddings_2d = tsne.fit_transform(embeddings)
|
332 |
+
#
|
333 |
+
# sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'][:len(embeddings)], ax=axs[1, 1])
|
334 |
+
# axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
|
335 |
+
#else:
|
336 |
+
# axs[1, 1].text(0.5, 0.5, "Not enough data for t-SNE visualization", ha='center', va='center')
|
337 |
|
338 |
plt.tight_layout()
|
339 |
return fig
|
|
|
357 |
}
|
358 |
|
359 |
for model_type, model_name in zip(model_types, model_names):
|
360 |
+
# Process the file into chunks and obtain the embedding model and token count
|
361 |
chunks, embedding_model, num_tokens = process_files(
|
362 |
file.name if file else None,
|
363 |
model_type,
|
|
|
370 |
custom_tokenizer_file
|
371 |
)
|
372 |
|
373 |
+
# Custom embedding handling
|
374 |
if use_custom_embedding:
|
375 |
custom_model = create_custom_embedding(chunks)
|
376 |
embedding_model = CustomEmbeddings(custom_model)
|
377 |
|
378 |
+
# Optimizing vocabulary if required
|
379 |
if optimize_vocab:
|
380 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
381 |
chunks = optimized_chunks
|
382 |
|
383 |
+
# Searching embeddings
|
384 |
results, search_time, vector_store, results_raw = search_embeddings(
|
385 |
chunks,
|
386 |
embedding_model,
|
|
|
392 |
phonetic_weight
|
393 |
)
|
394 |
|
395 |
+
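# Collect the embedding of each raw result so it can be attached to the formatted results below (assumes every item in results_raw has an 'embedding' key - TODO confirm)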
|
396 |
+
result_embeddings = [chunk['embedding'] for chunk in results_raw] # Assuming each result has an embedding
|
397 |
+
|
398 |
stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
|
399 |
stats["model"] = f"{model_type} - {model_name}"
|
400 |
stats.update(settings)
|
401 |
|
402 |
+
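# Format the results, then attach embeddings by position (assumes format_results returns one entry per item of results_raw, in the same order - TODO confirm)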
|
403 |
formatted_results = format_results(results_raw, stats)
|
404 |
+
for i, result in enumerate(formatted_results):
|
405 |
+
result['embedding'] = result_embeddings[i] # Add the embedding to each result
|
406 |
+
|
407 |
all_results.extend(formatted_results)
|
408 |
all_stats.append(stats)
|
409 |
|
410 |
+
# Build the DataFrames; each entry in all_results now carries its embedding, so results_df includes it (stats_df is unchanged)
|
411 |
results_df = pd.DataFrame(all_results)
|
412 |
stats_df = pd.DataFrame(all_stats)
|
413 |
|
414 |
+
# Visualization of the results
|
415 |
fig = visualize_results(results_df, stats_df)
|
416 |
|
417 |
return results_df, stats_df, fig
|