Chris4K committed
Commit 1078648
1 Parent(s): e0a87f8

Update app.py

Files changed (1)
  1. app.py +22 -10
app.py CHANGED
@@ -325,15 +325,15 @@ def visualize_results(results_df, stats_df):
     axs[1, 0].set_title('Distribution of Result Content Lengths')
     axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
 
-    embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
-    if len(embeddings) > 1:
-        tsne = TSNE(n_components=2, random_state=42)
-        embeddings_2d = tsne.fit_transform(embeddings)
-
-        sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'][:len(embeddings)], ax=axs[1, 1])
-        axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
-    else:
-        axs[1, 1].text(0.5, 0.5, "Not enough data for t-SNE visualization", ha='center', va='center')
+    #embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
+    #if len(embeddings) > 1:
+    #    tsne = TSNE(n_components=2, random_state=42)
+    #    embeddings_2d = tsne.fit_transform(embeddings)
+    #
+    #    sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'][:len(embeddings)], ax=axs[1, 1])
+    #    axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
+    #else:
+    #    axs[1, 1].text(0.5, 0.5, "Not enough data for t-SNE visualization", ha='center', va='center')
 
     plt.tight_layout()
     return fig
@@ -357,6 +357,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
     }
 
     for model_type, model_name in zip(model_types, model_names):
+        # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
@@ -369,14 +370,17 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             custom_tokenizer_file
         )
 
+        # Custom embedding handling
         if use_custom_embedding:
            custom_model = create_custom_embedding(chunks)
            embedding_model = CustomEmbeddings(custom_model)
 
+        # Optimizing vocabulary if required
        if optimize_vocab:
            tokenizer, optimized_chunks = optimize_vocabulary(chunks)
            chunks = optimized_chunks
 
+        # Searching embeddings
        results, search_time, vector_store, results_raw = search_embeddings(
            chunks,
            embedding_model,
@@ -388,18 +392,26 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             phonetic_weight
         )
 
+        # Storing embeddings into the results for future use
+        result_embeddings = [chunk['embedding'] for chunk in results_raw]  # Assuming each result has an embedding
+
        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
        stats["model"] = f"{model_type} - {model_name}"
        stats.update(settings)
 
+        # Formatting results and attaching embeddings
        formatted_results = format_results(results_raw, stats)
+        for i, result in enumerate(formatted_results):
+            result['embedding'] = result_embeddings[i]  # Add the embedding to each result
+
        all_results.extend(formatted_results)
        all_stats.append(stats)
 
+    # Create DataFrames with embeddings now included
     results_df = pd.DataFrame(all_results)
     stats_df = pd.DataFrame(all_stats)
 
-    # Generate visualizations
+    # Visualization of the results
     fig = visualize_results(results_df, stats_df)
 
     return results_df, stats_df, fig
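
Note: the new list comprehension relies on the assumption spelled out in its own comment ("Assuming each result has an embedding") and will fail if any raw result lacks an 'embedding' entry. A minimal, hypothetical guard (attach_embeddings is not part of app.py) that falls back to None instead of raising could look like this:

def attach_embeddings(formatted_results, results_raw):
    # Pair each formatted result with its raw counterpart and copy the embedding if present.
    for result, raw in zip(formatted_results, results_raw):
        if isinstance(raw, dict):
            result['embedding'] = raw.get('embedding')       # dict-style raw result
        else:
            result['embedding'] = getattr(raw, 'embedding', None)  # object-style raw result
    return formatted_results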
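Since each formatted result may now carry an 'embedding', the t-SNE panel that this commit comments out in visualize_results could later be restored. A minimal sketch, assuming results_df['embedding'] holds equal-length NumPy arrays and results_df['model'] labels each row (plot_embedding_tsne is a hypothetical helper, not part of app.py):

import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE

def plot_embedding_tsne(results_df, ax):
    # Keep only rows whose embedding is an actual NumPy array.
    embeddings = np.array([e for e in results_df['embedding'] if isinstance(e, np.ndarray)])
    if len(embeddings) > 1:
        # perplexity must stay below the number of samples.
        tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(embeddings) - 1))
        embeddings_2d = tsne.fit_transform(embeddings)
        sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
                        hue=results_df['model'][:len(embeddings)], ax=ax)
        ax.set_title('t-SNE Visualization of Result Embeddings')
    else:
        ax.text(0.5, 0.5, "Not enough data for t-SNE visualization", ha='center', va='center')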