Update app.py
app.py
@@ -49,6 +49,8 @@ def download_nltk_resources():
 
 download_nltk_resources()
 
+nltk.download('punkt')
+
 FILES_DIR = './files'
 
 # Model Management
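Note: the added nltk.download('punkt') runs unconditionally on every app start, which means a network round-trip each launch. A guarded lookup is a common alternative; a minimal sketch, assuming the standard NLTK data layout (the ensure_nltk_resource helper is illustrative, not part of app.py):

import nltk

def ensure_nltk_resource(name, path):
    # Download the resource only if NLTK cannot already find it locally.
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(name)

ensure_nltk_resource('punkt', 'tokenizers/punkt')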
@@ -159,9 +161,9 @@ def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=True
 
 def optimize_query(query, llm_model):
     llm = HuggingFacePipeline.from_model_id(
-        model_id=
+        model_id="google/flan-t5-large",
         task="text2text-generation",
-        model_kwargs={"temperature": 0, "
+        model_kwargs={"do_sample": True, "temperature": 0.7, "max_new_tokens": 512},
     )
     multi_query_retriever = MultiQueryRetriever.from_llm(
         retriever=get_retriever(vector_store, search_type, search_kwargs),
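The added do_sample=True is what makes the temperature setting effective: transformers applies temperature only when sampling, and under the default greedy decoding it is ignored (newer versions warn about it). A minimal sketch of the same construction, assuming langchain-huggingface is installed; recent releases route generation settings through pipeline_kwargs, while model_kwargs (as used in this commit) is the older convention:

from langchain_huggingface import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-large",
    task="text2text-generation",
    # Generation settings; do_sample=True activates temperature.
    pipeline_kwargs={"do_sample": True, "temperature": 0.7, "max_new_tokens": 512},
)
print(llm.invoke("Rewrite this search query in three different ways: fastest vector store"))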
@@ -391,6 +393,8 @@ def visualize_results(results_df, stats_df):
 
     sns.barplot(x='model', y='search_time', data=stats_df, ax=axs[0, 0])
     axs[0, 0].set_title('Search Time by Model')
+    axs[0, 0].set_xticks(range(len(axs[0, 0].get_xticklabels())))
+
     axs[0, 0].set_xticklabels(axs[0, 0].get_xticklabels(), rotation=45, ha='right')
 
     sns.scatterplot(x='result_diversity', y='rank_correlation', hue='model', data=stats_df, ax=axs[0, 1])
@@ -398,6 +402,7 @@ def visualize_results(results_df, stats_df):
 
     sns.boxplot(x='model', y='avg_content_length', data=stats_df, ax=axs[1, 0])
     axs[1, 0].set_title('Distribution of Result Content Lengths')
+    axs[1, 0].set_xticks(range(len(axs[1, 0].get_xticklabels())))
     axs[1, 0].set_xticklabels(axs[1, 0].get_xticklabels(), rotation=45, ha='right')
 
     embeddings = np.array([embedding for embedding in results_df['embedding'] if isinstance(embedding, np.ndarray)])
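Both set_xticks additions address matplotlib's "FixedFormatter should only be used together with FixedLocator" warning: set_xticklabels needs the tick positions pinned first. A minimal standalone sketch of the pattern (labels and values are placeholders):

import matplotlib.pyplot as plt

models = ['model-a', 'model-b', 'model-c']
fig, ax = plt.subplots()
ax.bar(models, [0.12, 0.34, 0.27])
ax.set_xticks(range(len(models)))  # pin tick positions first
ax.set_xticklabels(models, rotation=45, ha='right')
fig.tight_layout()
plt.show()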
@@ -514,6 +519,8 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
 
     stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
     stats["model"] = f"{model_type} - {model_name}"
+    stats["model_type"] = model_type
+    stats["model_name"] = model_name
     stats.update(settings)
 
     formatted_results = format_results(results_raw, stats)
@@ -605,6 +612,8 @@ def automated_testing(file, query, test_params, expected_result=None):
 
     stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
     stats["model"] = f"{params['model_type']} - {params['model_name']}"
+    stats["model_type"] = params['model_type']
+    stats["model_name"] = params['model_name']
     stats.update(params)
 
     all_results.extend(format_results(results_raw, stats))
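Storing model_type and model_name as separate keys, in both compare_embeddings and automated_testing, lets downstream analysis group by provider without re-parsing the combined "type - name" string. A minimal sketch of the payoff, assuming the stats dicts are later collected into a pandas DataFrame (all values are placeholders):

import pandas as pd

stats_df = pd.DataFrame([
    {"model_type": "HuggingFace", "model_name": "example-model-a", "search_time": 0.12},
    {"model_type": "HuggingFace", "model_name": "example-model-b", "search_time": 0.19},
    {"model_type": "OpenAI", "model_name": "example-model-c", "search_time": 0.08},
])
# Aggregate per provider without string parsing.
print(stats_df.groupby("model_type")["search_time"].mean())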
@@ -705,7 +714,7 @@ Provide your suggestions in a Python dictionary format."""
     llm = HuggingFacePipeline.from_model_id(
         model_id="google/flan-t5-large",
         task="text2text-generation",
-        model_kwargs={"temperature": 0.7, "
+        model_kwargs={"do_sample": True, "temperature": 0.7, "max_new_tokens": 512},
     )
 
     # Generate suggestions
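As in optimize_query above, do_sample=True is what lets the 0.7 temperature take effect here; max_new_tokens=512 caps only the generated tokens, unlike max_length, which also counts the prompt.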