Spaces:
Sleeping
Sleeping
from sentence_transformers import SentenceTransformer | |
from huggingface_hub import CommitScheduler | |
from datasets import Dataset | |
import gradio as gr | |
import pandas as pd | |
import os | |
from parameter_extractor import ParameterExtractor | |
from DNAseq import DNAseq | |
from helper import list_at_index_0, list_at_index_1 | |
from logger import cts_log_file_create, logger, cts_logger | |
HF_TOKEN = os.environ.get("HF_TOKEN", None) | |
# Create csv file for data logging | |
log_file_path = cts_log_file_create("flagged") | |
# Initialise CommitScheduler | |
scheduler = CommitScheduler( | |
repo_id="kevkev05/CTS-performance-metrics", | |
repo_type="dataset", | |
folder_path=log_file_path.parent, | |
path_in_repo="data", | |
every=5, | |
private=True, | |
token=HF_TOKEN | |
) | |
def chat_to_sequence(sequence, user_query): | |
# Sequence to be analysed/queried | |
input_sequence = sequence | |
# Set DNAseq class expected variable | |
dna = input_sequence | |
# Model | |
model_name = "all-mpnet-base-v2" | |
# Load model | |
model = SentenceTransformer(model_name) | |
# User input | |
user_query = user_query | |
# Set ParameterExtractor class expected variable | |
query = user_query | |
# Bot Response | |
response = "" | |
# Query Code Description Message | |
code_descript_message = '' | |
# kNN semantic similarity threshold / used to determine if query can execute code | |
# kNN semantic similarity values less than the lower threshold should return a code eval response | |
# kNN semantic similarity values more than the lower threshold shouldn't return a code eval response | |
proximal_lower_threshold = 1.1 | |
proximal_upper_threshold = 1.4 | |
threshold_exceeded_message = "Your Query Wasn't Understood. Can You Rephrase The Query" | |
threshold_approximate_message = "Your Query Wasn't Understood Clearly. Try Using The Following Query Formats" | |
# Load the function mapping CSV file into a pandas DataFrame | |
code_function_mapping = pd.read_csv("code_function_mapping.csv") | |
# Load reference query database from JSON file back into a DataFrame | |
ref_query_df = pd.read_json('reference_query_db.json', orient='records') | |
# Create Dataset object using the pandas data frame | |
ref_query_ds = Dataset.from_pandas(ref_query_df) | |
# Load FAISS index | |
ref_query_ds.load_faiss_index('all-mpnet-base-v2_embeddings', 'ref_query_db_index') | |
# Create embeddings for user query | |
query_embedding = model.encode(user_query) | |
# Semantic similarity search user query against sample queries | |
index_result = ref_query_ds.get_nearest_examples("all-mpnet-base-v2_embeddings", query_embedding, k=3) | |
# Retrieve results from dataset object | |
scores, examples = index_result | |
# Create a DataFrame from the examples dictionary | |
result_df = pd.DataFrame(examples) | |
# Add the scores as a new column to the DataFrame | |
result_df['score'] = scores | |
# Sort the DataFrame by the 'Score' column in ascending order | |
# FIASS uses kNN as the similarity algorithm / value of 0 indicates an exact match | |
sorted_df = result_df.sort_values(by='score', ascending=True) | |
# Get the query with the lowest kNN score (first row after sorting) | |
ref_question = sorted_df.iloc[0]['question'] | |
# Get the code for the query with the lowest kNN score (first row after sorting) | |
query_code = sorted_df.iloc[0]['code'] | |
# Get the score for the query with the lowest kNN score (first row after sorting) | |
query_score = sorted_df.iloc[0]['score'] | |
# Description of query code to be executed | |
query_code_description = code_function_mapping[code_function_mapping['code'] == query_code]['description'].values[0] | |
# Extra log entities | |
similarity_metric = "k nearest neighbours" | |
ref_question_2 = sorted_df.iloc[1]['question'] | |
ref_question_3 = sorted_df.iloc[1]['question'] | |
query_score_2 = sorted_df.iloc[1]['score'] | |
query_score_3 = sorted_df.iloc[1]['score'] | |
# logger function log_data parameter input | |
log_data = [ | |
user_query, | |
ref_question, | |
query_score, | |
query_code, | |
ref_question_2, | |
query_score_2, | |
ref_question_3, | |
query_score_3, | |
similarity_metric, | |
model_name, | |
proximal_lower_threshold, | |
proximal_upper_threshold, | |
] | |
# Check the query score against threshold values | |
if query_score >= proximal_upper_threshold: | |
response = threshold_exceeded_message | |
cts_logger(scheduler, log_file_path, log_data, response) | |
print(threshold_exceeded_message) | |
elif proximal_lower_threshold < query_score < proximal_upper_threshold: | |
response = threshold_approximate_message + "\n" + ref_question | |
cts_logger(scheduler, log_file_path, log_data, response) | |
print(threshold_approximate_message, ref_question) | |
else: | |
print("Execute query") | |
# Define the question | |
code = query_code | |
# Filter the DataFrame to find the code that matches the question | |
matching_row = code_function_mapping[code_function_mapping["code"] == code] | |
# Check if there is a match | |
if not matching_row.empty: | |
function = matching_row.iloc[0]["function"] | |
response = str(eval(function)) | |
code_descript_message = query_code_description.title() | |
cts_logger(scheduler, log_file_path, log_data, response) | |
else: | |
response = "Error processing query" | |
query_code = "No Match Error" | |
cts_logger(scheduler, log_file_path, log_data, response) | |
print("No matching code found for the function:", code) | |
return response, code_descript_message | |
return response, code_descript_message | |
ChatToSequence = gr.Interface( | |
fn=chat_to_sequence, | |
inputs=[gr.Textbox(label="Sequence", placeholder="Input DNA Sequence..."), | |
gr.Textbox(label="Query", placeholder="Input Query...")], | |
outputs=[gr.Textbox(label="Response"), gr.Textbox(label="Action Executed")], | |
allow_flagging="never", | |
title="Chat-To-Sequence", | |
description="This Demo App Allows You To Explore Your DNA Sequence Using Natural Language", | |
theme=gr.themes.Soft(), | |
examples=[ | |
["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa", | |
"What is the length of the sequence"], | |
["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa", | |
"How many guanines bases are there in the sequence"], | |
["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa", | |
"What is the base at position 10"], | |
["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa", | |
"What are the bases from position 2 to 10"], | |
["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa", | |
"How many bases are there from position 2 to 10"], | |
], | |
).queue() | |
ChatToSequence.launch() | |