# jmemcc
# enabled batched strings
# 96fe26a
import gradio as gr
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
def load_model_and_data():
    """Load the sentence-transformer model and the dimension lookup data.

    The loaded objects are memoised on the function object after the first
    successful call, so repeated calls (one per request) do not reload the
    model or re-parse the CSV files. Callers receive the same objects each
    time and should treat them as read-only.

    Returns:
        A 4-tuple ``(model, dimension_questions, dimension_embeddings, dims)``:

        * ``model`` -- the ``all-MiniLM-L6-v2`` SentenceTransformer.
        * ``dimension_questions`` -- list built from the ``text`` column of
          ``embedded_dimensions.csv``.
        * ``dimension_embeddings`` -- array obtained by stacking the parsed
          ``embedding`` column of the same file, one row per statement.
        * ``dims`` -- ``dimensions_export.csv`` without its ``Unnamed: 0``
          column and with duplicate rows removed; used for name lookup.
    """
    cached = getattr(load_model_and_data, "_cache", None)
    if cached is not None:
        return cached

    # Load the sentence transformer model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Load the dimension data
    dimension_df = pd.read_csv('embedded_dimensions.csv')
    # Each embedding is stored as text; the first and last characters
    # (presumably the enclosing brackets) are dropped before parsing the
    # comma-separated numbers.
    dimension_df['embedding'] = dimension_df['embedding'].apply(
        lambda x: np.fromstring(x[1:-1], sep=',')
    )
    dimension_questions = dimension_df['text'].tolist()
    dimension_embeddings = np.vstack(dimension_df['embedding'].values)

    # Load the original dims dataframe for dimension names lookup
    dims = (
        pd.read_csv('dimensions_export.csv')
        .drop(columns=['Unnamed: 0'])
        .drop_duplicates()
    )

    resources = (model, dimension_questions, dimension_embeddings, dims)
    # Cache only after everything loaded, so a failed load is retried.
    load_model_and_data._cache = resources
    return resources
def predict_dimension(new_text, model, dimension_questions, dimension_embeddings, dims):
    """Find the stored dimension statement most similar to *new_text*.

    Args:
        new_text: The statement to classify.
        model: Object exposing ``encode(list_of_str, batch_size=...,
            show_progress_bar=...)`` that returns one embedding per string.
        dimension_questions: Sequence of reference statements, index-aligned
            with the rows of ``dimension_embeddings``.
        dimension_embeddings: 2-D array with one embedding row per reference
            statement.
        dims: DataFrame with ``text`` and ``name`` columns, used to map the
            best-matching statement to its dimension name.

    Returns:
        A dict with the keys ``"Original Text"``, ``"Predicted Dimension
        Name"`` (``'Unknown'`` when the statement has no row in ``dims``),
        ``"Similar Dimension Statement"`` and ``"Confidence"`` (the cosine
        similarity of the best match, as a builtin ``float``).
    """
    # Clean the input text: all digit runs are removed before embedding.
    clean_text = re.sub(r'\d+', '', new_text)

    # Generate embedding for the new text
    new_text_embedding = model.encode([clean_text], batch_size=1, show_progress_bar=False)

    # One query row is compared against every reference row, so the result
    # has a single column and the flat argmax is the reference row index.
    similarities = cosine_similarity(dimension_embeddings, new_text_embedding)
    best_match_index = int(similarities.argmax())
    # Convert the numpy scalar to a plain float so the JSON output does not
    # depend on numpy scalar serialisation.
    max_similarity = float(similarities[best_match_index][0])

    best_match_dim_text = dimension_questions[best_match_index]

    # Filter the lookup table once instead of twice.
    matching_names = dims.loc[dims['text'] == best_match_dim_text, 'name'].values
    best_match_dim_name = matching_names[0] if len(matching_names) > 0 else 'Unknown'

    return {
        "Original Text": new_text,
        "Predicted Dimension Name": best_match_dim_name,
        "Similar Dimension Statement": best_match_dim_text,
        "Confidence": max_similarity,
    }
def process_input(input_text):
    """Predict a dimension for each '╡'-separated statement in *input_text*.

    Args:
        input_text: One or more statements joined by the '╡' character.

    Returns:
        A list with one ``predict_dimension`` result dict per segment, in
        the order the segments appear in the input.
    """
    # Break the raw input into individual statements first.
    segments = input_text.split('╡')

    # (model, dimension_questions, dimension_embeddings, dims)
    resources = load_model_and_data()

    # Run every segment through the predictor with the same resources.
    return [predict_dimension(segment, *resources) for segment in segments]
# Wrap process_input in a Gradio interface: plain-text in, JSON out.
demo = gr.Interface(
    fn=process_input,
    inputs="text",
    outputs="json",
)

# Start serving the interface as soon as this module is executed.
demo.launch()