import functools
import re

import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


@functools.lru_cache(maxsize=1)
def load_model_and_data():
    """Load the embedding model and the reference dimension data.

    The result is memoized: the model and both CSV files are loaded on the
    first call only, and every later call returns the same tuple. Callers
    therefore share the returned objects and must not mutate them.

    Returns:
        A 4-tuple ``(model, dimension_questions, dimension_embeddings, dims)``:
        the ``SentenceTransformer`` model, the list of values from the
        ``text`` column of ``embedded_dimensions.csv``, the stacked array of
        the parsed ``embedding`` column of that file (one row per CSV row),
        and the de-duplicated ``dimensions_export.csv`` dataframe used to
        look up dimension names.
    """
    # Load the sentence transformer model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Load the dimension data. Each embedding cell is parsed by dropping its
    # first and last character (presumably enclosing brackets) and reading
    # the rest as comma-separated numbers.
    dimension_df = pd.read_csv('embedded_dimensions.csv')
    dimension_df['embedding'] = dimension_df['embedding'].apply(
        lambda x: np.fromstring(x[1:-1], sep=',')
    )
    dimension_questions = dimension_df['text'].tolist()
    dimension_embeddings = np.vstack(dimension_df['embedding'].values)

    # Load the original dims dataframe for dimension names lookup
    dims = (
        pd.read_csv('dimensions_export.csv')
        .drop(columns=['Unnamed: 0'])
        .drop_duplicates()
    )

    return model, dimension_questions, dimension_embeddings, dims


def predict_dimension(new_text, model, dimension_questions, dimension_embeddings, dims):
    """Find the reference dimension statement most similar to ``new_text``.

    Args:
        new_text: The statement to classify.
        model: Object whose ``encode`` method turns a list of strings into
            embeddings.
        dimension_questions: Reference statements, index-aligned with the
            rows of ``dimension_embeddings``.
        dimension_embeddings: 2-D array with one embedding per reference
            statement.
        dims: Dataframe with ``text`` and ``name`` columns used to map the
            matched statement to its dimension name.

    Returns:
        A dict with the keys ``"Original Text"``, ``"Predicted Dimension
        Name"`` (``'Unknown'`` when the matched statement has no row in
        ``dims``), ``"Similar Dimension Statement"`` and ``"Confidence"``
        (the cosine similarity of the best match, as a plain ``float``).
    """
    # Clean the input text: every run of digits is removed before embedding.
    clean_text = re.sub(r'\d+', '', new_text)

    # Generate embedding for the new text
    new_text_embedding = model.encode([clean_text], batch_size=1, show_progress_bar=False)

    # Cosine similarity of every reference embedding against the single
    # query embedding; the result has one row per reference statement.
    similarities = cosine_similarity(dimension_embeddings, new_text_embedding)
    best_match_index = similarities.argmax()
    # Convert the NumPy scalar to a built-in float so the result dict is
    # JSON-serializable regardless of the similarity matrix's dtype.
    max_similarity = float(similarities[best_match_index][0])

    best_match_dim_text = dimension_questions[best_match_index]
    # Look the name up once instead of filtering the dataframe twice.
    matching_names = dims.loc[dims['text'] == best_match_dim_text, 'name'].values
    best_match_dim_name = matching_names[0] if len(matching_names) > 0 else 'Unknown'

    return {
        "Original Text": new_text,
        "Predicted Dimension Name": best_match_dim_name,
        "Similar Dimension Statement": best_match_dim_text,
        "Confidence": max_similarity,
    }


def process_input(input_text):
    """Predict a dimension for each '╡'-separated statement in ``input_text``.

    Segments that are empty or whitespace-only (from empty input, or from a
    leading, trailing or doubled separator) are skipped rather than being
    matched against the reference data.

    Args:
        input_text: One or more statements separated by the '╡' character.

    Returns:
        A list with one ``predict_dimension`` result dict per non-blank
        statement, in input order; an empty list when there are none.
    """
    # Split the input text by '╡' character, dropping blank segments
    dimension_statements = [
        statement for statement in input_text.split('╡') if statement.strip()
    ]
    if not dimension_statements:
        # Nothing to classify, so avoid loading the model at all.
        return []

    # Load model and data (memoized, so only the first request pays for it)
    model, dimension_questions, dimension_embeddings, dims = load_model_and_data()

    # Process each statement and collect results
    return [
        predict_dimension(statement, model, dimension_questions, dimension_embeddings, dims)
        for statement in dimension_statements
    ]


demo = gr.Interface(fn=process_input, inputs="text", outputs="json")
demo.launch()