Spaces:
Sleeping
Sleeping
import functools
import re

import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
@functools.lru_cache(maxsize=1)
def load_model_and_data():
    """Load the sentence-embedding model and the dimension reference data.

    The result is memoised: the model and both CSV files are read on the
    first call only, and the same objects are handed back on later calls.
    Callers must therefore treat the returned objects as read-only.

    Returns:
        A 4-tuple ``(model, dimension_questions, dimension_embeddings, dims)``:

        * ``model`` -- the ``SentenceTransformer`` instance.
        * ``dimension_questions`` -- list built from the ``text`` column of
          ``embedded_dimensions.csv``.
        * ``dimension_embeddings`` -- the parsed ``embedding`` column of the
          same file, stacked row-wise into one NumPy array.
        * ``dims`` -- de-duplicated DataFrame read from
          ``dimensions_export.csv``.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    dimension_df = pd.read_csv('embedded_dimensions.csv')
    # Each cell is a string; the first and last characters (presumably the
    # surrounding brackets -- confirm against the CSV) are dropped and the
    # remainder is parsed as comma-separated numbers.
    dimension_df['embedding'] = dimension_df['embedding'].apply(
        lambda x: np.fromstring(x[1:-1], sep=',')
    )
    dimension_questions = dimension_df['text'].tolist()
    dimension_embeddings = np.vstack(dimension_df['embedding'].values)

    # 'Unnamed: 0' is dropped when present; errors='ignore' keeps the load
    # working if the export was written without that column.
    dims = (
        pd.read_csv('dimensions_export.csv')
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .drop_duplicates()
    )
    return model, dimension_questions, dimension_embeddings, dims
def predict_dimension(new_text, model, dimension_questions, dimension_embeddings, dims):
    """Find the known dimension statement most similar to ``new_text``.

    Args:
        new_text: The statement to classify.
        model: Object whose ``encode`` method turns a list of strings into
            embeddings.
        dimension_questions: Reference statements, indexed in the same order
            as the rows of ``dimension_embeddings``.
        dimension_embeddings: 2-D array with one embedding per reference
            statement.
        dims: DataFrame with ``text`` and ``name`` columns, used to look up
            the dimension name of the best-matching statement.

    Returns:
        A dict with the keys ``"Original Text"``, ``"Predicted Dimension
        Name"`` (``'Unknown'`` when ``dims`` has no row for the matched
        statement), ``"Similar Dimension Statement"`` and ``"Confidence"``
        (the cosine similarity of the best match, as a plain ``float``).
    """
    # Runs of digits are removed before the text is embedded.
    clean_text = re.sub(r'\d+', '', new_text)
    new_text_embedding = model.encode([clean_text], batch_size=1, show_progress_bar=False)

    # One query text gives a single-column result; keep that column only so
    # argmax is directly the index of the best reference statement.
    similarities = cosine_similarity(dimension_embeddings, new_text_embedding)[:, 0]
    best_match_index = int(similarities.argmax())
    best_match_dim_text = dimension_questions[best_match_index]

    # Filter the lookup table once and reuse the result for both the
    # emptiness check and the value.
    matching_names = dims.loc[dims['text'] == best_match_dim_text, 'name'].values
    best_match_dim_name = matching_names[0] if len(matching_names) > 0 else 'Unknown'

    return {
        "Original Text": new_text,
        "Predicted Dimension Name": best_match_dim_name,
        "Similar Dimension Statement": best_match_dim_text,
        # Convert the NumPy scalar to a built-in float so the result dict
        # contains only plain Python values.
        "Confidence": float(similarities[best_match_index]),
    }
def process_input(input_text):
    """Classify every '╡'-separated statement in ``input_text``.

    Segments that are empty or contain only whitespace (for example from a
    blank submission or a trailing delimiter) are skipped, because they carry
    no text to match. Each remaining segment is passed unchanged to
    ``predict_dimension``.

    Args:
        input_text: One or more statements separated by the '╡' character.
            ``None`` is treated like an empty string.

    Returns:
        A list with one result dict per non-blank statement, in input order.
        The list is empty when there is nothing to classify.
    """
    statements = [
        statement
        for statement in (input_text or '').split('╡')
        if statement.strip()
    ]
    if not statements:
        # Nothing to classify: return before the model and data are loaded.
        return []

    model, dimension_questions, dimension_embeddings, dims = load_model_and_data()
    return [
        predict_dimension(statement, model, dimension_questions, dimension_embeddings, dims)
        for statement in statements
    ]
# Wire process_input into a Gradio interface: a single text input whose
# result is shown through the JSON output component.
demo = gr.Interface(
    fn=process_input,
    inputs="text",
    outputs="json",
)

# Start the app as soon as this module is executed.
demo.launch()