"""Gradio app that ranks artists by cosine similarity to a comma-separated tag list."""

from io import BytesIO

import gradio as gr
import h5py
import numpy as np
from joblib import load
from sklearn.metrics.pairwise import cosine_similarity

# Load the model and data once at startup so each request only pays for the
# transform and the similarity computation.
with h5py.File('complete_artist_data.hdf5', 'r') as f:
    # Deserialize the vectorizer, which is stored as a raw byte blob.
    # NOTE(security): joblib.load unpickles arbitrary objects; only ship an
    # HDF5 file that comes from a trusted source.
    vectorizer_bytes = f['vectorizer'][()].tobytes()
    vectorizer = load(BytesIO(vectorizer_bytes))

    # Artist feature matrix; it is compared row-wise against the query vector.
    X_artist = f['X_artist'][:]

    # Artist names are stored as bytes; decode them to strings.
    artist_names = [name.decode() for name in f['artist_names'][:]]


def _format_ranking(indices, similarities) -> str:
    """Render one numbered 'artist - similarity score' line per index."""
    return "\n".join(
        f"{rank}. {artist_names[i]} - similarity score: {similarities[i]:.4f}"
        for rank, i in enumerate(indices, start=1)
    )


def find_similar_artists(new_tags_string, top_n):
    """Return a text report of the most and least similar artists for a tag list.

    Args:
        new_tags_string: Comma-separated tags. Surrounding whitespace is
            stripped and empty entries (e.g. from a trailing comma) are ignored.
        top_n: How many artists to list at each end of the ranking. Coerced to
            ``int`` because the UI slider may deliver a float; negative values
            are treated as 0.

    Returns:
        A string listing the tags missing from the vectorizer vocabulary,
        followed by the highest- and lowest-scoring artists.
    """
    top_n = max(int(top_n), 0)

    # Drop empty entries so '' is neither vectorized nor reported as unseen.
    new_image_tags = [
        tag for tag in (raw.strip() for raw in new_tags_string.split(",")) if tag
    ]

    # dict.fromkeys de-duplicates while keeping the user's order, so the
    # report is deterministic (set iteration order is not).
    vocabulary = vectorizer.vocabulary_
    unseen_tags = [tag for tag in dict.fromkeys(new_image_tags) if tag not in vocabulary]
    unseen_tags_str = (
        f'Unseen Tags: {", ".join(unseen_tags)}' if unseen_tags else 'No unseen tags.'
    )

    X_new_image = vectorizer.transform([','.join(new_image_tags)])
    similarities = cosine_similarity(X_new_image, X_artist)[0]

    # Sort once (ascending) and slice both ends. Reversing before slicing
    # avoids the `[-0:]` pitfall, which would select every artist for top_n == 0.
    ascending = np.argsort(similarities)
    top_artist_indices = ascending[::-1][:top_n]
    bottom_artist_indices = ascending[:top_n]

    top_artists_str = _format_ranking(top_artist_indices, similarities)
    bottom_artists_str = _format_ranking(bottom_artist_indices, similarities)

    # Label with the number actually listed, which may be smaller than top_n
    # when fewer artists are available.
    shown = len(top_artist_indices)
    return (
        f"{unseen_tags_str}\n\n"
        f"Top {shown} artists:\n{top_artists_str}\n\n"
        f"Bottom {shown} artists:\n{bottom_artists_str}"
    )


iface = gr.Interface(
    fn=find_similar_artists,
    inputs=[
        gr.Textbox(label="Enter image tags", placeholder="fox, outside, detailed background"),
        # `value` is the initial-value keyword of the top-level Slider
        # component; `default` belonged to the legacy gr.inputs API.
        gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of artists"),
    ],
    outputs="text",
    title="Tagset Completer",
    description="Enter a list of comma-separated e6 tags",
)

iface.launch()