# ========== (c) JP Hwang 25/9/2022  ==========
"""Streamlit demo: colors as 3-D (R, G, B) vectors, text search vs. vector search."""

import logging
import random
from collections import Counter
from typing import List, Sequence

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from scipy import spatial

# ===== SET UP LOGGER =====
# Streamlit re-executes this script on every interaction while the root logger
# lives for the whole process, so the handler is named and attached only once;
# otherwise every rerun would add another handler and duplicate each log line.
_LOG_HANDLER_NAME = 'colorful_vectors_stream'
logger = logging.getLogger(__name__)
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
sh = logging.StreamHandler()
sh.set_name(_LOG_HANDLER_NAME)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
sh.setFormatter(formatter)
if not any(h.get_name() == _LOG_HANDLER_NAME for h in root_logger.handlers):
    root_logger.addHandler(sh)
# ===== END LOGGER SETUP =====

desired_width = 320
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', desired_width)

# Marker sizes: [default point, search hit, the selected base color]
sizes = [1, 20, 30]


def get_top_tokens(ser_in: pd.Series, n: int = 10) -> List[str]:
    """Return the ``n`` most frequent '_'-separated tokens across ``ser_in``.

    Args:
        ser_in: Series of strings such as ``'dark_red'``.
        n: Number of tokens to return (default 10).

    Returns:
        Tokens ordered from most to least common; empty list for empty input.
    """
    tkn_counts = Counter(tkn for name in ser_in for tkn in name.split('_'))
    return [tkn for tkn, _ in tkn_counts.most_common(n)]


def build_chart(df_in: pd.DataFrame):
    """Build a 3-D scatter of colors positioned by their r/g/b columns.

    Reads the columns ``r``, ``g``, ``b``, ``simple_name``, ``rgb``, ``size``
    and ``name`` from ``df_in`` and returns the plotly figure.
    """
    # NOTE(review): plotly assigns color_discrete_sequence entries to distinct
    # `simple_name` values in order of appearance, so each point only gets its
    # own color if `simple_name` is unique per row - confirm against the data.
    fig = px.scatter_3d(
        df_in,
        x='r', y='g', z='b',
        template='plotly_white',
        color=df_in['simple_name'],
        color_discrete_sequence=df_in['rgb'],
        size='size',
        hover_data=['name'],
    )
    fig.update_layout(
        showlegend=False,
        margin=dict(l=5, r=5, t=20, b=5),
    )
    return fig


def preproc_data(csv_path: str = 'data/colors.csv') -> pd.DataFrame:
    """Load the color table and add the derived columns used by the app.

    Args:
        csv_path: Header-less CSV with the columns
            simple_name, name, hex, r, g, b (defaults to the bundled file).

    Returns:
        DataFrame with extra columns ``rgb`` (CSS-style ``rgb(r, g, b)``
        string), ``category`` (last '_'-separated token of ``simple_name``)
        and ``size`` (initialised to the default marker size).
    """
    df = pd.read_csv(csv_path, names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])

    # Preprocessing
    df['rgb'] = df.apply(lambda x: f'rgb({x.r}, {x.g}, {x.b})', axis=1)

    # Get top 'basic' color names
    df = df.assign(category=df.simple_name.str.split('_').str[-1])

    # Set default size attribute
    df['size'] = sizes[0]
    return df


def get_top_colors(df: pd.DataFrame, n: int = 15) -> List[str]:
    """Return the most common categories that also exist as a ``simple_name``.

    Takes the ``n`` most frequent values of ``category`` (default 15) and
    keeps, in frequency order, only those that are themselves a color name.
    """
    known_names = set(df['simple_name'])
    candidates = df['category'].value_counts().head(n).index.tolist()
    return [c for c in candidates if c in known_names]


def _compute_distances(points, vector: Sequence[float], method: str = 'euc') -> np.ndarray:
    """Return the distance from every row of ``points`` to ``vector``.

    ``method == 'euc'`` gives the Euclidean distance; any other value gives
    the cosine distance. Computed in one vectorised call for all rows.
    """
    coords = np.asarray(points, dtype=float)
    target = np.asarray(vector, dtype=float)
    if method == 'euc':
        return np.linalg.norm(coords - target, axis=1)
    return spatial.distance.cdist(coords, target[np.newaxis, :], metric='cosine').ravel()


def main() -> None:
    """Render the Streamlit page: intro chart, text vs. vector search, wrap-up."""
    st.title('Colorful vectors')
    st.markdown("""
    You might have heard that objects like words or images can be represented by "vectors".
    What does that mean, exactly? It seems like a tricky concept, but it doesn't have to be.
    Let's start here, where colors are represented in 3-D space 🌈.
    Each axis represents how much of primary colors `(red, green, and blue)` each color comprises.
    For example, `Magenta` is represented by `(255, 0, 255)`, and `(80, 200, 120)` represents `Emerald`.
    That's all a *vector* is in this context - a sequence of numbers.
    Take a look at the resulting 3-D image below; it's kind of mesmerising!
    (You can spin the image around, as well as zoom in/out.)
    """)

    df = preproc_data()
    fig = build_chart(df)
    st.plotly_chart(fig)

    st.markdown("""
    ### Why does this matter?

    You see here that similar colors are placed close to each other in space.
    It seems obvious, but **this** is the crux of why a *vector representation* is so powerful.
    These objects being located *in space* based on their key property (`color`)
    enables an easy, objective assessment of similarity.
    Let's take this further:
    """)

    # ===== SCALAR SEARCH =====
    st.header('Searching in vector space')
    st.markdown("""
    Imagine that you need to identify colors similar to a given color.
    You could do it by name, for instance looking for colors containing matching words.
    But remember that in the 3-D chart above, similar colors are physically close to each other.
    So all you actually need to do is to calculate distances, and collect points based on a threshold!
    That's probably still a bit abstract - so pick a 'base' color, and we'll go from there.
    In fact - try a few different colors while you're at it!
    """)

    top_colors = get_top_colors(df)
    if not top_colors:
        # Nothing to select from (e.g. an empty or unexpected data file).
        st.warning('No base colors are available to search with.')
        return

    # Default to the 6th entry as before, but never index past the end.
    default_idx = min(5, len(top_colors) - 1)
    query = st.selectbox('Pick a "base" color:', top_colors, index=default_idx)

    match = df[df.simple_name == query].iloc[0]
    # Plain substring match: the name must not be interpreted as a regex, and
    # rows with a missing name must count as "no match" rather than NaN.
    scalar_filter = df.simple_name.str.contains(query, regex=False, na=False)

    st.markdown(f"""
    The color `{match.simple_name}` is also represented
    in our 3-D space by `({match.r}, {match.g}, {match.b})`.
    Let's see what we can find using either of these properties.
    (Oh, you can adjust the similarity threshold below as well.)
    """)

    with st.expander("Similarity search options"):
        st.markdown(f"""
        Do you want to find lots of similar colors, or
        just a select few *very* similar colors to `{match.simple_name}`.
        """)
        thresh_sel = st.slider('Select a similarity threshold',
                               min_value=20, max_value=160,
                               value=80, step=20)

    st.markdown("---")

    df['size'] = sizes[0]
    df.loc[scalar_filter, 'size'] = sizes[1]
    df.loc[df.simple_name == match.simple_name, 'size'] = sizes[2]
    # The figure must be built before `size` is overwritten for the vector view.
    scalar_fig = build_chart(df)
    scalar_hits = df[scalar_filter]['name'].values

    # ===== VECTOR SEARCH =====
    vector = match[['r', 'g', 'b']].values.tolist()
    dist_metric = 'euc'

    df['dist'] = _compute_distances(df[['r', 'g', 'b']], vector, dist_metric)
    df['size'] = sizes[0]
    if dist_metric == 'euc':
        vec_filter = df['dist'] < thresh_sel
    else:
        vec_filter = df['dist'] < 0.05
    df.loc[vec_filter, 'size'] = sizes[1]
    # Highlight every point that sits exactly on the query vector.
    df.loc[((df['r'] == vector[0]) &
            (df['g'] == vector[1]) &
            (df['b'] == vector[2])
            ), 'size'] = sizes[2]
    vector_fig = build_chart(df)
    vector_hits = df[vec_filter].sort_values('dist')['name'].values

    # ===== OUTPUTS =====
    col1, col2 = st.columns(2)
    with col1:
        st.markdown(f"These colors contain the text: `{match.simple_name}`:")
        st.plotly_chart(scalar_fig, use_container_width=True)
        st.markdown(f"Found {len(scalar_hits)} colors containing the string `{query}`.")
        with st.expander("Click to see the whole list"):
            st.markdown("- " + "\n- ".join(scalar_hits))
    with col2:
        st.markdown(f"These colors are close to the vector `({match.r}, {match.g}, {match.b})`:")
        st.plotly_chart(vector_fig, use_container_width=True)
        st.markdown(f"Found {len(vector_hits)} colors similar to `{query}` based on its `(R, G, B)` values.")
        with st.expander("Click to see the whole list"):
            st.markdown("- " + "\n- ".join(vector_hits))

    # ===== REFLECTIONS =====
    scalar_hit_names = set(scalar_hits)
    unique_hits = [c for c in vector_hits if c not in scalar_hit_names]

    st.markdown("---")
    st.header("So what?")
    st.markdown("""
    What did you notice?
    The thing that stood out to me is how *robust* and *consistent* the vector search results are.
    It manages to find a bunch of related colors regardless of what it's called.
    It doesn't matter that the color 'scarlet' does not contain the word 'red';
    it goes ahead and finds all the neighboring colors based on a consistent criterion.
    It easily found these colors which it otherwise would not have based on the name alone:
    """)
    with st.expander("See list:"):
        st.markdown("- " + "\n- ".join(unique_hits))

    st.markdown("""
    I think it's brilliant - think about how much of a pain word searching is,
    and how inconsistent it is. This has so many advantages!

    ---
    """)

    st.header("Generally speaking...")
    st.markdown("""
    Obviously, this is a pretty simple, self-contained example.
    Colors are particularly suited for representing using just a few numbers, like our primary colors.
    One number represents how much `red` each color contains, another for `green`, and the last for `blue`.
    But that core concept of representing similarity along different properties using numbers
    is exactly what happens in other domains.
    The only differences are in *how many* numbers are used, and what they represent.
    For example, words or documents might be represented by hundreds (e.g. 300 or 768) of AI-derived numbers.
    We'll take a look at those examples as well later on.
    Techniques used to visualise those high-dimensional vectors are called dimensionality reduction techniques.
    If you would like to see this in action, check out
    [this app](https://huggingface.co/spaces/jphwang/reduce_dimensions).
    """)

    st.markdown("""
    ---

    If you liked this - [follow me (@_jphwang) on Twitter](https://twitter.com/_jphwang)!
    """)


if __name__ == '__main__':
    main()