Spaces:

davidwisdom
/

la-metro

Runtime error

App Files Files Community

David Wisdom committed on Dec 9, 2021

Commit

d256b25

•

1 Parent(s): c05b194

plot the example stops on a map as well

Browse files

Files changed (1) hide show

app.py +50 -15

app.py CHANGED Viewed

@@ -15,7 +15,9 @@ from sklearn.cluster import DBSCAN
 def read_stops(p: str):
   """
-  DOCSTRING
   """
   return pd.read_csv(p)
@@ -38,7 +40,12 @@ def read_encodings(p: str) -> tf.Tensor:
 def cluster_encodings(encodings: tf.Tensor) -> np.ndarray:
   """
-  DOCSTRING
   """
   # I know the hyperparams I want from the EDA I did in the notebook
   clusterer = DBSCAN(eps=0.7, min_samples=100).fit(encodings)
@@ -47,7 +54,11 @@ def cluster_encodings(encodings: tf.Tensor) -> np.ndarray:
 def cluster_lat_lon(df: pd.DataFrame) -> np.ndarray:
   """
-  DOCSTRING
   """
   # I know the hyperparams I want from the EDA I did in the notebook
   clusterer = DBSCAN(eps=0.025, min_samples=100).fit(df[['latitude', 'longitude']])
@@ -56,26 +67,28 @@ def cluster_lat_lon(df: pd.DataFrame) -> np.ndarray:
 def plot_example(df: pd.DataFrame, labels: np.ndarray):
   """
-  DOCSTRING
   """
-  plot_size = 800
   labels = labels.astype('str')
-  fig = px.scatter(df, x='longitude', y='latitude',
-                   hover_name='display_name',
-                   color=labels,
-                   opacity=0.5,
-                   color_discrete_sequence=px.colors.qualitative.Safe,
-                   template='presentation',
-                   width=plot_size,
-                   height=plot_size)
-  # fig.show()
   return fig
 def plot_venice_blvd(df: pd.DataFrame, labels: np.ndarray):
   """
-  DOCSTRING
   """
   px.set_mapbox_access_token(st.secrets['mapbox_token'])
   venice_blvd = {'lat': 34.008350,
@@ -107,9 +120,31 @@ def main(data_path: str, enc_path: str):
   # Display the plots with Streamlit
   st.write('# Example of what DBSCAN does')
   st.plotly_chart(example_fig, use_container_width=True)
   st.write('# Venice Blvd')
   st.plotly_chart(venice_fig, use_container_width=True)

 def read_stops(p: str):
   """
+  Read in the .csv file of metro stops
+  :param p: The path to the .csv file of metro stops
   """
   return pd.read_csv(p)
 def cluster_encodings(encodings: tf.Tensor) -> np.ndarray:
   """
+  Cluster the sentence encodings using DBSCAN.
+  :param encodings: A Tensor of sentence encodings with shape
+                    (number of sentences, 512)
+  :returns: a NumPy array of the cluster labels
   """
   # I know the hyperparams I want from the EDA I did in the notebook
   clusterer = DBSCAN(eps=0.7, min_samples=100).fit(encodings)
 def cluster_lat_lon(df: pd.DataFrame) -> np.ndarray:
   """
+  Cluster the metro stops by their latitude and longitude using DBSCAN.
+  :param df: A Pandas DataFrame of stops that has 'latitude' and 'longitude' columns
+  :returns: a NumPy array of the cluster labels
   """
   # I know the hyperparams I want from the EDA I did in the notebook
   clusterer = DBSCAN(eps=0.025, min_samples=100).fit(df[['latitude', 'longitude']])
 def plot_example(df: pd.DataFrame, labels: np.ndarray):
   """
+  Plot the geographic clustering
+  :param df: A Pandas DataFrame of stops that has 'latitude' and 'longitude' columns
+  :param labels: a NumPy array of the cluster labels
   """
+  px.set_mapbox_access_token(st.secrets['mapbox_token'])
   labels = labels.astype('str')
+  fig = px.scatter_mapbox(df, x='longitude', y='latitude',
+                          hover_name='display_name',
+                          color=labels,
+                          zoom=10,
+                          color_discrete_sequence=px.colors.qualitative.Safe,
   return fig
 def plot_venice_blvd(df: pd.DataFrame, labels: np.ndarray):
   """
+  Plot the metro stops and color them based on their names
+  :param df: A Pandas DataFrame of stops that has 'latitude' and 'longitude' columns
+  :param labels: a NumPy array of the cluster labels
   """
   px.set_mapbox_access_token(st.secrets['mapbox_token'])
   venice_blvd = {'lat': 34.008350,
   # Display the plots with Streamlit
   st.write('# Example of what DBSCAN does')
+  st.write("""As an example of a typical DBSCAN result, I've clustered the
+stops by their geographic location.
+The algorithm finds three clusters.
+Points labeled `-1` aren't part of any cluster.
+Clicking on `-1` in the legend will turn off those points."""
   st.plotly_chart(example_fig, use_container_width=True)
   st.write('# Venice Blvd')
+  st.write("""I encoded the names of all the stops using the Universal Sentence Encoder v4.
+I then clustered those encodings so that I could group the stops based on their names
+instead of their geographic position.
+As I expected, stops on the same road end up close enough to each other that DBSCAN can cluster them together.
+Sometimes, however, a stop has a name that means something to the encoder.
+When that happens, the encoding ends up too far away from the rest of the stops on that road.
+For example, the stops on Venice Blvd get clustered together,
+but the stop `Venice / Lincoln` ends up somewhere else.
+I assume it ends up somewhere else because the encoder recognizes "Lincoln"
+and that meaning overpowers the "Venice" meaning enough that the encoding
+is too far away from the rest of the "Venice" stops.
+A few other examples on Venice Blvd are "Saint Andrews," "Harvard," and "Beethoven."
+There are a few that I don't ascribe much meaning to, such as "Girard" and "Jasmine."
+My mind first jumps to adversarial prompts that use famous names to move the encoding
+around in the encoding space.
+There's a lot more to dig into here but I'll leave it there for now.
+"""
   st.plotly_chart(venice_fig, use_container_width=True)