lambdaofgod committed
Commit 9f7559c
1 Parent(s): 89d0cf9

scatterplots

app.py CHANGED
@@ -5,7 +5,7 @@ import re
 from task_visualizations import TaskVisualizations
 import plotly.graph_objects as go
 from functools import partial
- from text_visualization import WordCloudExtractor
+ from text_visualization import WordCloudExtractor, EmbeddingVisualizer

 logging.basicConfig(level=logging.INFO)

@@ -108,6 +108,15 @@ def setup_repository_representations_tab(repos, representation_types):
 )


+ def load_embeddings_intro_description():
+     return """
+ The following plots show embeddings obtained with the MPNet sentence transformer, projected to 2D with the UMAP dimensionality reduction algorithm.
+
+ In the first scatterplot we display PapersWithCode tasks, colored by area.
+ """
+
+ def load_embeddings_description():
+     return ""  # placeholder; the per-plot texts live in the descriptions dict below
 ## main
 repos_df = load_repo_df(AppConfig.repo_representations_path)
 repos = list(repos_df["repo_name"].unique())
@@ -119,18 +128,45 @@ task_visualizations = TaskVisualizations(
     AppConfig.selected_task_counts_path,
     AppConfig.tasks_path,
 )
+ display_df = pd.read_parquet("data/selected_repos_representations_umap2d.parquet")
+ display_df["is_task"] = display_df["representation"] == "task"
+ embedding_visualizer = EmbeddingVisualizer(display_df=display_df)
+
+
+ descriptions = {
+     "intro": load_embeddings_intro_description(),
+
+     "Basic representations": """We now show the embeddings of tasks and repositories, using various texts as representations.
+
+ It should not be surprising that selected code and dependency signatures (which consist mostly of a repository's file names) are dissimilar from task names;
+ for our problem this illustrates that these representations work poorly for retrieval.
+ """,
+     "Dependency graph based representations": """
+ Note the difference between the embeddings of generated tasks and of the repository signatures that contain them.
+ """,
+     "READMEs": """
+ """,
+ }

 with gr.Blocks() as demo:
+     with gr.Tab("Explore Repository Embeddings"):
+
+         tab_elems = [
+             gr.Markdown("## Tasks by area"),
+             gr.Markdown(descriptions["intro"]),
+             gr.Plot(embedding_visualizer.make_task_area_scatterplot()),
+         ]
+
+         embedding_plots = embedding_visualizer.make_embedding_plots(color_col="representation")
+         for plot_name in ["Basic representations", "Dependency graph based representations", "READMEs"]:
+             tab_elems.append(gr.Markdown(f"## {plot_name}"))
+             if descriptions.get(plot_name):
+                 tab_elems.append(gr.Markdown(descriptions[plot_name]))
+             tab_elems.append(gr.Plot(embedding_plots[plot_name]))
+         gr.Column(tab_elems)
     with gr.Tab("Explore Repository Representations"):
         setup_repository_representations_tab(repos, representation_types)
     with gr.Tab("Explore PapersWithCode Tasks"):
-         task_counts_description = """
-         ## PapersWithCode Tasks Visualization
-
-         PapersWithCode tasks are grouped by area.
-
-         In addition to showing task distribution across the original dataset we display task counts in the repositories we selected.
-         """.strip()

         gr.Markdown(task_counts_description)

@@ -168,4 +204,6 @@ with gr.Blocks() as demo:
         outputs=[selected_repos_tasks_plot],
     )

-     demo.launch()
+     gr.Plot(embedding_visualizer.make_task_area_scatterplot())
+
+ demo.launch(share=True)
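
For context, the parquet file added below stores the 2D coordinates that the new tab plots. Here is a rough sketch (not part of the commit) of how such a file could be produced with the `sentence-transformers` and `umap-learn` dependencies this commit adds; the checkpoint name `all-mpnet-base-v2` and the input frame `texts_df` (columns `name`, `representation`, `text`) are assumptions:

```python
import pandas as pd
import umap
from sentence_transformers import SentenceTransformer

def embed_and_project_2d(texts_df: pd.DataFrame) -> pd.DataFrame:
    # Encode each representation text with an MPNet sentence transformer
    model = SentenceTransformer("all-mpnet-base-v2")  # assumed checkpoint
    embeddings = model.encode(texts_df["text"].tolist(), show_progress_bar=True)
    # Reduce the embeddings to 2D with UMAP, as described in the intro text
    xy = umap.UMAP(n_components=2).fit_transform(embeddings)
    out = texts_df[["name", "representation"]].copy()
    out["x"], out["y"] = xy[:, 0], xy[:, 1]
    return out

# embed_and_project_2d(texts_df).to_parquet(
#     "data/selected_repos_representations_umap2d.parquet"
# )
```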
data/selected_repos_representations_umap2d.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26f519620fb265574be6034ed18419b58fa7d345d17b9dc180a938ef3f37ecc8
+ size 18983840
pyproject.toml CHANGED
@@ -5,7 +5,15 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
+     "bm25s>=0.2.3",
+     "datasets>=3.1.0",
+     "gradio>=5.5.0",
+     "llvmlite==0.41.0",
+     "nbformat>=5.10.4",
+     "plotly>=5.24.1",
      "pydantic>=2.9.2",
      "scikit-learn>=1.5.2",
+     "sentence-transformers>=3.3.1",
+     "umap-learn>=0.5.7",
      "wordcloud>=1.9.3",
 ]
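
A note on the pins: `llvmlite==0.41.0` is presumably held back for compatibility with `numba`, which `umap-learn` pulls in transitively. A quick sanity check that the newly added dependencies resolve and import together (top-level module names are inferred from the package names above, e.g. the `umap-learn` package installs the `umap` module):

```python
import importlib

# Modules corresponding to the dependencies added in this commit (inferred names)
modules = ["bm25s", "datasets", "gradio", "llvmlite", "nbformat",
           "plotly", "sentence_transformers", "umap"]

for name in modules:
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "unknown version"))
```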
text_visualization.py CHANGED
@@ -4,6 +4,9 @@ import wordcloud
 from pydantic import BaseModel, Field
 import numpy as np
 import PIL
+ import plotly.express as px
+ import pandas as pd
+ import plotly.graph_objects as go


 class WordCloudExtractor(BaseModel):
@@ -21,11 +24,11 @@ class WordCloudExtractor(BaseModel):
     """
     Extract word frequencies from a corpus using TF-IDF vectorization
     and generate word cloud frequencies.
-
+
     Args:
         texts: List of text documents
        max_words: Maximum number of words to include
-
+
     Returns:
         Dictionary of word frequencies suitable for WordCloud
     """
@@ -34,17 +37,79 @@ class WordCloudExtractor(BaseModel):
         max_features=max_words,
         **tfidf_params
     )
-
+
     # Fit and transform the texts
     tfidf_matrix = tfidf.fit_transform(texts)
-
+
     # Get feature names (words)
     feature_names = tfidf.get_feature_names_out()
-
+
     # Calculate mean TF-IDF scores across documents
     mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
-
+
     # Create frequency dictionary
     frequencies = dict(zip(feature_names, mean_tfidf))
-
+
     return frequencies
+
+
+ class EmbeddingVisualizer(BaseModel):
+     display_df: pd.DataFrame
+     plot_kwargs: Dict[str, Any] = Field(default_factory=lambda: dict(
+         range_x=(3, 16.5),
+         range_y=(-3, 11),
+         width=1200,
+         height=800,
+         x="x",
+         y="y",
+         template="plotly_white",
+     ))
+
+     def make_embedding_plots(self, color_col=None, hover_data=["name"], filter_df_fn=None):
+         """
+         Plot Plotly scatterplots of the UMAP embeddings, one per representation group.
+         Returns a dict mapping plot names to figures.
+         """
+         display_df = self.display_df
+         if filter_df_fn is not None:
+             display_df = filter_df_fn(display_df)
+
+         display_df = display_df.sort_values("representation", ascending=False)
+         readme_df = display_df[display_df["representation"].isin(["readme", "generated_readme", "task"])]
+         raw_df = display_df[display_df["representation"].isin(["dependency_signature", "selected_code", "task"])]
+         dependency_df = display_df[display_df["representation"].isin(["repository_signature", "dependency_signature", "generated_tasks", "task"])]
+
+         plots = [
+             self._make_task_and_repos_scatterplot(df, hover_data, color_col)
+             for df in [readme_df, raw_df, dependency_df]
+         ]
+         return dict(zip(["READMEs", "Basic representations", "Dependency graph based representations"], plots))
+
+     def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
+         # Work on a copy to avoid mutating the caller's slice, then give task
+         # points a larger marker and a distinct symbol so they stand out
+         df = df.copy()
+         df["size"] = df["is_task"].apply(lambda x: 0.25 if x else 0.1)
+         df["symbol"] = df["is_task"].apply(int)
+
+         combined_fig = px.scatter(
+             df,
+             hover_name="name",
+             hover_data=hover_data,
+             color=color_col,
+             size="size",
+             symbol="symbol",
+             color_discrete_sequence=px.colors.qualitative.Set1,
+             opacity=0.5,
+             **self.plot_kwargs
+         )
+         # Reverse trace order so that task points are drawn on top
+         combined_fig.data = combined_fig.data[::-1]
+
+         return combined_fig
+
+     def make_task_area_scatterplot(self, n_areas=6):
+         display_df = self.display_df
+         displayed_tasks_df = display_df[display_df["representation"] == "task"].sort_values("representation")
+         displayed_tasks_df = displayed_tasks_df.merge(pd.read_csv("data/paperswithcode_tasks.csv"), left_on="name", right_on="task")
+         # Keep only the n_areas most frequent areas
+         displayed_tasks_df = displayed_tasks_df[displayed_tasks_df["area"].isin(displayed_tasks_df["area"].value_counts().head(n_areas).index)]
+         tasks_fig = px.scatter(displayed_tasks_df, color="area", hover_data=["name"], opacity=0.7, **self.plot_kwargs)
+         print("N DISPLAYED TASKS", len(displayed_tasks_df))
+         return tasks_fig
+
+     class Config:
+         arbitrary_types_allowed = True
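
For reference, `EmbeddingVisualizer` is consumed in `app.py` roughly as below; this is a standalone sketch mirroring that usage (the parquet path and column names are taken from this commit):

```python
import pandas as pd
from text_visualization import EmbeddingVisualizer

# Load the precomputed 2D UMAP coordinates and flag the task rows
display_df = pd.read_parquet("data/selected_repos_representations_umap2d.parquet")
display_df["is_task"] = display_df["representation"] == "task"

visualizer = EmbeddingVisualizer(display_df=display_df)

# One plotly Figure per representation group
plots = visualizer.make_embedding_plots(color_col="representation")
plots["Dependency graph based representations"].show()

# Tasks alone, colored by their PapersWithCode area
visualizer.make_task_area_scatterplot().show()
```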