Spaces:
Runtime error
Runtime error
File size: 4,480 Bytes
869e1b8 f5c0c01 21d27ae f5c0c01 9f7559c 21d27ae f5c0c01 21d27ae f5c0c01 42de6bd f5c0c01 42de6bd 9f7559c 42de6bd 9f7559c 42de6bd f5c0c01 42de6bd 9f7559c 42de6bd 9f7559c 42de6bd 9f7559c 42de6bd 9f7559c 42de6bd 9f7559c 42de6bd 9f7559c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
from typing import Any, Dict, Iterable, Optional

import numpy as np
import pandas as pd
import PIL
import plotly.express as px
import plotly.graph_objects as go
import wordcloud
from pydantic import BaseModel, Field
from sklearn.feature_extraction.text import TfidfVectorizer
class WordCloudExtractor(BaseModel):
    """Render word-cloud images whose word weights are mean TF-IDF scores."""

    # Upper bound on the number of distinct words passed to the word cloud.
    max_words: int = 50
    # Keyword arguments forwarded verbatim to ``wordcloud.WordCloud``.
    wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
    # Keyword arguments forwarded to ``TfidfVectorizer``.
    tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})

    # The return annotation is quoted so it is not evaluated at definition
    # time; a bare ``import PIL`` does not guarantee ``PIL.Image`` is loaded.
    def extract_wordcloud_image(self, texts: Iterable[str]) -> "PIL.Image.Image":
        """Return a word-cloud image for ``texts``.

        Args:
            texts: Iterable of text documents.

        Returns:
            The image produced by ``WordCloud.to_image()``.
        """
        frequencies = self._extract_frequencies(
            texts, self.max_words, tfidf_params=self.tfidf_params
        )
        wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
        return wc.to_image()

    @classmethod
    def _extract_frequencies(
        cls,
        texts: Iterable[str],
        max_words: int = 100,
        tfidf_params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, float]:
        """Compute per-word mean TF-IDF scores over a corpus.

        Args:
            texts: Iterable of text documents.
            max_words: Maximum number of words to keep; passed to the
                vectorizer as ``max_features``.
            tfidf_params: Extra keyword arguments for ``TfidfVectorizer``.
                ``None`` means no extra arguments. A ``max_features`` entry
                here takes precedence over ``max_words``.

        Returns:
            Mapping from word to its TF-IDF score averaged across documents,
            suitable for ``WordCloud.generate_from_frequencies``.
        """
        # Merge into one dict so a caller-supplied ``max_features`` overrides
        # ``max_words`` instead of raising a duplicate-keyword TypeError.
        vectorizer_kwargs = {"max_features": max_words, **(tfidf_params or {})}
        vectorizer = TfidfVectorizer(**vectorizer_kwargs)

        tfidf_matrix = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()

        # Column-wise mean gives one score per word; flatten the 1 x n result.
        mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
        return dict(zip(feature_names, mean_tfidf))
class EmbeddingVisualizer(BaseModel):
    """Build Plotly scatterplots from a dataframe of 2-D embedding coordinates.

    The methods read the ``"representation"``, ``"is_task"`` and ``"name"``
    columns of ``display_df``, plus whichever columns ``plot_kwargs`` names
    for the axes (``"x"`` and ``"y"`` by default).
    """

    # Points to plot, one row per embedded item.
    display_df: pd.DataFrame
    # Keyword arguments forwarded to every ``px.scatter`` call.
    plot_kwargs: Dict[str, Any] = Field(default_factory=lambda: dict(
        range_x=(3, 16.5),
        range_y=(-3, 11),
        width=1200,
        height=800,
        x="x",
        y="y",
        template="plotly_white",
    ))

    class Config:
        # Required so pydantic accepts the ``pd.DataFrame`` field.
        arbitrary_types_allowed = True

    def make_embedding_plots(
        self,
        color_col: Optional[str] = None,
        hover_data: Optional[list] = None,
        filter_df_fn=None,
    ) -> Dict[str, Any]:
        """Build one scatterplot per group of representations.

        Args:
            color_col: Column used to color the points, or ``None``.
            hover_data: Columns shown on hover; ``None`` means ``["name"]``.
            filter_df_fn: Optional callable applied to ``display_df`` before
                plotting; it must return a dataframe.

        Returns:
            Dict mapping a group title to its Plotly figure.
        """
        if hover_data is None:
            hover_data = ["name"]

        display_df = self.display_df
        if filter_df_fn is not None:
            display_df = filter_df_fn(display_df)
        display_df = display_df.sort_values("representation", ascending=False)

        # Every group also contains the "task" rows.
        representation_groups = {
            "READMEs": ["readme", "generated_readme", "task"],
            "Basic representations": ["dependency_signature", "selected_code", "task"],
            "Dependency graph based representations": [
                "repository_signature",
                "dependency_signature",
                "generated_tasks",
                "task",
            ],
        }
        return {
            title: self._make_task_and_repos_scatterplot(
                display_df[display_df["representation"].isin(representations)],
                hover_data,
                color_col,
            )
            for title, representations in representation_groups.items()
        }

    def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
        """Return a scatterplot of ``df`` with the trace order reversed."""
        # ``assign`` returns a new frame, so the helper columns are never
        # written into the caller's slice of ``display_df`` (the original
        # in-place writes triggered pandas' SettingWithCopyWarning).
        # The scatter call below does not reference these columns directly;
        # they are only used if ``hover_data`` or ``plot_kwargs`` name them.
        df = df.assign(
            size=df["is_task"].apply(lambda is_task: 0.25 if is_task else 0.1),
            symbol=df["is_task"].apply(int),
        )
        combined_fig = px.scatter(
            df,
            hover_name="name",
            hover_data=hover_data,
            color=color_col,
            color_discrete_sequence=px.colors.qualitative.Set1,
            opacity=0.5,
            **self.plot_kwargs
        )
        # Reverse the trace order produced by plotly express.
        combined_fig.data = combined_fig.data[::-1]
        return combined_fig

    def make_task_area_scatterplot(
        self, n_areas: int = 6, tasks_csv_path: str = "data/paperswithcode_tasks.csv"
    ):
        """Plot the task points colored by area, keeping the most common areas.

        Args:
            n_areas: Number of most frequent areas to keep.
            tasks_csv_path: CSV file with at least ``"task"`` and ``"area"``
                columns; joined to the task rows on ``name == task``.

        Returns:
            Plotly figure of the retained task points.
        """
        display_df = self.display_df
        tasks_df = display_df[display_df["representation"] == "task"].sort_values("representation")
        tasks_df = tasks_df.merge(pd.read_csv(tasks_csv_path), left_on="name", right_on="task")

        top_areas = tasks_df["area"].value_counts().head(n_areas).index
        tasks_df = tasks_df[tasks_df["area"].isin(top_areas)]

        tasks_fig = px.scatter(
            tasks_df, color="area", hover_data=["name"], opacity=0.7, **self.plot_kwargs
        )
        print("N DISPLAYED TASKS", len(tasks_df))
        return tasks_fig
|