Update github_repo_analyzer.py

github_repo_analyzer.py CHANGED (+36 -14)
@@ -2,22 +2,25 @@ import os
 import sys
 import tempfile
 import shutil
-from urllib.parse import urlparse, quote
 import requests
-from github import Github
-from git import Repo
-from collections import defaultdict
 import time
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import cosine_similarity
 import subprocess
 import json
-from pathlib import Path
 import traceback
 import argparse
 import re
+import warnings
+import numpy as np
+
+from collections import defaultdict
+from pathlib import Path
+from urllib.parse import urlparse, quote
+from github import Github
+from git import Repo
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.exceptions import ConvergenceWarning
 
 def run_semgrep(repo_path):
     try:
@@ -211,6 +214,12 @@ def parse_llm_response(response):
     return []
 
 def cluster_and_filter_items(items, n_clusters=5, n_items=10):
+    if len(items) == 0:
+        return []
+
+    if len(items) <= n_items:
+        return items
+
     # Combine title and body for text analysis
     texts = [f"{item['title']} {item['body']}" for item in items]
 
@@ -218,27 +227,40 @@ def cluster_and_filter_items(items, n_clusters=5, n_items=10):
     vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
     tfidf_matrix = vectorizer.fit_transform(texts)
 
+    # Determine the number of clusters
+    n_clusters = min(n_clusters, len(items))
+
     # Perform clustering
-    kmeans = KMeans(n_clusters=n_clusters)
-    kmeans.fit(tfidf_matrix)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        kmeans = KMeans(n_clusters=n_clusters)
+        kmeans.fit(tfidf_matrix)
 
     # Get cluster centers
     cluster_centers = kmeans.cluster_centers_
 
     # Find items closest to cluster centers
     filtered_items = []
-    for i in range(n_clusters):
+    for i in range(n_clusters):
         cluster_items = [item for item, label in zip(items, kmeans.labels_) if label == i]
         cluster_vectors = tfidf_matrix[kmeans.labels_ == i]
 
+        if cluster_vectors.shape[0] == 0:
+            continue
+
         # Calculate similarities to cluster center
         similarities = cosine_similarity(cluster_vectors, cluster_centers[i].reshape(1, -1)).flatten()
 
         # Sort items by similarity and select top ones
         sorted_items = [x for _, x in sorted(zip(similarities, cluster_items), key=lambda pair: pair[0], reverse=True)]
-        filtered_items.extend(sorted_items[:n_items // n_clusters])
+        filtered_items.extend(sorted_items[:max(1, n_items // n_clusters)])
+
+    # If we didn't get enough items, add more from the original list
+    if len(filtered_items) < n_items:
+        remaining_items = [item for item in items if item not in filtered_items]
+        filtered_items.extend(remaining_items[:n_items - len(filtered_items)])
 
-    return filtered_items
+    return filtered_items[:n_items]
 
 def safe_filter_open_items(open_items, closed_patterns, n_items=10):
     try:
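For context, a minimal usage sketch of the patched cluster_and_filter_items. The sample items below are hypothetical; the function only assumes each item is a dict carrying 'title' and 'body' string fields, per the texts construction in the diff.

# Hypothetical usage sketch, assuming github_repo_analyzer.py is importable.
from github_repo_analyzer import cluster_and_filter_items

# Toy items (made-up data) shaped like GitHub issues/PRs.
items = [
    {"title": "Crash on startup", "body": "KeyError when the config file is missing"},
    {"title": "Crash parsing config", "body": "Traceback on malformed YAML"},
    {"title": "Add dark mode", "body": "Feature request for a dark UI theme"},
    {"title": "Fix README typo", "body": "Spelling mistake in the install section"},
]

# len(items) <= n_items, so the new guard returns the list unchanged
# instead of running TF-IDF/KMeans on a tiny corpus.
print(len(cluster_and_filter_items(items, n_clusters=5, n_items=10)))  # 4

# A smaller n_items exercises the clustering path: n_clusters is capped
# at len(items), so KMeans never gets more clusters than samples, and
# ConvergenceWarning from such degenerate fits is suppressed.
top = cluster_and_filter_items(items, n_clusters=5, n_items=2)
print([item["title"] for item in top])  # two representative titles

With the second call, each of the four items lands in its own cluster, max(1, n_items // n_clusters) keeps one representative per cluster, and the final filtered_items[:n_items] slice enforces the requested cap.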