codelion committed
Commit be31411
1 Parent(s): 687ab5b

Update github_repo_analyzer.py

Files changed (1)
  1. github_repo_analyzer.py +36 -14
github_repo_analyzer.py CHANGED
@@ -2,22 +2,25 @@ import os
 import sys
 import tempfile
 import shutil
-from urllib.parse import urlparse, quote
 import requests
-from github import Github
-from git import Repo
-from collections import defaultdict
 import time
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import cosine_similarity
 import subprocess
 import json
-from pathlib import Path
 import traceback
 import argparse
 import re
+import warnings
+import numpy as np
+
+from collections import defaultdict
+from pathlib import Path
+from urllib.parse import urlparse, quote
+from github import Github
+from git import Repo
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.exceptions import ConvergenceWarning

 def run_semgrep(repo_path):
     try:
@@ -211,6 +214,12 @@ def parse_llm_response(response):
     return []

 def cluster_and_filter_items(items, n_clusters=5, n_items=10):
+    if len(items) == 0:
+        return []
+
+    if len(items) <= n_items:
+        return items
+
     # Combine title and body for text analysis
     texts = [f"{item['title']} {item['body']}" for item in items]

@@ -218,27 +227,40 @@ def cluster_and_filter_items(items, n_clusters=5, n_items=10):
     vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
     tfidf_matrix = vectorizer.fit_transform(texts)

+    # Determine the number of clusters
+    n_clusters = min(n_clusters, len(items))
+
     # Perform clustering
-    kmeans = KMeans(n_clusters=min(n_clusters, len(items)))
-    kmeans.fit(tfidf_matrix)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        kmeans = KMeans(n_clusters=n_clusters)
+        kmeans.fit(tfidf_matrix)

     # Get cluster centers
     cluster_centers = kmeans.cluster_centers_

     # Find items closest to cluster centers
     filtered_items = []
-    for i in range(min(n_clusters, len(items))):
+    for i in range(n_clusters):
         cluster_items = [item for item, label in zip(items, kmeans.labels_) if label == i]
         cluster_vectors = tfidf_matrix[kmeans.labels_ == i]

+        if cluster_vectors.shape[0] == 0:
+            continue
+
         # Calculate similarities to cluster center
         similarities = cosine_similarity(cluster_vectors, cluster_centers[i].reshape(1, -1)).flatten()

         # Sort items by similarity and select top ones
         sorted_items = [x for _, x in sorted(zip(similarities, cluster_items), key=lambda pair: pair[0], reverse=True)]
-        filtered_items.extend(sorted_items[:min(n_items // n_clusters, len(sorted_items))])
+        filtered_items.extend(sorted_items[:max(1, n_items // n_clusters)])
+
+    # If we didn't get enough items, add more from the original list
+    if len(filtered_items) < n_items:
+        remaining_items = [item for item in items if item not in filtered_items]
+        filtered_items.extend(remaining_items[:n_items - len(filtered_items)])

-    return filtered_items
+    return filtered_items[:n_items]

 def safe_filter_open_items(open_items, closed_patterns, n_items=10):
     try:
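
For context, a minimal usage sketch of the revised cluster_and_filter_items (the sample issues and the direct import are illustrative assumptions, not part of the commit). The function expects a list of dicts with 'title' and 'body' keys, as the f-string in the diff implies:

    # Hypothetical usage of the updated function; the sample data is invented.
    from github_repo_analyzer import cluster_and_filter_items

    issues = (
        [{"title": f"Crash parsing file {i}", "body": "Traceback in the parser"} for i in range(30)]
        + [{"title": f"Docs typo {i}", "body": "README wording fix"} for i in range(30)]
    )

    # 60 items, n_clusters=5, n_items=10: TF-IDF vectors are grouped into 5
    # clusters and the max(1, 10 // 5) = 2 items nearest each center are kept,
    # then the result is topped up from the original list if a cluster was short.
    selected = cluster_and_filter_items(issues, n_clusters=5, n_items=10)
    assert len(selected) <= 10

With len(items) <= n_items the new early returns skip clustering entirely, and the final [:n_items] slice enforces the cap even after the top-up pass.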
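The warnings block in the diff addresses a real scikit-learn behavior: KMeans emits a ConvergenceWarning when it finds fewer distinct clusters than requested, which near-duplicate issue texts make likely. A standalone sketch of that behavior (the toy array is invented):

    # Duplicate points make KMeans find fewer distinct clusters than n_clusters,
    # which triggers the ConvergenceWarning the commit suppresses.
    import warnings

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.exceptions import ConvergenceWarning

    X = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0]])  # only two distinct points

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        kmeans = KMeans(n_clusters=3, n_init=10).fit(X)  # would warn without the filter

    print(kmeans.labels_)  # three labels, but only two meaningful centers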