frankjosh committed on
Commit
877a41e
·
verified ·
1 Parent(s): b8a76c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -42
app.py CHANGED
@@ -24,7 +24,7 @@ if 'feedback' not in st.session_state:
24
  st.session_state.feedback = {}
25
 
26
  # Define subset size
27
- SUBSET_SIZE = 500 # Starting with 500 items for quick testing
28
 
29
  class TextDataset(Dataset):
30
  def __init__(self, texts: List[str], tokenizer, max_length: int = 512):
@@ -44,6 +44,109 @@ class TextDataset(Dataset):
44
  return_tensors="pt"
45
  )
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  @st.cache_resource
48
  def load_data_and_model():
49
  """Load the dataset and model with optimized memory usage"""
@@ -111,7 +214,7 @@ def precompute_embeddings(data: pd.DataFrame, model, tokenizer, batch_size: int
111
  batch_size=batch_size,
112
  shuffle=False,
113
  collate_fn=partial(collate_fn, pad_token_id=tokenizer.pad_token_id),
114
- num_workers=2, # Reduced workers for smaller dataset
115
  pin_memory=True
116
  )
117
 
@@ -175,7 +278,6 @@ st.info(f"Running with a subset of {SUBSET_SIZE} repositories for testing purpos
175
  # Precompute embeddings for the subset
176
  data = precompute_embeddings(data, model, tokenizer)
177
 
178
-
179
  # Main App Interface
180
  st.title("Repository Recommender System πŸš€")
181
  st.caption("Testing Version - Running on subset of data")
@@ -199,50 +301,16 @@ if search_button and user_query.strip():
199
  # Generate query embedding and get recommendations
200
  query_embedding = generate_query_embedding(model, tokenizer, user_query)
201
  recommendations = find_similar_repos(query_embedding, data, top_n)
202
-
203
  # Save to history
204
  st.session_state.history.append({
205
  'query': user_query,
206
  'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
207
  'results': recommendations['repo'].tolist()
208
  })
209
-
210
- # Display recommendations
211
- st.markdown("### 🎯 Top Recommendations")
212
- for idx, row in recommendations.iterrows():
213
- st.markdown(f"#### Repository {idx + 1}: {row['repo']}")
214
-
215
- # Repository details in columns
216
- col1, col2 = st.columns([2, 1])
217
- with col1:
218
- st.markdown(f"**URL:** [View Repository]({row['url']})")
219
- st.markdown(f"**Path:** `{row['path']}`")
220
- with col2:
221
- st.metric("Match Score", f"{row['similarity']:.2%}")
222
-
223
- # Feedback buttons in columns
224
- feedback_col1, feedback_col2 = st.columns([1, 4])
225
- with feedback_col1:
226
- if st.button("πŸ‘", key=f"like_{idx}"):
227
- save_feedback(row['repo'], 'likes')
228
- st.success("Thanks for your feedback!")
229
- if st.button("πŸ‘Ž", key=f"dislike_{idx}"):
230
- save_feedback(row['repo'], 'dislikes')
231
- st.success("Thanks for your feedback!")
232
-
233
- # Case Study and Documentation in tabs instead of nested expanders
234
- tab1, tab2 = st.tabs(["πŸ“‘ Case Study Brief", "πŸ“š Documentation"])
235
-
236
- with tab1:
237
- st.markdown(generate_case_study(row))
238
-
239
- with tab2:
240
- if row['docstring']:
241
- st.markdown(row['docstring'])
242
- else:
243
- st.info("No documentation available")
244
-
245
- st.markdown("---") # Separator between repositories
246
 
247
  # Sidebar for History and Stats
248
  with st.sidebar:
@@ -274,4 +342,4 @@ st.markdown(
274
  GPU Status: {'🟒 Enabled' if torch.cuda.is_available() else 'πŸ”΄ Disabled'} |
275
  Model: CodeT5-Small
276
  """
277
- )
 
24
  st.session_state.feedback = {}
25
 
26
  # Define subset size
27
+ SUBSET_SIZE = 500 # Starting with 500 items for quick testing
28
 
29
  class TextDataset(Dataset):
30
  def __init__(self, texts: List[str], tokenizer, max_length: int = 512):
 
44
  return_tensors="pt"
45
  )
46
 
47
def _clean_text(value: Any) -> str:
    """Return *value* stripped if it is a string, otherwise an empty string.

    Row fields may be missing, ``None`` or NaN (a float) rather than text;
    treating anything that is not a ``str`` as "no text" keeps the callers
    from crashing on ``.strip()``.
    """
    return value.strip() if isinstance(value, str) else ""


def _path_tokens(path: str) -> List[str]:
    """Split a lower-cased path into alphanumeric tokens (``a/b_c.py`` -> a, b, c, py)."""
    normalized = "".join(ch if ch.isalnum() else " " for ch in path.lower())
    return normalized.split()


def generate_case_study(row: Dict[str, Any]) -> str:
    """Generate a Markdown case study for a repository from its metadata.

    Args:
        row: Mapping-like object (anything with ``.get``) that may provide the
            keys ``'summary'``, ``'docstring'``, ``'repo'`` and ``'path'``.
            Missing, ``None`` or non-string values are treated as empty text.

    Returns:
        A Markdown string with the sections Overview, Technical
        Implementation, Key Features, Use Cases and Integration
        Considerations.
    """
    summary = _clean_text(row.get('summary', ''))
    docstring = _clean_text(row.get('docstring', ''))
    path = _clean_text(row.get('path', '')).lower()

    # Overview: the summary (or a generic fallback), optionally extended with
    # the first paragraph of the docstring.
    overview = summary if summary else "This repository provides a software implementation"
    if docstring:
        first_para = docstring.split('\n\n')[0].strip()
        # Drop a trailing period before joining so we never emit "..".
        overview = f"{overview.rstrip('.')}. {first_para}"

    # Infer the technology stack from the repository path.
    path_components = path.split('/')
    tokens = set(_path_tokens(path))

    tech_stack = []
    if any('python' in comp for comp in path_components):
        tech_stack.append("Python")
    # 'tf' must match as a whole token: a plain substring test would also fire
    # on unrelated names such as "platform" or "outfile".
    if 'tensorflow' in path or tokens & {'tf', 'tf1', 'tf2'}:
        tech_stack.append("TensorFlow")
    if any('pytorch' in comp for comp in path_components):
        tech_stack.append("PyTorch")
    if any('react' in comp for comp in path_components):
        tech_stack.append("React")

    tech_stack_str = ", ".join(tech_stack) if tech_stack else "various technologies"

    # The sentence below supplies its own final period.
    focus = summary.rstrip('.').lower() if summary else 'the specified requirements'
    primary = summary if summary else 'Implementation of core project requirements'

    sections = [
        "",
        "### Overview",
        overview,
        "",
        "### Technical Implementation",
        f"This project is built using {tech_stack_str}. The implementation focuses on "
        f"providing a robust and maintainable solution for {focus}.",
        "",
        "### Key Features",
        f"- Primary functionality: {primary}",
        "- Complete documentation and code examples",
        "- Well-structured implementation following best practices",
        "- Modular design for easy integration and customization",
        "",
        "### Use Cases",
        "This repository is particularly valuable for:",
        "- Developers implementing similar functionality in their projects",
        "- Teams looking for reference implementations and best practices",
        "- Projects requiring similar technical capabilities",
        "- Learning and educational purposes in related technical domains",
        "",
        "### Integration Considerations",
        "The repository can be integrated into existing projects, with consideration for:",
        "- Compatibility with existing technology stacks",
        "- Required dependencies and prerequisites",
        "- Potential customization needs",
        "- Performance and scalability requirements",
        "",
    ]
    return "\n".join(sections)
105
+
106
def _record_feedback(repo: str, kind: str) -> None:
    """Increment the ``kind`` ('likes' or 'dislikes') counter for ``repo``.

    Counters live in ``st.session_state.feedback`` and are created on first
    use for a repository.
    """
    counts = st.session_state.feedback.setdefault(repo, {'likes': 0, 'dislikes': 0})
    counts[kind] += 1
    st.success("Thanks for your feedback!")


def display_recommendations(recommendations: pd.DataFrame):
    """Render each recommended repository with its details and feedback buttons.

    Args:
        recommendations: DataFrame whose rows are read via the columns
            ``'repo'``, ``'similarity'``, ``'url'``, ``'path'`` and
            ``'docstring'``; each row is also passed to
            ``generate_case_study``.
    """
    st.markdown("### 🎯 Top Recommendations")

    # `idx` is the DataFrame index label, which is not necessarily 0..n-1, so
    # the visible numbering uses a separate 1-based rank. The label is still
    # used for widget keys to keep them stable per row.
    for rank, (idx, row) in enumerate(recommendations.iterrows(), start=1):
        repo = row['repo']
        with st.container():
            # Header with repository name and match score
            name_col, score_col = st.columns([3, 1])
            with name_col:
                st.markdown(f"### {rank}. {repo}")
            with score_col:
                st.metric("Match Score", f"{row['similarity']:.2%}")

            # Repository details
            st.markdown(f"**URL:** [View Repository]({row['url']})")
            st.markdown(f"**Path:** `{row['path']}`")

            # Feedback buttons (third column is only a spacer)
            like_col, dislike_col, _spacer = st.columns([1, 1, 4])
            with like_col:
                if st.button("👍", key=f"like_{idx}"):
                    _record_feedback(repo, 'likes')
            with dislike_col:
                if st.button("👎", key=f"dislike_{idx}"):
                    _record_feedback(repo, 'dislikes')

            # Documentation and case study in tabs
            docs_tab, case_tab = st.tabs(["📚 Documentation", "📑 Case Study"])
            with docs_tab:
                docstring = row['docstring']
                # A missing value may arrive as NaN, which is truthy; only
                # non-blank strings count as documentation.
                if isinstance(docstring, str) and docstring.strip():
                    st.markdown(docstring)
                else:
                    st.info("No documentation available")

            with case_tab:
                st.markdown(generate_case_study(row))

            st.markdown("---")
149
+
150
  @st.cache_resource
151
  def load_data_and_model():
152
  """Load the dataset and model with optimized memory usage"""
 
214
  batch_size=batch_size,
215
  shuffle=False,
216
  collate_fn=partial(collate_fn, pad_token_id=tokenizer.pad_token_id),
217
+ num_workers=2,
218
  pin_memory=True
219
  )
220
 
 
278
  # Precompute embeddings for the subset
279
  data = precompute_embeddings(data, model, tokenizer)
280
 
 
281
  # Main App Interface
282
  st.title("Repository Recommender System πŸš€")
283
  st.caption("Testing Version - Running on subset of data")
 
301
  # Generate query embedding and get recommendations
302
  query_embedding = generate_query_embedding(model, tokenizer, user_query)
303
  recommendations = find_similar_repos(query_embedding, data, top_n)
304
+
305
  # Save to history
306
  st.session_state.history.append({
307
  'query': user_query,
308
  'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
309
  'results': recommendations['repo'].tolist()
310
  })
311
+
312
+ # Display recommendations using the new function
313
+ display_recommendations(recommendations)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  # Sidebar for History and Stats
316
  with st.sidebar:
 
342
  GPU Status: {'🟒 Enabled' if torch.cuda.is_available() else 'πŸ”΄ Disabled'} |
343
  Model: CodeT5-Small
344
  """
345
+ )