Spaces:

frankjosh
/

repo_recommender

Sleeping

App Files Community

frankjosh committed 16 days ago

Commit

877a41e

verified ·

1 Parent(s): b8a76c3

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -42

app.py CHANGED Viewed

@@ -24,7 +24,7 @@ if 'feedback' not in st.session_state:
     st.session_state.feedback = {}
 # Define subset size
-SUBSET_SIZE = 500 # Starting with 500 items for quick testing
 class TextDataset(Dataset):
     def __init__(self, texts: List[str], tokenizer, max_length: int = 512):
@@ -44,6 +44,109 @@ class TextDataset(Dataset):
             return_tensors="pt"
         )
 @st.cache_resource
 def load_data_and_model():
     """Load the dataset and model with optimized memory usage"""
@@ -111,7 +214,7 @@ def precompute_embeddings(data: pd.DataFrame, model, tokenizer, batch_size: int
         batch_size=batch_size,
         shuffle=False,
         collate_fn=partial(collate_fn, pad_token_id=tokenizer.pad_token_id),
-        num_workers=2,  # Reduced workers for smaller dataset
         pin_memory=True
     )
@@ -175,7 +278,6 @@ st.info(f"Running with a subset of {SUBSET_SIZE} repositories for testing purpos
 # Precompute embeddings for the subset
 data = precompute_embeddings(data, model, tokenizer)
 # Main App Interface
 st.title("Repository Recommender System 🚀")
 st.caption("Testing Version - Running on subset of data")
@@ -199,50 +301,16 @@ if search_button and user_query.strip():
         # Generate query embedding and get recommendations
         query_embedding = generate_query_embedding(model, tokenizer, user_query)
         recommendations = find_similar_repos(query_embedding, data, top_n)
         # Save to history
         st.session_state.history.append({
             'query': user_query,
             'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             'results': recommendations['repo'].tolist()
         })
-        # Display recommendations
-        st.markdown("### 🎯 Top Recommendations")
-        for idx, row in recommendations.iterrows():
-            st.markdown(f"#### Repository {idx + 1}: {row['repo']}")
-            # Repository details in columns
-            col1, col2 = st.columns([2, 1])
-            with col1:
-                st.markdown(f"**URL:** [View Repository]({row['url']})")
-                st.markdown(f"**Path:** `{row['path']}`")
-            with col2:
-                st.metric("Match Score", f"{row['similarity']:.2%}")
-            # Feedback buttons in columns
-            feedback_col1, feedback_col2 = st.columns([1, 4])
-            with feedback_col1:
-                if st.button("👍", key=f"like_{idx}"):
-                    save_feedback(row['repo'], 'likes')
-                    st.success("Thanks for your feedback!")
-                if st.button("👎", key=f"dislike_{idx}"):
-                    save_feedback(row['repo'], 'dislikes')
-                    st.success("Thanks for your feedback!")
-            # Case Study and Documentation in tabs instead of nested expanders
-            tab1, tab2 = st.tabs(["📑 Case Study Brief", "📚 Documentation"])
-            with tab1:
-                st.markdown(generate_case_study(row))
-            with tab2:
-                if row['docstring']:
-                    st.markdown(row['docstring'])
-                else:
-                    st.info("No documentation available")
-            st.markdown("---")  # Separator between repositories
 # Sidebar for History and Stats
 with st.sidebar:
@@ -274,4 +342,4 @@ st.markdown(
     GPU Status: {'🟢 Enabled' if torch.cuda.is_available() else '🔴 Disabled'} |
     Model: CodeT5-Small
     """
-)

     st.session_state.feedback = {}
 # Define subset size
+SUBSET_SIZE = 500  # Starting with 500 items for quick testing
 class TextDataset(Dataset):
     def __init__(self, texts: List[str], tokenizer, max_length: int = 512):
             return_tensors="pt"
         )
def _as_text(value: Any) -> str:
    """Return *value* stripped if it is a string, otherwise an empty string.

    Rows may carry ``None`` or NaN for missing fields; those are treated as
    "no text" instead of raising ``AttributeError`` on ``.strip()``.
    """
    return value.strip() if isinstance(value, str) else ""


def _detect_tech_stack(path: str) -> List[str]:
    """Infer technology names from the components of a repository path."""
    import re  # local import so this block does not depend on file-level imports

    components = path.lower().split('/')
    # Whole tokens (split on non-alphanumerics) let the short marker "tf" be
    # matched exactly; a substring test would also hit "platform", "rtf", ...
    tokens = {
        token
        for comp in components
        for token in re.split(r'[^a-z0-9]+', comp)
        if token
    }

    stack = []
    if any('python' in comp for comp in components):
        stack.append("Python")
    if 'tf' in tokens or any('tensorflow' in comp for comp in components):
        stack.append("TensorFlow")
    if any('pytorch' in comp for comp in components):
        stack.append("PyTorch")
    if any('react' in comp for comp in components):
        stack.append("React")
    return stack


def generate_case_study(row: Dict[str, Any]) -> str:
    """Generate a Markdown case study for a repository from its metadata.

    Args:
        row: Mapping-like record (anything with ``.get``) that may provide
            the keys ``'summary'``, ``'docstring'`` and ``'path'``. Missing
            or non-string values are treated as empty.

    Returns:
        A Markdown string with the sections Overview, Technical
        Implementation, Key Features, Use Cases and Integration
        Considerations.
    """
    summary = _as_text(row.get('summary', ''))
    docstring = _as_text(row.get('docstring', ''))

    # Overview: the summary (or a generic fallback), optionally extended with
    # the first paragraph of the docstring.
    overview = summary if summary else "This repository provides a software implementation"
    if docstring:
        first_para = docstring.split('\n\n')[0].strip()
        # Avoid a doubled period when the overview already ends a sentence.
        separator = " " if overview.endswith(('.', '!', '?')) else ". "
        overview = f"{overview}{separator}{first_para}"

    tech_stack = _detect_tech_stack(_as_text(row.get('path', '')))
    tech_stack_str = ", ".join(tech_stack) if tech_stack else "various technologies"

    case_study = f"""
### Overview
{overview}
### Technical Implementation
This project is built using {tech_stack_str}. The implementation focuses on providing a robust and maintainable solution for {summary.lower() if summary else 'the specified requirements'}.
### Key Features
- Primary functionality: {summary if summary else 'Implementation of core project requirements'}
- Complete documentation and code examples
- Well-structured implementation following best practices
- Modular design for easy integration and customization
### Use Cases
This repository is particularly valuable for:
- Developers implementing similar functionality in their projects
- Teams looking for reference implementations and best practices
- Projects requiring similar technical capabilities
- Learning and educational purposes in related technical domains
### Integration Considerations
The repository can be integrated into existing projects, with consideration for:
- Compatibility with existing technology stacks
- Required dependencies and prerequisites
- Potential customization needs
- Performance and scalability requirements
    """
    return case_study
def _record_feedback(repo: str, kind: str) -> None:
    """Increment the ``'likes'`` or ``'dislikes'`` counter stored for *repo*."""
    counters = st.session_state.feedback.setdefault(repo, {'likes': 0, 'dislikes': 0})
    counters[kind] += 1


def display_recommendations(recommendations: pd.DataFrame):
    """Render each recommended repository with details, feedback and docs.

    Args:
        recommendations: One row per repository; the columns ``'repo'``,
            ``'similarity'``, ``'url'``, ``'path'`` and ``'docstring'`` are
            read here, and each row is also passed to ``generate_case_study``.
    """
    st.markdown("### 🎯 Top Recommendations")

    # The displayed rank comes from the position in the result set; the index
    # labels are whatever the caller's frame carries and need not be 0..n-1.
    for rank, (idx, row) in enumerate(recommendations.iterrows(), start=1):
        repo = row['repo']
        with st.container():
            # Header with repository name and match score
            title_col, score_col = st.columns([3, 1])
            with title_col:
                st.markdown(f"### {rank}. {repo}")
            with score_col:
                st.metric("Match Score", f"{row['similarity']:.2%}")

            # Repository details
            st.markdown(f"**URL:** [View Repository]({row['url']})")
            st.markdown(f"**Path:** `{row['path']}`")

            # Feedback buttons. Counting happens in an on_click callback so
            # the click is recorded at the start of the rerun, even when this
            # function is not reached again on that rerun.
            like_col, dislike_col, _spacer = st.columns([1, 1, 4])
            with like_col:
                if st.button("👍", key=f"like_{idx}",
                             on_click=_record_feedback, args=(repo, 'likes')):
                    st.success("Thanks for your feedback!")
            with dislike_col:
                if st.button("👎", key=f"dislike_{idx}",
                             on_click=_record_feedback, args=(repo, 'dislikes')):
                    st.success("Thanks for your feedback!")

            # Documentation and case study in tabs
            docs_tab, case_tab = st.tabs(["📚 Documentation", "📑 Case Study"])
            with docs_tab:
                doc_text = row['docstring']
                # A missing docstring may arrive as NaN, which is truthy;
                # only render actual non-empty text.
                if isinstance(doc_text, str) and doc_text.strip():
                    st.markdown(doc_text)
                else:
                    st.info("No documentation available")
            with case_tab:
                st.markdown(generate_case_study(row))

            st.markdown("---")
 @st.cache_resource
 def load_data_and_model():
     """Load the dataset and model with optimized memory usage"""
         batch_size=batch_size,
         shuffle=False,
         collate_fn=partial(collate_fn, pad_token_id=tokenizer.pad_token_id),
+        num_workers=2,
         pin_memory=True
     )
 # Precompute embeddings for the subset
 data = precompute_embeddings(data, model, tokenizer)
 # Main App Interface
 st.title("Repository Recommender System 🚀")
 st.caption("Testing Version - Running on subset of data")
         # Generate query embedding and get recommendations
         query_embedding = generate_query_embedding(model, tokenizer, user_query)
         recommendations = find_similar_repos(query_embedding, data, top_n)
         # Save to history
         st.session_state.history.append({
             'query': user_query,
             'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             'results': recommendations['repo'].tolist()
         })
+        # Display recommendations using the new function
+        display_recommendations(recommendations)
 # Sidebar for History and Stats
 with st.sidebar:
     GPU Status: {'🟢 Enabled' if torch.cuda.is_available() else '🔴 Disabled'} |
     Model: CodeT5-Small
     """
+)