Spaces:

ror-12
/

skill-engine

Running

App Files Community

Aman Githala committed on Jan 17

Commit

677f286

0 Parent(s):

Deploying AI Engine

Browse files

Files changed (10) hide show

.gitignore +20 -0
Dockerfile +26 -0
analyzer/__init__.py +0 -0
analyzer/github_fetcher.py +89 -0
analyzer/graphcodebert.py +61 -0
analyzer/heuristics.py +77 -0
analyzer/scorer.py +122 -0
app.py +41 -0
requirements.txt +46 -0
seed_references.py +236 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,20 @@

+# Python junk
+__pycache__/
+*.py[cod]
+*$py.class
+# Environments
+venv/
+env/
+.env
+# Mac system files
+.DS_Store
+# Large data files (We generate these in Docker, don't upload them)
+reference_embeddings/
+*.pkl
+# IDE settings
+.vscode/
+.idea/

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+# 1. Use a lightweight Python base image
+FROM python:3.11-slim
+# 2. Set the working directory inside the container
+WORKDIR /app
+# 3. Copy requirements first so the dependency layer stays cached when only code changes
+COPY requirements.txt .
+# 4. Install dependencies
+# --no-cache-dir keeps pip's download cache out of the image;
+# --default-timeout=1000 raises pip's network timeout for slow downloads.
+RUN pip install --default-timeout=1000 --no-cache-dir -r requirements.txt
+# 5. Copy the rest of the code
+COPY . .
+# 6. RUN THE SEED SCRIPT (Crucial Step)
+# This generates the reference_embeddings/*.pkl files INSIDE the image
+# (.gitignore keeps them out of the repository, so they must be built here).
+# seed_references.py imports analyzer.graphcodebert, which loads the
+# GraphCodeBERT tokenizer and model at import time - presumably this step
+# downloads the model during the build; confirm network access is available.
+RUN python seed_references.py
+# 7. Expose the port the app runs on
+EXPOSE 8000
+# 8. Command to run the app (uvicorn serving app:app on all interfaces, port 8000)
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

analyzer/__init__.py ADDED Viewed

File without changes

analyzer/github_fetcher.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from github import Github
+from github import Auth
+import os
+def fetch_user_data(username: str, token: str):
+    """
+    Fetches public repos for a user.
+    """
+    try:
+        auth = Auth.Token(token)
+        g = Github(auth=auth)
+        user = g.get_user(username)
+        # ⚡️ OPTIMIZATION: Only fetch top 10 most recent repos
+        repos = user.get_repos(sort="updated", direction="desc")[:10]
+        repo_data = []
+        # print(f"   (Debug) Scanning {len(repos)} repositories for {username}...")
+        for repo in repos:
+            repo_data.append({
+                "name": repo.name,
+                "description": repo.description,
+                "language": repo.language,
+                "updated_at": repo.updated_at,
+                "created_at": repo.created_at,
+                "stars": repo.stargazers_count,
+                "size": repo.size,
+                "object": repo
+            })
+        return repo_data
+    except Exception as e:
+        print(f"Error fetching GitHub data: {e}")
+        return []
+def fetch_file_content(repo_object, extension_filter_list):
+    """
+    Recursively searches for code files with Strict Limits.
+    """
+    files_content = []
+    # Queue: (path, depth)
+    dirs_to_check = [("", 0)]
+    max_files = 3       # ⚡️ STOP after finding 3 good files (was 5 or 10)
+    max_depth = 3       # Depth limit (folder inside folder inside folder)
+    max_dirs_scanned = 20 # ⚡️ HARD LIMIT: Don't check more than 20 folders per repo
+    scanned_count = 0
+    try:
+        while dirs_to_check and len(files_content) < max_files:
+            if scanned_count > max_dirs_scanned:
+                break # Give up on this repo, it's too big/messy
+            scanned_count += 1
+            current_path, depth = dirs_to_check.pop(0)
+            if depth > max_depth: continue
+            # Get contents
+            try:
+                contents = repo_object.get_contents(current_path)
+            except:
+                continue # Skip if permission denied or empty
+            for file_content in contents:
+                if file_content.type == "file":
+                    # Check extensions
+                    if any(file_content.path.endswith(ext) for ext in extension_filter_list):
+                        try:
+                            decoded = file_content.decoded_content.decode('utf-8')
+                            # Only keep files between 50 and 100,000 chars to avoid memory crashes
+                            if 50 < len(decoded) < 100000:
+                                files_content.append(decoded)
+                                # print(f"      [Found] {file_content.path}")
+                                if len(files_content) >= max_files: break
+                        except:
+                            pass
+                elif file_content.type == "dir":
+                    # Smart Skip: Ignore huge dependency folders
+                    if file_content.name not in ["node_modules", "venv", ".git", "build", "dist", "vendor", "ios", "android"]:
+                        dirs_to_check.append((file_content.path, depth + 1))
+    except Exception as e:
+        pass
+    return files_content

analyzer/graphcodebert.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import torch
+from transformers import AutoTokenizer, AutoModel
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+# 1. Load the model GLOBALLY so we only do it once (saves RAM)
+print("⏳ Loading GraphCodeBERT (CPU)... this may take a minute...")
+# We use the specific Microsoft pre-trained model for code
+tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
+model = AutoModel.from_pretrained("microsoft/graphcodebert-base")
+# Force CPU usage (Safety Rule: No Overheating)
+device = torch.device("cpu")
+model.to(device)
+print("✅ Model Loaded.")
+def get_embedding(code_snippet):
+    """
+    Converts a string of code into a mathematical vector.
+    """
+    if not code_snippet or not isinstance(code_snippet, str):
+        return np.zeros((768,)) # Return empty vector if code is bad
+    # Truncate to 512 tokens.
+    # If we don't truncate, the model will crash on large files.
+    inputs = tokenizer(code_snippet, return_tensors="pt", max_length=512, truncation=True, padding=True)
+    # Move inputs to CPU
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad(): # Disable gradient calculation to save massive RAM
+        outputs = model(**inputs)
+        # We take the embedding of the [CLS] token (the first one)
+        # which represents the "whole meaning" of the code snippet.
+        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+    # Flatten to a simple 1D array
+    return embedding.flatten()
+def compute_similarity(user_code_embeddings, reference_embedding):
+    """
+    Compares the user's code vectors against the 'Gold Standard' reference.
+    """
+    if not user_code_embeddings:
+        return 0.0
+    # Ensure formats are correct for scikit-learn
+    # We stack the user's multiple files into a matrix
+    user_matrix = np.vstack(user_code_embeddings)
+    # Reshape reference to be a 1-row matrix
+    ref_matrix = reference_embedding.reshape(1, -1)
+    # Calculate cosine similarity (0 to 1) for every file
+    scores = cosine_similarity(user_matrix, ref_matrix)
+    # We return the AVERAGE similarity.
+    # (You could also take max() if you want to be lenient)
+    return float(np.mean(scores))

analyzer/heuristics.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from datetime import datetime, timezone
+def analyze_complexity(code_files):
+    """
+    Estimates complexity based on file length and imports.
+    Cheap CPU heuristic.
+    """
+    if not code_files: return "Low"
+    # Calculate average line length of the code snippets
+    # (Assuming code_files contains raw string content of files)
+    avg_len = sum(len(c) for c in code_files) / len(code_files)
+    # Rough count of imports to see if it uses external libraries
+    imports = sum(c.count("import ") for c in code_files)
+    # Arbitrary thresholds for the hackathon demo
+    if avg_len > 2000 and imports > 5: return "High"
+    if avg_len > 500: return "Medium"
+    return "Low"
+def analyze_maturity(repos):
+    """
+    Checks if the project looks real or just a tutorial copy.
+    """
+    score = 0
+    for r in repos:
+        # If it has stars, people like it -> likely real
+        if r['stars'] > 0: score += 1
+        # If it's larger than 500KB, it's likely not just a "hello world"
+        if r['size'] > 500: score += 1
+    if score > 3: return "Maintained"
+    if score > 1: return "Developing"
+    return "Experimental"
+def analyze_consistency(repos, skill_name):
+    """
+    Checks if the skill appears across multiple projects.
+    """
+    count = 0
+    skill_lower = skill_name.lower()
+    for r in repos:
+        # Check language field
+        if r['language'] and skill_lower in r['language'].lower():
+            count += 1
+            continue
+        # Check description
+        if r['description'] and skill_lower in r['description'].lower():
+            count += 1
+    if count >= 3: return "Consistent"
+    if count >= 1: return "Occasional"
+    return "One-off"
+def analyze_recency(repos):
+    """
+    Checks if the user has pushed code recently.
+    """
+    if not repos: return "Dormant"
+    # Sort repos by update time to find the latest one
+    latest = max(repos, key=lambda x: x['updated_at'])
+    last_update = latest['updated_at']
+    # Ensure last_update is timezone-aware
+    if last_update.tzinfo is None:
+        last_update = last_update.replace(tzinfo=timezone.utc)
+    now = datetime.now(timezone.utc)
+    delta = (now - last_update).days
+    if delta < 90: return "Active"   # Last 3 months
+    if delta < 365: return "Stale"   # Last year
+    return "Dormant"

analyzer/scorer.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import pickle
+import os
+import numpy as np
+# Import our logic modules
+from .github_fetcher import fetch_user_data, fetch_file_content
+from .graphcodebert import get_embedding, compute_similarity
+from .heuristics import analyze_complexity, analyze_maturity, analyze_consistency, analyze_recency
+# --- THE ULTIMATE SKILL MAP ---
+# Maps a skill name to the path suffixes that count as evidence for it.
+# analyze_user() looks the skill up here (falling back to [".txt"]) and passes
+# the list to fetch_file_content(), which tests file paths with endswith().
+# Because the test is a suffix match, entries such as "Dockerfile" match whole
+# file names as well as extensions. Several skills share the same suffixes.
+SKILL_MAP = {
+    "Python": [".py", ".ipynb"],
+    "Java": [".java"],
+    "C": [".c", ".h"],
+    "C++": [".cpp", ".hpp", ".cc", ".cxx", ".h"],
+    "C#": [".cs"],
+    "Go": [".go"],
+    "Rust": [".rs"],
+    "JavaScript": [".js", ".jsx", ".mjs", ".html"],
+    "TypeScript": [".ts", ".tsx"],
+    "PHP": [".php"],
+    "Ruby": [".rb"],
+    "Swift": [".swift"],
+    "Kotlin": [".kt", ".kts"],
+    "HTML": [".html", ".htm", ".xhtml"],
+    "CSS": [".css", ".scss", ".sass", ".less"],
+    "React": [".jsx", ".tsx", ".js", ".ts"],
+    "Vue": [".vue", ".js", ".ts"],
+    "Angular": [".ts", ".html"],
+    "Next.js": [".jsx", ".tsx", ".js", ".ts"],
+    "Django": [".py"],
+    "Flask": [".py"],
+    "FastAPI": [".py"],
+    "Node.js": [".js", ".ts", ".json"],
+    "Pandas": [".py", ".ipynb"],
+    "NumPy": [".py", ".ipynb"],
+    "PyTorch": [".py", ".ipynb"],
+    "TensorFlow": [".py", ".ipynb"],
+    "Flutter": [".dart"],
+    "React Native": [".jsx", ".tsx", ".js", ".ts"],
+    "Solidity": [".sol"],
+    "Docker": ["Dockerfile", ".dockerfile", "docker-compose.yml"],
+    "SQL": [".sql", ".ddl"]
+}
+def generate_reference_if_missing(skill):
+    if not os.path.exists("reference_embeddings"):
+        os.makedirs("reference_embeddings")
+    safe_name = skill.lower().replace("++", "plusplus").replace("#", "sharp").replace(" ", "")
+    path = f"reference_embeddings/{safe_name}.pkl"
+    if not os.path.exists(path):
+        dummy_code = "def main(): print('hello world')"
+        emb = get_embedding(dummy_code)
+        with open(path, "wb") as f:
+            pickle.dump(emb, f)
+    with open(path, "rb") as f:
+        return pickle.load(f)
+def analyze_user(username, skills, github_token):
+    results = {}
+    # 1. Fetch Repos
+    print(f"🔍 Fetching repos for {username}...")
+    repos = fetch_user_data(username, github_token)
+    if not repos:
+        return {"error": "User not found or no public repos."}
+    for skill in skills:
+        print(f"  Analyzing skill: {skill}...")
+        extensions = SKILL_MAP.get(skill, [".txt"])
+        code_snippets = []
+        relevant_repos = []
+        # ⚡️ SPEED LIMIT: Only deep scan the top 6 repos
+        max_repos_to_scan = 6
+        for repo in repos[:max_repos_to_scan]:
+            found_files = fetch_file_content(repo['object'], extensions)
+            if found_files:
+                code_snippets.extend(found_files)
+                relevant_repos.append(repo)
+            # ⚡️ EARLY EXIT: If we have > 3 snippets, STOP searching other repos.
+            # We don't need to see ALL their code, just enough to judge.
+            if len(code_snippets) >= 3:
+                break
+        # 3. AI Analysis
+        ref_emb = generate_reference_if_missing(skill)
+        user_embeddings = [get_embedding(code) for code in code_snippets]
+        sim_score = 0.0
+        evidence_label = "Weak"
+        if user_embeddings:
+            sim_score = compute_similarity(user_embeddings, ref_emb)
+        # ⚖️ CALIBRATION: Stricter Thresholds
+        # 0.75 means "Very similar to professional reference"
+        # 0.45 means "Vaguely similar"
+        if sim_score > 0.75: evidence_label = "Strong"
+        elif sim_score > 0.45: evidence_label = "Moderate"
+        results[skill] = {
+            "semantic_similarity": {
+                "score": round(sim_score, 2),
+                "evidence": evidence_label
+            },
+            "complexity": analyze_complexity(code_snippets),
+            "project_maturity": analyze_maturity(relevant_repos),
+            "consistency": analyze_consistency(repos, skill),
+            "recency": analyze_recency(relevant_repos)
+        }
+    return results

app.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List
+import os
+import uvicorn
+# Import our logic
+from analyzer.scorer import analyze_user
+# FastAPI application object; the route decorators below register handlers on it.
+app = FastAPI(title="Skill Evidence Engine")
+# Define what the JSON input must look like
+class AnalysisRequest(BaseModel):
+    """Request body for POST /analyze/github."""
+    # GitHub login whose repositories are analysed
+    github_username: str
+    # Skill names to evaluate; passed straight through to analyze_user()
+    skills: List[str]
+@app.get("/")
+def home():
+    return {"status": "System is online. Use POST /analyze/github"}
+@app.post("/analyze/github")
+def analyze(request: AnalysisRequest):
+    # Retrieve the token we "exported" earlier
+    TOKEN = os.getenv("GITHUB_TOKEN")
+    if not TOKEN:
+        raise HTTPException(
+            status_code=500,
+            detail="Server missing GITHUB_TOKEN. Did you set the environment variable?"
+        )
+    try:
+        results = analyze_user(request.github_username, request.skills, TOKEN)
+        return results
+    except Exception as e:
+        print(f"Server Error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+if __name__ == "__main__":
+    # This allows you to run it with 'python app.py' directly;
+    # the server listens on all interfaces, port 8000.
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,46 @@

+annotated-types==0.7.0
+anyio==4.12.1
+certifi==2026.1.4
+cffi==2.0.0
+charset-normalizer==3.4.4
+click==8.3.1
+cryptography==46.0.3
+fastapi==0.109.0
+filelock==3.20.3
+fsspec==2026.1.0
+h11==0.16.0
+hf-xet==1.2.0
+huggingface-hub==0.36.0
+idna==3.11
+Jinja2==3.1.6
+joblib==1.5.3
+MarkupSafe==3.0.3
+mpmath==1.3.0
+networkx==3.6
+numpy==2.4.1
+packaging==25.0
+pycparser==2.23
+pydantic==2.12.5
+pydantic_core==2.41.5
+PyGithub==2.8.1
+PyJWT==2.10.1
+PyNaCl==1.6.2
+python-multipart==0.0.21
+PyYAML==6.0.3
+regex==2026.1.15
+requests==2.32.5
+safetensors==0.7.0
+scikit-learn==1.8.0
+scipy==1.17.0
+setuptools==80.9.0
+starlette==0.35.1
+sympy==1.14.0
+threadpoolctl==3.6.0
+tokenizers==0.22.2
+torch==2.9.1
+tqdm==4.67.1
+transformers==4.57.6
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.6.3
+uvicorn==0.27.0

seed_references.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import pickle
+import os
+import numpy as np
+from analyzer.graphcodebert import get_embedding
+# Ensure the folder exists
+if not os.path.exists("reference_embeddings"):
+    os.makedirs("reference_embeddings")
+print("⏳ Generating Professional Reference Embeddings (The Ultimate List)...")
+# ==========================================
+#  PROFESSIONAL CODE SNIPPETS (Gold Standard)
+# ==========================================
+# 1. PYTHON (General)
+python_code = """
+class DataProcessor:
+    def __init__(self, data: list[dict]):
+        self.data = data
+        self._cache = {}
+    @property
+    def processed_data(self):
+        if 'clean' not in self._cache:
+            self._cache['clean'] = [d for d in self.data if d.get('active')]
+        return self._cache['clean']
+"""
+# 2. DJANGO (Backend Web)
+django_code = """
+from django.db import models
+from django.views.generic import ListView
+class Product(models.Model):
+    name = models.CharField(max_length=255)
+    price = models.DecimalField(max_digits=10, decimal_places=2)
+    stock = models.IntegerField(default=0)
+    def is_in_stock(self):
+        return self.stock > 0
+class ProductListView(ListView):
+    model = Product
+    template_name = 'products/list.html'
+    context_object_name = 'products'
+    def get_queryset(self):
+        return Product.objects.filter(stock__gt=0).order_by('-price')
+"""
+# 3. PANDAS (Data Science)
+pandas_code = """
+import pandas as pd
+import numpy as np
+def analyze_sales(file_path):
+    df = pd.read_csv(file_path)
+    # Group by category and calculate aggregate metrics
+    summary = df.groupby('category').agg({
+        'revenue': ['sum', 'mean'],
+        'quantity': 'sum',
+        'customer_id': pd.Series.nunique
+    })
+    # Calculate rolling average
+    df['rolling_avg'] = df['revenue'].rolling(window=7).mean()
+    # Filter high-value transactions
+    high_value = df[df['revenue'] > df['revenue'].quantile(0.95)]
+    return summary, high_value
+"""
+# 4. FLUTTER (Mobile)
+flutter_code = """
+import 'package:flutter/material.dart';
+class UserProfile extends StatelessWidget {
+  final User user;
+  const UserProfile({Key? key, required this.user}) : super(key: key);
+  @override
+  Widget build(BuildContext context) {
+    return Scaffold(
+      appBar: AppBar(title: Text(user.name)),
+      body: ListView.builder(
+        itemCount: user.posts.length,
+        itemBuilder: (context, index) {
+          return Card(
+            margin: EdgeInsets.all(8.0),
+            child: ListTile(
+              leading: CircleAvatar(backgroundImage: NetworkImage(user.avatar)),
+              title: Text(user.posts[index].title),
+              subtitle: Text(user.posts[index].date),
+              trailing: Icon(Icons.arrow_forward_ios),
+            ),
+          );
+        },
+      ),
+    );
+  }
+}
+"""
+# 5. DOCKER (DevOps)
+docker_code = """
+# Multi-stage build for optimized image size
+FROM node:18-alpine AS builder
+WORKDIR /app
+COPY package*.json ./
+RUN npm ci
+COPY . .
+RUN npm run build
+FROM nginx:alpine
+COPY --from=builder /app/build /usr/share/nginx/html
+EXPOSE 80
+CMD ["nginx", "-g", "daemon off;"]
+"""
+# 6. SQL (Database)
+sql_code = """
+SELECT
+    u.id,
+    u.username,
+    COUNT(o.id) as total_orders,
+    SUM(o.amount) as total_spent
+FROM users u
+JOIN orders o ON u.id = o.user_id
+WHERE o.created_at >= '2023-01-01'
+GROUP BY u.id, u.username
+HAVING COUNT(o.id) > 5
+ORDER BY total_spent DESC;
+"""
+# 7. C (Systems)
+c_code = """
+struct task_struct *find_task_by_vpid(pid_t vpid) {
+    struct task_struct *task;
+    rcu_read_lock();
+    task = pid_task(find_vpid(vpid), PIDTYPE_PID);
+    if (task) get_task_struct(task);
+    rcu_read_unlock();
+    return task;
+}
+"""
+# 8. C++ (Competitive / Systems)
+cpp_code = """
+#include <vector>
+#include <algorithm>
+#include <iostream>
+template <typename T>
+class Matrix {
+    std::vector<std::vector<T>> data;
+public:
+    Matrix(int rows, int cols) : data(rows, std::vector<T>(cols)) {}
+    void multiply(const Matrix& other) {
+        // Simple O(N^3) multiplication logic
+        for(int i=0; i<rows; i++) {
+            for(int j=0; j<cols; j++) {
+                // ... implementation ...
+            }
+        }
+    }
+};
+"""
+# 9. JAVASCRIPT / REACT (Web)
+js_code = """
+import React, { useState, useEffect } from 'react';
+export const Dashboard = () => {
+  const [data, setData] = useState([]);
+  useEffect(() => {
+    fetch('/api/data')
+      .then(res => res.json())
+      .then(json => setData(json.filter(item => item.isActive)));
+  }, []);
+  return (
+    <div className="grid">
+      {data.map(item => <Card key={item.id} title={item.name} />)}
+    </div>
+  );
+};
+"""
+# ==========================================
+#  MAPPING & GENERATION
+# ==========================================
+references = {
+    # Core Languages
+    "python": python_code,
+    "c": c_code,
+    "cplusplus": cpp_code,   # Mapped name for C++
+    "javascript": js_code,
+    "typescript": js_code,   # Similar enough for embeddings
+    "java": cpp_code,        # Java/C++ are structurally similar enough
+    # Frameworks
+    "django": django_code,
+    "flask": django_code,    # Both are Python backend
+    "pandas": pandas_code,
+    "numpy": pandas_code,
+    # Web
+    "react": js_code,
+    "html": js_code,         # Often mixed
+    # Mobile
+    "flutter": flutter_code,
+    # DevOps
+    "docker": docker_code,
+    # Database
+    "sql": sql_code
+}
+count = 0
+for skill, code in references.items():
+    print(f"   ... Processing {skill} ...")
+    emb = get_embedding(code)
+    # Save to file
+    with open(f"reference_embeddings/{skill}.pkl", "wb") as f:
+        pickle.dump(emb, f)
+    count += 1
+print(f" Done! Generated {count} professional references.")