Aman Githala committed on
Commit
677f286
·
0 Parent(s):

Deploying AI Engine

Browse files
.gitignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python junk
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Environments
7
+ venv/
8
+ env/
9
+ .env
10
+
11
+ # Mac system files
12
+ .DS_Store
13
+
14
+ # Large data files (We generate these in Docker, don't upload them)
15
+ reference_embeddings/
16
+ *.pkl
17
+
18
+ # IDE settings
19
+ .vscode/
20
+ .idea/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# 1. Use a lightweight Python base image
FROM python:3.11-slim

# 2. Set the working directory inside the container
WORKDIR /app

# 3. Copy requirements first (to cache dependencies)
COPY requirements.txt .

# 4. Install dependencies
# We add --no-cache-dir to keep the image small
# (--default-timeout=1000 tolerates slow mirrors for the large torch wheel)
RUN pip install --default-timeout=1000 --no-cache-dir -r requirements.txt

# 5. Copy the rest of your code
COPY . .

# 6. RUN THE SEED SCRIPT (Crucial Step)
# This generates the reference_embeddings/*.pkl files INSIDE the image.
# So when you ship this, the "Brain" is already pre-loaded.
# NOTE: importing analyzer.graphcodebert here also downloads the
# GraphCodeBERT weights at build time, so the runtime needs no network.
RUN python seed_references.py

# 7. Expose the port the app runs on
EXPOSE 8000

# 8. Command to run the app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
analyzer/__init__.py ADDED
File without changes
analyzer/github_fetcher.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from github import Github
2
+ from github import Auth
3
+ import os
4
+
5
def fetch_user_data(username: str, token: str):
    """
    Fetches public repos for a user.

    Returns a list of dicts summarizing the user's 10 most recently
    updated repositories (each dict keeps the live PyGithub repo under
    the "object" key). On any failure the error is printed and [] is
    returned.
    """
    try:
        client = Github(auth=Auth.Token(token))
        account = client.get_user(username)

        # ⚡️ OPTIMIZATION: Only fetch top 10 most recent repos
        recent = account.get_repos(sort="updated", direction="desc")[:10]

        return [
            {
                "name": r.name,
                "description": r.description,
                "language": r.language,
                "updated_at": r.updated_at,
                "created_at": r.created_at,
                "stars": r.stargazers_count,
                "size": r.size,
                "object": r,
            }
            for r in recent
        ]
    except Exception as e:
        print(f"Error fetching GitHub data: {e}")
        return []
36
+
37
def fetch_file_content(repo_object, extension_filter_list):
    """
    Recursively searches a repository for code files, with strict limits.

    Breadth-first walk of the repo tree (via repo_object.get_contents),
    collecting the UTF-8 text of files whose path ends with one of
    `extension_filter_list`.

    Hard limits (to bound API calls and memory):
      * at most 3 files collected,
      * at most 3 directory levels deep,
      * at most 20 directories scanned per repo,
      * only files between 50 and 100,000 characters are kept.

    Returns a list of file contents (str); best-effort, may be empty.
    """
    files_content = []
    # BFS queue of (path, depth) pairs; "" is the repo root.
    dirs_to_check = [("", 0)]

    max_files = 3          # ⚡️ STOP after finding 3 good files
    max_depth = 3          # Folder-nesting limit
    max_dirs_scanned = 20  # ⚡️ HARD LIMIT on folders per repo

    scanned_count = 0
    # Smart Skip: huge dependency/build folders are never descended into.
    skip_dirs = {"node_modules", "venv", ".git", "build", "dist", "vendor", "ios", "android"}

    try:
        while dirs_to_check and len(files_content) < max_files:
            # Fixed off-by-one: '>' allowed a 21st directory scan.
            if scanned_count >= max_dirs_scanned:
                break  # Give up on this repo, it's too big/messy

            scanned_count += 1
            current_path, depth = dirs_to_check.pop(0)

            if depth > max_depth:
                continue

            try:
                contents = repo_object.get_contents(current_path)
            except Exception:  # narrowed from bare except
                continue  # Skip if permission denied or empty

            for entry in contents:
                if entry.type == "file":
                    # Check extensions
                    if any(entry.path.endswith(ext) for ext in extension_filter_list):
                        try:
                            decoded = entry.decoded_content.decode('utf-8')
                            # Only keep files between 50 and 100,000 chars
                            # to avoid memory crashes and trivial stubs.
                            if 50 < len(decoded) < 100000:
                                files_content.append(decoded)
                                if len(files_content) >= max_files:
                                    break
                        except Exception:
                            pass  # Binary / non-UTF-8 content; skip it.

                elif entry.type == "dir":
                    if entry.name not in skip_dirs:
                        dirs_to_check.append((entry.path, depth + 1))

    except Exception:
        pass  # Best-effort: return whatever we collected so far.

    return files_content
analyzer/graphcodebert.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the model GLOBALLY so we only do it once (saves RAM)
# NOTE: importing this module therefore has heavy side effects — it
# downloads/loads GraphCodeBERT before any function is called.
print("⏳ Loading GraphCodeBERT (CPU)... this may take a minute...")

# We use the specific Microsoft pre-trained model for code
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = AutoModel.from_pretrained("microsoft/graphcodebert-base")

# Force CPU usage (Safety Rule: No Overheating)
device = torch.device("cpu")
model.to(device)

print("✅ Model Loaded.")
18
+
19
def get_embedding(code_snippet):
    """
    Converts a string of code into a mathematical vector.

    Returns a 1-D numpy array (length 768, the model's hidden size);
    empty or non-string input yields a zero vector instead of crashing.
    """
    if not code_snippet or not isinstance(code_snippet, str):
        # Bad input -> neutral zero vector.
        return np.zeros((768,))

    # Tokenize, truncating to the 512-token window — without truncation
    # the model would crash on large files.
    encoded = tokenizer(code_snippet, return_tensors="pt",
                        max_length=512, truncation=True, padding=True)

    # Keep everything on CPU.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: disabling gradients saves a large amount of RAM.
    with torch.no_grad():
        output = model(**encoded)
        # The [CLS] token (position 0) summarizes the whole snippet.
        cls_vector = output.last_hidden_state[:, 0, :].cpu().numpy()

    # Flatten to a simple 1-D array.
    return cls_vector.flatten()
41
+
42
def compute_similarity(user_code_embeddings, reference_embedding):
    """
    Compares the user's code vectors against the 'Gold Standard' reference.

    Returns the mean cosine similarity across all user files,
    or 0.0 when there are no embeddings to compare.
    """
    if not user_code_embeddings:
        return 0.0

    # Stack the per-file vectors into an (n_files, dim) matrix and make
    # the reference a single-row matrix, as scikit-learn expects.
    stacked = np.vstack(user_code_embeddings)
    reference_row = reference_embedding.reshape(1, -1)

    # One cosine score (0..1) per user file.
    per_file_scores = cosine_similarity(stacked, reference_row)

    # Average across files. (max() would be a more lenient choice.)
    return float(np.mean(per_file_scores))
analyzer/heuristics.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timezone
2
+
3
def analyze_complexity(code_files):
    """
    Estimates complexity from file size and import count.

    Cheap CPU heuristic over raw file contents (strings); returns
    "High", "Medium" or "Low" ("Low" for an empty list).
    """
    if not code_files:
        return "Low"

    # Average file size in characters across all snippets.
    avg_size = sum(len(text) for text in code_files) / len(code_files)

    # Rough count of import statements — proxy for external-library use.
    import_count = sum(text.count("import ") for text in code_files)

    # Arbitrary thresholds chosen for the hackathon demo.
    if avg_size > 2000 and import_count > 5:
        return "High"
    if avg_size > 500:
        return "Medium"
    return "Low"
21
+
22
def analyze_maturity(repos):
    """
    Checks whether the projects look real or like tutorial copies.

    One point per repo with any stars (someone cares) and one per repo
    larger than 500 KB (not just "hello world"). Returns "Maintained",
    "Developing" or "Experimental".
    """
    score = sum(
        (1 if repo['stars'] > 0 else 0) + (1 if repo['size'] > 500 else 0)
        for repo in repos
    )

    if score > 3:
        return "Maintained"
    if score > 1:
        return "Developing"
    return "Experimental"
36
+
37
def analyze_consistency(repos, skill_name):
    """
    Checks if the skill appears across multiple projects.

    A repo counts at most once: first the language field is checked,
    then (only if that misses) the description. Returns "Consistent"
    (3+ repos), "Occasional" (1-2) or "One-off" (none).
    """
    needle = skill_name.lower()
    count = 0

    for repo in repos:
        language = repo['language']
        description = repo['description']

        if language and needle in language.lower():
            count += 1
        elif description and needle in description.lower():
            count += 1

    if count >= 3:
        return "Consistent"
    if count >= 1:
        return "Occasional"
    return "One-off"
57
+
58
def analyze_recency(repos):
    """
    Checks how recently the user pushed code.

    Verdict from the most recently updated repo: "Active" (< 90 days),
    "Stale" (< 365 days), otherwise "Dormant" (also for no repos).
    """
    if not repos:
        return "Dormant"

    # The most recently updated repo decides the verdict.
    newest = max(repos, key=lambda repo: repo['updated_at'])['updated_at']

    # Normalize a naive timestamp to UTC so the subtraction is legal.
    if newest.tzinfo is None:
        newest = newest.replace(tzinfo=timezone.utc)

    days_since = (datetime.now(timezone.utc) - newest).days

    if days_since < 90:
        return "Active"
    if days_since < 365:
        return "Stale"
    return "Dormant"
analyzer/scorer.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ import numpy as np
4
+
5
+ # Import our logic modules
6
+ from .github_fetcher import fetch_user_data, fetch_file_content
7
+ from .graphcodebert import get_embedding, compute_similarity
8
+ from .heuristics import analyze_complexity, analyze_maturity, analyze_consistency, analyze_recency
9
+
10
# --- THE ULTIMATE SKILL MAP ---
# Maps a skill name (as sent by the client) to the file extensions that
# count as evidence for it. Used by analyze_user(); unknown skills fall
# back to [".txt"]. Matching is by path suffix only, so framework skills
# (Django, React, ...) simply match any file in their host language.
SKILL_MAP = {
    # Core languages
    "Python": [".py", ".ipynb"],
    "Java": [".java"],
    "C": [".c", ".h"],
    "C++": [".cpp", ".hpp", ".cc", ".cxx", ".h"],  # .h shared with C
    "C#": [".cs"],
    "Go": [".go"],
    "Rust": [".rs"],
    "JavaScript": [".js", ".jsx", ".mjs", ".html"],
    "TypeScript": [".ts", ".tsx"],
    "PHP": [".php"],
    "Ruby": [".rb"],
    "Swift": [".swift"],
    "Kotlin": [".kt", ".kts"],
    # Web markup / styling
    "HTML": [".html", ".htm", ".xhtml"],
    "CSS": [".css", ".scss", ".sass", ".less"],
    # Frameworks — approximated by host-language extensions
    "React": [".jsx", ".tsx", ".js", ".ts"],
    "Vue": [".vue", ".js", ".ts"],
    "Angular": [".ts", ".html"],
    "Next.js": [".jsx", ".tsx", ".js", ".ts"],
    "Django": [".py"],
    "Flask": [".py"],
    "FastAPI": [".py"],
    "Node.js": [".js", ".ts", ".json"],
    # Data science / ML
    "Pandas": [".py", ".ipynb"],
    "NumPy": [".py", ".ipynb"],
    "PyTorch": [".py", ".ipynb"],
    "TensorFlow": [".py", ".ipynb"],
    # Mobile
    "Flutter": [".dart"],
    "React Native": [".jsx", ".tsx", ".js", ".ts"],
    # Other
    "Solidity": [".sol"],
    # endswith() also matches bare filenames like "Dockerfile"
    "Docker": ["Dockerfile", ".dockerfile", "docker-compose.yml"],
    "SQL": [".sql", ".ddl"]
}
45
+
46
def generate_reference_if_missing(skill):
    """
    Loads the reference embedding for `skill`, creating a placeholder
    one first if no pickle exists yet.

    Skill names are normalized for the filesystem ("C++" -> "cplusplus",
    "C#" -> "csharp", spaces removed). Returns whatever was pickled for
    the skill (a numpy vector produced by get_embedding).
    """
    # exist_ok avoids the check-then-create race of an exists() test.
    os.makedirs("reference_embeddings", exist_ok=True)

    safe_name = skill.lower().replace("++", "plusplus").replace("#", "sharp").replace(" ", "")
    path = f"reference_embeddings/{safe_name}.pkl"

    if not os.path.exists(path):
        # Fallback so the pipeline still runs for skills that
        # seed_references.py did not cover.
        # NOTE(review): similarity against this trivial snippet is close
        # to meaningless — real references should come from the seed script.
        emb = get_embedding("def main(): print('hello world')")
        with open(path, "wb") as f:
            pickle.dump(emb, f)
        return emb  # No need to re-read the file we just wrote.

    with open(path, "rb") as f:
        return pickle.load(f)
61
+
62
def analyze_user(username, skills, github_token):
    """
    Builds a per-skill evidence report for a GitHub user.

    For each requested skill: scans up to 6 of the user's most recent
    repos for matching files, embeds them with GraphCodeBERT, compares
    against the skill's reference embedding, and attaches the cheap
    heuristics (complexity, maturity, consistency, recency).

    Returns {skill: report} or {"error": ...} when no repos are found
    (note: a bad token also yields [], so this error can be misleading).
    """
    results = {}

    # 1. Fetch Repos
    print(f"🔍 Fetching repos for {username}...")
    repos = fetch_user_data(username, github_token)

    if not repos:
        return {"error": "User not found or no public repos."}

    for skill in skills:
        print(f" Analyzing skill: {skill}...")

        # Unknown skills fall back to [".txt"], which rarely matches code.
        extensions = SKILL_MAP.get(skill, [".txt"])

        code_snippets = []
        relevant_repos = []  # only repos that actually yielded files

        # ⚡️ SPEED LIMIT: Only deep scan the top 6 repos
        max_repos_to_scan = 6

        for repo in repos[:max_repos_to_scan]:
            found_files = fetch_file_content(repo['object'], extensions)

            if found_files:
                code_snippets.extend(found_files)
                relevant_repos.append(repo)

            # ⚡️ EARLY EXIT: If we have > 3 snippets, STOP searching other repos.
            # We don't need to see ALL their code, just enough to judge.
            if len(code_snippets) >= 3:
                break

        # 3. AI Analysis
        ref_emb = generate_reference_if_missing(skill)
        user_embeddings = [get_embedding(code) for code in code_snippets]

        # Defaults when no matching code was found at all.
        sim_score = 0.0
        evidence_label = "Weak"

        if user_embeddings:
            sim_score = compute_similarity(user_embeddings, ref_emb)

            # ⚖️ CALIBRATION: Stricter Thresholds
            # 0.75 means "Very similar to professional reference"
            # 0.45 means "Vaguely similar"
            if sim_score > 0.75: evidence_label = "Strong"
            elif sim_score > 0.45: evidence_label = "Moderate"

        results[skill] = {
            "semantic_similarity": {
                "score": round(sim_score, 2),
                "evidence": evidence_label
            },
            "complexity": analyze_complexity(code_snippets),
            "project_maturity": analyze_maturity(relevant_repos),
            # Consistency looks at ALL fetched repos, not just relevant ones.
            "consistency": analyze_consistency(repos, skill),
            "recency": analyze_recency(relevant_repos)
        }

    return results
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ import os
5
+ import uvicorn
6
+
7
+ # Import our logic
8
+ from analyzer.scorer import analyze_user
9
+
10
# FastAPI application instance; the endpoints below are registered on it.
app = FastAPI(title="Skill Evidence Engine")

# Define what the JSON input must look like
class AnalysisRequest(BaseModel):
    # GitHub login of the user whose public repos will be analyzed.
    github_username: str
    # Skill names to score, e.g. ["Python", "React"]; names outside
    # analyzer.scorer.SKILL_MAP fall back to a ".txt" extension filter.
    skills: List[str]
16
+
17
@app.get("/")
def home():
    """Health-check endpoint: confirms the service is up and points
    callers at the real analysis endpoint."""
    return {"status": "System is online. Use POST /analyze/github"}
20
+
21
@app.post("/analyze/github")
def analyze(request: AnalysisRequest):
    """
    Analyzes a GitHub user's public repos for the requested skills.

    Returns the per-skill report from analyze_user(). Raises HTTP 500
    when GITHUB_TOKEN is not configured or the analysis itself fails.
    """
    # Read the GitHub API token from the environment at request time.
    TOKEN = os.getenv("GITHUB_TOKEN")

    if not TOKEN:
        raise HTTPException(
            status_code=500,
            detail="Server missing GITHUB_TOKEN. Did you set the environment variable?"
        )

    try:
        results = analyze_user(request.github_username, request.skills, TOKEN)
        return results
    except Exception as e:
        # NOTE(review): str(e) leaks internal error details to the client;
        # consider logging the traceback and returning a generic message.
        print(f"Server Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
38
+
39
if __name__ == "__main__":
    # This allows you to run it with 'python app.py' directly
    # (the Dockerfile starts uvicorn itself via CMD instead).
    uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.12.1
3
+ certifi==2026.1.4
4
+ cffi==2.0.0
5
+ charset-normalizer==3.4.4
6
+ click==8.3.1
7
+ cryptography==46.0.3
8
+ fastapi==0.109.0
9
+ filelock==3.20.3
10
+ fsspec==2026.1.0
11
+ h11==0.16.0
12
+ hf-xet==1.2.0
13
+ huggingface-hub==0.36.0
14
+ idna==3.11
15
+ Jinja2==3.1.6
16
+ joblib==1.5.3
17
+ MarkupSafe==3.0.3
18
+ mpmath==1.3.0
19
+ networkx==3.6
20
+ numpy==2.4.1
21
+ packaging==25.0
22
+ pycparser==2.23
23
+ pydantic==2.12.5
24
+ pydantic_core==2.41.5
25
+ PyGithub==2.8.1
26
+ PyJWT==2.10.1
27
+ PyNaCl==1.6.2
28
+ python-multipart==0.0.21
29
+ PyYAML==6.0.3
30
+ regex==2026.1.15
31
+ requests==2.32.5
32
+ safetensors==0.7.0
33
+ scikit-learn==1.8.0
34
+ scipy==1.17.0
35
+ setuptools==80.9.0
36
+ starlette==0.35.1
37
+ sympy==1.14.0
38
+ threadpoolctl==3.6.0
39
+ tokenizers==0.22.2
40
+ torch==2.9.1
41
+ tqdm==4.67.1
42
+ transformers==4.57.6
43
+ typing-inspection==0.4.2
44
+ typing_extensions==4.15.0
45
+ urllib3==2.6.3
46
+ uvicorn==0.27.0
seed_references.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pickle
import os
import numpy as np  # NOTE(review): imported but unused in this script
from analyzer.graphcodebert import get_embedding  # importing this loads the model

# Ensure the folder exists
if not os.path.exists("reference_embeddings"):
    os.makedirs("reference_embeddings")

print("⏳ Generating Professional Reference Embeddings (The Ultimate List)...")

# ==========================================
# PROFESSIONAL CODE SNIPPETS (Gold Standard)
# ==========================================
# Each snippet below is a string of "professional-looking" code in one
# language/framework; its embedding becomes the reference that user code
# is compared against in analyzer/scorer.py.

# 1. PYTHON (General)
python_code = """
class DataProcessor:
    def __init__(self, data: list[dict]):
        self.data = data
        self._cache = {}

    @property
    def processed_data(self):
        if 'clean' not in self._cache:
            self._cache['clean'] = [d for d in self.data if d.get('active')]
        return self._cache['clean']
"""

# 2. DJANGO (Backend Web)
django_code = """
from django.db import models
from django.views.generic import ListView

class Product(models.Model):
    name = models.CharField(max_length=255)
    price = models.DecimalField(max_digits=10, decimal_places=2)
    stock = models.IntegerField(default=0)

    def is_in_stock(self):
        return self.stock > 0

class ProductListView(ListView):
    model = Product
    template_name = 'products/list.html'
    context_object_name = 'products'

    def get_queryset(self):
        return Product.objects.filter(stock__gt=0).order_by('-price')
"""

# 3. PANDAS (Data Science)
pandas_code = """
import pandas as pd
import numpy as np

def analyze_sales(file_path):
    df = pd.read_csv(file_path)
    # Group by category and calculate aggregate metrics
    summary = df.groupby('category').agg({
        'revenue': ['sum', 'mean'],
        'quantity': 'sum',
        'customer_id': pd.Series.nunique
    })

    # Calculate rolling average
    df['rolling_avg'] = df['revenue'].rolling(window=7).mean()

    # Filter high-value transactions
    high_value = df[df['revenue'] > df['revenue'].quantile(0.95)]
    return summary, high_value
"""

# 4. FLUTTER (Mobile)
flutter_code = """
import 'package:flutter/material.dart';

class UserProfile extends StatelessWidget {
  final User user;

  const UserProfile({Key? key, required this.user}) : super(key: key);

  @override
  Widget build(BuildContext context) {
    return Scaffold(
      appBar: AppBar(title: Text(user.name)),
      body: ListView.builder(
        itemCount: user.posts.length,
        itemBuilder: (context, index) {
          return Card(
            margin: EdgeInsets.all(8.0),
            child: ListTile(
              leading: CircleAvatar(backgroundImage: NetworkImage(user.avatar)),
              title: Text(user.posts[index].title),
              subtitle: Text(user.posts[index].date),
              trailing: Icon(Icons.arrow_forward_ios),
            ),
          );
        },
      ),
    );
  }
}
"""

# 5. DOCKER (DevOps)
docker_code = """
# Multi-stage build for optimized image size
FROM node:18-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

FROM nginx:alpine
COPY --from=builder /app/build /usr/share/nginx/html
EXPOSE 80
CMD ["nginx", "-g", "daemon off;"]
"""

# 6. SQL (Database)
sql_code = """
SELECT
    u.id,
    u.username,
    COUNT(o.id) as total_orders,
    SUM(o.amount) as total_spent
FROM users u
JOIN orders o ON u.id = o.user_id
WHERE o.created_at >= '2023-01-01'
GROUP BY u.id, u.username
HAVING COUNT(o.id) > 5
ORDER BY total_spent DESC;
"""

# 7. C (Systems)
c_code = """
struct task_struct *find_task_by_vpid(pid_t vpid) {
    struct task_struct *task;
    rcu_read_lock();
    task = pid_task(find_vpid(vpid), PIDTYPE_PID);
    if (task) get_task_struct(task);
    rcu_read_unlock();
    return task;
}
"""

# 8. C++ (Competitive / Systems)
cpp_code = """
#include <vector>
#include <algorithm>
#include <iostream>

template <typename T>
class Matrix {
    std::vector<std::vector<T>> data;
public:
    Matrix(int rows, int cols) : data(rows, std::vector<T>(cols)) {}

    void multiply(const Matrix& other) {
        // Simple O(N^3) multiplication logic
        for(int i=0; i<rows; i++) {
            for(int j=0; j<cols; j++) {
                // ... implementation ...
            }
        }
    }
};
"""

# 9. JAVASCRIPT / REACT (Web)
js_code = """
import React, { useState, useEffect } from 'react';

export const Dashboard = () => {
  const [data, setData] = useState([]);

  useEffect(() => {
    fetch('/api/data')
      .then(res => res.json())
      .then(json => setData(json.filter(item => item.isActive)));
  }, []);

  return (
    <div className="grid">
      {data.map(item => <Card key={item.id} title={item.name} />)}
    </div>
  );
};
"""

# ==========================================
# MAPPING & GENERATION
# ==========================================
# Keys are the filesystem-safe names produced by
# analyzer.scorer.generate_reference_if_missing ("C++" -> "cplusplus").
# Several skills deliberately share one snippet where embeddings are
# close enough.

references = {
    # Core Languages
    "python": python_code,
    "c": c_code,
    "cplusplus": cpp_code,  # Mapped name for C++
    "javascript": js_code,
    "typescript": js_code,  # Similar enough for embeddings
    "java": cpp_code,  # Java/C++ are structurally similar enough

    # Frameworks
    "django": django_code,
    "flask": django_code,  # Both are Python backend
    "pandas": pandas_code,
    "numpy": pandas_code,

    # Web
    "react": js_code,
    "html": js_code,  # Often mixed

    # Mobile
    "flutter": flutter_code,

    # DevOps
    "docker": docker_code,

    # Database
    "sql": sql_code
}

# Embed every snippet once and pickle it for the API to load at runtime.
count = 0
for skill, code in references.items():
    print(f" ... Processing {skill} ...")
    emb = get_embedding(code)

    # Save to file
    with open(f"reference_embeddings/{skill}.pkl", "wb") as f:
        pickle.dump(emb, f)
    count += 1

print(f" Done! Generated {count} professional references.")