raghuram00 committed
Commit c75e060 · 1 Parent(s): 1a976d8

chore: project cleanup and standard readme

Files changed (4):
  1. .gitattributes +0 -35
  2. README.md +38 -10
  3. Untitled2.ipynb +0 -0
  4. train_extracted.py +0 -315
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,11 +1,39 @@
- ---
- title: Code Complexity Predictor
- emoji: 📉
- colorFrom: blue
- colorTo: yellow
- sdk: docker
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # ⚙️ Code Complexity Predictor
+
+ An AI-powered web application that instantly predicts the Big-O time complexity of Python and Java code snippets using **GraphCodeBERT**.
+
+ ## 🚀 Features
+ - **Intelligent Analysis:** Powered by Microsoft's GraphCodeBERT, fine-tuned on the CodeParrot/CodeComplex dataset.
+ - **Premium Interface:** A stunning glassmorphism dark-mode UI with syntax highlighting and micro-animations.
+ - **Lightning Fast:** Built on a lightweight FastAPI backend for near-instant inference.
+ - **Cloud-Ready:** Fully containerized with Docker and configured for automatic deployment on Render.com.
+
+ ## 🛠️ Tech Stack
+ - **Frontend:** HTML5, vanilla CSS, JavaScript, PrismJS
+ - **Backend:** Python, FastAPI, Uvicorn
+ - **AI/ML:** PyTorch, Hugging Face Transformers (`GraphCodeBERT`)
+ - **Deployment:** Docker, Render
+
+ ## 💻 Running Locally
+
+ 1. **Install Dependencies**
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. **Download Model Files**
+ Ensure you have configured `download_model.py` with your Google Drive file ID, then run:
+ ```bash
+ python download_model.py
+ ```
+
+ 3. **Start the Server**
+ ```bash
+ uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload
+ ```
+
+ 4. **Open the App**
+ Navigate to `http://localhost:8000` in your web browser.
+
+ ---
+ *Built with ❤️ for algorithmic analysis.*
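Since the new README's quick-start ends at opening the browser UI, a small API smoke test can confirm the server is actually serving predictions. Below is a minimal sketch using Python's `requests`; the `/predict` route and the `code`/`language` payload fields are assumptions for illustration only, since the actual FastAPI schema lives in `backend/main.py` and is not part of this diff:

```python
import requests

# Hypothetical endpoint and payload shape -- verify against backend/main.py.
snippet = "for i in range(n):\n    for j in range(n):\n        total += a[i][j]"

resp = requests.post(
    "http://localhost:8000/predict",               # assumed route
    json={"code": snippet, "language": "python"},  # assumed fields
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # expected to contain a complexity label such as "O(n^2)"
```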
Untitled2.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
train_extracted.py DELETED
@@ -1,315 +0,0 @@
1
- !pip install transformers datasets torch scikit-learn
2
-
3
- # --- CELL ---
4
-
5
- from datasets import load_dataset
6
-
7
- dataset = load_dataset("codeparrot/codecomplex")
8
- print(dataset)
9
- print(dataset['train'][0])
10
-
11
- # --- CELL ---
12
-
13
- import pandas as pd
14
-
15
- df = pd.DataFrame(dataset['train'])
16
-
17
- # Check complexity labels
18
- print("Complexity classes:")
19
- print(df['complexity'].value_counts())
20
-
21
- print("\nLanguages:")
22
- print(df['from'].value_counts())
23
-
24
- print("\nTotal samples:", len(df))
25
-
26
- # --- CELL ---
27
-
28
- from sklearn.preprocessing import LabelEncoder
29
- from sklearn.model_selection import train_test_split
30
-
31
- # Encode labels
32
- le = LabelEncoder()
33
- df['label'] = le.fit_transform(df['complexity'])
34
-
35
- print("Label mapping:")
36
- for i, cls in enumerate(le.classes_):
37
- print(f" {cls} → {i}")
38
-
39
- # Split data
40
- train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
41
-
42
- print(f"\nTrain size: {len(train_df)}")
43
- print(f"Test size: {len(test_df)}")
44
-
45
- # --- CELL ---
46
-
47
- from transformers import AutoTokenizer
48
-
49
- tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
50
-
51
- print("✅ CodeBERT tokenizer loaded!")
52
-
53
- # Test it
54
- sample = df['src'][0][:200]
55
- tokens = tokenizer(sample, truncation=True, max_length=512, return_tensors="pt")
56
- print("Sample token shape:", tokens['input_ids'].shape)
57
-
58
- # --- CELL ---
59
-
60
- import torch
61
- from torch.utils.data import Dataset
62
-
63
- class CodeDataset(Dataset):
64
- def __init__(self, dataframe, tokenizer, max_length=512):
65
- self.data = dataframe
66
- self.tokenizer = tokenizer
67
- self.max_length = max_length
68
-
69
- def __len__(self):
70
- return len(self.data)
71
-
72
- def __getitem__(self, idx):
73
- code = str(self.data.iloc[idx]['src'])
74
- label = int(self.data.iloc[idx]['label'])
75
-
76
- encoding = self.tokenizer(
77
- code,
78
- truncation=True,
79
- max_length=self.max_length,
80
- padding='max_length',
81
- return_tensors='pt'
82
- )
83
-
84
- return {
85
- 'input_ids': encoding['input_ids'].squeeze(),
86
- 'attention_mask': encoding['attention_mask'].squeeze(),
87
- 'label': torch.tensor(label, dtype=torch.long)
88
- }
89
-
90
- # Create datasets
91
- train_dataset = CodeDataset(train_df.reset_index(drop=True), tokenizer)
92
- test_dataset = CodeDataset(test_df.reset_index(drop=True), tokenizer)
93
-
94
- print(f"✅ Train dataset: {len(train_dataset)} samples")
95
- print(f"✅ Test dataset: {len(test_dataset)} samples")
96
-
97
- # --- CELL ---
98
-
99
- from transformers import AutoModelForSequenceClassification
100
- import torch
101
-
102
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
103
- print(f"Using device: {device}")
104
-
105
- model = AutoModelForSequenceClassification.from_pretrained(
106
- "microsoft/codebert-base",
107
- num_labels=7
108
- )
109
-
110
- model = model.to(device)
111
- print("✅ CodeBERT model loaded!")
112
- print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
113
-
114
- # --- CELL ---
115
-
116
- from torch.utils.data import DataLoader
117
- from torch.optim import AdamW
118
- from transformers import get_linear_schedule_with_warmup
119
-
120
- # DataLoaders
121
- train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
122
- test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
123
-
124
- # Optimizer
125
- optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
126
-
127
- # Scheduler
128
- total_steps = len(train_loader) * 3 # 3 epochs
129
- scheduler = get_linear_schedule_with_warmup(
130
- optimizer,
131
- num_warmup_steps=total_steps // 10,
132
- num_training_steps=total_steps
133
- )
134
-
135
- print(f"✅ DataLoaders ready!")
136
- print(f"Total training steps: {total_steps}")
137
- print(f"Steps per epoch: {len(train_loader)}")
138
-
139
- # --- CELL ---
140
-
141
- from tqdm import tqdm
142
-
143
- def train_epoch(model, loader, optimizer, scheduler, device):
144
- model.train()
145
- total_loss = 0
146
- correct = 0
147
- total = 0
148
-
149
- for batch in tqdm(loader, desc="Training"):
150
- input_ids = batch['input_ids'].to(device)
151
- attention_mask = batch['attention_mask'].to(device)
152
- labels = batch['label'].to(device)
153
-
154
- optimizer.zero_grad()
155
- outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
156
- loss = outputs.loss
157
- logits = outputs.logits
158
-
159
- loss.backward()
160
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
161
- optimizer.step()
162
- scheduler.step()
163
-
164
- total_loss += loss.item()
165
- preds = torch.argmax(logits, dim=1)
166
- correct += (preds == labels).sum().item()
167
- total += labels.size(0)
168
-
169
- return total_loss / len(loader), correct / total
170
-
171
-
172
- def evaluate(model, loader, device):
173
- model.eval()
174
- correct = 0
175
- total = 0
176
-
177
- with torch.no_grad():
178
- for batch in tqdm(loader, desc="Evaluating"):
179
- input_ids = batch['input_ids'].to(device)
180
- attention_mask = batch['attention_mask'].to(device)
181
- labels = batch['label'].to(device)
182
-
183
- outputs = model(input_ids=input_ids, attention_mask=attention_mask)
184
- preds = torch.argmax(outputs.logits, dim=1)
185
- correct += (preds == labels).sum().item()
186
- total += labels.size(0)
187
-
188
- return correct / total
189
-
190
-
191
- # Train for 3 epochs
192
- best_accuracy = 0
193
-
194
- for epoch in range(3):
195
- print(f"\n🔄 Epoch {epoch+1}/3")
196
- train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
197
- test_acc = evaluate(model, test_loader, device)
198
-
199
- print(f"Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
200
-
201
- if test_acc > best_accuracy:
202
- best_accuracy = test_acc
203
- torch.save(model.state_dict(), "best_model.pt")
204
- print(f"✅ Best model saved! Accuracy: {best_accuracy*100:.2f}%")
205
-
206
- # --- CELL ---
207
-
208
- # Train 2 more epochs
209
- for epoch in range(2):
210
- print(f"\n🔄 Epoch {epoch+4}/5")
211
- train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
212
- test_acc = evaluate(model, test_loader, device)
213
-
214
- print(f"Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
215
-
216
- if test_acc > best_accuracy:
217
- best_accuracy = test_acc
218
- torch.save(model.state_dict(), "best_model.pt")
219
- print(f"✅ Best model saved! Accuracy: {best_accuracy*100:.2f}%")
220
-
221
- # --- CELL ---
222
-
223
- from google.colab import drive
224
- drive.mount('/content/drive')
225
-
226
- # --- CELL ---
227
-
228
- import shutil
229
-
230
- # Copy files to Google Drive
231
- shutil.copy("best_model.pt", "/content/drive/MyDrive/best_model.pt")
232
- shutil.copy("label_encoder.pkl", "/content/drive/MyDrive/label_encoder.pkl")
233
-
234
- print("✅ Files saved to Google Drive!")
235
-
236
- # --- CELL ---
237
-
238
- # Test the model directly in Colab
239
- test_codes = [
240
- "public int findMax(int[] arr) { int max = arr[0]; for (int i = 1; i < arr.length; i++) { if (arr[i] > max) max = arr[i]; } return max; }",
241
- "return arr[0];",
242
- "for(int i=0;i<n;i++) for(int j=0;j<n;j++) sum+=arr[i][j];",
243
- ]
244
-
245
- for code in test_codes:
246
- inputs = tokenizer(code, truncation=True, max_length=512, padding='max_length', return_tensors='pt')
247
- input_ids = inputs['input_ids'].to(device)
248
- attention_mask = inputs['attention_mask'].to(device)
249
-
250
- with torch.no_grad():
251
- outputs = model(input_ids=input_ids, attention_mask=attention_mask)
252
- pred = torch.argmax(outputs.logits, dim=1).item()
253
-
254
- print(f"Code: {code[:50]}...")
255
- print(f"Predicted: {le.inverse_transform([pred])[0]}\n")
256
-
257
- # --- CELL ---
258
-
259
- import torch.nn as nn
260
-
261
- # Count class frequencies
262
- class_counts = df['label'].value_counts().sort_index().values
263
- total = sum(class_counts)
264
- class_weights = torch.tensor([total/c for c in class_counts], dtype=torch.float).to(device)
265
-
266
- print("Class weights:", class_weights)
267
-
268
- # New training loop with weighted loss
269
- def train_epoch_weighted(model, loader, optimizer, scheduler, device, weights):
270
- model.train()
271
- total_loss = 0
272
- correct = 0
273
- total = 0
274
- criterion = nn.CrossEntropyLoss(weight=weights)
275
-
276
- for batch in tqdm(loader, desc="Training"):
277
- input_ids = batch['input_ids'].to(device)
278
- attention_mask = batch['attention_mask'].to(device)
279
- labels = batch['label'].to(device)
280
-
281
- optimizer.zero_grad()
282
- outputs = model(input_ids=input_ids, attention_mask=attention_mask)
283
- loss = criterion(outputs.logits, labels)
284
-
285
- loss.backward()
286
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
287
- optimizer.step()
288
- scheduler.step()
289
-
290
- total_loss += loss.item()
291
- preds = torch.argmax(outputs.logits, dim=1)
292
- correct += (preds == labels).sum().item()
293
- total += labels.size(0)
294
-
295
- return total_loss / len(loader), correct / total
296
-
297
- # Retrain with weights
298
- optimizer3 = AdamW(model.parameters(), lr=5e-6)
299
- scheduler3 = get_linear_schedule_with_warmup(optimizer3, num_warmup_steps=30, num_training_steps=len(train_loader)*3)
300
-
301
- for epoch in range(3):
302
- print(f"\n🔄 Epoch {epoch+1}/3")
303
- train_loss, train_acc = train_epoch_weighted(model, train_loader, optimizer3, scheduler3, device, class_weights)
304
- test_acc = evaluate(model, test_loader, device)
305
- print(f"Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
306
- if test_acc > best_accuracy:
307
- best_accuracy = test_acc
308
- torch.save(model.state_dict(), "best_model.pt")
309
- print(f"✅ Best model saved! Accuracy: {best_accuracy*100:.2f}%")
310
-
311
- # --- CELL ---
312
-
313
- import shutil
314
- shutil.copy("best_model.pt", "/content/drive/MyDrive/best_model.pt")
315
- print("✅ Saved to Google Drive!")