Spaces:
Running
Running
Final clean version for Hugging Face deployment
Browse files- .gitignore +17 -5
- Dockerfile +5 -16
- README.md +53 -20
- app/app.py +52 -0
- clear_cache +7 -0
- data/code_alpaca_20k.json +0 -0
- data/final_coding_dataset.jsonl +0 -0
- render.yaml +9 -0
- requirements.txt +4 -3
- train/preprocess_dataset.py +40 -0
- train/train_model.py +63 -0
.gitignore
CHANGED
@@ -1,10 +1,22 @@
|
|
1 |
-
# Ignore virtual
|
2 |
.venv/
|
3 |
-
|
|
|
|
|
|
|
4 |
*.pt
|
5 |
-
*.safetensors
|
6 |
*.bin
|
7 |
*.ckpt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
*.log
|
9 |
-
|
10 |
-
|
|
|
|
1 |
+
# Ignore virtual environment
|
2 |
.venv/
|
3 |
+
|
4 |
+
# Ignore model outputs and checkpoints
|
5 |
+
model/
|
6 |
+
logs/
|
7 |
*.pt
|
|
|
8 |
*.bin
|
9 |
*.ckpt
|
10 |
+
*.safetensors
|
11 |
+
|
12 |
+
# Ignore system + temp files
|
13 |
+
__pycache__/
|
14 |
+
*.pyc
|
15 |
+
.DS_Store
|
16 |
+
|
17 |
+
# Ignore cache
|
18 |
+
*.cache/
|
19 |
*.log
|
20 |
+
|
21 |
+
# Ignore Streamlit config if any
|
22 |
+
.streamlit/
|
Dockerfile
CHANGED
@@ -1,21 +1,10 @@
|
|
1 |
-
FROM python:3.
|
2 |
|
3 |
WORKDIR /app
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
curl \
|
8 |
-
software-properties-common \
|
9 |
-
git \
|
10 |
-
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
12 |
-
COPY
|
13 |
-
COPY src/ ./src/
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
EXPOSE 8501
|
18 |
-
|
19 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
20 |
-
|
21 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
1 |
+
FROM python:3.11-slim
|
2 |
|
3 |
WORKDIR /app
|
4 |
|
5 |
+
COPY requirements.txt .
|
6 |
+
RUN pip install --upgrade pip && pip install -r requirements.txt
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
COPY . .
|
|
|
9 |
|
10 |
+
CMD ["streamlit", "run", "app/app.py", "--server.port=8501", "--server.enableCORS=false"]
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -1,33 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
-
|
13 |
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
## 🛠 Tech Stack
|
18 |
-
- Model: Flan-T5-small
|
19 |
-
- Framework: Streamlit
|
20 |
-
- Dataset: CodeAlpaca-20K
|
21 |
-
- Fine-Tuning: HuggingFace Transformers
|
22 |
|
23 |
-
|
24 |
-
👉 [Try it here](https://huggingface.co/spaces/Tuathe/CodeMentor-AI)
|
25 |
|
26 |
-
##
|
27 |
|
28 |
```bash
|
29 |
git clone https://github.com/chetan10510/CodeMentor-AI.git
|
30 |
cd CodeMentor-AI
|
31 |
-
python -m venv .venv
|
|
|
32 |
pip install -r requirements.txt
|
33 |
streamlit run app/app.py
|
|
|
1 |
+
# CodeMentor AI – ChatGPT for Coding Interviews (Fine-Tuned Flan-T5)
|
2 |
+
|
3 |
+
CodeMentor AI is a fine-tuned language model specialized for solving **coding interview questions**, built on top of **Flan-T5-small** (`google/flan-t5-small`), trained with 20K+ prompts, and deployed with a sleek **ChatGPT-style UI using Streamlit**.
|
4 |
+
|
5 |
+
---
|
6 |
+
|
7 |
+
## Features
|
8 |
+
|
9 |
+
- Fine-tuned LLM using HuggingFace Transformers
|
10 |
+
- Trained on 20K+ high-quality coding problems (CodeAlpaca dataset)
|
11 |
+
- Clean ChatGPT-style frontend built with Streamlit
|
12 |
+
- Docker-ready for easy deployment
|
13 |
+
- Optimized for local + cloud usage
|
14 |
+
- Can run inference via terminal or web UI
|
15 |
+
|
16 |
+
---
|
17 |
+
|
18 |
+
## Tech Stack
|
19 |
+
|
20 |
+
- `Flan-T5-small` (HuggingFace)
|
21 |
+
- `Transformers` + `Datasets`
|
22 |
+
- `Streamlit`
|
23 |
+
- `Docker` for packaging
|
24 |
+
- `Render` or `HuggingFace Spaces` for deployment
|
25 |
+
|
26 |
---
|
27 |
+
|
28 |
+
## Training Details
|
29 |
+
|
30 |
+
| Config | Value |
|
31 |
+
|----------------|-------------------------|
|
32 |
+
| Model | `google/flan-t5-small` |
|
33 |
+
| Epochs | 6 |
|
34 |
+
| Batch Size | 2 per device (gradient accumulation: 2 steps) |
|
35 |
+
| Learning Rate | 5e-5 |
|
36 |
+
| Max Length | 512 tokens |
|
37 |
+
| GPU | GTX 1650 (4GB VRAM) |
|
38 |
+
| Total Samples | ~20,000 examples |
|
39 |
+
| Training Time | ~4 hours |
|
40 |
+
|
41 |
---
|
42 |
|
43 |
+
## Folder Structure
|
44 |
|
45 |
+
CodeMentor-AI/
|
46 |
+
│
|
47 |
+
├── data/ # Raw + Processed Datasets
|
48 |
+
├── model/codementor-flan/ # Saved fine-tuned model
|
49 |
+
├── train/ # Preprocessing + Training scripts
|
50 |
+
├── app/app.py # Streamlit Chat UI
|
51 |
+
├── requirements.txt # All dependencies
|
52 |
+
├── Dockerfile # Docker config
|
53 |
+
├── render.yaml # Optional Render deployment config
|
54 |
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
+
---
|
|
|
57 |
|
58 |
+
## How to Run Locally
|
59 |
|
60 |
```bash
|
61 |
git clone https://github.com/chetan10510/CodeMentor-AI.git
|
62 |
cd CodeMentor-AI
|
63 |
+
python -m venv .venv
|
64 |
+
.venv\Scripts\activate # Windows
|
65 |
pip install -r requirements.txt
|
66 |
streamlit run app/app.py
|
app/app.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
3 |
+
import torch
|
4 |
+
|
5 |
+
# Load model and tokenizer
|
6 |
+
@st.cache_resource
def load_model():
    """Load the fine-tuned seq2seq model and its tokenizer from disk.

    Wrapped in ``st.cache_resource`` so the pair is created once and shared
    across Streamlit reruns instead of being reloaded on every interaction.

    Returns:
        tuple: ``(model, tokenizer)`` loaded from ``model/codementor-flan``.
    """
    model_dir = "model/codementor-flan"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return model, tokenizer
|
11 |
+
|
12 |
+
# Streamlit page config.  Streamlit documents set_page_config() as needing to
# be the first Streamlit command on a page, so it runs before load_model()
# (whose st.cache_resource wrapper may render a spinner while loading).
st.set_page_config(page_title="CodeMentor AI", page_icon="💻", layout="centered")

# Load the cached model/tokenizer and place the model on the best device.
model, tokenizer = load_model()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
|
18 |
+
|
19 |
+
# Centered page title and tagline (raw HTML, hence unsafe_allow_html).
st.markdown(
    "<h1 style='text-align: center;'>CodeMentor AI</h1>",
    unsafe_allow_html=True,
)
st.markdown(
    "<p style='text-align: center; font-size:18px;'>Your AI Coding Interview Assistant</p>",
    unsafe_allow_html=True,
)

# Sidebar info
with st.sidebar:
    st.title("About CodeMentor AI")
    st.info(
        "This assistant is fine-tuned on 20k+ coding problems. "
        "Ask any Data Structures, Algorithms, or Python/Java coding question!"
    )
    st.markdown("---")
    st.markdown("Created by Chetan")
|
37 |
+
|
38 |
+
# Chat interface
|
39 |
+
user_input = st.text_area("Ask your coding question here:", height=150)

if st.button("Get Answer"):
    question = user_input.strip()
    if not question:
        st.warning("Please enter a question.")
    else:
        with st.spinner("Generating answer..."):
            # Use the same "Question: ..." template the model was fine-tuned
            # on (train/train_model.py); a different template at inference
            # time gives the model inputs it never saw during training.
            prompt = f"Question: {question}"
            # 512 matches the encoder max_length used for training inputs.
            inputs = tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=512
            ).to(device)
            outputs = model.generate(**inputs, max_new_tokens=256)
            # A seq2seq decoder emits only the answer, so there is no echoed
            # prompt to split off.
            answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            st.success("Response:")
            st.code(answer, language="python")
|
clear_cache
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python -c "import torch; torch.cuda.empty_cache()"
|
2 |
+
- Clears PyTorch's cached CUDA memory.
|
3 |
+
|
4 |
+
|
5 |
+
Generate a random integer between 4 and 8 (inclusively)
|
6 |
+
Write a SQL query to find the total number of orders placed between two given dates
|
7 |
+
Create a program that can calculate the distance between two points in three-dimensional space.
|
data/code_alpaca_20k.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/final_coding_dataset.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
render.yaml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
services:
|
2 |
+
- type: web
|
3 |
+
name: CodeMentorAI
|
4 |
+
env: docker
|
5 |
+
plan: free
|
6 |
+
region: oregon
|
7 |
+
dockerContext: .
|
8 |
+
dockerfilePath: Dockerfile
|
9 |
+
autoDeploy: false
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers
|
3 |
+
torch
|
4 |
+
sentencepiece
|
train/preprocess_dataset.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Paths
|
5 |
+
# Anchor the data paths to this script's location so the script behaves the
# same whether it is launched from the repo root or from inside train/.
_script_dir = os.path.dirname(os.path.abspath(__file__))
_data_dir = os.path.normpath(os.path.join(_script_dir, "..", "data"))
input_path = os.path.join(_data_dir, "code_alpaca_20k.json")
output_path = os.path.join(_data_dir, "final_coding_dataset.jsonl")

# Make sure output folder exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Load dataset
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Format into prompt-completion pairs
processed = []
for example in data:
    # `or ""` also covers fields that are present but null in the JSON.
    instruction = (example.get("instruction") or "").strip()
    input_text = (example.get("input") or "").strip()
    output_text = (example.get("output") or "").strip()

    # A usable training pair needs both a question and an answer.
    if not (instruction and output_text):
        continue

    # Optional extra context is appended after a blank line.
    prompt = f"{instruction}\n\n{input_text}" if input_text else instruction
    processed.append({"prompt": prompt, "completion": output_text})

# Save in JSONL format (one JSON object per line)
with open(output_path, "w", encoding="utf-8") as f:
    for item in processed:
        f.write(json.dumps(item) + "\n")

print(f"Preprocessing complete. Total examples: {len(processed)}")
print(f"Saved to: {output_path}")
|
40 |
+
|
train/train_model.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
from datasets import load_dataset
|
4 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
|
5 |
+
|
6 |
+
# Config
|
7 |
+
model_name = "google/flan-t5-small"
|
8 |
+
data_path = "data/final_coding_dataset.jsonl"
|
9 |
+
|
10 |
+
# Load dataset
|
11 |
+
dataset = load_dataset("json", data_files=data_path, split="train")
|
12 |
+
|
13 |
+
# Format data for T5
|
14 |
+
def format_example(example):
    """Map one prompt/completion record to the text pair used for training.

    Args:
        example: Mapping with ``"prompt"`` and ``"completion"`` string fields.

    Returns:
        dict: ``{"input_text": "Question: <prompt>", "target_text": <completion>}``.
    """
    return {
        "input_text": f"Question: {example['prompt']}",
        "target_text": example["completion"],
    }
|
19 |
+
|
20 |
+
dataset = dataset.map(format_example)
|
21 |
+
|
22 |
+
# Tokenizer
|
23 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
24 |
+
|
25 |
+
def tokenize(batch):
    """Tokenize a batch of formatted examples for seq2seq training.

    Inputs are padded/truncated to 512 tokens and targets to 128 tokens.
    Padding positions in the targets are replaced with -100 so that the
    loss ignores them; otherwise the model is also trained to emit pad
    tokens for every short answer.

    Args:
        batch: Batched mapping with ``"input_text"`` and ``"target_text"``
            lists (as passed by ``dataset.map(..., batched=True)``).

    Returns:
        The input encoding with an added ``"labels"`` entry.
    """
    input_enc = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    target_enc = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)

    pad_id = tokenizer.pad_token_id
    # -100 is the label value the cross-entropy loss skips.
    input_enc["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in target_enc["input_ids"]
    ]
    return input_enc
|
30 |
+
|
31 |
+
# Tokenize the whole dataset in batches and expose only the tensor columns
# the Trainer consumes.
dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training args
training_args = TrainingArguments(
    output_dir="model/codementor-flan",
    num_train_epochs=6,  # use epochs here
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    save_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=100,
    report_to="none",
    fp16=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Train
trainer.train()

# Save final model and tokenizer to the directory app/app.py loads from
model.save_pretrained("model/codementor-flan")
tokenizer.save_pretrained("model/codementor-flan")
|