Spaces:

Tuathe
/

CodeMentor-AI

Running

App Files Files

xet

Community

Tuathe committed on Jul 31

Commit

72df28d

1 Parent(s): 53e67ba

Final clean version for Hugging Face deployment

Browse files

Files changed (11) hide show

.gitignore +17 -5
Dockerfile +5 -16
README.md +53 -20
app/app.py +52 -0
clear_cache +7 -0
data/code_alpaca_20k.json +0 -0
data/final_coding_dataset.jsonl +0 -0
render.yaml +9 -0
requirements.txt +4 -3
train/preprocess_dataset.py +40 -0
train/train_model.py +63 -0

.gitignore CHANGED Viewed

@@ -1,10 +1,22 @@
-# Ignore virtual environments and model weights
 .venv/
-__pycache__/
 *.pt
-*.safetensors
 *.bin
 *.ckpt
 *.log
-model/
-data/*.jsonl

+# Ignore virtual environment
 .venv/
+# Ignore model outputs and checkpoints
+model/
+logs/
 *.pt
 *.bin
 *.ckpt
+*.safetensors
+# Ignore system + temp files
+__pycache__/
+*.pyc
+.DS_Store
+# Ignore cache
+*.cache/
 *.log
+# Ignore Streamlit config if any
+.streamlit/

Dockerfile CHANGED Viewed

@@ -1,21 +1,10 @@
-FROM python:3.9-slim
 WORKDIR /app
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    software-properties-common \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
-EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+FROM python:3.11-slim
 WORKDIR /app
+COPY requirements.txt .
+RUN pip install --upgrade pip && pip install -r requirements.txt
+COPY . .
+CMD ["streamlit", "run", "app/app.py", "--server.port=8501", "--server.enableCORS=false"]

README.md CHANGED Viewed

@@ -1,33 +1,66 @@
 ---
-title: CodeMentor AI
-emoji: 🧠
-colorFrom: purple
-colorTo: blue
-sdk: streamlit
-sdk_version: "1.30.0"
-app_file: app/app.py
-pinned: false
 ---
-# CodeMentor AI
-A fine-tuned LLM app built with Flan-T5-small to help students and developers master coding interviews.
-It supports offline and online inference, trained on 20K+ coding questions and answers.
-## 🛠 Tech Stack
-- Model: Flan-T5-small
-- Framework: Streamlit
-- Dataset: CodeAlpaca-20K
-- Fine-Tuning: HuggingFace Transformers
-## 🔗 Live Demo
-👉 [Try it here](https://huggingface.co/spaces/Tuathe/CodeMentor-AI)
-## 🚀 How to Run Locally
 ```bash
 git clone https://github.com/chetan10510/CodeMentor-AI.git
 cd CodeMentor-AI
-python -m venv .venv && .venv\Scripts\activate
 pip install -r requirements.txt
 streamlit run app/app.py

+# CodeMentor AI – ChatGPT for Coding Interviews (Fine-Tuned Flan-T5)
+CodeMentor AI is a fine-tuned language model specialized for solving **coding interview questions**, built on top of **Flan-T5-small**, trained with 20K+ prompts, and deployed with a sleek **ChatGPT-style UI using Streamlit**.
+---
+##  Features
+-  Fine-tuned LLM using HuggingFace Transformers
+-  Trained on 20K+ high-quality coding problems (CodeAlpaca dataset)
+-  Clean ChatGPT-style frontend built with Streamlit
+-  Docker-ready for easy deployment
+-  Optimized for local + cloud usage
+-  Can run inference via terminal or web UI
+---
+##  Tech Stack
+- `Flan-T5-small` (HuggingFace)
+- `Transformers` + `Datasets`
+- `Streamlit`
+- `Docker` for packaging
+- `Render` or `HuggingFace Spaces` for deployment
 ---
+##  Training Details
+| Config         | Value                   |
+|----------------|-------------------------|
+| Model          | `google/flan-t5-small`  |
+| Epochs         | 6                       |
+| Batch Size     | 2 (gradient accumulation: 2 steps) |
+| Learning Rate  | 5e-5                    |
+| Max Length     | 512 tokens              |
+| GPU            | GTX 1650 (4GB VRAM)     |
+| Total Samples  | ~20,000 examples        |
+| Training Time  | ~4 hours                |
 ---
+##  Folder Structure
+CodeMentor-AI/
+│
+├── data/ # Raw + Processed Datasets
+├── model/codementor-flan/ # Saved fine-tuned model
+├── train/ # Preprocessing + Training scripts
+├── app/app.py # Streamlit Chat UI
+├── requirements.txt # All dependencies
+├── Dockerfile # Docker config
+├── render.yaml # Optional Render deployment config
+---
+## How to Run Locally
 ```bash
 git clone https://github.com/chetan10510/CodeMentor-AI.git
 cd CodeMentor-AI
+python -m venv .venv
+.venv\Scripts\activate       # Windows (macOS/Linux: source .venv/bin/activate)
 pip install -r requirements.txt
 streamlit run app/app.py

app/app.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import streamlit as st
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+import torch
+# Load model and tokenizer
+@st.cache_resource
+def load_model():
+    model = AutoModelForSeq2SeqLM.from_pretrained("model/codementor-flan")
+    tokenizer = AutoTokenizer.from_pretrained("model/codementor-flan")
+    return model, tokenizer
+model, tokenizer = load_model()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+# Streamlit page config
+st.set_page_config(page_title="CodeMentor AI", page_icon="💻", layout="centered")
+st.markdown(
+    "<h1 style='text-align: center;'>CodeMentor AI</h1>",
+    unsafe_allow_html=True
+)
+st.markdown(
+    "<p style='text-align: center; font-size:18px;'>Your AI Coding Interview Assistant</p>",
+    unsafe_allow_html=True
+)
+# Sidebar info
+with st.sidebar:
+    st.title("About CodeMentor AI")
+    st.info(
+        "This assistant is fine-tuned on 20k+ coding problems. "
+        "Ask any Data Structures, Algorithms, or Python/Java coding question!"
+    )
+    st.markdown("---")
+    st.markdown("Created by Chetan")
+# Chat interface
+user_input = st.text_area("Ask your coding question here:", height=150)
+if st.button("Get Answer"):
+    if not user_input.strip():
+        st.warning("Please enter a question.")
+    else:
+        with st.spinner("Generating answer..."):
+            prompt = f"### Question:\n{user_input}\n\n### Answer:\n"
+            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
+            outputs = model.generate(**inputs, max_new_tokens=256)
+            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            answer = answer.split("### Answer:")[-1].strip()
+            st.success("Response:")
+            st.code(answer, language="python")

clear_cache ADDED Viewed

	@@ -0,0 +1,7 @@

+python -c "import torch; torch.cuda.empty_cache()"
+- Clears PyTorch's cached CUDA memory.
+Generate a random integer between 4 and 8 (inclusively)
+Write a SQL query to find the total number of orders placed between two given dates
+Create a program that can calculate the distance between two points in three-dimensional space.

data/code_alpaca_20k.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/final_coding_dataset.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

render.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+services:
+  - type: web
+    name: CodeMentorAI
+    env: docker
+    plan: free
+    region: oregon
+    dockerContext: .
+    dockerfilePath: Dockerfile
+    autoDeploy: false

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
-altair
-pandas
-streamlit

+streamlit
+transformers
+torch
+sentencepiece

train/preprocess_dataset.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import json
+import os
+# Paths
+input_path = "../data/code_alpaca_20k.json"
+output_path = "../data/final_coding_dataset.jsonl"
+# Make sure output folder exists
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+# Load dataset
+with open(input_path, "r", encoding="utf-8") as f:
+    data = json.load(f)
+# Format into prompt-completion pairs
+processed = []
+for example in data:
+    instruction = example.get("instruction", "").strip()
+    input_text = example.get("input", "").strip()
+    output_text = example.get("output", "").strip()
+    if instruction and output_text:
+        prompt = instruction
+        if input_text:
+            prompt += "\n\n" + input_text
+        processed.append({
+            "prompt": prompt,
+            "completion": output_text
+        })
+# Save in JSONL format
+with open(output_path, "w", encoding="utf-8") as f:
+    for item in processed:
+        json.dump(item, f)
+        f.write("\n")
+print(f"Preprocessing complete. Total examples: {len(processed)}")
+print(f"Saved to: {output_path}")

train/train_model.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+import torch
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
+# Config
+model_name = "google/flan-t5-small"
+data_path = "data/final_coding_dataset.jsonl"
+# Load dataset
+dataset = load_dataset("json", data_files=data_path, split="train")
+# Format data for T5
+def format_example(example):
+    return {
+        "input_text": f"Question: {example['prompt']}",
+        "target_text": example["completion"]
+    }
+dataset = dataset.map(format_example)
+# Tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+def tokenize(batch):
+    input_enc = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
+    target_enc = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)
+    input_enc["labels"] = target_enc["input_ids"]
+    return input_enc
+dataset = dataset.map(tokenize, batched=True)
+dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
+# Load model
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# Training args
+training_args = TrainingArguments(
+    output_dir="model/codementor-flan",
+    num_train_epochs=6,                      #  use epochs here
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=2,
+    save_steps=100,
+    save_total_limit=2,
+    logging_steps=100,
+    report_to="none",
+    fp16=False
+)
+# Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset,
+    tokenizer=tokenizer
+)
+# Train
+trainer.train()
+# Save final model
+model.save_pretrained("model/codementor-flan")
+tokenizer.save_pretrained("model/codementor-flan")