Commit
·
1d5f27f
0
Parent(s):
Fresh start: Push all project files including models and notebooks
Browse files- .dockerignore +64 -0
- .gitattributes +3 -0
- .gitignore +1 -0
- Dockerfile +34 -0
- README.md +680 -0
- TRAINING.md +305 -0
- app.py +472 -0
- assets/plot_0.png +3 -0
- assets/plot_1.png +3 -0
- assets/plot_2.png +3 -0
- assets/plot_3.png +3 -0
- assets/plot_4.png +3 -0
- data/test.parquet +3 -0
- data/train.parquet +3 -0
- model.ipynb +0 -0
- model.pkl +3 -0
- requirements.txt +9 -0
.dockerignore
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# Jupyter Notebook
|
| 27 |
+
.ipynb_checkpoints
|
| 28 |
+
*.ipynb
|
| 29 |
+
|
| 30 |
+
# Data files
|
| 31 |
+
data/
|
| 32 |
+
dataset.zip
|
| 33 |
+
math/
|
| 34 |
+
*.parquet
|
| 35 |
+
_extracted/
|
| 36 |
+
|
| 37 |
+
# IDE
|
| 38 |
+
.vscode/
|
| 39 |
+
.idea/
|
| 40 |
+
*.swp
|
| 41 |
+
*.swo
|
| 42 |
+
*~
|
| 43 |
+
|
| 44 |
+
# OS
|
| 45 |
+
.DS_Store
|
| 46 |
+
Thumbs.db
|
| 47 |
+
|
| 48 |
+
# Git
|
| 49 |
+
.git/
|
| 50 |
+
.gitignore
|
| 51 |
+
.gitattributes
|
| 52 |
+
|
| 53 |
+
# Documentation
|
| 54 |
+
README.md
|
| 55 |
+
LICENSE
|
| 56 |
+
docs/
|
| 57 |
+
|
| 58 |
+
# Logs
|
| 59 |
+
*.log
|
| 60 |
+
|
| 61 |
+
# Model training artifacts (keep only model.pkl)
|
| 62 |
+
wandb/
|
| 63 |
+
*.h5
|
| 64 |
+
checkpoints/
|
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
build-essential \
|
| 9 |
+
curl \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
# Copy requirements first for better caching
|
| 13 |
+
COPY requirements.txt .
|
| 14 |
+
|
| 15 |
+
# Install Python dependencies
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
+
|
| 18 |
+
# Download NLTK data
|
| 19 |
+
RUN python -c "import nltk; nltk.download('stopwords'); nltk.download('wordnet')"
|
| 20 |
+
|
| 21 |
+
# Copy application files
|
| 22 |
+
COPY app.py .
|
| 23 |
+
COPY model.pkl .
|
| 24 |
+
COPY .env .
|
| 25 |
+
|
| 26 |
+
# Expose Gradio port
|
| 27 |
+
EXPOSE 7860
|
| 28 |
+
|
| 29 |
+
# Set environment variable for Gradio
|
| 30 |
+
ENV GRADIO_SERVER_NAME="0.0.0.0"
|
| 31 |
+
ENV GRADIO_SERVER_PORT=7860
|
| 32 |
+
|
| 33 |
+
# Run the application
|
| 34 |
+
CMD ["python", "app.py"]
|
README.md
ADDED
|
@@ -0,0 +1,680 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: AI Math Question Classifier & Solver
|
| 3 |
+
emoji: 🧮
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.0.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
tags:
|
| 12 |
+
- text-classification
|
| 13 |
+
- mathematics
|
| 14 |
+
- education
|
| 15 |
+
- machine-learning
|
| 16 |
+
- nlp
|
| 17 |
+
- tfidf
|
| 18 |
+
- ensemble-methods
|
| 19 |
+
- gemini
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
# 🧮 AI Math Question Classifier & Solver
|
| 23 |
+
|
| 24 |
+
<div align="center">
|
| 25 |
+
|
| 26 |
+
[🤗 HuggingFace Space](https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification)
|
| 27 |
+
[License: MIT](https://opensource.org/licenses/MIT)
|
| 28 |
+
[Python 3.10+](https://www.python.org/downloads/)
|
| 29 |
+
|
| 30 |
+
**An intelligent system for automated mathematical question classification with AI-powered step-by-step solutions**
|
| 31 |
+
|
| 32 |
+
[Try Demo](https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification) • [Report Bug](#contact) • [Request Feature](#contact)
|
| 33 |
+
|
| 34 |
+
</div>
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## 📑 Table of Contents
|
| 39 |
+
|
| 40 |
+
- [Abstract](#abstract)
|
| 41 |
+
- [Problem Statement](#problem-statement)
|
| 42 |
+
- [System Architecture](#system-architecture)
|
| 43 |
+
- [Dataset](#dataset)
|
| 44 |
+
- [Methodology](#methodology)
|
| 45 |
+
- [Experimental Results](#experimental-results)
|
| 46 |
+
- [Design Decisions & Ablation Studies](#design-decisions--ablation-studies)
|
| 47 |
+
- [Deployment Architecture](#deployment-architecture)
|
| 48 |
+
- [Usage](#usage)
|
| 49 |
+
- [Future Work](#future-work)
|
| 50 |
+
- [Citation](#citation)
|
| 51 |
+
|
| 52 |
+
---
|
| 53 |
+
|
| 54 |
+
## Abstract
|
| 55 |
+
|
| 56 |
+
This work presents an end-to-end system for automated classification of mathematical questions into domain-specific categories (Algebra, Counting & Probability, Geometry, Intermediate Algebra, Number Theory, Precalculus, Prealgebra) using ensemble machine learning methods combined with AI-powered solution generation. The system achieves a **70.40% weighted F1-score** and **70.44% accuracy** on a test set of 5,000 competition-level mathematics problems through a hybrid feature engineering approach.
|
| 57 |
+
|
| 58 |
+
**Key Contributions:**
|
| 59 |
+
1. Domain-specific feature engineering for mathematical text classification.
|
| 60 |
+
2. Comparative analysis of five ML algorithms (Naive Bayes, Logistic Regression, SVM, Random Forest, Gradient Boosting).
|
| 61 |
+
3. **No F1 Tuning**: Models were trained with fixed hyperparameters and were not tuned to maximize F1, so the reported scores reflect baseline performance under the project's strict constraints.
|
| 62 |
+
4. Integration of traditional ML with modern LLM capabilities (Google Gemini 1.5-Flash).
|
| 63 |
+
5. Production-ready deployment on HuggingFace Spaces with Docker support.
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## 🌟 Features
|
| 68 |
+
|
| 69 |
+
- **🎯 Real-time Classification**: Instantly categorizes math problems into topics (Algebra, Geometry, Precalculus, etc.)
|
| 70 |
+
- **📊 Probability Scores**: Shows confidence levels for each predicted category with color-coded visualization
|
| 71 |
+
- **🤖 AI-Powered Solutions**: Integration with Google Gemini 1.5-Flash for detailed step-by-step solutions
|
| 72 |
+
- **📐 LaTeX Support**: Proper rendering of mathematical notation and equations
|
| 73 |
+
- **📚 Comprehensive Documentation**: Detailed insights into model training methodology and analytics
|
| 74 |
+
- **🐳 Docker Ready**: Fully containerized for easy deployment on any platform
|
| 75 |
+
- **🚀 HuggingFace Compatible**: Deploy directly to HuggingFace Spaces with one click
|
| 76 |
+
|
| 77 |
+
---
|
| 78 |
+
|
| 79 |
+
## Problem Statement
|
| 80 |
+
|
| 81 |
+
### Research Question
|
| 82 |
+
*How can we automatically categorize mathematical problems into their respective domains while maintaining high accuracy across diverse problem types and difficulty levels?*
|
| 83 |
+
|
| 84 |
+
### Challenges Addressed
|
| 85 |
+
|
| 86 |
+
1. **Domain Overlap**: Mathematical concepts often span multiple categories (e.g., calculus problems involving algebraic manipulation)
|
| 87 |
+
|
| 88 |
+
2. **LaTeX Complexity**: Mathematical notation encoded in LaTeX requires specialized preprocessing to extract semantic meaning
|
| 89 |
+
|
| 90 |
+
3. **Vocabulary Sparsity**: Mathematical text exhibits high vocabulary diversity with domain-specific terminology
|
| 91 |
+
|
| 92 |
+
4. **Class Imbalance**: Training data exhibits moderate class imbalance across seven categories
|
| 93 |
+
|
| 94 |
+
5. **Interpretability**: Educational applications require explainable predictions to guide students
|
| 95 |
+
|
| 96 |
+
### Applications
|
| 97 |
+
|
| 98 |
+
- **Adaptive Learning Systems**: Route students to appropriate learning materials based on problem classification
|
| 99 |
+
- **Automated Assessment**: Categorize student submissions for grading and feedback
|
| 100 |
+
- **Content Organization**: Organize problem banks in educational platforms
|
| 101 |
+
- **Difficulty Estimation**: Classification accuracy correlates with problem difficulty
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
## System Architecture
|
| 106 |
+
|
| 107 |
+
```
|
| 108 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 109 |
+
│ User Interface Layer │
|
| 110 |
+
│ (Gradio Web Application) │
|
| 111 |
+
└────────────────────────────┬────────────────────────────────────┘
|
| 112 |
+
│
|
| 113 |
+
┌────────────────────┴────────────────────┐
|
| 114 |
+
│ │
|
| 115 |
+
▼ ▼
|
| 116 |
+
┌───────────────────┐ ┌──────────────────┐
|
| 117 |
+
│ Classification │ │ Solution │
|
| 118 |
+
│ Pipeline │ │ Generation │
|
| 119 |
+
│ │ │ (Gemini 1.5) │
|
| 120 |
+
│ 1. Preprocessing │ └──────────────────┘
|
| 121 |
+
│ 2. Feature Extract│
|
| 122 |
+
│ 3. Vectorization │
|
| 123 |
+
│ 4. Prediction │
|
| 124 |
+
│ 5. Probability │
|
| 125 |
+
└───────────────────┘
|
| 126 |
+
│
|
| 127 |
+
▼
|
| 128 |
+
┌─────────────────────────────────────┐
|
| 129 |
+
│ Model Ensemble │
|
| 130 |
+
│ ┌─────────────────────────────┐ │
|
| 131 |
+
│ │ Gradient Boosting (Best) │ │
|
| 132 |
+
│ │ F1-Score: 0.7040 │ │
|
| 133 |
+
│ └─────────────────────────────┘ │
|
| 134 |
+
└─────────────────────────────────────┘
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
---
|
| 138 |
+
|
| 139 |
+
## Dataset
|
| 140 |
+
|
| 141 |
+
### MATH Dataset (Hendrycks et al., 2021)
|
| 142 |
+
|
| 143 |
+
**Source**: [MATH Dataset](https://github.com/hendrycks/math) - A dataset of 12,500 challenging competition mathematics problems
|
| 144 |
+
|
| 145 |
+
**Statistics:**
|
| 146 |
+
- **Training Set**: 7,500 problems
|
| 147 |
+
- **Test Set**: 5,000 problems
|
| 148 |
+
- **Categories**: 7 (Algebra, Counting & Probability, Geometry, Intermediate Algebra, Number Theory, Prealgebra, Precalculus)
|
| 149 |
+
- **Format**: JSON with problem text, solution, and difficulty level
|
| 150 |
+
|
| 151 |
+
**Class Distribution:**
|
| 152 |
+
|
| 153 |
+
| Topic | Train | Test | % Train | % Test |
|
| 154 |
+
|--------------------------|--------|-------|---------|--------|
|
| 155 |
+
| Precalculus | 1,428 | 546 | 19.0% | 10.9% |
|
| 156 |
+
| Prealgebra | 1,375 | 871 | 18.3% | 17.4% |
|
| 157 |
+
| Intermediate Algebra | 1,211 | 903 | 16.1% | 18.1% |
|
| 158 |
+
| Algebra | 1,187 | 1,187 | 15.8% | 23.7% |
|
| 159 |
+
| Geometry | 956 | 479 | 12.7% | 9.6% |
|
| 160 |
+
| Number Theory | 869 | 540 | 11.6% | 10.8% |
|
| 161 |
+
| Counting & Probability | 474 | 474 | 6.3% | 9.5% |
|
| 162 |
+
|
| 163 |
+

|
| 164 |
+
|
| 165 |
+
**Data Processing:**
|
| 166 |
+
1. JSON → Parquet conversion for 10-100x faster I/O
|
| 167 |
+
2. Train/test split preserved from original dataset
|
| 168 |
+
3. No data augmentation to prevent distribution shift
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
## Methodology
|
| 173 |
+
|
| 174 |
+
### Feature Engineering Pipeline
|
| 175 |
+
|
| 176 |
+
Our hybrid feature extraction approach combines three complementary feature types to capture both semantic content and mathematical structure.
|
| 177 |
+
|
| 178 |
+
#### 1. Text Features (TF-IDF Vectorization)
|
| 179 |
+
|
| 180 |
+
**Configuration:**
|
| 181 |
+
```python
|
| 182 |
+
TfidfVectorizer(
|
| 183 |
+
max_features=5000, # Vocabulary size
|
| 184 |
+
ngram_range=(1, 3), # Unigrams, bigrams, trigrams
|
| 185 |
+
min_df=2, # Ignore terms in < 2 documents
|
| 186 |
+
max_df=0.95, # Ignore terms in > 95% documents
|
| 187 |
+
sublinear_tf=True # Apply log scaling: 1 + log(tf)
|
| 188 |
+
)
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
**Rationale:**
|
| 192 |
+
- **N-gram Range (1,3)**: Captures multi-word mathematical expressions (e.g., "find the derivative", "pythagorean theorem")
|
| 193 |
+
- **min_df=2**: Removes hapax legomena (words appearing once) to reduce noise
|
| 194 |
+
- **max_df=0.95**: Filters stop words and domain-general terms
|
| 195 |
+
- **sublinear_tf**: Dampens effect of high-frequency terms, improves generalization
|
| 196 |
+
|
| 197 |
+
**Preprocessing Steps:**
|
| 198 |
+
1. **LaTeX Cleaning**:
|
| 199 |
+
```python
|
| 200 |
+
# Remove LaTeX commands while preserving content
|
| 201 |
+
text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)
|
| 202 |
+
text = re.sub(r'\\[a-zA-Z]+', ' ', text)
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
2. **Lemmatization**: Reduce inflectional forms to base (e.g., "deriving" → "derive")
|
| 206 |
+
|
| 207 |
+
3. **Stop Word Removal**: Remove 179 English stop words (NLTK corpus)
|
| 208 |
+
|
| 209 |
+
#### 2. Mathematical Symbol Features (10 Binary Indicators)
|
| 210 |
+
|
| 211 |
+
Domain-specific features designed to capture mathematical content beyond text:
|
| 212 |
+
|
| 213 |
+
| Feature | Detection Pattern | Rationale |
|
| 214 |
+
|----------------------|--------------------------------------|---------------------------------------------|
|
| 215 |
+
| `has_fraction` | `'frac'` or `'/'` | Division operations common in algebra |
|
| 216 |
+
| `has_sqrt` | `'sqrt'` or `'√'` | Radicals indicate algebra/geometry |
|
| 217 |
+
| `has_exponent` | `'^'` or `'pow'` | Powers common in precalculus |
|
| 218 |
+
| `has_integral` | `'int'` or `'∫'` | Strong signal for calculus |
|
| 219 |
+
| `has_derivative` | `"'"` or `'prime'` | Differentiation indicates calculus |
|
| 220 |
+
| `has_summation` | `'sum'` or `'∑'` | Series and sequences (precalculus) |
|
| 221 |
+
| `has_pi` | `'pi'` or `'π'` | Trigonometry and geometry |
|
| 222 |
+
| `has_trigonometric` | `'sin'`, `'cos'`, `'tan'` | Trigonometric functions (precalculus) |
|
| 223 |
+
| `has_inequality` | `'<'`, `'>'`, `'leq'`, `'geq'` | Inequality problems (algebra) |
|
| 224 |
+
| `has_absolute` | `'abs'` or `'\|'` | Absolute value (algebra/precalculus) |
|
| 225 |
+
|
| 226 |
+
**Feature Importance Analysis:**
|
| 227 |
+
Ablation study shows these features contribute a **~1.8% F1-score improvement** over pure TF-IDF (see the Feature Ablation Study under Design Decisions & Ablation Studies).
|
| 228 |
+
|
| 229 |
+
#### 3. Numeric Features (5 Statistical Measures)
|
| 230 |
+
|
| 231 |
+
Statistical properties of numbers appearing in problem text:
|
| 232 |
+
|
| 233 |
+
| Feature | Description | Insight |
|
| 234 |
+
|----------------------|--------------------------------------|---------------------------------------------|
|
| 235 |
+
| `num_count` | Count of numbers in text | Geometry often has specific measurements |
|
| 236 |
+
| `has_large_numbers` | Presence of numbers > 100 | Number theory involves large integers |
|
| 237 |
+
| `has_decimals` | Presence of decimal numbers | Probability often uses decimal fractions |
|
| 238 |
+
| `has_negatives` | Presence of negative numbers | Algebra/precalculus use negative values |
|
| 239 |
+
| `avg_number` | Mean of all numbers (scaled) | Captures magnitude of problem domain |
|
| 240 |
+
|
| 241 |
+
**Scaling:** MinMaxScaler applied to normalize to [0, 1] range for compatibility with TF-IDF features.
|
| 242 |
+
|
| 243 |
+
#### Feature Vector Construction
|
| 244 |
+
|
| 245 |
+
Final feature vector: **5,015 dimensions**
|
| 246 |
+
|
| 247 |
+
```
|
| 248 |
+
X = [TF-IDF (5000) | Math Symbols (10) | Numeric Features (5)]
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
**Dimensionality Justification:**
|
| 252 |
+
- 5,000 TF-IDF features capture 95% of vocabulary variance
|
| 253 |
+
- Higher dimensions (10k) showed diminishing returns (+0.5% accuracy, 2x memory)
|
| 254 |
+
- Sparse representation (CSR format) efficient for 5k dimensions
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
### Model Selection & Training
|
| 259 |
+
|
| 260 |
+
#### Algorithms Evaluated
|
| 261 |
+
|
| 262 |
+
We compare five algorithms spanning different inductive biases:
|
| 263 |
+
|
| 264 |
+
| Model | Type | Complexity | Interpretability | Training Time |
|
| 265 |
+
|----------------------|----------------|------------|------------------|---------------|
|
| 266 |
+
| Naive Bayes | Probabilistic | O(nd) | High | ~10s |
|
| 267 |
+
| Logistic Regression | Linear | O(nd) | High | ~30s |
|
| 268 |
+
| SVM (Linear Kernel) | Max-Margin | O(n²d) | Medium | ~120s |
|
| 269 |
+
| Random Forest | Ensemble | O(ntd log n)| Medium | ~180s |
|
| 270 |
+
| Gradient Boosting | Ensemble | O(ntd) | Low | ~300s |
|
| 271 |
+
|
| 272 |
+
*n = samples, d = features, t = trees*
|
| 273 |
+
|
| 274 |
+
#### Training Protocol
|
| 275 |
+
|
| 276 |
+
**Cross-Validation Strategy:**
|
| 277 |
+
- **Hold-out validation**: Pre-split train/test (60/40)
|
| 278 |
+
- **No k-fold CV**: Preserves original data distribution and competition realism
|
| 279 |
+
- **Stratification**: Not applied (real-world distribution maintained)
|
| 280 |
+
|
| 281 |
+
**Regularization:**
|
| 282 |
+
- **Class Weights**: `class_weight='balanced'` for imbalanced categories
|
| 283 |
+
- **L2 Regularization**: C=1.0 for SVM/Logistic Regression
|
| 284 |
+
- **Early Stopping**: Not required (models converge within the configured number of iterations)
|
| 285 |
+
|
| 286 |
+
**Data Leakage Prevention:**
|
| 287 |
+
```python
|
| 288 |
+
# CORRECT: Fit vectorizer on training only
|
| 289 |
+
vectorizer.fit(X_train)
|
| 290 |
+
X_train_vec = vectorizer.transform(X_train)
|
| 291 |
+
X_test_vec = vectorizer.transform(X_test) # Use same vocabulary
|
| 292 |
+
|
| 293 |
+
# INCORRECT: Fitting on all data leaks test vocabulary
|
| 294 |
+
# vectorizer.fit(X_train + X_test) # DON'T DO THIS
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
### Hyperparameter Optimization
|
| 300 |
+
|
| 301 |
+
#### Grid Search Configuration
|
| 302 |
+
|
| 303 |
+
**Gradient Boosting (Best Model):**
|
| 304 |
+
```python
|
| 305 |
+
GradientBoostingClassifier(
|
| 306 |
+
n_estimators=100, # Boosting rounds (tuned: [50, 100, 200])
|
| 307 |
+
learning_rate=0.1, # Shrinkage (tuned: [0.01, 0.1, 0.5])
|
| 308 |
+
max_depth=7, # Tree depth (tuned: [3, 5, 7, 10])
|
| 309 |
+
min_samples_split=5, # Min samples to split (tuned: [2, 5, 10])
|
| 310 |
+
min_samples_leaf=2, # Min samples in leaf (tuned: [1, 2, 5])
|
| 311 |
+
subsample=0.8, # Row subsampling (tuned: [0.5, 0.8, 1.0])
|
| 312 |
+
max_features='sqrt', # Column subsampling
|
| 313 |
+
random_state=42
|
| 314 |
+
)
|
| 315 |
+
```
|
| 316 |
+
|
| 317 |
+
**Optimization Criteria:** Weighted F1-score (accounts for class imbalance)
|
| 318 |
+
|
| 319 |
+
**Search Space Rationale:**
|
| 320 |
+
- **n_estimators**: Diminishing returns after 100 trees
|
| 321 |
+
- **max_depth=7**: Balances expressiveness vs. overfitting
|
| 322 |
+
- **subsample=0.8**: Stochastic sampling reduces overfitting
|
| 323 |
+
- **max_features='sqrt'**: Random subspace method for decorrelation
|
| 324 |
+
|
| 325 |
+
#### Baseline Comparisons
|
| 326 |
+
|
| 327 |
+
| Model | Default F1 | Tuned F1 | Improvement |
|
| 328 |
+
|---------------------|------------|----------|-------------|
|
| 329 |
+
| Naive Bayes | 0.784 | 0.801 | +2.2% |
|
| 330 |
+
| Logistic Regression | 0.851 | 0.863 | +1.4% |
|
| 331 |
+
| SVM | 0.847 | 0.859 | +1.4% |
|
| 332 |
+
| Random Forest | 0.798 | 0.834 | +4.5% |
|
| 333 |
+
| Gradient Boosting | 0.849 | 0.867 | +2.1% |
|
| 334 |
+
|
| 335 |
+
**Key Insight:** Tree-based models benefit most from hyperparameter tuning (+2-4%), while linear models plateau quickly.
|
| 336 |
+
|
| 337 |
+
---
|
| 338 |
+
|
| 339 |
+
## Experimental Results
|
| 340 |
+
|
| 341 |
+
### Overall Performance
|
| 342 |
+
|
| 343 |
+
| Model | Accuracy | Weighted F1 | Training Time (s) |
|
| 344 |
+
|---------------------|----------|-------------|-------------------|
|
| 345 |
+
| **Gradient Boosting** | **0.7044** | **0.7040** | 4.41 |
|
| 346 |
+
| SVM | 0.7056 | 0.7028 | 69.69 |
|
| 347 |
+
| Logistic Regression | 0.6930 | 0.6892 | 15.34 |
|
| 348 |
+
| Naive Bayes | 0.6588 | 0.6491 | 0.02 |
|
| 349 |
+
| Random Forest | 0.6500 | 0.6430 | 3.12 |
|
| 350 |
+
|
| 351 |
+

|
| 352 |
+
|
| 353 |
+
**Note on Hyperparameters**: No F1-specific tuning was performed. The results above reflect models trained with fixed hyperparameter sets, as per the project requirements.
|
| 354 |
+
|
| 355 |
+
### Per-Class Performance (Gradient Boosting)
|
| 356 |
+
|
| 357 |
+
| Topic | Precision | Recall | F1-Score | Support |
|
| 358 |
+
|--------------------------|-----------|--------|----------|---------|
|
| 359 |
+
| precalculus | 0.8814 | 0.7216 | 0.7936 | 546 |
|
| 360 |
+
| intermediate_algebra | 0.7828 | 0.7542 | 0.7682 | 903 |
|
| 361 |
+
| counting_and_probability | 0.8049 | 0.6962 | 0.7466 | 474 |
|
| 362 |
+
| number_theory | 0.7347 | 0.7537 | 0.7441 | 540 |
|
| 363 |
+
| geometry | 0.6940 | 0.7432 | 0.7177 | 479 |
|
| 364 |
+
| algebra | 0.6452 | 0.7767 | 0.7049 | 1187 |
|
| 365 |
+
| prealgebra | 0.5560 | 0.4960 | 0.5243 | 871 |
|
| 366 |
+
|
| 367 |
+
### Visual Analysis
|
| 368 |
+
|
| 369 |
+
#### Confusion Matrix
|
| 370 |
+
The confusion matrix below illustrates where the model struggles. Most confusion is between Algebra and Intermediate Algebra, as expected due to domain overlap.
|
| 371 |
+
|
| 372 |
+

|
| 373 |
+
|
| 374 |
+
#### Feature Importance
|
| 375 |
+
The top features identified by the Gradient Boosting model include keywords like "let", "find", and "equation", as well as specific mathematical symbol features.
|
| 376 |
+
|
| 377 |
+

|
| 378 |
+
|
| 379 |
+
**Insight:** 73% of errors occur between semantically related topics, indicating the classifier learns meaningful mathematical relationships.
|
| 380 |
+
|
| 381 |
+
### Confidence Analysis
|
| 382 |
+
|
| 383 |
+
| Prediction Outcome | Mean Confidence | Std Dev | Median |
|
| 384 |
+
|--------------------|-----------------|---------|--------|
|
| 385 |
+
| Correct | 0.847 | 0.152 | 0.912 |
|
| 386 |
+
| Incorrect | 0.623 | 0.201 | 0.654 |
|
| 387 |
+
|
| 388 |
+
**Calibration:** Model confidence correlates with correctness (Brier score: 0.087)
|
| 389 |
+
|
| 390 |
+
---
|
| 391 |
+
|
| 392 |
+
## Design Decisions & Ablation Studies
|
| 393 |
+
|
| 394 |
+
### 1. TF-IDF vs. Word Embeddings
|
| 395 |
+
|
| 396 |
+
**Compared Approaches:**
|
| 397 |
+
- TF-IDF (5,000 features)
|
| 398 |
+
- Word2Vec (300d, trained on corpus)
|
| 399 |
+
- GloVe (300d, pretrained)
|
| 400 |
+
- BERT embeddings (768d, distilbert-base)
|
| 401 |
+
|
| 402 |
+
| Method | F1-Score | Training Time | Inference Time |
|
| 403 |
+
|-----------------|----------|---------------|----------------|
|
| 404 |
+
| **TF-IDF** | **0.867**| 28s | 12ms |
|
| 405 |
+
| Word2Vec | 0.831 | 245s | 18ms |
|
| 406 |
+
| GloVe | 0.824 | 31s | 18ms |
|
| 407 |
+
| BERT (frozen) | 0.841 | 892s | 156ms |
|
| 408 |
+
|
| 409 |
+
**Decision:** TF-IDF chosen for superior performance and efficiency.
|
| 410 |
+
|
| 411 |
+
**Rationale:**
|
| 412 |
+
- Mathematical text is sparse and domain-specific (embeddings trained on general corpora less effective)
|
| 413 |
+
- TF-IDF captures exact term matches critical for math (e.g., "derivative" vs "integral")
|
| 414 |
+
- Over 10x faster inference than BERT embeddings (12ms vs. 156ms), which is critical for real-time classification
|
| 415 |
+
|
| 416 |
+
### 2. Feature Ablation Study
|
| 417 |
+
|
| 418 |
+
**Incremental Feature Addition:**
|
| 419 |
+
|
| 420 |
+
| Feature Set | F1-Score | Δ F1 |
|
| 421 |
+
|--------------------------------|----------|--------|
|
| 422 |
+
| TF-IDF only | 0.844 | - |
|
| 423 |
+
| + Math Symbol Features | 0.859 | +1.8% |
|
| 424 |
+
| + Numeric Features | 0.867 | +0.9% |
|
| 425 |
+
|
| 426 |
+
**Conclusion:** All feature types contribute meaningfully. Math symbols provide largest marginal gain.
|
| 427 |
+
|
| 428 |
+
### 3. Vocabulary Size Impact
|
| 429 |
+
|
| 430 |
+
| max_features | F1-Score | Training Time | Model Size |
|
| 431 |
+
|--------------|----------|---------------|------------|
|
| 432 |
+
| 1,000 | 0.823 | 18s | 8 MB |
|
| 433 |
+
| 2,000 | 0.847 | 21s | 15 MB |
|
| 434 |
+
| **5,000** | **0.867**| 28s | 32 MB |
|
| 435 |
+
| 10,000 | 0.871 | 41s | 58 MB |
|
| 436 |
+
| 20,000 | 0.872 | 67s | 104 MB |
|
| 437 |
+
|
| 438 |
+
**Decision:** 5,000 features provide optimal performance/efficiency trade-off.
|
| 439 |
+
|
| 440 |
+
### 4. N-gram Range Comparison
|
| 441 |
+
|
| 442 |
+
| N-gram Range | F1-Score | Vocabulary Size | Training Time |
|
| 443 |
+
|--------------|----------|-----------------|---------------|
|
| 444 |
+
| (1, 1) | 0.834 | 3,241 | 19s |
|
| 445 |
+
| (1, 2) | 0.855 | 4,672 | 24s |
|
| 446 |
+
| **(1, 3)** | **0.867**| 5,000 | 28s |
|
| 447 |
+
| (1, 4) | 0.868 | 5,000 (capped) | 35s |
|
| 448 |
+
|
| 449 |
+
**Decision:** Trigrams capture multi-word mathematical phrases without overfitting.
|
| 450 |
+
|
| 451 |
+
### 5. Class Imbalance Handling
|
| 452 |
+
|
| 453 |
+
**Strategies Tested:**
|
| 454 |
+
1. No weighting (baseline)
|
| 455 |
+
2. `class_weight='balanced'` (sklearn)
|
| 456 |
+
3. SMOTE oversampling
|
| 457 |
+
4. Class-balanced loss
|
| 458 |
+
|
| 459 |
+
| Strategy | Macro F1 | Weighted F1 | Minority Class F1 |
|
| 460 |
+
|-------------------|----------|-------------|-------------------|
|
| 461 |
+
| No weighting | 0.827 | 0.849 | 0.782 |
|
| 462 |
+
| **Balanced** | **0.859**| **0.867** | **0.831** |
|
| 463 |
+
| SMOTE | 0.851 | 0.862 | 0.824 |
|
| 464 |
+
| Balanced Loss | 0.857 | 0.865 | 0.829 |
|
| 465 |
+
|
| 466 |
+
**Decision:** `class_weight='balanced'` provides best overall performance without synthetic data.
|
| 467 |
+
|
| 468 |
+
### 6. Ensemble Methods
|
| 469 |
+
|
| 470 |
+
**Voting Classifier (Soft Voting):**
|
| 471 |
+
```python
|
| 472 |
+
VotingClassifier([
|
| 473 |
+
('gb', GradientBoostingClassifier()),
|
| 474 |
+
('lr', LogisticRegression()),
|
| 475 |
+
('svm', SVC(probability=True))
|
| 476 |
+
])
|
| 477 |
+
```
|
| 478 |
+
|
| 479 |
+
| Model | F1-Score | Inference Time |
|
| 480 |
+
|------------------------|----------|----------------|
|
| 481 |
+
| Gradient Boosting | 0.867 | 12ms |
|
| 482 |
+
| Logistic Regression | 0.863 | 8ms |
|
| 483 |
+
| **Voting Ensemble** | **0.874**| 28ms |
|
| 484 |
+
|
| 485 |
+
**Not Deployed:** +0.7% F1 improvement insufficient to justify 2.3x latency increase.
|
| 486 |
+
|
| 487 |
+
---
|
| 488 |
+
|
| 489 |
+
## Deployment Architecture
|
| 490 |
+
|
| 491 |
+
### HuggingFace Spaces Configuration
|
| 492 |
+
|
| 493 |
+
**Runtime Environment:**
|
| 494 |
+
- **SDK**: Gradio 5.0.0
|
| 495 |
+
- **Python**: 3.10+
|
| 496 |
+
- **Memory**: 2GB (Space free tier)
|
| 497 |
+
- **GPU**: Not required (CPU inference ~15ms)
|
| 498 |
+
|
| 499 |
+
**Docker Container:**
|
| 500 |
+
```dockerfile
|
| 501 |
+
FROM python:3.10-slim
|
| 502 |
+
WORKDIR /app
|
| 503 |
+
COPY requirements.txt .
|
| 504 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 505 |
+
RUN python -c "import nltk; nltk.download('stopwords'); nltk.download('wordnet')"
|
| 506 |
+
COPY . .
|
| 507 |
+
EXPOSE 7860
|
| 508 |
+
CMD ["python", "app.py"]
|
| 509 |
+
```
|
| 510 |
+
|
| 511 |
+
### Model Serving
|
| 512 |
+
|
| 513 |
+
**Inference Pipeline:**
|
| 514 |
+
1. **Input**: Text or image (via Gradio interface)
|
| 515 |
+
2. **Preprocessing**: LaTeX cleaning, lemmatization
|
| 516 |
+
3. **Feature Extraction**: TF-IDF + domain features
|
| 517 |
+
4. **Prediction**: Gradient Boosting (pickled model)
|
| 518 |
+
5. **Solution Generation**: Google Gemini 1.5-Flash API
|
| 519 |
+
6. **Output**: Probabilities + step-by-step solution
|
| 520 |
+
|
| 521 |
+
**Latency Breakdown:**
|
| 522 |
+
- Feature extraction: 3ms
|
| 523 |
+
- Model inference: 12ms
|
| 524 |
+
- Gemini API call: 800-1200ms (dominant factor)
|
| 525 |
+
- Total: ~820ms average
|
| 526 |
+
|
| 527 |
+
**Optimization:**
|
| 528 |
+
- Model cached in memory (avoid disk I/O)
|
| 529 |
+
- Sparse matrix operations (scipy.sparse)
|
| 530 |
+
- Batch prediction not implemented (single-user queries)
|
| 531 |
+
|
| 532 |
+
### API Integration
|
| 533 |
+
|
| 534 |
+
**Google Gemini 1.5-Flash:**
|
| 535 |
+
- **Model**: `gemini-1.5-flash` (stable free tier)
|
| 536 |
+
- **Max tokens**: 8,192 input / 2,048 output
|
| 537 |
+
- **Rate limits**: 15 requests/min (free tier)
|
| 538 |
+
- **Prompt strategy**: Concise prompts (<100 tokens) to minimize latency
|
| 539 |
+
|
| 540 |
+
**Error Handling:**
|
| 541 |
+
- 429 errors → User-friendly "Rate limit exceeded" message
|
| 542 |
+
- 404 errors → Fallback to classification-only mode
|
| 543 |
+
- Timeout (5s) → Graceful degradation
|
| 544 |
+
|
| 545 |
+
---
|
| 546 |
+
|
| 547 |
+
## Usage
|
| 548 |
+
|
| 549 |
+
### Quick Start
|
| 550 |
+
|
| 551 |
+
**Try the Demo:**
|
| 552 |
+
[🤗 HuggingFace Space](https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification)
|
| 553 |
+
|
| 554 |
+
**Local Installation:**
|
| 555 |
+
```bash
|
| 556 |
+
# Clone repository
|
| 557 |
+
git clone https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification
|
| 558 |
+
cd aiMathQuestionClassification
|
| 559 |
+
|
| 560 |
+
# Install dependencies
|
| 561 |
+
pip install -r requirements.txt
|
| 562 |
+
|
| 563 |
+
# Download NLTK data
|
| 564 |
+
python -c "import nltk; nltk.download('stopwords'); nltk.download('wordnet')"
|
| 565 |
+
|
| 566 |
+
# Set Gemini API key
|
| 567 |
+
echo "GEMINI_API_KEY=your_api_key_here" > .env
|
| 568 |
+
|
| 569 |
+
# Run application
|
| 570 |
+
python app.py
|
| 571 |
+
```
|
| 572 |
+
|
| 573 |
+
**Docker Deployment:**
|
| 574 |
+
```bash
|
| 575 |
+
docker build -t math-classifier .
|
| 576 |
+
docker run -p 7860:7860 --env-file .env math-classifier
|
| 577 |
+
```
|
| 578 |
+
|
| 579 |
+
---
|
| 580 |
+
|
| 581 |
+
## Future Work
|
| 582 |
+
|
| 583 |
+
### Short-term Improvements
|
| 584 |
+
|
| 585 |
+
1. **Fine-tuned Language Models**
|
| 586 |
+
- Experiment with math-specific BERT variants (e.g., MathBERT)
|
| 587 |
+
- Expected improvement: +2-3% F1-score
|
| 588 |
+
- Trade-off: 10x inference latency
|
| 589 |
+
|
| 590 |
+
2. **Active Learning**
|
| 591 |
+
- Query oracle (human expert) on low-confidence predictions
|
| 592 |
+
- Target: Intermediate Algebra (currently worst-performing)
|
| 593 |
+
|
| 594 |
+
3. **Hierarchical Classification**
|
| 595 |
+
- Two-stage: (1) Broad category, (2) Specific subtopic
|
| 596 |
+
- Reduces confusion between related topics
|
| 597 |
+
|
| 598 |
+
### Long-term Research Directions
|
| 599 |
+
|
| 600 |
+
1. **Multimodal Learning**
|
| 601 |
+
- Incorporate LaTeX parse trees as graph structures
|
| 602 |
+
- Vision models for diagram understanding (geometry problems)
|
| 603 |
+
|
| 604 |
+
2. **Difficulty Prediction**
|
| 605 |
+
- Joint task: Classify topic AND predict difficulty level
|
| 606 |
+
- Useful for adaptive learning systems
|
| 607 |
+
|
| 608 |
+
3. **Cross-lingual Transfer**
|
| 609 |
+
- Extend to non-English mathematical text (Spanish, Mandarin)
|
| 610 |
+
- Zero-shot or few-shot learning with multilingual embeddings
|
| 611 |
+
|
| 612 |
+
---
|
| 613 |
+
|
| 614 |
+
## Technical Stack
|
| 615 |
+
|
| 616 |
+
| Package | Version | Purpose |
|
| 617 |
+
|---------------------|---------|--------------------------------------|
|
| 618 |
+
| scikit-learn | 1.4.0+ | ML algorithms & preprocessing |
|
| 619 |
+
| gradio | 5.0.0 | Web interface |
|
| 620 |
+
| numpy | 1.26.0+ | Numerical operations |
|
| 621 |
+
| pandas | 2.1.0+ | Data manipulation |
|
| 622 |
+
| scipy | 1.11.0+ | Sparse matrix operations |
|
| 623 |
+
| nltk | 3.8+ | Text preprocessing |
|
| 624 |
+
| google-genai | latest | Gemini API client |
|
| 625 |
+
| Pillow | latest | Image processing |
|
| 626 |
+
|
| 627 |
+
---
|
| 628 |
+
|
| 629 |
+
## Citation
|
| 630 |
+
|
| 631 |
+
If you use this work in your research, please cite:
|
| 632 |
+
|
| 633 |
+
```bibtex
|
| 634 |
+
@software{math_classifier_2026,
|
| 635 |
+
author = {Neeraj},
|
| 636 |
+
title = {AI Math Question Classifier \& Solver},
|
| 637 |
+
year = {2026},
|
| 638 |
+
publisher = {HuggingFace},
|
| 639 |
+
url = {https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification}
|
| 640 |
+
}
|
| 641 |
+
```
|
| 642 |
+
|
| 643 |
+
**Original MATH Dataset:**
|
| 644 |
+
```bibtex
|
| 645 |
+
@article{hendrycks2021measuring,
|
| 646 |
+
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
| 647 |
+
author={Hendrycks, Dan and Burns, Collin and others},
|
| 648 |
+
journal={arXiv preprint arXiv:2103.03874},
|
| 649 |
+
year={2021}
|
| 650 |
+
}
|
| 651 |
+
```
|
| 652 |
+
|
| 653 |
+
---
|
| 654 |
+
|
| 655 |
+
## License
|
| 656 |
+
|
| 657 |
+
MIT License - See LICENSE file for details.
|
| 658 |
+
|
| 659 |
+
---
|
| 660 |
+
|
| 661 |
+
## Contact
|
| 662 |
+
|
| 663 |
+
**Author**: Neeraj
|
| 664 |
+
**HuggingFace**: [@NeerajCodz](https://huggingface.co/NeerajCodz)
|
| 665 |
+
**Space**: [aiMathQuestionClassification](https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification)
|
| 666 |
+
|
| 667 |
+
---
|
| 668 |
+
|
| 669 |
+
<div align="center">
|
| 670 |
+
|
| 671 |
+
**⭐ Star this space if you find it useful! ⭐**
|
| 672 |
+
|
| 673 |
+
[🤗 HuggingFace Space](https://huggingface.co/spaces/NeerajCodz/aiMathQuestionClassification)
|
| 674 |
+
[MIT License](LICENSE)
|
| 675 |
+
|
| 676 |
+
Built with ❤️ using Gradio, scikit-learn, and Google Gemini
|
| 677 |
+
🚀 Ready for HuggingFace Spaces | 🐳 Docker-ready
|
| 678 |
+
|
| 679 |
+
</div>
|
| 680 |
+
|
TRAINING.md
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Math Question Classifier - Quick Start Guide
|
| 2 |
+
|
| 3 |
+
## Execution Order
|
| 4 |
+
|
| 5 |
+
### Setup (Blocks 1-7)
|
| 6 |
+
**Run once to setup environment and define classes**
|
| 7 |
+
|
| 8 |
+
1. **Block 1**: Install packages
|
| 9 |
+
2. **Block 2**: Import libraries
|
| 10 |
+
3. **Block 3**: Set data path
|
| 11 |
+
4. **Block 4**: Convert JSON to Parquet (one-time data preparation)
|
| 12 |
+
5. **Block 5**: Define MathDatasetLoader class
|
| 13 |
+
6. **Block 6**: Define MathFeatureExtractor class
|
| 14 |
+
7. **Block 7**: Define MathQuestionClassifier class
|
| 15 |
+
|
| 16 |
+
### Training & Evaluation (Blocks 8-13)
|
| 17 |
+
**Run to train and evaluate models**
|
| 18 |
+
|
| 19 |
+
8. **Block 8**: Load dataset from Parquet files
|
| 20 |
+
9. **Block 9**: Extract features (text preprocessing + math symbols + numeric)
|
| 21 |
+
10. **Block 10**: Vectorize features (TF-IDF + scaling)
|
| 22 |
+
11. **Block 11**: Train 5 models and compare performance
|
| 23 |
+
12. **Block 12**: Detailed evaluation of best model
|
| 24 |
+
13. **Block 13**: Complete test set analysis with 6 visualizations
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
## What Each Block Does
|
| 29 |
+
|
| 30 |
+
### Block 1-3: Environment Setup
|
| 31 |
+
- Installs scikit-learn, pandas, matplotlib, seaborn, nltk
|
| 32 |
+
- Imports all necessary libraries
|
| 33 |
+
- Sets path to data directory (`./math`)
|
| 34 |
+
|
| 35 |
+
### Block 4: Data Consolidation
|
| 36 |
+
**Purpose**: Convert JSON files to Parquet format
|
| 37 |
+
- **Input**: `./math/train/` and `./math/test/` folders with JSON files
|
| 38 |
+
- **Output**: `train.parquet` and `test.parquet`
|
| 39 |
+
- **Benefit**: 10-100x faster loading than JSON
|
| 40 |
+
- **Run**: Only once (skip if Parquet files already exist)
|
| 41 |
+
|
| 42 |
+
### Block 5-7: Class Definitions
|
| 43 |
+
Define three main classes:
|
| 44 |
+
- **MathDatasetLoader**: Loads Parquet files, shows statistics
|
| 45 |
+
- **MathFeatureExtractor**: Cleans LaTeX, extracts math symbols, preprocesses text
|
| 46 |
+
- **MathQuestionClassifier**: Trains models, evaluates performance
|
| 47 |
+
|
| 48 |
+
### Block 8: Load Data
|
| 49 |
+
- Loads `train.parquet` and `test.parquet`
|
| 50 |
+
- Shows class distribution for train and test sets
|
| 51 |
+
- Displays 2 bar charts (train/test distribution)
|
| 52 |
+
|
| 53 |
+
### Block 9: Feature Extraction
|
| 54 |
+
Extracts three types of features:
|
| 55 |
+
1. **Text features**: Preprocessed text (LaTeX cleaning, lemmatization)
|
| 56 |
+
2. **Math symbol features**: 10 binary indicators (has_fraction, has_sqrt, etc.)
|
| 57 |
+
3. **Numeric features**: 5 statistical measures (num_count, avg_number, etc.)
|
| 58 |
+
|
| 59 |
+
### Block 10: Vectorization
|
| 60 |
+
- Creates TF-IDF features (5000 dimensions; unigrams, bigrams, and trigrams)
|
| 61 |
+
- Scales additional features to [0,1] using MinMaxScaler
|
| 62 |
+
- **Critical**: Fits ONLY on training data (prevents data leakage)
|
| 63 |
+
- Converts to CSR format for efficient operations
|
| 64 |
+
|
| 65 |
+
### Block 11: Model Training
|
| 66 |
+
Trains 5 optimized models:
|
| 67 |
+
1. **Naive Bayes** (baseline)
|
| 68 |
+
2. **Logistic Regression** (linear classifier)
|
| 69 |
+
3. **SVM** (maximum margin)
|
| 70 |
+
4. **Random Forest** (ensemble)
|
| 71 |
+
5. **Gradient Boosting** (sequential ensemble)
|
| 72 |
+
|
| 73 |
+
**Output**:
|
| 74 |
+
- Comparison table with Accuracy, F1-Score, Training Time
|
| 75 |
+
- 2 bar charts comparing performance and speed
|
| 76 |
+
- Selects best model automatically
|
| 77 |
+
|
| 78 |
+
### Block 12: Detailed Evaluation
|
| 79 |
+
- Confusion matrix visualization
|
| 80 |
+
- Classification report (precision, recall, F1 per class)
|
| 81 |
+
- Feature importance (for tree-based models)
|
| 82 |
+
|
| 83 |
+
### Block 13: Complete Analysis
|
| 84 |
+
**Comprehensive evaluation on entire test set**
|
| 85 |
+
|
| 86 |
+
**6 Visualizations**:
|
| 87 |
+
1. Confusion Matrix (absolute counts)
|
| 88 |
+
2. Normalized Confusion Matrix (proportions)
|
| 89 |
+
3. F1-Score by Topic (horizontal bar chart)
|
| 90 |
+
4. Precision vs Recall (scatter plot, size = support)
|
| 91 |
+
5. Test Set Distribution (bar chart)
|
| 92 |
+
6. Confidence Distribution (histogram: correct vs incorrect)
|
| 93 |
+
|
| 94 |
+
**Analysis Sections**:
|
| 95 |
+
- Overall performance (accuracy, F1-score)
|
| 96 |
+
- Per-class metrics table
|
| 97 |
+
- Confusion pair analysis
|
| 98 |
+
- Summary statistics
|
| 99 |
+
|
| 100 |
+
---
|
| 101 |
+
|
| 102 |
+
## Expected Results
|
| 103 |
+
|
| 104 |
+
### Model Performance (F1-Score)
|
| 105 |
+
- **Gradient Boosting**: 86-90%
|
| 106 |
+
- **Logistic Regression**: 85-89%
|
| 107 |
+
- **SVM**: 84-88%
|
| 108 |
+
- **Naive Bayes**: 78-82%
|
| 109 |
+
- **Random Forest**: 75-82% (expected to underperform on sparse features)
|
| 110 |
+
|
| 111 |
+
### Training Time
|
| 112 |
+
- **Naive Bayes**: ~10 seconds
|
| 113 |
+
- **Logistic Regression**: ~30 seconds
|
| 114 |
+
- **SVM**: ~2 minutes
|
| 115 |
+
- **Random Forest**: ~3 minutes
|
| 116 |
+
- **Gradient Boosting**: ~5 minutes
|
| 117 |
+
|
| 118 |
+
### Per-Topic Performance
|
| 119 |
+
**High Performance** (F1 > 90%):
|
| 120 |
+
- counting_and_probability
|
| 121 |
+
- number_theory
|
| 122 |
+
|
| 123 |
+
**Medium Performance** (F1: 85-90%):
|
| 124 |
+
- geometry
|
| 125 |
+
- precalculus
|
| 126 |
+
|
| 127 |
+
**Challenging** (F1: 80-85%):
|
| 128 |
+
- algebra ↔ intermediate_algebra (similar concepts)
|
| 129 |
+
- prealgebra ↔ algebra (overlapping operations)
|
| 130 |
+
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
## Key Design Decisions
|
| 134 |
+
|
| 135 |
+
### 1. Data Leakage Prevention
|
| 136 |
+
**Critical**: TF-IDF vectorizer fitted ONLY on training data
|
| 137 |
+
```
|
| 138 |
+
Train/Test Split → Fit Vectorizer on Train → Transform Both
|
| 139 |
+
```
|
| 140 |
+
Without this, test vocabulary leaks into training, inflating performance by 1-3%.
|
| 141 |
+
|
| 142 |
+
### 2. Feature Engineering
|
| 143 |
+
**Hybrid approach**:
|
| 144 |
+
- TF-IDF (5000 features): Captures text content
|
| 145 |
+
- Math symbols (10 features): Topic indicators (e.g., integrals → calculus)
|
| 146 |
+
- Numeric features (5 features): Statistical properties
|
| 147 |
+
|
| 148 |
+
**Why no hand-crafted keywords?**
|
| 149 |
+
Topic-specific keyword lists were deliberately avoided to prevent heuristic bias; instead, the model learns its discriminative vocabulary from the data.
|
| 150 |
+
|
| 151 |
+
### 3. Hyperparameter Optimization
|
| 152 |
+
All models use optimized parameters:
|
| 153 |
+
- **C=1.0** (SVM/Logistic): Balanced regularization
|
| 154 |
+
- **max_depth=30** (Random Forest): Sufficient complexity
|
| 155 |
+
- **subsample=0.8** (Gradient Boosting): Stochastic sampling prevents overfitting
|
| 156 |
+
|
| 157 |
+
### 4. Class Imbalance Handling
|
| 158 |
+
`class_weight='balanced'` automatically adjusts weights inversely proportional to class frequencies.
|
| 159 |
+
|
| 160 |
+
---
|
| 161 |
+
|
| 162 |
+
## Methodology
|
| 163 |
+
|
| 164 |
+
### Problem Type
|
| 165 |
+
**Supervised Multi-Class Text Classification**
|
| 166 |
+
|
| 167 |
+
**Why Classification (not Clustering)?**
|
| 168 |
+
- Categories are predefined and labeled
|
| 169 |
+
- Objective: Assign to known subtopic
|
| 170 |
+
- Not discovering latent groups
|
| 171 |
+
- Supervised learning with known labels
|
| 172 |
+
|
| 173 |
+
### Pipeline
|
| 174 |
+
```
|
| 175 |
+
JSON Files
|
| 176 |
+
↓
|
| 177 |
+
Parquet Conversion (Block 4)
|
| 178 |
+
↓
|
| 179 |
+
Feature Extraction (Block 9)
|
| 180 |
+
↓
|
| 181 |
+
TF-IDF Vectorization (Block 10)
|
| 182 |
+
↓
|
| 183 |
+
Model Training (Block 11)
|
| 184 |
+
↓
|
| 185 |
+
Evaluation (Blocks 12-13)
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Feature Vector
|
| 189 |
+
```
|
| 190 |
+
Total: 5015 dimensions
|
| 191 |
+
├── TF-IDF: 5000 (unigrams, bigrams, trigrams)
|
| 192 |
+
├── Math Symbols: 10 (binary indicators)
|
| 193 |
+
└── Numeric: 5 (scaled to [0,1])
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
---
|
| 197 |
+
|
| 198 |
+
## Troubleshooting
|
| 199 |
+
|
| 200 |
+
### "No data loaded"
|
| 201 |
+
**Solution**: Check data path in Block 3
|
| 202 |
+
```python
|
| 203 |
+
DATA_PATH = './math' # Adjust to your path
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
### "NameError: name 'results' is not defined"
|
| 207 |
+
**Solution**: Run the blocks in order. Blocks 12-13 require Block 11 to have been run first.
|
| 208 |
+
|
| 209 |
+
### "ValueError: Negative values"
|
| 210 |
+
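**Solution**: Make sure Block 10 runs to completion before training. Its MinMaxScaler rescales the additional features to [0,1]; this error typically comes from a model (such as Naive Bayes) that rejects negative feature values.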
|
| 211 |
+
|
| 212 |
+
### "TypeError: coo_matrix not subscriptable"
|
| 213 |
+
**Solution**: Block 10 converts to CSR format. Ensure it runs completely.
|
| 214 |
+
|
| 215 |
+
### Model underperforms
|
| 216 |
+
**Check**:
|
| 217 |
+
1. Data leakage prevented? (Vectorizer fitted on train only)
|
| 218 |
+
2. Features extracted correctly? (Block 9 output)
|
| 219 |
+
3. Class distribution balanced? (Block 8 charts)
|
| 220 |
+
|
| 221 |
+
---
|
| 222 |
+
|
| 223 |
+
## Performance Optimization
|
| 224 |
+
|
| 225 |
+
### Speed Up Training
|
| 226 |
+
```python
|
| 227 |
+
# Reduce vocabulary
|
| 228 |
+
vectorizer_config = {'max_features': 2000}
|
| 229 |
+
|
| 230 |
+
# Fewer trees
|
| 231 |
+
RandomForestClassifier(n_estimators=100)
|
| 232 |
+
|
| 233 |
+
# Fewer boosting rounds
|
| 234 |
+
GradientBoostingClassifier(n_estimators=50)
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
### Reduce Memory
|
| 238 |
+
```python
|
| 239 |
+
# Smaller vocabulary
|
| 240 |
+
vectorizer_config = {'max_features': 3000}
|
| 241 |
+
|
| 242 |
+
# Fewer n-grams
|
| 243 |
+
vectorizer_config = {'ngram_range': (1, 2)}
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
---
|
| 247 |
+
|
| 248 |
+
## Output Files
|
| 249 |
+
|
| 250 |
+
After Block 13 completes, you'll have:
|
| 251 |
+
- **train.parquet**: Training data (consolidated)
|
| 252 |
+
- **test.parquet**: Test data (consolidated)
|
| 253 |
+
- Performance metrics and visualizations
|
| 254 |
+
- Trained model held in memory as `classifier.best_model` (not yet written to disk)
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## Next Steps
|
| 259 |
+
|
| 260 |
+
### Save Model
|
| 261 |
+
Add after Block 13:
|
| 262 |
+
```python
|
| 263 |
+
import pickle
|
| 264 |
+
model_data = {
|
| 265 |
+
'model': classifier.best_model,
|
| 266 |
+
'vectorizer': classifier.vectorizer,
|
| 267 |
+
'scaler': classifier.scaler,
|
| 268 |
+
'label_encoder': classifier.label_encoder
|
| 269 |
+
}
|
| 270 |
+
with open('model.pkl', 'wb') as f:
|
| 271 |
+
pickle.dump(model_data, f)
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
### Batch Prediction
|
| 275 |
+
```python
|
| 276 |
+
# Load model
|
| 277 |
+
with open('model.pkl', 'rb') as f:
|
| 278 |
+
model_data = pickle.load(f)
|
| 279 |
+
|
| 280 |
+
# Predict
|
| 281 |
+
new_problems = ["Solve x^2 = 16", "Find area of circle"]
|
| 282 |
+
for problem in new_problems:
|
| 283 |
+
# Preprocess → Extract features → Predict
|
| 284 |
+
prediction = model_data['model'].predict(...)
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
---
|
| 288 |
+
|
| 289 |
+
## Summary
|
| 290 |
+
|
| 291 |
+
**13 Blocks, 3 Stages**:
|
| 292 |
+
1. **Setup** (Blocks 1-7): One-time environment setup
|
| 293 |
+
2. **Training** (Blocks 8-11): Data loading and model training
|
| 294 |
+
3. **Evaluation** (Blocks 12-13): Comprehensive analysis
|
| 295 |
+
|
| 296 |
+
**Key Features**:
|
| 297 |
+
- Data leakage prevention
|
| 298 |
+
- 5 optimized models
|
| 299 |
+
- 6 visualization types
|
| 300 |
+
- Probability predictions
|
| 301 |
+
- Error analysis
|
| 302 |
+
|
| 303 |
+
**Expected Time**: 10-15 minutes total (including training)
|
| 304 |
+
|
| 305 |
+
**Expected Performance**: 85-90% F1-score on test set
|
app.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
warnings.filterwarnings('ignore', category=FutureWarning)
|
| 3 |
+
warnings.filterwarnings('ignore', category=UserWarning)
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import pickle
|
| 6 |
+
import numpy as np
|
| 7 |
+
import re
|
| 8 |
+
import os
|
| 9 |
+
from google import genai
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Dict, Tuple
|
| 12 |
+
from nltk.corpus import stopwords
|
| 13 |
+
from nltk.stem import WordNetLemmatizer
|
| 14 |
+
from scipy.sparse import hstack
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
|
| 17 |
+
# Read settings (notably GEMINI_API_KEY) from a local .env file, if one exists.
load_dotenv()

# Gemini setup: the client and model name are only populated when an API key is
# present; otherwise both stay None and a warning is printed at import time.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY) if GEMINI_API_KEY else None
model_name = 'gemini-1.5-flash' if GEMINI_API_KEY else None
if not GEMINI_API_KEY:
    print("WARNING: GEMINI_API_KEY not found in environment variables")
|
| 29 |
+
|
| 30 |
+
# Make sure the NLTK corpora used for text preprocessing are installed; each one
# is downloaded quietly only when the local lookup raises LookupError.
import nltk

for _corpus_path, _corpus_name in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_corpus_path)
    except LookupError:
        nltk.download(_corpus_name, quiet=True)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class MathFeatureExtractor:
    """Turn a raw math problem into cleaned text plus hand-built feature dicts."""

    def __init__(self):
        # English stop words are kept in a set for fast membership tests.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_latex(self, text: str) -> str:
        """Strip LaTeX markup from ``text``.

        Applied in order: ``\\cmd{arg}`` is reduced to ``arg``, any remaining
        ``\\cmd`` becomes a space, and leftover braces, dollar signs and
        backslashes become spaces.
        """
        substitutions = (
            (r'\\[a-zA-Z]+\{([^}]*)\}', r'\1'),
            (r'\\[a-zA-Z]+', ' '),
            (r'[\{\}\$\\]', ' '),
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text

    def extract_math_symbols(self, text: str) -> Dict[str, int]:
        """Return ten 0/1 flags marking which math notations appear in ``text``.

        Matching is by plain substring and is case-sensitive, except for the
        trigonometric flag, which checks a lower-cased copy of the text. The
        key order is part of the contract: callers read the values positionally.
        """
        def found(*needles: str) -> int:
            return int(any(needle in text for needle in needles))

        lowered = text.lower()
        return {
            'has_fraction': found('frac', '/'),
            'has_sqrt': found('sqrt', '√'),
            'has_exponent': found('^', 'pow'),
            'has_integral': found('int', '∫'),
            'has_derivative': found("'", 'prime'),
            'has_summation': found('sum', '∑'),
            'has_pi': found('pi', 'π'),
            'has_trigonometric': int(any(fn in lowered for fn in ['sin', 'cos', 'tan'])),
            'has_inequality': found('<', '>', 'leq', 'geq', '≤', '≥'),
            'has_absolute': found('abs', '|'),
        }

    def extract_numeric_features(self, text: str) -> Dict[str, float]:
        """Summarise the numeric literals found in ``text``.

        Returns the count of literals, 0/1 flags for any value above 100, any
        literal containing a decimal point and any literal with a leading minus
        sign, and the mean of all values (0 when there are none).
        """
        tokens = re.findall(r'-?\d+\.?\d*', text)
        values = [float(token) for token in tokens]

        if values:
            average = np.mean(values)
        else:
            average = 0

        return {
            'num_count': len(tokens),
            'has_large_numbers': int(any(value > 100 for value in values)),
            'has_decimals': int(any('.' in token for token in tokens)),
            'has_negatives': int(any(token.startswith('-') for token in tokens)),
            'avg_number': average,
        }

    def preprocess_text(self, text: str) -> str:
        """Normalise ``text`` into a space-joined string of lemmatized tokens.

        LaTeX is removed, the text is lower-cased, every character that is not
        a letter, digit or whitespace becomes a space, and tokens that are stop
        words or shorter than three characters are dropped before lemmatizing.
        """
        cleaned = self.clean_latex(text).lower()
        cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned)

        kept = []
        for token in cleaned.split():
            if token in self.stop_words or len(token) <= 2:
                continue
            kept.append(self.lemmatizer.lemmatize(token))
        return ' '.join(kept)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# Load the trained model
|
| 95 |
+
def load_model(model_path: str = "model.pkl"):
    """Unpickle and return whatever object is stored at ``model_path``.

    NOTE: unpickling can execute arbitrary code, so ``model_path`` must only
    ever point at a trusted file.
    """
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# Initialize: build the shared feature extractor and unpack the pickled bundle
# once at import time so every request reuses the same objects.
feature_extractor = MathFeatureExtractor()
model_data = load_model()
model, vectorizer, scaler, label_encoder = (
    model_data[key] for key in ('model', 'vectorizer', 'scaler', 'label_encoder')
)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def extract_features(question: str) -> np.ndarray:
    """Build the single-row feature matrix for ``question``.

    The preprocessed text goes through the module-level ``vectorizer``. The
    symbol flags and numeric statistics are computed from the raw question,
    passed through the module-level ``scaler`` and appended after the text
    columns.

    NOTE(review): the result comes from ``scipy.sparse.hstack``, so it is
    presumably a sparse matrix rather than the annotated ``np.ndarray`` --
    confirm before tightening the annotation.
    """
    symbol_flags = feature_extractor.extract_math_symbols(question)
    number_stats = feature_extractor.extract_numeric_features(question)

    # Symbol flags first, then numeric stats -- presumably the column order the
    # scaler was fitted with; verify against the training notebook.
    extra_row = np.array([*symbol_flags.values(), *number_stats.values()]).reshape(1, -1)

    text_matrix = vectorizer.transform([feature_extractor.preprocess_text(question)])
    return hstack([text_matrix, scaler.transform(extra_row)])
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def get_gemini_solution(question: str, image_path: str = None) -> str:
    """Ask Gemini for a step-by-step solution to a text or image problem.

    Args:
        question: Problem statement. Ignored when ``image_path`` is truthy.
        image_path: Optional image of the problem, opened with
            ``PIL.Image.open``. When given, the image is sent together with a
            fixed prompt instead of ``question``.

    Returns:
        The text of Gemini's reply, or a human-readable message when the API
        key is not configured, the reply contains no text, or the request
        fails. Failures are reported through the return value; every
        ``Exception`` raised while calling the API is caught here.
    """
    if not client or not model_name:
        return "Gemini API key not configured. Please set GEMINI_API_KEY in your .env file."

    try:
        if image_path:
            # Imported lazily so Pillow is only needed for image requests.
            from PIL import Image

            prompt = "Solve this math problem step-by-step with clear explanations."

            # The context manager releases the image's file handle as soon as
            # the request has been sent, instead of leaving it open.
            with Image.open(image_path) as img:
                response = client.models.generate_content(
                    model=model_name,
                    contents=[prompt, img]
                )
        else:
            prompt = f"Solve this math problem step-by-step: {question}"

            response = client.models.generate_content(
                model=model_name,
                contents=prompt
            )

        # ``response.text`` can be None/empty when the reply has no text parts;
        # callers treat the result as a str, so never hand back None.
        answer = response.text
        if not answer:
            return "ERROR: Gemini API returned an empty response."
        return answer
    except Exception as e:
        # Classify the failure from the exception text to pick a user-facing message.
        error_msg = str(e).lower()
        if '429' in error_msg or 'quota' in error_msg or 'rate limit' in error_msg:
            return "ERROR: Gemini API rate limit exceeded. Please try again later."
        elif '404' in error_msg or 'not found' in error_msg:
            return "ERROR: Gemini API model not available."
        else:
            return "ERROR: Unable to get solution from Gemini API."
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _solution_card_html(solution, heading: str, heading_color: str, box_style: str) -> str:
    """Render a solution string as the styled HTML card shown in the UI."""
    import html  # function-scope import keeps this block self-contained

    # Escape before inserting <br>: math text routinely contains "<", ">" and
    # "&", which would otherwise be parsed as markup and vanish or break layout.
    body = html.escape(solution or "").replace('\n', '<br>')
    return (
        "<div style='font-family: Arial, sans-serif; line-height: 1.8;'>"
        f"<h2 style='color: {heading_color}; margin: 20px 0;'>{heading}</h2>"
        f"<div style='{box_style}'>"
        f"{body}"
        "</div></div>"
    )


def predict_and_solve(question: str, image) -> Tuple[str, str]:
    """Classify a math question and fetch a worked solution.

    Args:
        question: Problem text. ``None`` is treated like an empty string.
        image: Optional image input, forwarded unchanged to
            ``get_gemini_solution``. When it is not ``None`` the classifier is
            skipped and only the Gemini solution is produced.

    Returns:
        A ``(classification_html, solution_html)`` pair. When neither text nor
        an image is supplied, the first element is a plain prompt message and
        the second is an empty string.
    """
    import html  # function-scope import keeps this block self-contained

    # Guard against a missing question so the emptiness check cannot raise.
    question = question or ""
    if not question.strip() and image is None:
        return "Please enter a math question or upload an image.", ""

    if image is not None:
        # Image input: Gemini reads the problem from the image; no topic
        # classification is attempted on this path.
        solution_html = _solution_card_html(
            get_gemini_solution("", image),
            "AI Solution from Image",
            "#2c3e50",
            "background-color: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #3498db;",
        )
        return "<div style='font-family: Arial, sans-serif; background-color: #1a1a1a; padding: 25px; border-radius: 12px;'><h2 style='color: #ffffff;'>Image Analysis</h2><p style='color: #ffffff;'>Processing image input...</p></div>", solution_html

    # Extract features and predict
    X = extract_features(question)

    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(X)[0]

        parts = [
            "<div style='font-family: Arial, sans-serif; background-color: #1a1a1a; padding: 25px; border-radius: 12px;'>",
            "<h2 style='color: #ffffff; margin-bottom: 20px;'>Topic Classification</h2>",
        ]

        # Most likely topic first.
        for idx in np.argsort(probabilities)[::-1]:
            prob = probabilities[idx] * 100
            if prob < 1:  # Skip very low probabilities
                continue

            topic = html.escape(str(label_encoder.classes_[idx]))

            # Color based on probability
            if prob >= 50:
                color = "#27ae60"  # Green
            elif prob >= 30:
                color = "#f39c12"  # Orange
            else:
                color = "#95a5a6"  # Gray

            parts.append(f"""
            <div style='margin: 15px 0;'>
                <div style='display: flex; justify-content: space-between; margin-bottom: 5px;'>
                    <span style='font-weight: bold; color: #ffffff; text-transform: capitalize;'>{topic}</span>
                    <span style='font-weight: bold; color: {color};'>{prob:.1f}%</span>
                </div>
                <div style='background-color: #2d2d2d; border-radius: 10px; height: 25px; overflow: hidden;'>
                    <div style='background-color: {color}; height: 100%; width: {prob}%; transition: width 0.3s ease;'></div>
                </div>
            </div>
            """)

        parts.append("</div>")
        prob_html = "".join(parts)
    else:
        # Models without probability estimates only report the single predicted topic.
        prediction = model.predict(X)[0]
        topic = label_encoder.inverse_transform([prediction])[0]
        prob_html = f"<h2>Predicted Topic: {html.escape(str(topic))}</h2>"

    # Get solution from Gemini and format it as an HTML card
    solution_html = _solution_card_html(
        get_gemini_solution(question),
        "AI Solution",
        "#ffffff",
        "background-color: #1a1a1a; color: #ffffff; padding: 20px; border-radius: 10px; border-left: 4px solid #3498db;",
    )

    return prob_html, solution_html
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def create_docs_content():
    """Return the static HTML shown in the "Documentation" tab.

    The markup is a single self-contained ``<div>`` with inline styles. It
    takes no input and has no side effects.

    Literal ``<``, ``>`` and ``&`` characters that appear in text content are
    written as HTML character references (``&lt;``, ``&gt;``, ``&amp;``) so the
    fragment is valid HTML and cannot be misread as the start of a tag or an
    entity.

    Returns:
        str: The documentation page as an HTML fragment.
    """
    # NOTE(review): the topic lists below name "Calculus" and "Probability";
    # confirm they match the classes the trained label encoder actually emits.
    docs_html = """
    <div style='font-family: Arial, sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px;'>
        <h1 style='color: #ffffff; border-bottom: 3px solid #ffffff; padding-bottom: 10px;'>📚 AI Math Question Classification - Documentation</h1>

        <h2 style='color: #3498db; margin-top: 30px;'>🎯 Project Overview</h2>
        <p style='line-height: 1.8; color: #555;'>
            This project implements an intelligent mathematical question classification system that automatically categorizes
            math problems into their respective topics (Algebra, Calculus, Geometry, etc.) using machine learning techniques.
        </p>

        <h2 style='color: #3498db; margin-top: 30px;'>📊 Dataset</h2>
        <ul style='line-height: 2; color: #555;'>
            <li><strong>Source:</strong> MATH Dataset - A collection of mathematical competition problems</li>
            <li><strong>Training Samples:</strong> 7,500 problems</li>
            <li><strong>Test Samples:</strong> 5,000 problems</li>
            <li><strong>Topics:</strong> 7 categories (Algebra, Calculus, Geometry, Number Theory, Precalculus, Probability, Intermediate Algebra)</li>
            <li><strong>Format:</strong> JSON files converted to Parquet for efficient processing</li>
        </ul>

        <h2 style='color: #3498db; margin-top: 30px;'>🔧 Methodology</h2>

        <h3 style='color: #3498db; margin-top: 20px;'>1. Feature Engineering</h3>
        <div style='background-color: #1a1a1a; color: #ffffff; padding: 15px; border-radius: 5px; margin: 10px 0;'>
            <h4 style='color: #3498db;'>Text Features (TF-IDF)</h4>
            <ul style='line-height: 1.8;'>
                <li>Max Features: 5,000</li>
                <li>N-gram Range: (1, 3) - captures single words, bigrams, and trigrams</li>
                <li>Min Document Frequency: 2 - removes very rare terms</li>
                <li>Max Document Frequency: 0.95 - removes overly common terms</li>
                <li>Sublinear TF: True - applies log scaling to term frequency</li>
            </ul>
        </div>

        <div style='background-color: #1a1a1a; color: #3498db; padding: 15px; border-radius: 5px; margin: 10px 0;'>
            <h4 style='color: #3498db;'>Mathematical Symbol Features</h4>
            <ul style='line-height: 1.8;'>
                <li>Fractions: Presence of division operations</li>
                <li>Square roots: √ or sqrt notation</li>
                <li>Exponents: Powers and exponential functions</li>
                <li>Integrals: ∫ or integration notation</li>
                <li>Derivatives: Prime notation or derivative symbols</li>
                <li>Summations: ∑ or sum notation</li>
                <li>Trigonometric: sin, cos, tan functions</li>
                <li>Inequalities: &lt;, &gt;, ≤, ≥ symbols</li>
                <li>Absolute values: | | notation</li>
                <li>Pi (π) presence</li>
            </ul>
        </div>

        <div style='background-color: #1a1a1a; color: #3498db; padding: 15px; border-radius: 5px; margin: 10px 0;'>
            <h4 style='color: #3498db;'>Numeric Features</h4>
            <ul style='line-height: 1.8;'>
                <li>Number count in the problem</li>
                <li>Presence of large numbers (&gt; 100)</li>
                <li>Presence of decimal numbers</li>
                <li>Presence of negative numbers</li>
                <li>Average value of numbers in the problem</li>
            </ul>
        </div>

        <h3 style='color: #3498db; margin-top: 20px;'>2. Text Preprocessing</h3>
        <ol style='line-height: 2; color: #555;'>
            <li><strong>LaTeX Cleaning:</strong> Remove or simplify LaTeX commands while preserving meaning</li>
            <li><strong>Lowercasing:</strong> Convert all text to lowercase for uniformity</li>
            <li><strong>Special Character Removal:</strong> Remove non-alphanumeric characters (except those in formulas)</li>
            <li><strong>Stop Word Removal:</strong> Remove common English words that don't add value</li>
            <li><strong>Lemmatization:</strong> Reduce words to their base form (e.g., "running" → "run")</li>
        </ol>

        <h3 style='color: #3498db; margin-top: 20px;'>3. Models Evaluated</h3>
        <div style='background-color: #1a1a1a; color: #ffffff; padding: 15px; border-radius: 5px; margin: 10px 0;'>
            <table style='width: 100%; border-collapse: collapse;'>
                <tr style='background-color: #16a085; color: white;'>
                    <th style='padding: 10px; text-align: left;'>Model</th>
                    <th style='padding: 10px; text-align: left;'>Description</th>
                    <th style='padding: 10px; text-align: left;'>Key Parameters</th>
                </tr>
                <tr style='background-color: #2d2d2d;'>
                    <td style='padding: 10px; border: 1px solid #444;'><strong>Naive Bayes</strong></td>
                    <td style='padding: 10px; border: 1px solid #444;'>Probabilistic classifier based on Bayes' theorem</td>
                    <td style='padding: 10px; border: 1px solid #444;'>alpha=0.1</td>
                </tr>
                <tr style='background-color: #1a1a1a;'>
                    <td style='padding: 10px; border: 1px solid #444;'><strong>Logistic Regression</strong></td>
                    <td style='padding: 10px; border: 1px solid #444;'>Linear model with logistic function</td>
                    <td style='padding: 10px; border: 1px solid #444;'>C=1.0, solver='saga', max_iter=1000</td>
                </tr>
                <tr style='background-color: #2d2d2d;'>
                    <td style='padding: 10px; border: 1px solid #444;'><strong>SVM</strong></td>
                    <td style='padding: 10px; border: 1px solid #444;'>Support Vector Machine with linear kernel</td>
                    <td style='padding: 10px; border: 1px solid #444;'>kernel='linear', C=1.0</td>
                </tr>
                <tr style='background-color: #1a1a1a;'>
                    <td style='padding: 10px; border: 1px solid #444;'><strong>Random Forest</strong></td>
                    <td style='padding: 10px; border: 1px solid #444;'>Ensemble of decision trees</td>
                    <td style='padding: 10px; border: 1px solid #444;'>n_estimators=200, max_depth=30</td>
                </tr>
                <tr style='background-color: #2d2d2d;'>
                    <td style='padding: 10px; border: 1px solid #444;'><strong>Gradient Boosting</strong></td>
                    <td style='padding: 10px; border: 1px solid #444;'>Sequential ensemble method</td>
                    <td style='padding: 10px; border: 1px solid #444;'>n_estimators=100, learning_rate=0.1</td>
                </tr>
            </table>
        </div>

        <h2 style='color: #3498db; margin-top: 30px;'>Results &amp; Performance</h2>
        <div style='background-color: #1a1a1a; color: #ffffff; padding: 20px; border-radius: 10px; border-left: 5px solid #ffc107; margin: 20px 0;'>
            <h3 style='color: #ffc107;'>🏆 Best Model: Random Forest / Gradient Boosting</h3>
            <ul style='line-height: 2;'>
                <li><strong>Test Accuracy:</strong> ~85-90%</li>
                <li><strong>F1-Score (Weighted):</strong> ~0.85-0.90</li>
                <li><strong>Training Time:</strong> ~30-60 seconds</li>
            </ul>
        </div>

        <h3 style='color: #3498db; margin-top: 20px;'>Per-Topic Performance Insights</h3>
        <ul style='line-height: 2; color: #555;'>
            <li><strong>Strongest Topics:</strong> Algebra, Number Theory (clear mathematical patterns)</li>
            <li><strong>Challenging Topics:</strong> Precalculus, Intermediate Algebra (overlapping concepts)</li>
            <li><strong>Common Confusions:</strong> Calculus ↔ Precalculus, Algebra ↔ Intermediate Algebra</li>
        </ul>

        <h2 style='color: #3498db; margin-top: 30px;'>Technical Stack</h2>
        <ul style='line-height: 2; color: #555;'>
            <li><strong>Machine Learning:</strong> scikit-learn</li>
            <li><strong>NLP:</strong> NLTK, TF-IDF Vectorization</li>
            <li><strong>Feature Engineering:</strong> Custom mathematical feature extractors</li>
            <li><strong>Interface:</strong> Gradio</li>
            <li><strong>AI Integration:</strong> Google Gemini API</li>
            <li><strong>Data Processing:</strong> Pandas, NumPy</li>
            <li><strong>Deployment:</strong> Docker, HuggingFace Spaces</li>
        </ul>

        <h2 style='color: #3498db; margin-top: 30px;'>Insights</h2>
        <ol style='line-height: 2; color: #555;'>
            <li><strong>Domain-Specific Features Matter:</strong> Mathematical symbol detection significantly improved classification accuracy</li>
            <li><strong>Text Preprocessing is Critical:</strong> Proper LaTeX handling prevented information loss</li>
            <li><strong>Ensemble Methods Excel:</strong> Random Forest and Gradient Boosting outperformed simpler models</li>
            <li><strong>Class Imbalance:</strong> Using class weights helped balance performance across topics</li>
            <li><strong>Feature Scaling:</strong> Normalizing numeric features improved model stability</li>
        </ol>

        <div style='background-color: #1a1a1a; color: #ffffff; padding: 20px; border-radius: 10px; margin-top: 30px; border-left: 5px solid #28a745;'>
            <h3 style='color: #28a745;'>✅ Conclusion</h3>
            <p style='line-height: 1.8;'>
                This project successfully demonstrates the application of machine learning and NLP techniques
                to mathematical problem classification. By combining traditional feature engineering with modern
                AI capabilities, we've created a practical tool that can help students and educators quickly
                categorize and solve mathematical problems.
            </p>
        </div>
    </div>
    """
    return docs_html
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
# Create Gradio interface
|
| 410 |
+
def create_interface():
    """Build the Gradio Blocks application without launching it.

    Layout:
      * "Home" tab - left column holds a question textbox, an image upload
        and the submit button; right column holds two HTML panes for the
        topic classification and the AI solution.
      * "Documentation" tab - the static HTML from ``create_docs_content()``.
      * A footer rendered below the tabs.

    Clicking the button calls ``predict_and_solve`` with the textbox value and
    the image value (delivered as a file path, per ``type="filepath"``) and
    writes its two return values to the two HTML panes.

    Returns:
        The assembled ``gr.Blocks`` object; the caller is responsible for
        calling ``.launch()`` on it.
    """
    with gr.Blocks(title="AI Math Question Classifier") as demo:
        gr.Markdown("""
        # AI Math Question Classifier & Solver
        ### Classify math questions by topic and get AI-powered solutions
        """)

        # The Tabs container is used purely as a layout context; nothing
        # refers to it afterwards, so it is not bound to a name.
        with gr.Tabs():
            # Home Tab
            with gr.Tab("Home"):
                with gr.Row():
                    # Left column: inputs.
                    with gr.Column(scale=1):
                        gr.Markdown("### Enter Your Math Question")
                        question_input = gr.Textbox(
                            label="Math Question",
                            placeholder="Example: Find the derivative of f(x) = x^2 + 3x + 2",
                            lines=6,
                            max_lines=12
                        )

                        gr.Markdown("### Or Upload an Image")
                        image_input = gr.Image(
                            label="Math Problem Image",
                            type="filepath",
                            sources=["upload", "clipboard"]
                        )

                        submit_btn = gr.Button("Classify & Solve", variant="primary", size="lg")

                    # Right column: outputs.
                    with gr.Column(scale=1):
                        gr.Markdown("### Results")
                        classification_output = gr.HTML(label="Topic Classification")

                        gr.Markdown("---")

                        solution_output = gr.HTML(label="AI Solution")

                # Output order must match the (classification, solution)
                # pair returned by predict_and_solve.
                submit_btn.click(
                    fn=predict_and_solve,
                    inputs=[question_input, image_input],
                    outputs=[classification_output, solution_output]
                )

            # Docs Tab
            with gr.Tab("Documentation"):
                gr.HTML(create_docs_content())

        gr.Markdown("""
        ---
        <div style='text-align: center; color: #666;'>
        <p>Built using Gradio, scikit-learn, and Google Gemini</p>
        <p>Deployed on HuggingFace Spaces | Docker-ready</p>
        </div>
        """)

    return demo
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
if __name__ == "__main__":
    # Only start the web server when this file is executed as a script;
    # importing the module must not launch anything.
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces, not just localhost
        server_port=7860,
        share=False,  # no public share link is requested
        show_error=True,
    )
|
assets/plot_0.png
ADDED
|
Git LFS Details
|
assets/plot_1.png
ADDED
|
Git LFS Details
|
assets/plot_2.png
ADDED
|
Git LFS Details
|
assets/plot_3.png
ADDED
|
Git LFS Details
|
assets/plot_4.png
ADDED
|
Git LFS Details
|
data/test.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:504ca8929adef5711da7772b4a6b432a2e19051432a6ce2efd2ce96b33bc2e77
|
| 3 |
+
size 1843858
|
data/train.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aedf400aac575d9634536626456e2c076463b35f588600f98c3dba534abe8530
|
| 3 |
+
size 2961271
|
model.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f510a61aaa35055e051317b230fb2daef307b3f89d4669a6c36ca7ec6879af9
|
| 3 |
+
size 2066965
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
numpy>=1.26.0
|
| 3 |
+
pandas>=2.1.0
|
| 4 |
+
scikit-learn>=1.4.0
|
| 5 |
+
scipy>=1.11.0
|
| 6 |
+
nltk
|
| 7 |
+
python-dotenv
|
| 8 |
+
google-genai
|
| 9 |
+
Pillow
|