mohakapoor committed on
Commit
ada63c0
·
0 Parent(s):

Initial project setup on Dev branch

Browse files
Files changed (7) hide show
  1. .gitattributes +14 -0
  2. .gitignore +147 -0
  3. README.md +143 -0
  4. src/collate.py +34 -0
  5. src/config.py +20 -0
  6. src/data.py +63 -0
  7. src/vocab.py +35 -0
.gitattributes ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Route large binary artifacts through Git LFS.
# A bare ".pth" only matches a file literally named ".pth"; the "*" wildcard
# is required to match by extension.

# Model weights and checkpoints
*.pth filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
checkpoints/** filter=lfs diff=lfs merge=lfs -text

# Images (a pattern without a slash already matches at any depth; the
# "**/" variants are kept for explicitness)
*.png filter=lfs diff=lfs merge=lfs -text
**/*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
**/*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
**/*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
**/*.gif filter=lfs diff=lfs merge=lfs -text

# Metrics output
Metrics/** filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```bash
2
+ #!/usr/bin/env bash
3
+ # Create a .gitignore that keeps the Dataset folder but ignores its contents,
4
+ # plus common Python/ML ignores. Run this from your repo root.
5
+
6
+ set -e
7
+
8
+ cat > .gitignore << 'EOF'
9
# Keep the Dataset folder structure but ignore its contents.
# Git cannot re-include a path whose parent directory is excluded, so the
# contents are ignored ("Dataset/**") instead of the directory ("Dataset/").
# Order matters (last match wins): ignore everything, then re-include
# sub-directories and the .gitkeep placeholders inside them.
Dataset/**
!Dataset/**/
!Dataset/**/.gitkeep

Dataset_test/**
!Dataset_test/**/
!Dataset_test/**/.gitkeep
19
+
20
+ # Python
21
+ __pycache__/
22
+ *.py[cod]
23
+ *$py.class
24
+ *.pyo
25
+ *.pyd
26
+ *.so
27
+ *.egg-info/
28
+ .eggs/
29
+ dist/
30
+ build/
31
+ pip-wheel-metadata/
32
+ wheels/
33
+ .pytest_cache/
34
+ .coverage
35
+ #.coverage.* # uncomment if you create multiple coverage files
36
+ htmlcov/
37
+ .cache/
38
+ .mypy_cache/
39
+ .pyre/
40
+ .pytype/
41
+ .dmypy.json
42
+ .pyre_check/
43
+ .ipynb_checkpoints/
44
+ .site/
45
+
46
+ # Virtual environments
47
+ .env
48
+ .venv
49
+ env/
50
+ venv/
51
+ ENV/
52
+ env.bak/
53
+ venv.bak/
54
+
55
+ # Logs and runtime
56
+ *.log
57
+ logs/
58
+ *.pid
59
+ *.seed
60
+ *.out
61
+ *.err
62
+
63
+ # Jupyter
64
+ .ipynb_checkpoints
65
+ *.ipynb_checkpoints
66
+
67
+ # IDE/editor
68
+ .vscode/
69
+ .history/
70
+ .idea/
71
+ *.code-workspace
72
+
73
+ # OS-specific
74
+ .DS_Store
75
+ Thumbs.db
76
+ desktop.ini
77
+
78
+ # Images/artifacts (remove if you plan to commit images outside Dataset)
79
+ *.png
80
+ *.jpg
81
+ *.jpeg
82
+ *.bmp
83
+ *.gif
84
+ *.tiff
85
+ *.webp
86
+
87
+ # Models and checkpoints
88
+ checkpoints/
89
+ *.ckpt
90
+ *.onnx
91
+ *.tflite
92
+ *.pth
93
+ *.pt
94
+ *.bin
95
+ *.safetensors
96
+ runs/
97
+ outputs/
98
+ artifacts/
99
+
100
+ # Data/cache
101
+ data/
102
+ datasets/
103
+ .input/
104
+ .output/
105
+ .cache/
106
+ tmp/
107
+ temp/
108
+ *.tar
109
+ *.tar.gz
110
+ *.zip
111
+ *.7z
112
+
113
+ # Config/private
114
+ *.env
115
+ .env.*
116
+ secrets.*
117
+ *.key
118
+ *.pem
119
+
120
+ # Node/JS (if present)
121
+ node_modules/
122
+ npm-debug.log*
123
+ yarn-debug.log*
124
+ yarn-error.log*
125
+ pnpm-lock.yaml
126
+
127
+ # Rust (if present)
128
+ target/
129
+
130
+ # C/C++ build (if present)
131
+ CMakeFiles/
132
+ CMakeCache.txt
133
+ cmake-build-*/
134
+ *.o
135
+ *.obj
136
+ *.exe
137
+ *.dll
138
+ *.lib
139
+ *.a
140
+ *.out
141
+
142
+ # Java (if present)
143
+ *.class
144
+ .gradle/
145
+ build/
146
+ EOF
147
+
README.md ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CAPTCHA OCR Project
2
+
3
+ A PyTorch-based CAPTCHA recognition system using synthetic data generation and CTC-based sequence modeling.
4
+
5
+ ## 🎯 Project Overview
6
+
7
+ This project implements an end-to-end CAPTCHA OCR system that can recognize text in CAPTCHA images. It uses:
8
+ - **Synthetic CAPTCHA generation** for training data
9
+ - **CRNN (CNN + RNN) architecture** for sequence recognition
10
+ - **CTC (Connectionist Temporal Classification)** loss for training
11
+ - **PyTorch** with CUDA support for GPU acceleration
12
+
13
+ ## 🏗️ Current Status
14
+
15
+ ### ✅ Completed Components
16
+ - **Dataset Generation**: Synthetic CAPTCHA creation with train/val/test splits
17
+ - **Configuration**: Centralized config with image dimensions and training parameters
18
+ - **Vocabulary System**: Character encoding/decoding with CTC blank token support
19
+ - **CTC Collate Function**: Proper batching for variable-length sequences
20
+ - **CTC Decoding**: Greedy decode for inference
21
+
22
+ ### 🔧 In Progress / Next Steps
23
+ - **PyTorch Dataset Class**: Image loading and preprocessing
24
+ - **CRNN Model**: CNN encoder + BiLSTM + linear output
25
+ - **Training Loop**: Complete training pipeline with validation
26
+ - **Metrics**: CER (Character Error Rate) and exact match accuracy
27
+ - **Inference Pipeline**: Model loading and prediction
28
+
29
+ ## 📁 Project Structure
30
+
31
+ ```
32
+ CaptchaDetect/
33
+ ├── Dataset/ # Full dataset (100k images) - for Colab training
34
+ ├── Dataset_test/ # Test dataset (1k images) - for local development
35
+ │ └── captchas/
36
+ │ ├── train/ # 80% of data
37
+ │ ├── val/ # 10% of data
38
+ │ └── test/ # 10% of data
39
+ ├── src/
40
+ │ ├── config.py # Configuration and hyperparameters
41
+ │ ├── vocab.py # Character vocabulary and CTC encoding
42
+ │ ├── data.py # Dataset generation script
43
+ │ ├── collate.py # CTC batching function
44
+ │ └── [model files] # Coming soon...
45
+ ├── .gitignore # Ignores dataset contents, keeps structure
46
+ └── README.md # This file
47
+ ```
48
+
49
+ ## 🚀 Quick Start
50
+
51
+ ### 1. Environment Setup
52
+ ```bash
53
+ # Install PyTorch with CUDA support (adjust version as needed)
54
+ pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu128
55
+
56
+ # Install other dependencies
57
+ pip install captcha pandas pillow
58
+ ```
59
+
60
+ ### 2. Generate Test Dataset
61
+ ```bash
62
+ # Run from the repository root so Dataset_test/ is created beside src/
62
+ python src/data.py
64
+ ```
65
+ This creates 1,000 synthetic CAPTCHAs in `Dataset_test/captchas/` with proper train/val/test splits.
66
+
67
+ ### 3. Configuration
68
+ Edit `src/config.py` to adjust:
69
+ - Image dimensions (H=48, W_max=224)
70
+ - Batch sizes (32 for local GTX 1650, 128 for Colab T4)
71
+ - Training parameters
72
+
73
+ ## 🎮 Usage
74
+
75
+ ### Local Development (GTX 1650)
76
+ - Use `Dataset_test` (1k images)
77
+ - Batch size: 32-48
78
+ - Good for rapid iteration and testing
79
+
80
+ ### Colab Training (Tesla T4)
81
+ - Use `Dataset` (100k images)
82
+ - Batch size: 128
83
+ - Expected training time: 2-4 hours for 40 epochs
84
+
85
+ ## 🔬 Technical Details
86
+
87
+ ### Model Architecture
88
+ - **CNN Encoder**: Reduces image to sequence representation
89
+ - **BiLSTM**: Processes sequential features
90
+ - **Linear Output**: Maps to vocabulary size (including blank token)
91
+
92
+ ### CTC Training
93
+ - **Input**: Images resized to 48×224
94
+ - **Output**: Character sequences (a-z, A-Z, 0-9)
95
+ - **Loss**: CTCLoss with blank=0
96
+ - **Decoding**: Greedy CTC decode
97
+
98
+ ### Data Format
99
+ - **Images**: Grayscale, normalized tensors
100
+ - **Labels**: CSV with filename and text label
101
+ - **Batching**: Variable-length sequences handled by custom collate
102
+
103
+ ## 📊 Performance Expectations
104
+
105
+ ### GTX 1650 (4GB VRAM)
106
+ - Training time: 3-8 hours for 100k×40 epochs
107
+ - Batch size: 32-48
108
+ - Memory efficient with H=48
109
+
110
+ ### Tesla T4 (16GB VRAM)
111
+ - Training time: 2-4 hours for 100k×40 epochs
112
+ - Batch size: 128
113
+ - Mixed precision (AMP) enabled
114
+
115
+ ## 🛠️ Development Workflow
116
+
117
+ 1. **Implement Dataset class** - Load and preprocess images
118
+ 2. **Build CRNN model** - CNN + BiLSTM architecture
119
+ 3. **Create training loop** - With validation and checkpoints
120
+ 4. **Add metrics** - CER and accuracy tracking
121
+ 5. **Test on small dataset** - Verify everything works
122
+ 6. **Scale to full dataset** - Train on Colab
123
+
124
+ ## 🤝 Contributing
125
+
126
+ This is a learning project! Feel free to:
127
+ - Ask questions about implementation details
128
+ - Experiment with different architectures
129
+ - Improve the data generation or training pipeline
130
+
131
+ ## 📚 Resources
132
+
133
+ - [CTC Paper](https://www.cs.toronto.edu/~graves/icml_2006.pdf)
134
+ - [CRNN Architecture](https://arxiv.org/abs/1507.05717)
135
+ - [PyTorch CTCLoss Documentation](https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html)
136
+
137
+ ## 📝 License
138
+
139
+ This project is for educational purposes. Feel free to use and modify as needed.
140
+
141
+ ---
142
+
143
+ **Happy coding! 🚀**
src/collate.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List,Tuple
2
+ import torch
3
+ from src.config import cfg
4
+ from src.vocab import encode_text
5
+
6
def ctc_collate(batch: List[Tuple[torch.Tensor, str, str]]):
    """
    Assemble dataset samples into the tensors needed for a CTC training step.

    batch: list of (image_tensor [C,H,W_max], label_str, rel_path); the image
        tensors are stacked, so they must all have the same shape.
    returns:
        images:         [B,C,H,W_max]
        targets_flat:   [sum(len(label_i))] - all encoded labels, concatenated
        target_lengths: [B] - encoded length of each label
        input_lengths:  [B] - W // cfg.total_stride, identical for every sample
        rel_paths:      list[str], in batch order
    """
    images = torch.stack([sample[0] for sample in batch], dim=0)

    # Encode every label string into a 1-D LongTensor of vocabulary indices.
    label_tensors = [
        torch.tensor(encode_text(sample[1]), dtype=torch.long)
        for sample in batch
    ]
    target_lengths = torch.tensor(
        [len(label) for label in label_tensors], dtype=torch.long
    )
    targets_flat = (
        torch.cat(label_tensors, dim=0)
        if len(label_tensors) > 0
        else torch.empty(0, dtype=torch.long)
    )

    # Unpacking requires a 4-D batch; only B and W are used below.
    B, C, H, W = images.shape
    input_lengths = torch.full((B,), W // cfg.total_stride, dtype=torch.long)

    rel_paths = [sample[2] for sample in batch]
    return images, targets_flat, target_lengths, input_lengths, rel_paths
src/config.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import string
3
+ from dataclasses import dataclass
4
+
5
@dataclass
class Config:
    """Central settings shared by the data pipeline and training code."""

    # Root folder of the generated CAPTCHA splits. Override with the DATA_ROOT
    # environment variable; it is read once, when this module is imported.
    # The default is built with os.path.join so it is valid on every OS: the
    # former "Dataset_test\captchas" literal contained the invalid escape "\c"
    # and only formed a usable path on Windows.
    data_root: str = os.getenv("DATA_ROOT", os.path.join("Dataset_test", "captchas"))

    # Characters the model can emit (a-z, A-Z, 0-9).
    chars: str = string.ascii_letters + string.digits

    # Image geometry.
    H: int = 48
    W_max: int = 224
    grayscale: bool = True

    # Divisor applied to the image width to obtain the CTC input length
    # (ctc_collate computes W // total_stride).
    total_stride: int = 4
    batch_size: int = 32
    num_workers: int = 4
    amp: bool = True


# Shared module-level instance imported by the other modules.
cfg = Config()
src/data.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from captcha.image import ImageCaptcha
2
+ import random
3
+ import string
4
+ import os
5
+ import csv
6
+ import pandas as pd
7
+
8
# config
DATASET_DIR = "Dataset_test/captchas"
LABELS = "Dataset_test/labels.csv"
NUM_IMAGES = 1000
CHARS = string.ascii_letters + string.digits
CAPTCHA_LEN_LOWER_LIMIT = 5
CAPTCHA_LEN_UPPER_LIMIT = 7
# [split name, fraction]. Image indices are handed out in this order, and the
# last entry absorbs any rounding remainder.
directories = [["train",0.8],["test",0.1],["val",0.1]]


def _split_bounds(total, splits):
    """Return [(name, start, stop), ...]: contiguous index ranges covering range(total)."""
    bounds = []
    start = 0
    last = len(splits) - 1
    for pos, (name, fraction) in enumerate(splits):
        stop = total if pos == last else min(total, start + int(total * fraction))
        bounds.append((name, start, stop))
        start = stop
    return bounds


os.makedirs(DATASET_DIR, exist_ok=True)
image = ImageCaptcha(width=160, height=60)

# One set of boundaries drives BOTH where an image is written and which
# labels.csv its row lands in. Previously the images for rows 800-899 went to
# test/ while their labels were saved to val/labels.csv (and vice versa).
split_bounds = _split_bounds(NUM_IMAGES, directories)
for split_name, _, _ in split_bounds:
    os.makedirs(os.path.join(DATASET_DIR, split_name), exist_ok=True)

# Report roughly every 1% of progress; integer step avoids float modulo and a
# zero step when NUM_IMAGES < 100.
progress_step = max(1, NUM_IMAGES // 100)

with open(LABELS, mode="w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "label"])
    for split_name, start, stop in split_bounds:
        OUTPUT_DIR = os.path.join(DATASET_DIR, split_name)
        for i in range(start, stop):
            if i % progress_step == 0:
                print(f"{i} images made")
            length = random.randint(CAPTCHA_LEN_LOWER_LIMIT, CAPTCHA_LEN_UPPER_LIMIT)
            text = ''.join(random.choices(CHARS, k=length))
            # The index suffix keeps filenames unique even if a text repeats.
            filename = f"{text}_{i}.png"
            filepath = os.path.join(OUTPUT_DIR, filename)
            image.write(text, filepath)
            writer.writerow([filename, text])

print("Data Generated!")

# Read labels back as plain strings so that no label is reinterpreted as a
# number or a missing value.
df = pd.read_csv(LABELS, dtype=str, keep_default_na=False)

# Save one labels.csv per split, using the same index ranges as the images.
for split_name, start, stop in split_bounds:
    df.iloc[start:stop].to_csv(
        os.path.join(DATASET_DIR, split_name, "labels.csv"), index=False
    )

print("Labels Generated")
62
+
63
+
src/vocab.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from src.config import cfg
3
+
4
# Index -> symbol table; position 0 holds the "<blank>" placeholder.
itos = ["<blank>", *cfg.chars]

# Symbol -> index table; characters are numbered from 1 so 0 stays free for blank.
stoi = {symbol: index for index, symbol in enumerate(cfg.chars, start=1)}
7
+
8
def encode_text(text: str) -> List[int]:
    """
    Convert a label string into its list of vocabulary indices.

    Each character is looked up in `stoi`; a character that is not in the
    vocabulary raises KeyError. An empty string gives an empty list.
    """
    indices: List[int] = []
    for symbol in text:
        indices.append(stoi[symbol])
    return indices
10
+
11
def decode_indices(indices: List[int]) -> str:
    """
    Convert vocabulary indices back into text.

    Every 0 (blank) entry is dropped; repeated indices are NOT collapsed, so
    this is the inverse of `encode_text`, not a CTC decoder.
    """
    symbols = []
    for index in indices:
        if index == 0:
            continue
        symbols.append(itos[index])
    return "".join(symbols)
13
+
14
def ctc_greedy_decode(logits) -> List[str]:
    """
    Greedy CTC decode for a batch.

    logits: torch.Tensor of shape [T, B, V] (before softmax or log_softmax).
    Returns: list of B decoded strings.

    For each sequence the arg-max class is taken at every time step, runs of
    the same class are collapsed to one symbol, and blanks (index 0) are dropped.
    """
    # argmax is a tensor method, so the function needs no local `import torch`.
    best_path = logits.argmax(dim=-1)  # [T, B]
    decoded = []
    for b in range(best_path.shape[1]):
        prev = -1  # sentinel that never equals a real class index
        chars = []
        for t in best_path[:, b].tolist():
            if t != 0 and t != prev:
                chars.append(itos[t])
            # Updated on every step, blanks included, so that a blank between
            # two equal symbols keeps both of them (x, blank, x -> "xx").
            prev = t
        decoded.append("".join(chars))
    return decoded
33
+
34
def vocab_size() -> int:
    """Return the number of entries in `itos` (all characters plus the blank slot)."""
    n_classes = len(itos)
    return n_classes