| import torch |
| import gradio as gr |
| from torch import nn |
| import torch.nn.functional as F |
| import os |
| from torchvision import transforms |
| from torch.utils.data import DataLoader,random_split,Dataset |
| from PIL import Image,UnidentifiedImageError |
| import string |
| import matplotlib.pyplot as plt |
|
|
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
|
|
| characters = string.ascii_letters + string.digits |
| idx_to_char = {idx: char for idx, char in enumerate(characters)} |
|
|
|
|
| |
| AFFN_KERNEL=5 |
| AFFN_STRIDE=1 |
| AFFN_DEPTH=1 |
|
|
| |
| CRNN_KERNEL=5 |
| CRNN_POOL_KERNEL=2 |
| CRNN_DROPOUT=0.3 |
| CRNN_LATENT=128 |
| LSTM_HIDDEN_DIM=32 |
| VOCAB_SIZE=26*2+10 |
| OUTPUT_LENGTH=5 |
|
|
|
|
| class Encoder(nn.Sequential): |
| def __init__(self,n,kernel_size,stride): |
| super().__init__( |
| nn.Conv2d(in_channels=4**(n-1),out_channels=4**n,kernel_size=kernel_size,stride=stride), |
| nn.BatchNorm2d(num_features=4**n), |
| nn.ReLU(inplace=False) |
| ) |
|
|
| |
| class Decoder(nn.Sequential): |
| def __init__(self,n,kernel_size,stride): |
| super().__init__( |
| nn.ConvTranspose2d(in_channels=4**n,out_channels=4**(n-1),kernel_size=kernel_size,stride=stride), |
| nn.BatchNorm2d(num_features=4**(n-1)), |
| nn.ReLU(inplace=False) |
| ) |
|
|
| |
| class AFFN(nn.Module): |
| def __init__(self,n): |
| super().__init__() |
| self.n= n |
| self.alpha = nn.Parameter(torch.randn(n-1).to(device)).to(device) |
| self.encoders = [] |
| self.decoders = [] |
| for i in range(1,n+1): |
| self.encoders.append(Encoder(i,AFFN_KERNEL,AFFN_STRIDE).to(device)) |
| for i in range(n,0,-1): |
| self.decoders.append(Decoder(i,AFFN_KERNEL,AFFN_STRIDE).to(device)) |
|
|
| def forward(self,x): |
| residuals = [] |
| for i,enc in enumerate(self.encoders): |
| x= enc(x) |
| if i < self.n-1: |
| x = x * (1 - self.alpha[i]) |
| residuals.append(x * self.alpha[i]) |
| for i,dec in enumerate(self.decoders): |
| x= dec(x) |
| if i < self.n-1: |
| x= x + residuals.pop() |
| return x |
|
|
|
|
| class CRNN(nn.Module): |
| def __init__(self, in_channels, kernel_size, pool_kernel_size, dropout, latent_dim, lstm_hidden_dim, vocab_size, output_length=5): |
| super().__init__() |
| self.lstm_hidden_dim = lstm_hidden_dim |
| self.output_length = output_length |
| self.vocab_size = vocab_size |
|
|
| self.conv1 = nn.Sequential( |
| nn.Conv2d(in_channels=in_channels, out_channels=in_channels*2, kernel_size=kernel_size, padding=2), |
| nn.BatchNorm2d(num_features=in_channels*2), |
| nn.ReLU(inplace=False), |
| nn.MaxPool2d(kernel_size=pool_kernel_size) |
| ) |
| self.conv2 = nn.Sequential( |
| nn.Conv2d(in_channels=in_channels*2, out_channels=in_channels*4, kernel_size=kernel_size, padding=2), |
| nn.BatchNorm2d(num_features=in_channels*4), |
| nn.ReLU(inplace=False), |
| nn.MaxPool2d(kernel_size=pool_kernel_size) |
| ) |
| self.flatten = nn.Flatten() |
| self.dropout = nn.Dropout(dropout) |
| self.latent_fc = nn.LazyLinear(latent_dim) |
| self.lstm = nn.LSTM(input_size=latent_dim, hidden_size=lstm_hidden_dim, num_layers=1, batch_first=True) |
| self.output_fc = nn.Linear(lstm_hidden_dim, vocab_size) |
|
|
| |
| def forward(self, x): |
| batch_size = x.size(0) |
|
|
| |
| conv1_out = self.conv1(x) |
| conv2_out = self.conv2(conv1_out) |
| flattened = self.flatten(conv2_out) |
| dropped = self.dropout(flattened) |
| latent = self.latent_fc(dropped) |
|
|
| lstm_input = latent.unsqueeze(1) |
|
|
| |
| h0 = torch.zeros(1, batch_size, self.lstm_hidden_dim, device=x.device) |
| c0 = torch.zeros(1, batch_size, self.lstm_hidden_dim, device=x.device) |
|
|
| outputs = [] |
|
|
| |
| for _ in range(self.output_length): |
| out, (h0, c0) = self.lstm(lstm_input, (h0, c0)) |
|
|
| logits = self.output_fc(out.squeeze(1)) |
|
|
| outputs.append(logits) |
|
|
| outputs = torch.stack(outputs, dim=1) |
|
|
| return outputs |
| output=CRNN(64,CRNN_KERNEL,CRNN_POOL_KERNEL,CRNN_DROPOUT,CRNN_LATENT,LSTM_HIDDEN_DIM,VOCAB_SIZE,OUTPUT_LENGTH).to(device)(torch.zeros((2,64,256,256)).to(device)) |
|
|
|
|
| class CaptchaCrackNet(nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.affn=AFFN(AFFN_DEPTH).to(device) |
|
|
| self.conv1=nn.Sequential( |
| nn.Conv2d(in_channels=1,out_channels=32,kernel_size=5,padding=2), |
| nn.ReLU(inplace=False), |
| nn.MaxPool2d(kernel_size=2) |
| ) |
|
|
| self.conv2=nn.Sequential( |
| nn.Conv2d(in_channels=32,out_channels=48,kernel_size=5,padding=2), |
| nn.ReLU(inplace=False), |
| nn.MaxPool2d(kernel_size=2) |
| ) |
|
|
| self.conv3=nn.Sequential( |
| nn.Conv2d(in_channels=48,out_channels=64,kernel_size=5,padding=2), |
| nn.ReLU(inplace=False), |
| nn.MaxPool2d(kernel_size=2) |
| ) |
|
|
| self.res=nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, stride=2, padding=2) |
|
|
| self.crnn=CRNN(64,CRNN_KERNEL,CRNN_POOL_KERNEL,CRNN_DROPOUT,CRNN_LATENT,LSTM_HIDDEN_DIM,VOCAB_SIZE,OUTPUT_LENGTH).to(device) |
|
|
| def forward(self,x): |
| affn_out=self.affn(x) |
| res_out=self.res(x) |
| conv1_out=self.conv1(affn_out) |
| conv2_out=self.conv2(conv1_out+res_out) |
| conv3_out=self.conv3(conv2_out) |
| output=self.crnn(conv3_out) |
| return output |
|
|
|
|
| torch.manual_seed(42) |
| model=CaptchaCrackNet().to(device) |
| optimizer=torch.optim.Adam(model.parameters()) |
|
|
|
|
|
|
| characters = string.ascii_letters + string.digits |
| idx_to_char = {idx: char for idx, char in enumerate(characters)} |
|
|
|
|
| def to_text(arr): |
| ans='' |
| for c in arr: |
| ans=ans+idx_to_char[c.item()] |
| return ans |
| def predict_captcha(image): |
| try: |
| if image is None: |
| return "No image provided" |
|
|
| |
| if isinstance(image, dict) and 'data' in image: |
| image = image['data'] |
|
|
| |
| if isinstance(image, str) and image.startswith('data:image'): |
| import base64 |
| from io import BytesIO |
| image_data = base64.b64decode(image.split(',')[1]) |
| image = Image.open(BytesIO(image_data)) |
| elif not isinstance(image, Image.Image): |
| from io import BytesIO |
| image = Image.open(BytesIO(image)) |
|
|
| |
| transform = transforms.Compose([ |
| transforms.Resize((40, 150)), |
| transforms.Grayscale(), |
| transforms.ToTensor(), |
| transforms.Lambda(lambda x: x / 255), |
| ]) |
|
|
| image_tensor = transform(image).unsqueeze(0).to(device) |
|
|
| with torch.no_grad(): |
| output = model(image_tensor) |
| prediction = output.squeeze(0).argmax(axis=1) |
| result = to_text(prediction) |
| print(f"Predicted text: {result}") |
| return result |
|
|
| except Exception as e: |
| print(f"Error details: {str(e)}") |
| return f"Error processing image: {str(e)}" |
| |
|
|
| checkpoint = torch.load(r'model/final.pth', map_location=device) |
| |
| print("Checkpoint keys:", checkpoint.keys()) |
| model.load_state_dict(checkpoint['model_state_dict']) |
|
|
| optimizer.load_state_dict(checkpoint['optimizer_state_dict']) |
| model.eval() |
|
|
| |
| iface = gr.Interface( |
| fn=predict_captcha, |
| inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"), |
| outputs=gr.Textbox(label="Predicted Text"), |
| title="CAPTCHA Recognition", |
| description="Upload a CAPTCHA image to get the predicted text." |
| ) |
|
|
| if __name__ == "__main__": |
| iface.launch( |
| |
| ) |