alphapen_trocr_large_60000 / inference_idefics2.py
hadrakey's picture
Training in progress, step 1000
9c909e3 verified
from PIL import Image
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor
from transformers import BitsAndBytesConfig
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AutoModelForVision2Seq
import torch
import pandas as pd
from torchmetrics.text import CharErrorRate
from peft import PeftModel, PeftConfig
from torchmetrics.text import CharErrorRate
from datasets import Dataset, DatasetDict, Image
# Define train and test size.
TRAIN_SAMPLES = 1000
TEST_SAMPLES = 200
TEST_SIZE = 0.166 #
DEVICE = "cuda:0"
peft_model_id = "hadrakey/alphapen_idefics2_finetune_v1"
config = PeftConfig.from_pretrained(peft_model_id)
processor = AutoProcessor.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
base_model = AutoModelForVision2Seq.from_pretrained(config.base_model_name_or_path, device_map="auto", trust_remote_code=True, torch_dtype="auto")
model = PeftModel.from_pretrained(base_model, peft_model_id)
model = model.to(DEVICE)
# Define the directory containing the images.
df_path = "/mnt/data1/Datasets/AlphaPen/" + "testing_data.csv"
df = pd.read_csv(df_path)
df.dropna(inplace=True)
sample = df.iloc[:5000,:]
sample.reset_index(inplace=True)
sample["id"] = range(sample.shape[0])
sample["query"] = "What is shown in this image?"
root_dir = "/mnt/data1/Datasets/OCR/Alphapen/clean_data/final_cropped_rotated_"
image_paths = [root_dir + img for img in sample.filename]
# Create a list of other columns such as id, query, and answer.
ids = sample['id'].tolist()
queries = sample['query'].tolist()
answers = sample['text'].tolist()
# Create the dataset dictionary.
dataset_dict = {
'id': ids,
'image': image_paths,
'query': queries,
'answers': answers
}
# Create the dataset.
dataset = Dataset.from_dict(dataset_dict)
# Cast the 'image' column to Image type.
dataset = dataset.cast_column("image", Image())
# Split the dataset into train and test.
# split_dataset = dataset.train_test_split(test_size=TEST_SIZE, shuffle=False)
# train_dataset = split_dataset["train"]
# eval_dataset = split_dataset["test"]
cer_metric = CharErrorRate()
cer_idefics = []
idefics_output = []
for idx in range(len(dataset)):
test_example = dataset[idx]
image = test_example["image"]
query = test_example["query"]
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Answer briefly."},
{"type": "image"},
{"type": "text", "text": query}
]
}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=[text.strip()], images=[image], return_tensors="pt", padding=True)
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
generated_ids = model.generate(**inputs, max_new_tokens=64)
generated_texts = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
idefics_output.append(generated_texts[0])
cer_idefics.append(cer_metric(generated_texts[0].lower(), test_example["answers"].lower()).detach().numpy())
# print(generated_texts, test_example["answers"], cer_idefics)
sample["idefics"] = idefics_output
sample["cer"] = cer_idefics
sample.to_csv("/mnt/data1/Datasets/AlphaPen/" + "sample_idefics_v1.csv")