# Reworked so this can run on Hugging Face
import torch
from collections import defaultdict

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Load the data (first 300 training examples)
dataset = load_dataset('Multimodal-Fatima/OK-VQA_train')['train'].select(range(300))

# Keep only the features we need
selected_features = ['image', 'answers', 'question']
selected_dataset = dataset.remove_columns(
    [col for col in dataset.column_names if col not in selected_features]
)

# Label encoding: assign each distinct answer string an integer id
answers_to_id = defaultdict(lambda: len(answers_to_id))
selected_dataset = selected_dataset.map(
    lambda ex: {'answers': [answers_to_id[ans] for ans in ex['answers']]}
)
# Reverse mapping, used at inference time to decode a predicted id
id_to_answers = {v: k for k, v in answers_to_id.items()}

# Load the model
# NOTE: 'microsoft/git-base-vqav2' is a generative image-to-text checkpoint,
# so a sequence-classification head is a mismatch for it; see the generative
# usage sketch at the end of this file.
model_name = 'microsoft/git-base-vqav2'
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Train the model with Trainer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples['question'], truncation=True, padding=True)
    num_examples = len(tokenized_inputs['input_ids'])
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        # Placeholder: real pixel_values must come from an image processor
        # (e.g. 224x224 RGB tensors), not hard-coded shape tuples.
        'pixel_values': [(4, 3, 224, 224)] * num_examples,
        'pixel_mask': [1] * num_examples,
        # One class id per example: use the first annotated answer
        'labels': [answer_ids[0] for answer_ids in examples['answers']],
    }

ok_vqa_dataset = selected_dataset.map(preprocess_function, batched=True)
ok_vqa_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'],
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=4,
    logging_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ok_vqa_dataset,
)

# Train the model
trainer.train()

import gradio as gr

# Reuse the fine-tuned model and tokenizer from the training run above
# instead of re-initializing fresh weights from the hub.

# Define the prediction function
def predict_answer(image, question):
    inputs = tokenizer(question, return_tensors='pt')

    # Image preprocessing still needs to be added here (resize/normalize
    # the input image and pass it to the model as pixel_values)

    # Run the model on the inputs
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )

    # Take the label id with the highest logit and decode it to an answer string
    predicted_label_id = torch.argmax(outputs.logits).item()
    return id_to_answers.get(predicted_label_id, 'unknown')

iface = gr.Interface(
    fn=predict_answer,
    inputs=['image', 'text'],
    outputs='text',
    title="Visual Question Answering",
    description="Input an image and a question to get the model's answer.",
    examples=[
        ['https://your_image_url.jpg', 'What is shown in the image?'],
    ],
)

iface.launch()
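
# ---------------------------------------------------------------------------
# Reference sketch (assumption): 'microsoft/git-base-vqav2' is normally used
# generatively rather than with a classification head. The snippet below
# follows the usage pattern documented on the GIT model card; the image path
# and question are hypothetical placeholders.
# ---------------------------------------------------------------------------
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained('microsoft/git-base-vqav2')
git_model = AutoModelForCausalLM.from_pretrained('microsoft/git-base-vqav2')

image = Image.open('example.jpg')  # hypothetical local image
question = 'What is shown in the image?'

# Encode the image, then tokenize the question with a leading CLS token
pixel_values = processor(images=image, return_tensors='pt').pixel_values
input_ids = processor(text=question, add_special_tokens=False).input_ids
input_ids = torch.tensor([processor.tokenizer.cls_token_id] + input_ids).unsqueeze(0)

# The model generates the answer as text, conditioned on image + question
generated_ids = git_model.generate(pixel_values=pixel_values,
                                   input_ids=input_ids, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))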