Seeker38 committed on
Commit
83aec7a
1 Parent(s): 5dad05f

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +111 -0
README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - vi
4
+ pretty_name: Well-known Vietnamese people and corresponding abstracts in Wikipedia
5
+ source_datasets:
6
+ - original
7
+ size_categories:
8
+ - 1K<n<10K
9
+ tags:
10
+ - wikipedia
11
+ - images
12
+ - text
13
+ - LM
14
+ dataset_info:
15
+ features:
16
+ - name: image
17
+ dtype: image
18
+ - name: title
19
+ dtype: string
20
+ - name: text
21
+ dtype: string
22
+ license: mit
23
+ datasets:
24
+ - Seeker38/vietnamese_face_wiki
25
+ metrics:
26
+ - bleu
27
+ ---
28
+
29
+ # Image Captioning - Fine-Tuned ViT-PhoBERT Model
30
+
31
+ This is a ViT-PhoBERT image-captioning model fine-tuned on the [vietnamese_face_wiki dataset](https://huggingface.co/datasets/Seeker38/vietnamese_face_wiki).
32
+
33
+
34
+
35
+ # How to use
36
+
37
+ Import the required libraries:
38
+ ```python
39
+ import numpy as np
40
+ import pandas as pd
41
+ import torch
42
+ import matplotlib.pyplot as plt
43
+ from PIL import Image
44
+ from datasets import load_dataset
45
+ from torch.utils.data import Dataset
46
+ from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
47
+
48
+ ```
49
+
50
+ ### load the dataset you need
51
+ ```python
52
+ from datasets import load_dataset
53
+
54
+ dataset = load_dataset("Seeker38/augmented_vi_face_wiki", split="train")
55
+ ```
56
+
57
+ ### load the model
58
+ ```python
59
+ from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
60
+ model = VisionEncoderDecoderModel.from_pretrained("Seeker38/ViT_PhoBert_face_vi_wiki")
61
+ phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2", add_special_tokens=True)
62
+
63
+ if phobert_tokenizer.pad_token is None:
64
+ phobert_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
65
+ ```
66
+
67
+ ### Construct the caption-generation function
68
+ ```python
69
def generate_caption(model, dataset, tokenizer, device, num_images=20, max_length=50,
                     output_file="/kaggle/working/generated_captions.png"):
    """Caption a random sample of dataset images and plot them next to their captions.

    Args:
        model: Model exposing ``eval()`` and ``generate(pixel_values, ...)``.
        dataset: Indexable collection whose items have an ``'image'`` entry
            holding a PIL-style image (``resize``/``convert`` are called on it).
        tokenizer: Tokenizer providing ``batch_decode`` for the generated ids.
        device: Torch device the pixel tensor is moved to before generation.
        num_images: How many images to sample; clamped to ``len(dataset)``.
        max_length: Maximum generated sequence length passed to ``generate``.
        output_file: Where the figure is saved. Defaults to the Kaggle working
            directory used by the original example.

    Raises:
        ValueError: If there is not at least one image to caption.
    """
    # Local import: none of the README's import cells brings `random` into scope.
    import random

    # random.sample raises if asked for more items than the dataset holds.
    num_images = min(num_images, len(dataset))
    if num_images < 1:
        raise ValueError("need at least one image to caption")

    model.eval()

    sampled_indices = random.sample(range(len(dataset)), num_images)
    # Force 3 channels so grayscale/RGBA images stack into one uniform batch.
    sampled_images = [dataset[idx]['image'].convert("RGB") for idx in sampled_indices]

    pixel_values_list = []
    for image in sampled_images:
        resized = np.array(image.resize((224, 224)), dtype=np.uint8)
        # HWC -> CHW, as float32.
        # NOTE(review): values stay in the raw 0-255 range (no rescale/normalize);
        # presumably this matches how the checkpoint was trained - confirm.
        pixel_values_list.append(
            torch.tensor(np.moveaxis(resized, -1, 0), dtype=torch.float32)
        )

    pixel_values = torch.stack(pixel_values_list).to(device)

    with torch.no_grad():
        outputs = model.generate(
            pixel_values,
            num_beams=10,
            max_length=max_length,
            early_stopping=True,
            length_penalty=1.0,
        )

    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # One row per sample: image on the left, caption on the right.
    # squeeze=False keeps `axs` 2-D even when num_images == 1.
    fig, axs = plt.subplots(num_images, 2, figsize=(15, 5 * num_images), squeeze=False)

    for i, (image, caption) in enumerate(zip(sampled_images, decoded_preds)):
        axs[i, 0].imshow(image)
        axs[i, 0].axis('off')
        axs[i, 1].text(0, 0.5, caption, wrap=True, fontsize=12)
        axs[i, 1].axis('off')

    plt.tight_layout()

    # Save the plot to a local file
    plt.savefig(output_file)
    plt.show()

    print(f"Plot saved as {output_file}")
106
+ ```
107
+
108
+ ### Run and enjoy
109
+ ```python
110
# `device` is not defined by any earlier snippet; use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The inputs are moved to `device` inside generate_caption, so the model must be there too.
model.to(device)
# Caption 5 random images with a maximum generated length of 70.
generate_caption(model, dataset, phobert_tokenizer, device, 5, 70)
111
+ ```