seawolf2357 committed on
Commit
4d95222
·
verified ·
1 Parent(s): 1e442f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -20
app.py CHANGED
@@ -6,25 +6,6 @@ import torch
6
  from PIL import Image
7
  import requests
8
 
9
- # 모델과 토크나이저 로드
10
- model_id = "nlpconnect/vit-gpt2-image-captioning"
11
- model = VisionEncoderDecoderModel.from_pretrained(model_id)
12
- feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
13
- tokenizer = AutoTokenizer.from_pretrained(model_id)
14
-
15
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
- model.to(device)
17
-
18
- def predict_caption(image):
19
- image = Image.open(image)
20
- inputs = feature_extractor(images=image, return_tensors="pt")
21
- pixel_values = inputs["pixel_values"].to(device)
22
-
23
- output_ids = model.generate(pixel_values)
24
- caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
25
- return caption
26
-
27
-
28
  # 이미지 인식 파이프라인 로드
29
  image_model = pipeline("image-classification", model="google/vit-base-patch16-224")
30
 
@@ -75,7 +56,7 @@ def classify_and_generate_voice(uploaded_image):
75
  iface = gr.Interface(
76
  fn=classify_and_generate_voice,
77
  inputs=gr.Image(type="pil"),
78
- outputs=[gr.Textbox(label="Caption"), gr.Label(), gr.Audio(), gr.Audio()],
79
  title="msVision_3",
80
  description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성 및 음악을 생성합니다.(recognizes object and generate Voice&Music)",
81
  examples=["dog.jpg", "cat.png", "cafe.jpg"]
 
6
  from PIL import Image
7
  import requests
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # 이미지 인식 파이프라인 로드
10
  image_model = pipeline("image-classification", model="google/vit-base-patch16-224")
11
 
 
56
  iface = gr.Interface(
57
  fn=classify_and_generate_voice,
58
  inputs=gr.Image(type="pil"),
59
+ outputs=[ gr.Label(), gr.Audio(), gr.Audio()],
60
  title="msVision_3",
61
  description="이미지를 업로드하면, 사물을 인식하고 해당하는 음성 및 음악을 생성합니다.(recognizes object and generate Voice&Music)",
62
  examples=["dog.jpg", "cat.png", "cafe.jpg"]