Update app.py
app.py CHANGED
@@ -3,20 +3,20 @@ import gradio as gr
 import torch
 import scipy.io.wavfile as wavfile
 
-# Load translation model
+# Load translation model to translate English text to Arabic
 translator = pipeline("translation_en_to_ar", model="Helsinki-NLP/opus-mt-en-ar")
 
-# Load the
+# Load the BLIP image captioning model and processor
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Load
+# Load a question-answering model for retrieving information
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
-# Load
+# Load a text-to-speech model for generating audio from text
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 
-# Combined context for
+# Combined context for known animals
 combined_context = """
 Tigers are the largest species among the Felidae and classified in the genus Panthera. They are apex predators, primarily preying on ungulates such as deer and wild boar.
 Elephants are the largest land animals on Earth. They are known for their large ears, tusks made of ivory, and their trunks.
@@ -33,34 +33,34 @@ Pandas are large bears native to China, distinguished by their black and white c
 """
 
 # List of known animal names for extraction
-animal_names = ["tiger", "elephant", "deer", "lion", "penguin", "dog", "cat", "giraffe", "zebra","dolphin","
+animal_names = ["tiger", "elephant", "deer", "lion", "penguin", "dog", "cat", "giraffe", "zebra", "dolphin", "panda", "crocodile"]
 
-# Function to extract animal name from the caption
+# Function to extract animal name from the generated caption
 def extract_animal_from_caption(caption):
     for animal in animal_names:
         if animal in caption.lower():
             return animal
     return None
 
-# Function to generate audio from text
+# Function to generate audio from text using the text-to-speech model
 def generate_audio(text):
     # Generate the narrated text
     narrated_text = narrator(text)
-    # Save the audio to WAV file
+    # Save the audio to a WAV file
    wavfile.write("output.wav", rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
     return "output.wav"
 
-# Function to recognize animal and
+# Function to recognize the animal in the image and provide relevant information
 def recognize_animal_and_get_info(image):
-    # Step 1: Generate a caption for the image using BLIP
+    # Step 1: Generate a caption for the uploaded image using BLIP
     inputs = blip_processor(images=image, return_tensors="pt")
     caption_ids = blip_model.generate(**inputs)
     caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
 
-    # Step 2: Extract the animal name from the caption
+    # Step 2: Extract the animal name from the generated caption
     animal_name = extract_animal_from_caption(caption)
 
-    # Step 3: Use the QA model to
+    # Step 3: Use the QA model to retrieve information based on the combined context
     if animal_name:
         question = f"Describe a {animal_name}?"
         answer = qa_model(question=question, context=combined_context)
@@ -68,7 +68,7 @@ def recognize_animal_and_get_info(image):
     else:
         info = "Sorry, I couldn't identify the animal in the image."
 
-    # Translate caption and
+    # Translate both the caption and the information to Arabic
     translated_caption = translator(caption)[0]['translation_text']
     translated_info = translator(info)[0]['translation_text']
 
@@ -77,7 +77,7 @@ def recognize_animal_and_get_info(image):
 
     return caption, info, audio_file, translated_caption, translated_info # Return all outputs
 
-# Define the Gradio interface with tabs
+# Define the Gradio interface with tabs for displaying results
 with gr.Blocks(css=".gradio-container { background-color: beige; }") as iface:
     gr.Markdown("# Animal Recognition")
     gr.Markdown("Upload an animal image to generate a caption and insights about the identified animal.")
@@ -95,7 +95,7 @@ with gr.Blocks(css=".gradio-container { background-color: beige; }") as iface:
     translated_caption_output = gr.Textbox(label="Translated Caption", interactive=False)
     translated_info_output = gr.Textbox(label="Translated Animal Insight", interactive=False)
 
-    # Define the action
+    # Define the action to take when an image is uploaded
     image_input.change(
         fn=recognize_animal_and_get_info,
         inputs=image_input,