Update app.py
app.py CHANGED
@@ -3,20 +3,20 @@ import gradio as gr
 import torch
 import scipy.io.wavfile as wavfile
 
-# Load translation model
+# Load translation model to translate English text to Arabic
 translator = pipeline("translation_en_to_ar", model="Helsinki-NLP/opus-mt-en-ar")
 
-# Load the
+# Load the BLIP image captioning model and processor
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Load
+# Load a question-answering model for retrieving information
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
-# Load
+# Load a text-to-speech model for generating audio from text
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 
-# Combined context for
+# Combined context for known animals
 combined_context = """
 Tigers are the largest species among the Felidae and classified in the genus Panthera. They are apex predators, primarily preying on ungulates such as deer and wild boar.
 Elephants are the largest land animals on Earth. They are known for their large ears, tusks made of ivory, and their trunks.
@@ -33,34 +33,34 @@ Pandas are large bears native to China, distinguished by their black and white c
 """
 
 # List of known animal names for extraction
-animal_names = ["tiger", "elephant", "deer", "lion", "penguin", "dog", "cat", "giraffe", "zebra","dolphin","
+animal_names = ["tiger", "elephant", "deer", "lion", "penguin", "dog", "cat", "giraffe", "zebra", "dolphin", "panda", "crocodile"]
 
-# Function to extract animal name from the caption
+# Function to extract animal name from the generated caption
 def extract_animal_from_caption(caption):
     for animal in animal_names:
         if animal in caption.lower():
             return animal
     return None
 
-# Function to generate audio from text
+# Function to generate audio from text using the text-to-speech model
 def generate_audio(text):
     # Generate the narrated text
     narrated_text = narrator(text)
-    # Save the audio to WAV file
+    # Save the audio to a WAV file
    wavfile.write("output.wav", rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
     return "output.wav"
 
-# Function to recognize animal and
+# Function to recognize the animal in the image and provide relevant information
 def recognize_animal_and_get_info(image):
-    # Step 1: Generate a caption for the image using BLIP
+    # Step 1: Generate a caption for the uploaded image using BLIP
     inputs = blip_processor(images=image, return_tensors="pt")
     caption_ids = blip_model.generate(**inputs)
     caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
 
-    # Step 2: Extract the animal name from the caption
+    # Step 2: Extract the animal name from the generated caption
     animal_name = extract_animal_from_caption(caption)
 
-    # Step 3: Use the QA model to
+    # Step 3: Use the QA model to retrieve information based on the combined context
     if animal_name:
         question = f"Describe a {animal_name}?"
         answer = qa_model(question=question, context=combined_context)
@@ -68,7 +68,7 @@ def recognize_animal_and_get_info(image):
     else:
         info = "Sorry, I couldn't identify the animal in the image."
 
-    # Translate caption and
+    # Translate both the caption and the information to Arabic
     translated_caption = translator(caption)[0]['translation_text']
     translated_info = translator(info)[0]['translation_text']
 
@@ -77,7 +77,7 @@ def recognize_animal_and_get_info(image):
 
     return caption, info, audio_file, translated_caption, translated_info # Return all outputs
 
-# Define the Gradio interface with tabs
+# Define the Gradio interface with tabs for displaying results
 with gr.Blocks(css=".gradio-container { background-color: beige; }") as iface:
     gr.Markdown("# Animal Recognition")
     gr.Markdown("Upload an animal image to generate a caption and insights about the identified animal.")
@@ -95,7 +95,7 @@ with gr.Blocks(css=".gradio-container { background-color: beige; }") as iface:
     translated_caption_output = gr.Textbox(label="Translated Caption", interactive=False)
     translated_info_output = gr.Textbox(label="Translated Animal Insight", interactive=False)
 
-    # Define the action
+    # Define the action to take when an image is uploaded
     image_input.change(
         fn=recognize_animal_and_get_info,
         inputs=image_input,