Norahsal committed on
Commit
1ae1a92
1 Parent(s): 2334683

Update app.py

Files changed (1)
  1. app.py +16 -16
app.py CHANGED
@@ -3,20 +3,20 @@ import gradio as gr
 import torch
 import scipy.io.wavfile as wavfile
 
-# Load translation model
+# Load translation model to translate English text to Arabic
 translator = pipeline("translation_en_to_ar", model="Helsinki-NLP/opus-mt-en-ar")
 
-# Load the pre-trained BLIP image captioning model and processor
+# Load the BLIP image captioning model and processor
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Load the question-answering model for information retrieval
+# Load a question-answering model for retrieving information
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
-# Load the text-to-speech model
+# Load a text-to-speech model for generating audio from text
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 
-# Combined context for all animals
+# Combined context for known animals
 combined_context = """
 Tigers are the largest species among the Felidae and classified in the genus Panthera. They are apex predators, primarily preying on ungulates such as deer and wild boar.
 Elephants are the largest land animals on Earth. They are known for their large ears, tusks made of ivory, and their trunks.
@@ -33,34 +33,34 @@ Pandas are large bears native to China, distinguished by their black and white c
 """
 
 # List of known animal names for extraction
-animal_names = ["tiger", "elephant", "deer", "lion", "penguin", "dog", "cat", "giraffe", "zebra","dolphin","Pandas","Crocodiles","Pandas"]
+animal_names = ["tiger", "elephant", "deer", "lion", "penguin", "dog", "cat", "giraffe", "zebra", "dolphin", "panda", "crocodile"]
 
-# Function to extract animal name from the caption
+# Function to extract animal name from the generated caption
 def extract_animal_from_caption(caption):
     for animal in animal_names:
         if animal in caption.lower():
             return animal
     return None
 
-# Function to generate audio from text
+# Function to generate audio from text using the text-to-speech model
 def generate_audio(text):
     # Generate the narrated text
     narrated_text = narrator(text)
-    # Save the audio to WAV file
+    # Save the audio to a WAV file
     wavfile.write("output.wav", rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
     return "output.wav"
 
-# Function to recognize animal and get info, including translation
+# Function to recognize the animal in the image and provide relevant information
 def recognize_animal_and_get_info(image):
-    # Step 1: Generate a caption for the image using BLIP
+    # Step 1: Generate a caption for the uploaded image using BLIP
     inputs = blip_processor(images=image, return_tensors="pt")
     caption_ids = blip_model.generate(**inputs)
     caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
 
-    # Step 2: Extract the animal name from the caption
+    # Step 2: Extract the animal name from the generated caption
     animal_name = extract_animal_from_caption(caption)
 
-    # Step 3: Use the QA model to provide information based on the combined context
+    # Step 3: Use the QA model to retrieve information based on the combined context
     if animal_name:
         question = f"Describe a {animal_name}?"
         answer = qa_model(question=question, context=combined_context)
@@ -68,7 +68,7 @@ def recognize_animal_and_get_info(image):
     else:
         info = "Sorry, I couldn't identify the animal in the image."
 
-    # Translate caption and info to Arabic
+    # Translate both the caption and the information to Arabic
     translated_caption = translator(caption)[0]['translation_text']
     translated_info = translator(info)[0]['translation_text']
 
@@ -77,7 +77,7 @@ def recognize_animal_and_get_info(image):
 
     return caption, info, audio_file, translated_caption, translated_info # Return all outputs
 
-# Define the Gradio interface with tabs
+# Define the Gradio interface with tabs for displaying results
 with gr.Blocks(css=".gradio-container { background-color: beige; }") as iface:
     gr.Markdown("# Animal Recognition")
     gr.Markdown("Upload an animal image to generate a caption and insights about the identified animal.")
@@ -95,7 +95,7 @@ with gr.Blocks(css=".gradio-container { background-color: beige; }") as iface:
     translated_caption_output = gr.Textbox(label="Translated Caption", interactive=False)
     translated_info_output = gr.Textbox(label="Translated Animal Insight", interactive=False)
 
-    # Define the action on image upload
+    # Define the action to take when an image is uploaded
     image_input.change(
         fn=recognize_animal_and_get_info,
         inputs=image_input,
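
Besides the comment rewording, the one behavioral change in this commit is the cleaned-up `animal_names` list: the old entries `"Pandas"` and `"Crocodiles"` were capitalized (and one was duplicated), so they could never match the lowercased caption. A minimal sketch of the matching logic, not part of the commit and using a shortened list for illustration:

```python
# Minimal sketch (not part of the commit): why the animal_names entries must be
# lowercase. The caption is lowercased before substring matching, so a
# capitalized entry like "Pandas" would never match.
animal_names = ["tiger", "elephant", "panda", "crocodile"]  # shortened for illustration

def extract_animal_from_caption(caption):
    for animal in animal_names:
        if animal in caption.lower():
            return animal
    return None

print(extract_animal_from_caption("a panda bear sitting on a rock"))  # -> "panda"
print(extract_animal_from_caption("Pandas eating bamboo"))            # -> "panda"
# With the old capitalized entry "Pandas", both captions would return None.
```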
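The reworded comment in `generate_audio` sits on code that depends on the structure of the text-to-speech pipeline output. A minimal standalone sketch, assuming the same `transformers` and `scipy` dependencies as app.py:

```python
# Minimal sketch (not part of the commit): the text-to-speech pipeline returns a
# dict with "audio" (a batched waveform array) and "sampling_rate", which is why
# app.py indexes narrated_text["audio"][0] before writing the WAV file.
from transformers import pipeline
import scipy.io.wavfile as wavfile

narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
narrated_text = narrator("Tigers are apex predators.")
wavfile.write("output.wav",
              rate=narrated_text["sampling_rate"],
              data=narrated_text["audio"][0])
```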