File size: 8,175 Bytes
6502634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203c01e
6502634
 
 
 
7f5317b
 
6502634
7f5317b
6502634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203c01e
6502634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba62ec1
6502634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# -*- coding: utf-8 -*-
"""I/O 25: Radiology with MedGemma, Gemini Native TTS

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250521/auto/storage/goog4_request%26X-Goog-Date%3D20250521T170634Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4441930d90141e32bd35bf0fd9c6e0f2bd595d3f7bd8cc7bfba27ff7b748cbcc733510dcc0305f8c3287c046c839400e7dae360042459f12e4c3d17506d2b7216fa8d255dff5e5c32f9237a805460cb9bfd88ddf9e4667eaff48eb0f9fe329bd71acc2e6750ac73801f7ddcc55218bae1a50bf69cc93026abfa48ace82e44de442b3404141088839809add42482050efecbfd4e82b9bd154e28bb4e3c6fa765460abb8158d2006cc5989429408c0659c011e5b73fec46e6e384317c3305c16c6b0e1e69bb9f5872028a50cb676eae4a013f474e1c6f67bcda7eb52b8738450d88c8fb0c4b4e80c088004ba96e32dff67c91fbf53cbc4d38815f68c26e1a25793

# Google I/O 2025 Demo: Radiology with MedGemma & Gemini's Native TTS
## Built by [Rishiraj Acharya](https://www.linkedin.com/in/rishirajacharya/) (Google Developer Expert in Kaggle, Cloud, AI)

This demo showcases two of the exciting announcements from Google I/O 2025: **MedGemma** and **Gemini's native text-to-speech (TTS)**. It features a radiology voice assistant powered by MedGemma, which translates complex medical image reports into simple, understandable language. Combined with Gemini's natural-sounding TTS, the assistant provides an intuitive, voice-driven experience—highlighting key areas in radiology images and making medical insights more accessible.

### 🔐 Securing API Keys

We use secret tokens to authenticate with Hugging Face and Google’s Gemini APIs. This keeps our access safe and secure.
"""

import spaces
from google import genai
from google.genai import types
import os

# hf_token = os.getenv('HF_TOKEN')
# !huggingface-cli login --token $hf_token

# The Gemini API key is taken from the environment (None when the variable
# is unset) and handed to the shared client used for text-to-speech requests.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
client = genai.Client(api_key=gemini_api_key)

"""### 🧠 Loading MedGemma for Radiology Insights

Here, we load the **MedGemma** model—an image-text model tuned for medical contexts. We use 4-bit quantization to optimize performance and memory usage on GPU.
"""

from transformers import pipeline, BitsAndBytesConfig
import torch

# Model-loading options: bfloat16 weights placed on the first CUDA device,
# loaded with 4-bit quantization.
model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "cuda:0",
    "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
}

# Shared image-text-to-text pipeline backed by the MedGemma 4B instruct model.
pipe = pipeline(
    "image-text-to-text",
    model="google/medgemma-4b-it",
    model_kwargs=model_kwargs,
)

# Turn sampling off in the model's generation config.
pipe.model.generation_config.do_sample = False

"""### 🩻 Radiology Image Interpretation Logic

This function uses MedGemma to generate a plain-language report based on the provided prompt and image. It prepares a structured message and passes it to the model for inference.
"""

from PIL import Image

@spaces.GPU
def infer(prompt: str, image: Image.Image, system: str = None) -> str:
    """Generate a text report for an image with the MedGemma pipeline.

    Builds a chat-style message list (an optional system turn followed by a
    user turn holding the prompt text and the image) and runs it through the
    module-level ``pipe`` with a budget of 2048 new tokens.

    Args:
        prompt: Instruction or question to ask about the image.
        image: PIL image to analyse.
        system: Optional system prompt. It is left out of the conversation
            when empty or ``None``.

    Returns:
        The ``content`` of the last message in the conversation returned by
        the pipeline.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("infer() requires an image, got None")

    # The image is handed to the pipeline in memory. It is deliberately not
    # written to disk: a fixed filename would be shared (and overwritten) by
    # concurrent requests and the saved file was never read back.
    messages = []
    if system:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system}]
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": image}
        ]
    })

    output = pipe(text=messages, max_new_tokens=2048)
    # The pipeline returns the whole conversation; the final entry is the
    # newly generated reply.
    return output[0]["generated_text"][-1]["content"]

"""### 🔊 Prepare for Gemini's Native TTS

We define a helper function to convert Gemini’s audio output into a proper `.wav` file. This is key to bringing our radiology assistant’s voice to life!
"""

import wave

def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to ``filename`` as a WAV file.

    ``channels``, ``sample_width`` (bytes per sample) and ``rate`` (frames
    per second) are recorded in the WAV header; ``pcm`` is written unchanged
    as the frame data. Returns ``None``.
    """
    with wave.open(filename, "wb") as writer:
        # Header fields must be set before any frames are written.
        writer.setframerate(rate)
        writer.setsampwidth(sample_width)
        writer.setnchannels(channels)
        writer.writeframes(pcm)

"""### 🤖 Bringing It All Together

This function ties the image analysis and voice generation together. Based on user input, it fetches the image, generates the report using MedGemma, and speaks it out using Gemini's native TTS.
"""

import gradio as gr
import requests

def _do_predictions(text, image_file, image_url, source_type):
    """Generate a report for an image and synthesize it as speech.

    The image is either downloaded from ``image_url`` (when ``source_type``
    is ``"url"``) or taken from ``image_file``. The report text comes from
    ``infer`` and is then sent to the Gemini TTS model; the returned audio
    bytes are written to a WAV file with ``wave_file``'s default settings.

    Args:
        text: Prompt forwarded to ``infer``.
        image_file: Image from the file input; used unless ``source_type``
            is ``"url"``.
        image_url: URL to download the image from when ``source_type`` is
            ``"url"``.
        source_type: ``"url"`` to fetch ``image_url``; any other value uses
            ``image_file``.

    Returns:
        A ``(report, wav_path)`` tuple: the generated report text and the
        path of the WAV file holding the spoken report.

    Raises:
        gr.Error: If no image/URL was supplied, or the image could not be
            downloaded or decoded.
    """
    # Function-scope imports keep this block self-contained.
    import io
    import tempfile

    if source_type == "url":
        if not image_url or not image_url.strip():
            raise gr.Error("Please provide an image URL.")
        # NOTE(review): the URL is user-supplied and fetched server-side;
        # consider restricting allowed hosts/schemes if this is deployed
        # anywhere with access to internal services.
        try:
            resp = requests.get(
                image_url.strip(),
                headers={"User-Agent": "example"},
                timeout=30,  # never hang a queue worker on a dead host
            )
            resp.raise_for_status()
            # Decode from the fully downloaded body rather than the raw
            # socket stream, so the connection is released and any
            # transfer encoding has been handled by requests.
            image = Image.open(io.BytesIO(resp.content))
        except requests.RequestException as exc:
            raise gr.Error(f"Could not download the image: {exc}") from exc
        except OSError as exc:
            raise gr.Error(f"The URL did not return a readable image: {exc}") from exc
    else:
        image = image_file

    if image is None:
        raise gr.Error("Please upload an image.")

    report = infer(text, image)

    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=report,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name='Kore',
                    )
                )
            ),
        )
    )
    data = response.candidates[0].content.parts[0].inline_data.data

    # Use a unique file per request: a fixed name such as "out.wav" would be
    # overwritten when several queued requests finish close together.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        file_name = tmp.name
    wave_file(file_name, data)

    return report, file_name

"""### 🖼️ Interactive Web UI with Gradio

Finally, we build an easy-to-use interface using Gradio. Users can upload an image or provide a URL, type a prompt, and receive both a text and audio response powered by **MedGemma + Gemini TTS**.
"""

def toggle_image_src(choice):
    """Return visibility updates for the two image-source inputs.

    Yields a pair of ``gr.update`` objects: the first is visible only when
    ``choice`` is not ``"url"``, the second only when it is ``"url"``.
    """
    wants_url = choice == "url"
    return gr.update(visible=not wants_url), gr.update(visible=wants_url)

# Build the Gradio interface: inputs on the left, generated report and audio
# on the right, plus example inputs and a disclaimer.
with gr.Blocks() as demo:
    # Page header: title, author credit and a short description of the demo.
    gr.Markdown(
        """
        # Google I/O 2025 Demo: Radiology with MedGemma & Gemini's Native TTS
        ## Built by [Rishiraj Acharya](https://www.linkedin.com/in/rishirajacharya/) (Google Developer Expert in Kaggle, Cloud, AI)

        This demo showcases two of the exciting announcements from Google I/O 2025: **MedGemma** and **Gemini's native text-to-speech (TTS)**. It features a radiology voice assistant powered by MedGemma, which translates complex medical image reports into simple, understandable language. Combined with Gemini's natural-sounding TTS, the assistant provides an intuitive, voice-driven experience—highlighting key areas in radiology images and making medical insights more accessible.
        """
    )
    with gr.Row():
        # Input column: prompt, image source selector, image inputs, button.
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Instructions", lines=2, interactive=True)
                with gr.Column():
                    # Selects which of the two image inputs below is used;
                    # starts on "file".
                    radio = gr.Radio(["file", "url"], value="file",
                                        label="Input Image Source")
                    # Only one of these is visible at a time; the URL box
                    # starts hidden to match the radio's initial value.
                    image_file = gr.Image(label="File", type="pil", visible=True)
                    image_url = gr.Textbox(label="URL", visible=False)
            with gr.Row():
                submit = gr.Button("Generate")
        # Output column: report text and its spoken version.
        with gr.Column():
            output = gr.Textbox(label="Generated Report")
            audio_output = gr.Audio(label="Generated Report (wav)")
    # Clicking "Generate" runs the full pipeline and fills both outputs.
    submit.click(_do_predictions, inputs=[text, image_file, image_url, radio],
                    outputs=[output, audio_output])
    # Changing the radio swaps which image input is visible.
    radio.change(toggle_image_src, radio, [image_file, image_url], queue=False, show_progress=False)
    gr.Examples(
        fn=_do_predictions,
        examples=[
                # NOTE(review): this example image is downloaded while the UI
                # is being built, with no timeout or status check — a network
                # failure here would stop the app from starting.
                ["Describe this X-ray", Image.open(requests.get("https://google-rad-explain.hf.space/static/images/Effusion2.jpg", headers={"User-Agent": "example"}, stream=True).raw), None, "file"],
                # The URL example leaves the file input empty and lets
                # _do_predictions fetch the image when the example is run.
                ["Describe this CT",  None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
            ],
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output]
    )
    gr.Markdown("""
    ### Disclaimer
    This demonstration is for illustrative purposes only. It is not intended to diagnose or suggest treatment of any disease or condition, and should not be used for medical advice.
    """)

    # Enable the request queue (max_size evaluates to 32) and start the app.
    # NOTE(review): this call is indented inside the `with gr.Blocks()` block,
    # so it runs before the context exits, and it runs on import because there
    # is no `__main__` guard — confirm both are intended.
    # NOTE(review): share=True presumably requests a public share link —
    # confirm it is needed in the deployment environment.
    demo.queue(max_size=8 * 4).launch(share=True)