rishiraj committed on
Commit
6502634
·
verified ·
1 Parent(s): 7f9eae9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """I/O 25: Radiology with MedGemma, Gemini Native TTS
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/rishirajacharya/i-o-25-radiology-with-medgemma-gemini-native-tts.b5cf5dca-3453-45b1-b7c0-ec7c22aedf1b.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250521/auto/storage/goog4_request%26X-Goog-Date%3D20250521T170634Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4441930d90141e32bd35bf0fd9c6e0f2bd595d3f7bd8cc7bfba27ff7b748cbcc733510dcc0305f8c3287c046c839400e7dae360042459f12e4c3d17506d2b7216fa8d255dff5e5c32f9237a805460cb9bfd88ddf9e4667eaff48eb0f9fe329bd71acc2e6750ac73801f7ddcc55218bae1a50bf69cc93026abfa48ace82e44de442b3404141088839809add42482050efecbfd4e82b9bd154e28bb4e3c6fa765460abb8158d2006cc5989429408c0659c011e5b73fec46e6e384317c3305c16c6b0e1e69bb9f5872028a50cb676eae4a013f474e1c6f67bcda7eb52b8738450d88c8fb0c4b4e80c088004ba96e32dff67c91fbf53cbc4d38815f68c26e1a25793
8
+
9
+ # Google I/O 2025 Demo: Radiology with MedGemma & Gemini's Native TTS
10
+ ## Built by [Rishiraj Acharya](https://www.linkedin.com/in/rishirajacharya/) (Google Developer Expert in Kaggle, Cloud, AI)
11
+
12
+ This demo showcases two of the exciting announcements from Google I/O 2025: **MedGemma** and **Gemini's native text-to-speech (TTS)**. It features a radiology voice assistant powered by MedGemma, which translates complex medical image reports into simple, understandable language. Combined with Gemini's natural-sounding TTS, the assistant provides an intuitive, voice-driven experience—highlighting key areas in radiology images and making medical insights more accessible.
13
+
14
+ ### 🔐 Securing API Keys
15
+
16
+ We use secret tokens to authenticate with Hugging Face and Google’s Gemini APIs. This keeps our access safe and secure.
17
+ """
18
+
19
+ from google import genai
20
+ from google.genai import types
21
+ import os
22
+
23
import subprocess

# Both credentials are read from the environment so they never appear in the
# source file.
hf_token = os.getenv('HF_TOKEN')
gemini_api_key = os.getenv('GEMINI_API_KEY')

# The notebook used the IPython shell escape `!huggingface-cli login ...`,
# which is a syntax error in a plain Python file. Run the same command through
# subprocess with an argument list (no shell), and skip it when no token is set.
if hf_token:
    try:
        # check=False keeps the notebook's best-effort behavior: a failed
        # login does not abort start-up here.
        subprocess.run(
            ["huggingface-cli", "login", "--token", hf_token],
            check=False,
        )
    except FileNotFoundError:
        print("huggingface-cli not found on PATH; skipping Hugging Face login.")

client = genai.Client(api_key=gemini_api_key)
28
+
29
+ """### 🧠 Loading MedGemma for Radiology Insights
30
+
31
+ Here, we load the **MedGemma** model—an image-text model tuned for medical contexts. We use 4-bit quantization to optimize performance and memory usage on GPU.
32
+ """
33
+
34
+ from transformers import pipeline, BitsAndBytesConfig
35
+ import torch
36
+
37
# Loading options handed to the underlying model: bfloat16 compute dtype,
# placement on the first CUDA device, and 4-bit quantized weights.
model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "cuda:0",
    "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
}

# Image + text -> text pipeline backed by the MedGemma 4B instruction-tuned
# checkpoint.
pipe = pipeline(
    "image-text-to-text",
    model="google/medgemma-4b-it",
    model_kwargs=model_kwargs,
)

# Turn sampling off in the model's generation config.
pipe.model.generation_config.do_sample = False
40
+
41
+ """### 🩻 Radiology Image Interpretation Logic
42
+
43
+ This function uses MedGemma to generate a plain-language report based on the provided prompt and image. It prepares a structured message and passes it to the model for inference.
44
+ """
45
+
46
+ from PIL import Image
47
+
48
def infer(prompt: str, image: "Image.Image", system: "str | None" = None) -> str:
    """Ask the MedGemma pipeline about ``image`` and return its text reply.

    Args:
        prompt: User instruction sent alongside the image.
        image: Image passed to the pipeline as the ``"image"`` content item.
        system: Optional system instruction; when truthy it is sent as a
            leading ``"system"`` message.

    Returns:
        The ``"content"`` of the last message in the first output's
        ``"generated_text"``.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("infer() requires an image, got None")

    # The image is handed to the pipeline in memory. The previous version also
    # wrote it to a shared "image.png" that was never read back, which cost
    # disk I/O and let concurrent requests overwrite each other's file.
    messages = []
    if system:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system}],
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": image},
        ],
    })

    output = pipe(text=messages, max_new_tokens=2048)
    return output[0]["generated_text"][-1]["content"]
70
+
71
+ """### 🔊 Prepare for Gemini's Native TTS
72
+
73
+ We define a helper function to convert Gemini’s audio output into a proper `.wav` file. This is key to bringing our radiology assistant’s voice to life!
74
+ """
75
+
76
+ import wave
77
+
78
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM audio to ``filename`` as a WAV file.

    Args:
        filename: Destination: a ``str`` path, an ``os.PathLike`` object, or an
            already-open binary file object.
        pcm: Bytes-like raw audio frames, passed unchanged to ``writeframes``.
        channels: Channel count written to the WAV header.
        rate: Frame rate written to the WAV header.
        sample_width: Sample width in bytes written to the WAV header.
    """
    # Some Python versions' ``wave.open`` treats anything that is not a ``str``
    # as a file object, so convert path-like objects to a plain path first.
    if isinstance(filename, os.PathLike):
        filename = os.fspath(filename)

    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm)
84
+
85
+ """### 🤖 Bringing It All Together
86
+
87
+ This function ties the image analysis and voice generation together. Based on user input, it fetches the image, generates the report using MedGemma, and speaks it out using Gemini's native TTS.
88
+ """
89
+
90
+ import gradio as gr
91
+ import requests
92
+
93
import io
import tempfile


def _load_image_from_url(image_url):
    """Download ``image_url`` and decode the response body into a PIL image."""
    if not image_url or not image_url.strip():
        raise gr.Error("Please provide an image URL.")

    # NOTE(security): the URL is user-supplied and fetched from the server.
    # Restrict allowed schemes/hosts before deploying this beyond a demo.
    try:
        resp = requests.get(
            image_url.strip(),
            headers={"User-Agent": "example"},
            timeout=30,  # never hang a queue worker on an unresponsive host
        )
        resp.raise_for_status()
    except requests.RequestException as err:
        raise gr.Error(f"Could not download the image: {err}") from err

    try:
        # Decode from the fully-read body instead of the raw socket stream, so
        # the connection is released and content-encoding is already handled.
        image = Image.open(io.BytesIO(resp.content))
        image.load()
    except OSError as err:
        raise gr.Error(f"The URL did not return a readable image: {err}") from err
    return image


def _do_predictions(text, image_file, image_url, source_type):
    """Generate a MedGemma report for an image and speak it with Gemini TTS.

    Args:
        text: Instruction forwarded to ``infer`` as the prompt.
        image_file: Image used when ``source_type`` is not ``"url"``.
        image_url: Address downloaded when ``source_type`` is ``"url"``.
        source_type: ``"url"`` to fetch ``image_url``; anything else uses
            ``image_file``.

    Returns:
        A ``(report, wav_path)`` tuple: the generated text and the path of the
        WAV file holding the synthesized speech.

    Raises:
        gr.Error: If no usable image was supplied or the download failed.
    """
    if source_type == "url":
        image = _load_image_from_url(image_url)
    else:
        image = image_file
    if image is None:
        raise gr.Error("Please provide an image before generating a report.")

    report = infer(text, image)

    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=report,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name='Kore',
                    )
                )
            ),
        )
    )

    data = response.candidates[0].content.parts[0].inline_data.data

    # Write each result to its own file; a fixed "out.wav" would be
    # overwritten when two queued requests finish close together.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        file_name = tmp.name
    wave_file(file_name, data)

    return report, file_name
120
+
121
+ """### 🖼️ Interactive Web UI with Gradio
122
+
123
+ Finally, we build an easy-to-use interface using Gradio. Users can upload an image or provide a URL, type a prompt, and receive both a text and audio response powered by **MedGemma + Gemini TTS**.
124
+ """
125
+
126
def toggle_image_src(choice):
    """Return visibility updates for the (file, URL) image inputs.

    The first update targets the file widget and the second the URL widget;
    exactly one of them is made visible, depending on whether ``choice`` is
    ``"url"``.
    """
    wants_url = choice == "url"
    return gr.update(visible=not wants_url), gr.update(visible=wants_url)
131
+
132
import io


def _fetch_example_image(url):
    """Download ``url`` and return it as a PIL image for the examples table."""
    # A timeout and status check keep start-up from hanging or from trying to
    # decode an error page as an image.
    resp = requests.get(url, headers={"User-Agent": "example"}, timeout=30)
    resp.raise_for_status()
    return Image.open(io.BytesIO(resp.content))


# NOTE(review): widget nesting below was reconstructed from a flattened
# source; confirm the Row/Column layout against the running app.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Google I/O 2025 Demo: Radiology with MedGemma & Gemini's Native TTS

        This demo showcases two of the exciting announcements from Google I/O 2025: **MedGemma** and **Gemini's native text-to-speech (TTS)**. It features a radiology voice assistant powered by MedGemma, which translates complex medical image reports into simple, understandable language. Combined with Gemini's natural-sounding TTS, the assistant provides an intuitive, voice-driven experience—highlighting key areas in radiology images and making medical insights more accessible.
        """
    )
    with gr.Row():
        # Left column: prompt, image source selection, and the submit button.
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Instructions", lines=2, interactive=True)
                with gr.Column():
                    radio = gr.Radio(["file", "url"], value="file",
                                     label="Input Image Source")
                    image_file = gr.Image(label="File", type="pil", visible=True)
                    image_url = gr.Textbox(label="URL", visible=False)
            with gr.Row():
                submit = gr.Button("Generate")
        # Right column: generated report as text and as audio.
        with gr.Column():
            output = gr.Textbox(label="Generated Report")
            audio_output = gr.Audio(label="Generated Report (wav)")
    submit.click(_do_predictions, inputs=[text, image_file, image_url, radio],
                 outputs=[output, audio_output])
    # Swap which image input is shown whenever the source radio changes.
    radio.change(toggle_image_src, radio, [image_file, image_url], queue=False, show_progress=False)
    gr.Examples(
        fn=_do_predictions,
        examples=[
            ["Describe this X-ray", _fetch_example_image("https://google-rad-explain.hf.space/static/images/Effusion2.jpg"), None, "file"],
            ["Describe this CT", None, "https://google-rad-explain.hf.space/static/images/CT-Tumor.jpg", "url"],
        ],
        inputs=[text, image_file, image_url, radio],
        outputs=[output, audio_output]
    )
    gr.Markdown("""
        ### Disclaimer
        This demonstration is for illustrative purposes only. It is not intended to diagnose or suggest treatment of any disease or condition, and should not be used for medical advice.
        """)

demo.queue(max_size=8 * 4).launch(share=True)