import base64
import os
import tempfile

import gradio as gr
import vertexai
from vertexai.generative_models import GenerativeModel, Part


def get_credentials():
    """Write the service-account JSON from the environment to a temp file and return its path."""
    creds_json_str = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
    if creds_json_str is None:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")

    # Write the JSON credentials to a temporary file so the Google SDK can read them from disk.
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as temp:
        temp.write(creds_json_str)
        temp_filename = temp.name

    return temp_filename


os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()


def get_matching_format(file_path):
    """Map a file extension to the MIME type expected by the Gemini API, or None if unsupported."""
    file_formats = {
        "png": "image/png",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",  # jpg is the most common JPEG extension
        "aac": "audio/aac",
        "flac": "audio/flac",
        "mp3": "audio/mp3",
        "m4a": "audio/m4a",
        "mpeg": "audio/mpeg",
        "mpga": "audio/mpga",
        "mp4": "audio/mp4",
        "opus": "audio/opus",
        "pcm": "audio/pcm",
        "wav": "audio/wav",
        "webm": "video/webm",  # webm can carry audio or video; the API needs a single MIME string
        "flv": "video/x-flv",
        "mov": "video/mov",
        "mpegps": "video/mpegps",
        "mpg": "video/mpg",
        "3gpp": "video/3gpp",
        "wmv": "video/wmv",
        "pdf": "application/pdf",
    }

    # Extract the file extension and look up the matching MIME type.
    file_extension = file_path.split(".")[-1].lower()
    return file_formats.get(file_extension)


def encode_file(file_path):
    """Read a supported file and return its contents as a base64-encoded string."""
    mime_type = get_matching_format(file_path)
    if mime_type is None:
        return None
    with open(file_path, "rb") as file:
        file_content = file.read()
    return base64.b64encode(file_content).decode("utf-8")


def multiturn_generate_content(file_path, user_query):
    """Send the uploaded file plus the user's query to Gemini and return the response text."""
    encoded_string = encode_file(file_path)
    mime_type = get_matching_format(file_path)
    if encoded_string is None:
        return "Model Error: unsupported or unreadable file"

    vertexai.init(project="imgcp-ff81e7053b072ce5", location="us-central1")
    model = GenerativeModel("gemini-1.5-flash-001")
    chat = model.start_chat()
    # Decode back to raw bytes; Part.from_data expects the file's binary content.
    doc = Part.from_data(mime_type=mime_type, data=base64.b64decode(encoded_string))
    return chat.send_message(
        [doc, user_query],
        generation_config={
            "max_output_tokens": 8192,
            "temperature": 1,
            "top_p": 0.95,
        },
    ).text


with gr.Blocks(theme="base") as demo:
    with gr.Tabs():
        with gr.TabItem("Use Cases"):
            gr.Markdown("""

# Gemini Multimodal

""") gr.Markdown("""This Model performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video. It's adept at processing visual and text inputs such as photographs, documents, infographics, and screenshots.""") gr.Markdown("""""") with gr.TabItem("Upload"): gr.Markdown("""Note: Please upload the file and submit your query in the next tab.""") with gr.Row(): filepath = gr.File(type='filepath') with gr.TabItem("Chat"): with gr.Column(): text_input_one = gr.Textbox(lines=15, show_label=False, container=True) image_output = gr.Textbox(show_label=False, min_width=120) text_button_one = gr.Button("Submit") # text_button.click(encode_file, inputs=text_input) text_button_one.click(multiturn_generate_content, inputs=[filepath, image_output], outputs=text_input_one) demo.launch(debug=True)