File size: 5,371 Bytes
0094af2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
import os 
import gradio as gr
import tempfile


def get_credentials():
    """Materialize the service-account JSON from the environment into a file.

    Reads the raw credentials string from the
    GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable and writes it to
    a new temporary .json file, returning that file's path (suitable for the
    GOOGLE_APPLICATION_CREDENTIALS variable that Google client libraries read).

    Returns:
        str: Path to the newly created credentials file.

    Raises:
        ValueError: If GOOGLE_APPLICATION_CREDENTIALS_JSON is not set.
    """
    raw_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
    if raw_credentials is None:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")

    # delete=False keeps the file on disk after the handle closes, because the
    # Google SDK later opens it by path.
    handle = tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json")
    try:
        handle.write(raw_credentials)
    finally:
        handle.close()
    return handle.name
    
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= get_credentials()

def get_matching_format(file_path):
    """Map a file path's extension to the MIME type string Vertex AI expects.

    Args:
        file_path: Path or filename; only the text after the last '.' is used,
            case-insensitively.

    Returns:
        The MIME type string for a supported extension, otherwise None.
    """
    file_formats = {
        "png": "image/png",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",  # common JPEG extension
        "aac": "audio/aac",
        "flac": "audio/flac",
        "mp3": "audio/mp3",
        "m4a": "audio/m4a",
        "mpeg": "audio/mpeg",
        "mpga": "audio/mpga",
        "mp4": "audio/mp4",
        "opus": "audio/opus",
        "pcm": "audio/pcm",
        "wav": "audio/wav",
        # BUG FIX: this previously mapped to the list
        # ["audio/webm", "video/webm"], which is not a valid mime_type value
        # for Part.from_data and would break every .webm upload.
        # Default to the video type.
        "webm": "video/webm",
        "flv": "video/x-flv",
        "mov": "video/mov",
        "mpegps": "video/mpegps",
        "mpg": "video/mpg",
        "3gpp": "video/3gpp",
        "wmv": "video/wmv",
        "pdf": "application/pdf",
    }

    # Extract the extension and look it up; .get returns None when unsupported.
    file_extension = file_path.split('.')[-1].lower()
    return file_formats.get(file_extension)

def encode_file(file_path):
    """Base64-encode a file's contents if its type is supported.

    Args:
        file_path: Path to the file to encode.

    Returns:
        The base64 payload decoded to a UTF-8 str, or None when the file's
        extension is not recognized by get_matching_format.
    """
    # Guard clause: unsupported extensions yield no payload.
    if get_matching_format(file_path) is None:
        return None

    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

def multiturn_generate_content(file_path, user_query):
    """Send an uploaded file plus a text query to Gemini and return its reply.

    Args:
        file_path: Path to the uploaded media/document file.
        user_query: The user's question about the file.

    Returns:
        The model's response text, or the sentinel "Model Error" when the
        file's extension is not a supported type.
    """
    mime_type = get_matching_format(file_path)
    if mime_type is None:
        # Unsupported extension; surface the same sentinel the UI expects.
        return "Model Error"

    # Read the raw bytes once. The previous implementation base64-encoded the
    # file via encode_file() and then immediately base64-decoded it again
    # before handing it to Part.from_data — a pointless round-trip.
    with open(file_path, 'rb') as handle:
        file_bytes = handle.read()

    # NOTE(review): project id is hard-coded — consider moving it to
    # configuration or an environment variable.
    vertexai.init(project="imgcp-ff81e7053b072ce5", location="us-central1")

    model = GenerativeModel(
        "gemini-1.5-flash-001",
    )
    chat = model.start_chat()

    doc = Part.from_data(
        mime_type=mime_type,
        data=file_bytes,
    )
    return chat.send_message(
        [doc, user_query],
        generation_config={
            "max_output_tokens": 8192,
            "temperature": 1,
            "top_p": 0.95,
        }
    ).text




demo = gr.Blocks()


with demo:
    gr.Blocks(theme="base")
    # gr.Markdown("")
    
    with gr.Tabs():
        with gr.TabItem("Use Cases"):
            gr.Markdown("""<h1>Gemini Multimodal</h1>""")
            gr.Markdown("""<b>This Model performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video. It's adept at processing visual and text inputs such as photographs, documents, infographics, and screenshots.</b>""")
            
            gr.Markdown("""<ul>
                        <li><b>Visual Information Seeking:</b> Use external knowledge combined with information extracted from the input image or video to answer questions.</li>
                        <li><b>Object Recognition:</b> Answer questions related to fine-grained identification of the objects in images and videos.</li>
                        <li><b>Digital Content Understanding:</b> Answer questions and extract information from visual content like infographics, charts, figures, tables, and web pages.</li>
                        <li><b>Structured Content Generation:</b> Generate responses based on multimodal inputs in formats like HTML and JSON.</li>
                        <li><b>Captioning and Description:</b> Generate descriptions of images and videos with varying levels of detail.</li>
                        <li><b>Reasoning:</b> Compositionally infer new information without memorization or retrieval.</li>
                        <li><b>Audio:</b> Analyze speech files for summarization, transcription, and Q&A.</li>
                        <li><b>Multimodal Processing:</b> Process multiple types of input media at the same time, such as video and audio input.</li>
                        </ul>""")

        with gr.TabItem("Upload"):
            gr.Markdown("""<b>Note: Please upload the file and submit your query in the next tab.</b>""")
            with gr.Row():
                filepath = gr.File(type='filepath')

        with gr.TabItem("Chat"):
            with gr.Column():
                text_input_one = gr.Textbox(lines=15, show_label=False, container=True)
                image_output = gr.Textbox(show_label=False, min_width=120)
            text_button_one = gr.Button("Submit")
            

    # text_button.click(encode_file, inputs=text_input)
    text_button_one.click(multiturn_generate_content, inputs=[filepath, image_output], outputs=text_input_one)

demo.launch(debug=True)