File size: 4,888 Bytes
d670229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import streamlit as st
import openai
import os
import base64
import cv2
from moviepy.editor import VideoFileClip

# Credentials are taken from the process environment; either value is None
# when the corresponding variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')
openai.organization = os.environ.get('OPENAI_ORG_ID')

# Model name passed to every chat completion request in this script.
MODEL = "gpt-4o"

def process_text():
    """Render a text box and, once it has content, show the model's answer.

    The entered text is embedded in a fixed "Could you solve ...?" user
    prompt, sent to the chat model named by MODEL, and the first choice of
    the reply is written to the page prefixed with "Assistant: ".
    Nothing is sent while the text box is empty.
    """
    question = st.text_input("Enter your text:")
    if not question:
        return

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Help me with my math homework!",
    }
    user_message = {
        "role": "user",
        "content": f"Hello! Could you solve {question}?",
    }
    completion = openai.ChatCompletion.create(
        model=MODEL,
        messages=[system_message, user_message],
    )
    answer = completion.choices[0].message.content
    st.write("Assistant: " + answer)

def process_image(image_input):
    """Ask the chat model for the triangle's area in an uploaded image.

    The image bytes are base64-encoded into a data URL and sent together
    with a fixed question; the first choice of the reply is rendered as
    Markdown.

    Args:
        image_input: The uploaded image (file-like object), or a falsy value
            when nothing has been uploaded yet, in which case the function
            does nothing.
    """
    import mimetypes  # local import keeps this block self-contained

    if not image_input:
        return

    # Prefer getvalue() when the object offers it so the result does not
    # depend on the current stream position; otherwise fall back to read().
    if hasattr(image_input, "getvalue"):
        image_bytes = image_input.getvalue()
    else:
        image_bytes = image_input.read()
    base64_image = base64.b64encode(image_bytes).decode("utf-8")

    # The uploader accepts jpg/jpeg/png, so the data URL must carry the real
    # media type instead of always claiming PNG. Use the type reported by the
    # upload object if present, then guess from the file name, and only then
    # fall back to the previous hard-coded default.
    mime_type = getattr(image_input, "type", None)
    if not mime_type:
        mime_type, _ = mimetypes.guess_type(getattr(image_input, "name", "") or "")
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    response = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
            {"role": "user", "content": [
                {"type": "text", "text": "What's the area of the triangle?"},
                {"type": "image_url", "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"}
                }
            ]}
        ],
        temperature=0.0,
    )
    st.markdown(response.choices[0].message.content)

def process_audio(audio_input):
    """Transcribe an uploaded audio file and render a Markdown summary.

    The audio is transcribed with the "whisper-1" model, the transcript is
    sent to the chat model named by MODEL with a summarisation prompt, and
    the first choice of the reply is rendered as Markdown.

    Args:
        audio_input: The uploaded audio (file-like object with a ``name``),
            or a falsy value when nothing has been uploaded yet, in which
            case the function does nothing.
    """
    if not audio_input:
        return

    # The rest of this file uses the legacy module-level SDK interface
    # (openai.ChatCompletion). In that interface the transcription entry
    # point is openai.Audio.transcribe(model, file); the previously used
    # openai.Audio.transcriptions attribute does not exist there.
    # NOTE(review): assumes the pre-1.0 openai package is installed - confirm.
    transcription = openai.Audio.transcribe("whisper-1", audio_input)
    transcript_text = transcription["text"]

    response = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."},
            {"role": "user", "content": [
                {"type": "text", "text": f"The audio transcription is: {transcript_text}"}
            ]},
        ],
        temperature=0,
    )
    st.markdown(response.choices[0].message.content)

def process_video(video_input):
    """Summarise an uploaded video from sampled frames and its audio track.

    Frames and an audio file are obtained from process_video_frames(), the
    audio is transcribed with "whisper-1", and frames plus transcript are
    sent to the chat model named by MODEL. The first choice of the reply is
    rendered as Markdown.

    Args:
        video_input: The uploaded video, or a falsy value when nothing has
            been uploaded yet, in which case the function does nothing.
    """
    if not video_input:
        return

    base64Frames, audio_path = process_video_frames(video_input)

    # Close the audio file as soon as the upload to the transcription
    # endpoint has finished instead of leaking the handle.
    # openai.Audio.transcribe is the transcription call of the legacy
    # module-level SDK interface used throughout this file; the previously
    # used openai.Audio.transcriptions attribute does not exist there.
    # NOTE(review): assumes the pre-1.0 openai package is installed - confirm.
    with open(audio_path, "rb") as audio_file:
        transcription = openai.Audio.transcribe("whisper-1", audio_file)
    transcript_text = transcription["text"]

    # Every entry of a multi-part message is a typed part, so the intro
    # sentence is sent as a text part rather than a bare string. Frames are
    # JPEG-encoded, hence the registered "image/jpeg" media type.
    frame_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"},
        }
        for frame in base64Frames
    ]
    response = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"},
            {"role": "user", "content": [
                {"type": "text", "text": "These are the frames from the video."},
                *frame_parts,
                {"type": "text", "text": f"The audio transcription is: {transcript_text}"}
            ]},
        ],
        temperature=0,
    )
    st.markdown(response.choices[0].message.content)

def process_video_frames(video_path, seconds_per_frame=2):
    """Sample JPEG frames from a video and extract its audio track to MP3.

    Args:
        video_path: One of
            * an in-memory upload exposing ``getvalue()`` and ``name``; its
              bytes are spilled to a temporary file because OpenCV and
              MoviePy open videos by file-system path,
            * a file object whose ``name`` is a readable path (the behaviour
              of the original implementation), or
            * a plain path (``str`` / ``os.PathLike``).
        seconds_per_frame: Approximate spacing, in seconds of video, between
            two sampled frames.

    Returns:
        A ``(base64Frames, audio_path)`` tuple: the sampled frames as
        base64-encoded JPEG strings, and the path of the MP3 file written
        from the video's audio track (video name with an ``.mp3`` extension).

    Raises:
        ValueError: If the video has no audio track to extract.
    """
    import tempfile  # local import keeps this block self-contained

    temp_video_path = None
    if isinstance(video_path, (str, os.PathLike)):
        source_path = os.fspath(video_path)
        base_video_path, _ = os.path.splitext(source_path)
    elif hasattr(video_path, "getvalue"):
        # In-memory upload: ``name`` is only the client-side file name, not a
        # file on disk. Use just its base name for the output so a crafted
        # name cannot point the MP3 at another directory.
        upload_name = os.path.basename(video_path.name)
        base_video_path, extension = os.path.splitext(upload_name)
        with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as temp_video:
            temp_video.write(video_path.getvalue())
        # The temp file is closed before reopening it by name so this also
        # works on platforms that forbid opening a file twice.
        temp_video_path = source_path = temp_video.name
    else:
        source_path = video_path.name
        base_video_path, _ = os.path.splitext(source_path)

    try:
        base64Frames = []
        video = cv2.VideoCapture(source_path)
        try:
            total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = video.get(cv2.CAP_PROP_FPS)
            # Always advance by at least one frame; a step of 0 (tiny
            # fps * seconds_per_frame) would never terminate the loop.
            frames_to_skip = max(1, int(fps * seconds_per_frame))
            curr_frame = 0
            while curr_frame < total_frames - 1:
                video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
                success, frame = video.read()
                if not success:
                    break
                _, buffer = cv2.imencode(".jpg", frame)
                base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
                curr_frame += frames_to_skip
        finally:
            video.release()

        audio_path = f"{base_video_path}.mp3"
        clip = VideoFileClip(source_path)
        try:
            if clip.audio is None:
                raise ValueError("The video has no audio track to extract.")
            clip.audio.write_audiofile(audio_path, bitrate="32k")
        finally:
            if clip.audio is not None:
                clip.audio.close()
            clip.close()
    finally:
        # Remove the spilled copy of the upload, even when decoding failed.
        if temp_video_path is not None:
            os.remove(temp_video_path)

    return base64Frames, audio_path

def main():
    """Draw the demo page and dispatch to the handler for the chosen mode.

    "Text" has its own input widget inside process_text(); the other modes
    first show a file uploader restricted to the listed extensions and pass
    whatever it returns to the matching handler.
    """
    st.title("Omni Demo")
    option = st.selectbox("Select an option", ("Text", "Image", "Audio", "Video"))

    if option == "Text":
        process_text()
        return

    # mode -> (uploader label, accepted extensions, handler)
    upload_modes = {
        "Image": ("Upload an image", ["jpg", "jpeg", "png"], process_image),
        "Audio": ("Upload an audio file", ["mp3", "wav"], process_audio),
        "Video": ("Upload a video file", ["mp4"], process_video),
    }
    if option in upload_modes:
        label, extensions, handler = upload_modes[option]
        uploaded = st.file_uploader(label, type=extensions)
        handler(uploaded)

# Build the page only when this file is executed as the main script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()