File size: 5,784 Bytes
ad76407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import cv2
import streamlit as st
import tempfile
import base64
import os
from dotenv import load_dotenv
from openai import OpenAI
import assemblyai as aai
from moviepy.editor import *




# Load environment variables and configure API clients.
load_dotenv()
aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
# Pass the key explicitly at construction: assigning `OpenAI.api_key` as a
# class attribute has no effect in the v1 OpenAI SDK — the client reads the
# key when instantiated (falling back to the OPENAI_API_KEY env var).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
def main():
    """Streamlit entry point: upload a video, preview sampled frames, and run
    transcription / GPT-4 Vision analyses on demand via action buttons."""
    st.title('Insightly Video Content Moderation')

    # Video upload section
    uploaded_video = st.file_uploader('Upload a video', type=["mp4", "avi", "mov"])

    if uploaded_video is None:
        # Nothing uploaded yet: stop before any code references frame data.
        # (Previously the buttons below ran unconditionally and raised
        # NameError on `base64_frames`/`transcript` when no video was loaded.)
        return

    # Persist the upload to a temp file so cv2/AssemblyAI can read it by path.
    tfile = tempfile.NamedTemporaryFile(delete=False)
    tfile.write(uploaded_video.read())
    video_file_path = tfile.name
    tfile.close()

    # Extract all frames once, then display every 30th in a 3-column grid.
    base64_frames = video_to_base64_frames(video_file_path)
    display_frame_grid(base64_frames[::30])

    st.write("Actions:")  # Header for the actions/buttons section

    # Four columns align the action buttons horizontally.
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        if st.button("Description") and 'description' not in st.session_state:
            st.session_state['description'] = generate_description(base64_frames)

    with col2:
        if st.button("Frame Description") and 'frame_description' not in st.session_state:
            st.session_state['frame_description'] = generate_frame_description(base64_frames)

    with col3:
        if st.button("Generate Transcript") and 'transcript' not in st.session_state:
            # Transcribe lazily — only when requested — instead of on every
            # Streamlit rerun, which repeated a slow remote call needlessly.
            transcript = aai.Transcriber().transcribe(video_file_path)
            st.session_state['transcript'] = transcript.text

    with col4:
        if st.button("Category of Video") and 'category' not in st.session_state:
            st.session_state['category'] = generate_category(base64_frames)

    # Display whichever results have been generated so far (results persist
    # across reruns via st.session_state).
    if st.session_state.get('description'):
        st.subheader("Video Description")
        st.write(st.session_state['description'])

    if st.session_state.get('frame_description'):
        st.subheader("Frame Description")
        st.write(st.session_state['frame_description'])

    if st.session_state.get('transcript'):
        st.subheader("Video Transcript")
        st.write(st.session_state['transcript'])

    if st.session_state.get('category'):
        st.subheader("Video Category")
        st.write(st.session_state['category'])

def video_to_base64_frames(video_file_path):
    """Decode every frame of the video at *video_file_path* and return them
    as a list of base64-encoded JPEG strings."""
    capture = cv2.VideoCapture(video_file_path)
    encoded_frames = []

    while capture.isOpened():
        ok, frame = capture.read()
        if not ok:
            # End of stream (or read failure): stop collecting.
            break
        # Encode the raw frame as JPEG bytes, then as base64 text.
        _, jpeg_buffer = cv2.imencode('.jpg', frame)
        encoded_frames.append(base64.b64encode(jpeg_buffer).decode('utf-8'))

    capture.release()
    return encoded_frames

#########################################
#Generate Video description
def generate_description(base64_frames):
    """Ask GPT-4 Vision for an overall description of the sampled frames,
    covering detected objects and any restrictive/sensitive content."""
    # Sample every 30th frame and wrap each as a resized image payload.
    frame_payloads = [{"image": frame, "resize": 428} for frame in base64_frames[0::30]]
    messages = [
        {
            "role": "user",
            "content": [
                "1. Generate a description for this sequence of video frames in about 90 words.\
                Return the following : 1. List of objects in the video 2. Any restrictive content or sensitive content and if so which frame.",
                *frame_payloads,
            ],
        },
    ]
    completion = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=messages,
        max_tokens=3000,
    )
    return completion.choices[0].message.content

#Generate frame description
def generate_frame_description(base64_frames):
    """Ask GPT-4 Vision to describe what happens in each sampled frame."""
    # Sample every 30th frame and wrap each as a resized image payload.
    frame_payloads = [{"image": frame, "resize": 428} for frame in base64_frames[0::30]]
    messages = [
        {
            "role": "user",
            "content": [
                "Describe what is happening in each frame.",
                *frame_payloads,
            ],
        },
    ]
    completion = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=messages,
        max_tokens=3000,
    )
    return completion.choices[0].message.content



#Generate Category of Video
def generate_category(base64_frames):
    prompt_messages = [
        {
            "role": "user",
            "content": [
                "What category can this video be tagged to?",
                *map(lambda x: {"image": x, "resize": 428}, base64_frames[0::30]),
            ],
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=prompt_messages,
        max_tokens=3000,
    )
    return response.choices[0].message.content




######################## 
def display_frame_grid(base64_frames, frame_step=30):
    """Render base64-encoded JPEG frames in a 3-column Streamlit grid.

    Args:
        base64_frames: frames already subsampled by the caller.
        frame_step: the sampling stride the caller used, so captions can map
            each grid cell back to its original frame number. Previously this
            was hard-coded to 30, which produced wrong captions for any other
            stride; the default keeps existing callers' behavior unchanged.
    """
    cols_per_row = 3
    n_frames = len(base64_frames)
    for idx in range(0, n_frames, cols_per_row):
        cols = st.columns(cols_per_row)
        for col_index in range(cols_per_row):
            frame_idx = idx + col_index
            if frame_idx < n_frames:
                with cols[col_index]:
                    frame = base64_frames[frame_idx]
                    # Caption reports the 1-based frame number in the source video.
                    st.image(base64.b64decode(frame),
                             caption=f'Frame {frame_idx * frame_step + 1}',
                             width=200)
 
# Run the Streamlit app when executed as a script.
if __name__ == '__main__':
    main()