import base64

import gradio as gr
import moviepy.editor as mp
import onnxruntime
import torch
from transformers import pipeline

### --- Topic classifier --- ###
# The quantized ONNX model is assumed to accept raw text directly
# (tokenization is baked into the exported graph), so no separate
# tokenizer is loaded here.
inf_session = onnxruntime.InferenceSession('classifier1-quantized.onnx')
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name

classes = ['Art', 'Astrology', 'Biology', 'Chemistry', 'Economics', 'History', 'Literature', 'Philosophy', 'Physics', 'Politics', 'Psychology', 'Sociology']

### --- Audio/video to text --- ###
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# chunk_length_s lets Whisper transcribe audio longer than its 30 s window.
pipe = pipeline("automatic-speech-recognition",
                model="openai/whisper-tiny.en",
                chunk_length_s=30, device=device)

### --- Text summary --- ###
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)


def video_identity(video):
    """Transcribe an audio (or video) file with the Whisper pipeline."""
    return pipe(video)["text"]

def summary(text):
    """Split the transcript into ~500-word chunks and summarize each one."""
    max_chunk = 500  # maximum number of words per chunk
    chunks = []

    # Greedily pack sentences into chunks of at most max_chunk words.
    for sentence in text.split('.'):
        words = sentence.split(' ')
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    chunks = [' '.join(chunk) for chunk in chunks]
    return summarizer(chunks, max_length=100)
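
# The summarization pipeline returns one dict per chunk, shaped like
# (illustrative shape only, not real output):
#   [{'summary_text': '...'}, {'summary_text': '...'}]
# classify() below joins these per-chunk summaries into a single string.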

def classify(video_file, encoded_video):
    # The text input lets API callers send the video as a base64 string
    # instead of uploading a file.
    if encoded_video:
        with open("temp_video.mp4", "wb") as f:
            f.write(base64.b64decode(encoded_video))
        video_file = "temp_video.mp4"

    # Extract the audio track, transcribe it, then summarize the transcript.
    clip = mp.VideoFileClip(video_file)
    clip.audio.write_audiofile("audio.wav")

    full_text = video_identity("audio.wav")
    summary_text = ' '.join(s['summary_text'] for s in summary(full_text))

    # Classify the summary; the ONNX model emits one logit per class.
    logits = inf_session.run([output_name], {input_name: [summary_text]})[0]
    probs = torch.sigmoid(torch.FloatTensor(logits))[0]
    label = classes[int(probs.argmax())]

    return {
        'text': full_text,
        'summary': summary_text,
        'label': label,
    }

iface = gr.Interface(fn=classify,
                     inputs=['video', 'text'],
                     outputs=['json'])
iface.launch(inline=False)
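
# --- Example API call (illustrative sketch, commented out) ---
# How a client might call this app once it is running. The endpoint path
# and payload shape below follow older Gradio versions (POST /api/predict
# with {"data": [...]}) and may differ in yours; "lecture.mp4" and the
# local URL are placeholders. Passing None for the video slot and the
# base64 string for the text slot exercises the encoded_video branch
# of classify().
#
# import base64
# import requests
#
# with open("lecture.mp4", "rb") as f:
#     encoded = base64.b64encode(f.read()).decode("utf-8")
#
# resp = requests.post(
#     "http://127.0.0.1:7860/api/predict",
#     json={"data": [None, encoded]},  # [video upload, base64 text]
# )
# print(resp.json())  # contains {'text': ..., 'summary': ..., 'label': ...}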