ysharma HF staff committed on
Commit
527d5e6
1 Parent(s): a27c450
Files changed (1) hide show
  1. app.py +115 -1
app.py CHANGED
@@ -1 +1,115 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ #final
4
+ import gradio as gr
5
+ #import json
6
+ #from difflib import Differ
7
+ import ffmpeg
8
+ #import os
9
+ from pathlib import Path
10
+ #import time
11
+
12
import os

# Hosted wav2vec2 speech-recognition model on the Hugging Face Inference API.
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"

# The access token is read from the HF_TOKEN environment variable instead of
# being committed with the code. Any token that has already appeared in
# source control must be treated as leaked and revoked.
_hf_token = os.environ.get("HF_TOKEN", "").strip()

# When no token is configured the Authorization header is omitted entirely
# rather than sending an empty bearer value.
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}
14
+
15
+ #convert video to audio
16
# Source clip whose speech will be transcribed.
video_path = Path("/content/gdrive/My Drive/AI/videoedit/ShiaLaBeouf.mp4")

# Transcode the clip to WAV (ac=1, ar='16k') on ffmpeg's stdout so the audio
# is captured in memory; the second element returned by run() is discarded.
audio_memory, _ = (
    ffmpeg.input(video_path)
    .output('-', format="wav", ac=1, ar='16k')
    .overwrite_output()
    .global_args('-loglevel', 'quiet')
    .run(capture_stdout=True)
)
18
+
19
+ #calling the hosted model
20
def query_api(audio_bytes: bytes):
    """
    Query the Hugging Face Inference API for the Automatic Speech Recognition task.

    The audio is base64-encoded and POSTed as a JSON document to ``API_URL``
    using the module-level ``headers``. The request asks for character-level
    timestamps and chunked processing (10 s chunks with [4, 2] s strides).

    Args:
        audio_bytes: Raw audio content to transcribe.

    Returns:
        The decoded JSON body of the API response.
    """
    # This function is the only user of these modules and the file does not
    # import them at module level (the json import is commented out), so they
    # are imported here to avoid a NameError on the first call.
    import base64
    import json

    import requests  # third-party HTTP client this function already relied on

    payload = json.dumps({
        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
        "parameters": {
            "return_timestamps": "char",
            "chunk_length_s": 10,
            "stride_length_s": [4, 2],
        },
        "options": {"use_gpu": False},
    }).encode("utf-8")

    response = requests.request(
        "POST", API_URL, headers=headers, data=payload)
    json_response = json.loads(response.content.decode("utf-8"))
    return json_response
38
+
39
+ #Getting transcripts using wav2Vec2 huggingface hosted accelerated inference
40
+ #sending audio file in request along with stride and chunk length information
41
# Send the in-memory audio to the hosted model.
model_response = query_api(audio_memory)

# A successful response carries both the transcript ("text") and the
# per-character entries ("chunks"). Anything else (for example an error
# payload) is reported with its content instead of failing later with an
# opaque KeyError.
if "text" not in model_response or "chunks" not in model_response:
    raise RuntimeError(
        f"Speech recognition request failed; API returned: {model_response!r}"
    )

transcription = model_response["text"].lower()
chnk = model_response["chunks"]

# Flatten each chunk into [text, start, end] so downstream code can index it.
timestamps = [
    [chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
    for chunk in chnk
]
50
+
51
+
52
+ #getting word timestamps from character timestamps
53
def get_word_timestamps(timestamps):
    """
    Group character-level timestamps into word-level timestamps.

    Args:
        timestamps: Sequence of ``[char, start, end]`` entries in transcript
            order, where a single space ``' '`` separates words.

    Returns:
        A ``(words, words_timestamp)`` tuple: ``words`` is the list of words
        and ``words_timestamp`` is a parallel list of ``[start, end]`` pairs,
        taken from the start of each word's first character and the end of
        its last character.

    Unlike a purely separator-driven split, the last word is emitted even
    when the transcript does not end with a space, and leading or repeated
    spaces do not produce empty words.
    """
    words, words_timestamp = [], []
    letters = []
    word_start = word_end = None

    for entry in timestamps:
        char, start, end = entry[:3]
        if char == ' ':
            # A separator closes the current word, if one is in progress.
            if letters:
                words.append(''.join(letters))
                words_timestamp.append([word_start, word_end])
                letters = []
            continue
        if not letters:
            word_start = start
        letters.append(char)
        word_end = end

    # Flush the trailing word when the input does not end with a space.
    if letters:
        words.append(''.join(letters))
        words_timestamp.append([word_start, word_end])

    return words, words_timestamp
68
+
69
# Collapse the per-character entries into words and their [start, end] pairs.
words, words_timestamp = get_word_timestamps(timestamps)

# Report what was extracted from the transcript.
print(f"Total words in the audio transcript is:{len(words)}, transcript word list is :{words}")
print(f"Total Word timestamps derived fromcharacter timestamp are :{len(words_timestamp)}, Word timestamps are :{words_timestamp}")

# Phrase to locate in the transcript, plus its whitespace-split word list.
gif = "don't let your dreams be dreams"
giflist = gif.split()
78
+
79
+ #getting index of gif words in main transcript
80
def get_gif_word_indexes(total_words_list, gif_words_list):
    """
    Yield the index run of every occurrence of a phrase in a word list.

    Args:
        total_words_list: All transcript words, in order.
        gif_words_list: The words of the phrase to look for.

    Yields:
        For each position where ``gif_words_list`` appears as a contiguous
        slice of ``total_words_list``, a tuple of the consecutive indexes it
        occupies. Nothing is yielded when ``gif_words_list`` is empty.
    """
    if gif_words_list:
        span = len(gif_words_list)
        head = gif_words_list[0]
        for start, word in enumerate(total_words_list):
            # Cheap first-word check before comparing the whole slice.
            if word != head:
                continue
            stop = start + span
            if total_words_list[start:stop] == gif_words_list:
                yield tuple(range(start, stop))
90
+
91
+ #getting gif indexes from the generator
92
# Only the first occurrence of the phrase is used, so pull a single match from
# the generator instead of materialising every match. When the phrase is not
# in the transcript, fail with an explicit message rather than an IndexError.
_first_match = next(get_gif_word_indexes(words, giflist), None)
if _first_match is None:
    raise ValueError(f"Phrase {gif!r} was not found in the transcript")
giflist_indxs = list(_first_match)
93
+
94
+ #getting start and end timestamps for gif transcript
95
def get_gif_timestamps(giflist_indxs):
    """
    Return the start and end time covered by a run of word indexes.

    Args:
        giflist_indxs: Indexes into the module-level ``words_timestamp`` list.

    Returns:
        ``(start_seconds, end_seconds)``: the first value of the entry at the
        smallest index and the last value of the entry at the largest index.
    """
    first_idx, last_idx = min(giflist_indxs), max(giflist_indxs)
    matched = words_timestamp[first_idx:last_idx + 1]
    return matched[0][0], matched[-1][-1]
103
+
104
+ #getting start and end timestamps for a gif video
105
+ start_seconds, end_seconds = get_gif_timestamps(giflist_indxs)
106
+
107
+ #extracting the video and building and serving a .gif image
108
def generate_gif(start_seconds, end_seconds, source_path=None,
                 output_path="/content/gdrive/My Drive/AI/videoedit/gif1.gif"):
    """
    Cut the [start_seconds, end_seconds] span out of a video and save it as a GIF.

    The previous body called ``video.subclip(...)``, but no ``video`` object is
    defined or imported anywhere in this script, so every call failed with a
    NameError. The clip is now produced with the ``ffmpeg`` module the script
    already imports for audio extraction.

    Args:
        start_seconds: Offset in the source video where the clip begins.
        end_seconds: Offset in the source video where the clip ends.
        source_path: Video to cut from; defaults to the module-level
            ``video_path``.
        output_path: Destination ``.gif`` file; defaults to the location the
            script has always written to.

    Raises:
        ValueError: If ``end_seconds`` is not greater than ``start_seconds``.
    """
    if end_seconds <= start_seconds:
        raise ValueError(
            f"end_seconds ({end_seconds}) must be greater than start_seconds ({start_seconds})"
        )
    if source_path is None:
        source_path = video_path

    # Seek on the input side and bound the read by duration, then let ffmpeg
    # pick the GIF encoder from the output file extension.
    (
        ffmpeg.input(str(source_path), ss=start_seconds, t=end_seconds - start_seconds)
        .output(str(output_path))
        .overwrite_output()
        .global_args('-loglevel', 'quiet')
        .run()
    )
    return
114
+
115
+ generate_gif(start_seconds, end_seconds)