abrar-adnan committed on
Commit
197af76
1 Parent(s): 90452ba

added audio transcription

Browse files
Files changed (2) hide show
  1. app.py +31 -15
  2. requirements.txt +0 -0
app.py CHANGED
@@ -6,8 +6,9 @@ from fastai.vision.all import load_learner
6
  import time
7
  import base64
8
  from deepface import DeepFace
9
- import torchaudio, torch
10
- import subprocess
 
11
 
12
  # import pathlib
13
  # temp = pathlib.PosixPath
@@ -22,6 +23,32 @@ backends = [
22
  'mediapipe'
23
  ]
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  model = load_learner("gaze-recognizer-v3.pkl")
26
 
27
  def video_processing(video_file, encoded_video):
@@ -45,19 +72,8 @@ def video_processing(video_file, encoded_video):
45
 
46
  start_time = time.time()
47
 
48
- subprocess.call(["ffmpeg", "-i", 'temp_video.mp4', "audio.wav"])
49
- waveform, sample_rate = torchaudio.load("audio.wav")
50
- waveform, sample_rate
51
-
52
- waveform, sample_rate = torchaudio.load("audio.wav")
53
- resampler = torchaudio.transforms.Resample(sample_rate, 16000)
54
- waveform = resampler(waveform)[0]
55
-
56
- input_features = processor(waveform.squeeze(dim=0), return_tensors="pt").input_features
57
- predicted_ids = model.generate(input_features)
58
-
59
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
60
- print(transcription[0])
61
 
62
  video_capture = cv2.VideoCapture(video_file)
63
  on_camera = 0
 
6
  import time
7
  import base64
8
  from deepface import DeepFace
9
+ import torchaudio
10
+ import moviepy.editor as mp
11
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
12
 
13
  # import pathlib
14
  # temp = pathlib.PosixPath
 
23
  'mediapipe'
24
  ]
25
 
26
# Loaded Whisper (processor, model) pairs keyed by checkpoint name, so the
# weights are read once per process instead of on every transcription call.
_WHISPER_CACHE = {}


def _load_whisper(model_name):
    """Return the cached ``(processor, model)`` pair for ``model_name``."""
    if model_name not in _WHISPER_CACHE:
        processor = WhisperProcessor.from_pretrained(model_name)
        whisper = WhisperForConditionalGeneration.from_pretrained(model_name)
        # Kept from the original implementation: clear any forced decoder
        # prompt ids stored in the checkpoint config before generating.
        whisper.config.forced_decoder_ids = None
        _WHISPER_CACHE[model_name] = (processor, whisper)
    return _WHISPER_CACHE[model_name]


def getTranscription(path, audio_path="audio.wav", model_name="openai/whisper-tiny"):
    """Extract the audio track of a video file and transcribe it with Whisper.

    Args:
        path: Path to the local video file.
        audio_path: Where the extracted audio is written (WAV). Defaults to
            ``"audio.wav"`` in the working directory, as before.
        model_name: Whisper checkpoint passed to ``from_pretrained``.

    Returns:
        The decoded transcription of the first (only) batch item, or an
        empty string when the video has no audio track.
    """
    target_rate = 16000  # rate the audio is resampled to before feature extraction

    clip = mp.VideoFileClip(path)
    try:
        if clip.audio is None:
            # No audio stream: nothing to transcribe (previously an AttributeError).
            return ""
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the reader resources held by moviepy for this clip.
        clip.close()

    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
        waveform = resampler(waveform)

    # Use the first channel only, matching the original behaviour.
    mono = waveform[0]

    processor, whisper = _load_whisper(model_name)
    input_features = processor(
        mono.numpy(),
        sampling_rate=target_rate,  # lets the feature extractor validate the rate
        return_tensors="pt",
    ).input_features
    predicted_ids = whisper.generate(input_features)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
50
+
51
+
52
  model = load_learner("gaze-recognizer-v3.pkl")
53
 
54
  def video_processing(video_file, encoded_video):
 
72
 
73
  start_time = time.time()
74
 
75
+ transcription = getTranscription(video_file)
76
+ print(transcription)
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  video_capture = cv2.VideoCapture(video_file)
79
  on_camera = 0
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ