dipesh1701 committed on
Commit
635cd84
1 Parent(s): 3caf94a
Files changed (1)
  1. app.py +14 -10
app.py CHANGED
@@ -23,15 +23,19 @@ secret_token = ""
 
 # model = whisper.load_model("base")
 
-model_id = "stabilityai/stable-diffusion-2"
-scheduler = EulerDiscreteScheduler.from_pretrained(model_id,
-                                                   subfolder="scheduler")
+from diffusers import DiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained(model_id,
-                                               scheduler=scheduler,
-                                               revision="fp16",
-                                               torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
+pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+
+# model_id = "stabilityai/stable-diffusion-2"
+# scheduler = EulerDiscreteScheduler.from_pretrained(model_id,
+#                                                    subfolder="scheduler")
+
+# pipe = StableDiffusionPipeline.from_pretrained(model_id,
+#                                                scheduler=scheduler,
+#                                                revision="fp16",
+#                                                torch_dtype=torch.float16)
+# pipe = pipe.to("cuda")
 
 def transcribe(audio):
 
@@ -40,14 +44,14 @@ def transcribe(audio):
     audio = whisper.pad_or_trim(audio)
 
     # make log-Mel spectrogram and move to the same device as the model
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
     # detect the spoken language
     _, probs = model.detect_language(mel)
 
     # decode the audio
     options = whisper.DecodingOptions()
-    result = whisper.decode(model, mel, options)
+    result = model.decode(mel, options)
     result_text = result.text
 
     # Pass the generated text to Audio
@@ -55,7 +59,7 @@ def transcribe(audio):
     resp = chatgpt_api.send_message(result_text)
     out_result = resp['message']
 
-    out_image = pipe(out_result, height=768, width=768).images[0]
+    out_image = pipeline(out_result, height=768, width=768).images[0]
 
     return [result_text, out_result, out_image]
 
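
Note that the new DiffusionPipeline call loads stabilityai/stable-diffusion-2-1 in full precision on the default device, while the commented-out setup ran in fp16 on CUDA. If the Space has a GPU, the same behaviour can be kept with the current diffusers API; a minimal sketch, assuming a CUDA device is available:

import torch
from diffusers import DiffusionPipeline

# Half precision plus GPU placement, mirroring what the commented-out
# StableDiffusionPipeline setup did (assumes CUDA is available).
pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16,
)
pipeline = pipeline.to("cuda")

The height=768, width=768 arguments passed in transcribe match stable-diffusion-2-1's native training resolution.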
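
For reference, the transcription path mixes module-level openai-whisper helpers (log_mel_spectrogram, DecodingOptions, which are not attributes of the model object) with methods on the loaded model. A self-contained sketch of that flow, assuming the commented-out whisper.load_model("base") is restored and using a hypothetical input file name:

import whisper

model = whisper.load_model("base")  # assumption: the commented-out load is restored

audio = whisper.load_audio("speech.wav")  # hypothetical example file
audio = whisper.pad_or_trim(audio)

# log_mel_spectrogram and DecodingOptions live on the whisper module
mel = whisper.log_mel_spectrogram(audio).to(model.device)
_, probs = model.detect_language(mel)

options = whisper.DecodingOptions()
result = model.decode(mel, options)  # equivalent to whisper.decode(model, mel, options)
print(result.text)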