umair894 committed
Commit f91ac88
1 Parent(s): c50b965

Update app.py

Files changed (1)
  1. app.py +107 -107
app.py CHANGED
@@ -557,131 +557,131 @@ second_of_silence = AudioSegment.silent() # use default
  second_of_silence.export("sil.wav", format='wav')


- # def generate_speech(history,chatbot_role):
- #     # Must set autoplay to True first
- #     yield (history, chatbot_role, "", wave_header_chunk() )
+ def generate_speech(history,chatbot_role):
+     # Must set autoplay to True first
+     yield (history, chatbot_role, "", wave_header_chunk() )

- #     first_sentence=True
- #     language="autodetect" # will predict from first sentence
+     first_sentence=True
+     language="autodetect" # will predict from first sentence

- #     for sentence, history in get_sentence(history,chatbot_role):
- #         if sentence != "":
- #             if first_sentence:
- #                 language = detect_language(sentence)
- #                 first_sentence=False
+     for sentence, history in get_sentence(history,chatbot_role):
+         if sentence != "":
+             if first_sentence:
+                 language = detect_language(sentence)
+                 first_sentence=False

- #             print("BG: inserting sentence to queue")
+             print("BG: inserting sentence to queue")

- #             generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True,language=language)
- #             if generated_speech is not None:
- #                 _, audio_dict = generated_speech
- #                 # We are using byte streaming
- #                 yield (history, chatbot_role, sentence, audio_dict["value"] )
+             generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True,language=language)
+             if generated_speech is not None:
+                 _, audio_dict = generated_speech
+                 # We are using byte streaming
+                 yield (history, chatbot_role, sentence, audio_dict["value"] )


- # # will generate speech audio file per sentence
- # def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True, language="autodetect"):
+ # will generate speech audio file per sentence
+ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True, language="autodetect"):

- #     wav_bytestream = b""
+     wav_bytestream = b""

- #     if len(sentence)==0:
- #         print("EMPTY SENTENCE")
- #         return
+     if len(sentence)==0:
+         print("EMPTY SENTENCE")
+         return

- #     # Sometimes the prompt </s> token comes through on output; remove it
- #     # Some post-processing for speech only
- #     sentence = sentence.replace("</s>", "")
- #     # remove code from speech
- #     sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
- #     sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
+     # Sometimes the prompt </s> token comes through on output; remove it
+     # Some post-processing for speech only
+     sentence = sentence.replace("</s>", "")
+     # remove code from speech
+     sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
+     sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)

- #     sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
+     sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)

- #     sentence = sentence.replace("```", "")
- #     sentence = sentence.replace("...", " ")
- #     sentence = sentence.replace("(", " ")
- #     sentence = sentence.replace(")", " ")
- #     sentence = sentence.replace("<|assistant|>","")
-
- #     if len(sentence)==0:
- #         print("EMPTY SENTENCE after processing")
- #         return
+     sentence = sentence.replace("```", "")
+     sentence = sentence.replace("...", " ")
+     sentence = sentence.replace("(", " ")
+     sentence = sentence.replace(")", " ")
+     sentence = sentence.replace("<|assistant|>","")
+
+     if len(sentence)==0:
+         print("EMPTY SENTENCE after processing")
+         return

- #     # A fast fix for the last character, may produce weird sounds if it is with text
- #     if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
- #         # just add a space
- #         sentence = sentence[:-1] + " " + sentence[-1]
- #     print("Sentence for speech:", sentence)
+     # A fast fix for the last character, may produce weird sounds if it is with text
+     if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
+         # just add a space
+         sentence = sentence[:-1] + " " + sentence[-1]
+     print("Sentence for speech:", sentence)


- #     try:
- #         SENTENCE_SPLIT_LENGTH=350
- #         if len(sentence)<SENTENCE_SPLIT_LENGTH:
- #             # no problem, continue on
- #             sentence_list = [sentence]
- #         else:
- #             # Until now nltk likely split sentences properly, but we need an additional
- #             # check for longer sentences and split at the last possible position
- #             # Do whatever necessary, first break at hyphens, then spaces, and then even split very long words
- #             sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
- #             print("SPLITTED LONG SENTENCE:",sentence_list)
+     try:
+         SENTENCE_SPLIT_LENGTH=350
+         if len(sentence)<SENTENCE_SPLIT_LENGTH:
+             # no problem, continue on
+             sentence_list = [sentence]
+         else:
+             # Until now nltk likely split sentences properly, but we need an additional
+             # check for longer sentences and split at the last possible position
+             # Do whatever necessary, first break at hyphens, then spaces, and then even split very long words
+             sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
+             print("SPLITTED LONG SENTENCE:",sentence_list)

- #         for sentence in sentence_list:
+         for sentence in sentence_list:

- #             if any(c.isalnum() for c in sentence):
- #                 if language=="autodetect":
- #                     # on first call autodetect, next sentence calls will use the same language
- #                     language = detect_language(sentence)
+             if any(c.isalnum() for c in sentence):
+                 if language=="autodetect":
+                     # on first call autodetect, next sentence calls will use the same language
+                     language = detect_language(sentence)

- #                 # exists at least 1 alphanumeric (utf-8)
- #                 audio_stream = get_voice_streaming(
- #                     sentence, language, latent_map[chatbot_role]
- #                 )
- #             else:
- #                 # likely got a ' or " or some other text without alphanumeric in it
- #                 audio_stream = None
+                 # exists at least 1 alphanumeric (utf-8)
+                 audio_stream = get_voice_streaming(
+                     sentence, language, latent_map[chatbot_role]
+                 )
+             else:
+                 # likely got a ' or " or some other text without alphanumeric in it
+                 audio_stream = None

- #             # XTTS is actually using a streaming response but we are playing audio by sentence
- #             # If you want direct XTTS voice streaming (send each chunk to voice) you may set the DIRECT_STREAM=1 environment variable
- #             if audio_stream is not None:
- #                 wav_chunks = wave_header_chunk()
- #                 frame_length = 0
- #                 for chunk in audio_stream:
- #                     try:
- #                         wav_bytestream += chunk
- #                         wav_chunks += chunk
- #                         frame_length += len(chunk)
- #                     except:
- #                         # hack to continue on playing. sometimes last chunk is empty, will be fixed on next TTS
- #                         continue
-
- #         if audio_stream is not None:
- #             if not return_as_byte:
- #                 audio_unique_filename = "/tmp/"+ str(uuid.uuid4())+".wav"
- #                 with open(audio_unique_filename, "wb") as f:
- #                     f.write(wav_chunks)
- #                 # Will write filename to context variable
- #                 return (history , gr.Audio.update(value=audio_unique_filename, autoplay=True))
- #             else:
- #                 return (history , gr.Audio.update(value=wav_bytestream, autoplay=True))
- #     except RuntimeError as e:
- #         if "device-side assert" in str(e):
- #             # cannot do anything on a cuda device-side error, need to restart
- #             print(
- #                 f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
- #                 flush=True,
- #             )
- #             gr.Warning("Unhandled Exception encounter, please retry in a minute")
- #             print("Cuda device-assert Runtime encountered need restart")
-
- #             # HF Space specific: this error is unrecoverable, need to restart the space
- #             api.restart_space(repo_id=repo_id)
- #         else:
- #             print("RuntimeError: non device-side assert error:", str(e))
- #             raise e
-
- #     print("All speech ended")
- #     return
+             # XTTS is actually using a streaming response but we are playing audio by sentence
+             # If you want direct XTTS voice streaming (send each chunk to voice) you may set the DIRECT_STREAM=1 environment variable
+             if audio_stream is not None:
+                 wav_chunks = wave_header_chunk()
+                 frame_length = 0
+                 for chunk in audio_stream:
+                     try:
+                         wav_bytestream += chunk
+                         wav_chunks += chunk
+                         frame_length += len(chunk)
+                     except:
+                         # hack to continue on playing. sometimes last chunk is empty, will be fixed on next TTS
+                         continue
+
+         if audio_stream is not None:
+             if not return_as_byte:
+                 audio_unique_filename = "/tmp/"+ str(uuid.uuid4())+".wav"
+                 with open(audio_unique_filename, "wb") as f:
+                     f.write(wav_chunks)
+                 # Will write filename to context variable
+                 return (history , gr.Audio.update(value=audio_unique_filename, autoplay=True))
+             else:
+                 return (history , gr.Audio.update(value=wav_bytestream, autoplay=True))
+     except RuntimeError as e:
+         if "device-side assert" in str(e):
+             # cannot do anything on a cuda device-side error, need to restart
+             print(
+                 f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
+                 flush=True,
+             )
+             gr.Warning("Unhandled Exception encounter, please retry in a minute")
+             print("Cuda device-assert Runtime encountered need restart")
+
+             # HF Space specific: this error is unrecoverable, need to restart the space
+             api.restart_space(repo_id=repo_id)
+         else:
+             print("RuntimeError: non device-side assert error:", str(e))
+             raise e
+
+     print("All speech ended")
+     return


  latent_map = {}
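
For context, a minimal wiring sketch (an illustration only, not part of this commit) of how the re-enabled generate_speech generator could be hooked into a Gradio Blocks UI. The component names and layout below are hypothetical; the actual interface is defined elsewhere in app.py.

import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()                               # chat history
    chatbot_role = gr.Dropdown(["AI Assistant"], value="AI Assistant")
    sentence_box = gr.Textbox(label="Sentence being spoken")
    audio_out = gr.Audio(streaming=True, autoplay=True)  # receives the yielded WAV byte chunks
    speak_btn = gr.Button("Speak last reply")

    # generate_speech is a generator: each yield streams one sentence of
    # synthesized audio to audio_out while keeping the chat history in sync.
    speak_btn.click(
        generate_speech,
        inputs=[chatbot, chatbot_role],
        outputs=[chatbot, chatbot_role, sentence_box, audio_out],
    )

demo.queue().launch()  # queue() is needed for generator-based handlers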