import re
import xml.etree.ElementTree as ET
from xml.sax import saxutils
#import nltk

# XML namespace of the SSML elements produced by build_ssml and consumed by
# create_clips_from_ssml.
_SSML_NAMESPACE = "http://www.w3.org/2001/10/synthesis"


# Chunked generation originally from https://github.com/serp-ai/bark-with-voice-clone
def split_and_recombine_text(text: str, desired_length: int = 100, max_length: int = 150) -> list:
    """Split text into chunks of a desired length, trying to keep sentences intact.

    Args:
        text: The text to split.
        desired_length: Once the current chunk is at least this long, it is
            emitted at the next sentence boundary.
        max_length: Once the current chunk reaches this length a split is
            forced, at the last recorded sentence boundary if there is one,
            otherwise by backing up towards punctuation or a space.

    Returns:
        The chunks in order, stripped of surrounding whitespace. Chunks that
        are empty or consist only of whitespace/punctuation are dropped.
    """
    # Alternative that was considered: return nltk.sent_tokenize(text)
    # from https://github.com/neonbjb/tortoise-tts

    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
    text = re.sub(r"\n\n+", "\n", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[“”]", '"', text)

    rv = []
    in_quote = False
    current = ""
    split_pos = []
    pos = -1
    end_pos = len(text) - 1

    def seek(delta):
        """Move the cursor by delta characters, keeping `current` in sync.

        Returns the character at the new cursor position.
        """
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            # The quote state is toggled on the character the cursor lands on,
            # in both directions.
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        """Return the character delta positions from the cursor, or "".

        NOTE: the bound is `p < end_pos`, so the final character of the text
        is never returned. Because "" is a member of every string, callers'
        `peek(...) in "..."` tests succeed whenever this returns "".
        """
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        """Emit the current chunk and reset the per-chunk state."""
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the desired length,
                # seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle of a word
                # and split there
                while c not in "!?.,\n " and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in "!?]\n" or (c == "." and peek(1) in "\n ")):
            # seek forward if we have consecutive boundary markers but still within the max length
            while (
                pos < len(text) - 1
                and len(current) < max_length
                and peek(1) in "!?.]"
            ):
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat end of quote as a boundary if its followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in "\n ":
            seek(2)
            split_pos.append(pos)
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r"^[\s\.,;:!?]*$", s)]

    return rv


def is_ssml(value) -> bool:
    """Return True if value parses as well-formed XML, False on a parse error.

    Only well-formedness is checked; the document is not validated as SSML.
    """
    try:
        ET.fromstring(value)
    except ET.ParseError:
        return False
    return True


def build_ssml(rawtext: str, selected_voice) -> str:
    """Build an SSML document with one <voice> element per non-blank line.

    Args:
        rawtext: Plain text; it is split on newlines, each line is stripped and
            blank lines are skipped.
        selected_voice: Voice name written to the ``name`` attribute of every
            <voice> element.

    Returns:
        An SSML string whose <speak> root lives in the SSML synthesis
        namespace, so it can be passed to create_clips_from_ssml.
    """
    # quoteattr escapes the value and adds the surrounding quotes, so a voice
    # name containing quotes or '&' cannot break the markup.
    voice_attr = saxutils.quoteattr(str(selected_voice))
    stripped_lines = (line.strip() for line in rawtext.split("\n"))
    voice_elements = [
        f"<voice name={voice_attr}>{saxutils.escape(line)}</voice>"
        for line in stripped_lines
        if line
    ]
    joinedparts = "\n".join(voice_elements)
    # The XML declaration must be the very first thing in the document.
    return (
        '<?xml version="1.0"?>\n'
        f'<speak version="1.0" xmlns="{_SSML_NAMESPACE}">\n'
        f"{joinedparts}\n"
        "</speak>\n"
    )


def create_clips_from_ssml(ssmlinput: str) -> list:
    """Turn an SSML document into a list of (voice_name, text_chunk) tuples.

    Every <voice> element in the SSML synthesis namespace is visited in
    document order. Its leading text is split with split_and_recombine_text
    and each resulting chunk longer than one character is paired with the
    element's ``name`` attribute.

    Raises:
        xml.etree.ElementTree.ParseError: If ssmlinput is not well-formed XML.
        KeyError: If a <voice> element has no ``name`` attribute.
    """
    root = ET.fromstring(ssmlinput)
    voice_list = []

    for voice in root.iter(f"{{{_SSML_NAMESPACE}}}voice"):
        voice_name = voice.attrib["name"]
        # Only the text before the first child element is read.
        voice_content = voice.text.strip() if voice.text else ""
        if not voice_content:
            continue
        voice_list.extend(
            (voice_name, part)
            for part in split_and_recombine_text(voice_content)
            if len(part) > 1
        )

    return voice_list