Spaces:
Paused
Paused
File size: 7,674 Bytes
2da7ed3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
# base seamless imports
# ---------------------------------
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy as np
import soundfile
import torchaudio
import torch
from pydub import AudioSegment
# ---------------------------------
# seamless-streaming specific imports
# ---------------------------------
import math
from simuleval.data.segments import SpeechSegment, EmptySegment
from seamless_communication.streaming.agents.seamless_streaming_s2st import (
SeamlessStreamingS2STVADAgent,
)
from simuleval.utils.arguments import cli_argument_list
from simuleval import options
from typing import Union, List
from simuleval.data.segments import Segment, TextSegment
from simuleval.agents.pipeline import TreeAgentPipeline
from simuleval.agents.states import AgentStates
# ---------------------------------
# seamless setup
# source: https://colab.research.google.com/github/kauterry/seamless_communication/blob/main/Seamless_Tutorial.ipynb?
# Sample rate (samples per second) the demo works at: AudioFrontEnd asserts that
# its input file matches it, and run_streaming_inference uses it to convert input
# sample counts into milliseconds of delay.
SAMPLE_RATE = 16000
# PM - This class is used to simulate the audio frontend in the seamless streaming pipeline.
# It needs to be replaced with the actual audio frontend.
# TODO: replacement class that takes in PCM-16 bytes and returns SpeechSegment
class AudioFrontEnd:
    """Simulated audio front end that replays audio in fixed-size chunks.

    The whole file is loaded into ``self.samples`` up front and handed out
    ``segment_size`` milliseconds at a time by :meth:`send_segment`.
    ``self.step`` is a read cursor into ``self.samples``; the buffer itself is
    never trimmed while it is being read, so the cursor always refers to the
    same sample it pointed at before.
    """

    def __init__(self, wav_file, segment_size) -> None:
        """Load ``wav_file`` and rewind the read cursor.

        Args:
            wav_file: Anything ``soundfile.read`` accepts as its first argument.
            segment_size: Duration of each emitted chunk, in milliseconds.

        Raises:
            AssertionError: If the file's sample rate differs from ``SAMPLE_RATE``.
        """
        self.samples, self.sample_rate = soundfile.read(wav_file)
        print(self.sample_rate, "sample rate")
        assert self.sample_rate == SAMPLE_RATE
        self.segment_size = segment_size
        # Index in ``self.samples`` of the next sample that has not been sent yet.
        self.step = 0

    def send_segment(self):
        """Return the next chunk of audio.

        This is the front-end logic in simuleval instance.py.

        Returns:
            A ``SpeechSegment`` holding up to ``segment_size`` milliseconds of
            samples, with ``finished=True`` when the chunk reaches the end of
            the buffered audio. If nothing is left to read, an
            ``EmptySegment(finished=True)`` is returned and the front end is
            reset (cursor at 0, empty buffer).
        """
        num_samples = math.ceil(self.segment_size / 1000 * self.sample_rate)
        total = len(self.samples)

        if self.step >= total:
            # Finished reading this audio: reset so new audio can be added.
            self.step = 0
            self.samples = []
            return EmptySegment(
                finished=True,
            )

        # Only the cursor advances. Trimming ``self.samples`` as well would
        # shift the data under the cursor and make later calls skip audio.
        end = min(self.step + num_samples, total)
        chunk = self.samples[self.step : end]
        self.step = end
        return SpeechSegment(
            content=chunk,
            sample_rate=self.sample_rate,
            finished=end >= total,
        )

    def add_segments(self, wav):
        """Append the audio read from ``wav`` to the end of the buffer.

        Args:
            wav: Anything ``soundfile.read`` accepts as its first argument.

        NOTE(review): the sample rate of the new audio is read but not checked
        against ``self.sample_rate`` -- confirm callers only pass matching audio.
        """
        new_samples, _ = soundfile.read(wav)
        self.samples = np.concatenate((self.samples, new_samples))
class OutputSegments:
    """Uniform view over one segment or a list of segments."""

    def __init__(self, segments: Union[List[Segment], Segment]):
        """Store ``segments`` as a new list, wrapping a lone ``Segment`` first."""
        as_sequence = [segments] if isinstance(segments, Segment) else segments
        self.segments: List[Segment] = list(as_sequence)

    @property
    def is_empty(self):
        """True when every stored segment reports ``is_empty``."""
        for seg in self.segments:
            if not seg.is_empty:
                return False
        return True

    @property
    def finished(self):
        """True when every stored segment reports ``finished``."""
        for seg in self.segments:
            if not seg.finished:
                return False
        return True
def get_audiosegment(samples, sr):
    """Round-trip ``samples`` through an in-memory WAV into an ``AudioSegment``.

    Args:
        samples: Audio data passed to ``soundfile.write``.
        sr: Sample rate written into the WAV data.

    Returns:
        The result of ``AudioSegment.from_file`` on the in-memory WAV data.
    """
    wav_buffer = io.BytesIO()
    soundfile.write(wav_buffer, samples, samplerate=sr, format="wav")
    # Rewind so the reader starts from the beginning of what was just written.
    wav_buffer.seek(0)
    return AudioSegment.from_file(wav_buffer)
def reset_states(system, states):
    """Call ``reset()`` on every state object belonging to ``system``.

    Args:
        system: The agent or pipeline the states belong to.
        states: A mapping of states when ``system`` is a ``TreeAgentPipeline``
            (its values are reset), otherwise an iterable of states.
    """
    per_agent = states.values() if isinstance(system, TreeAgentPipeline) else states
    for agent_state in per_agent:
        agent_state.reset()
def get_states_root(system, states) -> AgentStates:
    """Return the state object of the first (source-facing) agent in ``system``.

    Args:
        system: The agent or pipeline the states belong to.
        states: For a ``TreeAgentPipeline``, a dict indexed by module;
            otherwise a list whose first element is the root state. May be
            ``None`` for non-tree systems, in which case ``system.states`` is
            used.

    Returns:
        The root state taken from ``states``, so that callers mutate the same
        state objects they later pass on to the system.
    """
    if isinstance(system, TreeAgentPipeline):
        # states is a dict
        return states[system.source_module]
    if states is None:
        # Backward-compatible fallback to the states stored on the system.
        return system.states[0]
    # states is a list; use the one that was passed in rather than
    # ``system.states``, matching the tree branch and ``reset_states``.
    return states[0]
def build_streaming_system(model_configs, agent_class):
    """Instantiate ``agent_class`` from a configuration object.

    Args:
        model_configs: Configuration handed to ``cli_argument_list`` to produce
            the argument list that is parsed.
        agent_class: Class providing ``add_args(parser)`` and ``from_args(args)``.

    Returns:
        The object returned by ``agent_class.from_args``.
    """
    arg_parser = options.general_parser()
    arg_parser.add_argument(
        "-f", "--f", help="a dummy argument to fool ipython", default="1"
    )
    agent_class.add_args(arg_parser)
    cli_args = cli_argument_list(model_configs)
    # Unrecognised arguments are tolerated and discarded.
    parsed_args, _unknown = arg_parser.parse_known_args(cli_args)
    return agent_class.from_args(parsed_args)
def run_streaming_inference(system, audio_frontend, system_states, tgt_lang):
    """Feed audio from ``audio_frontend`` through ``system`` until the input ends.

    NOTE: Here for visualization, we calculate delays offset from audio
    *BEFORE* VAD segmentation.
    In contrast for SimulEval evaluation, we assume audios are pre-segmented,
    and Average Lagging, End Offset metrics are based on those pre-segmented audios.
    Thus, delays here are *NOT* comparable to SimulEval per-segment delays.

    Args:
        system: Streaming agent/pipeline exposing ``pushpop(segment, states)``.
        audio_frontend: Object whose ``send_segment()`` returns the next input
            segment.
        system_states: States passed to ``system.pushpop`` and reset whenever
            an output is finished.
        tgt_lang: Target language tag set on every input segment.

    Returns:
        ``(delays, prediction_lists, speech_durations, target_sample_rate)``
        where ``delays`` and ``prediction_lists`` are dicts with ``"s2st"`` and
        ``"s2tt"`` lists, ``speech_durations`` holds the duration in
        milliseconds of each speech output, and ``target_sample_rate`` is the
        sample rate of the last speech output (``None`` if there was none).
        If the front end yields an ``EmptySegment``, returns
        ``(None, None, None, None)``.
    """
    delays = {"s2st": [], "s2tt": []}
    prediction_lists = {"s2st": [], "s2tt": []}
    speech_durations = []
    curr_delay = 0
    target_sample_rate = None

    while True:
        input_segment = audio_frontend.send_segment()

        # Bail out before touching ``content`` or the agent states: an empty
        # segment carries no audio to measure or translate.
        if isinstance(input_segment, EmptySegment):
            return None, None, None, None

        input_segment.tgt_lang = tgt_lang
        curr_delay += len(input_segment.content) / SAMPLE_RATE * 1000
        if input_segment.finished:
            # a hack, we expect a real stream to end with silence
            get_states_root(system, system_states).source_finished = True

        # Translation happens here
        output_segments = OutputSegments(system.pushpop(input_segment, system_states))
        if not output_segments.is_empty:
            for segment in output_segments.segments:
                # NOTE: another difference from SimulEval evaluation -
                # delays are accumulated per-token
                if isinstance(segment, SpeechSegment):
                    pred_duration = 1000 * len(segment.content) / segment.sample_rate
                    speech_durations.append(pred_duration)
                    delays["s2st"].append(curr_delay)
                    prediction_lists["s2st"].append(segment.content)
                    target_sample_rate = segment.sample_rate
                elif isinstance(segment, TextSegment):
                    delays["s2tt"].append(curr_delay)
                    prediction_lists["s2tt"].append(segment.content)
                    print(curr_delay, segment.content)

        if output_segments.finished:
            reset_states(system, system_states)

        if input_segment.finished:
            # an assumption of SimulEval agents -
            # once source_finished=True, generate until output translation is finished
            break

    return delays, prediction_lists, speech_durations, target_sample_rate
def get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations):
    """Lay the speech outputs on a timeline, padding the gaps with silence.

    Each speech output starts at its delay, or right after the previous output
    ends if that is later. Any gap before it (including the lead-in before the
    first output) is filled with ``0.0`` samples.

    Args:
        delays: Dict whose ``"s2st"`` entry lists the delay (ms) of each output.
        target_sample_rate: Sample rate used to turn milliseconds of silence
            into a number of samples.
        prediction_lists: Dict whose ``"s2st"`` entry lists the samples of each
            output, index-aligned with ``delays["s2st"]``.
        speech_durations: Duration (ms) of each output, index-aligned as well.

    Returns:
        ``(target_samples, intervals)``: the flat list of samples and a list of
        ``[start_ms, duration_ms]`` pairs, one per output. Both are empty when
        there are no speech outputs.
    """
    s2st_delays = delays["s2st"]
    if not s2st_delays:
        # No speech was produced (target_sample_rate may be None here too).
        return [], []

    s2st_predictions = prediction_lists["s2st"]
    intervals = []
    prev_end = s2st_delays[0]
    # Leading silence up to the first output.
    target_samples = [0.0] * int(target_sample_rate * prev_end / 1000)

    for i, delay in enumerate(s2st_delays):
        start = max(prev_end, delay)
        if start > prev_end:
            # Wait source speech, add discontinuity with silence
            gap_samples = int(target_sample_rate * (start - prev_end) / 1000)
            target_samples.extend([0.0] * gap_samples)
        target_samples.extend(s2st_predictions[i])
        duration = speech_durations[i]
        prev_end = start + duration
        intervals.append([start, duration])

    return target_samples, intervals
|