Spaces:

elith
/

PHASR

Sleeping

App Files Files Community

PHASR / app.py

oriki101

modify app.py

02f5074 about 1 year ago

raw

history blame contribute delete

3.03 kB

	from pathlib import Path

	import gradio as gr
	import librosa
	import numpy as np
	from espnet2.bin.asr_inference import Speech2Text
	from espnet_model_zoo.downloader import ModelDownloader

	# Model artifacts are looked up under ./config, relative to the working directory
	base_dir = Path("./config")
	MODEL_FILE = base_dir.joinpath("31epoch.pth")
	TRAIN_CONFIG = base_dir.joinpath("config.yaml")
	NORM_CONFIG = base_dir.joinpath("feats_stats.npz")
	DEVICE = "cpu"
	RESAMPLING_RATE = 16000

	# Speech recognition model, built once when the module is loaded
	speech2text = Speech2Text(
	    asr_train_config=TRAIN_CONFIG,
	    asr_model_file=MODEL_FILE,
	    device=DEVICE,
	)


	# Resampling
	def resample(audio: np.ndarray, original_sr: int) -> tuple[np.ndarray, int]:
	    """Convert an audio signal to mono samples at ``RESAMPLING_RATE``.

	    Args:
	        audio: Audio samples. Either 1-D ``(samples,)`` or 2-D; a 2-D array
	            is assumed to be laid out as ``(samples, channels)``, which is
	            the layout Gradio documents for ``gr.Audio(type="numpy")``.
	        original_sr: Sample rate of ``audio`` in Hz.

	    Returns:
	        The resampled 1-D signal and the target sample rate
	        (``RESAMPLING_RATE``).
	    """
	    # Integer PCM (int16/int32) is converted to float32 and scaled by the
	    # dtype's maximum value.
	    if audio.dtype in [np.int16, np.int32]:
	        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
	    # Multi-channel input: average the channels down to mono. Without this,
	    # librosa.resample (default axis=-1) would resample along the channel
	    # axis instead of the time axis.
	    if audio.ndim > 1:
	        audio = audio.mean(axis=1)
	    # Bring the signal from original_sr to the rate the model is fed with
	    resampled_audio = librosa.resample(
	        audio, orig_sr=original_sr, target_sr=RESAMPLING_RATE
	    )
	    return resampled_audio, RESAMPLING_RATE


	# Transcription
	def transcribe(input: tuple[int, np.ndarray]) -> str:
	    """Run speech recognition on one audio payload.

	    Args:
	        input: ``(sample_rate, samples)`` pair, or ``None`` when no audio
	            was submitted.

	    Returns:
	        The text of the first hypothesis returned by ``speech2text``.

	    Raises:
	        gr.Error: If ``input`` is ``None``.
	    """
	    if input is None:
	        raise gr.Error("音声ファイルが提出されていません。実行する前に音声ファイルをアップロードしてください。")

	    sample_rate, waveform = input[0], input[1]
	    # resample() also returns the target rate, which is not needed here
	    resampled, _target_rate = resample(waveform, sample_rate)
	    # Recognition: keep only the text of the first hypothesis
	    hypotheses = speech2text(resampled)
	    transcript, *_rest = hypotheses[0]
	    return transcript


	# Build the web app (no theme is applied)
	demo_all = gr.Blocks()

	# Tab 1: transcribe audio recorded from the microphone
	microphone_input = gr.Audio(
	    sources="microphone", type="numpy", label="microphoneFile"
	)
	demo_radio = gr.Interface(
	    fn=transcribe,
	    inputs=microphone_input,
	    outputs="text",
	    title="録音した音声をテキストに変換",
	    description="録音した音声をテキストに文字起こしします。",
	)

	# Tab 2: transcribe an uploaded audio file
	upload_input = gr.Audio(sources="upload", type="numpy", label="Audiofile")
	demo_sound = gr.Interface(
	    fn=transcribe,
	    inputs=upload_input,
	    outputs="text",
	    title="アップロードした音声をテキストに変換",
	    description="アップロードした音声データをテキストに文字起こしします。",
	)

	# Combine both interfaces as tabs of a single web app
	tab_interfaces = [demo_radio, demo_sound]
	tab_names = ["Microphone", "Audio File"]
	with demo_all:
	    gr.TabbedInterface(tab_interfaces, tab_names)

	# Start the web app
	demo_all.launch(share=True)