Spaces:

LarissaE
/

test

Runtime error

App Files Files Community

test / app.py

LarissaE

Changes to google translate

47520da almost 2 years ago

raw

history blame

3.7 kB

	import os
	import gradio as gr
	from transformers import pipeline
	from pytube import YouTube
	from datasets import Dataset, Audio
	from moviepy.editor import AudioFileClip
	import googletrans
	from googletrans import Translator

	# Build the transformers pipeline once, at import time, so every request
	# reuses the same loaded model. Only the checkpoint name is given here; the
	# task is left for the library to infer.
	# NOTE(review): presumably a Whisper speech-recognition checkpoint - confirm.
	pipe = pipeline(model="rafat0421/whisper-small-hi")

	def download_from_youtube(url):
	    """Download the first audio-only MP4 stream of a YouTube video.

	    Args:
	        url: Address of the YouTube video, handed to ``pytube.YouTube``.

	    Returns:
	        The path to the downloaded audio file, as returned by the stream's
	        ``download()`` call.

	    Raises:
	        ValueError: If the video offers no audio-only MP4 stream.
	    """
	    audio_streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
	    stream = audio_streams.first()
	    if stream is None:
	        # first() yields None for an empty query; fail with a clear message
	        # instead of an AttributeError on None.download().
	        raise ValueError(f"No audio-only MP4 stream found for {url!r}")
	    return stream.download()

	def get_timestamp(seconds):
	    """Format a duration given in seconds as a zero-padded ``MM:SS`` string.

	    Fractions of a second are truncated. Minutes are not wrapped at 60, so
	    3725 seconds is rendered as "62:05".
	    """
	    whole_minutes = int(seconds / 60)
	    leftover_seconds = int(seconds % 60)
	    return "{:02d}:{:02d}".format(whole_minutes, leftover_seconds)

	def create_segments(audio_fpath, seconds_max):
	    """Cut an audio file into consecutive WAV segments of up to 30 seconds.

	    Segments are written to ``segmented_audios/segment_<i>.wav``. At most
	    ``int(seconds_max / 30)`` segments are produced. When the audio ends
	    before that limit, the trailing remainder becomes one shorter segment,
	    unless it lasts two seconds or less, in which case it is dropped.

	    Args:
	        audio_fpath: Path of the audio file to split.
	        seconds_max: Upper bound, in seconds, on how much audio to segment.

	    Returns:
	        A ``(segment_paths, segment_start_times)`` tuple of parallel lists:
	        the path of each written segment and its start offset in seconds.
	    """
	    segment_seconds = 30  # length of every full segment
	    min_tail_seconds = 2  # a trailing remainder this short (or shorter) is skipped
	    out_dir = "segmented_audios"

	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs(out_dir, exist_ok=True)

	    segment_paths = []
	    segment_start_times = []

	    sound = AudioFileClip(audio_fpath)
	    try:
	        n_full_segments = int(sound.duration / segment_seconds)
	        len_last_segment = sound.duration % segment_seconds

	        max_segments = int(seconds_max / segment_seconds)
	        if n_full_segments > max_segments:
	            # The cap cuts the audio on a segment boundary: no partial tail.
	            n_full_segments = max_segments
	            len_last_segment = 0

	        # Index n_full_segments (if reached) is the partial trailing segment.
	        for i in range(min(n_full_segments + 1, max_segments)):
	            start = i * segment_seconds
	            if i < n_full_segments:
	                end = start + segment_seconds
	            elif len_last_segment > min_tail_seconds:
	                end = start + len_last_segment
	            else:
	                continue

	            segment_path = os.path.join(out_dir, f"segment_{i}.wav")
	            sound.subclip(start, end).write_audiofile(segment_path)
	            segment_paths.append(segment_path)
	            segment_start_times.append(start)
	    finally:
	        # Release the underlying audio reader even if cutting/writing fails.
	        sound.close()

	    return segment_paths, segment_start_times

	def get_translation(text):
	    """Translate text to English through Google Translate (googletrans).

	    Args:
	        text: The text to translate. No source language is passed, so the
	            library's default applies.

	    Returns:
	        The English translation, or "" when ``text`` is empty or contains
	        only whitespace (no request is made in that case).
	    """
	    if not text or not text.strip():
	        # Nothing to translate; skip the network round-trip.
	        return ""

	    translator = Translator(service_urls=['translate.googleapis.com'])
	    # googletrans names the target-language argument ``dest``. ``lang_tgt``
	    # belongs to a different library and is not a parameter of this method.
	    return translator.translate(text, dest="en").text

	def transcribe(audio, url, seconds_max):
	    """Transcribe a YouTube video's audio or a recorded audio file.

	    A non-empty ``url`` takes precedence: the audio is downloaded, cut into
	    segments, and each segment is transcribed and translated to English.
	    Otherwise ``audio`` is passed to the pipeline directly and only the
	    transcription is returned.

	    Args:
	        audio: Audio input for the pipeline (the interface supplies a file
	            path); used only when ``url`` is empty.
	        url: YouTube link, or an empty value to use ``audio`` instead.
	        seconds_max: Maximum number of seconds of the video to transcribe.

	    Returns:
	        For a URL, one text block per segment containing a header with the
	        segment number and start time, the transcription, and its
	        translation; "" if no segment could be produced. For recorded
	        audio, the plain transcription.

	    Raises:
	        ValueError: If neither ``url`` nor ``audio`` is provided.
	    """
	    if not url:
	        if not audio:
	            # Fail early with a clear message instead of handing None to the pipeline.
	            raise ValueError("Provide a microphone recording or a YouTube URL to transcribe.")
	        return pipe(audio)["text"]

	    fpath = download_from_youtube(url)
	    segment_paths, segment_start_times = create_segments(fpath, seconds_max)
	    if not segment_paths:
	        # Audio too short to yield a segment: nothing to transcribe.
	        return ""

	    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
	    pred = pipe(audio_dataset["audio"])

	    n_segments = len(segment_start_times)
	    parts = []
	    for i, (seconds, output) in enumerate(zip(segment_start_times, pred)):
	        parts.append(f"[Segment {i+1}/{n_segments}, start time {get_timestamp(seconds)}]\n")
	        parts.append(f"{output['text']}\n")
	        parts.append(f"[Translation]\n{get_translation(output['text'])}\n\n")
	    return "".join(parts)

	# Input widgets, listed in the positional order of transcribe(audio, url, seconds_max).
	microphone_input = gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone")
	youtube_url_input = gr.Text(max_lines=1, placeholder="YouTube Link", label="Transcribe from YouTube URL")
	duration_slider = gr.Slider(minimum=30, maximum=600, value=30, step=30, label="Number of seconds to transcribe")

	# Wire the widgets to transcribe() and show its return value as plain text.
	iface = gr.Interface(
	    fn=transcribe,
	    inputs=[microphone_input, youtube_url_input, duration_slider],
	    outputs="text",
	    title="Whisper: transcribe Swedish language audio to text",
	    description="Swedish Text Transcription using Transformers.",
	)

	# Start serving the interface as soon as this module is executed.
	iface.launch()