Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
from pydub import AudioSegment | |
from transformers import T5ForConditionalGeneration, T5Tokenizer | |
import torch | |
import whisper | |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
models = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline") | |
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline") | |
models = models.to(device) | |
model = whisper.load_model("base") | |
st.title("Audio Analysis") | |
# Arguments input | |
st.subheader("Enter YouTube link and file name:") | |
url = st.text_input("YouTube link") | |
name = st.text_input("File name") | |
# Process audio and generate headings | |
if st.button("Process"): | |
if os.path.exists("audio.mp3"): | |
os.remove("audio.mp3") | |
os.system("youtube-dl "+"--write-thumbnail "+"--skip-download "+url + " -o logo.png") | |
os.system("yt-dlp -f 140 -o audio.mp3 " + url) | |
while not os.path.exists("audio.mp3"): | |
continue | |
if os.path.exists("segments"): | |
os.system("rm -rf segments") | |
audio = AudioSegment.from_file("audio.mp3") | |
segment_length = 30 * 1000 | |
if not os.path.exists("segments"): | |
os.makedirs("segments") | |
for i, segment in enumerate(audio[::segment_length]): | |
segment.export(f"segments/{i}.mp3", format="mp3") | |
original_text = "" | |
audio_list = os.listdir("segments") | |
headings = [] | |
original_texts = [] | |
dataForWeb = {} | |
for i in range(len(audio_list)): | |
st.write(f"Processing segment {i+1}/{len(audio_list)}") | |
audio = whisper.load_audio(f"segments/{i}.mp3") | |
audio = whisper.pad_or_trim(audio) | |
mel = whisper.log_mel_spectrogram(audio).to(model.device) | |
_, probs = model.detect_language(mel) | |
options = whisper.DecodingOptions(fp16=False) | |
result = whisper.decode(model, mel, options) | |
text = "headline: " + result.text | |
max_len = 256 | |
encoding = tokenizer.encode_plus(text, return_tensors="pt") | |
input_ids = encoding["input_ids"].to(device) | |
attention_masks = encoding["attention_mask"].to(device) | |
beam_outputs = models.generate( | |
input_ids=input_ids, | |
attention_mask=attention_masks, | |
max_length=64, | |
num_beams=3, | |
early_stopping=True, | |
) | |
generated_heading = tokenizer.decode(beam_outputs[0]) | |
headings.append(generated_heading) | |
original_texts.append(result.text) | |
dataForWeb[i] = { | |
"heading": generated_heading, | |
"text": result.text | |
} | |
original_text += "\n" | |
original_text += "<h3>" + generated_heading + "</h3>" | |
original_text += "\n" | |
original_text += "<p>" + result.text + "</p>" | |
with open(name, "w") as f: | |
f.write(original_text) | |
st.success("Audio processing completed!") | |
# Display results | |
st.subheader("Generated Headings and Text:") | |
for i, heading in enumerate(headings): | |
st.write(f"Segment {i+1}:") | |
st.write("Heading:", heading) | |
st.write("Text:", original_texts[i]) | |
st.write("-----------") | |