# GPTTUBE / app.py
# Author: Dhruv Pai Dukle (commit 1947bbe, "Add application file")
import os
import shutil
import subprocess

import streamlit as st
import torch
import whisper
from pydub import AudioSegment
from transformers import T5ForConditionalGeneration, T5Tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
models = models.to(device)
model = whisper.load_model("base")
st.title("Audio Analysis")

# Arguments input
st.subheader("Enter YouTube link and file name:")
url = st.text_input("YouTube link")
name = st.text_input("File name")

# Process audio and generate headings
if st.button("Process"):
    if os.path.exists("audio.mp3"):
        os.remove("audio.mp3")

    # Download the thumbnail and the audio track. Argument lists (shell=False)
    # keep a malicious URL from injecting shell commands.
    subprocess.run(
        ["youtube-dl", "--write-thumbnail", "--skip-download", url, "-o", "logo.png"],
        check=False,
    )
    subprocess.run(["yt-dlp", "-f", "140", "-o", "audio.mp3", url], check=False)

    # The downloads above run synchronously, so a missing file means the
    # download failed — report it instead of busy-waiting forever.
    if not os.path.exists("audio.mp3"):
        st.error("Audio download failed — check the YouTube link.")
        st.stop()

    # Re-create the segments directory from scratch.
    if os.path.exists("segments"):
        shutil.rmtree("segments")
    os.makedirs("segments")

    # Split the audio into 30-second chunks named 0.mp3, 1.mp3, ...
    audio = AudioSegment.from_file("audio.mp3")
    segment_length = 30 * 1000  # pydub measures time in milliseconds
    for i, segment in enumerate(audio[::segment_length]):
        segment.export(f"segments/{i}.mp3", format="mp3")

    original_text = ""
    audio_list = os.listdir("segments")
    headings = []
    original_texts = []
    dataForWeb = {}

    for i in range(len(audio_list)):
        st.write(f"Processing segment {i+1}/{len(audio_list)}")

        # Transcribe the segment with Whisper.
        audio = whisper.load_audio(f"segments/{i}.mp3")
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        _, probs = model.detect_language(mel)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)

        # Generate a headline for the transcript with the T5 model.
        # max_length/truncation keep long transcripts within T5's input limit
        # (the original defined max_len = 256 but never applied it).
        text = "headline: " + result.text
        encoding = tokenizer.encode_plus(
            text, max_length=256, truncation=True, return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_masks = encoding["attention_mask"].to(device)
        beam_outputs = models.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=3,
            early_stopping=True,
        )
        # skip_special_tokens drops <pad>/</s> markers from the heading text.
        generated_heading = tokenizer.decode(beam_outputs[0], skip_special_tokens=True)

        headings.append(generated_heading)
        original_texts.append(result.text)
        dataForWeb[i] = {
            "heading": generated_heading,
            "text": result.text,
        }
        original_text += "\n"
        original_text += "<h3>" + generated_heading + "</h3>"
        original_text += "\n"
        original_text += "<p>" + result.text + "</p>"

    # Persist the accumulated HTML under the user-supplied file name.
    with open(name, "w") as f:
        f.write(original_text)

    st.success("Audio processing completed!")

    # Display results
    st.subheader("Generated Headings and Text:")
    for i, heading in enumerate(headings):
        st.write(f"Segment {i+1}:")
        st.write("Heading:", heading)
        st.write("Text:", original_texts[i])
        st.write("-----------")