from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
import os
import random
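
# Assumed runtime dependencies (the source does not pin them): transformers,
# torch, accelerate, bitsandbytes, flash-attn, gradio, plus ffmpeg and a
# ./bin/youtube-dl binary available on the system.
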
def yt2mp3(url, outputMp3F):
    # Random temporary filename stem so concurrent runs don't collide
    tmpVideoF = random.random()
    # Download the video, letting youtube-dl pick the container extension
    os.system(f"./bin/youtube-dl -o '/tmp/{tmpVideoF}.%(ext)s' --verbose " + url)
    # Extract the audio track and re-encode it as a 44.1 kHz stereo 192 kbps MP3
    os.system(f"ffmpeg -y -i /tmp/{tmpVideoF}.* -vn -ar 44100 -ac 2 -b:a 192k {outputMp3F}")
def speech2text(mp3_file):
    # Set the computation device to GPU (if available) or CPU
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    # Choose data type based on CUDA availability (float16 for GPU, float32 for CPU)
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    # Model identifier for the speech-to-text model
    model_id = "distil-whisper/distil-large-v2"
    # Load the model with specified configurations for efficient processing
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        use_flash_attention_2=True,
    )
    # Move the model to the selected device
    model.to(device)
    # Load the processor for the model (tokenization and feature extraction)
    processor = AutoProcessor.from_pretrained(model_id)
    # Set up a speech-recognition pipeline with the model and processor
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=15,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )
    # Run the MP3 file through the pipeline to get the transcription
    result = pipe(mp3_file)
    # Return the transcribed text
    return result["text"]
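
# Usage sketch (hypothetical path; the first call downloads the
# distil-whisper/distil-large-v2 weights from the Hugging Face Hub):
#
#   transcript = speech2text("./files/audio.mp3")
#   print(transcript[:200])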
def chat(system_prompt, text):
    """
    Reloading the model on every call is not good practice,
    but for the sake of simplicity in this demo, let's keep it as it is.
    """
    # Model name used for the chat function
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    # Authentication token for the Hugging Face Hub (the model is gated)
    token = os.environ['HUGGINGFACE_TOKEN']
    # Configure the model to load in a quantized 8-bit format for efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True
    )
    # Set the device map to load the whole model on GPU 0
    device_map = {"": 0}
    # Load the model from Hugging Face with the specified configuration
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_auth_token=token,
    )
    # Load the tokenizer for the model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
    # Create a text-generation pipeline with the loaded model and tokenizer
    llama_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
    # Wrap the input in Llama 2's chat prompt format
    text = f"""
<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{text}[/INST]
"""
    # Generate sequences using the pipeline with specified parameters
    sequences = llama_pipeline(
        text,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=4096,  # Llama 2's context window is 4096 tokens
    )
    # Extract the generated text from the sequences
    generated_text = sequences[0]["generated_text"]
    # Keep only the reply after the closing [/INST] tag
    generated_text = generated_text[generated_text.find('[/INST]') + len('[/INST]'):]
    return generated_text
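
# Usage sketch (hypothetical prompt; requires HUGGINGFACE_TOKEN to be set and
# access to the gated meta-llama/Llama-2-7b-chat-hf checkpoint):
#
#   reply = chat("You are a concise assistant.", "What is knowledge distillation?")
#   print(reply)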
def summarize(text):
    # Maximum number of characters fed to the model per summarization pass
    input_len = 10000
    # Repeatedly fold the running summary into the remaining text until it all fits
    while True:
        # Print the current length of the text as a progress indicator
        print(len(text))
        # Summarize the first 'input_len' characters of the text
        summary = chat("", "Summarize the following: " + text[0:input_len])
        # If the whole text fit into this pass, the summary is final
        if len(text) < input_len:
            return summary
        # Prepend the summary to the unread remainder for the next pass
        text = summary + " " + text[input_len:]
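
# Worked sketch of the rolling summarization above (sizes are illustrative,
# not from the source). For a 25,000-character transcript with input_len = 10000:
#
#   pass 1: summarize the first 10,000 chars                      -> S1
#   pass 2: text becomes S1 + the rest; summarize its first 10,000 -> S2
#   ...repeat until the folded text drops under 10,000 chars,
#   at which point that pass's summary is returned.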

import gradio as gr

# The functions and imports defined above are assumed to be in place

# Summarize the content of a YouTube video from its URL
def summarize_from_youtube(url):
    # Download the audio from the YouTube URL and transcribe the speech to text
    outputMp3F = "./files/audio.mp3"
    os.makedirs("./files", exist_ok=True)  # make sure the output directory exists
    yt2mp3(url=url, outputMp3F=outputMp3F)
    transcribed = speech2text(mp3_file=outputMp3F)
    # Summarize the transcribed text
    summary = summarize(transcribed)
    return summary

# Configure the Gradio interface components
youtube_url = gr.Textbox(lines=1, label="Enter a YouTube URL")
output_text = gr.Textbox(label="Summary")

# Build and launch the Gradio interface
gr.Interface(
    fn=summarize_from_youtube,
    inputs=youtube_url,
    outputs=output_text,
    title="YouTube Summarizer",
    description="Enter a YouTube URL to summarize its content."
).launch()