"""Summarize YouTube videos.

Pipeline: download the video's audio with youtube-dl, transcribe it with
Distil-Whisper, then iteratively summarize the transcript with Llama-2-chat.
Served as a small Gradio web app.
"""

import glob
import os
import random
import subprocess
import uuid

import torch
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

import gradio as gr
import gradio.inputs as inputs  # noqa: F401 — kept from the original file


def yt2mp3(url, outputMp3F):
    """Download the YouTube video at *url* and re-encode its audio to *outputMp3F*.

    Security fix: the original interpolated the user-supplied URL into an
    ``os.system`` shell string, allowing shell injection. ``subprocess.run``
    with argument lists never invokes a shell.
    """
    tmp_stem = f"/tmp/{uuid.uuid4().hex}"
    subprocess.run(
        ["./bin/youtube-dl", "-o", tmp_stem, "--verbose", url],
        check=True,
    )
    # youtube-dl appends a container extension to the output template;
    # resolve whatever file it actually produced (replaces the shell glob
    # the original relied on).
    produced = glob.glob(tmp_stem + ".*") or [tmp_stem]
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", produced[0],
            "-vn", "-ar", "44100", "-ac", "2", "-b:a", "192k",
            outputMp3F,
        ],
        check=True,
    )


def speech2text(mp3_file):
    """Transcribe *mp3_file* with Distil-Whisper and return the transcript text."""
    # Bug fix: the original hard-coded 'cuda:0' and crashed on CPU-only
    # machines, even though the dtype selection below already branches on
    # CUDA availability.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "distil-whisper/distil-large-v2"
    # NOTE(review): use_flash_attention_2 presumably requires a CUDA build of
    # flash-attn — confirm it degrades gracefully on CPU.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        use_flash_attention_2=True,
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=15,  # split long audio into 15 s chunks
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )
    result = pipe(mp3_file)
    return result["text"]


def chat(system_prompt, text):
    """Run one Llama-2-chat generation and return only the model's reply.

    Raises KeyError if the HUGGINGFACE_TOKEN environment variable is unset.
    """
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    token = os.environ["HUGGINGFACE_TOKEN"]
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    device_map = {"": 0}  # place the whole (8-bit) model on GPU 0
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_auth_token=token,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
    llama_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
    # Bug fix: Llama-2-chat expects the system prompt wrapped in
    # <<SYS>> ... <</SYS>>; the original used bare "<>", which the model
    # does not recognize as a system-prompt delimiter.
    prompt = f"""
    [INST] <<SYS>>
    {system_prompt}
    <</SYS>>
    {text}[/INST]
    """
    sequences = llama_pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # Bug fix: Llama-2's context window is 4096 tokens; the original
        # requested max_length=32000, beyond the model's position embeddings.
        max_length=4096,
    )
    generated_text = sequences[0]["generated_text"]
    # The pipeline echoes the prompt; keep only what follows "[/INST]".
    return generated_text[generated_text.find("[/INST]") + len("[/INST]"):]


def summarize(text):
    """Summarize *text* of arbitrary length via rolling chunked summarization.

    Each pass summarizes the first ``input_len`` characters; the resulting
    summary is prepended to the unread remainder and the loop repeats until
    the whole input fits in a single chunk.
    """
    input_len = 10000  # characters handed to the model per pass
    while True:
        summary = chat("", "Summarize the following: " + text[0:input_len])
        if len(text) < input_len:
            return summary
        text = summary + " " + text[input_len:]


def summarize_from_youtube(url):
    """End-to-end pipeline: YouTube URL -> mp3 -> transcript -> summary."""
    outputMp3F = "./files/audio.mp3"
    yt2mp3(url=url, outputMp3F=outputMp3F)
    transcribed = speech2text(mp3_file=outputMp3F)
    summary = summarize(transcribed)
    return summary


# Gradio UI. Labels/description are user-facing (Indonesian) and kept verbatim.
youtube_url = gr.inputs.Textbox(lines=1, label="Masukkan URL YouTube")
output_text = gr.outputs.Textbox(label="Summary")

gr.Interface(
    fn=summarize_from_youtube,
    inputs=youtube_url,
    outputs=output_text,
    title="YouTube Summarizer",
    description="Masukkan URL YouTube untuk merangkum kontennya.",
).launch()