File size: 5,388 Bytes
199a0ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
import os
import random

def yt2mp3(url, outputMp3F):
    """Download a video with youtube-dl and convert its audio track to MP3.

    Args:
        url: Address of the video; handed to ``./bin/youtube-dl``.
        outputMp3F: Path of the MP3 file ffmpeg writes. An existing file is
            overwritten because ffmpeg is run with ``-y``.

    Raises:
        subprocess.CalledProcessError: If youtube-dl or ffmpeg exits with a
            non-zero status.
        FileNotFoundError: If youtube-dl left no downloaded file behind, or
            if one of the executables cannot be started.
    """
    # Imported locally so the function brings its own dependencies with it.
    import glob
    import subprocess
    import tempfile

    # A private scratch directory replaces the guessable /tmp/<random float>
    # name and is deleted again when the block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        download_stem = os.path.join(scratch_dir, "video")

        # Argument lists (no shell) keep a crafted URL from injecting shell
        # commands; "--" keeps a URL starting with "-" from being parsed as
        # a youtube-dl option.
        subprocess.run(
            ["./bin/youtube-dl", "-o", download_stem, "--verbose", "--", url],
            check=True,
        )

        # The previous implementation globbed for "<name>.*", i.e. it
        # expected an extension to be appended; accept the stem with or
        # without one.
        downloaded = sorted(glob.glob(glob.escape(download_stem) + "*"))
        if not downloaded:
            raise FileNotFoundError(
                f"youtube-dl produced no output file for {url!r}"
            )

        # -vn drops the video stream; the rest fixes the audio format to
        # 44.1 kHz, 2 channels, 192 kbit/s.
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", downloaded[0],
                "-vn",
                "-ar", "44100",
                "-ac", "2",
                "-b:a", "192k",
                outputMp3F,
            ],
            check=True,
        )


def speech2text(mp3_file):
    """Transcribe an audio file with the distil-whisper speech-to-text model.

    The model and processor are loaded on every call. The first CUDA device
    is used when one is available; otherwise everything runs on the CPU.

    Args:
        mp3_file: Path of the audio file handed to the recognition pipeline.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    # Derive device and dtype from the same check so they can never disagree
    # (previously the device was hard-coded to 'cuda:0' while the dtype
    # already fell back to float32 without CUDA).
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32

    # Model identifier for the speech-to-text model
    model_id = "distil-whisper/distil-large-v2"

    load_kwargs = {
        "torch_dtype": torch_dtype,
        "low_cpu_mem_usage": True,
        "use_safetensors": True,
    }
    if use_cuda:
        # Flash Attention 2 is a GPU feature, so it is only requested when
        # the model is going to live on a CUDA device.
        load_kwargs["use_flash_attention_2"] = True

    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **load_kwargs)
    model.to(device)

    # The processor bundles the tokenizer and the feature extractor that the
    # pipeline needs.
    processor = AutoProcessor.from_pretrained(model_id)

    # Long audio is processed in 15-second chunks, 16 chunks per batch.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=15,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )

    result = pipe(mp3_file)
    return result["text"]


# Text-generation pipeline shared by all chat() calls; built on first use.
_llama_pipeline = None


def _get_llama_pipeline():
    """Return the shared Llama-2 text-generation pipeline, loading it once."""
    global _llama_pipeline
    if _llama_pipeline is not None:
        return _llama_pipeline

    model_name = "meta-llama/Llama-2-7b-chat-hf"
    # Authentication token for the Hugging Face Hub; a missing variable
    # raises KeyError, exactly as before.
    token = os.environ['HUGGINGFACE_TOKEN']

    # Load the weights quantized to 8 bit.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True
    )

    # {"": 0} places the whole model on GPU 0.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"": 0},
        token=token,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

    _llama_pipeline = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer
    )
    return _llama_pipeline


def chat(system_prompt, text):
    """Generate a Llama-2 chat reply to ``text`` under ``system_prompt``.

    The model is loaded on the first call and reused afterwards, so repeated
    calls (for example from a summarization loop) no longer reload it. The
    trade-off is that the model stays in memory between calls.

    Args:
        system_prompt: Instruction placed inside the ``<<SYS>>`` section.
        text: User message placed inside the ``[INST]`` section.

    Returns:
        The generated continuation only, without the prompt.

    Raises:
        KeyError: If ``HUGGINGFACE_TOKEN`` is not set when the model is
            first loaded.
    """
    llama_pipeline = _get_llama_pipeline()

    # Built without the source-code indentation and leading newline that the
    # previous triple-quoted literal leaked into the prompt.
    # NOTE(review): the literal "<s>" is kept from the original; whether the
    # pipeline also adds a BOS token itself appears to depend on the
    # installed transformers version - confirm.
    prompt = (
        f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{text} [/INST]"
    )

    # NOTE(review): max_length=32000 is kept from the original; it looks
    # larger than the context window documented for Llama-2 - confirm.
    sequences = llama_pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=llama_pipeline.tokenizer.eos_token_id,
        max_length=32000,
        # Let the pipeline strip the prompt instead of searching for the
        # first '[/INST]', which would cut in the wrong place whenever the
        # inputs themselves contain that marker.
        return_full_text=False,
    )

    return sequences[0]["generated_text"]

def summarize(text, input_len=10000):
    """Summarize ``text`` of any length by folding it chunk by chunk.

    Each pass asks ``chat`` to summarize the first ``input_len`` characters,
    then replaces those characters with the summary. The summary of the pass
    that consumes the last of the text is returned.

    Args:
        text: The text to summarize.
        input_len: Maximum number of characters sent to the model per pass.

    Returns:
        The summary produced by the final pass.

    Raises:
        ValueError: If ``input_len`` is not positive (the loop could never
            consume any text).
    """
    if input_len <= 0:
        raise ValueError("input_len must be a positive number of characters")

    # NOTE: progress relies on every summary being shorter than the chunk it
    # replaces; nothing here enforces that.
    while True:
        # Progress output: length of the text still to be processed.
        print(len(text))

        head, tail = text[:input_len], text[input_len:]
        summary = chat("", "Summarize the following: " + head)

        # Nothing left beyond this chunk, so the summary is final. This also
        # covers len(text) == input_len, which previously triggered one more
        # needless pass over the summary.
        if not tail:
            return summary

        # Carry the summary forward together with the unread remainder.
        text = summary + " " + tail

import gradio as gr

# Functions and imports defined earlier in this file

# Function that summarizes the content of a YouTube URL
def summarize_from_youtube(url):
    """Download a YouTube video's audio, transcribe it and summarize it.

    Args:
        url: Address of the video to summarize.

    Returns:
        The summary of the transcribed speech.

    Raises:
        FileNotFoundError: If the download/conversion step left no audio
            file behind.
    """
    outputMp3F = "./files/audio.mp3"

    # Make sure the target directory exists and that audio from an earlier
    # request cannot be mistaken for the result of this one.
    os.makedirs(os.path.dirname(outputMp3F), exist_ok=True)
    if os.path.exists(outputMp3F):
        os.remove(outputMp3F)

    # Download the audio from the YouTube URL ...
    yt2mp3(url=url, outputMp3F=outputMp3F)
    if not os.path.isfile(outputMp3F):
        raise FileNotFoundError(
            f"No audio file was produced at {outputMp3F} for {url!r}"
        )

    # ... transcribe the speech to text, then summarize the transcript.
    transcribed = speech2text(mp3_file=outputMp3F)
    return summarize(transcribed)

# Gradio interface configuration. gr.Textbox serves as both input and output
# component; it replaces the deprecated gr.inputs / gr.outputs namespaces.
youtube_url = gr.Textbox(lines=1, label="Masukkan URL YouTube")
output_text = gr.Textbox(label="Ringkasan")

# Build the Gradio interface and start serving it.
gr.Interface(
    fn=summarize_from_youtube,
    inputs=youtube_url,
    outputs=output_text,
    title="Peringkas YouTube",
    description="Masukkan URL YouTube untuk merangkum kontennya."
).launch()