dioarafl committed
Commit 199a0ec
Parent(s): e8de046

Create app.py

Files changed (1):
app.py +167 -0
app.py ADDED
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
import os
import random
import gradio as gr

def yt2mp3(url, outputMp3F):
    # Download the video to a randomly named temp file, then re-encode its
    # audio track as a 192 kbps MP3
    tmpVideoF = random.random()
    os.system(f"./bin/youtube-dl -o /tmp/{tmpVideoF} --verbose " + url)
    os.system(f"ffmpeg -y -i /tmp/{tmpVideoF}.* -vn -ar 44100 -ac 2 -b:a 192k {outputMp3F}")
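
# The shell-out above depends on a bundled ./bin/youtube-dl binary and leaves
# temp files in /tmp. A minimal alternative sketch using the yt-dlp Python API
# (not used by this app; assumes `pip install yt-dlp`, with ffmpeg still on PATH):
#
# from yt_dlp import YoutubeDL
#
# def yt2mp3_ytdlp(url, outputMp3F):
#     opts = {
#         "format": "bestaudio/best",
#         "outtmpl": outputMp3F[:-len(".mp3")],  # yt-dlp appends .mp3 itself
#         "postprocessors": [{
#             "key": "FFmpegExtractAudio",
#             "preferredcodec": "mp3",
#             "preferredquality": "192",
#         }],
#     }
#     with YoutubeDL(opts) as ydl:
#         ydl.download([url])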

def speech2text(mp3_file):
    # Select the computation device: GPU if available, otherwise CPU
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # Choose the data type based on CUDA availability (float16 on GPU, float32 on CPU)
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Model identifier for the speech-to-text model
    model_id = "distil-whisper/distil-large-v2"

    # Load the model with settings for memory-efficient loading
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        use_flash_attention_2=True,  # requires the flash-attn package
    )

    # Move the model to the selected device
    model.to(device)

    # Load the processor (tokenizer and feature extractor) for the model
    processor = AutoProcessor.from_pretrained(model_id)

    # Set up a speech-recognition pipeline with the model and processor
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=15,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )

    # Run the MP3 file through the pipeline and return the transcribed text
    result = pipe(mp3_file)
    return result["text"]
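
# Note on the pipeline parameters above: chunk_length_s=15 splits long audio
# into 15-second windows and batch_size=16 transcribes 16 windows per forward
# pass, which is what makes long videos tractable; max_new_tokens=128 caps the
# text generated per chunk.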

def chat(system_prompt, text):
    """
    It is not good practice to reload the model on every call,
    but for the sake of simplicity in this demo, let's keep it as is.
    """

    # Model to use for the chat function
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    # Authentication token for the Hugging Face Hub (the Llama-2 weights are gated)
    token = os.environ['HUGGINGFACE_TOKEN']

    # Load the model in quantized 8-bit format for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True
    )

    # Map the whole model onto GPU 0
    device_map = {"": 0}
    # Load the model from the Hugging Face Hub with the specified configuration
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_auth_token=token
    )

    # Load the tokenizer for the model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)

    # Create a text-generation pipeline with the loaded model and tokenizer
    llama_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

    # Wrap the input in Llama-2's chat template markers
    text = f"""
<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{text}[/INST]
"""

    # Generate a completion with the pipeline
    sequences = llama_pipeline(
        text,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=4096,  # Llama-2's context window is 4096 tokens; 32000 would overflow it
    )

    # Keep only the text generated after the prompt's closing [/INST] tag
    generated_text = sequences[0]["generated_text"]
    generated_text = generated_text[generated_text.find('[/INST]') + len('[/INST]'):]

    return generated_text
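
# For reference, a call like chat("You are a concise assistant.", "Hi") sends
# the model a prompt of the form:
#
#   <s>[INST] <<SYS>>
#   You are a concise assistant.
#   <</SYS>>
#   Hi[/INST]
#
# and only the text generated after the closing [/INST] tag is returned.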

def summarize(text):
    # Maximum number of characters fed to the model per summarization pass
    input_len = 10000

    # Iteratively summarize: each pass condenses the first input_len characters,
    # then prepends the summary to the rest of the text for the next pass
    while True:
        # Print the current length of the working text (progress indicator)
        print(len(text))
        # Summarize the first input_len characters
        summary = chat("", "Summarize the following: " + text[0:input_len])

        # If the whole text fit into this pass, we are done
        if len(text) < input_len:
            return summary

        # Prepend the summary to the remaining text for the next iteration
        text = summary + " " + text[input_len:]
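
# Worked example of the loop: for a 25,000-character transcript, the first
# pass summarizes characters 0-9,999 and prepends that summary to the
# remaining 15,000 characters; passes repeat until the working text is
# shorter than input_len, at which point that pass's summary is returned.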

# Summarize the content of a YouTube video given its URL
def summarize_from_youtube(url):
    # Download the audio from the YouTube URL and transcribe the speech to text
    outputMp3F = "./files/audio.mp3"
    os.makedirs("./files", exist_ok=True)  # make sure the output directory exists
    yt2mp3(url=url, outputMp3F=outputMp3F)
    transcribed = speech2text(mp3_file=outputMp3F)
    # Summarize the transcribed text
    summary = summarize(transcribed)
    return summary

# Configure the Gradio interface components
youtube_url = gr.Textbox(lines=1, label="Enter a YouTube URL")
output_text = gr.Textbox(label="Summary")

# Build and launch the Gradio interface
gr.Interface(
    fn=summarize_from_youtube,
    inputs=youtube_url,
    outputs=output_text,
    title="YouTube Summarizer",
    description="Enter a YouTube URL to summarize its content."
).launch()
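
# Local smoke-test sketch, bypassing the UI (the URL below is a placeholder;
# requires the HUGGINGFACE_TOKEN environment variable, ./bin/youtube-dl, and
# ffmpeg). Run these lines in a session where the functions above are defined,
# since executing app.py directly launches the Gradio server instead:
#
#   print(summarize_from_youtube("https://www.youtube.com/watch?v=VIDEO_ID"))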