dioarafl committed on
Commit de9ee5d
1 Parent(s): d895af0

Update app.py

Files changed (1)
  1. app.py +6 -45
app.py CHANGED
@@ -3,7 +3,7 @@ from transformers import (
     AutoModelForSpeechSeq2Seq,
     AutoProcessor,
     AutoModelForCausalLM,
-    AutoTokenizer,
+    AutoTokenizer,
     BitsAndBytesConfig,
 )
 import torch
@@ -17,16 +17,12 @@ def yt2mp3(url, outputMp3F):
 
 
 def speech2text(mp3_file):
-    # Set the computation device to GPU (if available) or CPU
     device = 'cuda:0'
 
-    # Choose data type based on CUDA availability (float16 for GPU, float32 for CPU)
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-    # Model identifier for the speech-to-text model
     model_id = "distil-whisper/distil-large-v2"
 
-    # Load the model with specified configurations for efficient processing
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
@@ -35,13 +31,10 @@ def speech2text(mp3_file):
         use_flash_attention_2=True
     )
 
-    # Move the model to the specified device (GPU/CPU)
     model.to(device)
 
-    # Load the processor for the model (handling tokenization and feature extraction)
     processor = AutoProcessor.from_pretrained(model_id)
 
-    # Set up a speech recognition pipeline with the model and processor
     pipe = pipeline(
         "automatic-speech-recognition",
         model=model,
@@ -54,35 +47,23 @@ def speech2text(mp3_file):
         device=device,
     )
 
-    # Process the MP3 file through the pipeline to get the speech recognition result
     result = pipe(mp3_file)
 
-    # Extract the text from the recognition result
     text_from_video = result["text"]
 
-    # Return the extracted text
     return text_from_video
 
 
 def chat(system_prompt, text):
-    """
-    It is not good practice to load the model again and again,
-    but for the sake of simplicity in this demo, let's keep it as it is
-    """
-
-    # Define the model name to be used for the chat function
     model_name = "meta-llama/Llama-2-7b-chat-hf"
-    # Authentication token for the Hugging Face API
     token = os.environ['HUGGINGFACE_TOKEN']
 
-    # Configure the model to load in a quantized 8-bit format for efficiency
     bnb_config = BitsAndBytesConfig(
         load_in_8bit=True
     )
 
-    # Set the device map to load the model on GPU 0
     device_map = {"": 0}
-    # Load the model from Hugging Face with the specified configuration
+
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         quantization_config=bnb_config,
@@ -90,13 +71,10 @@ def chat(system_prompt, text):
         use_auth_token=token
     )
 
-    # Load the tokenizer for the model
     tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
 
-    # Create a text-generation pipeline with the loaded model and tokenizer
     llama_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
 
-    # Format the input text with special tokens for the model
     text = f"""
     <s>[INST] <<SYS>>
     {system_prompt}
@@ -104,7 +82,6 @@ def chat(system_prompt, text):
     {text}[/INST]
     """
 
-    # Generate sequences using the pipeline with specified parameters
     sequences = llama_pipeline(
         text,
         do_sample=True,
@@ -114,54 +91,38 @@ def chat(system_prompt, text):
         max_length=32000
     )
 
-    # Extract the generated text from the sequences
     generated_text = sequences[0]["generated_text"]
-    # Trim the generated text to remove the instruction part
     generated_text = generated_text[generated_text.find('[/INST]')+len('[/INST]'):]
 
-    # Return the processed generated text
     return generated_text
 
 def summarize(text):
-    # Define the maximum input length for each iteration of summarization
     input_len = 10000
 
-    # Start an infinite loop to repeatedly summarize the text
     while True:
-        # Print the current length of the text
-        print(len(text))
-        # Call the chat function to summarize the text. Only the first 'input_len' characters are considered for summarization
        summary = chat("", "Summarize the following: " + text[0:input_len])
 
         if len(text) < input_len:
             return summary
 
-        # Concatenate the current summary with the remaining part of the text for the next iteration
         text = summary + " " + text[input_len:]
 
 import gradio as gr
 
-# The functions and imports you already had above
-
-# Function to summarize text from a YouTube URL
 def summarize_from_youtube(url):
-    # Download audio from the YouTube URL and transcribe the speech to text
     outputMp3F = "./files/audio.mp3"
     yt2mp3(url=url, outputMp3F=outputMp3F)
     transcribed = speech2text(mp3_file=outputMp3F)
-    # Summarize the transcribed text
     summary = summarize(transcribed)
     return summary
 
-# Configure the Gradio interface
-youtube_url = gr.inputs.Textbox(lines=1, label="Masukkan URL YouTube")
-output_text = gr.outputs.Textbox(label="Ringkasan")
+youtube_url = gr.inputs.Textbox(lines=1, label="Enter YouTube URL")
+output_text = gr.outputs.Textbox(label="Summary")
 
-# Create the Gradio interface
 gr.Interface(
     fn=summarize_from_youtube,
     inputs=youtube_url,
     outputs=output_text,
-    title="Peringkas YouTube",
-    description="Masukkan URL YouTube untuk merangkum kontennya."
+    title="YouTube Summarizer",
+    description="Enter a YouTube URL to summarize its content."
 ).launch()
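
Note: the yt2mp3 helper named in the second hunk header is not touched by this commit, so its body does not appear in the diff. For orientation, here is a minimal sketch of what such a helper typically looks like, assuming the Space downloads audio with yt-dlp; the option values are illustrative assumptions, not the repository's actual code.

# Hypothetical sketch of yt2mp3 (not from this commit), assuming yt-dlp is installed.
import yt_dlp

def yt2mp3(url, outputMp3F):
    # The FFmpegExtractAudio postprocessor appends ".mp3" itself,
    # so the output template omits the extension.
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outputMp3F.removesuffix(".mp3"),
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])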
 
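A related caveat the commit leaves in place: speech2text hardcodes device = 'cuda:0' while selecting torch_dtype with torch.cuda.is_available(), so on a CPU-only machine model.to(device) would still fail. A one-line sketch of the consistent guard (an assumption about intent, not part of the commit):

# Sketch: choose the device with the same guard used for the dtype.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32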
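
Also unchanged by the commit: the Llama pipeline is called with max_length=32000, but meta-llama/Llama-2-7b-chat-hf has a 4,096-token context window, so generation cannot actually use that budget and long inputs risk truncation or warnings. A safer sketch bounds only the generated continuation with max_new_tokens (the 512 budget is an assumed value; the elided sampling arguments are left as they are in app.py):

sequences = llama_pipeline(
    text,
    do_sample=True,
    # ... other sampling arguments as in app.py ...
    max_new_tokens=512,  # assumed budget; keeps prompt + output within 4,096 tokens
)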
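
Finally, the interface code kept by this commit still uses gr.inputs.Textbox and gr.outputs.Textbox, which were deprecated in Gradio 3.x and removed in 4.x. A sketch of the equivalent setup with the current component API (same behavior, assuming a recent Gradio runtime):

import gradio as gr

gr.Interface(
    fn=summarize_from_youtube,
    inputs=gr.Textbox(lines=1, label="Enter YouTube URL"),
    outputs=gr.Textbox(label="Summary"),
    title="YouTube Summarizer",
    description="Enter a YouTube URL to summarize its content.",
).launch()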