nirajandhakal committed on
Commit
4e8fa7f
1 Parent(s): b311c2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -92
app.py CHANGED
@@ -1,105 +1,70 @@
1
  import os
2
- import io
3
- import sys
4
- import time
5
- import gradio as gr
6
- from typing import List, Union, Any, Dict, Optional
7
  import torch
 
 
 
8
  import numpy as np
9
- import librosa
10
- from transformers import SpeechSynthesisPipeline, AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
11
- from pypdfium2 import PdfDocument, Page
 
 
 
 
 
 
 
 
 
 
12
 
13
- def convert_pdf_to_text(filepath):
14
- doc = PdfDocument(filepath)
15
- text = ""
16
- for page in doc.pages():
17
- text += page.extract_text() + "\n\n"
 
 
 
18
  return text
19
 
20
- class QuantizedSpeechT5TTSPipe:
21
- def __init__(self):
22
- self.tokenizer = AutoTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
23
- self.model = Wav2Vec2ForCTC.from_pretrained('/model/quantized_model').half().cuda()
24
- self.processor = Wav2Vec2Processor.from_pretrained("/model/quantized_vocab")
25
-
26
- def _pad_and_concatenate(self, tensor_list: List[torch.Tensor], padding_value=0):
27
- max_length = max([tensor.size(0) for tensor in tensor_list])
28
- padded_tensors = []
29
- for tensor in tensor_list:
30
- pad_width = max_length - tensor.size(0)
31
- if pad_width > 0:
32
- tensor_padded = torch.cat((tensor, torch.full((pad_width, tensor.size(1)), fill_value=padding_value).type_as(tensor)))
33
- else:
34
- tensor_padded = tensor
35
- padded_tensors.append(tensor_padded)
36
-
37
- return torch.stack(padded_tensors)
38
-
39
- def preprocess(self, inputs: Union[str, List[str]], **kwargs) -> dict:
40
- if isinstance(inputs, str):
41
- inputs = [inputs]
42
- batch_encodings = self.tokenizer(inputs, truncation=True, padding='longest', return_tensors="pt").input_values
43
- return {"batch_encodings": batch_encodings}
44
-
45
- def postprocess(self, outputs: Dict[str, torch.Tensor], **kwargs) -> Union[List[str], List[bytes]]:
46
- logits = outputs["logits"].cpu().detach().numpy()
47
- ids = np.argmax(logits, axis=-1)
48
- cleaned_ids = [id_seq[:np.where(np.array(id_seq) == 2)[0][0]] for id_seq in ids] # Remove CTC blanks
49
- decoded_strings = self.tokenizer.decode(cleaned_ids)
50
-
51
- audios = []
52
- for text in decoded_strings:
53
- input_values = self.processor(text, sampling_rate=16000, return_tensors="pt").input_values
54
- input_values = input_values.cuda().unsqueeze(0)
55
-
56
- mel_outputs = self.model(input_values).mel_output
57
- _, predicted_ids = torch.topk(mel_outputs.float(), k=1, dim=-1)
58
- predicted_ids = predicted_ids.squeeze(-1).tolist()[0]
59
-
60
- raw_waveform = self.processor.post_processing(predicted_ids)
61
- waveform = raw_waveform * 32768 / max(abs(raw_waveform))
62
- wav_data = np.int16(waveform)
63
- audio = io.BytesIO()
64
- sf.write(audio, int(44100), wav_data, format="WAV")
65
- audios.append(audio.getvalue())
66
- return audios
67
-
68
- def generate(self, text: str):
69
- processed_inputs = self.preprocess(text)
70
- outputs = self.model(**processed_inputs)
71
- results = self.postprocess(outputs)
72
- return results
73
 
74
- if __name__ == "__main__":
75
- tts = QuantizedSpeechT5TTSPipe()
76
- sample_text = 'Hello world! This is a test.'
77
- result = tts.generate(sample_text)
78
- print(f'Generated {len(result)} audio files from "{sample_text}"')
 
 
 
 
 
79
 
80
- def main(pdf_file: gr.File, output_filename: str):
81
- start_time = time.time()
82
- pdf_text = convert_pdf_to_text(pdf_file)
83
- print(f'Processed PDF content in {time.time() - start_time:.4f} seconds')
84
 
85
- pipe = QuantizedSpeechT5TTSPipe()
86
- start_time = time.time()
87
- audios = pipe.generate(pdf_text)
88
- print(f'Generated {len(audios)} audio files in {time.time() - start_time:.4f} seconds')
89
 
90
- zip_buffer = BytesIO()
91
- with ZipFile(zip_buffer, mode='w') as zf:
92
- for i, audio in enumerate(audios):
93
- filename = f"{i}_{output_filename}.wav"
94
- zf.writestr(filename, audio)
95
- zip_buffer.seek(0)
96
 
97
- return {'zip': zip_buffer}
 
98
 
99
- iface = gr.Interface(fn=main,
100
- inputs="file",
101
- outputs="binary",
102
- input_types=['pdf'],
103
- output_types=['download'])
104
 
105
- iface.launch()
 
 
 
 
 
1
  import os
 
 
 
 
 
2
  import torch
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+ import PyPDF2
5
+ import sounddevice as sd
6
  import numpy as np
7
+ from gtts import gTTS
8
+ from io import BytesIO
9
+ import gradio as gr
10
+
11
def load_quantized_model(model_name):
    """Load a seq2seq checkpoint plus its tokenizer and quantize the model.

    Args:
        model_name: Identifier handed unchanged to ``from_pretrained`` for
            both the model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is the result of
        ``torch.quantization.quantize_dynamic`` applied to the
        ``torch.nn.Linear`` modules with ``torch.qint8``, switched to
        evaluation mode.
    """
    pretrained = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Dynamic quantization targets only the Linear layers.
    quantized = torch.quantization.quantize_dynamic(
        pretrained,
        {torch.nn.Linear},
        dtype=torch.qint8,
    )
    quantized.eval()

    return quantized, tokenizer
20
 
21
def pdf_to_text(pdf_bytes):
    """Extract the text of every page of a PDF supplied as raw bytes.

    Args:
        pdf_bytes: Content of the PDF file as a bytes object.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator between pages. Pages that yield no text contribute an
        empty string.
    """
    # The context manager releases the in-memory buffer even when parsing
    # raises part-way through the document.
    with BytesIO(pdf_bytes) as pdf_stream:
        # PdfReader / .pages / extract_text() are the current PyPDF2 names;
        # PdfFileReader / numPages / getPage / extractText were removed in
        # PyPDF2 3.x and raise when used.
        reader = PyPDF2.PdfReader(pdf_stream)
        return "".join(page.extract_text() or "" for page in reader.pages)
30
 
31
def generate_audio(model, tokenizer, text):
    """Run *text* through ``model.generate`` and return the decoded output.

    Despite its name, this function returns text, not audio.

    Args:
        model: Object exposing ``generate(input_ids, max_length=...,
            pad_token_id=...)``. If it has a ``device`` attribute, the
            encoded input is moved to that device.
        tokenizer: Object exposing ``encode``, ``decode`` and
            ``eos_token_id``.
        text: The input string to encode.

    Returns:
        The string produced by decoding the first generated sequence.
    """
    # encode(..., return_tensors="pt") already yields a tensor; wrapping it in
    # torch.tensor() again only copies it.
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # Follow the model's device instead of forcing CUDA: an unconditional
    # .cuda() crashes on CPU-only hosts and mismatches a model left on the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=500,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
def save_and_play_audio(text):
    """Synthesize *text* to English speech with gTTS and save it as MP3.

    The file is written to ``output.mp3`` in the current working directory,
    overwriting any previous file of that name.

    Args:
        text: The text to speak.

    Returns:
        The path of the written MP3 file (``"output.mp3"``).

    Raises:
        ValueError: If *text* is empty or contains only whitespace.
    """
    # Fail early with a clear message instead of an opaque error from the
    # speech backend when e.g. a scanned PDF produced no text.
    if not text or not text.strip():
        raise ValueError("No text to convert to speech.")

    output_file = "output.mp3"
    gTTS(text=text, lang='en').save(output_file)

    # The previous implementation called sd.default.read_audio(), which
    # sounddevice does not provide, so the function always raised after
    # saving. sounddevice plays NumPy sample arrays and has no MP3 decoder,
    # so local playback is omitted; the caller receives the file path.
    return output_file
48
 
49
def main(pdf_file):
    """Convert an uploaded PDF to speech and return the audio file path.

    Steps: load the quantized model, extract the PDF text, run the text
    through the model, then synthesize the model output to an MP3 file.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or a
            binary file-like object exposing ``read()``.

    Returns:
        A dict ``{"output_file": <path of the generated MP3>}``.
    """
    # NOTE(review): "microsoft/speecht5_tts" is a text-to-speech checkpoint;
    # confirm that AutoModelForSeq2SeqLM can actually load it.
    model, tokenizer = load_quantized_model("microsoft/speecht5_tts")

    # The model is deliberately NOT moved to the GPU: PyTorch dynamic
    # quantization (qint8 Linear layers) is executed on the CPU only, so
    # model.cuda() left the model in an unusable mixed-device state.

    # Accept both a path and an open file object, since the upload widget may
    # hand over either depending on its configuration.
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            pdf_bytes = handle.read()
    else:
        pdf_bytes = pdf_file.read()

    # Convert the uploaded PDF file to text.
    text = pdf_to_text(pdf_bytes)

    # Despite its name, generate_audio returns text produced by the model.
    audio_text = generate_audio(model, tokenizer, text)

    # Synthesize that text to an MP3 file.
    output_file = save_and_play_audio(audio_text)

    return {"output_file": output_file}
67
+
68
if __name__ == "__main__":
    # gr.File replaces the deprecated gr.inputs.File, and "pdf" is not a valid
    # value for its `type` argument. The upload is restricted to PDFs through
    # `file_types` instead.
    app = gr.Interface(main, inputs=gr.File(file_types=[".pdf"]), outputs="text")
    app.launch()