speech2viddeo / speech2video.py
Re1th's picture
Upload speech2video.py
d646ee8 verified
# -*- coding: utf-8 -*-
"""Speech2Video.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1CcYNY0wwS05Ml7UVv4oY7cHjlVrhTbIq
"""
from google.colab import drive
drive.mount('/content/drive')
!apt-get install python3-pyaudio
!pip install SpeechRecognition
!pip install pydub
from pydub import AudioSegment
import speech_recognition as sr
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
!pip install modelscope==1.4.2
!pip install open_clip_torch
!pip install pytorch-lightning
from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys
# Build the ModelScope text-to-video pipeline once at module level; main()
# invokes it through the global name `p`.
p = pipeline('text-to-video-synthesis', 'damo/text-to-video-synthesis')
def convert_to_wav(input_file, output_file):
    """Re-encode an Ogg audio file as WAV.

    Args:
        input_file: Path of the Ogg file to read.
        output_file: Path the WAV data is exported to.
    """
    AudioSegment.from_ogg(input_file).export(output_file, format="wav")
# Function to convert audio file to text
def speech_to_text(audio_file):
    """Transcribe an audio file using ``Recognizer.recognize_google``.

    Args:
        audio_file: Path of the audio file opened with ``sr.AudioFile``.

    Returns:
        The recognized text, or an empty string when the audio could not be
        understood or the recognition request failed (a message is printed
        in both failure cases).
    """
    engine = sr.Recognizer()
    with sr.AudioFile(audio_file) as clip:
        # Read the whole file into memory before sending it for recognition.
        recording = engine.record(clip)

    transcript = ""
    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        print("Sorry, could not understand audio")
    except sr.RequestError as err:
        print("Error fetching results; {0}".format(err))
    return transcript
# Function to preprocess text
def preprocess_text(text):
    """Strip non-letter characters from *text* and lemmatize its tokens.

    Args:
        text: Raw transcript string.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Keep only ASCII letters and whitespace; digits and punctuation are dropped.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(letters_only)

    # Only lemmatization feeds the result; no stemming is applied.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)
# Main function
def main(
    input_file="/content/drive/MyDrive/IV II PROJECT/WhatsApp Audio 2024-03-24 at 8.52.04 AM.ogg",
    output_file="/content/drive/MyDrive/IV II PROJECT/converted_audio.wav",
):
    """Run the speech-to-video flow: Ogg -> WAV -> text -> video -> download.

    Args:
        input_file: Path of the source Ogg recording. Defaults to the
            WhatsApp recording on the mounted Drive.
        output_file: Path where the intermediate WAV file is written.

    Returns:
        None. Progress and the generated video path are printed; the video
        is handed to ``google.colab.files.download``.
    """
    # Convert .ogg to .wav so speech_to_text can open it.
    convert_to_wav(input_file, output_file)

    text = speech_to_text(output_file)
    print("Text from audio:", text)

    preprocessed_text = preprocess_text(text)
    print("Preprocessed text:", preprocessed_text)

    # speech_to_text returns "" when recognition fails; without a prompt
    # there is nothing to synthesize a video from, so stop here.
    if not preprocessed_text.strip():
        print("No usable text recognized; skipping video generation")
        return

    output_video_path = p({'text': preprocessed_text})[OutputKeys.OUTPUT_VIDEO]
    print('output_video_path:', output_video_path)

    # Imported locally: only the final download step needs it.
    from google.colab import files
    files.download(output_video_path)
# Run the full flow when this file is executed as a script.
if __name__ == "__main__":
    main()