Re1th
/

speech2viddeo

Model card Files and versions

speech2viddeo / speech2video.py

Re1th's picture

Upload speech2video.py

d646ee8 verified about 2 years ago

history blame contribute delete

2.8 kB

	# -*- coding: utf-8 -*-
	"""Speech2Video.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1CcYNY0wwS05Ml7UVv4oY7cHjlVrhTbIq
	"""

	# Colab-only setup: mount Google Drive at /content/drive so the audio file
	# referenced in main() is reachable from this runtime.
	from google.colab import drive
	drive.mount('/content/drive')

	# Notebook shell escapes (lines starting with "!" are IPython syntax, not
	# plain Python): install PyAudio plus the SpeechRecognition and pydub
	# packages that are imported right below.
	!apt-get install python3-pyaudio
	!pip install SpeechRecognition
	!pip install pydub

	from pydub import AudioSegment
	import speech_recognition as sr
	import re
	import nltk
	from nltk.stem import PorterStemmer, WordNetLemmatizer
	from nltk.tokenize import word_tokenize

	# Download the NLTK data packages 'punkt' and 'wordnet'; presumably these
	# back word_tokenize and WordNetLemmatizer used in preprocess_text.
	nltk.download('punkt')
	nltk.download('wordnet')

	# Notebook shell escapes: install the text-to-video dependencies, with
	# modelscope pinned to version 1.4.2.
	!pip install modelscope==1.4.2
	!pip install open_clip_torch
	!pip install pytorch-lightning

	from modelscope.pipelines import pipeline
	from modelscope.outputs import OutputKeys

	# Build the ModelScope text-to-video pipeline once at module load; main()
	# invokes it through this module-level name `p`.
	p = pipeline('text-to-video-synthesis', 'damo/text-to-video-synthesis')

	def convert_to_wav(input_file, output_file):
	    """Re-encode an OGG audio file as WAV.

	    Args:
	        input_file: Path of the OGG file to read.
	        output_file: Path the WAV data is exported to.
	    """
	    AudioSegment.from_ogg(input_file).export(output_file, format="wav")

	# Function to convert audio file to text
	def speech_to_text(audio_file):
	    """Transcribe an audio file using ``Recognizer.recognize_google``.

	    Args:
	        audio_file: Path of the audio file opened through ``sr.AudioFile``.

	    Returns:
	        The recognized text, or an empty string when the audio could not
	        be understood or the recognition request failed. In both failure
	        cases a message is printed instead of raising.
	    """
	    engine = sr.Recognizer()
	    with sr.AudioFile(audio_file) as clip:
	        recording = engine.record(clip)

	    # Default to the failure result; only a successful recognition replaces it.
	    transcript = ""
	    try:
	        transcript = engine.recognize_google(recording)
	    except sr.UnknownValueError:
	        print("Sorry, could not understand audio")
	    except sr.RequestError as err:
	        print("Error fetching results; {0}".format(err))
	    return transcript

	# Function to preprocess text
	def preprocess_text(text):
	    """Strip non-letter characters from *text* and lemmatize each word.

	    Every character other than an ASCII letter or whitespace is removed,
	    the remainder is split with ``word_tokenize``, and each token is passed
	    through ``WordNetLemmatizer.lemmatize``. Letter case is left untouched.

	    Args:
	        text: Raw transcript string.

	    Returns:
	        The lemmatized tokens joined by single spaces; an empty string
	        when no tokens remain.
	    """
	    # Digits and punctuation are deleted outright, not replaced by a space.
	    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

	    # Only lemmatization feeds the result. A Porter-stemming pass whose
	    # output was never read has been dropped; the returned text is the same.
	    lemmatizer = WordNetLemmatizer()
	    return ' '.join(
	        lemmatizer.lemmatize(token) for token in word_tokenize(letters_only)
	    )

	# Main function
	def main(
	    input_file="/content/drive/MyDrive/IV II PROJECT/WhatsApp Audio 2024-03-24 at 8.52.04 AM.ogg",
	    output_file="/content/drive/MyDrive/IV II PROJECT/converted_audio.wav",
	):
	    """Run the whole flow: OGG -> WAV -> text -> cleaned text -> video.

	    Args:
	        input_file: Path of the source OGG recording. The default is the
	            Google Drive path this script was written against.
	        output_file: Path where the intermediate WAV file is written.

	    Returns:
	        None. The generated video is offered as a Colab download; nothing
	        is generated when no usable text comes out of the recording.
	    """
	    # Convert .ogg to .wav
	    convert_to_wav(input_file, output_file)

	    # Convert audio to text
	    text = speech_to_text(output_file)
	    print("Text from audio:", text)

	    # Preprocess text
	    preprocessed_text = preprocess_text(text)
	    print("Preprocessed text:", preprocessed_text)

	    # speech_to_text returns "" when recognition fails; do not spend a
	    # video generation run on an empty prompt.
	    if not preprocessed_text.strip():
	        print("No usable text recognized; skipping video generation")
	        return

	    output_video_path = p({'text': preprocessed_text})[OutputKeys.OUTPUT_VIDEO]
	    print('output_video_path:', output_video_path)

	    # google.colab is only importable inside a Colab runtime, so this import
	    # stays local to the download step.
	    from google.colab import files
	    files.download(output_video_path)

	# Run the pipeline only when this file is executed as a script.
	if __name__ == "__main__":
	    main()