Steven Zhang committed on
Commit
a210e7f
0 Parent(s):

reset commit

.gitattributes ADDED
File without changes
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/2022-summer-speech-translation.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/2022-summer-speech-translation.iml" filepath="$PROJECT_DIR$/.idea/2022-summer-speech-translation.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+     <mapping directory="$PROJECT_DIR$" vcs="Git" />
+   </component>
+ </project>
AudioToText/audiotospeech.py ADDED
@@ -0,0 +1,178 @@
+ # -*- coding: utf-8 -*-
+
+ # IMPORTS
+ import os
+ import numpy as np
+ import gdown
+ import tensorflow as tf
+ from tensorflow import keras
+ from tensorflow.keras import layers
+
+ # MODEL STUFF
+ # The set of characters accepted in the transcription.
+ characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
+ # Mapping characters to integers
+ char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
+ # Mapping integers back to original characters
+ num_to_char = keras.layers.StringLookup(
+     vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
+ )
+
+
+ # An integer scalar Tensor. The window length in samples.
+ frame_length = 256
+ # An integer scalar Tensor. The number of samples to step.
+ frame_step = 160
+ # An integer scalar Tensor. The size of the FFT to apply.
+ # If not provided, uses the smallest power of 2 enclosing frame_length.
+ fft_length = 384
+
+ # MODEL LOSS
+ def CTCLoss(y_true, y_pred):
+     # Compute the training-time loss value
+     batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
+     input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
+     label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
+
+     input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+     label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+
+     loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
+     return loss
+
+ # BUILD MODEL
+ def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
+     """Model similar to DeepSpeech2."""
+     # Model's input
+     input_spectrogram = layers.Input((None, input_dim), name="input")
+     # Expand the dimension to use 2D CNN.
+     x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
+     # Convolution layer 1
+     x = layers.Conv2D(
+         filters=32,
+         kernel_size=[11, 41],
+         strides=[2, 2],
+         padding="same",
+         use_bias=False,
+         name="conv_1",
+     )(x)
+     x = layers.BatchNormalization(name="conv_1_bn")(x)
+     x = layers.ReLU(name="conv_1_relu")(x)
+     # Convolution layer 2
+     x = layers.Conv2D(
+         filters=32,
+         kernel_size=[11, 21],
+         strides=[1, 2],
+         padding="same",
+         use_bias=False,
+         name="conv_2",
+     )(x)
+     x = layers.BatchNormalization(name="conv_2_bn")(x)
+     x = layers.ReLU(name="conv_2_relu")(x)
+     # Reshape the resulting volume to feed the RNN layers
+     x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
+     # RNN layers
+     for i in range(1, rnn_layers + 1):
+         recurrent = layers.GRU(
+             units=rnn_units,
+             activation="tanh",
+             recurrent_activation="sigmoid",
+             use_bias=True,
+             return_sequences=True,
+             reset_after=True,
+             name=f"gru_{i}",
+         )
+         x = layers.Bidirectional(
+             recurrent, name=f"bidirectional_{i}", merge_mode="concat"
+         )(x)
+         if i < rnn_layers:
+             x = layers.Dropout(rate=0.5)(x)
+     # Dense layer
+     x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
+     x = layers.ReLU(name="dense_1_relu")(x)
+     x = layers.Dropout(rate=0.5)(x)
+     # Classification layer
+     output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
+     # Model
+     model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
+     # Optimizer
+     opt = keras.optimizers.Adam(learning_rate=1e-4)
+     # Compile the model and return
+     model.compile(optimizer=opt, loss=CTCLoss)
+     return model
+
+ # GET AND INSTANTIATE MODEL
+ model = build_model(
+     input_dim=fft_length // 2 + 1,
+     output_dim=char_to_num.vocabulary_size(),
+     rnn_units=512,
+ )
+
+
+ # GET TEXT FROM MODEL PREDICTION
+ # A utility function to decode the output of the network
+ def decode_batch_predictions(pred):
+     input_len = np.ones(pred.shape[0]) * pred.shape[1]
+     # Use greedy search. For complex tasks, you can use beam search
+     results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
+     # Iterate over the results and get back the text
+     output_text = []
+     for result in results:
+         result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
+         output_text.append(result)
+     return output_text
+
+
+ # PATH TO CKPT
+ # google share link
+ ckpt_link = 'https://drive.google.com/file/d/14mT_wJMuIqUEJSS12aAc6bnPCjYuLWGf/view?usp=sharing'
+
+ # Define the local filename to save data
+ local_file = 'AudioToTextCKPT.hdf5'
+
+ # Download the checkpoint. A plain requests.get on a Drive "view" link returns
+ # the HTML preview page rather than the file itself, so fetch it with gdown
+ # (fuzzy=True lets gdown parse the share link), as the translation code does.
+ gdown.download(ckpt_link, local_file, fuzzy=True)
+
+ ckpt = local_file
+
+
+ # LOAD CKPT TO MODEL
+ model.load_weights(ckpt)
+
+ # CONVERT AUDIO TO TEXT
+ def AudioToText(wav_file):
+     ###########################################
+     ## Process the Audio
+     ##########################################
+     # 1. Read wav file
+     file = tf.io.read_file(wav_file)
+     # 2. Decode the wav file
+     audio, _ = tf.audio.decode_wav(file)
+     audio = tf.squeeze(audio, axis=-1)
+     # 3. Change type to float
+     audio = tf.cast(audio, tf.float32)
+     # 4. Get the spectrogram
+     spectrogram = tf.signal.stft(
+         audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
+     )
+     # 5. We only need the magnitude, which can be derived by applying tf.abs
+     spectrogram = tf.abs(spectrogram)
+     spectrogram = tf.math.pow(spectrogram, 0.5)
+     # 6. normalisation
+     means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
+     stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
+     spectrogram = (spectrogram - means) / (stddevs + 1e-10)
+     # 7. add a batch dimension: the model expects (batch, time, freq),
+     # but the STFT above produces a single (time, freq) example
+     spectrogram = tf.expand_dims(spectrogram, 0)
+
+     pred = model.predict(spectrogram)
+
+     output_text = decode_batch_predictions(pred)
+
+     return output_text
+
+
+ # testing model
+ print(AudioToText('testWav.wav'))
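The comment in decode_batch_predictions notes that beam search can replace greedy search on harder audio. A minimal sketch of that variant, reusing pred, num_to_char, np, tf, and keras from the file above; decode_batch_predictions_beam is a hypothetical name and beam_width=100 is an illustrative value, not something this commit ships:

def decode_batch_predictions_beam(pred, beam_width=100):
    # Same length bookkeeping as the greedy decoder above.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # greedy=False switches keras.backend.ctc_decode to beam search;
    # [0][0] keeps the highest-scoring path per example.
    results = keras.backend.ctc_decode(
        pred, input_length=input_len, greedy=False, beam_width=beam_width
    )[0][0]
    output_text = []
    for result in results:
        output_text.append(
            tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        )
    return output_text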
AudioToText/testWav.wav ADDED
Binary file (288 kB).
Autocorrect/autocorrectreal.ipynb ADDED
@@ -0,0 +1,131 @@
+ {
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "colab": {
+           "base_uri": "https://localhost:8080/"
+         },
+         "id": "wOvxbAShg-_s",
+         "outputId": "0e9a0f9a-fd6e-4ce0-81f6-8da736bd06be"
+       },
+       "outputs": [],
+       "source": [
+         "from google.colab import drive\n",
+         "drive.mount('/content/drive')"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "colab": {
+           "base_uri": "https://localhost:8080/"
+         },
+         "id": "THLGsHmchJ9g",
+         "outputId": "d590fb47-7b15-4176-9b6e-719090ed2cbd"
+       },
+       "outputs": [],
+       "source": [
+         "!pip install textdistance"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "id": "eFxAvy03hPCX"
+       },
+       "outputs": [],
+       "source": [
+         "import re\n",
+         "from collections import Counter\n",
+         "import numpy as np\n",
+         "import pandas as pd\n",
+         "import textdistance\n",
+         "\n",
+         "w = []\n",
+         "with open('/content/drive/MyDrive/words.txt', 'r') as f:\n",
+         "    file_name_data = f.read()\n",
+         "    file_name_data = file_name_data.lower()\n",
+         "    w = re.findall('\\w+', file_name_data)\n",
+         "\n",
+         "v = set(w)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "colab": {
+           "base_uri": "https://localhost:8080/"
+         },
+         "id": "RPON8Pm7h9Dx",
+         "outputId": "dd1309fd-3362-41c9-8f19-affe4739df3e"
+       },
+       "outputs": [],
+       "source": [
+         "print(f\"First 10 words: \\n{w[0:10]}\")\n",
+         "print(f\"{len(v)} total words \")"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "id": "U4s_UDWKig11"
+       },
+       "outputs": [],
+       "source": [
+         "from nltk.metrics.distance import edit_distance\n",
+         "def edit(input_sentence):\n",
+         "    sentence = input_sentence.split()\n",
+         "\n",
+         "    for i in sentence:\n",
+         "        if i.lower() in w:\n",
+         "            continue\n",
+         "        else:\n",
+         "            distances = ((edit_distance(i,\n",
+         "                          word), word)\n",
+         "                         for word in w)\n",
+         "            closest = min(distances)\n",
+         "            sentence[sentence.index(i)] = closest[1]\n",
+         "    output_sentence = ' '.join(sentence)\n",
+         "\n",
+         "    return output_sentence"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {
+         "colab": {
+           "base_uri": "https://localhost:8080/"
+         },
+         "id": "c0af01o_i5X0",
+         "outputId": "fff4600b-163d-40c8-ce3b-c0b735ec286e"
+       },
+       "outputs": [],
+       "source": [
+         "print(edit(\"My namee is uncele Steven\"))\n",
+         "print(edit(\"moneeyeh is greeat\"))"
+       ]
+     }
+   ],
+   "metadata": {
+     "colab": {
+       "name": "autocorrectreal.ipynb",
+       "provenance": []
+     },
+     "kernelspec": {
+       "display_name": "Python 3",
+       "name": "python3"
+     },
+     "language_info": {
+       "name": "python"
+     }
+   },
+   "nbformat": 4,
+   "nbformat_minor": 0
+ }
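The notebook builds v = set(w) but edit then tests membership against the list w, an O(n) scan per word, and sentence.index(i) finds only the first occurrence when a misspelled word repeats. A sketch of the same correction using the set and positional iteration, assuming v and edit_distance from the cells above (edit_fast is a hypothetical name):

def edit_fast(input_sentence):
    sentence = input_sentence.split()
    for idx, token in enumerate(sentence):
        if token.lower() in v:  # O(1) set lookup instead of scanning the list
            continue
        # min over (distance, word) pairs picks the closest vocabulary word
        _, closest = min((edit_distance(token, word), word) for word in v)
        sentence[idx] = closest
    return ' '.join(sentence)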
README.md ADDED
@@ -0,0 +1,5 @@
+ # 2022-summer-speech-translation
+
+ To Run:
+
+ - TODO: add run instructions
TestTranslation/translation.py ADDED
@@ -0,0 +1,280 @@
+ # -*- coding: utf-8 -*-
+ """translation.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1PADMvkToYgpdhvQYlZw4q8O-gLvsvGmK
+ """
+
+ import pathlib
+ import random
+ import string
+ import re
+ import numpy as np
+ import tensorflow as tf
+ from tensorflow import keras
+ from tensorflow.keras import layers
+ # googled fix to "cannot find TextVectorization"
+ from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
+ import os
+ import gdown
+
+ text_file = keras.utils.get_file(
+     fname="spa-eng.zip",
+     origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
+     extract=True,
+ )
+ text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"
+
+ # change: added utf-8 encoding
+ with open(text_file, encoding="utf-8") as f:
+     lines = f.read().split("\n")[:-1]
+ text_pairs = []
+ for line in lines:
+     eng, spa = line.split("\t")
+     spa = "[start] " + spa + " [end]"
+     text_pairs.append((eng, spa))
+
+ for _ in range(5):
+     print(random.choice(text_pairs))
+
+ random.shuffle(text_pairs)
+ num_val_samples = int(0.15 * len(text_pairs))
+ num_train_samples = len(text_pairs) - 2 * num_val_samples
+ train_pairs = text_pairs[:num_train_samples]
+ val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
+ test_pairs = text_pairs[num_train_samples + num_val_samples :]
+
+ print(f"{len(text_pairs)} total pairs")
+ print(f"{len(train_pairs)} training pairs")
+ print(f"{len(val_pairs)} validation pairs")
+ print(f"{len(test_pairs)} test pairs")
+
+ strip_chars = string.punctuation + "¿"
+ strip_chars = strip_chars.replace("[", "")
+ strip_chars = strip_chars.replace("]", "")
+
+ vocab_size = 15000
+ sequence_length = 20
+ batch_size = 64
+
+
+ def custom_standardization(input_string):
+     lowercase = tf.strings.lower(input_string)
+     return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
+
+
+ eng_vectorization = TextVectorization(
+     max_tokens=vocab_size,
+     output_mode="int",
+     output_sequence_length=sequence_length,
+ )
+ spa_vectorization = TextVectorization(
+     max_tokens=vocab_size,
+     output_mode="int",
+     output_sequence_length=sequence_length + 1,
+     standardize=custom_standardization,
+ )
+ train_eng_texts = [pair[0] for pair in train_pairs]
+ train_spa_texts = [pair[1] for pair in train_pairs]
+ eng_vectorization.adapt(train_eng_texts)
+ spa_vectorization.adapt(train_spa_texts)
+
+ def format_dataset(eng, spa):
+     eng = eng_vectorization(eng)
+     spa = spa_vectorization(spa)
+     return (
+         {
+             "encoder_inputs": eng,
+             "decoder_inputs": spa[:, :-1],
+         },
+         spa[:, 1:],
+     )
+
+
+ def make_dataset(pairs):
+     eng_texts, spa_texts = zip(*pairs)
+     eng_texts = list(eng_texts)
+     spa_texts = list(spa_texts)
+     dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
+     dataset = dataset.batch(batch_size)
+     dataset = dataset.map(format_dataset)
+     return dataset.shuffle(2048).prefetch(16).cache()
+
+
+ train_ds = make_dataset(train_pairs)
+ val_ds = make_dataset(val_pairs)
+
+ for inputs, targets in train_ds.take(1):
+     print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
+     print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
+     print(f"targets.shape: {targets.shape}")
+
+ class TransformerEncoder(layers.Layer):
+     def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+         super(TransformerEncoder, self).__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.dense_dim = dense_dim
+         self.num_heads = num_heads
+         self.attention = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim
+         )
+         self.dense_proj = keras.Sequential(
+             [
+                 layers.Dense(dense_dim, activation="relu"),
+                 layers.Dense(embed_dim),
+             ]
+         )
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+         self.supports_masking = True
+
+     def call(self, inputs, mask=None):
+         if mask is not None:
+             padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
+         else:
+             # guard so padding_mask is never unbound when no mask is propagated
+             padding_mask = None
+         attention_output = self.attention(
+             query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
+         )
+         proj_input = self.layernorm_1(inputs + attention_output)
+         proj_output = self.dense_proj(proj_input)
+         return self.layernorm_2(proj_input + proj_output)
+
+
+ class PositionalEmbedding(layers.Layer):
+     def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
+         super(PositionalEmbedding, self).__init__(**kwargs)
+         self.token_embeddings = layers.Embedding(
+             input_dim=vocab_size, output_dim=embed_dim
+         )
+         self.position_embeddings = layers.Embedding(
+             input_dim=sequence_length, output_dim=embed_dim
+         )
+         self.sequence_length = sequence_length
+         self.vocab_size = vocab_size
+         self.embed_dim = embed_dim
+
+     def call(self, inputs):
+         length = tf.shape(inputs)[-1]
+         positions = tf.range(start=0, limit=length, delta=1)
+         embedded_tokens = self.token_embeddings(inputs)
+         embedded_positions = self.position_embeddings(positions)
+         return embedded_tokens + embedded_positions
+
+     def compute_mask(self, inputs, mask=None):
+         return tf.math.not_equal(inputs, 0)
+
+
+ class TransformerDecoder(layers.Layer):
+     def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
+         super(TransformerDecoder, self).__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.latent_dim = latent_dim
+         self.num_heads = num_heads
+         self.attention_1 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim
+         )
+         self.attention_2 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim
+         )
+         self.dense_proj = keras.Sequential(
+             [
+                 layers.Dense(latent_dim, activation="relu"),
+                 layers.Dense(embed_dim),
+             ]
+         )
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+         self.layernorm_3 = layers.LayerNormalization()
+         self.supports_masking = True
+
+     def call(self, inputs, encoder_outputs, mask=None):
+         causal_mask = self.get_causal_attention_mask(inputs)
+         if mask is not None:
+             padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
+             padding_mask = tf.minimum(padding_mask, causal_mask)
+         else:
+             # guard: without a propagated mask, fall back to the causal mask
+             # so padding_mask is never unbound
+             padding_mask = causal_mask
+
+         attention_output_1 = self.attention_1(
+             query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
+         )
+         out_1 = self.layernorm_1(inputs + attention_output_1)
+
+         attention_output_2 = self.attention_2(
+             query=out_1,
+             value=encoder_outputs,
+             key=encoder_outputs,
+             attention_mask=padding_mask,
+         )
+         out_2 = self.layernorm_2(out_1 + attention_output_2)
+
+         proj_output = self.dense_proj(out_2)
+         return self.layernorm_3(out_2 + proj_output)
+
+     def get_causal_attention_mask(self, inputs):
+         input_shape = tf.shape(inputs)
+         batch_size, sequence_length = input_shape[0], input_shape[1]
+         i = tf.range(sequence_length)[:, tf.newaxis]
+         j = tf.range(sequence_length)
+         mask = tf.cast(i >= j, dtype="int32")
+         mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
+         mult = tf.concat(
+             [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
+             axis=0,
+         )
+         return tf.tile(mask, mult)
+
+ embed_dim = 256
+ latent_dim = 2048
+ num_heads = 8
+
+ encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
+ x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
+ encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
+ encoder = keras.Model(encoder_inputs, encoder_outputs)
+
+ decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
+ encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
+ x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
+ x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
+ x = layers.Dropout(0.5)(x)
+ decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
+ decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)
+
+ decoder_outputs = decoder([decoder_inputs, encoder_outputs])
+ transformer = keras.Model(
+     [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
+ )
+
+
+
+
+
+ transformer.summary()
+
+ # load weights using gdown
+ gdown.download_folder("https://drive.google.com/drive/folders/1DwN-MlL6MMh7qVJbwoLrWBSMVBN5zbBi")
+ transformer.load_weights("./EngToSpanishckpts/cp.ckpt").expect_partial()
+
+ spa_vocab = spa_vectorization.get_vocabulary()
+ spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
+ max_decoded_sentence_length = 20
+
+
+ def decode_sequence(input_sentence):
+     tokenized_input_sentence = eng_vectorization([input_sentence])
+     decoded_sentence = "[start]"
+     for i in range(max_decoded_sentence_length):
+         tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]
+         predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
+
+         sampled_token_index = np.argmax(predictions[0, i, :])
+         sampled_token = spa_index_lookup[sampled_token_index]
+         decoded_sentence += " " + sampled_token
+
+         if sampled_token == "[end]":
+             break
+     return decoded_sentence
+
+
+
+
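For intuition, get_causal_attention_mask builds its mask from i >= j, which yields a lower-triangular matrix that is then tiled across the batch. A quick standalone check (sequence length 4 is an illustrative value):

import tensorflow as tf

i = tf.range(4)[:, tf.newaxis]
j = tf.range(4)
print(tf.cast(i >= j, dtype="int32").numpy())
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]  -> position t may only attend to positions <= t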
TestTranslation/translation_test.py ADDED
@@ -0,0 +1,14 @@
+ from translation import *
+
+
+ test_eng_texts = [pair[0] for pair in test_pairs]
+ input_sentence = "This is a test."
+ translated = decode_sequence(input_sentence)
+ print(input_sentence)
+ print(translated)
+
+ for _ in range(30):
+     input_sentence = random.choice(test_eng_texts)
+     translated = decode_sequence(input_sentence)
+     print(input_sentence)
+     print(translated)
TestTranslation/translation_train.py ADDED
@@ -0,0 +1,14 @@
+ from translation import *
+ # steven's addition: saving checkpoints
+ checkpoint_path = "ckpts-translator/cp.ckpt"
+ checkpoint_dir = os.path.dirname(checkpoint_path)
+
+ cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
+                                                  save_weights_only=True,
+                                                  verbose=1)
+
+ epochs = 20  # This should be at least 30 for convergence
+ transformer.compile(
+     "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
+ )
+ transformer.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=[cp_callback])
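The checkpoints written by cp_callback land under ckpts-translator/; restoring them follows the same load_weights(...).expect_partial() pattern translation.py uses for the downloaded weights. A minimal sketch, assuming training has already populated checkpoint_path (the test sentence mirrors translation_test.py):

from translation import *

# Restore the weights saved during training by cp_callback.
transformer.load_weights("ckpts-translator/cp.ckpt").expect_partial()
print(decode_sequence("This is a test."))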
Video/Wav2Lip_TenDeepfake_eng.ipynb ADDED
The diff for this file is too large to render. See raw diff