Steven Zhang committed on
Commit
21c0ae2
1 Parent(s): db80ce1

autocorrect merged, finished training Spanish

Browse files
.idea/2022-summer-speech-translation.iml CHANGED
@@ -1,7 +1,9 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <module type="PYTHON_MODULE" version="4">
3
  <component name="NewModuleRootManager">
4
- <content url="file://$MODULE_DIR$" />
 
 
5
  <orderEntry type="inheritedJdk" />
6
  <orderEntry type="sourceFolder" forTests="false" />
7
  </component>
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <module type="PYTHON_MODULE" version="4">
3
  <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/EngToSpanishckpts" />
6
+ </content>
7
  <orderEntry type="inheritedJdk" />
8
  <orderEntry type="sourceFolder" forTests="false" />
9
  </component>
Autocorrect/autocorrectreal.py CHANGED
@@ -7,11 +7,6 @@ Original file is located at
7
  https://colab.research.google.com/drive/1aH5mYp1dxyn55XMjtVUllBvg37nqGVir
8
  """
9
 
10
- from google.colab import drive
11
- drive.mount('/content/drive')
12
-
13
- !pip install textdistance
14
-
15
  import re
16
  from collections import Counter
17
  import numpy as np
@@ -19,13 +14,11 @@ import pandas as pd
19
  import textdistance
20
 
21
  w = []
22
- with open('/content/drive/MyDrive/words.txt', 'r') as f:
23
  file_name_data = f.read()
24
  file_name_data = file_name_data.lower()
25
  w = re.findall('\w+', file_name_data)
26
 
27
- print(f"First 10 words: \n{w[0:10]}")
28
- print(f"{len(w)} total words ")
29
 
30
  from nltk.metrics.distance import edit_distance
31
  def edit(input_sentence):
@@ -40,8 +33,4 @@ def edit(input_sentence):
40
  sentence[sentence.index(i)] = closest[1]
41
  output_sentence = ' '.join(sentence)
42
 
43
- return output_sentence
44
-
45
- print(edit("My namee is uncele sdtevven"))
46
- print(edit("moneeyeh is greeat"))
47
- print(edit("establishe that nitrgen is theh essentil vegchtable as of animal living matter"))
7
  https://colab.research.google.com/drive/1aH5mYp1dxyn55XMjtVUllBvg37nqGVir
8
  """
9
 
 
 
 
 
 
10
  import re
11
  from collections import Counter
12
  import numpy as np
14
  import textdistance
15
 
16
  w = []
17
+ with open('Autocorrect/words.txt', 'r') as f:
18
  file_name_data = f.read()
19
  file_name_data = file_name_data.lower()
20
  w = re.findall('\w+', file_name_data)
21
 
 
 
22
 
23
  from nltk.metrics.distance import edit_distance
24
  def edit(input_sentence):
33
  sentence[sentence.index(i)] = closest[1]
34
  output_sentence = ' '.join(sentence)
35
 
36
+ return output_sentence
 
 
 
 
TestTranslation/translation.py CHANGED
@@ -251,9 +251,10 @@ transformer = keras.Model(
251
 
252
  transformer.summary()
253
 
254
- #load weights using gdown
 
255
  gdown.download_folder("https://drive.google.com/drive/folders/1DwN-MlL6MMh7qVJbwoLrWBSMVBN5zbBi")
256
- transformer.load_weights("./EngToSpanishckpts/cp.ckpt").expect_partial()
257
 
258
  spa_vocab = spa_vectorization.get_vocabulary()
259
  spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
@@ -275,6 +276,8 @@ def decode_sequence(input_sentence):
275
  break
276
  return decoded_sentence
277
 
278
-
 
 
279
 
280
 
251
 
252
  transformer.summary()
253
 
254
+ # load weights using gdown
255
+ print(os.listdir())
256
  gdown.download_folder("https://drive.google.com/drive/folders/1DwN-MlL6MMh7qVJbwoLrWBSMVBN5zbBi")
257
+ transformer.load_weights("./EngToSpanishckpts/cp.ckpt")
258
 
259
  spa_vocab = spa_vectorization.get_vocabulary()
260
  spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
276
  break
277
  return decoded_sentence
278
 
279
+ transformer.compile(
280
+ "rmsprop", loss="sparse_categorical_crossentropy"
281
+ )
282
 
283
 
TestTranslation/translation_test.py CHANGED
@@ -1,5 +1,6 @@
1
  from TestTranslation.translation import *
2
 
 
3
 
4
  test_eng_texts = [pair[0] for pair in test_pairs]
5
  input_sentence = "This is a test."
1
  from TestTranslation.translation import *
2
 
3
+ transformer.evaluate(train_ds)
4
 
5
  test_eng_texts = [pair[0] for pair in test_pairs]
6
  input_sentence = "This is a test."
TestTranslation/translation_train.py CHANGED
@@ -1,6 +1,6 @@
1
  from translation import *
2
  # steven's addition: saving checkpoints
3
- checkpoint_path = "ckpts-translator/cp.ckpt"
4
  checkpoint_dir = os.path.dirname(checkpoint_path)
5
 
6
  cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
@@ -8,7 +8,5 @@ cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
8
  verbose=1)
9
 
10
  epochs = 20 # This should be at least 30 for convergence
11
- transformer.compile(
12
- "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
13
- )
14
  transformer.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=[cp_callback])
1
  from translation import *
2
  # steven's addition: saving checkpoints
3
+ checkpoint_path = "./EngToSpanishckpts/cp.ckpt"
4
  checkpoint_dir = os.path.dirname(checkpoint_path)
5
 
6
  cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
8
  verbose=1)
9
 
10
  epochs = 20 # This should be at least 30 for convergence
11
+
 
 
12
  transformer.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=[cp_callback])
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # have to run this locally as streamlit run app.py
2
  import streamlit as st
3
-
4
  from TestTranslation.translation import *
5
 
6
 
@@ -11,7 +11,9 @@ option = st.selectbox("Select input type:", ("text input", "audio input"))
11
  if option == "text input":
12
  input_sentence = st.text_input("Enter input sentence:")
13
  if input_sentence is not None and len(input_sentence) > 0:
14
- translated = decode_sequence(input_sentence)
 
 
15
  st.write(translated)
16
  input_sentence = None
17
  else:
1
  # have to run this locally as streamlit run app.py
2
  import streamlit as st
3
+ from Autocorrect.autocorrectreal import edit
4
  from TestTranslation.translation import *
5
 
6
 
11
  if option == "text input":
12
  input_sentence = st.text_input("Enter input sentence:")
13
  if input_sentence is not None and len(input_sentence) > 0:
14
+ edited = edit(input_sentence)
15
+ st.write("Autocorrected sentence: " + edited)
16
+ translated = decode_sequence(edited)[8:-5]
17
  st.write(translated)
18
  input_sentence = None
19
  else: