File size: 5,436 Bytes
1ffd672
f8d0565
 
ff1bae1
b3ebb49
8d7cf3a
b3ebb49
dc83dff
f8d0565
b3ebb49
 
 
 
4c71707
8d7cf3a
b3ebb49
 
8112d7a
f39c7a2
4c71707
b3ebb49
 
f8d0565
73363ec
663ac44
 
 
 
 
 
 
fa6f9dd
5caf6ad
0f7a10e
baf12a8
df07559
74e2a8d
 
df07559
73363ec
 
 
676b3fa
74e2a8d
 
676b3fa
b77d75e
df07559
f8d0565
4fbd840
1ec80fa
f8d0565
0c3b1db
464e88d
aae5aef
4fbd840
464e88d
 
 
f8d0565
 
464e88d
b3ebb49
1ec80fa
 
 
 
 
464e88d
f8d0565
b3ebb49
4fbd840
 
 
 
 
f8d0565
1ffd672
 
71f3e01
 
f655e07
71f3e01
b3ebb49
cc4299c
 
f8d0565
cc4299c
 
 
 
 
 
 
 
f9db145
cc4299c
73363ec
71f3e01
cc4299c
 
 
baf12a8
cc4299c
 
 
 
71f3e01
cc4299c
 
 
 
 
 
 
 
 
 
6e454d6
978694b
cc4299c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import streamlit as st
import langcodes
from allosaurus.app import read_recognizer
from pathlib import Path
import string
from itertools import permutations
from collections import defaultdict
import torchaudio

@st.cache
def get_supported_codes():
  """Return the language codes the Allosaurus recognizer can be run with.

  The list starts with "ipa" (the default option), followed by every
  three-letter lowercase code for which `model.is_available(code)` is true,
  in alphabetical order.
  """
  # Local import: the module header only pulls `permutations` from itertools.
  from itertools import product

  model = read_recognizer()
  default_code = "ipa"  # default option, always listed first

  # product(..., repeat=3) walks all 26**3 candidate codes. permutations()
  # never reuses a letter, so codes with a repeated letter (for example
  # "ell" or "nno") were silently skipped before.
  candidates = (
    "".join(letters) for letters in product(string.ascii_lowercase, repeat=3)
  )
  supported_codes = [default_code]
  supported_codes.extend(
    code
    for code in candidates
    # "ipa" is itself a three-letter candidate; skip it so it cannot be
    # listed twice should the model also report it as available.
    if code != default_code and model.is_available(code)
  )
  return supported_codes
  
  
def get_path_to_wav_format(uploaded_file, suppress_outputs=False):
  """Save an uploaded audio file to disk and return the path of a .wav version.

  Args:
    uploaded_file: object exposing `.name` (file name) and `.getvalue()`
      (raw bytes); the caller passes items returned by `st.file_uploader`.
    suppress_outputs: when true, the `st.info` conversion messages are skipped.

  Returns:
    A `Path` in the current working directory: the saved upload itself when
    it is already a .wav file, otherwise a sibling .wav file written by
    torchaudio from the uploaded .mp3/.ogg.

  Raises:
    ValueError: if the file extension is not .wav, .mp3 or .ogg.
  """
  # Keep only the final path component so the upload's name cannot steer the
  # write into another directory.
  actual_file_path = Path(Path(uploaded_file.name).name)

  # Decide on the real (last) extension, case-insensitively. A substring test
  # would misread "a.wav.mp3" as wav and would not recognise "CLIP.WAV".
  suffix = actual_file_path.suffix.lower()
  if suffix not in (".wav", ".mp3", ".ogg"):
    raise ValueError(
      f"Unsupported audio format {suffix!r} for {uploaded_file.name!r}; "
      "expected .wav, .mp3 or .ogg"
    )

  actual_file_path.write_bytes(uploaded_file.getvalue())

  if suffix == ".wav":
    return actual_file_path

  new_desired_path = actual_file_path.with_suffix(".wav")
  encoding = "PCM_S"  # Prevent encoding errors. https://stackoverflow.com/questions/60352850/wave-error-unknown-format-3-arises-when-trying-to-convert-a-wav-file-into-text
  bits_per_sample = 16
  waveform, sample_rate = torchaudio.load(actual_file_path)
  if not suppress_outputs:
    st.info(f"Allosaurus requires .wav files. Converting with torchaudio, encoding={encoding}, bits_per_sample={bits_per_sample}")
    st.info(f"Uploaded file sample_rate: {sample_rate}")
  torchaudio.save(
    new_desired_path,
    waveform,
    sample_rate,
    encoding=encoding,
    bits_per_sample=bits_per_sample,
  )

  return new_desired_path

@st.cache
def get_langcode_description(input_code, url=False):
  """Describe a language code in human-readable form.

  Args:
    input_code: language code to describe; a falsy value or "ipa" selects
      the default description.
    url: when true, the language name is wrapped in a markdown link to its
      iso639-3.sil.org page.

  Returns:
    The language's display name (optionally as a markdown link), or the
    default description when the code is empty, is "ipa", or `langcodes`
    raises `LanguageTagError` for it.
  """
  default_code = "ipa"  # the default allosaurus recognizer
  fallback = "the default universal setting, not specific to any language"

  if input_code and input_code != default_code:
    try:
      language = langcodes.get(input_code)
      iso_code = language.to_alpha3()
      name = language.display_name()
    except langcodes.LanguageTagError:
      # The tag could not be parsed: fall back to the default description.
      return fallback
    return f"[{name}](https://iso639-3.sil.org/code/{iso_code})" if url else name

  return fallback

@st.cache
def get_langcode_with_description(input_code):
  """Return "<code>: <description>" for the given language code."""
  description = get_langcode_description(input_code)
  return f"{input_code}: {description}"


if __name__ == "__main__":
  # Streamlit entry point: intro text, then a form that picks a language,
  # accepts audio uploads and shows the recognized phones per file.
  st.header("Phonemize Audio files with [Allosaurus](https://github.com/xinjli/allosaurus)")
  st.write("Allosaurus is a pretrained universal phone recognizer. It can be used to recognize phones in more than 2000 languages. It is written by Li, Xinjian and Dalmia, Siddharth and Li, Juncheng and Lee, Matthew and Littell, Patrick and Yao, Jiali and Anastasopoulos, Antonios and Mortensen, David R and Neubig, Graham and Black, Alan W and Florian, Metze. [Click here to visit their repository](https://github.com/xinjli/allosaurus)")
  # The link target needs an explicit scheme; without it markdown treats
  # "cdleong.github.io" as a path relative to the app's own URL.
  st.write("I, [Colin Leong](https://cdleong.github.io) did not create Allosaurus, but I have created this web app (kindly hosted by Hugging Face) to make it convenient to use: simply upload your files below, and they will be transcribed to phonetic IPA symbols!")
  st.write("**Feedback:** Provide feedback regarding this web app at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
  supported_codes = get_supported_codes()
  index_of_desired_default = supported_codes.index("ipa")
  with st.form("Allosaurus form"):

    langcode = st.selectbox("ISO code for input language. Allosaurus doesn't need this, but it can improve accuracy",
                            options=supported_codes,
                            index=index_of_desired_default,
                            format_func=get_langcode_with_description
                            )

    model = read_recognizer()
    description = get_langcode_description(langcode, url=True)

    st.write(f"Instructing Allosaurus to recognize using language {langcode}. That is, {description}")

    st.subheader("Upload your files here")
    uploaded_files = st.file_uploader("Choose a file", type=[
                                                            ".wav",
                                                            ".mp3",
                                                            ".ogg",
                                                            ],
                                                      accept_multiple_files=True,
                                                            )

    submitted = st.form_submit_button("Run phone recognition!")
    if submitted:
      results = {} # for better download/display

      uploaded_files_count = len(uploaded_files)
      # With more than this many uploads, the per-file conversion messages
      # from get_path_to_wav_format are suppressed.
      suppress_output_threshold = 2
      my_bar = st.progress(0)
      for i, uploaded_file in enumerate(uploaded_files):

        if uploaded_file is not None:
          wav_file = get_path_to_wav_format(uploaded_file, uploaded_files_count>suppress_output_threshold)
          with st.spinner(f"transcribing {uploaded_file.name}..."):
            result = model.recognize(wav_file, langcode)
          results[uploaded_file.name] = result
        # Advance the bar after every file, whether or not it was transcribed.
        files_done = i+1
        my_bar.progress(files_done/uploaded_files_count)
      st.write(results)