Spaces:

jasspier
/

asr_arena

Runtime error

App Files Files Community

jasspier committed on May 27

Commit

e2bcfc6

•

1 Parent(s): 12ff2bb

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -10

app.py CHANGED Viewed

@@ -3,35 +3,55 @@ import torch
 import torchaudio
 from torchaudio.transforms import Resample
-# 定义一个简化的模型类（假设模型是LSTM架构）
-class ASRModel(torch.nn.Module):
     def __init__(self):
-        super(ASRModel, self).__init__()
-        self.lstm = torch.nn.LSTM(input_size=160, hidden_size=256, num_layers=3, batch_first=True)
-        self.linear = torch.nn.Linear(256, 29)  # 假设29个输出类用于字符
     def forward(self, x):
-        x, _ = self.lstm(x)
-        x = self.linear(x)
         return x
 # 定义模型路径
 model_path = "https://huggingface.co/Tele-AI/TeleSpeech-ASR1.0/resolve/main/large.pt"
 # 下载模型文件
 torch.hub.download_url_to_file(model_path, 'large.pt')
 # 初始化模型
-model = ASRModel()
 # 加载模型参数
 checkpoint = torch.load('large.pt', map_location=torch.device('cpu'))
-state_dict = checkpoint['model']  # 假设模型权重保存在 'model' 键中
-model.load_state_dict(state_dict)
 model.eval()
 # 定义处理函数
 def transcribe(audio):
     waveform, sample_rate = torchaudio.load(audio)
     resample = Resample(orig_freq=sample_rate, new_freq=16000)
     waveform = resample(waveform)
@@ -41,6 +61,7 @@ def transcribe(audio):
         logits = model(input_values)
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = ''.join([chr(i) for i in predicted_ids[0].tolist()])  # 解码预测到字符
     return transcription
 # 创建 Gradio 界面
@@ -52,4 +73,5 @@ iface = gr.Interface(
     description="Upload an audio file or record your voice to transcribe speech to text using the TeleSpeech ASR model."
 )
 iface.launch()

 import torchaudio
 from torchaudio.transforms import Resample
+# 使用一个假设的 Transformer ASR 模型结构
+class TransformerASRModel(torch.nn.Module):
     def __init__(self):
+        super(TransformerASRModel, self).__init__()
+        # 定义模型架构，这里需要根据实际情况进行调整
+        self.encoder = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)
+        self.decoder = torch.nn.Linear(512, 29)  # 假设29个输出类用于字符
     def forward(self, x):
+        x = self.encoder(x)
+        x = self.decoder(x)
         return x
 # 定义模型路径
 model_path = "https://huggingface.co/Tele-AI/TeleSpeech-ASR1.0/resolve/main/large.pt"
 # 下载模型文件
+print("Downloading model file...")
 torch.hub.download_url_to_file(model_path, 'large.pt')
+print("Model file downloaded.")
 # 初始化模型
+model = TransformerASRModel()
 # 加载模型参数
+print("Loading model checkpoint...")
 checkpoint = torch.load('large.pt', map_location=torch.device('cpu'))
+print("Checkpoint keys:", checkpoint.keys())
+# 打印模型参数中的键
+if 'model' in checkpoint:
+    state_dict = checkpoint['model']
+    print("Model state_dict keys:", state_dict.keys())
+else:
+    print("Key 'model' not found in checkpoint.")
+    state_dict = checkpoint
+# 加载模型状态字典
+try:
+    model.load_state_dict(state_dict)
+    print("Model state_dict loaded successfully.")
+except Exception as e:
+    print("Error loading model state_dict:", str(e))
 model.eval()
 # 定义处理函数
 def transcribe(audio):
+    print("Transcribing audio...")
     waveform, sample_rate = torchaudio.load(audio)
     resample = Resample(orig_freq=sample_rate, new_freq=16000)
     waveform = resample(waveform)
         logits = model(input_values)
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = ''.join([chr(i) for i in predicted_ids[0].tolist()])  # 解码预测到字符
+    print("Transcription:", transcription)
     return transcription
 # 创建 Gradio 界面
     description="Upload an audio file or record your voice to transcribe speech to text using the TeleSpeech ASR model."
 )
+print("Launching Gradio interface...")
 iface.launch()