Ansh9728 commited on
Commit
cbb8952
·
verified ·
1 Parent(s): 90b940b

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +91 -0
README.md ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - marsyas/gtzan
5
+ language:
6
+ - en
7
+ metrics:
8
+ - accuracy
9
+ base_model:
10
+ - ntu-spml/distilhubert
11
+ pipeline_tag: audio-classification
12
+ ---
13
+
14
+ ## Model Details
15
+
16
+ ### Model Description
17
+
18
+ <!-- Provide a longer summary of what this model is. -->
19
+ DistilHuBERT by NTU Speech Processing & Machine Learning Lab
20
+
21
+ The base model was pretrained on speech audio sampled at 16 kHz. When using the model, make sure that your speech input is also sampled at 16 kHz.
22
+
23
+ Note: This model does not have a tokenizer as it was pretrained on audio alone. In order to use this model for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data. Check out this blog for a more detailed explanation of how to fine-tune the model.
24
+
25
+
26
+
27
+ ### Model Architecture and Objective
28
+
29
+ HubertForSequenceClassification(
30
+ (hubert): HubertModel(
31
+ (feature_extractor): HubertFeatureEncoder(
32
+ (conv_layers): ModuleList(
33
+ (0): HubertGroupNormConvLayer(
34
+ (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
35
+ (activation): GELUActivation()
36
+ (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
37
+ )
38
+ (1-4): 4 x HubertNoLayerNormConvLayer(
39
+ (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
40
+ (activation): GELUActivation()
41
+ )
42
+ (5-6): 2 x HubertNoLayerNormConvLayer(
43
+ (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
44
+ (activation): GELUActivation()
45
+ )
46
+ )
47
+ )
48
+ (feature_projection): HubertFeatureProjection(
49
+ (projection): Linear(in_features=512, out_features=768, bias=True)
50
+ (dropout): Dropout(p=0.0, inplace=False)
51
+ )
52
+ (encoder): HubertEncoder(
53
+ (pos_conv_embed): HubertPositionalConvEmbedding(
54
+ (conv): ParametrizedConv1d(
55
+ 768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16
56
+ (parametrizations): ModuleDict(
57
+ (weight): ParametrizationList(
58
+ (0): _WeightNorm()
59
+ )
60
+ )
61
+ )
62
+ (padding): HubertSamePadLayer()
63
+ (activation): GELUActivation()
64
+ )
65
+ (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
66
+ (dropout): Dropout(p=0.1, inplace=False)
67
+ (layers): ModuleList(
68
+ (0-1): 2 x HubertEncoderLayer(
69
+ (attention): HubertSdpaAttention(
70
+ (k_proj): Linear(in_features=768, out_features=768, bias=True)
71
+ (v_proj): Linear(in_features=768, out_features=768, bias=True)
72
+ (q_proj): Linear(in_features=768, out_features=768, bias=True)
73
+ (out_proj): Linear(in_features=768, out_features=768, bias=True)
74
+ )
75
+ (dropout): Dropout(p=0.1, inplace=False)
76
+ (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
77
+ (feed_forward): HubertFeedForward(
78
+ (intermediate_dropout): Dropout(p=0.1, inplace=False)
79
+ (intermediate_dense): Linear(in_features=768, out_features=3072, bias=True)
80
+ (intermediate_act_fn): GELUActivation()
81
+ (output_dense): Linear(in_features=3072, out_features=768, bias=True)
82
+ (output_dropout): Dropout(p=0.1, inplace=False)
83
+ )
84
+ (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
85
+ )
86
+ )
87
+ )
88
+ )
89
+ (projector): Linear(in_features=768, out_features=256, bias=True)
90
+ (classifier): Linear(in_features=256, out_features=10, bias=True)
91
+ )