# Fun-ASR-Nano-2512 / config.yaml
# Last commit: "Update config.yaml" by pengzhendong (a7088d6, verified)
# Model selection and its options.
# NOTE(review): this copy of the file had every key flattened to column 0;
# the nesting here is reconstructed from the `<name>` / `<name>_conf`
# pairing used throughout the file — confirm against the upstream config.
model: FunASRNano
model_conf:
  lsm_weight: 0.1
  length_normalized_loss: true
# Audio encoder selection and the options grouped under it.
# NOTE(review): nesting reconstructed from a flattened copy. `freeze` occurs
# in several stanzas of this file, so it cannot be a single top-level key;
# it is placed under the nearest preceding `*_conf` mapping — confirm.
audio_encoder: SenseVoiceEncoderSmall
audio_encoder_conf:
  output_size: 512
  attention_heads: 4
  linear_units: 2048
  num_blocks: 50
  tp_blocks: 20
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: pe
  pos_enc_class: SinusoidalPositionEncoder
  normalize_before: true
  kernel_size: 11
  # NOTE(review): key spelling ("shfit") kept verbatim — presumably it matches
  # the consumer's parameter name; verify before renaming.
  sanm_shfit: 0
  selfattention_layer_type: sanm
  freeze: true
  freeze_layer_num: -1
  feat_permute: true
# LLM selection and its options.
# `use_lora` and `init_param_path` must sit directly under `llm_conf`: other
# stanzas of this file reference them as `${llm_conf.use_lora}` and
# `${llm_conf.init_param_path}`.
llm: Qwen3-0.6b
llm_conf:
  hub: hf
  freeze: true
  llm_dtype: bf16
  init_param_path: Qwen3-0.6B
  use_lora: false
  # NOTE(review): extent of `lora_conf` is reconstructed. The trailing empty
  # `init_param_path` is assumed to belong to `lora_conf`, since at the
  # `llm_conf` level it would duplicate the key above — TODO confirm.
  lora_conf:
    freeze_lora: true
    task_type: CAUSAL_LM
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    bias: none
    target_modules:
      - q_proj
      - v_proj
    init_param_path: ""
# Audio adaptor selection and its options.
# `downsample_rate` must sit under `audio_adaptor_conf`: the dataset stanza
# of this file references it as `${audio_adaptor_conf.downsample_rate}`.
# NOTE(review): remaining nesting reconstructed from a flattened copy —
# confirm against the upstream config.
audio_adaptor: Transformer
audio_adaptor_conf:
  downsample_rate: 1
  use_low_frame_rate: true
  ffn_dim: 2048
  llm_dim: 1024
  encoder_dim: 512
  n_layer: 2
  freeze: true
# CTC decoder selection and its options.
# `detach_ctc_decoder` follows a scalar-valued key, so it can only be a
# top-level key.
# NOTE(review): nesting of `ctc_decoder_conf` reconstructed from a flattened
# copy (its keys duplicate those of `audio_adaptor_conf`, so they cannot all
# be top-level) — confirm against the upstream config.
ctc_decoder: Transformer
detach_ctc_decoder: true
ctc_decoder_conf:
  downsample_rate: 1
  ffn_dim: 2048
  llm_dim: 512
  encoder_dim: 512
  n_layer: 5
  freeze: false
# CTC weight and CTC options.
# NOTE(review): nesting reconstructed from a flattened copy. `ctc_weight` is
# assumed to be a top-level key rather than a member of the preceding
# `ctc_decoder_conf` — TODO confirm.
ctc_weight: 1.0
ctc_conf:
  dropout_rate: 0.0
  ctc_type: builtin
  reduce: true
  ignore_nan_grad: true
# Front-end selection and its options.
# NOTE(review): nesting reconstructed from a flattened copy — confirm against
# the upstream config.
frontend: WavFrontend
frontend_conf:
  fs: 16000
  window: hamming
  n_mels: 80
  frame_length: 25
  frame_shift: 10
  lfr_m: 7
  lfr_n: 6
  cmvn_file: null
# Training-loop options.
# NOTE(review): nesting reconstructed from a flattened copy — confirm against
# the upstream config.
train_conf:
  # Interpolation (presumably OmegaConf-style): mirrors `llm_conf.use_lora`.
  use_lora: ${llm_conf.use_lora}
  accum_grad: 1
  grad_clip: 5
  max_epoch: 2
  keep_nbest_models: 200
  log_interval: 100
  effective_save_name_excludes:
    - llm.
  resume: true
  validate_interval: 2000
  save_checkpoint_interval: 2000
  avg_nbest_model: 100
  use_bf16: false
  use_deepspeed: true
  # NOTE(review): `use_deepspeed` is true but no config is given here —
  # presumably supplied at launch time; verify.
  deepspeed_config: null
  save_init_model: false
# Optimizer selection and its options.
# NOTE(review): nesting reconstructed from a flattened copy — confirm against
# the upstream config.
optim: adamw
optim_conf:
  lr: 5.0e-06
  weight_decay: 0.0
# Learning-rate scheduler selection and its options.
# NOTE(review): nesting reconstructed from a flattened copy — confirm against
# the upstream config.
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 2500
# Dataset selection and its options.
# NOTE(review): nesting reconstructed from a flattened copy. The extents of
# the inner `prompt_conf` and `ctc_tokenizer_conf` mappings, and the placement
# of the `ctc_tokenizer*` keys under `dataset_conf`, are assumptions — confirm
# against the upstream config before relying on them.
dataset: FunASR
dataset_conf:
  index_ds: FunASR
  batch_sampler: BatchSampler
  batch_type: token
  batch_size: 6000
  max_token_length: 3500
  shuffle: true
  sort_size: 1024
  batch_size_scale_ratio_max: 2
  num_workers: 4
  # Interpolation (presumably OmegaConf-style): mirrors the adaptor's
  # `downsample_rate`.
  audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
  # NOTE(review): same value as `lfr_n` in `frontend_conf`; presumably the two
  # must stay in sync — verify.
  audio_encoder_downsample_rate: 6
  data_split_num: 256
  batch_size_sample_max: 10
  retry: 2000
  batch_size_token_max: 6000
  max_source_length: 12000
  max_target_length: 2048
  prompt_classes: MultiContextPrompt
  prompt_conf:
    max_neg_hotwords_num: 0
    min_neg_hotwords_num: 0
    use_hist: false
    use_one_pass_result: true
    use_hotwords: true
    use_asr_hotwords: true
    chinese_hotwords_list: null
    english_hotwords_list: null
  ctc_tokenizer: SenseVoiceTokenizer
  ctc_target_normalize: true
  ctc_tokenizer_conf:
    vocab_path: null
    is_multilingual: true
    num_languages: 8749
  min_source_length: 10
  batch_size_scale_threshold: 3000
  use_dynamic_output_ratio: 0.0
# Tokenizer selection and its options.
# NOTE(review): nesting reconstructed from a flattened copy — confirm against
# the upstream config.
tokenizer: HuggingfaceTokenizer
tokenizer_conf:
  # Interpolation (presumably OmegaConf-style): reuses `llm_conf.init_param_path`.
  init_param_path: ${llm_conf.init_param_path}

# Global run settings. The null entries are left unset in this file and are
# presumably provided at launch time — verify against the launcher.
# NOTE(review): these keys are assumed to be top-level rather than members of
# `tokenizer_conf` — TODO confirm.
enable_tf32: true
debug: false
train_data_set_list: null
valid_data_set_list: null
init_param: null
output_dir: null