{
    "backbone": {
        "d_model": 2048,
        "d_intermediate": 0,
        "attn_mlp_d_intermediate": 8192,
        "n_layer": 26,
        "ssm_cfg": {},
        "attn_layer_idx": [
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
            23,
            24,
            25
        ],
        "attn_cfg": {
            "causal": true,
            "num_heads": 16,
            "num_heads_kv": 4,
            "rotary_emb_dim": 128,
            "rotary_emb_interleaved": true,
            "qkv_proj_bias": false,
            "out_proj_bias": false
        },
        "rms_norm": false,
        "residual_in_fp32": false,
        "norm_epsilon": 1e-05
    },
    "prefix_conditioner": {
        "conditioners": [
            {
                "type": "EspeakPhonemeConditioner",
                "name": "espeak"
            },
            {
                "cond_dim": 128,
                "uncond_type": "learned",
                "projection": "linear",
                "type": "PassthroughConditioner",
                "name": "speaker"
            },
            {
                "input_dim": 8,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "emotion"
            },
            {
                "min_val": 0,
                "max_val": 24000,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "fmax"
            },
            {
                "min_val": 0,
                "max_val": 400,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "pitch_std"
            },
            {
                "min_val": 0,
                "max_val": 40,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "speaking_rate"
            },
            {
                "min_val": -1,
                "max_val": 126,
                "uncond_type": "learned",
                "type": "IntegerConditioner",
                "name": "language_id"
            }
        ],
        "projection": "linear"
    },
    "eos_token_id": 1024,
    "masked_token_id": 1025
}