| name: prod_lingua_7B_2T_lin_hq_cd_128N | |
| dump_dir: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N | |
| seed: 777 | |
| grad_acc_steps: 1 | |
| gc_collect_freq: 1000 | |
| probe_freq: null | |
| steps: 239000 | |
| data: | |
| root_dir: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked | |
| sources: | |
| stackv2_edu: 0.170363004622478 | |
| doab: 0.160392905980941 | |
| wikimedia: 0.15321084473113 | |
| stackexchange: 0.149577338855193 | |
| peS2o: 0.121808388328754 | |
| cccc: 0.116267077616518 | |
| arxiv_papers: 0.0649775722614287 | |
| data_provenance_initiative: 0.0455351865398593 | |
| pressbooks: 0.00768988231013495 | |
| libretexts: 0.00481335361292812 | |
| news: 0.00331307394000609 | |
| foodista: 0.00111435553323384 | |
| oercommons: 0.000692183554680895 | |
| python_enhancement_proposals: 0.000151098755264962 | |
| public_domain_review: 9.37333574480524e-05 | |
| batch_size: 4 | |
| seq_len: 4096 | |
| n_views: 2 | |
| seed: 42 | |
| add_bos: true | |
| add_eos: true | |
| load_async: true | |
| prefetch_size: 4096 | |
| tokenizer: | |
| name: tiktoken | |
| path: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked/tokenizer/common-pile-tokenizer.tiktoken | |
| optim: | |
| lr: 0.002 | |
| weight_decay: 0.2 | |
| epsilon: 1.0e-08 | |
| beta1: 0.9 | |
| beta2: 0.95 | |
| clip: 1.0 | |
| scheduler: linear | |
| warmup: -324821 | |
| lr_min_ratio: 0.0 | |
| cycle_length: 1.0 | |
| cosine_theta: 1.0 | |
| annealing_step: 1000 | |
| decay_fraction: 0.1 | |
| exp_factor: 0.5 | |
| model: | |
| dim: 4096 | |
| n_layers: 32 | |
| head_dim: null | |
| n_heads: 32 | |
| n_kv_heads: null | |
| ffn_dim_multiplier: 1.0 | |
| multiple_of: 256 | |
| norm_eps: 1.0e-05 | |
| rope_theta: 100000.0 | |
| init_base_std: null | |
| init_std_factor: disabled | |
| max_seqlen: 4096 | |
| seed: 42 | |
| vocab_size: 64256 | |
| weight_tying: false | |
| sliding_window: null | |
| distributed: | |
| dp_shard: 1 | |
| dp_replicate: 512 | |
| tp_size: 1 | |
| selective_activation_checkpointing: false | |
| compile: true | |
| fsdp_type: full_shard | |
| model_dtype: bf16 | |
| float8_recipe: null | |
| float8_filter: layers\.[0-9]+\. | |
| matmul_allow_tf32: false | |
| detect_anomaly: false | |
| compile_cache_size_limit: 8 | |
| spawn_method: forkserver | |
| env: | |
| MKL_SERVICE_FORCE_INTEL: GNU | |
| OMP_NUM_THREADS: '1' | |
| MKL_NUM_THREADS: '1' | |
| ENABLE_INTRA_NODE_COMM: '1' | |
| TORCH_NCCL_AVOID_RECORD_STREAMS: '1' | |
| NCCL_IB_TIMEOUT: '22' | |
| NCCL_DEBUG: INFO | |
| TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' | |
| checkpoint: | |
| dump: | |
| every: 1000 | |
| keep: -1 | |
| eval: | |
| every: 900 | |
| keep: 11 | |
| path: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N/checkpoints | |
| init_ckpt_path: null | |
| continue_training_from_init: false | |
| ignore_data_loader_state: true | |
| ignore_lr_scheduler_state: true | |
| profiling: | |
| run: false | |
| trace_folder: profiling | |
| mem_warmup: 0 | |
| mem_steps: 4 | |
| profile_warmup: 100 | |
| profile_steps: 4 | |
| logging: | |
| freq: 1 | |
| acc_freq: null | |
| wandb: null | |
| async_eval_gpus: null | |
| eval: null | |