---
## IO / general
save_data: data
overwrite: true  # canonical lowercase boolean (was `True`)
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

## Vocabulary
src_vocab: ar.eole.vocab
tgt_vocab: en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
vocab_size_multiple: 8  # pad vocab to a multiple of 8 for tensor-core efficiency
share_vocab: false
n_sample: 0
# Training/validation corpora. `weight` sets the relative sampling
# frequency of each corpus during training.
data:
  corpus_1:
    path_src: hf://quickmt/quickmt-train.ar-en/ar
    path_tgt: hf://quickmt/quickmt-train.ar-en/en
    path_sco: hf://quickmt/quickmt-train.ar-en/sco
    weight: 2
  corpus_2:
    path_src: hf://quickmt/newscrawl2024-en-backtranslated-ar/ar
    path_tgt: hf://quickmt/newscrawl2024-en-backtranslated-ar/en
    path_sco: hf://quickmt/newscrawl2024-en-backtranslated-ar/sco
    weight: 1
  corpus_3:
    path_src: hf://quickmt/madlad400-en-backtranslated-ar/ar
    path_tgt: hf://quickmt/madlad400-en-backtranslated-ar/en
    path_sco: hf://quickmt/madlad400-en-backtranslated-ar/sco
    weight: 2
  valid:
    path_src: valid.ar
    path_tgt: valid.en
# On-the-fly data transforms: subword tokenization, then length filtering.
transforms: [sentencepiece, filtertoolong]
transforms_configs:
  sentencepiece:
    src_subword_model: "ar.spm.model"
    tgt_subword_model: "en.spm.model"
  filtertoolong:
    # drop examples longer than 256 subword tokens on either side
    src_seq_length: 256
    tgt_seq_length: 256
training:
  # Checkpointing
  model_path: quickmt-ar-en-eole-model
  keep_checkpoint: 4
  train_steps: 200000
  save_checkpoint_steps: 5000
  valid_steps: 5000

  # Hardware: single GPU
  world_size: 1
  gpu_ranks: [0]

  # Batching (token-based; effective batch = 15000 tokens x 8 accumulation)
  batch_type: "tokens"
  batch_size: 15000
  valid_batch_size: 2048
  batch_size_multiple: 8
  dataloading: ~

  # Optimization: Noam schedule with AMP fp16
  accum_count: [8]
  accum_steps: [0]
  compute_dtype: "fp16"
  optim: "adamw"
  use_amp: true  # canonical lowercase boolean (was `True`)
  learning_rate: 3.0  # Noam scale factor, not an absolute LR
  warmup_steps: 5000
  decay_method: "noam"
  adam_beta2: 0.998

  # Data loading
  bucket_size: 256000
  num_workers: 4
  prefetch_factor: 128

  # Regularization
  dropout_steps: [0]
  dropout: [0.1]
  attention_dropout: [0.1]
  max_grad_norm: 0  # 0 disables gradient clipping
  label_smoothing: 0.1
  average_decay: 0.0001
  param_init_method: xavier_uniform
  normalization: "tokens"
# Deep-encoder / shallow-decoder transformer (12 enc / 2 dec layers),
# a common layout for fast inference.
model:
  architecture: "transformer"
  share_embeddings: false
  share_decoder_embeddings: false
  add_estimator: false
  add_ffnbias: true
  add_qkvbias: false
  layer_norm: standard
  mlp_activation_fn: gelu
  hidden_size: 768
  encoder:
    layers: 12
  decoder:
    layers: 2
  heads: 16
  transformer_ff: 4096
  embeddings:
    word_vec_size: 768
    position_encoding_type: "SinusoidalInterleaved"