Nvidia NeMo#

This example shows how to launch Nvidia NeMo jobs with SkyPilot.

Included files#

nemo_bert.yaml

# Distributed training a BERT model with Nvidia NeMo
#
# Finetunes a BERT-like model on the GLUE CoLA task. Uses the NeMo toolkit
# to train across multiple nodes, each node having a V100 GPU.
#
# Uses glue_benchmark.py script from the NeMo examples:
# https://github.com/NVIDIA/NeMo/blob/2ce45369f7ab6cd20c376d1ed393160f5e54be0c/examples/nlp/glue_benchmark/glue_benchmark.py
#
# Usage:
#   sky launch -c nemo_bert nemo_bert.yaml
#
#   # Or try on spot A100 GPUs:
#   sky launch -c nemo_bert nemo_bert.yaml --use-spot --gpus A100:1
#
#   # Terminate cluster after you're done
#   sky down nemo_bert

resources:
  accelerators: V100:1

num_nodes: 2

setup: |
  conda activate nemo
  if [ $? -eq 0 ]; then
      echo "conda env exists"
  else
      conda create -y --name nemo python==3.10.12
      conda activate nemo
  
      # Install PyTorch
      pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
      
      # Install nemo
      sudo apt-get update
      sudo apt-get install -y libsndfile1 ffmpeg
      pip install Cython
      pip install nemo_toolkit['all']
    
      # Clone the NeMo repo to get the examples
      git clone https://github.com/NVIDIA/NeMo.git
      
      # Download GLUE dataset
      wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/70e86a10fbf4ab4ec3f04c9ba82ba58f87c530bf/download_glue_data.py
      python download_glue_data.py --data_dir glue_data --tasks CoLA
  fi

run: |
  conda activate nemo
  
  # Get the number of nodes and master address from SkyPilot envvars
  num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l`
  master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1`
  
  # Run glue_benchmark.py
  python -m torch.distributed.run \
    --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \
    --nnodes=${num_nodes} \
    --node_rank=${SKYPILOT_NODE_RANK} \
    --master_addr=${master_addr} \
    --master_port=8008 \
    NeMo/examples/nlp/glue_benchmark/glue_benchmark.py \
    model.dataset.data_dir=glue_data/CoLA  \
    model.task_name=cola \
    trainer.max_epochs=10 \
    trainer.num_nodes=${num_nodes}

nemo_gpt_distributed.yaml

# Distributed training a GPT style model with Nvidia NeMo on multiple nodes.
#
# Inspired from https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst
#
# Note that we provide a read-only bucket at gs://sky-wiki-data that is used to
# download preprocessed data to local disk. If you want to preprocess the data
# yourself, see nemo_gpt_preprocessing.yaml.
#
# We use a shared bucket to store the index files that are used to coordinate
# between the head and worker nodes. This shared bucket is mounted as a
# network filesystem (NFS) on the head and worker nodes.
#
# After the script completes, the model checkpoints will be saved in
# /ckpts on the head node (can be changed to /shared for cloud storage).
#
# Usage:
#   sky launch --env SHARED_NFS_BUCKET_NAME=<unique_bucket_name> -c nemo_gpt nemo_gpt_distributed.yaml
#
#   # Terminate cluster after you're done
#   sky down nemo_gpt

resources:
  cpus: 8+
  memory: 64+
  accelerators: A100-80GB:1
  image_id: docker:nvcr.io/nvidia/nemo:24.05

num_nodes: 2

envs:
  DATASET_ROOT: /wiki
  SHARED_NFS_ROOT: /shared
  SHARED_NFS_BUCKET_NAME: # Enter a unique bucket name here for the shared directory - if it doesn't exist SkyPilot will create it
  CHECKPOINT_PATH: /ckpts # Store checkpoints at a local path. You can change this to /shared for checkpointing to cloud bucket at every callback, but this will slow down training.

file_mounts:
  ${DATASET_ROOT}:
    source: gs://sky-wiki-data    # This is a read-only bucket provided by SkyPilot for the dataset
    mode: COPY

  # The SHARED_NFS_ROOT path acts as a network filesystem (NFS) between the
  # head and worker nodes. In NeMo, the head node writes an indexmap to this
  # shared filesystem that is read by workers.
  #
  # Note that NeMo requires this shared filesystem to be strongly consistent -
  # any writes made by the head should be immediately visible to the workers.
  ${SHARED_NFS_ROOT}:
    name: ${SHARED_NFS_BUCKET_NAME}
    store: gcs  # We recommend using GCS in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error.
    mode: MOUNT

setup: |
  conda deactivate
  
  # Clone NeMo repo if not already present
  if [ ! -d NeMo ]; then
      git clone https://github.com/NVIDIA/NeMo.git
      cd NeMo 
      git checkout 5df8e11255802a2ce2f33db6362e60990e215b64
  fi

run: |
  conda deactivate
  # ============= Training =============  
  # Get the number of nodes and master address from SkyPilot envvars
  num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l`
  master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1`
  
  # Kill any existing megatron processes
  pkill -f -9 megatron
  
  mkdir -p ${CHECKPOINT_PATH}
  
  echo "Writing checkpoints to ${CHECKPOINT_PATH}"
  echo "Writing index files to shared storage ${SHARED_NFS_ROOT}"
  
  python -m torch.distributed.run \
    --nproc_per_node=${SKYPILOT_NUM_GPUS_PER_NODE} \
    --nnodes=${num_nodes} \
    --node_rank=${SKYPILOT_NODE_RANK} \
    --master_addr=${master_addr} \
    --master_port=12375 \
    NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py  \
    --config-path=conf \
    --config-name=megatron_gpt_config \
    trainer.devices=${SKYPILOT_NUM_GPUS_PER_NODE} \
    trainer.num_nodes=${num_nodes} \
    trainer.max_epochs=null \
    trainer.max_steps=300000 \
    trainer.val_check_interval=50 \
    trainer.log_every_n_steps=50 \
    trainer.limit_val_batches=50 \
    trainer.limit_test_batches=50 \
    trainer.accumulate_grad_batches=1 \
    trainer.precision=16 \
    model.mcore_gpt=True \
    model.micro_batch_size=6 \
    model.global_batch_size=192 \
    model.tensor_model_parallel_size=1 \
    model.pipeline_model_parallel_size=1 \
    model.max_position_embeddings=1024 \
    model.encoder_seq_length=1024 \
    model.hidden_size=768 \
    model.ffn_hidden_size=3072 \
    model.num_layers=12 \
    model.num_attention_heads=12 \
    model.init_method_std=0.021 \
    model.hidden_dropout=0.1 \
    model.layernorm_epsilon=1e-5 \
    model.tokenizer.vocab_file=${DATASET_ROOT}/gpt2-vocab.json \
    model.tokenizer.merge_file=${DATASET_ROOT}/gpt2-merges.txt \
    model.data.data_prefix=[1.0,${DATASET_ROOT}/hfbpe_gpt_training_data_text_document] \
    model.data.num_workers=2 \
    model.data.seq_length=1024 \
    model.data.splits_string=\'980,10,10\' \
    model.data.index_mapping_dir=${SHARED_NFS_ROOT} \
    model.optim.name=fused_adam \
    model.optim.lr=6e-4 \
    model.optim.betas=[0.9,0.95] \
    model.optim.weight_decay=0.1 \
    model.optim.sched.name=CosineAnnealing \
    model.optim.sched.warmup_steps=750 \
    model.optim.sched.constant_steps=80000 \
    model.optim.sched.min_lr=6e-5 \
    exp_manager.resume_if_exists=True \
    exp_manager.resume_ignore_no_checkpoint=True \
    exp_manager.create_checkpoint_callback=True \
    +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \
    exp_manager.checkpoint_callback_params.monitor=val_loss \
    exp_manager.checkpoint_callback_params.save_top_k=3 \
    exp_manager.checkpoint_callback_params.mode=min \
    exp_manager.checkpoint_callback_params.always_save_nemo=True
  
  # Optional - if writing checkpoints to a local directory,
  # copy final checkpoints to the shared bucket at the end of training (~6 GB)
  # if [ ${SKYPILOT_NODE_RANK} -eq 0 ]; then
  #     mkdir -p ${SHARED_NFS_ROOT}/results
  #     cp -R ${CHECKPOINT_PATH}
  # fi

nemo_gpt_preprocessing.yaml

# Prepares the wiki dataset for training with NeMo. Downloads the data, runs
# preprocessing and saves the data in mmap format on a cloud bucket. This same
# bucket can then be used for training.
#
# This YAML is for demonstration purposes and is not a necessary step before
# running nemo_gpt_train.yaml. Since this preprocessing can take
# upto 6 hours, we provide a read-only bucket with the preprocessed data (gs://sky-wiki-data)
# that can be downloaded to your bucket (see nemo_gpt_train.yaml).
#
# Usage:
#   sky launch -s -c nemo_gpt_preprocessing nemo_gpt_preprocessing.yaml
#
#   # Terminate cluster after you're done
#   sky down nemo_gpt_preprocessing

num_nodes: 1

envs:
  LOCAL_DATASET_ROOT: /wiki
  DATASET_BUCKET_ROOT: /bucket
  BUCKET_NAME: # Enter a unique bucket name here - if it doesn't exist SkyPilot will create it

file_mounts:
  ${DATASET_BUCKET_ROOT}:
    name: ${BUCKET_NAME}
    store: gcs  # We recommend using GCS for large datasets in mount mode - S3 based mounts may fail with "transport endpoint is not connected" error.
    mode: MOUNT

setup: |
  conda activate nemo
  if [ $? -eq 0 ]; then
      echo "Nemo conda env exists"
  else
      echo "Setup start"
  
      conda create -y --name nemo python==3.10.12
      conda activate nemo
  
      # Install PyTorch
      pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
      
      # Install nemo
      git clone https://github.com/NVIDIA/NeMo.git
      cd NeMo
      git checkout b4ad7eaa7873d632391d6985aa6b359f39c20bab
      pip install Cython
      pip install .[all]
      cd ..
  
      # Install megatron-core
      # We install in editable mode because setup.py does not install all 
      # required modules if we install in non-editable mode.
      git clone https://github.com/NVIDIA/Megatron-LM
      cd Megatron-LM
      git checkout dc21350806361564b8ce61d4a8d247cb195cc5f0
      pip install -e .  
      cd ..
      
      # Install ninja for faster compilation
      pip install ninja packaging
  
      # Install transformer engine and flash-attn (Takes ~1hr to compile)
      MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation # Version upper capped by TransformerEngine
      MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
      
      pip install pytorch-extension
  
      # Install Apex
      git clone https://github.com/NVIDIA/apex.git
      cd apex
      git checkout 52e18c894223800cb611682dce27d88050edf1de
      pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
      cd ..
  fi

run: |
  conda activate nemo
  
  # ======== Download and preprocess the wikipedia dataset ========
  if [ -f ${LOCAL_DATASET_ROOT}/train_data.jsonl ]; then
      echo "Dataset exists"
  else
      # Install axel for faster downloads
      sudo apt-get install -y axel
  
      mkdir -p ${LOCAL_DATASET_ROOT}
      cd ${LOCAL_DATASET_ROOT}
  
      # Download the wikipedia dataset (takes ~15 min)
      axel -n 20 https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
      
      # Preprocess the wikipedia dataset (takes ~2 hours)
      pip install wikiextractor
      python -m wikiextractor.WikiExtractor enwiki-latest-pages-articles.xml.bz2 --json
      find text -name 'wiki_*' -exec cat {} \; > train_data.jsonl
  fi
  
  # ======== Download tokenizer files ========
  # Check if the tokenizer files exist
  if [ -f ${LOCAL_DATASET_ROOT}/gpt2-vocab.json ]; then
      echo "Tokenizer files exist"
  else
      # Download the tokenizer files
      cd {LOCAL_DATASET_ROOT}
      axel -n 20 https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
      axel -n 20 https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
  fi
  
  # ======== Convert data to mmap format and write to bucket ========
  # Check if the mmap files exist
  if [ -f ${LOCAL_DATASET_ROOT}/hfbpe_gpt_training_data_text_document.bin ]; then
      echo "Mmap files exist"
  else
      # Convert the data to mmap format`
      cd ${LOCAL_DATASET_ROOT}
      python $HOME/sky_workdir/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
        --input=train_data.jsonl \
        --json-keys=text \
        --tokenizer-library=megatron \
        --vocab gpt2-vocab.json \
        --dataset-impl mmap \
        --tokenizer-type GPT2BPETokenizer \
        --merge-file gpt2-merges.txt \
        --output-prefix=hfbpe_gpt_training_data \
        --append-eod \
        --workers=32
  fi
  
  echo "Done preprocessing dataset, copying to mounted bucket now."
  cp {gpt2-merges.txt,gpt2-vocab.json,hfbpe_gpt_training_data_text_document.bin,hfbpe_gpt_training_data_text_document.idx} ${DATASET_BUCKET_ROOT}
  echo "Done copying - data is now available on ${BUCKET_NAME} bucket."

nemo_gpt_singlenode.yaml

# Single node training a GPT style model with Nvidia NeMo
#
# This script downloads data from read-only bucket at gs://sky-wiki-data.
# If you want to preprocess the data yourself, see nemo_gpt_preprocessing.yaml.
#
# The specific model used here should fit on GPU with 16GB memory.
#
# After the script completes, the model checkpoints will be saved in
# /ckpts (configurable through CHECKPOINT_PATH env var) on the head node.
#
# Usage:
#   sky launch -c nemo_gpt nemo_gpt_singlenode.yaml
#
#   # Or try on spot A100 GPUs:
#   sky launch -c nemo_gpt nemo_gpt_singlenode.yaml --use-spot --gpus A100:1
#
#   # Terminate cluster after you're done
#   sky down nemo_gpt

resources:
  cpus: 8+
  memory: 64+
  accelerators: A100-80GB:1
  image_id: docker:nvcr.io/nvidia/nemo:24.05

num_nodes: 1

envs:
  DATASET_ROOT: /wiki
  CHECKPOINT_PATH: /ckpts


file_mounts:
  ${DATASET_ROOT}:
    source: gs://sky-wiki-data    # This is a read-only bucket provided by SkyPilot for the dataset
    mode: COPY

setup: |
  conda deactivate
  
  # Clone NeMo repo if not already present
  if [ ! -d NeMo ]; then
      git clone https://github.com/NVIDIA/NeMo.git
      cd NeMo 
      git checkout 5df8e11255802a2ce2f33db6362e60990e215b64
  fi
  
  # Install gsutil if it doesn't exist
  if ! command -v gsutil &> /dev/null
  then
      pip install gsutil
  else
      echo "gsutil exists"
  fi

run: |
  conda deactivate

  # Kill any existing megatron processes
  pkill -f -9 megatron
  
  mkdir -p ${CHECKPOINT_PATH}
  
  # ============= Training =============    
  python NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py  \
    --config-path=conf \
    --config-name=megatron_gpt_config \
    trainer.devices=${SKYPILOT_NUM_GPUS_PER_NODE} \
    trainer.num_nodes=1 \
    trainer.max_epochs=null \
    trainer.max_steps=300000 \
    trainer.val_check_interval=50 \
    trainer.log_every_n_steps=50 \
    trainer.limit_val_batches=50 \
    trainer.limit_test_batches=50 \
    trainer.accumulate_grad_batches=1 \
    trainer.precision=16 \
    model.mcore_gpt=True \
    model.micro_batch_size=6 \
    model.global_batch_size=192 \
    model.tensor_model_parallel_size=1 \
    model.pipeline_model_parallel_size=1 \
    model.max_position_embeddings=1024 \
    model.encoder_seq_length=1024 \
    model.hidden_size=768 \
    model.ffn_hidden_size=3072 \
    model.num_layers=12 \
    model.num_attention_heads=12 \
    model.init_method_std=0.021 \
    model.hidden_dropout=0.1 \
    model.layernorm_epsilon=1e-5 \
    model.tokenizer.vocab_file=${DATASET_ROOT}/gpt2-vocab.json \
    model.tokenizer.merge_file=${DATASET_ROOT}/gpt2-merges.txt \
    model.data.data_prefix=[1.0,${DATASET_ROOT}/hfbpe_gpt_training_data_text_document] \
    model.data.num_workers=2 \
    model.data.seq_length=1024 \
    model.data.splits_string=\'980,10,10\' \
    model.optim.name=fused_adam \
    model.optim.lr=6e-4 \
    model.optim.betas=[0.9,0.95] \
    model.optim.weight_decay=0.1 \
    model.optim.sched.name=CosineAnnealing \
    model.optim.sched.warmup_steps=750 \
    model.optim.sched.constant_steps=80000 \
    model.optim.sched.min_lr=6e-5 \
    exp_manager.resume_if_exists=True \
    exp_manager.resume_ignore_no_checkpoint=True \
    exp_manager.create_checkpoint_callback=True \
    +exp_manager.checkpoint_callback_params.dirpath=${CHECKPOINT_PATH} \
    exp_manager.checkpoint_callback_params.monitor=val_loss \
    exp_manager.checkpoint_callback_params.save_top_k=3 \
    exp_manager.checkpoint_callback_params.mode=min \
    exp_manager.checkpoint_callback_params.always_save_nemo=False