Source: llm/fairseq2
Fairseq2: Meta FAIR’s Sequence Modeling Toolkit
Fairseq2 is Meta FAIR’s next-generation sequence modeling toolkit. It provides recipes for LLM instruction fine-tuning and preference optimization, with multi-GPU and multi-node support via DDP, FSDP, and tensor parallelism.
Why SkyPilot + Fairseq2?
SkyPilot makes fine-tuning with fairseq2 effortless:
Run anywhere - Same YAML works on Kubernetes, Slurm, AWS, GCP, Azure, and 20+ other clouds
Multi-node with zero setup - Handles distributed fine-tuning across nodes automatically
No vendor lock-in - Checkpoints saved to your own cloud storage
Quick Start
First, set up your Hugging Face token (see Preparation section for details):
export HF_TOKEN=your_hf_token_here
Launch instruction fine-tuning on a single GPU:
cd llm/fairseq2
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN
Monitor training progress:
sky logs fairseq2-sft
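When the run finishes, checkpoints land in the bucket named by CHECKPOINT_BUCKET_NAME (sky-fairseq2-checkpoints by default, set in the YAMLs below). A quick way to inspect them, assuming an S3-backed bucket (use gsutil ls on GCS):
sky storage ls
aws s3 ls s3://sky-fairseq2-checkpoints/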
Examples
Instruction Fine-tuning (SFT)
Fine-tune Llama 3.2 1B on the GSM8K math reasoning dataset:
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN
Use a larger model:
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN \
  --gpus H100:8 --env MODEL=llama3_1_70b
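Any value in the envs section of sft.sky.yaml can be overridden the same way. For example, a shorter run with a longer context window (the values here are only illustrative):
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN \
  --env MAX_NUM_STEPS=500 --env MAX_SEQ_LEN=4096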
Multi-Node Fine-tuning
Fine-tune across multiple nodes with FSDP:
sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN
# Use a larger model:
sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN \
  --gpus H100:8 --env MODEL=llama3_1_70b
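multinode.sky.yaml defaults to 2 nodes; you can scale out at launch time without editing the YAML (4 nodes here is just an example):
sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN \
  --num-nodes 4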
Preparation
1. Request Model Access
Llama models are gated and require access approval:
Request access to the Llama models on Hugging Face: https://huggingface.co/meta-llama
2. Get Your Hugging Face Token
Create a new token with “Read” permissions at https://huggingface.co/settings/tokens
Copy the token for the next step
3. Set Environment Variable
export HF_TOKEN="your_token_here"
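To sanity-check the token, you can ask the Hugging Face CLI who you are (requires huggingface_hub installed locally; the CLI reads HF_TOKEN from the environment):
hf auth whoami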
4. Install SkyPilot
uv pip install "skypilot-nightly[aws,gcp,kubernetes,slurm]"
# See: https://docs.skypilot.co/en/latest/getting-started/installation.html
5. Verify Setup
sky check
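You can also limit the check to the infra you plan to use, for example:
sky check aws kubernetes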
Learn More
fairseq2 repository: https://github.com/facebookresearch/fairseq2
fairseq2 documentation: https://facebookresearch.github.io/fairseq2/stable/
SkyPilot documentation: https://docs.skypilot.co/en/latest/
Included files
llama_hf_assets.yaml
name: llama3_2_1b@user
base: llama3
model_arch: llama3_2_1b
checkpoint: "hg://meta-llama/Llama-3.2-1B"
tokenizer: "hg://meta-llama/Llama-3.2-1B"
---
name: llama3_2_1b_instruct@user
base: llama3_instruct
model_arch: llama3_2_1b
checkpoint: "hg://meta-llama/Llama-3.2-1B-Instruct"
tokenizer: "hg://meta-llama/Llama-3.2-1B-Instruct"
---
name: llama3_1_8b@user
base: llama3
model_arch: llama3_1_8b
checkpoint: "hg://meta-llama/Llama-3.1-8B"
tokenizer: "hg://meta-llama/Llama-3.1-8B"
---
name: llama3_1_8b_instruct@user
base: llama3_instruct
model_arch: llama3_1_8b
checkpoint: "hg://meta-llama/Llama-3.1-8B-Instruct"
tokenizer: "hg://meta-llama/Llama-3.1-8B-Instruct"
---
name: llama3_1_70b@user
base: llama3
model_arch: llama3_1_70b
checkpoint: "hg://meta-llama/Llama-3.1-70B"
tokenizer: "hg://meta-llama/Llama-3.1-70B"
---
name: llama3_1_70b_instruct@user
base: llama3_instruct
model_arch: llama3_1_70b
checkpoint: "hg://meta-llama/Llama-3.1-70B-Instruct"
tokenizer: "hg://meta-llama/Llama-3.1-70B-Instruct"
multinode.sky.yaml
# Fairseq2: Multi-node Fine-tuning on SkyPilot
#
# Fine-tune a Llama model across multiple nodes using fairseq2 with FSDP.
# Defaults to Llama 3.2 1B on 2 nodes.
#
# Usage:
#   sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN
#
# Use a larger model:
#   sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN \
#     --gpus H100:8 --env MODEL=llama3_1_70b

envs:
  MODEL: llama3_2_1b
  MAX_NUM_STEPS: 2000
  # Max sequence length (controls memory usage; reduce if OOM).
  # max_num_tokens is set to 2x this value, following fairseq2 defaults.
  MAX_SEQ_LEN: 2048
  # Change this to your own checkpoint bucket
  CHECKPOINT_BUCKET_NAME: sky-fairseq2-checkpoints

secrets:
  HF_TOKEN: null # Pass with `--secret HF_TOKEN` in CLI

workdir:
  url: https://github.com/facebookresearch/fairseq2.git
  ref: main

resources:
  accelerators: L40S

num_nodes: 2

file_mounts:
  # Shared cloud storage for checkpoints.
  # NOTE: If you're on a Slurm cluster with NFS, you may remove this mount and
  # point the checkpoints symlink to a shared NFS path instead.
  /output:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT
  # Register Llama asset cards so fairseq2 can download gated models from HF.
  # See https://facebookresearch.github.io/fairseq2/stable/basics/assets.html
  ~/.config/fairseq2/assets/llama_hf.yaml: llama_hf_assets.yaml

setup: |
  # Install system dependencies (libsndfile required by fairseq2n)
  sudo apt-get update && sudo apt-get install -y libsndfile1

  # Create virtual environment
  uv venv --python 3.11 --seed
  source .venv/bin/activate

  # Install PyTorch 2.7.1 with CUDA 12.6 and fairseq2 nightly.
  # fairseq2 native (fairseq2n) requires an exact PyTorch+CUDA match;
  # see https://github.com/facebookresearch/fairseq2#variants
  uv pip install torch==2.7.1 torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu126
  uv pip install fairseq2 --pre \
    --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.7.1/cu126
  uv pip install huggingface_hub

  # Log in to Hugging Face for model access (if a token was provided)
  if [ -n "$HF_TOKEN" ]; then
    hf auth login --token "$HF_TOKEN"
  fi

  # Pre-download model weights on every node. fairseq2 only downloads on
  # rank 0 and expects a shared filesystem, so we must cache on each node.
  HF_REPO=$(grep -A3 "name: ${MODEL}@" ~/.config/fairseq2/assets/llama_hf.yaml | grep checkpoint | sed 's/.*hg:\/\///' | tr -d '"')
  if [ -n "$HF_REPO" ]; then
    echo "Pre-downloading model $HF_REPO..."
    hf download "$HF_REPO"
  fi

  # Download the GSM8K dataset formatted for fairseq2
  echo "Downloading fairseq2-lm-gsm8k dataset..."
  mkdir -p ~/datasets/facebook/fairseq2-lm-gsm8k
  # Download all dataset files
  hf download facebook/fairseq2-lm-gsm8k \
    --repo-type dataset --local-dir ~/datasets/facebook/fairseq2-lm-gsm8k
  echo "Dataset download completed."

run: |
  set -e

  # Unset CONDA_PREFIX so fairseq2n uses the system libsndfile (from apt)
  # instead of searching only in conda's site-packages.
  unset CONDA_PREFIX
  source .venv/bin/activate

  # Extract the master address for distributed fine-tuning
  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
  MASTER_PORT=29500
  echo "Node rank: $SKYPILOT_NODE_RANK"
  echo "Master address: $MASTER_ADDR:$MASTER_PORT"
  echo "Total nodes: $SKYPILOT_NUM_NODES"
  echo "GPUs per node: $SKYPILOT_NUM_GPUS_PER_NODE"

  # Use a local dir for training output (logs use append mode, which MOUNT
  # does not support). Symlink checkpoints/ to the shared bucket mount so
  # all nodes can write shards.
  # NOTE: On Slurm with NFS, you may skip the symlink and use the NFS path directly.
  OUTPUT_DIR=~/output/${MODEL}-multinode
  mkdir -p $OUTPUT_DIR
  ln -sfn /output $OUTPUT_DIR/checkpoints

  # Run distributed fine-tuning with torchrun
  torchrun \
    --nnodes=$SKYPILOT_NUM_NODES \
    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
    --node_rank=$SKYPILOT_NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    -m recipes.lm.sft \
    $OUTPUT_DIR \
    --config \
    model.name=$MODEL \
    tokenizer.name=$MODEL \
    regime.num_steps=$MAX_NUM_STEPS \
    dataset.max_seq_len=$MAX_SEQ_LEN \
    dataset.max_num_tokens=$((MAX_SEQ_LEN * 2)) \
    trainer.mixed_precision.dtype=bfloat16 \
    gang.tensor_parallel_size=$SKYPILOT_NUM_GPUS_PER_NODE \
    common.no_sweep_dir=true

  echo "Distributed fine-tuning completed! Checkpoints saved to: $OUTPUT_DIR"
sft.sky.yaml
# Fairseq2: Instruction Fine-tuning LLMs on SkyPilot
#
# Fine-tune a Llama model on GSM8K using fairseq2, Meta FAIR's sequence
# modeling toolkit. Defaults to Llama 3.2 1B on a single GPU.
#
# Usage:
#   sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN
#
# Use a larger model:
#   sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN \
#     --gpus H100:8 --env MODEL=llama3_1_70b

envs:
  MODEL: llama3_2_1b
  MAX_NUM_STEPS: 2000
  # Max sequence length (controls memory usage; reduce if OOM).
  # max_num_tokens is set to 2x this value, following fairseq2 defaults.
  MAX_SEQ_LEN: 2048
  # Change this to your own checkpoint bucket
  CHECKPOINT_BUCKET_NAME: sky-fairseq2-checkpoints

secrets:
  HF_TOKEN: null # Pass with `--secret HF_TOKEN` in CLI

workdir:
  url: https://github.com/facebookresearch/fairseq2.git
  ref: main

resources:
  accelerators: L40S

file_mounts:
  # Shared cloud storage for checkpoints.
  # NOTE: If you're on a Slurm cluster with NFS, you may remove this mount and
  # point the checkpoints symlink to a shared NFS path instead.
  /output:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT
  # Register Llama asset cards so fairseq2 can download gated models from HF.
  # See https://facebookresearch.github.io/fairseq2/stable/basics/assets.html
  ~/.config/fairseq2/assets/llama_hf.yaml: llama_hf_assets.yaml

setup: |
  # Install system dependencies (libsndfile required by fairseq2n)
  sudo apt-get update && sudo apt-get install -y libsndfile1

  # Create virtual environment
  uv venv --python 3.11 --seed
  source .venv/bin/activate

  # Install PyTorch 2.7.1 with CUDA 12.6 and fairseq2 nightly.
  # fairseq2 native (fairseq2n) requires an exact PyTorch+CUDA match;
  # see https://github.com/facebookresearch/fairseq2#variants
  uv pip install torch==2.7.1 torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu126
  uv pip install fairseq2 --pre \
    --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.7.1/cu126
  uv pip install huggingface_hub

  # Log in to Hugging Face for model access (if a token was provided)
  if [ -n "$HF_TOKEN" ]; then
    hf auth login --token "$HF_TOKEN"
  fi

  # Download the GSM8K dataset formatted for fairseq2
  echo "Downloading fairseq2-lm-gsm8k dataset..."
  mkdir -p ~/datasets/facebook/fairseq2-lm-gsm8k
  # Download all dataset files
  hf download facebook/fairseq2-lm-gsm8k \
    --repo-type dataset --local-dir ~/datasets/facebook/fairseq2-lm-gsm8k
  echo "Dataset download completed."

run: |
  set -e

  # Unset CONDA_PREFIX so fairseq2n uses the system libsndfile (from apt)
  # instead of searching only in conda's site-packages.
  unset CONDA_PREFIX
  source .venv/bin/activate

  # Use a local dir for training output (logs use append mode, which MOUNT
  # does not support). Symlink checkpoints/ to the shared bucket mount so
  # all nodes can write shards.
  # NOTE: On Slurm with NFS, you may skip the symlink and use the NFS path directly.
  OUTPUT_DIR=~/output/${MODEL}-sft
  mkdir -p $OUTPUT_DIR
  ln -sfn /output $OUTPUT_DIR/checkpoints

  # Run instruction fine-tuning on the GSM8K dataset.
  # torchrun launches the fine-tuning recipe (supports multi-GPU with FSDP).
  NUM_GPUS=${SKYPILOT_NUM_GPUS_PER_NODE:-1}
  torchrun \
    --nnodes=1 \
    --nproc_per_node=$NUM_GPUS \
    -m recipes.lm.sft \
    $OUTPUT_DIR \
    --config \
    model.name=$MODEL \
    tokenizer.name=$MODEL \
    regime.num_steps=$MAX_NUM_STEPS \
    dataset.max_seq_len=$MAX_SEQ_LEN \
    dataset.max_num_tokens=$((MAX_SEQ_LEN * 2)) \
    trainer.mixed_precision.dtype=bfloat16 \
    common.no_sweep_dir=true

  echo "Fine-tuning completed! Checkpoints saved to: $OUTPUT_DIR"

  # Validate the fine-tuned model by generating on test samples (rank 0 only)
  if [ "${SKYPILOT_NODE_RANK:-0}" -eq 0 ]; then
    head -10 ~/datasets/facebook/fairseq2-lm-gsm8k/sft_test/test.jsonl > /tmp/test_samples.jsonl
    python -m recipes.lm.generate $OUTPUT_DIR/eval --config \
      model.name=$MODEL \
      model.dtype=bfloat16 \
      tokenizer.name=$MODEL \
      common.assets.prev_checkpoint_dir=$OUTPUT_DIR/checkpoints \
      "dataset.config_overrides.paths=[/tmp/test_samples.jsonl]" \
      common.no_sweep_dir=true
  fi
  echo "All outputs saved to: $OUTPUT_DIR"