Source: llm/fairseq2
Fairseq2: Meta FAIR’s Sequence Modeling Toolkit
Fairseq2 is Meta FAIR’s next-generation sequence modeling toolkit. It provides recipes for LLM instruction fine-tuning and preference optimization, with multi-GPU and multi-node support via DDP, FSDP, and tensor parallelism.
Why SkyPilot + Fairseq2?
SkyPilot makes fine-tuning with fairseq2 effortless:
Run anywhere - Same YAML works on Kubernetes, Slurm, AWS, GCP, Azure, and 20+ other clouds
Multi-node with zero setup - Handles distributed fine-tuning across nodes automatically
No vendor lock-in - Checkpoints saved to your own cloud storage
Quick Start
First, set up your Hugging Face token (see Preparation section for details):
export HF_TOKEN=your_hf_token_here
Launch instruction fine-tuning on a single GPU:
cd llm/fairseq2
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN
Monitor training progress:
sky logs fairseq2-sft
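When the run finishes, checkpoints land in the bucket named by CHECKPOINT_BUCKET_NAME (sky-fairseq2-checkpoints by default, set in the YAMLs below). A quick way to inspect them, assuming an S3-backed bucket (use gsutil ls on GCS):
sky storage ls
aws s3 ls s3://sky-fairseq2-checkpoints/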
Examples
Instruction Fine-tuning (SFT)
Fine-tune Llama 3.2 1B on the GSM8K math reasoning dataset:
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN
Use a larger model:
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN \
  --gpus H100:8 --env MODEL=llama3_1_70b
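Any value in the envs section of sft.sky.yaml can be overridden the same way. For example, a shorter run with a longer context window (the values here are only illustrative):
sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN \
  --env MAX_NUM_STEPS=500 --env MAX_SEQ_LEN=4096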
Multi-Node Fine-tuning
Fine-tune across multiple nodes with FSDP:
sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN
# Use a larger model:
sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN \
  --gpus H100:8 --env MODEL=llama3_1_70b
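multinode.sky.yaml defaults to 2 nodes; you can scale out at launch time without editing the YAML (4 nodes here is just an example):
sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN \
  --num-nodes 4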
Preparation
1. Request Model Access
Llama models are gated and require access approval:
Request access to the Llama models on Hugging Face: https://huggingface.co/meta-llama
2. Get Your Hugging Face Token
Create a new token with “Read” permissions at https://huggingface.co/settings/tokens
Copy the token for the next step
3. Set Environment Variable
export HF_TOKEN="your_token_here"
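To sanity-check the token, you can ask the Hugging Face CLI who you are (requires huggingface_hub installed locally; the CLI reads HF_TOKEN from the environment):
hf auth whoami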
4. Install SkyPilot
uv pip install "skypilot-nightly[aws,gcp,kubernetes,slurm]"
# See: https://docs.skypilot.co/en/latest/getting-started/installation.html
5. Verify Setup
sky check
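You can also limit the check to the infra you plan to use, for example:
sky check aws kubernetes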
Learn More
fairseq2 repository: https://github.com/facebookresearch/fairseq2
fairseq2 documentation: https://facebookresearch.github.io/fairseq2/stable/
SkyPilot documentation: https://docs.skypilot.co/en/latest/
Included files
llama_hf_assets.yaml
name: llama3_2_1b@user
base: llama3
model_arch: llama3_2_1b
checkpoint: "hg://meta-llama/Llama-3.2-1B"
tokenizer: "hg://meta-llama/Llama-3.2-1B"
---
name: llama3_2_1b_instruct@user
base: llama3_instruct
model_arch: llama3_2_1b
checkpoint: "hg://meta-llama/Llama-3.2-1B-Instruct"
tokenizer: "hg://meta-llama/Llama-3.2-1B-Instruct"
---
name: llama3_1_8b@user
base: llama3
model_arch: llama3_1_8b
checkpoint: "hg://meta-llama/Llama-3.1-8B"
tokenizer: "hg://meta-llama/Llama-3.1-8B"
---
name: llama3_1_8b_instruct@user
base: llama3_instruct
model_arch: llama3_1_8b
checkpoint: "hg://meta-llama/Llama-3.1-8B-Instruct"
tokenizer: "hg://meta-llama/Llama-3.1-8B-Instruct"
---
name: llama3_1_70b@user
base: llama3
model_arch: llama3_1_70b
checkpoint: "hg://meta-llama/Llama-3.1-70B"
tokenizer: "hg://meta-llama/Llama-3.1-70B"
---
name: llama3_1_70b_instruct@user
base: llama3_instruct
model_arch: llama3_1_70b
checkpoint: "hg://meta-llama/Llama-3.1-70B-Instruct"
tokenizer: "hg://meta-llama/Llama-3.1-70B-Instruct"
multinode.sky.yaml
# Fairseq2: Multi-node Fine-tuning on SkyPilot
#
# Fine-tune a Llama model across multiple nodes using fairseq2 with FSDP.
# Defaults to Llama 3.2 1B on 2 nodes.
#
# Usage:
#   sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN
#
# Use a larger model:
#   sky launch -c fairseq2-multi multinode.sky.yaml --secret HF_TOKEN \
#     --gpus H100:8 --env MODEL=llama3_1_70b

envs:
  MODEL: llama3_2_1b
  MAX_NUM_STEPS: 2000
  # Max sequence length (controls memory usage; reduce if OOM).
  # max_num_tokens is set to 2x this value, following fairseq2 defaults.
  MAX_SEQ_LEN: 2048
  # Change this to your own checkpoint bucket
  CHECKPOINT_BUCKET_NAME: sky-fairseq2-checkpoints

secrets:
  HF_TOKEN: null # Pass with `--secret HF_TOKEN` in CLI

workdir:
  url: https://github.com/facebookresearch/fairseq2.git
  ref: main

resources:
  accelerators: L40S

num_nodes: 2

file_mounts:
  # Shared cloud storage for checkpoints.
  # NOTE: If you're on a Slurm cluster with NFS, you may remove this mount and
  # point the checkpoints symlink to a shared NFS path instead.
  /output:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT
  # Register Llama asset cards so fairseq2 can download gated models from HF.
  # See https://facebookresearch.github.io/fairseq2/stable/basics/assets.html
  ~/.config/fairseq2/assets/llama_hf.yaml: llama_hf_assets.yaml

setup: |
  # Install system dependencies (libsndfile required by fairseq2n)
  sudo apt-get update && sudo apt-get install -y libsndfile1

  # Create virtual environment
  uv venv --python 3.11 --seed
  source .venv/bin/activate

  # Install PyTorch 2.7.1 with CUDA 12.6 and fairseq2 nightly.
  # fairseq2 native (fairseq2n) requires an exact PyTorch+CUDA match;
  # see https://github.com/facebookresearch/fairseq2#variants
  uv pip install torch==2.7.1 torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu126
  uv pip install fairseq2 --pre \
    --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.7.1/cu126
  uv pip install huggingface_hub

  # Log in to Hugging Face for model access (if a token was provided)
  if [ -n "$HF_TOKEN" ]; then
    hf auth login --token "$HF_TOKEN"
  fi

  # Pre-download model weights on every node. fairseq2 only downloads on
  # rank 0 and expects a shared filesystem, so we must cache on each node.
  HF_REPO=$(grep -A3 "name: ${MODEL}@" ~/.config/fairseq2/assets/llama_hf.yaml | grep checkpoint | sed 's/.*hg:\/\///' | tr -d '"')
  if [ -n "$HF_REPO" ]; then
    echo "Pre-downloading model $HF_REPO..."
    hf download "$HF_REPO"
  fi

  # Download the GSM8K dataset formatted for fairseq2
  echo "Downloading fairseq2-lm-gsm8k dataset..."
  mkdir -p ~/datasets/facebook/fairseq2-lm-gsm8k
  # Download all dataset files
  hf download facebook/fairseq2-lm-gsm8k \
    --repo-type dataset --local-dir ~/datasets/facebook/fairseq2-lm-gsm8k
  echo "Dataset download completed."

run: |
  set -e

  # Unset CONDA_PREFIX so fairseq2n uses the system libsndfile (from apt)
  # instead of searching only in conda's site-packages.
  unset CONDA_PREFIX
  source .venv/bin/activate

  # Extract the master address for distributed fine-tuning
  MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
  MASTER_PORT=29500
  echo "Node rank: $SKYPILOT_NODE_RANK"
  echo "Master address: $MASTER_ADDR:$MASTER_PORT"
  echo "Total nodes: $SKYPILOT_NUM_NODES"
  echo "GPUs per node: $SKYPILOT_NUM_GPUS_PER_NODE"

  # Use a local dir for training output (logs use append mode, which MOUNT
  # does not support). Symlink checkpoints/ to the shared bucket mount so
  # all nodes can write shards.
  # NOTE: On Slurm with NFS, you may skip the symlink and use the NFS path directly.
  OUTPUT_DIR=~/output/${MODEL}-multinode
  mkdir -p $OUTPUT_DIR
  ln -sfn /output $OUTPUT_DIR/checkpoints

  # Run distributed fine-tuning with torchrun
  torchrun \
    --nnodes=$SKYPILOT_NUM_NODES \
    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
    --node_rank=$SKYPILOT_NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    -m recipes.lm.sft \
    $OUTPUT_DIR \
    --config \
    model.name=$MODEL \
    tokenizer.name=$MODEL \
    regime.num_steps=$MAX_NUM_STEPS \
    dataset.max_seq_len=$MAX_SEQ_LEN \
    dataset.max_num_tokens=$((MAX_SEQ_LEN * 2)) \
    trainer.mixed_precision.dtype=bfloat16 \
    gang.tensor_parallel_size=$SKYPILOT_NUM_GPUS_PER_NODE \
    common.no_sweep_dir=true

  echo "Distributed fine-tuning completed! Checkpoints saved to: $OUTPUT_DIR"
sft.sky.yaml
# Fairseq2: Instruction Fine-tuning LLMs on SkyPilot
#
# Fine-tune a Llama model on GSM8K using fairseq2, Meta FAIR's sequence
# modeling toolkit. Defaults to Llama 3.2 1B on a single GPU.
#
# Usage:
#   sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN
#
# Use a larger model:
#   sky launch -c fairseq2-sft sft.sky.yaml --secret HF_TOKEN \
#     --gpus H100:8 --env MODEL=llama3_1_70b

envs:
  MODEL: llama3_2_1b
  MAX_NUM_STEPS: 2000
  # Max sequence length (controls memory usage; reduce if OOM).
  # max_num_tokens is set to 2x this value, following fairseq2 defaults.
  MAX_SEQ_LEN: 2048
  # Change this to your own checkpoint bucket
  CHECKPOINT_BUCKET_NAME: sky-fairseq2-checkpoints

secrets:
  HF_TOKEN: null # Pass with `--secret HF_TOKEN` in CLI

workdir:
  url: https://github.com/facebookresearch/fairseq2.git
  ref: main

resources:
  accelerators: L40S

file_mounts:
  # Shared cloud storage for checkpoints.
  # NOTE: If you're on a Slurm cluster with NFS, you may remove this mount and
  # point the checkpoints symlink to a shared NFS path instead.
  /output:
    name: $CHECKPOINT_BUCKET_NAME
    mode: MOUNT
  # Register Llama asset cards so fairseq2 can download gated models from HF.
  # See https://facebookresearch.github.io/fairseq2/stable/basics/assets.html
  ~/.config/fairseq2/assets/llama_hf.yaml: llama_hf_assets.yaml

setup: |
  # Install system dependencies (libsndfile required by fairseq2n)
  sudo apt-get update && sudo apt-get install -y libsndfile1

  # Create virtual environment
  uv venv --python 3.11 --seed
  source .venv/bin/activate

  # Install PyTorch 2.7.1 with CUDA 12.6 and fairseq2 nightly.
  # fairseq2 native (fairseq2n) requires an exact PyTorch+CUDA match;
  # see https://github.com/facebookresearch/fairseq2#variants
  uv pip install torch==2.7.1 torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu126
  uv pip install fairseq2 --pre \
    --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.7.1/cu126
  uv pip install huggingface_hub

  # Log in to Hugging Face for model access (if a token was provided)
  if [ -n "$HF_TOKEN" ]; then
    hf auth login --token "$HF_TOKEN"
  fi

  # Download the GSM8K dataset formatted for fairseq2
  echo "Downloading fairseq2-lm-gsm8k dataset..."
  mkdir -p ~/datasets/facebook/fairseq2-lm-gsm8k
  # Download all dataset files
  hf download facebook/fairseq2-lm-gsm8k \
    --repo-type dataset --local-dir ~/datasets/facebook/fairseq2-lm-gsm8k
  echo "Dataset download completed."

run: |
  set -e

  # Unset CONDA_PREFIX so fairseq2n uses the system libsndfile (from apt)
  # instead of searching only in conda's site-packages.
  unset CONDA_PREFIX
  source .venv/bin/activate

  # Use a local dir for training output (logs use append mode, which MOUNT
  # does not support). Symlink checkpoints/ to the shared bucket mount so
  # all nodes can write shards.
  # NOTE: On Slurm with NFS, you may skip the symlink and use the NFS path directly.
  OUTPUT_DIR=~/output/${MODEL}-sft
  mkdir -p $OUTPUT_DIR
  ln -sfn /output $OUTPUT_DIR/checkpoints

  # Run instruction fine-tuning on the GSM8K dataset.
  # torchrun launches the fine-tuning recipe (supports multi-GPU with FSDP).
  NUM_GPUS=${SKYPILOT_NUM_GPUS_PER_NODE:-1}
  torchrun \
    --nnodes=1 \
    --nproc_per_node=$NUM_GPUS \
    -m recipes.lm.sft \
    $OUTPUT_DIR \
    --config \
    model.name=$MODEL \
    tokenizer.name=$MODEL \
    regime.num_steps=$MAX_NUM_STEPS \
    dataset.max_seq_len=$MAX_SEQ_LEN \
    dataset.max_num_tokens=$((MAX_SEQ_LEN * 2)) \
    trainer.mixed_precision.dtype=bfloat16 \
    common.no_sweep_dir=true

  echo "Fine-tuning completed! Checkpoints saved to: $OUTPUT_DIR"

  # Validate the fine-tuned model by generating on test samples (rank 0 only)
  if [ "${SKYPILOT_NODE_RANK:-0}" -eq 0 ]; then
    head -10 ~/datasets/facebook/fairseq2-lm-gsm8k/sft_test/test.jsonl > /tmp/test_samples.jsonl
    python -m recipes.lm.generate $OUTPUT_DIR/eval --config \
      model.name=$MODEL \
      model.dtype=bfloat16 \
      tokenizer.name=$MODEL \
      common.assets.prev_checkpoint_dir=$OUTPUT_DIR/checkpoints \
      "dataset.config_overrides.paths=[/tmp/test_samples.jsonl]" \
      common.no_sweep_dir=true
  fi
  echo "All outputs saved to: $OUTPUT_DIR"