thunder.distributed.ddp

thunder.distributed.ddp(model, *, broadcast_from=0, bucket_size_in_mb=25.0)[source]

Thunder’s Distributed Data Parallel.

This function does two things. First, it broadcasts the parameters hosted on the rank specified by broadcast_from to all the other ranks in the default process group. Second, it updates backward trace generation and optimization so that each gradient is pre-averaged, i.e., divided by the world size, and asynchronously all-reduced.
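To make the second point concrete, the sketch below (plain PyTorch, a hypothetical helper rather than Thunder's internal code) illustrates what pre-averaged, asynchronous all-reduce means for a single gradient tensor.

import torch
import torch.distributed as tdist

def pre_averaged_all_reduce(grad: torch.Tensor):
    # Pre-average: divide the local gradient by the world size so that a SUM
    # all-reduce across ranks produces the averaged gradient.
    grad.div_(tdist.get_world_size())
    # Launch the all-reduce asynchronously and return the work handle.
    return tdist.all_reduce(grad, op=tdist.ReduceOp.SUM, async_op=True)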

Parameters:
  • model (Module) – A model before thunder.jit is applied.

Keyword Arguments:
  • broadcast_from (int | None) – The rank of the device hosting the parameters to broadcast. If None is passed, broadcasting is skipped, which is useful for models whose weights have already been loaded from a checkpoint. Defaults to 0.

  • bucket_size_in_mb (float) – Size in megabytes of each bucket used to group gradients for all-reduce. Defaults to 25.0.

Return type:

Module
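A minimal usage sketch, assuming a process group is already initialized and model is an ordinary nn.Module placed on this rank's device: call ddp() on the eager model first, then compile the result with thunder.jit.

import thunder
import thunder.distributed

ddp_model = thunder.distributed.ddp(model)   # broadcasts parameters from rank 0 by default
# thunder.distributed.ddp(model, broadcast_from=None)  # skips the broadcast, e.g. after loading a checkpoint
compiled = thunder.jit(ddp_model)            # jit is applied after ddp()

The full example below shows the same pattern inside a training loop launched with torchrun.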

ddp_example.py
# $ torchrun --nproc-per-node=<N_GPU> ddp_example.py
import os
import math

import torch
import torch.distributed as tdist
import torch.nn as nn
import torch.nn.functional as F

import thunder
import thunder.distributed as dist


LOCAL_RANK = int(os.environ["LOCAL_RANK"])
BATCH_SIZE = 8
IN_FEATURES = 32
OUT_FEATURES = 64
N_CLASSES = 4


def get_batch() -> tuple[torch.Tensor, torch.Tensor]:
    x = torch.randn(BATCH_SIZE, IN_FEATURES, device=f"cuda:{LOCAL_RANK}", requires_grad=True)
    y = torch.randn(BATCH_SIZE, N_CLASSES, device=f"cuda:{LOCAL_RANK}").softmax(dim=1).requires_grad_()
    return x, y


def new_gelu(a: torch.Tensor):
    return 0.5 * a * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (a + 0.044715 * torch.pow(a, 3.0))))


class MyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.l1 = nn.Linear(IN_FEATURES, OUT_FEATURES)
        self.l2 = nn.Linear(OUT_FEATURES, N_CLASSES)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = new_gelu(self.l1(x))
        return self.l2(h)


def main():
    tdist.init_process_group(backend="nccl")

    model = MyModel().to(LOCAL_RANK)
    # Apply ddp() to the eager model, then compile the result with thunder.jit.
    compiled = thunder.jit(dist.ddp(model))
    optimizer = torch.optim.AdamW(compiled.parameters())
    losses = []
    loss_all_reduce_workers = []

    for _ in range(10):
        optimizer.zero_grad()
        x, y = get_batch()
        out = compiled(x)
        loss = F.cross_entropy(out, y)
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            losses.append(loss.detach())
            loss_all_reduce_workers.append(tdist.all_reduce(losses[-1], op=tdist.ReduceOp.AVG, async_op=True))

    if LOCAL_RANK == 0:
        for i, (loss, worker) in enumerate(zip(losses, loss_all_reduce_workers)):
            assert worker.wait()
            print(f"# {i}-th loss: {loss.item()}")


if __name__ == "__main__":
    main()
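Note that every rank launches the asynchronous loss all-reduce inside the loop, while rank 0 alone waits on the returned work handles and prints the averaged loss for each iteration.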