To run the video classification experiment introduced here, I'm using the following script:
import pytorchvideo.models.resnet
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pytorch_lightning
import pytorchvideo.data
import torch.utils.data
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)
class KineticsDataModule(pytorch_lightning.LightningDataModule):
    def __init__(self):
        super().__init__()
        # Dataset configuration
        self._DATA_PATH = '/mnt1'
        self._DATA_PATH_val = '/mnt2'
        self._CLIP_DURATION = 2  # Duration (in seconds) of the sampled clip for each video
        self._BATCH_SIZE = 8
        self._NUM_WORKERS = 8  # Number of parallel processes fetching data
        self.prepare_data_per_node = False
    def train_dataloader(self):
        """
        Create the Kinetics train partition from the list of video labels
        in {self._DATA_PATH}/train_25.csv. Add a transform that subsamples and
        normalizes the video before applying the scale, crop and flip augmentations.
        """
        train_transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(8),
                            Lambda(lambda x: x / 255.0),
                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                            RandomShortSideScale(min_size=256, max_size=320),
                            RandomCrop(244),
                            RandomHorizontalFlip(p=0.5),
                        ]
                    ),
                ),
            ]
        )
        train_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join(self._DATA_PATH, "train_25.csv"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=train_transform,
        )
        return torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )
    def _log_hyperparams(self, params):
        # params is an argparse.Namespace
        # your code to record hyperparameters goes here
        pass
    def val_dataloader(self):
        """
        Create the Kinetics validation partition from the list of video labels
        in {self._DATA_PATH_val}/val_25.csv, applying the same subsample, normalize,
        scale, crop and flip transforms as for training. Note that Lightning
        discovers this hook by name, so it must be called `val_dataloader`.
        """
        valid_transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(8),
                            Lambda(lambda x: x / 255.0),
                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                            RandomShortSideScale(min_size=256, max_size=320),
                            RandomCrop(244),
                            RandomHorizontalFlip(p=0.5),
                        ]
                    ),
                ),
            ]
        )
        valid_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join(self._DATA_PATH_val, "val_25.csv"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=valid_transform,
        )
        return torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )
def make_kinetics_resnet():
    return pytorchvideo.models.resnet.create_resnet(
        input_channel=3,      # RGB input from Kinetics
        model_depth=50,       # For the tutorial let's just use a 50-layer network
        model_num_class=400,  # Kinetics has 400 classes, so our final head must match
        norm=nn.BatchNorm3d,
        activation=nn.ReLU,
    )
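# (A hypothetical sanity check I added for clarity, not part of the original
# tutorial: the network takes a (B, C, T, H, W) tensor and returns per-class
# logits, so a dummy forward pass like the following should give a (1, 400)
# output.)
#
#   model = make_kinetics_resnet()
#   logits = model(torch.randn(1, 3, 8, 244, 244))  # expect torch.Size([1, 400])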
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = make_kinetics_resnet()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is
        # the format provided by the dataset.
        y_hat = self.model(batch["video"])
        # Compute the cross-entropy loss; loss.backward() is called behind the
        # scenes by PyTorch Lightning after this method returns.
        loss = F.cross_entropy(y_hat, batch["label"])
        # Log the train loss to TensorBoard
        self.log("train_loss", loss.item())
        return loss

    def validation_step(self, batch, batch_idx):
        y_hat = self.model(batch["video"])
        loss = F.cross_entropy(y_hat, batch["label"])
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """
        Set up the Adam optimizer. Note that this function can also return a
        learning-rate scheduler, which is usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-1)
def train():
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # restrict training to a single GPU
    classification_module = VideoClassificationLightningModule()
    data_module = KineticsDataModule()
    trainer = pytorch_lightning.Trainer()
    trainer.fit(classification_module, data_module)

if __name__ == '__main__':
    train()
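For context, each of the CSV files referenced above (train_25.csv, val_25.csv) lists one video per line as a space-separated `<video path> <integer label>` pair, which is the layout I understand the PyTorchVideo labeled-video loader to expect. The paths below are made-up placeholders, just to show the shape of the file:

    /mnt1/videos/abseiling/vid_0001.mp4 0
    /mnt1/videos/air_drumming/vid_0002.mp4 1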
Unfortunately, I receive the following error from the trainer:

It is worth mentioning that I use only one GPU for training.
How can I fix the problem?
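For what it's worth, below is the kind of standalone check I would use to narrow the problem down to either the data pipeline or the Trainer. It is only a sketch built from the script above (the "video"/"label" batch keys and the expected shapes follow from the transforms I use), not code I claim already works on my setup:

    # Sketch: pull one batch outside the Trainer to verify the data pipeline.
    dm = KineticsDataModule()
    loader = dm.train_dataloader()
    batch = next(iter(loader))
    print(batch["video"].shape)  # expected: (8, 3, 8, 244, 244) = (B, C, T, H, W)
    print(batch["label"].shape)  # expected: (8,)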