Source code for lightning.pytorch.profilers.xla
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing_extensions import override
from lightning.fabric.accelerators.xla import _XLA_AVAILABLE
from lightning.pytorch.profilers.profiler import Profiler
log = logging.getLogger(__name__)
class XLAProfiler(Profiler):
    # Hooks recorded with ``xp.StepTrace`` and tagged with a running step number.
    STEP_FUNCTIONS = {"validation_step", "test_step", "predict_step"}
    # Every hook that gets recorded at all: the step hooks above plus the two
    # hooks that are recorded with a plain ``xp.Trace``.
    RECORD_FUNCTIONS = STEP_FUNCTIONS | {"training_step", "backward"}

    def __init__(self, port: int = 9012) -> None:
        """Profiler that uses the Cloud TPU performance tools to help debug and optimize the training workload
        performance of your models.

        Args:
            port: port on which the profiler server is started. If the port is
                invalid or already in use, an exception is raised.

        Raises:
            ModuleNotFoundError:
                If the ``_XLA_AVAILABLE`` requirement check is falsy.

        """
        if not _XLA_AVAILABLE:
            raise ModuleNotFoundError(str(_XLA_AVAILABLE))
        super().__init__(dirpath=None, filename=None)
        self.port = port
        # action name -> trace object that has been entered but not yet exited
        self._recording_map: dict = {}
        # action name -> number of step traces opened so far for that action
        self._step_recoding_map: dict = {}
        # flips to True once the profiler server has been started
        self._start_trace: bool = False

    @override
    def start(self, action_name: str) -> None:
        """Open a trace for ``action_name`` if its hook is one of ``RECORD_FUNCTIONS``.

        The profiler server is started on ``self.port`` the first time a recorded
        hook is seen. Hooks listed in ``STEP_FUNCTIONS`` are recorded with a
        ``StepTrace`` carrying a per-action step number; the remaining recorded
        hooks use a plain ``Trace``. Any other action is ignored.

        Args:
            action_name: name of the action being profiled.

        """
        # Function-level import: executed on every call, before the hook check.
        import torch_xla.debug.profiler as xp

        # The action name is formatted as '[TYPE]{class name}.{hook name}'
        # Example: [LightningModule]BoringModel.training_step
        hook_name = action_name.rsplit(".", 1)[-1]
        if hook_name not in self.RECORD_FUNCTIONS:
            return

        if not self._start_trace:
            self.server = xp.start_server(self.port)
            self._start_trace = True

        if hook_name in self.STEP_FUNCTIONS:
            step_num = self._get_step_num(action_name)
            trace = xp.StepTrace(action_name, step_num=step_num)
        else:
            trace = xp.Trace(action_name)

        trace.__enter__()
        # Remember the open trace so that ``stop`` can exit it.
        self._recording_map[action_name] = trace

    @override
    def stop(self, action_name: str) -> None:
        """Exit and forget the trace opened for ``action_name``; do nothing if there is none.

        Args:
            action_name: name of the action whose trace should be closed.

        """
        if action_name not in self._recording_map:
            return
        open_trace = self._recording_map[action_name]
        open_trace.__exit__(None, None, None)
        del self._recording_map[action_name]

    def _get_step_num(self, action_name: str) -> int:
        """Increment and return the step counter of ``action_name`` (the first call yields 1)."""
        step_num = self._step_recoding_map.get(action_name, 0) + 1
        self._step_recoding_map[action_name] = step_num
        return step_num