Source code for lightning.fabric.plugins.environments.torchelastic
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os

import torch.distributed
from typing_extensions import override

from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
from lightning.fabric.utilities.rank_zero import rank_zero_warn

log = logging.getLogger(__name__)


class TorchElasticEnvironment(ClusterEnvironment):
    """Environment for fault-tolerant and elastic training with `torchelastic <https://pytorch.org/elastic/>`_"""

    @property
    @override
    def creates_processes_externally(self) -> bool:
        return True

    @property
    @override
    def main_address(self) -> str:
        if "MASTER_ADDR" not in os.environ:
            rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")
            os.environ["MASTER_ADDR"] = "127.0.0.1"
        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
        return os.environ["MASTER_ADDR"]

    @property
    @override
    def main_port(self) -> int:
        if "MASTER_PORT" not in os.environ:
            rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910")
            os.environ["MASTER_PORT"] = "12910"
        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")
        return int(os.environ["MASTER_PORT"])

    @staticmethod
    @override
    def detect() -> bool:
        """Returns ``True`` if the current process was launched using the torchelastic command."""
        # if not available (for example on MacOS), `is_torchelastic_launched` is not defined
        return torch.distributed.is_available() and torch.distributed.is_torchelastic_launched()

    @override
    def set_world_size(self, size: int) -> None:
        log.debug("TorchElasticEnvironment.set_world_size was called, but setting world size is not allowed. Ignored.")

    @override
    def set_global_rank(self, rank: int) -> None:
        log.debug(
            "TorchElasticEnvironment.set_global_rank was called, but setting global rank is not allowed. Ignored."
        )

    @override
    def validate_settings(self, num_devices: int, num_nodes: int) -> None:
        if num_devices * num_nodes != self.world_size():
            raise ValueError(
                f"You set `devices={num_devices}` and `num_nodes={num_nodes}` in Lightning, but the product"
                f" ({num_devices} * {num_nodes}) does not match the world size ({self.world_size()})."
            )
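For context, a minimal usage sketch (not part of the module source above), assuming a local installation of the lightning package. The MASTER_ADDR and MASTER_PORT values below are placeholders for illustration; under a real torchrun/torchelastic launch they are exported automatically for every worker, and detect() returns True only in that case.

import os

from lightning.fabric.plugins.environments.torchelastic import TorchElasticEnvironment

# Placeholder rendezvous settings for illustration; torchrun normally exports these.
os.environ.setdefault("MASTER_ADDR", "10.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

print(TorchElasticEnvironment.detect())  # True only when launched via torchelastic/torchrun

env = TorchElasticEnvironment()
print(env.creates_processes_externally)  # True: the launcher creates the worker processes
print(env.main_address, env.main_port)   # values read back from MASTER_ADDR / MASTER_PORT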
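Similarly, a hedged sketch of the validate_settings check, assuming the environment's world size comes from the WORLD_SIZE variable that the launcher exports (hard-coded to 8 here purely for illustration): the product of devices and num_nodes must match it, otherwise the ValueError shown above is raised.

import os

from lightning.fabric.plugins.environments.torchelastic import TorchElasticEnvironment

os.environ["WORLD_SIZE"] = "8"  # normally exported by the launcher; set by hand for illustration

env = TorchElasticEnvironment()
env.validate_settings(num_devices=4, num_nodes=2)  # passes: 4 * 2 == 8

try:
    env.validate_settings(num_devices=4, num_nodes=1)  # 4 * 1 != 8
except ValueError as err:
    print(err)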