Source code for lightning.fabric.plugins.environments.lightning
# Copyright The Lightning AI team.## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.importosimportsocketfromlightning.fabric.plugins.environments.cluster_environmentimportClusterEnvironmentfromlightning.fabric.utilities.rank_zeroimportrank_zero_only
class LightningEnvironment(ClusterEnvironment):
    """The default environment used by Lightning for a single node or free cluster (not managed).

    There are two modes the Lightning environment can operate with:

    1. The user only launches the main process by :code:`python train.py ...` with no additional environment
       variables set. Lightning will spawn new worker processes for distributed training in the current node.
    2. The user launches all processes manually or with utilities like :code:`torch.distributed.launch`.
       The appropriate environment variables need to be set, and at minimum :code:`LOCAL_RANK`.

    If the main address and port are not provided, the default environment will choose them automatically. It is
    recommended to use this default environment for single-node distributed training as it provides a convenient
    way to launch the training script.

    """

    def __init__(self) -> None:
        super().__init__()
        # -1 is the "not resolved yet" sentinel; ``main_port`` fills it in on first access.
        self._main_port: int = -1
        # Initial rank / world-size values.
        self._global_rank: int = 0
        self._world_size: int = 1

    @property
    def creates_processes_externally(self) -> bool:
        """Returns whether the cluster creates the processes or not.

        If at least :code:`LOCAL_RANK` is available as environment variable, Lightning assumes the user acts as the
        process launcher/job scheduler and Lightning will not launch new processes.

        """
        return "LOCAL_RANK" in os.environ

    @property
    def main_address(self) -> str:
        """Return the ``MASTER_ADDR`` environment variable, or ``"127.0.0.1"`` if it is not set."""
        return os.environ.get("MASTER_ADDR", "127.0.0.1")

    @property
    def main_port(self) -> int:
        """Return the main port, resolving it on first access and caching the result.

        The value is taken from the ``MASTER_PORT`` environment variable when it is set; otherwise a free port
        is requested from :func:`find_free_network_port`.

        Raises:
            ValueError: If ``MASTER_PORT`` is set but cannot be converted to an integer.

        """
        if self._main_port == -1:
            env_port = os.environ.get("MASTER_PORT")
            # Only probe for a free port when the variable is absent. Passing the probe as the default of
            # ``os.environ.get`` would open and bind a socket on every first access, even when unused.
            self._main_port = int(env_port) if env_port is not None else find_free_network_port()
        return self._main_port
def find_free_network_port() -> int:
    """Finds a free port on localhost.

    It is useful in single-node training when we don't want to connect to a real main node but have to set the
    `MASTER_PORT` environment variable.

    Returns:
        The port number the operating system assigned to a temporary socket bound to port ``0``.

    """
    # The context manager closes the socket even if ``bind``/``getsockname`` raises, so no descriptor leaks.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        # Binding to port 0 asks the OS to pick any currently unused port.
        sock.bind(("", 0))
        return sock.getsockname()[1]
To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. Read PyTorch Lightning's Privacy Policy.