# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
from pytorch_lightning.strategies.parallel import ParallelStrategy
from pytorch_lightning.trainer.states import RunningStage, TrainerFn
from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE
from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.cloud_io import get_filesystem
from pytorch_lightning.utilities.data import _get_dataloader_init_kwargs
from pytorch_lightning.utilities.enums import PrecisionType
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.types import STEP_OUTPUT

if _POPTORCH_AVAILABLE:
    import poptorch
else:
    poptorch = None


class LightningIPUModule(_LightningModuleWrapperBase):
    def __init__(
        self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int]
    ) -> None:
        super().__init__(pl_module)
        self.precision = precision

    def forward(self, *inputs: Any, **kwargs: Any) -> Any:
        if self.precision in (PrecisionType.MIXED, PrecisionType.HALF):
            inputs = self._move_float_tensors_to_half(inputs)
        return super().forward(*inputs, **kwargs)

    @staticmethod
    def batch_to(data: torch.Tensor) -> torch.Tensor:
        return data.half()

    def _move_float_tensors_to_half(self, batch: Any) -> Any:
        batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to)
        return batch
class IPUStrategy(ParallelStrategy):
    """Plugin for training on IPU devices."""

    strategy_name = "ipu_strategy"

    def __init__(
        self,
        accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None,
        device_iterations: int = 1,
        autoreport: bool = False,
        autoreport_dir: Optional[str] = None,
        parallel_devices: Optional[List[torch.device]] = None,
        cluster_environment: Optional[ClusterEnvironment] = None,
        checkpoint_io: Optional[CheckpointIO] = None,
        precision_plugin: Optional[PrecisionPlugin] = None,
        training_opts: Optional["poptorch.Options"] = None,
        inference_opts: Optional["poptorch.Options"] = None,
    ) -> None:
        """
        Arguments:

            device_iterations: Number of iterations to run on device at once before returning to host.
                This can be used as an optimization to speed up training.
                https://docs.graphcore.ai/projects/poptorch-user-guide/en/0.1.67/batching.html
            autoreport: Enable auto-reporting for IPUs using PopVision
                https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html
            autoreport_dir: Optional directory to store autoReport output.
            training_opts: Optional ``poptorch.Options`` to override the default created options for training.
            inference_opts: Optional ``poptorch.Options`` to override the default created options for
                validation/testing and predicting.
        """
        super().__init__(
            accelerator=accelerator,
            parallel_devices=parallel_devices,
            cluster_environment=cluster_environment,
            checkpoint_io=checkpoint_io,
            precision_plugin=precision_plugin,
        )
        if not _IPU_AVAILABLE:
            raise MisconfigurationException(
                "The IPU Accelerator requires IPU devices to run. "
                "Learn more or get started with IPUs at https://www.graphcore.ai/getstarted"
            )

        self.device_iterations = device_iterations
        self.autoreport = autoreport
        self.autoreport_dir = autoreport_dir
        self.poptorch_models = {}
        self._training_opts = training_opts
        self._inference_opts = inference_opts

        if self.autoreport:
            options = {"autoReport.all": self.autoreport}
            if self.autoreport_dir:
                self._fs = get_filesystem(str(self.autoreport_dir))
                self._fs.makedirs(self.autoreport_dir, exist_ok=True)
                options["autoReport.directory"] = self.autoreport_dir
            os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options)

        self._update_dataloader_original: Optional[Callable] = None
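    # The following is a minimal usage sketch, not part of this module: it assumes a
    # machine with 4 IPUs, a hypothetical `MyLightningModule`, and a Lightning version
    # where this strategy instance can be passed directly to the Trainer.
    #
    #     trainer = pl.Trainer(
    #         accelerator="ipu",
    #         devices=4,
    #         strategy=IPUStrategy(device_iterations=8, autoreport=True),
    #     )
    #     trainer.fit(MyLightningModule())
    #
    # Custom ``poptorch.Options`` can be supplied via ``training_opts``/``inference_opts``
    # to override the options this strategy would otherwise create in ``_create_opts``.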
    def setup(self, trainer: "pl.Trainer") -> None:
        # set the `accumulate_grad_batches` property as early as possible
        self._handle_gradient_accumulation_steps()

        # patch the dataloader creation function with the custom `poptorch.DataLoader`.
        # this violates the intended control flow for the plugins, but since this is experimental, we have chosen
        # to use the simpler solution before adding abstractions to override the `DataLoader` class
        self._update_dataloader_original = pl.trainer.connectors.data_connector._update_dataloader
        pl.trainer.connectors.data_connector._update_dataloader = self._convert_to_poptorch_loader

        super().setup(trainer)

        model = LightningIPUModule(self.lightning_module, self.precision_plugin.precision)
        self.model = model

        # reset the backup
        self.poptorch_models = {}

        # Separate models are instantiated for different stages, but they share the same weights on host.
        # When validation/test models are run, weights are synced first.
        trainer_fn = self.lightning_module.trainer.state.fn
        if trainer_fn in (TrainerFn.FITTING, TrainerFn.TUNING):
            # Create model for training and validation which will run on fit
            training_opts = self.training_opts
            inference_opts = self.inference_opts
            optimizer = self.lightning_module.trainer.optimizers[0]
            model = poptorch.trainingModel(model=model, options=training_opts, optimizer=optimizer)
            self.poptorch_models[RunningStage.TRAINING] = model

            if self.lightning_module.trainer.enable_validation:
                model = poptorch.inferenceModel(model=model, options=inference_opts)
                self.poptorch_models[RunningStage.VALIDATING] = model
        elif trainer_fn == TrainerFn.VALIDATING:
            model = poptorch.inferenceModel(model=model, options=self.inference_opts)
            self.poptorch_models[RunningStage.VALIDATING] = model
        elif trainer_fn == TrainerFn.TESTING:
            model = poptorch.inferenceModel(model=model, options=self.inference_opts)
            self.poptorch_models[RunningStage.TESTING] = model
        elif trainer_fn == TrainerFn.PREDICTING:
            model = poptorch.inferenceModel(model=model, options=self.inference_opts)
            self.poptorch_models[RunningStage.PREDICTING] = model
    def setup_optimizers(self, trainer: "pl.Trainer") -> None:
        super().setup_optimizers(trainer)

        if len(self.optimizers) > 1:
            raise MisconfigurationException("IPUs currently only support one optimizer.")
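    # Note: the single-optimizer restriction above mirrors how `setup` builds the
    # `poptorch.trainingModel`, which is constructed with exactly one optimizer.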
    @property
    def replication_factor(self) -> int:
        if not self.lightning_module or not self.poptorch_models:
            # The plugin has been passed in by the user and has not been connected to the Trainer.
            # Check if the user has passed in custom poptorch.Options to infer number of IPUs being used.
            # In this scenario we prioritize the training options.
            if self._training_opts:
                return self._training_opts.replication_factor
            if self._inference_opts:
                return self._inference_opts.replication_factor
            return len(self.parallel_devices)

        stage = self.lightning_module.trainer.state.stage
        return self.poptorch_models[stage]._options.toDict()["replication_factor"]

    def _create_opts(self, training: bool) -> "poptorch.Options":
        opts = poptorch.Options()
        opts.deviceIterations(self.device_iterations)
        opts.replicationFactor(self.replication_factor)
        gradient_accumulation = self.lightning_module.trainer.accumulate_grad_batches if training else 1
        opts.Training.gradientAccumulation(gradient_accumulation)

        if os.environ.get("PL_GLOBAL_SEED"):
            opts.randomSeed(int(os.environ["PL_GLOBAL_SEED"]))
        return opts

    @property
    def training_opts(self) -> "poptorch.Options":
        if self._training_opts is None:
            self._training_opts = self._create_opts(training=True)
        return self._training_opts

    @property
    def inference_opts(self) -> "poptorch.Options":
        if self._inference_opts is None:
            self._inference_opts = self._create_opts(training=False)
        return self._inference_opts

    @property
    def lightning_module(self) -> Optional["pl.LightningModule"]:
        return self.model.module if isinstance(self.model, LightningIPUModule) else self.model

    def _convert_to_poptorch_loader(
        self, dataloader: DataLoader, sampler, mode: Optional[RunningStage] = None
    ) -> "poptorch.DataLoader":
        if isinstance(dataloader, poptorch.DataLoader):
            # the user is returning the `poptorch.DataLoader` directly, don't change anything.
            return dataloader

        dl_kwargs = _get_dataloader_init_kwargs(dataloader, sampler)
        opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts
        dataloader = poptorch.DataLoader(opts, **dl_kwargs)
        return dataloader

    def _handle_gradient_accumulation_steps(self) -> None:
        """Override the trainer.accumulation_scheduler to act as ``accumulate_grad_batches=1`` if gradient
        accumulation has been set.

        ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation internally.
        """
        accumulation_scheduler = self.lightning_module.trainer.accumulation_scheduler

        if accumulation_scheduler.epochs != [0]:
            raise MisconfigurationException(
                "IPUs currently do not support different `accumulate_grad_batches` at different epochs."
            )

        # TODO(@tchaton): Add support for accumulate_grad_batches being a dictionary
        accumulation_scheduler.scheduling.update({0: 1})

    @property
    def _n_replicate(self):
        opts = self.training_opts if self.lightning_module.training else self.inference_opts
        accumulate_grad_batches = opts.Training.gradient_accumulation
        device_iterations = opts.device_iterations
        replication_factor = opts.replication_factor
        return replication_factor * device_iterations * accumulate_grad_batches

    def _prepare_input(self, args: Any):
        def to_tuple(x):
            return tuple(x)

        def to_tensor(x):
            return torch.tensor(x).unsqueeze(0).repeat(self._n_replicate)

        args = apply_to_collection(args, dtype=list, function=to_tuple)
        args = apply_to_collection(args, dtype=(int, float), function=to_tensor)
        return args

    def _step(self, stage: RunningStage, *args: Any, **kwargs: Any):
        args = self._prepare_input(args)
        poptorch_model = self.poptorch_models[stage]
        self.lightning_module._running_torchscript = True
        out = poptorch_model(*args, **kwargs)
        self.lightning_module._running_torchscript = False
        return out
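    # Worked example for `_n_replicate` above (illustrative numbers, not defaults):
    # with replication_factor=2, device_iterations=4 and gradient_accumulation=8,
    # each host-side step covers 2 * 4 * 8 = 64 micro-batches, so `_prepare_input`
    # repeats scalar arguments 64 times to match what the IPUs expect.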
    def _compiled(self, model: Any):
        # Required to ensure we only attach compiled models, as they are compiled lazily.
        return model._executable is not None

    def _detach_models(self):
        """Detaches all stage specific models from IPU devices."""
        for k, model in self.poptorch_models.items():
            if self._compiled(model) and model.isAttachedToDevice():
                model.detachFromDevice()

    def _load_model(self, stage: str):
        """Loads the stage specific accelerator model onto device if compiled and not attached to IPU devices.

        Args:
            stage: The stage to load
        """
        self._detach_models()
        model = self.poptorch_models[stage]
        if self._compiled(model) and not model.isAttachedToDevice():
            model.attachToDevice()
    def on_train_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
        # Updates optimizer stats if LR scheduler modified the optimizer state
        optimizer = self.optimizers[0]
        self.poptorch_models[RunningStage.TRAINING].setOptimizer(optimizer)