I am trying to run my PyTorch Lightning code on a TPU in GCP.
import numpy as np # linear algebra
import pandas as pd
import os
import string
from typing import Optional
# for checkpoint
# https://pytorch-lightning.readthedocs.io/en/latest/common/weights_loading.html
from torch.utils.data import DataLoader, Dataset, random_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
import logging
logging.getLogger("lightning").setLevel(logging.ERROR)
class ModelDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.len = len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.len
class ModelDataModule(LightningDataModule):
    def __init__(self):
        super().__init__()
        self.train_root_path = './20news-bydate-train'
        self.test_root_path = './20news-bydate-test'
        self.batch_size = 64

    def get_data(self, root_path):
        data = []
        label = []
        folders = ['rec.sport.hockey', 'sci.electronics', 'rec.autos']
        for i, folder_name in enumerate(folders):
            folder_path = os.path.join(root_path, folder_name)
            files = os.listdir(folder_path)
            for file in files:
                file_path = os.path.join(folder_path, file)
                with open(file_path, 'r', encoding='latin1') as f:
                    content = f.read()
                data.append(content)
                label.append(i)
        return data, label
    def preprocess(self, words):
        # strip tabs and newlines
        table = str.maketrans('', '', '\t')
        words = [word.translate(table) for word in words]
        n_table = str.maketrans('', '', '\n')
        words = [word.translate(n_table) for word in words]
        # remove punctuation, keeping apostrophes for now
        punctuations = string.punctuation.replace("'", "")
        trans_table = str.maketrans('', '', punctuations)
        stripped_words = [word.translate(trans_table) for word in words]
        words = [word for word in stripped_words if word]
        # strip a leading and/or trailing apostrophe
        p_words = []
        for word in words:
            if word[0] == "'" and word[-1] == "'":
                word = word[1:-1]
            elif word[0] == "'":
                word = word[1:]
            p_words.append(word)
        words = p_words.copy()
        # drop digits, very short tokens, and lowercase everything
        words = [word for word in words if not word.isdigit()]
        words = [word for word in words if len(word) != 1]
        words = [word for word in words if word]
        words = [word.lower() for word in words]
        words = [word for word in words if len(word) > 2]
        return " ".join(words)
    def prepare_data(self):
        # called only on 1 process
        # get data
        train_data, train_label = self.get_data(self.train_root_path)
        test_data, test_label = self.get_data(self.test_root_path)
        # preprocess
        for i, text in enumerate(train_data):
            train_data[i] = self.preprocess(text.strip().split())
        for i, text in enumerate(test_data):
            test_data[i] = self.preprocess(text.strip().split())
        # feature engineering: TF-IDF followed by PCA
        vectorizer = TfidfVectorizer(max_df=0.75, stop_words='english')
        X_train = vectorizer.fit_transform(train_data).toarray()
        X_test = vectorizer.transform(test_data).toarray()
        pca_1k = PCA(n_components=1024)
        X_train1k = pca_1k.fit_transform(X_train)
        X_test1k = pca_1k.transform(X_test)
        print(pca_1k.explained_variance_ratio_.cumsum()[-1])
        # labels
        le = LabelEncoder()
        y_train = le.fit_transform(train_label)
        y_test = le.transform(test_label)
        # scaling
        scaler = StandardScaler()
        scaler.fit(X_train1k)
        self.train_data = torch.tensor(scaler.transform(X_train1k))
        self.test_data = torch.tensor(scaler.transform(X_test1k))
        self.train_label = torch.tensor(y_train)
        self.test_label = torch.tensor(y_test)
    def setup(self, stage: Optional[str] = None):
        # called on every process
        train_data = ModelDataset(self.train_data, self.train_label)
        test_dataset = ModelDataset(self.test_data, self.test_label)
        train_len = (len(train_data) // 10) * 8
        val_len = len(train_data) - train_len
        print(train_len, val_len)
        train_dataset, val_dataset = random_split(
            train_data, [train_len, val_len],
            generator=torch.Generator().manual_seed(42))
        self.train = train_dataset
        self.val = val_dataset
        self.test = test_dataset

    def train_dataloader(self):
        return DataLoader(dataset=self.train, batch_size=self.batch_size,
                          shuffle=True, drop_last=True)

    def val_dataloader(self):
        return DataLoader(dataset=self.val, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(dataset=self.test, batch_size=self.batch_size)
class LightningFFModel(pl.LightningModule):
    def __init__(self, input_size, hidden_size, learning_rate):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 3)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = learning_rate

    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        return output

    def cross_entropy_loss(self, outputs, labels):
        return self.criterion(outputs, labels)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        outputs = self.forward(x.float())
        loss = self.cross_entropy_loss(outputs, y)
        self.log('train_loss', loss, rank_zero_only=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        outputs = self.forward(x.float())
        loss = self.cross_entropy_loss(outputs, y)
        self.log('val_loss', loss, rank_zero_only=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        outputs = self.forward(x.float())
        loss = self.cross_entropy_loss(outputs, y)
        self.log('test_loss', loss, rank_zero_only=True)

    def configure_optimizers(self):
        return optim.SGD(self.parameters(),
                         lr=(self.lr or self.learning_rate), momentum=0.9)
if __name__ == '__main__':
    input_size = 1024
    hidden_size = 128
    model_dm = ModelDataModule()

    # train
    model = LightningFFModel(input_size, hidden_size, 0.005)
    early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00,
                                        patience=3, verbose=False, mode='min')
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath='pt_checkpoints',
        filename='epoch{epoch:02d}',
        auto_insert_metric_name=False)
    trainer = pl.Trainer(tpu_cores=8, max_epochs=5,
                         callbacks=[early_stop_callback, checkpoint_callback],
                         auto_lr_find=True,
                         log_every_n_steps=1)
    trainer.fit(model, datamodule=model_dm)
I am getting the following error:

terminate called after throwing an instance of 'c10::CUDAError'
  what():  CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
...
torch.multiprocessing.spawn.ProcessExitedException: process 7 terminated with signal SIGABRT

The full stack trace is as follows:
GPU available: True, used: False
TPU available: True, using: 8 TPU cores
IPU available: False, using: 0 IPUs
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py:1296: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.
"GPU available but not used. Set the gpus flag in your trainer"
Validation sanity check: 0it [00:00, ?it/s]/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:106: UserWarning: The dataloader, val dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
f"The dataloader, {name}, does not have many workers which may be a bottleneck."
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:106: UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Epoch 0: 0%| | 0/3 [00:00<00:00, 2368.33it/s]terminate called after throwing an instance of 'c10::CUDAError'
what(): CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Exception raised from getDevice at /opt/conda/conda-bld/pytorch_1623448265233/work/c10/cuda/impl/CUDAGuardImpl.h:38 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fbac23cfa22 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x9b5b (0x7fbac262ab5b in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x3280aaa (0x7fbac5ac9aaa in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #3: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x2a (0x7fbac5acabfa in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #4: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x53 (0x7fbb3bc9dc43 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #5: <unknown function> + 0xc9067 (0x7fbb70d75067 in /opt/conda/lib/python3.7/site-packages/pyarrow/../../../libstdc++.so.6)
frame #6: <unknown function> + 0x76db (0x7fbb8340f6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #7: clone + 0x3f (0x7fbb8278b71f in /lib/x86_64-linux-gnu/libc.so.6)
[... the same "c10::CUDAError: CUDA error: initialization error" message and identical stack trace is printed by each of the other spawned TPU processes ...]
Traceback (most recent call last):
File "pt_test.py", line 229, in <module>
trainer.fit(model, datamodule=model_dm)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
self._run(model)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 918, in _run
self._dispatch()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _dispatch
self.accelerator.start_training(self)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
self.training_type_plugin.start_training(trainer)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/tpu_spawn.py", line 267, in start_training
xmp.spawn(self.new_process, **self.xmp_spawn_kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch_xla/distributed/xla_multiprocessing.py", line 394, in spawn
start_method=start_method)
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 136, in join
signal_name=name
torch.multiprocessing.spawn.ProcessExitedException: process 7 terminated with signal SIGABRT
python: 3.7
pytorch: 1.9
pytorch-lightning: 1.4.4
cuda: 11.1
tpu: v2-8
This same code trains fine on a single GPU; the error only occurs at trainer.fit when using the TPU.
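For reference, the working single-GPU run uses the same script with only the Trainer changed, roughly like this (a sketch; the gpus flag is the only assumed difference from the TPU version above):

# single-GPU Trainer used for the run that works (sketch, assuming gpus=1)
trainer = pl.Trainer(gpus=1, max_epochs=5,
                     callbacks=[early_stop_callback, checkpoint_callback],
                     auto_lr_find=True,
                     log_every_n_steps=1)
trainer.fit(model, datamodule=model_dm)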
What does c10::CUDAError mean? Is it something related to CUDA version 10? Any ideas what's going wrong?