Back to course overview

Dataset And Dataloader - PyTorch Beginner 09

Learn all the basics you need to get started with this deep learning framework! In this part we see how to use the built-in Dataset and DataLoader classes and improve our pipeline with batch training. We also write our own custom Dataset class and load one of the available built-in datasets.

All code from this course can be found on GitHub.

Dataset and DataLoader in PyTorch

import math

import numpy as np
import torch
import torchvision
# BUG FIX: original read "from import Dataset, DataLoader" (SyntaxError);
# these classes live in torch.utils.data.
from torch.utils.data import Dataset, DataLoader

# gradient computation etc. is not efficient for the whole data set
# -> divide dataset into small batches
'''
# training loop
for epoch in range(num_epochs):
    # loop over all batches
    for i in range(total_batches):
        batch_x, batch_y = ...
'''
# epoch = one forward and backward pass of ALL training samples
# batch_size = number of training samples used in one forward/backward pass
# number of iterations = number of passes, each pass (forward+backward)
# using [batch_size] number of samples
# e.g.: 100 samples, batch_size=20 -> 100/20 = 5 iterations for 1 epoch
# --> DataLoader can do the batch computation for us

# Implement a custom Dataset:
# inherit Dataset
# implement __init__, __getitem__, and __len__


class WineDataset(Dataset):
    """Wine dataset read from a local CSV file.

    The first CSV column is the class label; the remaining columns are
    the features. Supports indexing (``dataset[i]``) and ``len(dataset)``
    so it can be wrapped in a DataLoader.
    """

    def __init__(self):
        # Initialize data, download, etc.
        # read with numpy or pandas (header row skipped)
        xy = np.loadtxt('./data/wine/wine.csv', delimiter=',',
                        dtype=np.float32, skiprows=1)
        self.n_samples = xy.shape[0]

        # here the first column is the class label, the rest are the features
        self.x_data = torch.from_numpy(xy[:, 1:])   # size [n_samples, n_features]
        self.y_data = torch.from_numpy(xy[:, [0]])  # size [n_samples, 1]

    # support indexing such that dataset[i] can be used to get the i-th sample
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples


# create dataset
dataset = WineDataset()

# get first sample and unpack
first_data = dataset[0]
features, labels = first_data
print(features, labels)

# Load whole dataset with DataLoader
# shuffle: shuffle data, good for training
# num_workers: faster loading with multiple subprocesses
# !!! IF YOU GET AN ERROR DURING LOADING, SET num_workers TO 0 !!!
train_loader = DataLoader(dataset=dataset,
                          batch_size=4,
                          shuffle=True,
                          num_workers=2)

# convert to an iterator and look at one random sample
dataiter = iter(train_loader)
# BUG FIX: original was "data = features, labels = data", which never calls
# next() on the iterator; fetch one batch explicitly, then unpack it.
data = next(dataiter)
features, labels = data
print(features, labels)

# Dummy Training loop
num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples / 4)
print(total_samples, n_iterations)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # here: 178 samples, batch_size = 4, n_iters=178/4=44.5 -> 45 iterations
        # Run your training process
        if (i + 1) % 5 == 0:
            print(f'Epoch: {epoch+1}/{num_epochs}, Step {i+1}/{n_iterations}'
                  f'| Inputs {inputs.shape} | Labels {labels.shape}')

# some famous datasets are available in torchvision.datasets
# e.g. MNIST, Fashion-MNIST, CIFAR10, COCO
train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           transform=torchvision.transforms.ToTensor(),
                                           download=True)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=3,
                          shuffle=True)

# look at one random sample
dataiter = iter(train_loader)
# BUG FIX: same missing next() call as above.
data = next(dataiter)
inputs, targets = data
print(inputs.shape, targets.shape)

FREE VS Code / PyCharm Extensions I Use

🪁 Code faster with Kite, AI-powered autocomplete: Link *

✅ Write cleaner code with Sourcery, instant refactoring suggestions: Link *

* These are affiliate links. By clicking on it you will not have any additional costs, instead you will support me and my project. Thank you! 🙏

Check out my Courses