
Dataset And DataLoader - PyTorch Beginner 09

04 Jan 2020

Learn all the basics you need to get started with this deep learning framework! In this part we see how to use the built-in Dataset and DataLoader classes and improve our pipeline with batch training. We also write our own custom Dataset class and load one of the datasets that are available in torchvision.

All code from this course can be found on GitHub.

Dataset and DataLoader in PyTorch

import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

# gradient computation etc. is not efficient for the whole data set
# -> divide dataset into small batches

'''
# training loop
for epoch in range(num_epochs):
    # loop over all batches
    for i in range(total_batches):
        batch_x, batch_y = ...
'''

# epoch = one forward and backward pass of ALL training samples
# batch_size = number of training samples used in one forward/backward pass
# number of iterations = number of passes, each pass (forward+backward) using [batch_size] number of samples
# e.g.: 100 samples, batch_size=20 -> 100/20 = 5 iterations for 1 epoch
# --> DataLoader can do the batch computation for us

# Implement a custom Dataset:
# - inherit Dataset
# - implement __init__, __getitem__, and __len__

class WineDataset(Dataset):

    def __init__(self):
        # Initialize data, download, etc.
        # read with numpy or pandas
        xy = np.loadtxt('./data/wine/wine.csv', delimiter=',', dtype=np.float32, skiprows=1)
        self.n_samples = xy.shape[0]

        # here the first column is the class label, the rest are the features
        self.x_data = torch.from_numpy(xy[:, 1:])   # size [n_samples, n_features]
        self.y_data = torch.from_numpy(xy[:, [0]])  # size [n_samples, 1]

    # support indexing such that dataset[i] can be used to get the i-th sample
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples

# create dataset
dataset = WineDataset()

# get first sample and unpack
first_data = dataset[0]
features, labels = first_data
print(features, labels)

# Load whole dataset with DataLoader
# shuffle: shuffle data, good for training
# num_workers: faster loading with multiple subprocesses
# !!! IF YOU GET AN ERROR DURING LOADING, SET num_workers TO 0 !!!
train_loader = DataLoader(dataset=dataset,
                          batch_size=4,
                          shuffle=True,
                          num_workers=2)

# convert to an iterator and look at one random sample
dataiter = iter(train_loader)
data = next(dataiter)
features, labels = data
print(features, labels)

# Dummy Training loop
num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples / 4)
print(total_samples, n_iterations)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # here: 178 samples, batch_size = 4, n_iterations = ceil(178/4) = 45
        # Run your training process
        if (i + 1) % 5 == 0:
            print(f'Epoch: {epoch+1}/{num_epochs}, Step {i+1}/{n_iterations} | Inputs {inputs.shape} | Labels {labels.shape}')

# some famous datasets are available in torchvision.datasets
# e.g. MNIST, Fashion-MNIST, CIFAR10, COCO
train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           transform=torchvision.transforms.ToTensor(),
                                           download=True)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=3,
                          shuffle=True)

# look at one random sample
dataiter = iter(train_loader)
data = next(dataiter)
inputs, targets = data
print(inputs.shape, targets.shape)
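The iteration count in the dummy loop comes from ceil(total_samples / batch_size): with 178 wine samples and a batch size of 4, one epoch takes 45 iterations, and the last batch holds only the 2 leftover samples. Here is a quick sketch to verify this; it reuses the WineDataset defined above, and you can pass drop_last=True to the DataLoader if you want to skip the smaller final batch:

from torch.utils.data import DataLoader

loader = DataLoader(dataset=WineDataset(), batch_size=4, shuffle=False)

# collect the size of every batch in one epoch
batch_sizes = [inputs.shape[0] for inputs, labels in loader]

print(len(batch_sizes))   # 45 batches -> ceil(178 / 4)
print(batch_sizes[-1])    # 2 samples left over in the last batch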
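The dummy loop above leaves the actual training step as a placeholder ("Run your training process"). As a minimal sketch of what could go there, the snippet below plugs in a simple linear classifier. The model, loss, and optimizer are illustrative choices, not part of the original tutorial; it reuses the WineDataset from above and assumes the UCI wine data, i.e. 13 features and class labels 1-3 in the first column:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset=WineDataset(), batch_size=4, shuffle=True)

model = nn.Linear(13, 3)          # 13 features -> 3 wine classes (assumed)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(2):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        # CrossEntropyLoss expects class indices 0..2 as a 1D LongTensor,
        # so shift the labels 1..3 down by one
        loss = criterion(outputs, labels.squeeze(1).long() - 1)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()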