Skip to content

Instantly share code, notes, and snippets.

@khlam
Last active October 13, 2021 19:56
Show Gist options
  • Select an option

  • Save khlam/2b999576ac30d73ecddd99da33637a30 to your computer and use it in GitHub Desktop.

Select an option

Save khlam/2b999576ac30d73ecddd99da33637a30 to your computer and use it in GitHub Desktop.
pytorch-csv-lazy-read-csv
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
class lazyLoadCSVDataset(Dataset):
    """Stream a numeric CSV file through a PyTorch ``Dataset`` chunk by chunk.

    Only ``rows_each_fetch`` rows are held in memory at a time. Rows are
    served sequentially from the current chunk; when the chunk is used up the
    next one is read from disk, and when the file is exhausted reading starts
    over from the first data row.

    NOTE: ``__getitem__`` ignores the requested index. Items always come back
    in file order (or in a per-chunk random order when ``shuffle`` is true),
    so sampler-level shuffling has no effect on which row is returned.

    Every column except the last is returned as the feature vector and the
    last column as the label. All columns must be convertible to float32.

    Args:
        data_CSV_path: Path of the CSV file. The first non-comment line is
            treated as the header; lines starting with ``#`` are comments.
        rows_each_fetch: Number of rows read from disk per chunk.
        shuffle: If true, rows are randomly permuted inside every chunk.
        device: Device the chunk tensor is moved to. When omitted, a
            module-level ``device`` variable is used if one exists, otherwise
            the CPU.

    Raises:
        ValueError: If ``rows_each_fetch`` is smaller than 1 or the file
            contains no data rows.
    """

    def __init__(self, data_CSV_path, rows_each_fetch=500, shuffle=True, device=None):
        super().__init__()
        if rows_each_fetch < 1:
            raise ValueError("rows_each_fetch must be at least 1")

        self.dataPath = data_CSV_path
        self.rowsEachFetch = rows_each_fetch
        self.shuffle = shuffle

        if device is None:
            # Backward compatibility: earlier versions read a module-level
            # ``device`` global instead of taking a parameter.
            device = globals().get("device")
        self.device = torch.device("cpu" if device is None else device)

        self.n = self._count_data_rows()
        # Ceiling division: a trailing partial chunk still needs its own fetch.
        self.totalFetchCalls = -(-self.n // self.rowsEachFetch)

        self.reader = self._open_reader()
        self.data = None
        self.chunkItr = 0  # position of the next row to serve inside self.data
        self._load_next_chunk()

    def _count_data_rows(self):
        """Return the number of data rows (header, blank and comment lines excluded)."""
        # The context manager closes the handle; the bare open() used before leaked it.
        with open(self.dataPath) as handle:
            content_lines = sum(
                1
                for line in handle
                if line.strip() and not line.lstrip().startswith('#')
            )
        # One of the content lines is the header; never report a negative length.
        return max(content_lines - 1, 0)

    def _open_reader(self):
        """Create a fresh chunked pandas reader positioned at the first data row."""
        return pd.read_csv(
            self.dataPath,
            sep=',',
            chunksize=self.rowsEachFetch,
            comment='#',
            header=0,
            iterator=True,
        )

    def _restart_reader(self):
        """Close the current reader and start again from the top of the file."""
        self.reader.close()
        self.reader = self._open_reader()

    def _load_next_chunk(self):
        """Read the next chunk into ``self.data``, rewinding once if the file is exhausted."""
        for _ in range(2):
            try:
                frame = self.reader.get_chunk(self.rowsEachFetch)
            except StopIteration:
                # Happens when the previous chunk ended exactly at end of file.
                frame = None
            if frame is not None and len(frame) > 0:
                break
            self._restart_reader()
        else:
            raise ValueError("no data rows found in %r" % (self.dataPath,))

        chunk = torch.as_tensor(frame.values, dtype=torch.float32).to(self.device)
        if self.shuffle:
            chunk = chunk[torch.randperm(chunk.shape[0])]
        self.data = chunk
        self.chunkItr = 0

    def __len__(self):
        """Return the number of data rows in the CSV file."""
        return self.n

    def reachedEndOfChunk(self):
        """Report whether every row of the current chunk has been served.

        When the chunk is used up the chunk cursor is reset to 0, and if the
        chunk was shorter than ``rowsEachFetch`` (i.e. it was the last one in
        the file) the reader is rewound to the start of the file.

        Returns:
            True if a new chunk must be fetched, False otherwise.
        """
        if self.chunkItr != self.data.shape[0]:
            return False
        self.chunkItr = 0
        if self.data.shape[0] < self.rowsEachFetch:
            self._restart_reader()
        return True

    def __getitem__(self, index):
        """Return the next ``(features, label)`` pair; ``index`` is ignored.

        Returns:
            A tuple ``(x, y)`` where ``x`` holds all columns but the last and
            ``y`` is the last column of the row, both float32 tensors.
        """
        if self.reachedEndOfChunk():
            self._load_next_chunk()
        row = self.data[self.chunkItr]
        x = row[:-1]  # data
        y = row[-1]  # labels
        # your transforms here
        self.chunkItr += 1
        return x, y
# Example usage: stream the training CSV in chunks of 300,000 rows, with the
# rows shuffled inside each chunk. `trainPath` and `batchSize` must be defined
# by the surrounding script.
trainDataset = lazyLoadCSVDataset(
    data_CSV_path=trainPath,
    rows_each_fetch=300000,
    shuffle=True,
)
# The dataset hands out rows in its own order and ignores the index it is
# asked for, so the loader's shuffle flag only randomizes indices that are
# never used.
trainLoader = DataLoader(
    trainDataset,
    batch_size=batchSize,
    shuffle=True,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment