khlam · October 13, 2021 19:56
diff --git a/pytorch-csv-lazy-read-csv.py b/pytorch-csv-lazy-read-csv.py
 import numpy as np
 import pandas as pd

 import torch

 from torch.utils.data import Dataset, DataLoader

 class lazyLoadCSVDataset(Dataset):
  """Serve rows of a large CSV file while holding only one chunk in memory.

  The file is read with pandas in chunks of ``rows_each_fetch`` rows. Every
  row is converted to float32; all columns but the last form the sample and
  the last column is the label.

  NOTE: rows are handed out sequentially from the current chunk. The
  ``index`` passed to ``__getitem__`` is ignored, so the only shuffling is
  the optional per-chunk shuffle done by this class.
  """

  def __init__(self, data_CSV_path, rows_each_fetch=500, shuffle=True, target_device=None):
    """Open the CSV file and load its first chunk.

    Args:
      data_CSV_path: path of a comma-separated file with one header line.
      rows_each_fetch: number of rows read from disk per chunk.
      shuffle: when truthy, rows are permuted inside every loaded chunk.
      target_device: device the chunk tensors are moved to. When None, the
        module-level ``device`` variable is used, as the original code did.
    """
    self.dataPath = data_CSV_path
    self.rowsEachFetch = rows_each_fetch
    self.shuffle = shuffle
    # `device` is only looked up when no explicit target was given.
    self.device = device if target_device is None else target_device

    self.n = self._countDataRows()
    # Rounded (not ceiled) number of chunk reads per pass over the file.
    self.totalFetchCalls = int(np.around(self.n / self.rowsEachFetch))

    self.reader = None
    self._openReader()
    self._loadNextChunk()
    self.chunkItr = 0 # position of the next row inside the current chunk

  def _countDataRows(self):
    """Return the number of data rows, counted the way pandas reads them."""
    # pandas skips blank lines and whole-line '#' comments, so only count
    # the remaining lines; the context manager closes the file afterwards.
    with open(self.dataPath) as handle:
      lines = sum(1 for line in handle if line.strip() and not line.lstrip().startswith('#'))
    return max(lines - 1, 0) # drop the header line

  def _openReader(self):
    """(Re)start chunked reading from the top of the file."""
    previous = getattr(self, 'reader', None)
    if previous is not None:
      previous.close() # release the file handle of the finished pass
    self.reader = pd.read_csv(self.dataPath, sep=',', chunksize=self.rowsEachFetch, comment='#', header=0, iterator=True)

  def _loadNextChunk(self):
    """Read the next chunk into ``self.data`` as a float32 tensor."""
    try:
      frame = self.reader.get_chunk(self.rowsEachFetch)
    except StopIteration:
      # The previous chunk was full-sized but also the last one (row count
      # is an exact multiple of the chunk size): begin a new pass instead
      # of leaking StopIteration out of __getitem__.
      self._openReader()
      frame = self.reader.get_chunk(self.rowsEachFetch)

    chunk = torch.as_tensor(frame.values, dtype=torch.float32).to(self.device)
    if self.shuffle:
      chunk = chunk[torch.randperm(chunk.shape[0])]
    self.data = chunk

  def __len__(self):
    """Return the number of data rows in the file."""
    return self.n

  def reachedEndOfChunk(self):
    """Return True (and rewind the chunk cursor) once the chunk is used up.

    A chunk shorter than ``rowsEachFetch`` is the last one of the file, so
    the reader is reopened to start the next pass from the first row.
    """
    if self.chunkItr != self.data.shape[0]:
      return False
    self.chunkItr = 0
    if self.data.shape[0] < self.rowsEachFetch:
      self._openReader()
    return True

  def __getitem__(self, index):
    """Return the next ``(x, y)`` pair; ``index`` is ignored."""
    if self.reachedEndOfChunk():
      self._loadNextChunk()

    row = self.data[self.chunkItr]
    x = row[:-1]  # data
    y = row[-1]   # labels

    # your transforms here

    self.chunkItr += 1
    return x, y

 # Stream the training CSV in chunks of 300k rows, shuffling inside each chunk.
 trainDataset = lazyLoadCSVDataset(
   data_CSV_path=trainPath,
   rows_each_fetch=300_000,
   shuffle=True,
 )
 # Batches are assembled by the DataLoader from the rows the dataset hands out.
 trainLoader = DataLoader(
   dataset=trainDataset,
   batch_size=batchSize,
   shuffle=True,
 )
	import numpy as np
	import pandas as pd

	import torch

	from torch.utils.data import Dataset, DataLoader

	class lazyLoadCSVDataset(Dataset):
	  """Serve rows of a large CSV file while holding only one chunk in memory.

	  The file is read with pandas in chunks of ``rows_each_fetch`` rows. Every
	  row is converted to float32; all columns but the last form the sample and
	  the last column is the label.

	  NOTE: rows are handed out sequentially from the current chunk. The
	  ``index`` passed to ``__getitem__`` is ignored, so the only shuffling is
	  the optional per-chunk shuffle done by this class.
	  """

	  def __init__(self, data_CSV_path, rows_each_fetch=500, shuffle=True, target_device=None):
	    """Open the CSV file and load its first chunk.

	    Args:
	      data_CSV_path: path of a comma-separated file with one header line.
	      rows_each_fetch: number of rows read from disk per chunk.
	      shuffle: when truthy, rows are permuted inside every loaded chunk.
	      target_device: device the chunk tensors are moved to. When None, the
	        module-level ``device`` variable is used, as the original code did.
	    """
	    self.dataPath = data_CSV_path
	    self.rowsEachFetch = rows_each_fetch
	    self.shuffle = shuffle
	    # `device` is only looked up when no explicit target was given.
	    self.device = device if target_device is None else target_device

	    self.n = self._countDataRows()
	    # Rounded (not ceiled) number of chunk reads per pass over the file.
	    self.totalFetchCalls = int(np.around(self.n / self.rowsEachFetch))

	    self.reader = None
	    self._openReader()
	    self._loadNextChunk()
	    self.chunkItr = 0 # position of the next row inside the current chunk

	  def _countDataRows(self):
	    """Return the number of data rows, counted the way pandas reads them."""
	    # pandas skips blank lines and whole-line '#' comments, so only count
	    # the remaining lines; the context manager closes the file afterwards.
	    with open(self.dataPath) as handle:
	      lines = sum(1 for line in handle if line.strip() and not line.lstrip().startswith('#'))
	    return max(lines - 1, 0) # drop the header line

	  def _openReader(self):
	    """(Re)start chunked reading from the top of the file."""
	    previous = getattr(self, 'reader', None)
	    if previous is not None:
	      previous.close() # release the file handle of the finished pass
	    self.reader = pd.read_csv(self.dataPath, sep=',', chunksize=self.rowsEachFetch, comment='#', header=0, iterator=True)

	  def _loadNextChunk(self):
	    """Read the next chunk into ``self.data`` as a float32 tensor."""
	    try:
	      frame = self.reader.get_chunk(self.rowsEachFetch)
	    except StopIteration:
	      # The previous chunk was full-sized but also the last one (row count
	      # is an exact multiple of the chunk size): begin a new pass instead
	      # of leaking StopIteration out of __getitem__.
	      self._openReader()
	      frame = self.reader.get_chunk(self.rowsEachFetch)

	    chunk = torch.as_tensor(frame.values, dtype=torch.float32).to(self.device)
	    if self.shuffle:
	      chunk = chunk[torch.randperm(chunk.shape[0])]
	    self.data = chunk

	  def __len__(self):
	    """Return the number of data rows in the file."""
	    return self.n

	  def reachedEndOfChunk(self):
	    """Return True (and rewind the chunk cursor) once the chunk is used up.

	    A chunk shorter than ``rowsEachFetch`` is the last one of the file, so
	    the reader is reopened to start the next pass from the first row.
	    """
	    if self.chunkItr != self.data.shape[0]:
	      return False
	    self.chunkItr = 0
	    if self.data.shape[0] < self.rowsEachFetch:
	      self._openReader()
	    return True

	  def __getitem__(self, index):
	    """Return the next ``(x, y)`` pair; ``index`` is ignored."""
	    if self.reachedEndOfChunk():
	      self._loadNextChunk()

	    row = self.data[self.chunkItr]
	    x = row[:-1] # data
	    y = row[-1] # labels

	    # your transforms here

	    self.chunkItr += 1
	    return x, y

	# Stream the training CSV in chunks of 300k rows, shuffling inside each chunk.
	trainDataset = lazyLoadCSVDataset(
	  data_CSV_path=trainPath,
	  rows_each_fetch=300_000,
	  shuffle=True,
	)
	# Batches are assembled by the DataLoader from the rows the dataset hands out.
	trainLoader = DataLoader(
	  dataset=trainDataset,
	  batch_size=batchSize,
	  shuffle=True,
	)
No results found