rhine3 · October 27, 2019 00:07
diff --git a/find_first_date.py b/find_first_date.py
 import pandas as pd

 # Function for processing pandas dataframe
 def process(df, user_col, date_col, save_to):
    '''
    Save table of earliest date per user
    
    Save dataframe containing earliest value 
    in date_col for each value in user_col. Creates dataframe 
    containing one row per unique value in user_col,
    where user_col is the user and date_col is the earliest
    date associated with the user. Appends this dataframe
    to the pre-existing .csv named save_to
    
    Args:
        df (pd.DataFrame): pandas dataframe with columns 
            user_col and date_col
        user_col (string): name of column containing users
        date_col (string): name of column containing dates
        save_to (string): pre-existing .csv to save file to
        
    Returns:
        nothing
    '''
    

    # See long comment below for explanation of this line
    new_df = df.loc[df.sort_values(date_col)[user_col].drop_duplicates(keep = 'first').index]
    
    # Append dataframe to file
    with open(save_to, 'a') as f:
        new_df.to_csv(f, header = False, index = False)
    
    '''
    #This breaks down the process in creating the new_df line:
    
    # Sort by date
    sorted_by_date = df.sort_values(date_col)
    
    # Keep only a list of users and indices, sorted by date
    sorted_users = sorted_by_date[user_col]
    
    # Keep first instance of user (gets index for earliest date)
    keep_first_instance_of_user = sorted_users.drop_duplicates(keep = 'first')
    
    # Use index to identify the rows to keep
    good_user_idx = keep_first_instance_of_user.index
    
    # Index down to rows containing earliest dates
    new_df = df.loc[good_user_idx]
    '''
    
    
 # Number of rows to read at once
 # Reduces the size needed to load at once
 chunksize = 1000
 dataset_name = 'ebird_dataset_name.csv'

 # Names of needed columns, in list format
 # Reduces the amount of memory needed to reach each chunk
 usecols = ['needed_num', 'needed_date']

 # Names of submission date and user id columns
 datecol = 'needed_date' #submission date column name
 usercol = 'needed_num' #user ID column name

 # Add header to the file
 save_filename = 'my_results.csv'
 with open(save_filename, 'w') as f:
    pd.DataFrame(columns = usecols).to_csv(f, index=False)
    
    
 # parse_dates with [datecol] may speed up loading because pandas knows what to expect
 # Another way to do this is to provide a date formatting string, which will make loading even faster
 for chunk in pd.read_csv(
    dataset_name,
    chunksize = chunksize,
    usecols = usecols,
    parse_dates = [datecol]
 ):
    
    # Add automatic date recognition
    process(df = chunk, date_col = datecol, user_col = usercol, save_to = save_filename)
	import pandas as pd

	# Function for processing pandas dataframe
	def process(df, user_col, date_col, save_to):
	'''
	Save table of earliest date per user

	Save dataframe containing earliest value
	in date_col for each value in user_col. Creates dataframe
	containing one row per unique value in user_col,
	where user_col is the user and date_col is the earliest
	date associated with the user. Appends this dataframe
	to the pre-existing .csv named save_to

	Args:
	df (pd.DataFrame): pandas dataframe with columns
	user_col and date_col
	user_col (string): name of column containing users
	date_col (string): name of column containing dates
	save_to (string): pre-existing .csv to save file to

	Returns:
	nothing
	'''


	# See long comment below for explanation of this line
	new_df = df.loc[df.sort_values(date_col)[user_col].drop_duplicates(keep = 'first').index]

	# Append dataframe to file
	with open(save_to, 'a') as f:
	new_df.to_csv(f, header = False, index = False)

	'''
	#This breaks down the process in creating the new_df line:

	# Sort by date
	sorted_by_date = df.sort_values(date_col)

	# Keep only a list of users and indices, sorted by date
	sorted_users = sorted_by_date[user_col]

	# Keep first instance of user (gets index for earliest date)
	keep_first_instance_of_user = sorted_users.drop_duplicates(keep = 'first')

	# Use index to identify the rows to keep
	good_user_idx = keep_first_instance_of_user.index

	# Index down to rows containing earliest dates
	new_df = df.loc[good_user_idx]
	'''


	# Number of rows to read at once
	# Reduces the size needed to load at once
	chunksize = 1000
	dataset_name = 'ebird_dataset_name.csv'

	# Names of needed columns, in list format
	# Reduces the amount of memory needed to reach each chunk
	usecols = ['needed_num', 'needed_date']

	# Names of submission date and user id columns
	datecol = 'needed_date' #submission date column name
	usercol = 'needed_num' #user ID column name

	# Add header to the file
	save_filename = 'my_results.csv'
	with open(save_filename, 'w') as f:
	pd.DataFrame(columns = usecols).to_csv(f, index=False)


	# parse_dates with [datecol] may speed up loading because pandas knows what to expect
	# Another way to do this is to provide a date formatting string, which will make loading even faster
	for chunk in pd.read_csv(
	dataset_name,
	chunksize = chunksize,
	usecols = usecols,
	parse_dates = [datecol]
	):

	# Add automatic date recognition
	process(df = chunk, date_col = datecol, user_col = usercol, save_to = save_filename)
No results found