Skip to content

Instantly share code, notes, and snippets.

@rhine3
Last active October 27, 2019 00:07
Show Gist options
  • Select an option

  • Save rhine3/0410cacfd7aae7212c19d3edae70657a to your computer and use it in GitHub Desktop.

Select an option

Save rhine3/0410cacfd7aae7212c19d3edae70657a to your computer and use it in GitHub Desktop.
import pandas as pd
# Function for processing pandas dataframe
def process(df, user_col, date_col, save_to):
    '''
    Save table of earliest date per user.

    For each unique value in user_col, find the row of df holding the
    earliest value in date_col, and append those rows (one row per
    unique user) to the pre-existing .csv named save_to.

    Args:
        df (pd.DataFrame): pandas dataframe with columns
            user_col and date_col
        user_col (string): name of column containing users
        date_col (string): name of column containing dates
        save_to (string): pre-existing .csv to append rows to

    Returns:
        None
    '''
    # Sort all rows by date, then keep only the first (i.e. earliest)
    # row seen for each user. Equivalent to indexing df.loc[] with the
    # surviving index of a per-column drop_duplicates, but in one pass
    # and without the double lookup.
    earliest = df.sort_values(date_col).drop_duplicates(
        subset=user_col, keep='first'
    )
    # Append below the header the caller wrote once. Passing the path
    # with mode='a' (instead of an open text-mode handle) lets pandas
    # manage newlines itself, avoiding doubled line terminators on
    # Windows.
    earliest.to_csv(save_to, mode='a', header=False, index=False)
# --- Configuration -------------------------------------------------
# Rows loaded per chunk; bounds how much of the dataset is in memory
# at any one time.
chunksize = 1000
dataset_name = 'ebird_dataset_name.csv'

# Only these columns are read from the CSV, which further reduces the
# memory needed per chunk.
usecols = ['needed_num', 'needed_date']
datecol = 'needed_date'  # submission date column name
usercol = 'needed_num'   # user ID column name

# --- Output file ---------------------------------------------------
# Write the header row once up front; process() appends data rows
# below it on every chunk.
save_filename = 'my_results.csv'
with open(save_filename, 'w') as header_file:
    pd.DataFrame(columns=usecols).to_csv(header_file, index=False)

# --- Chunked processing --------------------------------------------
# parse_dates lets pandas convert the date column while loading, which
# may speed things up; supplying an explicit date format string would
# be faster still.
# NOTE(review): de-duplication happens per chunk, so a user appearing
# in several chunks yields several rows in my_results.csv — confirm a
# final pass collapses those if one row per user is required.
reader = pd.read_csv(
    dataset_name,
    chunksize=chunksize,
    usecols=usecols,
    parse_dates=[datecol],
)
for piece in reader:
    process(df=piece, date_col=datecol, user_col=usercol,
            save_to=save_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment