Last active
October 27, 2019 00:07
-
-
Save rhine3/0410cacfd7aae7212c19d3edae70657a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| # Function for processing pandas dataframe | |
| def process(df, user_col, date_col, save_to): | |
| ''' | |
| Save table of earliest date per user | |
| Save dataframe containing earliest value | |
| in date_col for each value in user_col. Creates dataframe | |
| containing one row per unique value in user_col, | |
| where user_col is the user and date_col is the earliest | |
| date associated with the user. Appends this dataframe | |
| to the pre-existing .csv named save_to | |
| Args: | |
| df (pd.DataFrame): pandas dataframe with columns | |
| user_col and date_col | |
| user_col (string): name of column containing users | |
| date_col (string): name of column containing dates | |
| save_to (string): pre-existing .csv to save file to | |
| Returns: | |
| nothing | |
| ''' | |
| # See long comment below for explanation of this line | |
| new_df = df.loc[df.sort_values(date_col)[user_col].drop_duplicates(keep = 'first').index] | |
| # Append dataframe to file | |
| with open(save_to, 'a') as f: | |
| new_df.to_csv(f, header = False, index = False) | |
| ''' | |
| #This breaks down the process in creating the new_df line: | |
| # Sort by date | |
| sorted_by_date = df.sort_values(date_col) | |
| # Keep only a list of users and indices, sorted by date | |
| sorted_users = sorted_by_date[user_col] | |
| # Keep first instance of user (gets index for earliest date) | |
| keep_first_instance_of_user = sorted_users.drop_duplicates(keep = 'first') | |
| # Use index to identify the rows to keep | |
| good_user_idx = keep_first_instance_of_user.index | |
| # Index down to rows containing earliest dates | |
| new_df = df.loc[good_user_idx] | |
| ''' | |
| # Number of rows to read at once | |
| # Reduces the size needed to load at once | |
| chunksize = 1000 | |
| dataset_name = 'ebird_dataset_name.csv' | |
| # Names of needed columns, in list format | |
| # Reduces the amount of memory needed to reach each chunk | |
| usecols = ['needed_num', 'needed_date'] | |
| # Names of submission date and user id columns | |
| datecol = 'needed_date' #submission date column name | |
| usercol = 'needed_num' #user ID column name | |
| # Add header to the file | |
| save_filename = 'my_results.csv' | |
| with open(save_filename, 'w') as f: | |
| pd.DataFrame(columns = usecols).to_csv(f, index=False) | |
| # parse_dates with [datecol] may speed up loading because pandas knows what to expect | |
| # Another way to do this is to provide a date formatting string, which will make loading even faster | |
| for chunk in pd.read_csv( | |
| dataset_name, | |
| chunksize = chunksize, | |
| usecols = usecols, | |
| parse_dates = [datecol] | |
| ): | |
| # Add automatic date recognition | |
| process(df = chunk, date_col = datecol, user_col = usercol, save_to = save_filename) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment