Created
July 8, 2019 20:17
-
-
Save MiaAltieri/4b52d3f1715b91ed3dfd48f0af317dc7 to your computer and use it in GitHub Desktop.
PyUoIMiceSamples runs PyUoI on samples. Run with --help to see the input arguments. Requires that sparse_control.preprocessed.h5 be in the same directory.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from scipy.sparse import csr_matrix | |
| from scipy.sparse import csc_matrix | |
| import pickle | |
| from pyuoi import UoI_Lasso | |
| from pyuoi import UoI_L1Logistic | |
| from pyuoi.utils import check_logger | |
| import getopt | |
| import os | |
| import sys | |
| from mpi4py import MPI | |
| import logging | |
| import h5py | |
# handle input arguments/options
#
# Usage: python PyUoIMiceSamples --predict=<weight | speed | memory> [test]
#   --predict  required; stored in `predict`
#   test       optional positional word; sets `test` and `mode` to 'test'
#   --help     print usage and exit
#
# getopt stops option parsing at the first positional argument, so
# --predict must come before `test` on the command line.
_PREDICT_CHOICES = ('weight', 'speed', 'memory')
_USAGE = 'required argument: --predict=<weight | speed | memory>'

try:
    # No short options are defined. The previous short-option string 'test'
    # registered -t, -e and -s as flags; it never matched the word `test`.
    opts, args = getopt.getopt(sys.argv[1:], '', ['predict=', 'help'])
except getopt.GetoptError as err:
    print('unhandled option ', err)
    print('try --help')
    sys.exit(1)

predict = None
test = ''

for o, a in opts:
    # Compare with ==: `o in ("--help")` is a substring test on a plain
    # string, because parentheses without a comma do not make a tuple.
    if o == '--help':
        print(_USAGE)
        print('optional arguments: test, --help')
        print('example: python PyUoIMiceSamples --predict=weight test')
        sys.exit()
    elif o == '--predict':
        predict = a
    else:
        print('unhandled option ', o)
        print('try --help')
        sys.exit(1)

for a in args:
    # Exact match only; a substring test would also accept 't', 'es' or ''.
    if a == 'test':
        test = 'test'
    else:
        print('unhandled argument ', a)
        print('try --help')
        sys.exit(1)

if predict is None:
    print('missing required argument')
    print(_USAGE)
    print('try --help')
    sys.exit(1)

if predict not in _PREDICT_CHOICES:
    # Reject unknown targets up front instead of letting them fall through
    # the later `predict == ...` comparisons unnoticed.
    print('invalid value for --predict: ', predict)
    print(_USAGE)
    print('try --help')
    sys.exit(1)

print('Now running PyUoI on ',predict)

# `test` is 'test' exactly when the positional word was given, else ''.
mode = test
# MPI communicator plus this process's rank and the total process count
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# logger obtained from pyuoi's helper, raised to INFO level
logger = check_logger(None, name='uoi_main', comm=comm)
logger.setLevel(logging.INFO)

# open data and partition into X and Y
# The 'memory' target is read from its own HDF key; every other target
# is read from the 'rotarod' key.
if predict == 'memory':
    reread = pd.read_hdf('sparse_control.preprocessed.h5', key='memory')
else:
    reread = pd.read_hdf('sparse_control.preprocessed.h5', key='rotarod')

# Single response column: column 1 for 'weight', column 2 otherwise.
if predict == 'weight':
    start_index, end_index = 1, 2
else:
    start_index, end_index = 2, 3

# Test mode works on rows 1-4 and feature columns 4-24 with an even split;
# a normal run uses every row, all columns from 4 onward, and holds out 20%.
if mode == "test":
    X = reread.iloc[1:5, 4:25]
    Y = reread.iloc[1:5, start_index:end_index]
    held_out_fraction = 0.5
else:
    X = reread.iloc[:, 4:]
    Y = reread.iloc[:, start_index:end_index]
    held_out_fraction = 0.20

X, X_test, Y, Y_test = train_test_split(
    X, Y, test_size=held_out_fraction, random_state=42)

# store both feature matrices in CSR sparse format
X = csr_matrix(X)
X_test = csr_matrix(X_test)
# Cut-off used to binarize the 'memory' response:
# values >= cut-off become class 0, everything else class 1.
_MEMORY_CUTOFF = 600

if rank == 0:
    # fraction of non-zero entries in the training feature matrix
    logger.info('occupancy: %f', X.nnz / (X.shape[0] * X.shape[1]))
    logger.info('regressing onto genetic data')

# binarize Y if memory
if predict == 'memory':
    Y = np.where(Y >= _MEMORY_CUTOFF, 0, 1)

# UoI for sparse matrices
# The binarized 'memory' target is fit with L1 logistic regression; the
# other targets are fit with Lasso. The log line names the model actually used.
if predict == 'memory':
    logger.info('running L1Logistic')
    clf = UoI_L1Logistic(comm=comm, standardize=False, fit_intercept=True)
else:
    logger.info('running Lasso')
    clf = UoI_Lasso(comm=comm, standardize=False, fit_intercept=True)

clf.fit(X, Y, verbose=True)
logger.info(clf.intercept_)

# binarize Y_test if memory
# Done on every rank so all processes hold the same held-out labels;
# only rank 0 prints the notice.
if predict == 'memory':
    if rank == 0:
        print('binarizing Y for memory dataset')
    Y_test = np.where(Y_test.values >= _MEMORY_CUTOFF, 0, 1)

print(clf.intercept_)
print(clf.coef_)
# save coefficients and intercept
model_name = 'model_' + predict + mode + '.h5'
output_path = os.path.join(os.getcwd(), model_name)

# Only rank 0 writes to disk, so the processes do not race on the same files.
if rank == 0:
    logger.info("writing model to %s", output_path)
    # The context manager closes the HDF5 file even if a write raises.
    with h5py.File(output_path, 'w') as f:
        f.create_dataset('coef', data=clf.coef_)
        f.create_dataset('intercept', data=clf.intercept_)
        dset = f['coef']
        print(dset)

    # now pickle X_test and Y_test
    # NOTE(review): unlike the model file name, this file name does not
    # include `mode`, so a test run overwrites the pickle from a full run.
    # Confirm whether that is intended before changing the name.
    test_data = {'X': X_test, 'Y': Y_test}
    with open('test_data_' + predict + '.pickle', 'wb') as handle:
        pickle.dump(test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment