raov5 · September 2, 2017 14:01
diff --git a/Advanced Data Preparation in R.ipynb b/Advanced Data Preparation in R.ipynb
 {
    "metadata": {
        "kernelspec": {
            "name": "r-spark20", 
            "display_name": "R with Spark 2.0", 
            "language": "R"
        }, 
        "language_info": {
            "codemirror_mode": "r", 
            "name": "R", 
            "mimetype": "text/x-r-source", 
            "pygments_lexer": "r", 
            "version": "3.3.2", 
            "file_extension": ".r"
        }
    }, 
    "cells": [
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#@author: Venky Rao raove@us.ibm.com\n#@last edited: 2 Sep 2017\n#@source: materials, data and examples adapted from R in Action 2nd Edition by Dr. Robert Kabacoff", 
            "execution_count": 56
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "# Advanced data preparation in R"
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "## A data management challenge"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Student</th><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th></tr></thead>\n<tbody>\n\t<tr><td>John Davis       </td><td>502              </td><td>95               </td><td>25               </td></tr>\n\t<tr><td>Angela Williams  </td><td>600              </td><td>99               </td><td>22               </td></tr>\n\t<tr><td>Bullwinkle Moose </td><td>412              </td><td>80               </td><td>18               </td></tr>\n\t<tr><td>David Jones      </td><td>358              </td><td>82               </td><td>15               </td></tr>\n\t<tr><td>Janice Markhammer</td><td>495              </td><td>75               </td><td>20               </td></tr>\n\t<tr><td>Cheryl Cushing   </td><td>512              </td><td>85               </td><td>28               </td></tr>\n\t<tr><td>Reuven Ytzrhak   </td><td>410              </td><td>80               </td><td>15               </td></tr>\n\t<tr><td>Greg Knox        </td><td>625              </td><td>95               </td><td>30               </td></tr>\n\t<tr><td>Joel England     </td><td>573              </td><td>89               </td><td>27               </td></tr>\n\t<tr><td>Mary Rayburn     </td><td>522              </td><td>86               </td><td>18               </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   Student           Math Science English\n1  John Davis        502  95      25     \n2  Angela Williams   600  99      22     \n3  Bullwinkle Moose  412  80      18     \n4  David Jones       358  82      15     \n5  Janice Markhammer 495  75      20     \n6  Cheryl Cushing    512  85      28     \n7  Reuven Ytzrhak    410  80      15     \n8  Greg Knox         625  95      30     \n9  Joel England      573  89      27     \n10 Mary Rayburn      522  86      18     ", 
                        "text/latex": "\\begin{tabular}{r|llll}\n Student & Math & Science & English\\\\\n\\hline\n\t John Davis        & 502               & 95                & 25               \\\\\n\t Angela Williams   & 600               & 99                & 22               \\\\\n\t Bullwinkle Moose  & 412               & 80                & 18               \\\\\n\t David Jones       & 358               & 82                & 15               \\\\\n\t Janice Markhammer & 495               & 75                & 20               \\\\\n\t Cheryl Cushing    & 512               & 85                & 28               \\\\\n\t Reuven Ytzrhak    & 410               & 80                & 15               \\\\\n\t Greg Knox         & 625               & 95                & 30               \\\\\n\t Joel England      & 573               & 89                & 27               \\\\\n\t Mary Rayburn      & 522               & 86                & 18               \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#build the example dataset used throughout this notebook:\n#one row per student with their scores in three subjects (each subject is on a different scale)\nstudent <- c(\"John Davis\", \"Angela Williams\", \"Bullwinkle Moose\", \"David Jones\", \"Janice Markhammer\", \n             \"Cheryl Cushing\", \"Reuven Ytzrhak\", \"Greg Knox\", \"Joel England\", \"Mary Rayburn\")\nmathematics <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)\nscience <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)\nenglish <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)\n#create the data frame; keep the names as character strings (not factors) so they can be split later\n#note: FALSE is spelled out because F is an ordinary variable that can be reassigned\ngrades <- data.frame(Student = student, Math = mathematics, Science = science, English = english,\n                     stringsAsFactors = FALSE)\ngrades", 
            "execution_count": 39
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#our task is to:\n# 1. combine the scores of the students into a single performance indicator\n# 2. assign A grade to the top 20% of the students, B grade to the next 20% and so on", 
            "execution_count": 3
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#considerations:\n#several obstacles are immediately evident:\n# 1. scores on the 3 exams are not comparable.  They have widely different means and standard deviations \n     # so averaging them does not make sense.  You must transform the scores into comparable units before\n     # combining them\n# 2. you will need a method of determining a student's percentile rank on this score to assign a grade\n# 3. there is a single field for names, complicating the task of sorting students.  You will need to\n     # split names into first name and last name in order to sort them properly\n\n# let's review some key functions and then tackle these tasks", 
            "execution_count": 4
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "## Numerical and character functions - some examples and applications"
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### Calculating mean and standard deviation"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "500.9", 
                        "text/latex": "500.9", 
                        "text/markdown": "500.9", 
                        "text/plain": "[1] 500.9"
                    }
                }, 
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "86.6736535645188", 
                        "text/latex": "86.6736535645188", 
                        "text/markdown": "86.6736535645188", 
                        "text/plain": "[1] 86.67365"
                    }
                }
            ], 
            "source": "# calculating the mean and standard deviation of a vector of numbers\nmean(mathematics)\nsd(mathematics)", 
            "execution_count": 5
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "500.9", 
                        "text/latex": "500.9", 
                        "text/markdown": "500.9", 
                        "text/plain": "[1] 500.9"
                    }
                }, 
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "86.6736535645188", 
                        "text/latex": "86.6736535645188", 
                        "text/markdown": "86.6736535645188", 
                        "text/plain": "[1] 86.67365"
                    }
                }
            ], 
            "source": "#another approach to calculating mean and sd of a vector of numbers\nn <- length(mathematics) #storing the length of the vector in a variable\nmeanMath <- sum(mathematics) / n #mean = sum divided by number of observations\ncss <- sum((mathematics - meanMath)^2) #css = corrected sum of squares\nsdMath <- sqrt(css / (n - 1)) #standard deviation is the square root of css divided by (n - 1)\nmeanMath\nsdMath", 
            "execution_count": 6
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### Generating pseudo-random numbers with a seed"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.711534205358475</li>\n\t<li>0.670486355898902</li>\n\t<li>0.872814682777971</li>\n\t<li>0.773019835585728</li>\n\t<li>0.82747818948701</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.711534205358475\n\\item 0.670486355898902\n\\item 0.872814682777971\n\\item 0.773019835585728\n\\item 0.82747818948701\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.711534205358475\n2. 0.670486355898902\n3. 0.872814682777971\n4. 0.773019835585728\n5. 0.82747818948701\n\n\n", 
                        "text/plain": "[1] 0.7115342 0.6704864 0.8728147 0.7730198 0.8274782"
                    }
                }
            ], 
            "source": "#generating pseudo-random numbers from a uniform distribution\nrunif(5)", 
            "execution_count": 7
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.543440421111882</li>\n\t<li>0.771847827592865</li>\n\t<li>0.0152768774423748</li>\n\t<li>0.859203945845366</li>\n\t<li>0.856453391257674</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.543440421111882\n\\item 0.771847827592865\n\\item 0.0152768774423748\n\\item 0.859203945845366\n\\item 0.856453391257674\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.543440421111882\n2. 0.771847827592865\n3. 0.0152768774423748\n4. 0.859203945845366\n5. 0.856453391257674\n\n\n", 
                        "text/plain": "[1] 0.54344042 0.77184783 0.01527688 0.85920395 0.85645339"
                    }
                }
            ], 
            "source": "#you get a different set of numbers if you run this code again\nrunif(5)", 
            "execution_count": 8
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.113703411305323</li>\n\t<li>0.622299404814839</li>\n\t<li>0.609274732880294</li>\n\t<li>0.623379441676661</li>\n\t<li>0.860915383556858</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.113703411305323\n\\item 0.622299404814839\n\\item 0.609274732880294\n\\item 0.623379441676661\n\\item 0.860915383556858\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.113703411305323\n2. 0.622299404814839\n3. 0.609274732880294\n4. 0.623379441676661\n5. 0.860915383556858\n\n\n", 
                        "text/plain": "[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154"
                    }
                }
            ], 
            "source": "#to ensure you receive the same numbers, you should set the seed explicitly as follows:\nset.seed(1234)\nrunif(5)", 
            "execution_count": 9
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.113703411305323</li>\n\t<li>0.622299404814839</li>\n\t<li>0.609274732880294</li>\n\t<li>0.623379441676661</li>\n\t<li>0.860915383556858</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.113703411305323\n\\item 0.622299404814839\n\\item 0.609274732880294\n\\item 0.623379441676661\n\\item 0.860915383556858\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.113703411305323\n2. 0.622299404814839\n3. 0.609274732880294\n4. 0.623379441676661\n5. 0.860915383556858\n\n\n", 
                        "text/plain": "[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154"
                    }
                }
            ], 
            "source": "#let's try it again to confirm the results\nset.seed(1234)\nrunif(5)", 
            "execution_count": 10
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### Generating multivariate normal data"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stderr", 
                    "text": "Installing package into \u2018/gpfs/global_fs01/sym_shared/YPProdSpark/user/s17c-9f3318fc11f06c-d37a4b9405b6/R/libs\u2019\n(as \u2018lib\u2019 is unspecified)\n"
                }
            ], 
            "source": "#install the MASS package only if it is not already available\n#(avoids re-downloading and re-installing it every time the notebook is run from the top)\nif (!requireNamespace(\"MASS\", quietly = TRUE)) {\n    install.packages(\"MASS\")\n}", 
            "execution_count": 11
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stderr", 
                    "text": "\nAttaching package: \u2018MASS\u2019\n\nThe following object is masked from \u2018package:SparkR\u2019:\n\n    select\n\n"
                }
            ], 
            "source": "#load the MASS package\nlibrary(MASS)", 
            "execution_count": 12
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>500</li>\n\t<li>3</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 500\n\\item 3\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 500\n2. 3\n\n\n", 
                        "text/plain": "[1] 500   3"
                    }
                }, 
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>y</th><th scope=col>x1</th><th scope=col>x2</th></tr></thead>\n<tbody>\n\t<tr><td> 98.8</td><td> 41.3</td><td>3.43 </td></tr>\n\t<tr><td>244.5</td><td>205.2</td><td>3.80 </td></tr>\n\t<tr><td>375.7</td><td>186.7</td><td>2.51 </td></tr>\n\t<tr><td>-59.2</td><td> 11.2</td><td>4.71 </td></tr>\n\t<tr><td>313.0</td><td>111.0</td><td>3.45 </td></tr>\n\t<tr><td>288.8</td><td>185.1</td><td>2.72 </td></tr>\n\t<tr><td>134.8</td><td>165.0</td><td>4.39 </td></tr>\n\t<tr><td>171.7</td><td> 97.4</td><td>3.64 </td></tr>\n\t<tr><td>167.2</td><td>101.0</td><td>3.50 </td></tr>\n\t<tr><td>121.1</td><td> 94.5</td><td>4.10 </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   y     x1    x2  \n1   98.8  41.3 3.43\n2  244.5 205.2 3.80\n3  375.7 186.7 2.51\n4  -59.2  11.2 4.71\n5  313.0 111.0 3.45\n6  288.8 185.1 2.72\n7  134.8 165.0 4.39\n8  171.7  97.4 3.64\n9  167.2 101.0 3.50\n10 121.1  94.5 4.10", 
                        "text/latex": "\\begin{tabular}{r|lll}\n y & x1 & x2\\\\\n\\hline\n\t  98.8 &  41.3 & 3.43 \\\\\n\t 244.5 & 205.2 & 3.80 \\\\\n\t 375.7 & 186.7 & 2.51 \\\\\n\t -59.2 &  11.2 & 4.71 \\\\\n\t 313.0 & 111.0 & 3.45 \\\\\n\t 288.8 & 185.1 & 2.72 \\\\\n\t 134.8 & 165.0 & 4.39 \\\\\n\t 171.7 &  97.4 & 3.64 \\\\\n\t 167.2 & 101.0 & 3.50 \\\\\n\t 121.1 &  94.5 & 4.10 \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#to create a sample of 500 observations for 3 variables with a multivariate normal distribution, use the following function:\n# mvrnorm(n, mu, Sigma) where n = sample size, mu = vector of means and Sigma = variance-covariance (or correlation) matrix\nsize <- 500 #size of the sample\noptions(digits = 3) #print numbers with 3 significant digits (a session-wide setting; unrelated to the number of variables)\nset.seed(1234) #set the seed so the generated sample is reproducible\nmu <- c(230.7, 146.7, 3.6) #vector of specified means (named mu so that it does not shadow the base function mean())\nsigma <- matrix(c(15360.8, 6721.2, -47.1, \n                  6721.2, 4700.9, -16.5,\n                  -47.1, -16.5, 0.3), nrow = 3, ncol = 3) #symmetric 3 x 3 covariance matrix\nmydata <- mvrnorm(size, mu, sigma) #generate the data (a 500 x 3 matrix)\nmydata <- as.data.frame(mydata) #convert the matrix into a data frame\nnames(mydata) <- c(\"y\", \"x1\", \"x2\") #name the columns of the data frame\ndim(mydata) #view the dimensions of the data\nhead(mydata, n = 10) #view the first 10 observations of the dataset", 
            "execution_count": 13
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### Applying functions to matrices and data frames"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "2.23606797749979", 
                        "text/latex": "2.23606797749979", 
                        "text/markdown": "2.23606797749979", 
                        "text/plain": "[1] 2.24"
                    }
                }
            ], 
            "source": "#the following examples demonstrate how to apply functions to data objects\na <- 5\nsqrt(a)", 
            "execution_count": 14
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>1</li>\n\t<li>6</li>\n\t<li>3</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 1\n\\item 6\n\\item 3\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 1\n2. 6\n3. 3\n\n\n", 
                        "text/plain": "[1] 1 6 3"
                    }
                }
            ], 
            "source": "b <- c(1.243, 5.654, 2.99)\nround(b)", 
            "execution_count": 15
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<tbody>\n\t<tr><td>0.9636</td><td>0.216 </td><td>0.289 </td><td>0.913 </td></tr>\n\t<tr><td>0.2068</td><td>0.240 </td><td>0.804 </td><td>0.353 </td></tr>\n\t<tr><td>0.0862</td><td>0.197 </td><td>0.378 </td><td>0.931 </td></tr>\n</tbody>\n</table>\n", 
                        "text/latex": "\\begin{tabular}{llll}\n\t 0.9636 & 0.216  & 0.289  & 0.913 \\\\\n\t 0.2068 & 0.240  & 0.804  & 0.353 \\\\\n\t 0.0862 & 0.197  & 0.378  & 0.931 \\\\\n\\end{tabular}\n", 
                        "text/markdown": "1. 0.963599362876266\n2. 0.206762383226305\n3. 0.086197440745309\n4. 0.216028020251542\n5. 0.239646551199257\n6. 0.197160936193541\n7. 0.289008239516988\n8. 0.804114406695589\n9. 0.378249637782574\n10. 0.912817219272256\n11. 0.353391784243286\n12. 0.931487116962671\n\n\n", 
                        "text/plain": "     [,1]   [,2]  [,3]  [,4] \n[1,] 0.9636 0.216 0.289 0.913\n[2,] 0.2068 0.240 0.804 0.353\n[3,] 0.0862 0.197 0.378 0.931"
                    }
                }
            ], 
            "source": "c <- matrix(runif(12), nrow = 3)\nc", 
            "execution_count": 16
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<tbody>\n\t<tr><td>-0.0371</td><td>-1.53  </td><td>-1.241 </td><td>-0.0912</td></tr>\n\t<tr><td>-1.5762</td><td>-1.43  </td><td>-0.218 </td><td>-1.0402</td></tr>\n\t<tr><td>-2.4511</td><td>-1.62  </td><td>-0.972 </td><td>-0.0710</td></tr>\n</tbody>\n</table>\n", 
                        "text/latex": "\\begin{tabular}{llll}\n\t -0.0371 & -1.53   & -1.241  & -0.0912\\\\\n\t -1.5762 & -1.43   & -0.218  & -1.0402\\\\\n\t -2.4511 & -1.62   & -0.972  & -0.0710\\\\\n\\end{tabular}\n", 
                        "text/markdown": "1. -0.0370796694327288\n2. -1.57618505218607\n3. -2.45111479148283\n4. -1.53234715632455\n5. -1.42859014447\n6. -1.62373494876132\n7. -1.24130008083671\n8. -0.218013723041351\n9. -0.972200883927947\n10. -0.0912196163787894\n11. -1.04017796712205\n12. -0.0709729194573146\n\n\n", 
                        "text/plain": "     [,1]    [,2]  [,3]   [,4]   \n[1,] -0.0371 -1.53 -1.241 -0.0912\n[2,] -1.5762 -1.43 -0.218 -1.0402\n[3,] -2.4511 -1.62 -0.972 -0.0710"
                    }
                }
            ], 
            "source": "log(c)", 
            "execution_count": 17
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "0.464871924913799", 
                        "text/latex": "0.464871924913799", 
                        "text/markdown": "0.464871924913799", 
                        "text/plain": "[1] 0.465"
                    }
                }
            ], 
            "source": "mean(c)", 
            "execution_count": 18
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### Applying functions to the rows (or columns) of a matrix"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<tbody>\n\t<tr><td> 0.459</td><td> 1.203</td><td> 1.234</td><td> 0.591</td><td>-0.281</td></tr>\n\t<tr><td>-1.261</td><td> 0.769</td><td>-1.891</td><td>-0.435</td><td> 0.812</td></tr>\n\t<tr><td>-0.527</td><td> 0.238</td><td>-0.223</td><td>-0.251</td><td>-0.208</td></tr>\n\t<tr><td>-0.557</td><td>-1.415</td><td> 0.768</td><td>-0.926</td><td> 1.451</td></tr>\n\t<tr><td>-0.374</td><td> 2.934</td><td> 0.388</td><td> 1.087</td><td> 0.841</td></tr>\n\t<tr><td>-0.604</td><td> 0.935</td><td> 0.609</td><td>-1.944</td><td>-0.866</td></tr>\n</tbody>\n</table>\n", 
                        "text/latex": "\\begin{tabular}{lllll}\n\t  0.459 &  1.203 &  1.234 &  0.591 & -0.281\\\\\n\t -1.261 &  0.769 & -1.891 & -0.435 &  0.812\\\\\n\t -0.527 &  0.238 & -0.223 & -0.251 & -0.208\\\\\n\t -0.557 & -1.415 &  0.768 & -0.926 &  1.451\\\\\n\t -0.374 &  2.934 &  0.388 &  1.087 &  0.841\\\\\n\t -0.604 &  0.935 &  0.609 & -1.944 & -0.866\\\\\n\\end{tabular}\n", 
                        "text/markdown": "1. 0.458526008426864\n2. -1.26114914943113\n3. -0.527465163206145\n4. -0.556814221798541\n5. -0.374438726216025\n6. -0.604399980981061\n7. 1.20312707053266\n8. 0.768873233405475\n9. 0.238351023241361\n10. -1.41502814138869\n11. 2.93377438660602\n12. 0.935025757358322\n13. 1.23388452172586\n14. -1.89138472179805\n15. -0.222651293117936\n16. 0.768127528360915\n17. 0.387953658262852\n18. 0.609133031558329\n19. 0.590518608276321\n20. -0.435140753285453\n21. -0.250769898499009\n22. -0.926269383723312\n23. 1.08743580325344\n24. -1.94395921590421\n25. -0.280620438336351\n26. 0.812077556671369\n27. -0.207703687008036\n28. 1.45075731996683\n29. 0.841493178155365\n30. -0.865737822129377\n\n\n", 
                        "text/plain": "     [,1]   [,2]   [,3]   [,4]   [,5]  \n[1,]  0.459  1.203  1.234  0.591 -0.281\n[2,] -1.261  0.769 -1.891 -0.435  0.812\n[3,] -0.527  0.238 -0.223 -0.251 -0.208\n[4,] -0.557 -1.415  0.768 -0.926  1.451\n[5,] -0.374  2.934  0.388  1.087  0.841\n[6,] -0.604  0.935  0.609 -1.944 -0.866"
                    }
                }
            ], 
            "source": "mydata <- matrix(rnorm(30), nrow = 6) #generates data\nmydata #displays data", 
            "execution_count": 19
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.641087154125071</li>\n\t<li>-0.401344766887558</li>\n\t<li>-0.194047803717953</li>\n\t<li>-0.135845379716559</li>\n\t<li>0.975243660012329</li>\n\t<li>-0.3739876460196</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.641087154125071\n\\item -0.401344766887558\n\\item -0.194047803717953\n\\item -0.135845379716559\n\\item 0.975243660012329\n\\item -0.3739876460196\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.641087154125071\n2. -0.401344766887558\n3. -0.194047803717953\n4. -0.135845379716559\n5. 0.975243660012329\n6. -0.3739876460196\n\n\n", 
                        "text/plain": "[1]  0.641 -0.401 -0.194 -0.136  0.975 -0.374"
                    }
                }
            ], 
            "source": "apply(mydata, 1, mean) #applies the mean function to the rows of the matrix", 
            "execution_count": 20
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.552406205780717</li>\n\t<li>1.41091655037894</li>\n\t<li>1.10728458523771</li>\n\t<li>1.08004486563685</li>\n\t<li>0.875567958448012</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.552406205780717\n\\item 1.41091655037894\n\\item 1.10728458523771\n\\item 1.08004486563685\n\\item 0.875567958448012\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.552406205780717\n2. 1.41091655037894\n3. 1.10728458523771\n4. 1.08004486563685\n5. 0.875567958448012\n\n\n", 
                        "text/plain": "[1] 0.552 1.411 1.107 1.080 0.876"
                    }
                }
            ], 
            "source": "apply(mydata, 2, sd) #applies the standard deviation function to the columns of the matrix", 
            "execution_count": 21
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>-0.515779523050443</li>\n\t<li>0.786344271134455</li>\n\t<li>0.38564073126604</li>\n\t<li>-0.255415356807863</li>\n\t<li>0.291311652370587</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item -0.515779523050443\n\\item 0.786344271134455\n\\item 0.38564073126604\n\\item -0.255415356807863\n\\item 0.291311652370587\n\\end{enumerate*}\n", 
                        "text/markdown": "1. -0.515779523050443\n2. 0.786344271134455\n3. 0.38564073126604\n4. -0.255415356807863\n5. 0.291311652370587\n\n\n", 
                        "text/plain": "[1] -0.516  0.786  0.386 -0.255  0.291"
                    }
                }
            ], 
            "source": "apply(mydata, 2, mean, trim = 0.2) #calculates trimmed column means\n# in this case, means based on the middle 60% of the data, top 20% and bottom 20% of the values are discarded", 
            "execution_count": 22
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#apply() applies a function to an array; lapply() and sapply() apply to a list", 
            "execution_count": 23
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "## A solution to the data management challenge"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th></tr></thead>\n<tbody>\n\t<tr><td> 0.013</td><td> 1.078</td><td> 0.587</td></tr>\n\t<tr><td> 1.143</td><td> 1.591</td><td> 0.037</td></tr>\n\t<tr><td>-1.026</td><td>-0.847</td><td>-0.697</td></tr>\n\t<tr><td>-1.649</td><td>-0.590</td><td>-1.247</td></tr>\n\t<tr><td>-0.068</td><td>-1.489</td><td>-0.330</td></tr>\n\t<tr><td> 0.128</td><td>-0.205</td><td> 1.137</td></tr>\n\t<tr><td>-1.049</td><td>-0.847</td><td>-1.247</td></tr>\n\t<tr><td> 1.432</td><td> 1.078</td><td> 1.504</td></tr>\n\t<tr><td> 0.832</td><td> 0.308</td><td> 0.954</td></tr>\n\t<tr><td> 0.243</td><td>-0.077</td><td>-0.697</td></tr>\n</tbody>\n</table>\n", 
                        "text/latex": "\\begin{tabular}{lll}\n Math & Science & English\\\\\n\\hline\n\t  0.013 &  1.078 &  0.587\\\\\n\t  1.143 &  1.591 &  0.037\\\\\n\t -1.026 & -0.847 & -0.697\\\\\n\t -1.649 & -0.590 & -1.247\\\\\n\t -0.068 & -1.489 & -0.330\\\\\n\t  0.128 & -0.205 &  1.137\\\\\n\t -1.049 & -0.847 & -1.247\\\\\n\t  1.432 &  1.078 &  1.504\\\\\n\t  0.832 &  0.308 &  0.954\\\\\n\t  0.243 & -0.077 & -0.697\\\\\n\\end{tabular}\n", 
                        "text/markdown": "1. 0.0126912845456687\n2. 1.14336936225068\n3. -1.02568654191811\n4. -1.64871323779639\n5. -0.0680714352904033\n6. 0.1280665985972\n7. -1.04876160472842\n8. 1.43180764737951\n9. 0.831856014311543\n10. 0.243441912648732\n11. 1.07806561759564\n12. 1.59143019740309\n13. -0.84705155668229\n14. -0.590369266778565\n15. -1.4887572814416\n16. -0.205345831922979\n17. -0.84705155668229\n18. 1.07806561759564\n19. 0.30801874788447\n20. -0.0770046869711166\n21. 0.586851445285841\n22. 0.0366782153303649\n23. -0.696886091276937\n24. -1.24705932123241\n25. -0.330103937973286\n26. 1.13702467524132\n27. -1.24705932123241\n28. 1.50380682854497\n29. 0.953633598589492\n30. -0.696886091276937\n\n\n", 
                        "text/plain": "      Math   Science English\n [1,]  0.013  1.078   0.587 \n [2,]  1.143  1.591   0.037 \n [3,] -1.026 -0.847  -0.697 \n [4,] -1.649 -0.590  -1.247 \n [5,] -0.068 -1.489  -0.330 \n [6,]  0.128 -0.205   1.137 \n [7,] -1.049 -0.847  -1.247 \n [8,]  1.432  1.078   1.504 \n [9,]  0.832  0.308   0.954 \n[10,]  0.243 -0.077  -0.697 "
                    }
                }
            ], 
            "source": "options(digits = 2) #limits the number of digits printed after the decimal place and makes the outputs easier to read\nz <- scale(grades[, 2:4]) # scales the scores so that the variables (scores) are standardized so that each test is reported\n                          # in standard deviation units rather than in their original scales\nz #output z to look at the transformed values", 
            "execution_count": 40
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>0.559202782475718</li>\n\t<li>0.923825924994712</li>\n\t<li>-0.85654139662578</li>\n\t<li>-1.16204727526912</li>\n\t<li>-0.62897755156843</li>\n\t<li>0.353248480638513</li>\n\t<li>-1.04762416088104</li>\n\t<li>1.33789336450671</li>\n\t<li>0.697836120261835</li>\n\t<li>-0.176816288533107</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 0.559202782475718\n\\item 0.923825924994712\n\\item -0.85654139662578\n\\item -1.16204727526912\n\\item -0.62897755156843\n\\item 0.353248480638513\n\\item -1.04762416088104\n\\item 1.33789336450671\n\\item 0.697836120261835\n\\item -0.176816288533107\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 0.559202782475718\n2. 0.923825924994712\n3. -0.85654139662578\n4. -1.16204727526912\n5. -0.62897755156843\n6. 0.353248480638513\n7. -1.04762416088104\n8. 1.33789336450671\n9. 0.697836120261835\n10. -0.176816288533107\n\n\n", 
                        "text/plain": " [1]  0.56  0.92 -0.86 -1.16 -0.63  0.35 -1.05  1.34  0.70 -0.18"
                    }
                }
            ], 
            "source": "score <- apply(z, 1, mean) # use the apply function to calculate the score for each student\nscore #display the score", 
            "execution_count": 41
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Student</th><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th><th scope=col>Score</th></tr></thead>\n<tbody>\n\t<tr><td>John Davis       </td><td>502              </td><td>95               </td><td>25               </td><td> 0.56            </td></tr>\n\t<tr><td>Angela Williams  </td><td>600              </td><td>99               </td><td>22               </td><td> 0.92            </td></tr>\n\t<tr><td>Bullwinkle Moose </td><td>412              </td><td>80               </td><td>18               </td><td>-0.86            </td></tr>\n\t<tr><td>David Jones      </td><td>358              </td><td>82               </td><td>15               </td><td>-1.16            </td></tr>\n\t<tr><td>Janice Markhammer</td><td>495              </td><td>75               </td><td>20               </td><td>-0.63            </td></tr>\n\t<tr><td>Cheryl Cushing   </td><td>512              </td><td>85               </td><td>28               </td><td> 0.35            </td></tr>\n\t<tr><td>Reuven Ytzrhak   </td><td>410              </td><td>80               </td><td>15               </td><td>-1.05            </td></tr>\n\t<tr><td>Greg Knox        </td><td>625              </td><td>95               </td><td>30               </td><td> 1.34            </td></tr>\n\t<tr><td>Joel England     </td><td>573              </td><td>89               </td><td>27               </td><td> 0.70            </td></tr>\n\t<tr><td>Mary Rayburn     </td><td>522              </td><td>86               </td><td>18               </td><td>-0.18            </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   Student           Math Science English Score\n1  John Davis        502  95      25       0.56\n2  Angela Williams   600  99      22       0.92\n3  Bullwinkle Moose  412  80      18      -0.86\n4  David Jones       358  82      15      -1.16\n5  Janice Markhammer 495  75      20      -0.63\n6  Cheryl Cushing    512  85      28       0.35\n7  Reuven Ytzrhak    410  80      15      -1.05\n8  Greg Knox         625  95      30       1.34\n9  Joel England      573  89      27       0.70\n10 Mary Rayburn      522  86      18      -0.18", 
                        "text/latex": "\\begin{tabular}{r|lllll}\n Student & Math & Science & English & Score\\\\\n\\hline\n\t John Davis        & 502               & 95                & 25                &  0.56            \\\\\n\t Angela Williams   & 600               & 99                & 22                &  0.92            \\\\\n\t Bullwinkle Moose  & 412               & 80                & 18                & -0.86            \\\\\n\t David Jones       & 358               & 82                & 15                & -1.16            \\\\\n\t Janice Markhammer & 495               & 75                & 20                & -0.63            \\\\\n\t Cheryl Cushing    & 512               & 85                & 28                &  0.35            \\\\\n\t Reuven Ytzrhak    & 410               & 80                & 15                & -1.05            \\\\\n\t Greg Knox         & 625               & 95                & 30                &  1.34            \\\\\n\t Joel England      & 573               & 89                & 27                &  0.70            \\\\\n\t Mary Rayburn      & 522               & 86                & 18                & -0.18            \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "grades <- cbind(grades, \"Score\" = score) # add the score as a column to the data frame\ngrades", 
            "execution_count": 42
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<dl class=dl-horizontal>\n\t<dt>80%</dt>\n\t\t<dd>0.74303408120841</dd>\n\t<dt>60%</dt>\n\t\t<dd>0.435630201373395</dd>\n\t<dt>40%</dt>\n\t\t<dd>-0.357680793747236</dd>\n\t<dt>20%</dt>\n\t\t<dd>-0.894757949476833</dd>\n</dl>\n", 
                        "text/latex": "\\begin{description*}\n\\item[80\\textbackslash{}\\%] 0.74303408120841\n\\item[60\\textbackslash{}\\%] 0.435630201373395\n\\item[40\\textbackslash{}\\%] -0.357680793747236\n\\item[20\\textbackslash{}\\%] -0.894757949476833\n\\end{description*}\n", 
                        "text/markdown": "80%\n:   0.7430340812084160%\n:   0.43563020137339540%\n:   -0.35768079374723620%\n:   -0.894757949476833\n\n", 
                        "text/plain": "  80%   60%   40%   20% \n 0.74  0.44 -0.36 -0.89 "
                    }
                }
            ], 
            "source": "y <- quantile(score, c(0.8, 0.6, 0.4, 0.2)) # gives the percentile rank of each student's performance score\ny # displays the quantiles", 
            "execution_count": 43
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Student</th><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th><th scope=col>Score</th><th scope=col>Grade</th></tr></thead>\n<tbody>\n\t<tr><td>John Davis       </td><td>502              </td><td>95               </td><td>25               </td><td> 0.56            </td><td>B                </td></tr>\n\t<tr><td>Angela Williams  </td><td>600              </td><td>99               </td><td>22               </td><td> 0.92            </td><td>A                </td></tr>\n\t<tr><td>Bullwinkle Moose </td><td>412              </td><td>80               </td><td>18               </td><td>-0.86            </td><td>D                </td></tr>\n\t<tr><td>David Jones      </td><td>358              </td><td>82               </td><td>15               </td><td>-1.16            </td><td>F                </td></tr>\n\t<tr><td>Janice Markhammer</td><td>495              </td><td>75               </td><td>20               </td><td>-0.63            </td><td>D                </td></tr>\n\t<tr><td>Cheryl Cushing   </td><td>512              </td><td>85               </td><td>28               </td><td> 0.35            </td><td>C                </td></tr>\n\t<tr><td>Reuven Ytzrhak   </td><td>410              </td><td>80               </td><td>15               </td><td>-1.05            </td><td>F                </td></tr>\n\t<tr><td>Greg Knox        </td><td>625              </td><td>95               </td><td>30               </td><td> 1.34            </td><td>A                </td></tr>\n\t<tr><td>Joel England     </td><td>573              </td><td>89               </td><td>27               </td><td> 0.70            </td><td>B                </td></tr>\n\t<tr><td>Mary Rayburn     </td><td>522              </td><td>86               </td><td>18               </td><td>-0.18            </td><td>C                </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   Student           Math Science English Score Grade\n1  John Davis        502  95      25       0.56 B    \n2  Angela Williams   600  99      22       0.92 A    \n3  Bullwinkle Moose  412  80      18      -0.86 D    \n4  David Jones       358  82      15      -1.16 F    \n5  Janice Markhammer 495  75      20      -0.63 D    \n6  Cheryl Cushing    512  85      28       0.35 C    \n7  Reuven Ytzrhak    410  80      15      -1.05 F    \n8  Greg Knox         625  95      30       1.34 A    \n9  Joel England      573  89      27       0.70 B    \n10 Mary Rayburn      522  86      18      -0.18 C    ", 
                        "text/latex": "\\begin{tabular}{r|llllll}\n Student & Math & Science & English & Score & Grade\\\\\n\\hline\n\t John Davis        & 502               & 95                & 25                &  0.56             & B                \\\\\n\t Angela Williams   & 600               & 99                & 22                &  0.92             & A                \\\\\n\t Bullwinkle Moose  & 412               & 80                & 18                & -0.86             & D                \\\\\n\t David Jones       & 358               & 82                & 15                & -1.16             & F                \\\\\n\t Janice Markhammer & 495               & 75                & 20                & -0.63             & D                \\\\\n\t Cheryl Cushing    & 512               & 85                & 28                &  0.35             & C                \\\\\n\t Reuven Ytzrhak    & 410               & 80                & 15                & -1.05             & F                \\\\\n\t Greg Knox         & 625               & 95                & 30                &  1.34             & A                \\\\\n\t Joel England      & 573               & 89                & 27                &  0.70             & B                \\\\\n\t Mary Rayburn      & 522               & 86                & 18                & -0.18             & C                \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#grade the students\ngrades$Grade[score >= y[1]] <- \"A\"\ngrades$Grade[score < y[1] & score >= y[2]] <- \"B\"\ngrades$Grade[score < y[2] & score >= y[3]] <- \"C\"\ngrades$Grade[score < y[3] & score >= y[4]] <- \"D\"\ngrades$Grade[score < y[4]] <- \"F\"\ngrades #print the dataset with the new Grade column", 
            "execution_count": 44
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol>\n\t<li><ol class=list-inline>\n\t<li>'John'</li>\n\t<li>'Davis'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Angela'</li>\n\t<li>'Williams'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Bullwinkle'</li>\n\t<li>'Moose'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'David'</li>\n\t<li>'Jones'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Janice'</li>\n\t<li>'Markhammer'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Cheryl'</li>\n\t<li>'Cushing'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Reuven'</li>\n\t<li>'Ytzrhak'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Greg'</li>\n\t<li>'Knox'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Joel'</li>\n\t<li>'England'</li>\n</ol>\n</li>\n\t<li><ol class=list-inline>\n\t<li>'Mary'</li>\n\t<li>'Rayburn'</li>\n</ol>\n</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate}\n\\item \\begin{enumerate*}\n\\item 'John'\n\\item 'Davis'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Angela'\n\\item 'Williams'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Bullwinkle'\n\\item 'Moose'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'David'\n\\item 'Jones'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Janice'\n\\item 'Markhammer'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Cheryl'\n\\item 'Cushing'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Reuven'\n\\item 'Ytzrhak'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Greg'\n\\item 'Knox'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Joel'\n\\item 'England'\n\\end{enumerate*}\n\n\\item \\begin{enumerate*}\n\\item 'Mary'\n\\item 'Rayburn'\n\\end{enumerate*}\n\n\\end{enumerate}\n", 
                        "text/markdown": "1. 1. 'John'\n2. 'Davis'\n\n\n\n2. 1. 'Angela'\n2. 'Williams'\n\n\n\n3. 1. 'Bullwinkle'\n2. 'Moose'\n\n\n\n4. 1. 'David'\n2. 'Jones'\n\n\n\n5. 1. 'Janice'\n2. 'Markhammer'\n\n\n\n6. 1. 'Cheryl'\n2. 'Cushing'\n\n\n\n7. 1. 'Reuven'\n2. 'Ytzrhak'\n\n\n\n8. 1. 'Greg'\n2. 'Knox'\n\n\n\n9. 1. 'Joel'\n2. 'England'\n\n\n\n10. 1. 'Mary'\n2. 'Rayburn'\n\n\n\n\n\n", 
                        "text/plain": "[[1]]\n[1] \"John\"  \"Davis\"\n\n[[2]]\n[1] \"Angela\"   \"Williams\"\n\n[[3]]\n[1] \"Bullwinkle\" \"Moose\"     \n\n[[4]]\n[1] \"David\" \"Jones\"\n\n[[5]]\n[1] \"Janice\"     \"Markhammer\"\n\n[[6]]\n[1] \"Cheryl\"  \"Cushing\"\n\n[[7]]\n[1] \"Reuven\"  \"Ytzrhak\"\n\n[[8]]\n[1] \"Greg\" \"Knox\"\n\n[[9]]\n[1] \"Joel\"    \"England\"\n\n[[10]]\n[1] \"Mary\"    \"Rayburn\"\n"
                    }
                }
            ], 
            "source": "#extract the first and last names\nname <- strsplit((grades$Student), \" \") #splits each row in the name column into 2 character vectors\nname", 
            "execution_count": 45
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>'Davis'</li>\n\t<li>'Williams'</li>\n\t<li>'Moose'</li>\n\t<li>'Jones'</li>\n\t<li>'Markhammer'</li>\n\t<li>'Cushing'</li>\n\t<li>'Ytzrhak'</li>\n\t<li>'Knox'</li>\n\t<li>'England'</li>\n\t<li>'Rayburn'</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 'Davis'\n\\item 'Williams'\n\\item 'Moose'\n\\item 'Jones'\n\\item 'Markhammer'\n\\item 'Cushing'\n\\item 'Ytzrhak'\n\\item 'Knox'\n\\item 'England'\n\\item 'Rayburn'\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 'Davis'\n2. 'Williams'\n3. 'Moose'\n4. 'Jones'\n5. 'Markhammer'\n6. 'Cushing'\n7. 'Ytzrhak'\n8. 'Knox'\n9. 'England'\n10. 'Rayburn'\n\n\n", 
                        "text/plain": " [1] \"Davis\"      \"Williams\"   \"Moose\"      \"Jones\"      \"Markhammer\"\n [6] \"Cushing\"    \"Ytzrhak\"    \"Knox\"       \"England\"    \"Rayburn\"   "
                    }
                }
            ], 
            "source": "Lastname <- sapply(name, \"[\", 2) #extracts the last name\nLastname", 
            "execution_count": 46
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<ol class=list-inline>\n\t<li>'John'</li>\n\t<li>'Angela'</li>\n\t<li>'Bullwinkle'</li>\n\t<li>'David'</li>\n\t<li>'Janice'</li>\n\t<li>'Cheryl'</li>\n\t<li>'Reuven'</li>\n\t<li>'Greg'</li>\n\t<li>'Joel'</li>\n\t<li>'Mary'</li>\n</ol>\n", 
                        "text/latex": "\\begin{enumerate*}\n\\item 'John'\n\\item 'Angela'\n\\item 'Bullwinkle'\n\\item 'David'\n\\item 'Janice'\n\\item 'Cheryl'\n\\item 'Reuven'\n\\item 'Greg'\n\\item 'Joel'\n\\item 'Mary'\n\\end{enumerate*}\n", 
                        "text/markdown": "1. 'John'\n2. 'Angela'\n3. 'Bullwinkle'\n4. 'David'\n5. 'Janice'\n6. 'Cheryl'\n7. 'Reuven'\n8. 'Greg'\n9. 'Joel'\n10. 'Mary'\n\n\n", 
                        "text/plain": " [1] \"John\"       \"Angela\"     \"Bullwinkle\" \"David\"      \"Janice\"    \n [6] \"Cheryl\"     \"Reuven\"     \"Greg\"       \"Joel\"       \"Mary\"      "
                    }
                }
            ], 
            "source": "Firstname <- sapply(name, \"[\", 1) #extracts the first name\nFirstname", 
            "execution_count": 47
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Student</th><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th><th scope=col>Score</th><th scope=col>Grade</th></tr></thead>\n<tbody>\n\t<tr><td>John Davis       </td><td>502              </td><td>95               </td><td>25               </td><td> 0.56            </td><td>B                </td></tr>\n\t<tr><td>Angela Williams  </td><td>600              </td><td>99               </td><td>22               </td><td> 0.92            </td><td>A                </td></tr>\n\t<tr><td>Bullwinkle Moose </td><td>412              </td><td>80               </td><td>18               </td><td>-0.86            </td><td>D                </td></tr>\n\t<tr><td>David Jones      </td><td>358              </td><td>82               </td><td>15               </td><td>-1.16            </td><td>F                </td></tr>\n\t<tr><td>Janice Markhammer</td><td>495              </td><td>75               </td><td>20               </td><td>-0.63            </td><td>D                </td></tr>\n\t<tr><td>Cheryl Cushing   </td><td>512              </td><td>85               </td><td>28               </td><td> 0.35            </td><td>C                </td></tr>\n\t<tr><td>Reuven Ytzrhak   </td><td>410              </td><td>80               </td><td>15               </td><td>-1.05            </td><td>F                </td></tr>\n\t<tr><td>Greg Knox        </td><td>625              </td><td>95               </td><td>30               </td><td> 1.34            </td><td>A                </td></tr>\n\t<tr><td>Joel England     </td><td>573              </td><td>89               </td><td>27               </td><td> 0.70            </td><td>B                </td></tr>\n\t<tr><td>Mary Rayburn     </td><td>522              </td><td>86               </td><td>18               </td><td>-0.18            </td><td>C                </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   Student           Math Science English Score Grade\n1  John Davis        502  95      25       0.56 B    \n2  Angela Williams   600  99      22       0.92 A    \n3  Bullwinkle Moose  412  80      18      -0.86 D    \n4  David Jones       358  82      15      -1.16 F    \n5  Janice Markhammer 495  75      20      -0.63 D    \n6  Cheryl Cushing    512  85      28       0.35 C    \n7  Reuven Ytzrhak    410  80      15      -1.05 F    \n8  Greg Knox         625  95      30       1.34 A    \n9  Joel England      573  89      27       0.70 B    \n10 Mary Rayburn      522  86      18      -0.18 C    ", 
                        "text/latex": "\\begin{tabular}{r|llllll}\n Student & Math & Science & English & Score & Grade\\\\\n\\hline\n\t John Davis        & 502               & 95                & 25                &  0.56             & B                \\\\\n\t Angela Williams   & 600               & 99                & 22                &  0.92             & A                \\\\\n\t Bullwinkle Moose  & 412               & 80                & 18                & -0.86             & D                \\\\\n\t David Jones       & 358               & 82                & 15                & -1.16             & F                \\\\\n\t Janice Markhammer & 495               & 75                & 20                & -0.63             & D                \\\\\n\t Cheryl Cushing    & 512               & 85                & 28                &  0.35             & C                \\\\\n\t Reuven Ytzrhak    & 410               & 80                & 15                & -1.05             & F                \\\\\n\t Greg Knox         & 625               & 95                & 30                &  1.34             & A                \\\\\n\t Joel England      & 573               & 89                & 27                &  0.70             & B                \\\\\n\t Mary Rayburn      & 522               & 86                & 18                & -0.18             & C                \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "grades", 
            "execution_count": 48
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Firstname</th><th scope=col>Lastname</th><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th><th scope=col>Score</th><th scope=col>Grade</th></tr></thead>\n<tbody>\n\t<tr><td>John      </td><td>Davis     </td><td>502       </td><td>95        </td><td>25        </td><td> 0.56     </td><td>B         </td></tr>\n\t<tr><td>Angela    </td><td>Williams  </td><td>600       </td><td>99        </td><td>22        </td><td> 0.92     </td><td>A         </td></tr>\n\t<tr><td>Bullwinkle</td><td>Moose     </td><td>412       </td><td>80        </td><td>18        </td><td>-0.86     </td><td>D         </td></tr>\n\t<tr><td>David     </td><td>Jones     </td><td>358       </td><td>82        </td><td>15        </td><td>-1.16     </td><td>F         </td></tr>\n\t<tr><td>Janice    </td><td>Markhammer</td><td>495       </td><td>75        </td><td>20        </td><td>-0.63     </td><td>D         </td></tr>\n\t<tr><td>Cheryl    </td><td>Cushing   </td><td>512       </td><td>85        </td><td>28        </td><td> 0.35     </td><td>C         </td></tr>\n\t<tr><td>Reuven    </td><td>Ytzrhak   </td><td>410       </td><td>80        </td><td>15        </td><td>-1.05     </td><td>F         </td></tr>\n\t<tr><td>Greg      </td><td>Knox      </td><td>625       </td><td>95        </td><td>30        </td><td> 1.34     </td><td>A         </td></tr>\n\t<tr><td>Joel      </td><td>England   </td><td>573       </td><td>89        </td><td>27        </td><td> 0.70     </td><td>B         </td></tr>\n\t<tr><td>Mary      </td><td>Rayburn   </td><td>522       </td><td>86        </td><td>18        </td><td>-0.18     </td><td>C         </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   Firstname  Lastname   Math Science English Score Grade\n1  John       Davis      502  95      25       0.56 B    \n2  Angela     Williams   600  99      22       0.92 A    \n3  Bullwinkle Moose      412  80      18      -0.86 D    \n4  David      Jones      358  82      15      -1.16 F    \n5  Janice     Markhammer 495  75      20      -0.63 D    \n6  Cheryl     Cushing    512  85      28       0.35 C    \n7  Reuven     Ytzrhak    410  80      15      -1.05 F    \n8  Greg       Knox       625  95      30       1.34 A    \n9  Joel       England    573  89      27       0.70 B    \n10 Mary       Rayburn    522  86      18      -0.18 C    ", 
                        "text/latex": "\\begin{tabular}{r|lllllll}\n Firstname & Lastname & Math & Science & English & Score & Grade\\\\\n\\hline\n\t John       & Davis      & 502        & 95         & 25         &  0.56      & B         \\\\\n\t Angela     & Williams   & 600        & 99         & 22         &  0.92      & A         \\\\\n\t Bullwinkle & Moose      & 412        & 80         & 18         & -0.86      & D         \\\\\n\t David      & Jones      & 358        & 82         & 15         & -1.16      & F         \\\\\n\t Janice     & Markhammer & 495        & 75         & 20         & -0.63      & D         \\\\\n\t Cheryl     & Cushing    & 512        & 85         & 28         &  0.35      & C         \\\\\n\t Reuven     & Ytzrhak    & 410        & 80         & 15         & -1.05      & F         \\\\\n\t Greg       & Knox       & 625        & 95         & 30         &  1.34      & A         \\\\\n\t Joel       & England    & 573        & 89         & 27         &  0.70      & B         \\\\\n\t Mary       & Rayburn    & 522        & 86         & 18         & -0.18      & C         \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "grades <- cbind(Firstname, Lastname, grades[,-1]) #adds the firstname and lastname columns and drops the original name column\ngrades", 
            "execution_count": 49
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th></th><th scope=col>Firstname</th><th scope=col>Lastname</th><th scope=col>Math</th><th scope=col>Science</th><th scope=col>English</th><th scope=col>Score</th><th scope=col>Grade</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>6</th><td>Cheryl    </td><td>Cushing   </td><td>512       </td><td>85        </td><td>28        </td><td> 0.35     </td><td>C         </td></tr>\n\t<tr><th scope=row>1</th><td>John      </td><td>Davis     </td><td>502       </td><td>95        </td><td>25        </td><td> 0.56     </td><td>B         </td></tr>\n\t<tr><th scope=row>9</th><td>Joel      </td><td>England   </td><td>573       </td><td>89        </td><td>27        </td><td> 0.70     </td><td>B         </td></tr>\n\t<tr><th scope=row>4</th><td>David     </td><td>Jones     </td><td>358       </td><td>82        </td><td>15        </td><td>-1.16     </td><td>F         </td></tr>\n\t<tr><th scope=row>8</th><td>Greg      </td><td>Knox      </td><td>625       </td><td>95        </td><td>30        </td><td> 1.34     </td><td>A         </td></tr>\n\t<tr><th scope=row>5</th><td>Janice    </td><td>Markhammer</td><td>495       </td><td>75        </td><td>20        </td><td>-0.63     </td><td>D         </td></tr>\n\t<tr><th scope=row>3</th><td>Bullwinkle</td><td>Moose     </td><td>412       </td><td>80        </td><td>18        </td><td>-0.86     </td><td>D         </td></tr>\n\t<tr><th scope=row>10</th><td>Mary      </td><td>Rayburn   </td><td>522       </td><td>86        </td><td>18        </td><td>-0.18     </td><td>C         </td></tr>\n\t<tr><th scope=row>2</th><td>Angela    </td><td>Williams  </td><td>600       </td><td>99        </td><td>22        </td><td> 0.92     </td><td>A         </td></tr>\n\t<tr><th scope=row>7</th><td>Reuven    </td><td>Ytzrhak   </td><td>410       </td><td>80        </td><td>15        </td><td>-1.05     </td><td>F         </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "   Firstname  Lastname   Math Science English Score Grade\n6  Cheryl     Cushing    512  85      28       0.35 C    \n1  John       Davis      502  95      25       0.56 B    \n9  Joel       England    573  89      27       0.70 B    \n4  David      Jones      358  82      15      -1.16 F    \n8  Greg       Knox       625  95      30       1.34 A    \n5  Janice     Markhammer 495  75      20      -0.63 D    \n3  Bullwinkle Moose      412  80      18      -0.86 D    \n10 Mary       Rayburn    522  86      18      -0.18 C    \n2  Angela     Williams   600  99      22       0.92 A    \n7  Reuven     Ytzrhak    410  80      15      -1.05 F    ", 
                        "text/latex": "\\begin{tabular}{r|lllllll}\n  & Firstname & Lastname & Math & Science & English & Score & Grade\\\\\n\\hline\n\t6 & Cheryl     & Cushing    & 512        & 85         & 28         &  0.35      & C         \\\\\n\t1 & John       & Davis      & 502        & 95         & 25         &  0.56      & B         \\\\\n\t9 & Joel       & England    & 573        & 89         & 27         &  0.70      & B         \\\\\n\t4 & David      & Jones      & 358        & 82         & 15         & -1.16      & F         \\\\\n\t8 & Greg       & Knox       & 625        & 95         & 30         &  1.34      & A         \\\\\n\t5 & Janice     & Markhammer & 495        & 75         & 20         & -0.63      & D         \\\\\n\t3 & Bullwinkle & Moose      & 412        & 80         & 18         & -0.86      & D         \\\\\n\t10 & Mary       & Rayburn    & 522        & 86         & 18         & -0.18      & C         \\\\\n\t2 & Angela     & Williams   & 600        & 99         & 22         &  0.92      & A         \\\\\n\t7 & Reuven     & Ytzrhak    & 410        & 80         & 15         & -1.05      & F         \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "grades <- grades[order(Lastname, Firstname), ] #sorts the data frame by last name and then by first name\ngrades # prints the final result", 
            "execution_count": 50
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "markdown", 
            "source": "## Control flow"
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "markdown", 
            "source": "### The FOR loop"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stdout", 
                    "text": "[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n"
                }
            ], 
            "source": "#the for loop executes a statement repetitively until a variable's value is no longer contained in the sequence\n#for example:\nfor (i in 1:10) {\n    print (\"Hello\")\n} #hello is printed 10 times", 
            "execution_count": 35
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### The WHILE loop"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stdout", 
                    "text": "[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n[1] \"Hello\"\n"
                }
            ], 
            "source": "#a while loop keeps running its body for as long as its condition is true\ni <- 10\nwhile (i >= 1) {\n    print(\"Hello\") #runs for i = 10, 9, ..., 1, i.e. 10 times\n    i <- i - 1\n}", 
            "execution_count": 36
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### The IF-ELSE statement"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stdout", 
                    "text": "[1] \"Grade already is a factor.\"\n"
                }
            ], 
            "source": "#the if-else control structure executes a statement if a given condition is true.\n#optionally, a different statement is executed if the condition is false\n#example 1 (if without else): convert Grade to a factor only when it is a character vector\nif (is.character(grades$Grade)) grades$Grade <- as.factor(grades$Grade)\n#example 2 (if with else): report when Grade already is a factor, otherwise convert it\nif (is.factor(grades$Grade)) {\n    print(\"Grade already is a factor.\")\n} else {\n    grades$Grade <- as.factor(grades$Grade)\n}", 
            "execution_count": 52
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "markdown", 
            "source": "### The IFELSE statement"
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#ifelse() is a compact, vectorized version of the if-else construct:\n#ifelse(test, yes, no) returns yes where test is TRUE and no where test is FALSE\n#example: label every student by comparing the Score column of grades with 0.5\noutcome <- ifelse(grades$Score > 0.5, \"Passed\", \"Failed\")", 
            "execution_count": 54
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### The SWITCH statement"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stdout", 
                    "text": "[1] \"Cheer up\"\n[1] \"There is nothing to fear\"\n"
                }
            ], 
            "source": "#switch() selects one of several alternatives based on the value of an expression. example:\nfeelings <- c(\"sad\", \"afraid\")\nfor (i in feelings) {\n    print(switch(i,\n                 happy  = \"I am glad you are happy\",\n                 afraid = \"There is nothing to fear\",\n                 sad    = \"Cheer up\",\n                 angry  = \"Calm down\"))\n}", 
            "execution_count": 55
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "## User-written functions"
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#let's say you want a function that calculates the central tendency and spread of data objects.\n#the function should give you a choice between parametric (mean and standard deviation) and\n#non-parametric (median and median absolute deviation) statistics.\n#the results should be returned as a named list.\n#additionally, the user should have a choice of printing the results or not.\n#unless otherwise specified, the function's default behavior should be to calculate parametric statistics\n#and not print the results.  one way of achieving this is provided below.", 
            "execution_count": 57
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### mystats() : a user-written function for summary statistics"
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#mystats(): central tendency and spread of a numeric vector\n#arguments:\n#  x          - the values to summarize\n#  parametric - TRUE (default): mean and standard deviation; FALSE: median and median absolute deviation\n#  print      - TRUE: also write the two statistics to the console; FALSE (default): stay silent\n#returns: a named list with the elements center and spread\nmystats <- function(x, parametric = TRUE, print = FALSE) { #TRUE/FALSE are reserved words, unlike T/F which can be reassigned\n    if (parametric) {\n        center <- mean(x)\n        spread <- sd(x)\n    } else {\n        center <- median(x)\n        spread <- mad(x)\n    }\n    #&& is the scalar logical operator intended for if() conditions (& is the element-wise one)\n    if (print && parametric) {\n        cat (\"Mean = \", center, \"\\n\", \"SD = \", spread, \"\\n\")\n    } else if (print && !parametric) {\n        cat (\"Median =\", center, \"\\n\", \"MAD = \", spread, \"\\n\")\n    }\n    result <- list(center = center, spread = spread)\n    return(result)\n}", 
            "execution_count": 59
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#to see this function in action, first generate some data (a random sample of size 500 from a normal distribution)\nset.seed(1234)\nx <- rnorm(500)", 
            "execution_count": 61
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "y <- mystats(x) #parametric stats are calculated but not printed", 
            "execution_count": 62
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stdout", 
                    "text": "Median = -0.021 \n MAD =  1 \n"
                }
            ], 
            "source": "y <- mystats(x, parametric = F, print = T) #non-parametric stats are calculated and printed", 
            "execution_count": 63
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#another example of a user-written function, this time built around switch()\n#mydate() returns today's date as a string in the format selected by type:\n#  \"long\"  - weekday, month name, day and 4-digit year; this is the default when type isn't supplied\n#  \"short\" - month-day-year with 2-digit fields\n#any other type prints a notice with cat() instead of returning a date\nmydate <- function(type = \"long\") {\n    now <- Sys.time() #read the clock once; both formats are derived from it\n    switch(type,\n           long = format(now, \"%A %B %d %Y\"),\n           short = format(now, \"%m-%d-%y\"),\n           cat(type, \"is not a recognized type\\n\"))\n}", 
            "execution_count": 64
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "'Friday September 01 2017'", 
                        "text/latex": "'Friday September 01 2017'", 
                        "text/markdown": "'Friday September 01 2017'", 
                        "text/plain": "[1] \"Friday September 01 2017\""
                    }
                }
            ], 
            "source": "#here is the function in action:\nmydate(\"long\")", 
            "execution_count": 65
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "'09-01-17'", 
                        "text/latex": "'09-01-17'", 
                        "text/markdown": "'09-01-17'", 
                        "text/plain": "[1] \"09-01-17\""
                    }
                }
            ], 
            "source": "mydate(\"short\")", 
            "execution_count": 66
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "'Friday September 01 2017'", 
                        "text/latex": "'Friday September 01 2017'", 
                        "text/markdown": "'Friday September 01 2017'", 
                        "text/plain": "[1] \"Friday September 01 2017\""
                    }
                }
            ], 
            "source": "mydate() #default type is long", 
            "execution_count": 68
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stdout", 
                    "text": "medium is not a recognized type\n"
                }
            ], 
            "source": "mydate(\"medium\") #unrecognized type: a \"not a recognized type\" notice is printed instead of a date", 
            "execution_count": 70
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "## Aggregation and reshaping"
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "markdown", 
            "source": "### Transposing a matrix"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th></th><th scope=col>mpg</th><th scope=col>cyl</th><th scope=col>disp</th><th scope=col>hp</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Mazda RX4</th><td>21 </td><td>6  </td><td>160</td><td>110</td></tr>\n\t<tr><th scope=row>Mazda RX4 Wag</th><td>21 </td><td>6  </td><td>160</td><td>110</td></tr>\n\t<tr><th scope=row>Datsun 710</th><td>23 </td><td>4  </td><td>108</td><td> 93</td></tr>\n\t<tr><th scope=row>Hornet 4 Drive</th><td>21 </td><td>6  </td><td>258</td><td>110</td></tr>\n\t<tr><th scope=row>Hornet Sportabout</th><td>19 </td><td>8  </td><td>360</td><td>175</td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "                  mpg cyl disp hp \nMazda RX4         21  6   160  110\nMazda RX4 Wag     21  6   160  110\nDatsun 710        23  4   108   93\nHornet 4 Drive    21  6   258  110\nHornet Sportabout 19  8   360  175", 
                        "text/latex": "\\begin{tabular}{r|llll}\n  & mpg & cyl & disp & hp\\\\\n\\hline\n\tMazda RX4 & 21  & 6   & 160 & 110\\\\\n\tMazda RX4 Wag & 21  & 6   & 160 & 110\\\\\n\tDatsun 710 & 23  & 4   & 108 &  93\\\\\n\tHornet 4 Drive & 21  & 6   & 258 & 110\\\\\n\tHornet Sportabout & 19  & 8   & 360 & 175\\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#transposing a matrix\ncars <- mtcars[1:5, 1:4] #store a subset of the mtcars dataset into the cars object\ncars #display the cars object", 
            "execution_count": 71
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th></th><th scope=col>Mazda RX4</th><th scope=col>Mazda RX4 Wag</th><th scope=col>Datsun 710</th><th scope=col>Hornet 4 Drive</th><th scope=col>Hornet Sportabout</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>mpg</th><td> 21</td><td> 21</td><td> 23</td><td> 21</td><td> 19</td></tr>\n\t<tr><th scope=row>cyl</th><td>  6</td><td>  6</td><td>  4</td><td>  6</td><td>  8</td></tr>\n\t<tr><th scope=row>disp</th><td>160</td><td>160</td><td>108</td><td>258</td><td>360</td></tr>\n\t<tr><th scope=row>hp</th><td>110</td><td>110</td><td> 93</td><td>110</td><td>175</td></tr>\n</tbody>\n</table>\n", 
                        "text/latex": "\\begin{tabular}{r|lllll}\n  & Mazda RX4 & Mazda RX4 Wag & Datsun 710 & Hornet 4 Drive & Hornet Sportabout\\\\\n\\hline\n\tmpg &  21 &  21 &  23 &  21 &  19\\\\\n\tcyl &   6 &   6 &   4 &   6 &   8\\\\\n\tdisp & 160 & 160 & 108 & 258 & 360\\\\\n\thp & 110 & 110 &  93 & 110 & 175\\\\\n\\end{tabular}\n", 
                        "text/markdown": "1. 21\n2. 6\n3. 160\n4. 110\n5. 21\n6. 6\n7. 160\n8. 110\n9. 22.8\n10. 4\n11. 108\n12. 93\n13. 21.4\n14. 6\n15. 258\n16. 110\n17. 18.7\n18. 8\n19. 360\n20. 175\n\n\n", 
                        "text/plain": "     Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout\nmpg   21        21            23         21             19              \ncyl    6         6             4          6              8              \ndisp 160       160           108        258            360              \nhp   110       110            93        110            175              "
                    }
                }
            ], 
            "source": "#transpose the cars object:\nt(cars)", 
            "execution_count": 72
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### Aggregating data"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>Group.1</th><th scope=col>Group.2</th><th scope=col>mpg</th><th scope=col>cyl</th><th scope=col>disp</th><th scope=col>hp</th><th scope=col>drat</th><th scope=col>wt</th><th scope=col>qsec</th><th scope=col>vs</th><th scope=col>am</th><th scope=col>gear</th><th scope=col>carb</th></tr></thead>\n<tbody>\n\t<tr><td>4   </td><td>3   </td><td>21.5</td><td>4   </td><td>120 </td><td> 97 </td><td>3.70</td><td>2.46</td><td>20.0</td><td>1.0 </td><td>0.00</td><td>3   </td><td>1.00</td></tr>\n\t<tr><td>6   </td><td>3   </td><td>19.8</td><td>6   </td><td>242 </td><td>108 </td><td>2.92</td><td>3.34</td><td>19.8</td><td>1.0 </td><td>0.00</td><td>3   </td><td>1.00</td></tr>\n\t<tr><td>8   </td><td>3   </td><td>15.1</td><td>8   </td><td>358 </td><td>194 </td><td>3.12</td><td>4.10</td><td>17.1</td><td>0.0 </td><td>0.00</td><td>3   </td><td>3.08</td></tr>\n\t<tr><td>4   </td><td>4   </td><td>26.9</td><td>4   </td><td>103 </td><td> 76 </td><td>4.11</td><td>2.38</td><td>19.6</td><td>1.0 </td><td>0.75</td><td>4   </td><td>1.50</td></tr>\n\t<tr><td>6   </td><td>4   </td><td>19.8</td><td>6   </td><td>164 </td><td>116 </td><td>3.91</td><td>3.09</td><td>17.7</td><td>0.5 </td><td>0.50</td><td>4   </td><td>4.00</td></tr>\n\t<tr><td>4   </td><td>5   </td><td>28.2</td><td>4   </td><td>108 </td><td>102 </td><td>4.10</td><td>1.83</td><td>16.8</td><td>0.5 </td><td>1.00</td><td>5   </td><td>2.00</td></tr>\n\t<tr><td>6   </td><td>5   </td><td>19.7</td><td>6   </td><td>145 </td><td>175 </td><td>3.62</td><td>2.77</td><td>15.5</td><td>0.0 </td><td>1.00</td><td>5   </td><td>6.00</td></tr>\n\t<tr><td>8   </td><td>5   </td><td>15.4</td><td>8   </td><td>326 </td><td>300 </td><td>3.88</td><td>3.37</td><td>14.6</td><td>0.0 </td><td>1.00</td><td>5   </td><td>6.00</td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "  Group.1 Group.2 mpg  cyl disp hp  drat wt   qsec vs  am   gear carb\n1 4       3       21.5 4   120   97 3.70 2.46 20.0 1.0 0.00 3    1.00\n2 6       3       19.8 6   242  108 2.92 3.34 19.8 1.0 0.00 3    1.00\n3 8       3       15.1 8   358  194 3.12 4.10 17.1 0.0 0.00 3    3.08\n4 4       4       26.9 4   103   76 4.11 2.38 19.6 1.0 0.75 4    1.50\n5 6       4       19.8 6   164  116 3.91 3.09 17.7 0.5 0.50 4    4.00\n6 4       5       28.2 4   108  102 4.10 1.83 16.8 0.5 1.00 5    2.00\n7 6       5       19.7 6   145  175 3.62 2.77 15.5 0.0 1.00 5    6.00\n8 8       5       15.4 8   326  300 3.88 3.37 14.6 0.0 1.00 5    6.00", 
                        "text/latex": "\\begin{tabular}{r|lllllllllllll}\n Group.1 & Group.2 & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n\\hline\n\t 4    & 3    & 21.5 & 4    & 120  &  97  & 3.70 & 2.46 & 20.0 & 1.0  & 0.00 & 3    & 1.00\\\\\n\t 6    & 3    & 19.8 & 6    & 242  & 108  & 2.92 & 3.34 & 19.8 & 1.0  & 0.00 & 3    & 1.00\\\\\n\t 8    & 3    & 15.1 & 8    & 358  & 194  & 3.12 & 4.10 & 17.1 & 0.0  & 0.00 & 3    & 3.08\\\\\n\t 4    & 4    & 26.9 & 4    & 103  &  76  & 4.11 & 2.38 & 19.6 & 1.0  & 0.75 & 4    & 1.50\\\\\n\t 6    & 4    & 19.8 & 6    & 164  & 116  & 3.91 & 3.09 & 17.7 & 0.5  & 0.50 & 4    & 4.00\\\\\n\t 4    & 5    & 28.2 & 4    & 108  & 102  & 4.10 & 1.83 & 16.8 & 0.5  & 1.00 & 5    & 2.00\\\\\n\t 6    & 5    & 19.7 & 6    & 145  & 175  & 3.62 & 2.77 & 15.5 & 0.0  & 1.00 & 5    & 6.00\\\\\n\t 8    & 5    & 15.4 & 8    & 326  & 300  & 3.88 & 3.37 & 14.6 & 0.0  & 1.00 & 5    & 6.00\\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#aggregating data using the aggregate() function\noptions(digits = 3) #limit printed numbers to 3 significant digits for easy readability\n#group by number of cylinders (Group.1) and number of gears (Group.2) and take the mean of every column;\n#the grouping columns are referenced as mtcars$... so the dataset never has to be attach()ed and detach()ed\naggdata <- aggregate(mtcars, by = list(mtcars$cyl, mtcars$gear), FUN = mean, na.rm = TRUE)\naggdata #print the results", 
            "execution_count": 73
        }, 
        {
            "metadata": {}, 
            "cell_type": "markdown", 
            "source": "### The reshape2 package"
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "output_type": "stream", 
                    "name": "stderr", 
                    "text": "Installing package into \u2018/gpfs/global_fs01/sym_shared/YPProdSpark/user/s17c-9f3318fc11f06c-d37a4b9405b6/R/libs\u2019\n(as \u2018lib\u2019 is unspecified)\n"
                }
            ], 
            "source": "#reshape2 package is a tremendously versatile approach to both restructuring and aggregating datasets\n#install the reshape2 package only when it is not already available,\n#so that re-running the notebook does not download and reinstall it every time\nif (!requireNamespace(\"reshape2\", quietly = TRUE)) {\n    install.packages(\"reshape2\")\n}", 
            "execution_count": 74
        }, 
        {
            "metadata": {
                "collapsed": true
            }, 
            "cell_type": "code", 
            "outputs": [], 
            "source": "#load the reshape2 library\nlibrary(reshape2)", 
            "execution_count": 75
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>ID</th><th scope=col>Time</th><th scope=col>X1</th><th scope=col>X2</th></tr></thead>\n<tbody>\n\t<tr><td>1</td><td>1</td><td>5</td><td>6</td></tr>\n\t<tr><td>1</td><td>2</td><td>3</td><td>5</td></tr>\n\t<tr><td>2</td><td>1</td><td>6</td><td>1</td></tr>\n\t<tr><td>2</td><td>2</td><td>2</td><td>4</td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "  ID Time X1 X2\n1 1  1    5  6 \n2 1  2    3  5 \n3 2  1    6  1 \n4 2  2    2  4 ", 
                        "text/latex": "\\begin{tabular}{r|llll}\n ID & Time & X1 & X2\\\\\n\\hline\n\t 1 & 1 & 5 & 6\\\\\n\t 1 & 2 & 3 & 5\\\\\n\t 2 & 1 & 6 & 1\\\\\n\t 2 & 2 & 2 & 4\\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#create a dataset\nid <- c(1, 1, 2, 2)\ntime <- c(1, 2, 1, 2)\nx1 <- c(5, 3, 6, 2)\nx2 <- c(6, 5, 1, 4)\nmydata <- data.frame(\"ID\" = id, \"Time\" = time, \"X1\" = x1, \"X2\" = x2)\nmydata", 
            "execution_count": 76
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>ID</th><th scope=col>Time</th><th scope=col>variable</th><th scope=col>value</th></tr></thead>\n<tbody>\n\t<tr><td>1 </td><td>1 </td><td>X1</td><td>5 </td></tr>\n\t<tr><td>1 </td><td>2 </td><td>X1</td><td>3 </td></tr>\n\t<tr><td>2 </td><td>1 </td><td>X1</td><td>6 </td></tr>\n\t<tr><td>2 </td><td>2 </td><td>X1</td><td>2 </td></tr>\n\t<tr><td>1 </td><td>1 </td><td>X2</td><td>6 </td></tr>\n\t<tr><td>1 </td><td>2 </td><td>X2</td><td>5 </td></tr>\n\t<tr><td>2 </td><td>1 </td><td>X2</td><td>1 </td></tr>\n\t<tr><td>2 </td><td>2 </td><td>X2</td><td>4 </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "  ID Time variable value\n1 1  1    X1       5    \n2 1  2    X1       3    \n3 2  1    X1       6    \n4 2  2    X1       2    \n5 1  1    X2       6    \n6 1  2    X2       5    \n7 2  1    X2       1    \n8 2  2    X2       4    ", 
                        "text/latex": "\\begin{tabular}{r|llll}\n ID & Time & variable & value\\\\\n\\hline\n\t 1  & 1  & X1 & 5 \\\\\n\t 1  & 2  & X1 & 3 \\\\\n\t 2  & 1  & X1 & 6 \\\\\n\t 2  & 2  & X1 & 2 \\\\\n\t 1  & 1  & X2 & 6 \\\\\n\t 1  & 2  & X2 & 5 \\\\\n\t 2  & 1  & X2 & 1 \\\\\n\t 2  & 2  & X2 & 4 \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#\"melt\"ing the dataset using the \"melt\" function: ID and Time are kept as identifier columns\n#and the remaining columns (X1, X2) are stacked into variable/value pairs\n#the argument name id.vars is spelled out in full instead of relying on partial matching of \"id\"\nmd <- melt(mydata, id.vars = c(\"ID\", \"Time\"))\nmd", 
            "execution_count": 78
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>ID</th><th scope=col>Time</th><th scope=col>X1</th><th scope=col>X2</th></tr></thead>\n<tbody>\n\t<tr><td>1</td><td>1</td><td>5</td><td>6</td></tr>\n\t<tr><td>1</td><td>2</td><td>3</td><td>5</td></tr>\n\t<tr><td>2</td><td>1</td><td>6</td><td>1</td></tr>\n\t<tr><td>2</td><td>2</td><td>2</td><td>4</td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "  ID Time X1 X2\n1 1  1    5  6 \n2 1  2    3  5 \n3 2  1    6  1 \n4 2  2    2  4 ", 
                        "text/latex": "\\begin{tabular}{r|llll}\n ID & Time & X1 & X2\\\\\n\\hline\n\t 1 & 1 & 5 & 6\\\\\n\t 1 & 2 & 3 & 5\\\\\n\t 2 & 1 & 6 & 1\\\\\n\t 2 & 2 & 2 & 4\\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#\"cast\"ing the melted dataset back to wide format using the dcast() function\n#formula ID + Time ~ variable: one row per ID/Time pair, one column per variable (X1, X2)\nnewdata <- dcast(md, ID + Time ~ variable)\nnewdata", 
            "execution_count": 80
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>ID</th><th scope=col>variable</th><th scope=col>1</th><th scope=col>2</th></tr></thead>\n<tbody>\n\t<tr><td>1 </td><td>X1</td><td>5 </td><td>3 </td></tr>\n\t<tr><td>1 </td><td>X2</td><td>6 </td><td>5 </td></tr>\n\t<tr><td>2 </td><td>X1</td><td>6 </td><td>2 </td></tr>\n\t<tr><td>2 </td><td>X2</td><td>1 </td><td>4 </td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "  ID variable 1 2\n1 1  X1       5 3\n2 1  X2       6 5\n3 2  X1       6 2\n4 2  X2       1 4", 
                        "text/latex": "\\begin{tabular}{r|llll}\n ID & variable & 1 & 2\\\\\n\\hline\n\t 1  & X1 & 5  & 3 \\\\\n\t 1  & X2 & 6  & 5 \\\\\n\t 2  & X1 & 6  & 2 \\\\\n\t 2  & X2 & 1  & 4 \\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#another result using a different dcast() formula\n#ID + variable ~ Time: one row per ID/variable pair, one column per Time value (1, 2)\nnewdata <- dcast(md, ID + variable ~ Time)\nnewdata", 
            "execution_count": 81
        }, 
        {
            "metadata": {}, 
            "cell_type": "code", 
            "outputs": [
                {
                    "metadata": {}, 
                    "output_type": "display_data", 
                    "data": {
                        "text/html": "<table>\n<thead><tr><th scope=col>ID</th><th scope=col>X1_1</th><th scope=col>X1_2</th><th scope=col>X2_1</th><th scope=col>X2_2</th></tr></thead>\n<tbody>\n\t<tr><td>1</td><td>5</td><td>3</td><td>6</td><td>5</td></tr>\n\t<tr><td>2</td><td>6</td><td>2</td><td>1</td><td>4</td></tr>\n</tbody>\n</table>\n", 
                        "text/plain": "  ID X1_1 X1_2 X2_1 X2_2\n1 1  5    3    6    5   \n2 2  6    2    1    4   ", 
                        "text/latex": "\\begin{tabular}{r|lllll}\n ID & X1\\_1 & X1\\_2 & X2\\_1 & X2\\_2\\\\\n\\hline\n\t 1 & 5 & 3 & 6 & 5\\\\\n\t 2 & 6 & 2 & 1 & 4\\\\\n\\end{tabular}\n"
                    }
                }
            ], 
            "source": "#yet another result using a different dcast() formula\n#ID ~ variable + Time: one row per ID, one column per variable/Time combination (X1_1, X1_2, X2_1, X2_2)\nnewdata <- dcast(md, ID ~ variable + Time)\nnewdata", 
            "execution_count": 82
        }
    ], 
    "nbformat_minor": 1, 
    "nbformat": 4
 }
No results found