# Load data ------------------------------------------------------ # Open the servey survey data off the website # (but I recommend you download a copy on to your computer) ss <- read.csv("http://had.co.nz/stat480/data/server-survey.csv") # Give each record a unique id ss$id <- 1:nrow(ss) # Basic summaries---------------------------------------------------- # # Certain summaries only make sense for continuous variables # so we'll create a data frame that only contains them. # # We've seen sapply before - it works like lapply, but it will # simplify (that's what the s stands for) the result into a simple # vector or array if possible. Also remember that a data.frame is # a list of vectors, so cont will be a logical vector indicating # if each variable is numeric (ie. continuous) or not cont <- sapply(ss, is.numeric) scont <- ss[, cont] # Melt the data so we can perform some basic summary statistics library(reshape) contm <- melt(scont, id="id", preserve=FALSE) # This doesn't look very good because of the scientific notation cast(contm, variable ~ ., c(length, min, max, mean, sd)) # so we'll tell R not to use scientific notation unless it really # has too, and drop off the first row, which isn't very interesting # anyway options(scipen = 9) cast(contm, variable ~ ., c(length, min, max, mean, sd))[-1, ] # Explore with GGobi ---------------------------------------------- # It's often useful to do some basic exploration in GGobi as well: # it's very easy to switch between different plots. library(rggobi) ggobi(ss) # Focus on asian_prop, sex, state, and pcttip ------------------------ # We don't have time to fix all the variables in class, so we'll # focus on four with characteristics representative of all the others ## Sex table(ss$sex, exclude=NULL) # The 2 represents unknown, so we'll lump it in with the missings ss$sex[ss$sex == 2] <- NA table(ss$sex, exclude=NULL) ss$sex <- factor(ss$sex, levels=c(0,1), labels=c("male", "female")) # Worked out from look at hair colour/baldness ## State table(ss$state) # What a mess! table(tolower(ss$state)) # A little better ss$state <- tolower(ss$state) # Let's remove all non-US states, and standardise US to their # two letter abbreviations. We can do this in two ways. The # first is simple, but time consuming: fix each case individually. ss$state[ss$state == "(province) ontario, canada"] <- NA ss$state[ss$state == "alaska"] <- "AK" # and so on. # Anothe way is to use subsetting creatively # First we'll make up a named vector of all the states and their # abbreviations (I got mine from http://www.usps.com/ncsc/lookups/usps_abbreviations.html) states <- c("alabama"="al", "alaska"="ak", "american samoa"="as", "arizona"="az", "arkansas"="ar", "california"="ca", "colorado"="co", "connecticut"="ct", "delaware"="de", "district of columbia"="dc", "federated states of micronesia"="fm", "florida"="fl", "georgia"="ga", "guam"="gu", "hawaii"="hi", "idaho"="id", "illinois"="il", "indiana"="in", "iowa"="ia", "kansas"="ks", "kentucky"="ky", "louisiana"="la", "maine"="me", "marshall islands"="mh", "maryland"="md", "massachusetts"="ma", "michigan"="mi", "minnesota"="mn", "mississippi"="ms", "missouri"="mo", "montana"="mt", "nebraska"="ne", "nevada"="nv", "new hampshire"="nh", "new jersey"="nj", "new mexico"="nm", "new york"="ny", "north carolina"="nc", "north dakota"="nd", "northern mariana islands"="mp", "ohio"="oh", "oklahoma"="ok", "oregon"="or", "palau"="pw", "pennsylvania"="pa", "puerto rico"="pr", "rhode island"="ri", "south carolina"="sc", "south dakota"="sd", "tennessee"="tn", "texas"="tx", "utah"="ut", "vermont"="vt", "virgin islands"="vi", "virginia"="va", "washington"="wa", "west virginia"="wv", "wisconsin"="wi", "wyoming"="wy") # Next. find the states in that list longstates <- ss$state %in% names(states) ss$state[longstates] # Then subset to get the abbreviations states[ss$state[longstates]] # Then replace existing ss$state[longstates] <- states[ss$state[longstates]] table(ss$state) # Not perfect, but a lot better. You might want to clean up # the few typos by hand. # Lastly we'll get rid of all the non US states by setting # them to missing ss$state[!ss$state %in% states] <- NA # (Alternatively, you might want to aggregate by country instead) # And turn it into a factor ss$state <- factor(ss$state, levels=states) ## Percent tip table(ss$pcttip) # A couple of problems: # * some are proportions and some a percents # * some awfully big tips! # Turn proportions into percents props <- !is.na(ss$pcttip) & na.omit(ss$pcttip < 1) ss$pcttip[props] <- ss$pcttip[props] * 100 table(ss$pcttip) table(round_any(ss$pcttip, 1), exclude=NULL) # Looks like some awfully bad tippers too! # Let's investigate the big tips ss <- ss[!is.na(ss$pcttip) & ss$pcttip > 70, ] # They look suspicious to me - do people really tip 100% at IHOP?! # For this class, we'll just drop those records, but I suggest # you explore them further ss <- ss[!(!is.na(ss$pcttip) & ss$pcttip > 70), ] ## Asian prop table(ss$asian_prop, exclude=NULL) # Again propotions instead of percentages, plus a few wacko numbers props <- !is.na(ss$asian_prop) & ss$asian_prop < 1 ss$asian_prop[props] <- ss$asian_prop[props] * 100 ss[!is.na(ss$asian_prop) & ss$asian_prop > 100, ] ss[!is.na(ss$asian_prop) & ss$asian_prop > 100, "asian_prop"] <- NA qplot(ss$asian_prop, type="histogram", breaks=seq(0, 100, by=5)) qplot(ss$asian_prop, type="histogram", breaks=seq(0, 100, by=1)) # Quick state by state analysis ---------------------------------------- ssstatem <- melt(ss, id=c("id", "state"), measure=c("pcttip", "asian_prop"), preserve=FALSE) cast(ssstatem, state ~ variable, length) cast(ssstatem, state ~ variable, c(length, mean)) means <- cast(ssstatem, state ~ variable, mean) qplot(asian_prop, pcttip, data=means, main="State means") qplot(asian_prop, pcttip, data=ss, main="Individual restaurants") # Might want to think about making the size of the points proportional # to the number of observations, so we can see which ones have less dat