# Introductory tour of ggplot2 using the built-in `diamonds` data set.
# Each call below draws one plot; run them one at a time and compare.
# NOTE: qplot() is soft-deprecated in recent ggplot2 releases but still works;
# it is kept here because it is the interface this walkthrough teaches.

# Install the ggplot2 package (don't need to do this on lab machines)
# (if you scroll down to the bottom of the list there's a mirror in
# Iowa that should be fast and reliable)
# Only install when the package is missing, so re-running the script does
# not trigger a fresh download every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Load the ggplot2 package
#  - do this every time you want to use the ggplot2 package
library(ggplot2)

# Examining distributions
# ======================================================

# Histograms ---------------------------------------

qplot(price, data = diamonds, geom = "histogram")

# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price, data = diamonds, geom = "histogram", binwidth = 500)
# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price, data = diamonds, geom = "histogram", binwidth = 100)
# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price, data = diamonds, geom = "histogram", binwidth = 50)

# Investigating relationships
# ======================================================

# Two continuous variables -----------------------------------

# Use a scatterplot
qplot(price, carat, data = diamonds)
qplot(log(carat), log(price), data = diamonds)
qplot(carat, price / carat, data = diamonds)

# Map extra variables to other aesthetic attributes
qplot(carat, price, data = diamonds, colour = color)
qplot(carat, price, data = diamonds, size = carat)
qplot(carat, price, data = diamonds, shape = cut)

# Facetting displays the same plot for different subsets of the data.
# Use the facets argument: rows on the left-hand side of the formula,
# columns on the right-hand side.
qplot(price, carat, data = diamonds, facets = . ~ color)
qplot(price, carat, data = diamonds, facets = color ~ clarity)

# One continuous, one categorical -----------------------------------

# There is too much overplotting in a scatterplot to be very useful;
# we can basically only see the range of the data
qplot(color, price / carat, data = diamonds)

# We could spread the points out a little bit, by adding random jitter.
# This helps a little bit, but it's still hard to see what's going on.
# (qplot()'s `position` argument is deprecated, so ask for the jitter geom.)
qplot(color, price / carat, data = diamonds, geom = "jitter")

# To control the amount of jitter, use the full ggplot() interface.
# NOTE(review): the original passed `x = 2`, which is not an argument of the
# current position_jitter(); `width` is assumed to be the intended one.
# Confirm the amount: width is in data units and adjacent colours are 1 apart.
ggplot(diamonds, aes(color, price / carat)) +
  geom_point(position = position_jitter(width = 2))

# Another technique is to visualise the conditional distribution, i.e.
# for a given colour, what is the distribution of price/carat.
# A boxplot provides a simple summary of the distribution:
qplot(color, price / carat, data = diamonds, geom = "boxplot")

# Or we can use a histogram for each colour to look at the shape of
# the distribution in more detail.
# The argument must be spelled `facets` in full: it comes after `...` in
# qplot(), so R will not partial-match an abbreviated name.
qplot(price / carat, data = diamonds, facets = color ~ ., geom = "histogram")

# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(
  price / carat,
  data = diamonds,
  facets = color ~ .,
  geom = "histogram",
  binwidth = 100
)

# Two categorical variables -----------------------------------

# Use a fluctuation-style diagram - a visualisation of the contingency
# table of cut by color, with one square per cell sized by its count.
# (ggfluctuation() is no longer part of ggplot2; geom_count() tallies the
# observations at each cut/color combination.)
# NOTE(review): axis orientation relative to the old ggfluctuation() output
# has not been verified.
ggplot(diamonds, aes(cut, color)) +
  geom_count(shape = 15)

# Zooming ---------------------------------------

# You can zoom in on an interesting area of the plot using the
# xlim and ylim arguments:
qplot(price, data = diamonds, geom = "histogram", xlim = c(0, 5000))