# Clean a pipe-delimited cancer statistics file and reshape it for analysis.
#
# Steps: read the raw file, turn placeholder symbols into NA, drop aggregate
# (total) rows and unneeded columns, round-trip through CSV so column types
# are re-inferred, then melt/cast into one row per
# year/site/area/sex/race combination.

# Packages ------------------------------------------------------------------
library(reshape)

# Read data into R ----------------------------------------------------------
# file.choose() opens an interactive file picker; the raw file is "|"-delimited.
cancer <- read.csv(file.choose(), sep = "|")

# Make all the variable names lower case
names(cancer) <- tolower(names(cancer))

# Check data format
str(cancer)

# Remove unwanted symbols and blanks ----------------------------------------
# "~", "" and "-" are treated as placeholders for missing values.
for (placeholder in c("~", "", "-")) {
  cancer[cancer == placeholder] <- NA
}

# Remove totals -------------------------------------------------------------
# One at a time
cancer <- subset(cancer, sex != "Male and Female")
cancer <- subset(cancer, race != "All Races")
cancer <- subset(cancer, site != "All Cancer Sites Combined")

# Drop columns 7 to 9 by position.
# NOTE(review): positional drops depend on the column order of the input file;
# confirm against str(cancer) before running on a new extract.
cancer <- cancer[, -(7:9)]

# Many at once
aggregates <- c(
  "United States", "South Atlantic", "South", "Seattle-Puget Sound",
  "San Jose-Monterey", "San Francisco-Oakland", "Los Angeles", "Detroit",
  "Atlanta", "East South Central", "East North Central",
  "West North Central", "West South Central", "Northeast", "Mountain",
  "Midwest", "Middle Atlantic"
)
cancer <- subset(cancer, !(area %in% aggregates))

# Get rid of the order statistics variables -
# we can easily re-calculate them any time in R
cancer <- cancer[, -(12:14)]

# Make sure variables are the correct type ----------------------------------
# Check again
str(cancer)

# Still some factors (or character columns, depending on the R version)
# instead of numeric variables.
# Easiest: write out & read in again so read.csv() re-infers the types.
write.table(cancer, "cancer-clean.csv", sep = ",", row.names = FALSE)

# Where did it write to?
getwd()
cancer <- read.csv("cancer-clean.csv")

# Harder: try to format the factor into a numeric variable.
# Why does as.numeric(cancer$age.adjusted.rate) not work?
# - what does it do?
# (The column names were lower-cased above, hence the lower-case name here.)

# Check again
str(cancer)
dim(cancer)

# Reshape the data into a convenient form for analysis ----------------------
# Melt columns 7 to 11 into long format (one row per measured value).
cancer.long <- melt(cancer, measure.vars = 7:11)

# Strip the "crude." prefix from the melted variable names
cancer.long$variable <- gsub("crude\\.", "", cancer.long$variable)

# Cast back to wide format: one column per variable/event.type combination,
# averaging when several values fall into the same cell.
cancer <- cast(
  cancer.long,
  year + site + area + sex + race ~ variable + event.type,
  fun.aggregate = mean
)