# Load the raw data: file.choose() lets the user pick the file
# interactively, and the file is parsed as pipe-delimited text
cancer <- read.csv(file = file.choose(), sep = "|")
# Lower-case every column name
cancer <- setNames(cancer, tolower(names(cancer)))

# Inspect the structure of the data as it was read in
str(cancer)

# Treat the placeholder symbols "~" and "-", and empty strings, as missing:
# build one logical mask over every cell and set the matching cells to NA
cancer[cancer == "~" | cancer == "" | cancer == "-"] <- NA

# Remove totals -------------------------------------------------------------

# Drop the rows that hold totals over sex, race or cancer site
# (subset() also drops rows where the condition evaluates to NA)
cancer <- subset(
  cancer,
  sex != "Male and Female" &
    race != "All Races" &
    site != "All Cancer Sites Combined"
)
# Drop columns 7 to 9 (selected by position)
cancer <- cancer[, -c(7, 8, 9)]

# Drop every row whose area is one of the listed aggregate areas
aggregates <- c(
  "United States",
  "South Atlantic",
  "South",
  "Seattle-Puget Sound",
  "San Jose-Monterey",
  "San Francisco-Oakland",
  "Los Angeles",
  "Detroit",
  "Atlanta",
  "East South Central",
  "East North Central",
  "West North Central",
  "West South Central",
  "Northeast",
  "Mountain",
  "Midwest",
  "Middle Atlantic"
)
cancer <- subset(cancer, !(area %in% aggregates))

# Drop the order-statistics variables (columns 12 to 14 at this point);
# they can easily be re-calculated in R whenever they are needed
cancer <- cancer[, -c(12, 13, 14)]


# Make sure variables are the correct type ----------------------------------

# check again
str(cancer)

# Some variables are still factors (character vectors in R >= 4.0) instead
# of numeric variables.
# Easiest: write the data out and read it in again, so that read.csv()
# re-detects each column's type.
# write.csv() is used because it doubles embedded quote characters, which is
# the convention read.csv() understands; write.table() escapes them with a
# backslash by default. FALSE is spelled out because F can be reassigned.
write.csv(cancer, "cancer-clean.csv", row.names = FALSE)
# where did it write to?
getwd()
cancer <- read.csv("cancer-clean.csv")

# Harder: try to format the factor into a numeric variable
# why does as.numeric(cancer$age.adjusted.rate) not work?
# (the column names were lower-cased when the data was first read in)
# - what does it do?
# Hint: as.numeric() on a factor returns the internal integer level codes,
# not the printed values; as.numeric(as.character(x)) converts the labels.

# check again
str(cancer)
dim(cancer)


# Reshape the data into a convenient form for analysis --------------------

library(reshape)
# Long format: columns 7 to 11 become the measured variables
# (argument name spelled out in full rather than partially matched)
cancer.long <- melt(cancer, measure.vars = 7:11)
# Remove "crude." from the measure names
cancer.long$variable <- gsub("crude\\.", "", cancer.long$variable)

# Back to wide format: one row per year/site/area/sex/race combination and
# one column per variable/event.type combination; cells that receive more
# than one value are aggregated with mean()
cancer <- cast(
  cancer.long,
  year + site + area + sex + race ~ variable + event.type,
  fun.aggregate = mean
)
