# -----------------------------------------------------------------------------
# Redo the analysis in Problem 1 in Assignment 4 (South African Heart Data)
# [the classification tree], by using the obtained principal components as the new covariates,
# together with the categorical variable "famhist", and treating "chd" as the response variable.
# In particular, do classification using a classification tree.
# Assemble the modelling data set: chd (response) and famhist taken from SA,
# followed by the principal-component columns held in `newdata`
SA[, c("chd", "famhist")] # Echo the two columns that are carried over from SA
SA_newdata_post_PCA <- cbind(
  SA[, c("chd", "famhist")],
  newdata
) # Column order: chd, famhist, then every column of newdata
head(SA_newdata_post_PCA, n = 6L) # Echo the first six rows as a sanity check
# Fit the classification tree: chd against every other column of the new data
SA_class_PCA <- rpart(
  chd ~ .,
  data = SA_newdata_post_PCA,
  method = "class"
)
# Plot the classification tree with the base rpart plot method.
# plot() only draws the branches, so text() is needed to add the labels.
png("c:/Users/Nate/Git/riemann/classification_PCA_tree_post.png") # Open the PNG device
plot(SA_class_PCA, # Draw the tree skeleton
main = "Post PCA Classification Tree - South African Heart Data") # Title of the plot
text(SA_class_PCA) # Add the split and leaf labels to the skeleton
dev.off() # Close the device so the file is written
# Plot a fancy classification tree
png("c:/Users/Nate/Git/riemann/classification_PCA_tree_post_fancy.png") # Open the PNG device
fancyRpartPlot(SA_class_PCA) # Draws a complete, already labelled tree (rattle package)
# Deliberately no text(SA_class_PCA) here: it would print a second set of
# labels on top of the fancy plot.
dev.off() # Close the device so the file is written
# Misclassification rate of the tree, evaluated on the same data it was fitted to
# Predicted class for every patient (outer parentheses echo the result)
(SA_class_PCA_predict <- predict(
  SA_class_PCA,
  newdata = SA_newdata_post_PCA,
  type = "class"
))
# Confusion table: rows are the observed chd values, columns the predicted class
misclass_table_cart <- table(
  Actual = SA_newdata_post_PCA$chd,
  Classified = SA_class_PCA_predict
)
(misclass_table_cart <- as.matrix(misclass_table_cart)) # Store and echo as a matrix
# The off-diagonal cells [1,2] and [2,1] are the misclassified patients
(rate_PCA <- sum(misclass_table_cart[cbind(1:2, 2:1)]) / sum(misclass_table_cart))
# -----------------------------------------------------------------------------
# Redo the analysis in Problem 3 in Assignment 3 (South African Heart Data), using the
# obtained principal components as the new covariates, together with the categorical
# variable "famhist", and treating "chd" as the response variable.
# To complete this question, I will build off the analysis that was done in assignment
# 3 as well as all relevant code.
# I note here that the new dataset is called SA_newdata_post_PCA
# Make every column numeric so it can be used in the GAM fits further down.
# famhist: as.numeric() on a factor returns the level codes (1, 2), which is
# all that is needed for a two-level covariate.
SA_newdata_post_PCA$famhist <- as.numeric(SA_newdata_post_PCA$famhist)
# chd: recode the two observed values to 0/1 explicitly. A plain as.numeric()
# would turn a factor response into the codes 1/2, which does not match the
# 0.5 cut-off used when the fitted values are converted into classes.
# A numeric 0/1 chd is left unchanged by this recoding.
SA_newdata_post_PCA$chd <- as.numeric(factor(SA_newdata_post_PCA$chd)) - 1
# Ensure that the eight principal-component inputs are numeric
pc_columns <- paste0("PC", 1:8)
SA_newdata_post_PCA[pc_columns] <- lapply(SA_newdata_post_PCA[pc_columns], as.numeric)
str(SA_newdata_post_PCA) # Let's review the structure of the data
SA_newdata_post_PCA <- data.frame(SA_newdata_post_PCA) # Ensure that the data is a data frame
# dimnames(SA_newdata_post_PCA)
# Generate a pairs plot of the data before using it in the GAM
png("c:/Users/Nate/Git/riemann/classification_PCA_tree_post_PCA_Pairs.png") # Start the PNG
# Colour each point by its chd class (first class red, second class green).
# Passing bg = c("red", "green") directly would recycle the two colours by row
# position, so the colours would carry no information about chd.
pairs(SA_newdata_post_PCA, pch = 21,
      bg = c("red", "green")[as.integer(factor(SA_newdata_post_PCA$chd))])
dev.off() # Close the device so the file is written
# Let's switch to GAM models now ...
# Note that, in order to make the models work, I needed to make all the inputs numeric & continuous
# The first are some purely parametric models
SA_class_GAM <- gam(chd ~ PC1 + PC2 + PC3 + famhist, data=SA_newdata_post_PCA)
summary(SA_class_GAM)
# Gather together the predicted fitted values
(SA_GAM_predict_fit <- as.matrix(SA_class_GAM$fitted.values))
# Convert the fitted values into class labels using the same coding as chd.
# The two codes are read from the data (0/1 or 1/2, depending on how chd was
# converted to numeric) and the cut-off is their midpoint, so a 0/1 response
# gives the usual 0.5 threshold.
chd_codes <- sort(unique(SA_newdata_post_PCA$chd))
stopifnot(length(chd_codes) == 2)
class.vec_GAM <- ifelse(SA_GAM_predict_fit[,1] > mean(chd_codes), chd_codes[2], chd_codes[1])
# Create a table comparing the predicted values vs the actual.
# Both margins are factors with both codes as levels, so the table is always
# 2 x 2, even when the model predicts a single class for every patient.
misclass_table_GAM <- table(Actual=factor(SA_newdata_post_PCA$chd, levels=chd_codes),
                            Classified=factor(class.vec_GAM, levels=chd_codes))
(misclass_table_GAM <- as.matrix(misclass_table_GAM)) # Turn the misclassification table into a matrix
(rate <- (misclass_table_GAM[1,2] + misclass_table_GAM[2,1]) / sum(misclass_table_GAM)) # Calculate the misclassification rate
# Smaller parametric model: only the first principal component plus famhist
SA_class_GAM_reduced <- gam(
  chd ~ PC1 + famhist,
  data = SA_newdata_post_PCA
)
summary(SA_class_GAM_reduced)
# From here on famhist is stored as a factor rather than a number
SA_newdata_post_PCA$famhist <- factor(SA_newdata_post_PCA$famhist)
str(SA_newdata_post_PCA) # Confirm the new column type
# Parametric models with famhist entering as a factor term
# (a) three selected components
SA_class_GAM_reduced_1 <- gam(
  chd ~ PC1 + PC2 + PC4 + factor(famhist),
  data = SA_newdata_post_PCA
)
summary(SA_class_GAM_reduced_1)
# (b) all eight components
SA_class_GAM_reduced_2 <- gam(
  chd ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + factor(famhist),
  data = SA_newdata_post_PCA
)
summary(SA_class_GAM_reduced_2)
# Let's change chd into a factor
SA_newdata_post_PCA$chd <- as.factor(SA_newdata_post_PCA$chd) # chd is now a two-level factor
# A factor response cannot be fitted with gam()'s default gaussian family;
# that is the cause of the error this model used to raise. The binomial family
# accepts a two-level factor (the first level is treated as "failure"), so the
# model is fitted as a logistic GAM here.
SA_class_GAM_reduced_3 <- gam(chd ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + factor(famhist),
                              family = binomial, data=SA_newdata_post_PCA)
summary(SA_class_GAM_reduced_3)
# Let's change chd back into a numeric observation.
# Go through as.character() so the original codes are restored; as.numeric()
# applied directly to a factor would return the level positions (1, 2) instead.
SA_newdata_post_PCA$chd <- as.numeric(as.character(SA_newdata_post_PCA$chd)) # chd is numeric again
# Refit the same model with the numeric response (default gaussian family)
SA_class_GAM_reduced_3 <- gam(chd ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + factor(famhist), data=SA_newdata_post_PCA)
summary(SA_class_GAM_reduced_3)
# Let's try some smooth terms
SA_class_GAM_reduced_4 <- gam(chd ~ s(PC1) + s(PC2) + factor(famhist), data=SA_newdata_post_PCA)
summary(SA_class_GAM_reduced_4)
# Let us reduce the model down now removing non-significant components
SA_class_GAM_reduced_5 <- gam(chd ~ s(PC1) + s(PC4) + factor(famhist), data=SA_newdata_post_PCA)
summary(SA_class_GAM_reduced_5)
# Gather together the predicted fitted values
# NOTE(review): these come from model 4, not the final reduced model 5 -
# confirm which model the misclassification rate is meant to describe.
(SA_GAM_predict_fit_reduced <- as.matrix(SA_class_GAM_reduced_4$fitted.values))
# Convert the fitted values into class labels using the same coding as chd.
# chd may be coded 0/1 or 1/2 at this point (it has been through a factor
# round trip), so the two codes are read from the data and the cut-off is
# their midpoint. A fixed 0.5 cut-off would put every patient in one class
# when chd is coded 1/2.
chd_codes <- sort(unique(SA_newdata_post_PCA$chd))
stopifnot(length(chd_codes) == 2)
class.vec_GAM_reduced <- ifelse(SA_GAM_predict_fit_reduced[,1] > mean(chd_codes), chd_codes[2], chd_codes[1])
# Create a table comparing the predicted values vs the actual.
# Both margins are factors with both codes as levels, so the table is always
# 2 x 2 and the [1,2] / [2,1] look-ups below cannot go out of bounds.
misclass_table_GAM_reduced <- table(Actual=factor(SA_newdata_post_PCA$chd, levels=chd_codes),
                                    Classified=factor(class.vec_GAM_reduced, levels=chd_codes))
(misclass_table_GAM_reduced <- as.matrix(misclass_table_GAM_reduced)) # Turn the misclassification table into a matrix
(rate <- (misclass_table_GAM_reduced[1,2] + misclass_table_GAM_reduced[2,1]) / sum(misclass_table_GAM_reduced)) # Calculate the misclassification rate