# Classification tree on the ISLR Carseats data: label each store as a
# high or low seller, fit a tree on half of the rows, evaluate it on the other
# half, then prune the tree using cross-validation and evaluate it again.

# Load the required libraries. Packages are installed only when missing so
# that re-running the script does not reinstall them every time.
for (pkg in c("ISLR", "tree")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ISLR)
library(tree)

# Columns are referenced explicitly as Carseats$<name> rather than via
# attach(): an attached copy does not follow later changes to the data frame
# and silently shadows or is shadowed by global variables.
head(Carseats, n = 10)
dim(Carseats)
range(Carseats$Sales)

# Create a categorical response: "Yes" when Sales is at least 8, else "No".
# It must be a factor: tree() fits a classification tree only for a factor
# response, and data.frame() no longer converts strings to factors by default
# (R >= 4.0).
High <- factor(ifelse(Carseats$Sales >= 8, "Yes", "No"),
               levels = c("No", "Yes"))

# Append the "High" column to the Carseats dataset
Carseats <- data.frame(Carseats, High)
dim(Carseats)

# Remove the Sales column (by name, not position) so the tree cannot use the
# variable that High was derived from.
Carseats <- Carseats[, names(Carseats) != "Sales", drop = FALSE]
dim(Carseats)

# Split the dataset into training and testing halves (seeded for
# reproducibility).
set.seed(2)
train <- sample(seq_len(nrow(Carseats)), nrow(Carseats) %/% 2)
test <- -train  # negative indices select every row not in `train`
training_data <- Carseats[train, ]
testing_data <- Carseats[test, ]

# Actual labels of the test rows, used to score the predictions. Taken from
# the test data frame itself so labels and predictors always stay aligned.
testing_High <- testing_data$High

# Fit the tree model (full model) using the training data
tree_model <- tree(High ~ ., data = training_data)
plot(tree_model)
text(tree_model, pretty = 0)

# Evaluate the full tree on the testing data. type = "class" makes predict()
# return the predicted class label instead of class probabilities.
tree_pred <- predict(tree_model, testing_data, type = "class")

# Misclassification error: the fraction of test rows predicted incorrectly.
# (The original author reported 0.295 here; the exact figure depends on the
# R version's random number generator and on the seed.)
full_error <- mean(tree_pred != testing_High)
print(full_error)

# Prune the tree to try to reduce the misclassification error. Cross-validation
# tells us at which tree size to stop pruning.
set.seed(3)
cv_tree <- cv.tree(tree_model, FUN = prune.misclass)
names(cv_tree)

# Plot tree size versus cross-validated deviance. With FUN = prune.misclass
# the deviance is the number of cross-validation misclassifications.
plot(cv_tree$size, cv_tree$dev, type = "b")

# Choose the tree size from the cross-validation result instead of reading a
# fixed number off the plot (the original author observed a minimum at size 9).
# Among sizes tied for the lowest error the smallest tree is preferred; the
# root-only tree (size 1) is excluded because it is not a usable classifier.
# If only size 1 is available, fall back to the unpruned size.
candidate <- cv_tree$size > 1
if (any(candidate)) {
  cand_size <- cv_tree$size[candidate]
  cand_dev <- cv_tree$dev[candidate]
  best_size <- min(cand_size[cand_dev == min(cand_dev)])
} else {
  best_size <- max(cv_tree$size)
}
print(best_size)

pruned_model <- prune.misclass(tree_model, best = best_size)
plot(pruned_model)
text(pruned_model, pretty = 0)

# Check how the pruned model performs on the same testing data
tree_pred <- predict(pruned_model, testing_data, type = "class")

# Misclassification error of the pruned tree. (The original author reported
# 0.29, a small improvement over the full tree; compare with full_error.)
pruned_error <- mean(tree_pred != testing_High)
print(pruned_error)