# Load the ISLR package, which includes the Carseats dataset: a simulated
# dataset containing sales of child car seats at 400 different stores in the US.
library(ISLR)
# Libraries for Classification and Regression Trees.
library(tree)
library(rpart)

# Attach the Carseats dataset to R's search path, making its columns directly
# accessible by name.
# A more detailed explanation of each variable can be found here:
# https://www.rdocumentation.org/packages/ISLR/versions/1.2/topics/Carseats
attach(Carseats)

# Take a quick look at the data (peek at the data).
# Always do this in order to check that the data was loaded correctly.
head(Carseats)

# The aim of this example is to predict the value of the Sales attribute in the
# Carseats dataset (i.e. our class attribute will be Sales). However, Sales is
# continuous, not nominal, so we have to transform it into a discrete
# (= nominal) variable. Keep in mind that classification WORKS ONLY for
# predicting discrete (nominal) attributes.

# Take a look at the range of values of the Sales attribute.
range(Sales)

# Sales holds the unit sales (in thousands) at each store. It is a continuous
# attribute ranging from 0.00 up to 16.27. We make an assumption and say that if
# Sales >= 8 (thousand, approximately the middle of the range) then sales are
# high, otherwise sales are low.
# Note: we wrap the result in as.factor() because a classification tree
# (rpart with method = "class") expects a factor, not a character vector.
High = as.factor(ifelse(Sales >= 8, "Yes", "No"))

# Now High is a vector containing Yes/No values, one for each record in the
# dataset. For example, High[1] says whether sales are high for observation
# Carseats[1, ].

# We must now attach/concatenate High to the Carseats dataset, increasing its
# dimension by 1.
Carseats = data.frame(Carseats, High)

# The Sales attribute is no longer needed, since we have transformed it into the
# nominal, discrete attribute High. Sales is the first attribute of the Carseats
# dataset, so it can be dropped (i.e. removed from the dataset).
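# Optional sanity check (a sketch, not part of the original lab): before
# modelling, see how balanced the two classes are. table() counts the Yes/No
# values; a severely skewed split would make plain accuracy misleading later on.
table(High)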
# Drop Sales (column 1); the dimension of the dataset is reduced by 1.
Carseats = Carseats[, -1]

# Our class attribute now has nominal values.
# Next, we create the TRAINING and TESTING datasets. To do so, we randomly
# select records from Carseats to build these 2 datasets.

# Initialize the random number generator. Note: if you keep the seed 2, the
# same records will always be selected.
set.seed(2)

# Create the training set. We'll use 50% of the Carseats observations (= 200
# observations) as the training set, which will be used to build the decision
# tree. So we draw 200 random integers from 1 to 400. These integers are
# interpreted as the indices of the rows of the dataset to include in the
# training set. For example, if we get the numbers 34, 8 and 12, then rows 34,
# 8 and 12 of the dataset will be included in the training set.
train = sample(1:nrow(Carseats), nrow(Carseats) / 2)

# The rest will be our testing data. How it works: we negate the integers in
# the training set. So if 34, 8 and 12 are in the training set, -train yields
# -34, -8, -12. Using these negative indices on the dataset means "take all
# rows except rows 34, 8 and 12".
test = -train

# train contains the indices of the records in Carseats that belong to the
# training set. Create the actual dataset.
training_data = Carseats[train, ]

# Create the decision tree from the training data. We want to predict High
# based on all other attributes of the Carseats dataset.
#tree_model = tree(High ~ ., training_data)
tree_model = rpart(High ~ ., data = training_data, method = "class")

# Decision tree built. tree_model now holds the decision tree as created from
# the training set. Visualize it.
plot(tree_model)

# plot() does not show labels on the decision tree; we plot the tree with
# labels to make it easier to understand. Note one difference: there are no
# attribute values on the edges, as shown in the examples.
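# Optional complementary step (an assumption of this sketch: the rpart model
# built above): printing the fitted rpart object lists every split rule in
# text form, e.g. "Price< 86.5", together with node sizes and the predicted
# class per node, which compensates for the missing edge labels in the plot.
print(tree_model)
# summary(tree_model) gives even more detail, including variable importance.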
# The tree displayed by plot() should be interpreted as follows: left branches
# mean "yes, the attribute meets the criterion", right branches mean "no, the
# attribute does not meet the criterion". So for a node labelled, say,
# "Price < 86.5", the left edge means the data meets that criterion (i.e. yes,
# Price is < 86.5), while the right edge means the data does not meet it
# (i.e. Price >= 86.5). This holds for all labels.
text(tree_model, pretty = 0)

# Next, use the testing data to test the decision tree referenced by
# tree_model. Now that we have built/trained our decision tree, apply it to the
# testing dataset. First, select the records from Carseats that will comprise
# our testing dataset. Note: the testing dataset contains the High attribute,
# but we will not remove it. This is because we will need it to calculate
# accuracy and error rate; in addition, prediction ignores this attribute
# anyway.
testing_data = Carseats[test, ]

# Peek at the testing data.
head(testing_data)

# Predict the class attribute (High) for the testing dataset.
tree_predict = predict(tree_model, testing_data, type = "class")

# Prediction done. tree_predict is a one-dimensional data structure (separate
# from the testing dataset) that holds one "Yes"/"No" value for each record in
# the testing set, i.e. the first value in tree_predict corresponds to the
# first record in the testing set.

# Now evaluate how well our testing data was classified by calculating the
# confusion matrix. We use the table() command, which cross-tabulates
# tree_predict against the High attribute of the testing dataset:
testingDataConfusionTable = table(tree_predict, testing_data$High)
print(testingDataConfusionTable)

# OK, let's calculate some quality metrics of the model, in order to see how
# our model performed. We calculate the accuracy and error rate of the
# classifier/model.

# Calculate accuracy.
# NOTE: diag() extracts the diagonal of a matrix.
# In a confusion matrix, the diagonal entries always count the correctly
# classified items; sum() adds them together.
modelAccuracy = sum(diag(testingDataConfusionTable)) / sum(testingDataConfusionTable)

# Calculate the error rate (which is simply 1 - modelAccuracy).
modelErrorRate = 1 - modelAccuracy

# Print the result out nicely. We love nice and clear responses.
# Note: as a rule of thumb, accuracy >= 70% means our model is decent. Lower
# accuracy might be due to an improperly selected training set, so you can
# re-run the entire sequence (with a different seed) until you reach the
# desired accuracy. You can automate this by putting all the above steps in a
# loop and keeping the best model.
sprintf("Model accuracy: %f, model error rate: %f", modelAccuracy, modelErrorRate)
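# Optional extension (a sketch, not part of the original script): accuracy
# weighs both error types equally, but the same confusion table also yields
# per-class metrics. Its rows are the predicted labels and its columns the
# actual ones (as produced by table(tree_predict, testing_data$High)), so the
# cells can be indexed by name.
tp = testingDataConfusionTable["Yes", "Yes"]  # predicted Yes, actually Yes
fp = testingDataConfusionTable["Yes", "No"]   # predicted Yes, actually No
fn = testingDataConfusionTable["No", "Yes"]   # predicted No, actually Yes
precisionYes = tp / (tp + fp)  # of the predicted "Yes", how many were right
recallYes = tp / (tp + fn)     # of the actual "Yes", how many we found
sprintf("Precision (Yes): %f, recall (Yes): %f", precisionYes, recallYes)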