
############################################################################################
#
#  Applies Principal Components Analysis (PCA) on the breast cancer dataset. Calculates principal
#  components using the covariance matrix. 
#  Breast cancer dataset can be downloaded from the UCI machine learning dataset repository [1].
#
#  References:
#
#  * [1] Breast Cancer Wisconsin (Diagnostic), https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic
#  
#
#
#  NOTE: this file has many comments for educational purposes.
# 
#
#  v0.9/mmt/Oct 2025
#
############################################################################################


# Close any graphic device left open (e.g. by a previous run of this script)
graphics.off()

# Cleanup the environment: remove every object that ls() lists for the current
# environment. ls() skips objects whose names start with a dot unless
# all.names = TRUE is passed, so such hidden objects are kept here.
rm(list = ls())

# Clear console by emitting a form feed character ("\014" is the octal escape for it).
# NOTE(review): this is known to clear the RStudio console; other consoles may
# ignore the character - confirm for your environment.
cat("\014")




# Name of the file holding the dataset. The path is relative, hence it is resolved
# against the current working directory.
DATA_FILE <- 'breastCancer-wdbc.data'


###################################################################################################
#  Reading the breast cancer dataset
###################################################################################################

# The data file has no header row, so header is set to FALSE and read.csv() assigns
# default column names (V1, V2, ...). FALSE is spelled out because the short form F
# is an ordinary variable that can be reassigned, whereas FALSE is a reserved word.
breastCancerData <- read.csv(DATA_FILE, header = FALSE)

###################################################################################################
#  Preprocessing breast cancer data
###################################################################################################

# Filter and rearrange the columns: the first column is the case ID while the second
# column is the label. The case ID is dropped and the label column is moved to the
# end of the data frame to make handling of the data frame easier.
breastCancerData <- cbind(breastCancerData[, c(-1, -2)], breastCancerData[, 2])

# The csv file did not have any header. Add it: 30 feature names followed by the label.
colnames(breastCancerData) <- c('mean radius', 'mean texture', 'mean perimeter', 'mean area',
                                'mean smoothness', 'mean compactness', 'mean concavity',
                                'mean concave points', 'mean symmetry', 'mean fractal dimension',
                                'radius error', 'texture error', 'perimeter error', 'area error',
                                'smoothness error', 'compactness error', 'concavity error',
                                'concave points error', 'symmetry error', 'fractal dimension error',
                                'worst radius', 'worst texture', 'worst perimeter', 'worst area',
                                'worst smoothness', 'worst compactness', 'worst concavity',
                                'worst concave points', 'worst symmetry', 'worst fractal dimension', 'label')

# Replace the one-letter label codes with readable names: 'B' becomes 'Benign' and
# 'M' becomes 'Malignant'. Any other code is mapped to the empty string.
breastCancerData[, 'label'] <- ifelse(
  breastCancerData[, 'label'] == 'B',
  'Benign',
  ifelse(breastCancerData[, 'label'] == 'M', 'Malignant', '')
)

# Standardize every feature column (subtract the column mean, divide by the column
# standard deviation) so that all features are on a comparable scale.
# The feature columns are selected by name (everything except 'label') instead of a
# hard-coded column position, and each column is scaled on its own with lapply()
# so the data frame never has to be coerced into a matrix.
# scale() returns a one-column matrix; as.numeric() turns it back into a plain vector.
featureColumns <- setdiff(colnames(breastCancerData), 'label')
breastCancerData[featureColumns] <- lapply(
  breastCancerData[featureColumns],
  function(featureValues) as.numeric(scale(featureValues))
)



###################################################################################################
#  Execute PCA
###################################################################################################

# Executing PCA using the covariance matrix.
# R's princomp() function is one way of executing PCA (there are also other functions
# available e.g. prcomp() ). We use princomp() because it can be configured to use the
# Covariance matrix to calculate Eigenvalues and Eigenvectors.
# Please note that princomp() also supports performing PCA by using a Correlation matrix.
# This all depends on the parameters that you provide.
#
# First parameter is the breast cancer data without the label column (only the numeric
# features take part in PCA). Since the covariance matrix will be used, parameter cor
# is set to FALSE. Set to TRUE, princomp() will calculate PCA using the correlation
# matrix. For the parameters, see princomp()'s documentation
# https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/princomp
#
# You can also provide the covariance matrix yourself by setting the covmat parameter if desired.
# Parameter scores = TRUE tells princomp() to transform each original observation to the new
# system defined by the principal components. Remember that principal components define a new
# coordinate system and the original data MUST be mapped properly onto this new coordinate system.
# The parameter name is written in full (scores, not score) so the call does not depend on
# R's partial matching of argument names.
#
# The label column is excluded by name rather than by a hard-coded column position.
featureData <- breastCancerData[, colnames(breastCancerData) != 'label']
principalComponents <- princomp(featureData, cor = FALSE, scores = TRUE)



# The result of the princomp() function is a new object -stored in the variable
# principalComponents- that holds the information of the Principal Component
# Analysis. The object has named components that you can access. The function's
# documentation lists them:
# https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/princomp (See Value section)
# But first, let's print the attributes of the resulting object to see what is available.
print(attributes(principalComponents))



# One of the components is loadings, which holds the calculated Eigenvectors (or Principal components).
# Here, the Eigenvectors will be displayed (NOTE: there will be 30 Eigenvectors, one per feature)
print(principalComponents$loadings)


# The listing printed above helps us find out what else can be inspected:
# components such as sdev, scores etc. You can display their values using the $
# operator.
# For example, here the scores are displayed. scores contain the new coordinates
# of the original data when projected onto the new space defined by the principal components. Keep in mind
# that Principal Components define a new coordinate system (i.e. new variables) onto
# which the original breast cancer data will be projected. This new projection is in the scores.
# NOTE: this prints one row per observation, so the output is long.
print(principalComponents$scores)

# Turn the scores matrix into a data frame; the projected points are easier to
# handle that way.
df <- as.data.frame(principalComponents$scores)

# Attach a plotting color to every point: green for observations labelled
# 'Benign', red for all the others.
df[['color']] <- ifelse(breastCancerData[['label']] == 'Benign', 'green', 'red')


# Plotting the PCA object itself shows the variance of each principal component in
# decreasing order, which helps to identify the most important ones. There are 30
# principal components in total, but the plot shows only the first 10.
plot(principalComponents)


#############################################################################
#
# Display data on the new dimensions
#
#############################################################################

# Scatter plot of the observations in the plane of the first two principal
# components (first score column on the x axis, second on the y axis). Each point
# is drawn as a filled circle in the color stored for it in df, which makes
# patterns easier to see.
plot(
  x = df[[1]],
  y = df[[2]],
  col = df[['color']],
  pch = 16,
  xlab = 'Principal component - 1',
  ylab = 'Principal component - 2'
)
title(main = "Principal Component Analysis of Breast Cancer Dataset")











