
"""
  Executes PCA on the Wisconsin breast cancer dataset [1]. sklearn's module for
  PCA is used.

  5 eigenvectors are calculated using PCA. Dimension of data is reduced from 30 to
  5. Data points are transformed into the new space and the first two components
  are plotted, showing that the data is visually separable.

  Data has 30 dimensions related to characteristics of growths (e.g. perimeter,
  smoothness, area, concavity, etc.). It also contains the diagnosis: M (Malignant)
  or B (Benign).
  
  References:
     1) https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic.
     
  

  v0.2/mmt/Sept 2025

"""


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import pandas as pd

import matplotlib.pyplot as plt



# Name of the CSV file holding the raw dataset (read relative to the working directory).
DATA_FILE = 'breastCancer-wdbc.data'

# The 10 base measurements recorded for every growth.
_BASE_FEATURES = (
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal dimension',
)

# Column names for the data frame: each base measurement appears three times
# ("mean <name>", "<name> error", "worst <name>"), followed by the diagnosis
# column 'label' in last position.
header = (
    ['mean ' + name for name in _BASE_FEATURES]
    + [name + ' error' for name in _BASE_FEATURES]
    + ['worst ' + name for name in _BASE_FEATURES]
    + ['label']
)





# Load the raw comma-separated file. It has no header row, so pandas numbers
# the columns 0, 1, 2, ...
breastCancerData = pd.read_csv(DATA_FILE, header=None)


#############################################################################
#
# Preprocessing
#
#############################################################################

# Raw column layout: column 0 is the case ID, column 1 is the diagnosis and
# the remaining columns are the measurements.
# Drop the case ID and move the diagnosis behind the measurements, so that the
# frame lines up with `header` (features first, label last). Having the label
# in the last column makes later slicing easier.
diagnosisColumn = breastCancerData.columns[1]
measurementColumns = list(breastCancerData.columns[2:])
breastCancerData = breastCancerData[measurementColumns + [diagnosisColumn]]

# Attach the descriptive column names
breastCancerData.columns = header

# Spell out the single-letter diagnosis codes to make the label more readable
breastCancerData['label'] = breastCancerData['label'].replace(
    {'B': 'Benign', 'M': 'Malignant'}
)

# PCA searches for the directions of largest variance, so the magnitude of
# the values influences the result: features measured on a larger numeric
# scale would dominate. To give every feature an equal influence, all of them
# are brought onto the same scale first. This step is called normalization.
# Several schemes exist (min-max, z-score, ...). Here sklearn's StandardScaler
# is used, which performs a z-score normalization:
# https://en.wikipedia.org/wiki/Standard_score

# Only the numeric feature columns are scaled. The last entry of `header` is
# the non-numeric 'label' column and is left out.
featureNames = header[:-1]
bcValues = breastCancerData[featureNames].to_numpy()
scaler = StandardScaler()
bcValuesStandardized = scaler.fit_transform(bcValues)
# Wrap the standardized array in a data frame; every numeric column is now on
# the same *scale*, i.e. normalized.
breastCancerDataNormalized = pd.DataFrame(bcValuesStandardized)



#############################################################################
#
# Execute PCA
#
#############################################################################

# Calculate the first 5 most important eigenvectors (=components)
pcaResult = PCA(n_components=5)

# The call to .fit_transform() not only calculates the eigenvalues and
# eigenvectors but also maps the standardized breast cancer data onto the
# space spanned by the 5 retained eigenvectors.
transformedBreastData = pcaResult.fit_transform(breastCancerDataNormalized)


# Displaying eigenvectors
for i, v in enumerate(pcaResult.components_, start=1):
    print('\tEigenvector', i, ':', v)

# The y axis is labelled in percent, so plot each component's share of the
# total variance (explained_variance_ratio_, scaled to 0-100) rather than the
# raw eigenvalues stored in explained_variance_.
variancePercent = pcaResult.explained_variance_ratio_ * 100
barLabels = ['Eigenvector ' + str(i) for i in range(1, len(variancePercent) + 1)]
plt.bar(barLabels, variancePercent, color=['#97b6e8'])
plt.xlabel('Components (Eigenvectors)')
plt.ylabel('Variance explained (%)')

plt.show()

# Cast the first two components of the new dataset into a data frame to handle
# it easier. The column names are used verbatim as lookup keys by the plotting
# code further down, so they must stay unchanged.
transformedBreastDataDF = pd.DataFrame(
    data=transformedBreastData[:, :2],
    columns=['Principal component 1 (Eigenvector 1)', 'Principal component 2 ((Eigenvector 2)'],
)



#############################################################################
#
# Display data on the new dimensions
#
#############################################################################

# Column keys of the projected data frame (first and second principal component)
pc1Column = 'Principal component 1 (Eigenvector 1)'
pc2Column = 'Principal component 2 ((Eigenvector 2)'

# Set up a square figure with axis labels and title
plt.figure(figsize=(7, 7))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1', fontsize=10)
plt.ylabel('Principal Component - 2', fontsize=10)
plt.title("Principal Component Analysis of Breast Cancer Dataset", fontsize=10)

# One scatter series per diagnosis: benign in green, malignant in red
targets = ['Benign', 'Malignant']
colors = ['g', 'r']
diagnoses = breastCancerData['label']
for diagnosis, markerColor in zip(targets, colors):
    # Boolean mask selecting the rows that carry this diagnosis
    hasDiagnosis = diagnoses == diagnosis
    classPoints = transformedBreastDataDF.loc[hasDiagnosis]
    plt.scatter(classPoints[pc1Column], classPoints[pc2Column], c=markerColor, s=50)

# Legend entries follow the order in which the series were drawn
plt.legend(targets, prop={'size': 15})
plt.show()


