**R Code for Principal Component Analysis (PCA) and Factor Analysis (FA)**

**Data Set:- Bank Marketing**https://www.blogger.com/blogger.g?blogID=4359514443959080595#editor/target=post;postID=5426750358330739357

**Source of Data Set:- UCI Repository (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing)**

**The Code includes the following:-**

**1. Data Exploration - Missing Values, Outliers**

**2. Data Visualisation**

**3. Correlation Matrix**

**4. Package "FactoMiner"**

**5. Topics Covered:- PCA, Correspondence analysis(CA), Dual Multiple Factor Analysis (DMFA), Factor Analysis for mixed data(FAMD), Generalised Procrustes Analysis (GPA), Hierarchical Tree**

**6. Detailed Plots**

**## The data has been imported using Import Dataset option in R Environment**

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

##

##

## Since it is a large dataset, graphical display of missing values will prove to be easier

##

## No red colour stripes are visible. hence no missing values.

##

##

## Use Box plots (Only for continuous variables)- To Check Ouliers

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not bee removed keeping their significance into consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details abpout Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

## Balance is more skewed towards to Negative or Zero

## Last Duration is more skewed towards 0 to 100 secs.

## NUmContacts are more skewed towards 1

## Many were not contacted previously

## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis

##

## It can be observed that No two variables are highly correlated

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

**DATASET UNDERSTANDING***head(bank_full) ## Displays first 6 rows for each variable*

str(bank_full) ## Describes each variables

summary(bank_full) ## Provides basic statistical information of each variablestr(bank_full) ## Describes each variables

summary(bank_full) ## Provides basic statistical information of each variable

##

**DATA EXPLORATION - Check for Missing Data**##

__Option 1__*is.na(bank_full)*## Displays True for a missing value## Since it is a large dataset, graphical display of missing values will prove to be easier

##

__Option 2__*require(Amelia)*

missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)

## No red colour stripes are visible. hence no missing values.

##

__Option 3__*summary(bank_full)*## displays missing values if any under every variable##

**DATA VISUALISATION**## Use Box plots (Only for continuous variables)- To Check Ouliers

*boxplot(bank_full$age~bank_full$subscribed, main=" AGE",ylab="age of customers",xlab="Subscribed")*

boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")

boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")

boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")

boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")

boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")

boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")

boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")

boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")

boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")

boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")

boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not bee removed keeping their significance into consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details abpout Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

*bank_Conthist<-function(VarName,NumBreaks,xlab,main,lengthxfit) ## xlab and main should be mentioned under quotes as they are characters*

{

hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)

xfit<-seq(min(VarName),max(VarName),length=lengthxfit)

yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))

yfit<-yfit*diff(h$mids[1:2])*length(VarName)

lines(xfit,yfit,col="red",lwd=3)

}

bank_Conthist(bank_full$age,10,"age of customers","AGE",30)

bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100){

hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)

xfit<-seq(min(VarName),max(VarName),length=lengthxfit)

yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))

yfit<-yfit*diff(h$mids[1:2])*length(VarName)

lines(xfit,yfit,col="red",lwd=3)

}

bank_Conthist(bank_full$age,10,"age of customers","AGE",30)

bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100)

## Balance is more skewed towards to Negative or Zero

*bank_Conthist(bank_full$lastday,5,"Last Day of contact","LAst Day",10)*

bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)

## Last Duration is more skewed towards 0 to 100 secs.

*bank_Conthist(bank_full$numcontacts,30,"Number of Contacts","NUmContacts",20)*## NUmContacts are more skewed towards 1

*bank_Conthist(bank_full$pdays,30,"Previous Days of contacts","PDays",20)*## Many were not contacted previously

*bank_Conthist(bank_full$pcontacts,20,"Previous Contacts","PContacts",10)*## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables

*barplot(table(bank_full$job),col="red",main="JOB")*

barplot(table(bank_full$marital),col="green",main="Marital")

barplot(table(bank_full$education),col="red",main="Education")

barplot(table(bank_full$creditdefault),col="red",main="Credit Default")barplot(table(bank_full$marital),col="green",main="Marital")

barplot(table(bank_full$education),col="red",main="Education")

barplot(table(bank_full$creditdefault),col="red",main="Credit Default")

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis

*bank_full[5]<-NULL*

str(bank_full)

barplot(table(bank_full$housingloan),col="red",main="Housing Loan")

barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")

barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")

barplot(table(bank_full$lastmonth),col="violet",main="Last Month")

barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")str(bank_full)

barplot(table(bank_full$housingloan),col="red",main="Housing Loan")

barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")

barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")

barplot(table(bank_full$lastmonth),col="violet",main="Last Month")

barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")

##

**Correlation Matrix**among input (or independent) continuous variables*bank_full.cont<-data.frame(bank_full$age,bank_full$balance,bank_full$lastday,bank_full$lastduration,bank_full$numcontacts,bank_full$pdays,bank_full$pcontacts)*

str(bank_full.cont)

cor(bank_full.cont)str(bank_full.cont)

cor(bank_full.cont)

## It can be observed that No two variables are highly correlated

**## First, Lets convert factors having character levels to numeric levels**

## Now convert these numerical factors into numeric

##In bank_full_transform; the subscribed table is not quantitative

## Now all the variables are either intergers or numeric except SUBSCRIBED (which is factor)

*str(bank_full)*

bank_full_transform<-bank_full

bank_full_transform$marital=factor(bank_full_transform$marital,levels=c("single","married","divorced"),labels=c(1,2,3))

bank_full_transform$job=factor(bank_full_transform$job,levels=c("admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

bank_full_transform$education=factor(bank_full_transform$education,levels=c("primary","secondary","tertiary","unknown"),labels=c(1,2,3,4))

bank_full_transform$housingloan=factor(bank_full_transform$housingloan,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$personalloan=factor(bank_full_transform$personalloan,levels=c("no","yes"),labels=c(1,2))bank_full_transform<-bank_full

bank_full_transform$marital=factor(bank_full_transform$marital,levels=c("single","married","divorced"),labels=c(1,2,3))

bank_full_transform$job=factor(bank_full_transform$job,levels=c("admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

bank_full_transform$education=factor(bank_full_transform$education,levels=c("primary","secondary","tertiary","unknown"),labels=c(1,2,3,4))

bank_full_transform$housingloan=factor(bank_full_transform$housingloan,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$personalloan=factor(bank_full_transform$personalloan,levels=c("no","yes"),labels=c(1,2))

*bank_full_transform$lastcommtype=factor(bank_full_transform$lastcommtype,levels=c("cellular","telephone","unknown"),labels=c(1,2,3))*

bank_full_transform$poutcome=factor(bank_full_transform$poutcome,levels=c("failure","other","success","unknown"),labels=c(1,2,3,4))

bank_full_transform$lastmonth=factor(bank_full_transform$lastmonth,levels=c("apr","aug","dec","feb","jan","jul","jun","mar","may","nov","oct","sep"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

str(bank_full_transform)bank_full_transform$poutcome=factor(bank_full_transform$poutcome,levels=c("failure","other","success","unknown"),labels=c(1,2,3,4))

bank_full_transform$lastmonth=factor(bank_full_transform$lastmonth,levels=c("apr","aug","dec","feb","jan","jul","jun","mar","may","nov","oct","sep"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

str(bank_full_transform)

## Now convert these numerical factors into numeric

*bank_full_transform$job<-as.numeric(as.character(bank_full_transform$job))*

bank_full_transform$marital<-as.numeric(as.character(bank_full_transform$marital))

bank_full_transform$education<-as.numeric(as.character(bank_full_transform$education))

bank_full_transform$personalloan<-as.numeric(as.character(bank_full_transform$personalloan))

bank_full_transform$housingloan<-as.numeric(as.character(bank_full_transform$housingloan))

bank_full_transform$lastcommtype<-as.numeric(as.character(bank_full_transform$lastcommtype))

bank_full_transform$lastmonth<-as.numeric(as.character(bank_full_transform$lastmonth))

bank_full_transform$poutcome<-as.numeric(as.character(bank_full_transform$poutcome))

bank_full_transform$age<-as.numeric(as.character(bank_full_transform$age))

bank_full_transform$balance<-as.numeric(as.character(bank_full_transform$balance))

bank_full_transform$lastday<-as.numeric(as.character(bank_full_transform$lastday))bank_full_transform$marital<-as.numeric(as.character(bank_full_transform$marital))

bank_full_transform$education<-as.numeric(as.character(bank_full_transform$education))

bank_full_transform$personalloan<-as.numeric(as.character(bank_full_transform$personalloan))

bank_full_transform$housingloan<-as.numeric(as.character(bank_full_transform$housingloan))

bank_full_transform$lastcommtype<-as.numeric(as.character(bank_full_transform$lastcommtype))

bank_full_transform$lastmonth<-as.numeric(as.character(bank_full_transform$lastmonth))

bank_full_transform$poutcome<-as.numeric(as.character(bank_full_transform$poutcome))

bank_full_transform$age<-as.numeric(as.character(bank_full_transform$age))

bank_full_transform$balance<-as.numeric(as.character(bank_full_transform$balance))

bank_full_transform$lastday<-as.numeric(as.character(bank_full_transform$lastday))

*bank_full_transform$lastduration<-as.numeric(as.character(bank_full_transform$lastduration))*

bank_full_transform$numcontacts<-as.numeric(as.character(bank_full_transform$numcontacts))

bank_full_transform$pdays<-as.numeric(as.character(bank_full_transform$pdays))

bank_full_transform$pcontacts<-as.numeric(as.character(bank_full_transform$pcontacts))

str(bank_full_transform)bank_full_transform$numcontacts<-as.numeric(as.character(bank_full_transform$numcontacts))

bank_full_transform$pdays<-as.numeric(as.character(bank_full_transform$pdays))

bank_full_transform$pcontacts<-as.numeric(as.character(bank_full_transform$pcontacts))

str(bank_full_transform)

##In bank_full_transform; the subscribed table is not quantitative

*bank_full_transformfull<-bank_full_transform*

bank_full_transformfull$subscribed<-factor(bank_full_transformfull$subscribed,levels=c("no","yes"),labels=c(1,2))

bank_full_transformfull$subscribed<-as.numeric(as.character(bank_full_transformfull$subscribed))

all(is.na(bank_full_transformfull))

write.table(bank_full_transformfull,"x1.csv",sep=",")

str(bank_full_transformfull)bank_full_transformfull$subscribed<-factor(bank_full_transformfull$subscribed,levels=c("no","yes"),labels=c(1,2))

bank_full_transformfull$subscribed<-as.numeric(as.character(bank_full_transformfull$subscribed))

all(is.na(bank_full_transformfull))

write.table(bank_full_transformfull,"x1.csv",sep=",")

str(bank_full_transformfull)

## Now all the variables are either intergers or numeric except SUBSCRIBED (which is factor)

**## Multivariate Exploratory Data Analysis using FACTOMINER Package**

## Two-Way Analysis of Variance with the contrasts sum (The sum of coefficients is Zero)

## T test and F test

## Two-Way Anova with interaction with Age

##RV Coefficient:- Squared Pearson Correlation Coefficient. It measures the closeness of 2 sets of points in a matrix.

## Correlation of One Categorical or Continuous variable with other Categorical or quantitative variables

## Lets check with respect to Age variable (whose num.var=1, since it is first variable column)

condes(bank_full[1:14],num.var=1,proba=0.05)

*install.packages("FactoMineR")*

library(FactoMineR)library(FactoMineR)

## Two-Way Analysis of Variance with the contrasts sum (The sum of coefficients is Zero)

## T test and F test

*AovSum(subscribed~.,data=bank_full)*## Two-Way Anova with interaction with Age

*AovSum(subscribed~.:age,data=bank_full)*##RV Coefficient:- Squared Pearson Correlation Coefficient. It measures the closeness of 2 sets of points in a matrix.

*X<-bank_full[,1:8]*

Y<-bank_full[,9:14]Y<-bank_full[,9:14]

*coeffRV(X,Y)*## Correlation of One Categorical or Continuous variable with other Categorical or quantitative variables

## Lets check with respect to Age variable (whose num.var=1, since it is first variable column)

condes(bank_full[1:14],num.var=1,proba=0.05)

**## 1. Correspondence Analysis(CA) including supplementary row and (or)column points**

## Applicable only on Categorical Data in Numeric or Integer format

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

## Visualise the active elements

##2. PRINCIPAL COMPONENT ANALYSIS (PCA)

## Initially estimate the number of components in PCA

## Using the above estmate, use the value as ncp. Here we get ncp=1

## PCA analysis for the above data

##bar plot with the eigen values

##Print the results of PCA

## Lets observe the PCA interpretation using third and fourth dimension as your axes

## Construction of Confidence Ellipses around the barycentres of all categorical variables

##habillage=15 indicates that the individuals are coloured according to 15th variable

## Dimension Description : It points out the variables and categories which are more

## Graph of variable after factor analysis

graph.var(bank_pca,axes=c(1,2),xlim="Comp-1",ylim="Comp-2",title="Graph of Variables after PCA")

## Hierarchical clustering on Principal Components

HCPC(bank_pca,nb.clust=-1)

## 3. Dual Multiple Factor Analysis (DMFA) with supplementary individuals, supplementary quantitative variables and supplementary categorical variables

## Construction of Confidence Ellipses around the barycentres of all categorical variables

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

## 4. FACTOR ANALYSIS FOR MIXED DATA (Continuous and Catgeorical Data)

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

## 5. Generalised PROCRUSTES ANALYSIS(GPA)

## It take sinto account all the missing values and Applicable only for quantitative (continuous) data sets

## Construction of Partial Points only for some indivuals

plotGPApartial(bank_gpa)

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

dimdesc(bank_famd,axes=1:3,proba=0.05)

## 6. HIERARCHICAL TREE FROM A DATA SET

## Usefull when all the variables play a significant role

## initially the data is partitioned using k means clustering (here kk=10 clusters made)

##If nb.clust= 0 (the tree is cut at the level the user clicks on); -1 (the tree is automatically cut at the suggested level)

HCPC(bank_full_transform[,1:15],kk=10,nb.clust=-1)

## Applicable only on Categorical Data in Numeric or Integer format

*bank_CA<-CA(bank_full_transform,ncp=5,col.sup=10:13,row.sup=1:100,quanti.sup=1:4,quali.sup=15,graph=TRUE)*

summary(bank_CA)summary(bank_CA)

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

*dimdesc(bank_CA,axes=1:3,proba=0.05)*## Visualise the active elements

*plot(bank_CA,invisible=c("row.sup","col.sup"))*##2. PRINCIPAL COMPONENT ANALYSIS (PCA)

## Initially estimate the number of components in PCA

*estim_ncp(bank_full_transform[,1:15],ncp.min=0,ncp.max=NULL,scale=TRUE, method="GCV")*## Using the above estmate, use the value as ncp. Here we get ncp=1

## PCA analysis for the above data

*bank_pca<-PCA(bank_full_transformfull,ncp=5,quanti.sup=NULL,quali.sup=15)*

summary(bank_pca)summary(bank_pca)

##bar plot with the eigen values

*barplot(bank_pca$eig[,1],main="Eigen Values",names.arg=paste("Dim",1:nrow(bank_pca$eig),sep=""))*##Print the results of PCA

*print(bank_pca)*

bank_pca$var$coordbank_pca$var$coord

## Lets observe the PCA interpretation using third and fourth dimension as your axes

*plot(bank_pca,choix="var",axes=c(3,4),lim.cos2.var=0)*## Construction of Confidence Ellipses around the barycentres of all categorical variables

*a<-cbind.data.frame(bank_full_transform[,16],bank_pca$ind$coord)*

b<-coord.ellipse(a,bary=TRUE)b<-coord.ellipse(a,bary=TRUE)

##habillage=15 indicates that the individuals are coloured according to 15th variable

*plot(bank_pca,habillage=15,ellipse=b)*

plotellipses(bank_pca)plotellipses(bank_pca)

## Dimension Description : It points out the variables and categories which are more

*characteristic according to each dimensionobtained by factor analysis*

dimdesc(bank_pca,axes=1:3,proba=0.05)dimdesc(bank_pca,axes=1:3,proba=0.05)

## Graph of variable after factor analysis

graph.var(bank_pca,axes=c(1,2),xlim="Comp-1",ylim="Comp-2",title="Graph of Variables after PCA")

## Hierarchical clustering on Principal Components

HCPC(bank_pca,nb.clust=-1)

## 3. Dual Multiple Factor Analysis (DMFA) with supplementary individuals, supplementary quantitative variables and supplementary categorical variables

*bank_dmfa<-DMFA(bank_full_transform,num.fact=16,scale.unit=TRUE,ncp=3,quanti.sup=NULL,quali.sup=16,graph=TRUE,axes=c(1,2))*

summary(bank_dmfa)summary(bank_dmfa)

## Construction of Confidence Ellipses around the barycentres of all categorical variables

*a<-cbind.data.frame(bank_full_transform[,16],bank_dmfa$ind$coord)*

b<-coord.ellipse(a,bary=TRUE)

plot.PCA(bank_dmfa,habillage=16,ellipse=b)

plotellipses(bank_dmfa)b<-coord.ellipse(a,bary=TRUE)

plot.PCA(bank_dmfa,habillage=16,ellipse=b)

plotellipses(bank_dmfa)

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

*dimdesc(bank_dmfa,axes=1:3,proba=0.05)*## 4. FACTOR ANALYSIS FOR MIXED DATA (Continuous and Catgeorical Data)

*bank_famd<-FAMD(bank_full_transform[,1:15],ncp=5,graph=TRUE,axes=c(1,2))*

summary(bak_famd)summary(bak_famd)

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

*dimdesc(bank_famd,axes=1:3,proba=0.05)*## 5. Generalised PROCRUSTES ANALYSIS(GPA)

## It take sinto account all the missing values and Applicable only for quantitative (continuous) data sets

*bank_full_transform_cont<-data.frame(bank_full_transform$age,bank_full_transform$balance,bank_full_transform$lastday,bank_full_transform$lastduration,bank_full_transform$numcontacts,bank_full_transform$pdays,bank_full_transform$pcontacts)*

bank_gpa<-GPA(bank_full_transform_cont,group=c(1,2,3))bank_gpa<-GPA(bank_f

## Construction of Partial Points only for some indivuals

plotGPApartial(bank_gpa)

## Dimension Description : It points out the variables and categories which are more characteristic according to each dimensionobtained by factor analysis

dimdesc(bank_famd,axes=1:3,proba=0.05)

## 6. HIERARCHICAL TREE FROM A DATA SET

## Usefull when all the variables play a significant role

## initially the data is partitioned using k means clustering (here kk=10 clusters made)

##If nb.clust= 0 (the tree is cut at the level the user clicks on); -1 (the tree is automatically cut at the suggested level)

HCPC(bank_full_transform[,1:15],kk=10,nb.clust=-1)

## No comments:

## Post a Comment