### R Code - Bank Subscription Marketing - Classification {NEURAL NETWORKS}

**R Code for NEURAL NETWORK**

**Data Set:- Bank Marketing**https://www.blogger.com/blogger.g?blogID=4359514443959080595#editor/target=post;postID=5426750358330739357

**Source of Data Set:- UCI Repository (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing)**

**The Code includes the following:-**

**1. Data Exploration - Missing Values, Outliers**

**2. Data Visualisation**

**3. Correlation Matrix**

**4. Data Partitioning**

**5. Packages used "NNET" and "NEURAL NET"**

**6. Confusion Matrix with ROC**

**## The data has been imported using Import Dataset option in R Environment**

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

##

##

## Since it is a large dataset, graphical display of missing values will prove to be easier

##

## No red colour stripes are visible. hence no missing values.

##

##

## Use Box plots (Only for continuous variables)- To Check Ouliers

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not bee removed keeping their significance into consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details abpout Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

## Balance is more skewed towards to Negative or Zero

## Last Duration is more skewed towards 0 to 100 secs.

## NUmContacts are more skewed towards 1

## Many were not contacted previously

## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis

##

## It can be observed that No two variables are highly correlated

##

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

**DATASET UNDERSTANDING***head(bank_full) ## Displays first 6 rows for each variable*

str(bank_full) ## Describes each variables

summary(bank_full) ## Provides basic statistical information of each variablestr(bank_full) ## Describes each variables

summary(bank_full) ## Provides basic statistical information of each variable

##

**DATA EXPLORATION - Check for Missing Data**##

__Option 1__*is.na(bank_full)*## Displays True for a missing value## Since it is a large dataset, graphical display of missing values will prove to be easier

##

__Option 2__*require(Amelia)*

missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)

## No red colour stripes are visible. hence no missing values.

##

__Option 3__*summary(bank_full)*## displays missing values if any under every variable##

**DATA VISUALISATION**## Use Box plots (Only for continuous variables)- To Check Ouliers

*boxplot(bank_full$age~bank_full$subscribed, main=" AGE",ylab="age of customers",xlab="Subscribed")*

boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")

boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")

boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")

boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")

boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")

boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")

boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")

boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")

boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")

boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")

boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not bee removed keeping their significance into consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details abpout Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

*bank_Conthist<-function(VarName,NumBreaks,xlab,main,lengthxfit) ## xlab and main should be mentioned under quotes as they are characters*

{

hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)

xfit<-seq(min(VarName),max(VarName),length=lengthxfit)

yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))

yfit<-yfit*diff(h$mids[1:2])*length(VarName)

lines(xfit,yfit,col="red",lwd=3)

}

bank_Conthist(bank_full$age,10,"age of customers","AGE",30)

bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100){

hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)

xfit<-seq(min(VarName),max(VarName),length=lengthxfit)

yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))

yfit<-yfit*diff(h$mids[1:2])*length(VarName)

lines(xfit,yfit,col="red",lwd=3)

}

bank_Conthist(bank_full$age,10,"age of customers","AGE",30)

bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100)

## Balance is more skewed towards to Negative or Zero

*bank_Conthist(bank_full$lastday,5,"Last Day of contact","LAst Day",10)*

bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)

## Last Duration is more skewed towards 0 to 100 secs.

*bank_Conthist(bank_full$numcontacts,30,"Number of Contacts","NUmContacts",20)*## NUmContacts are more skewed towards 1

*bank_Conthist(bank_full$pdays,30,"Previous Days of contacts","PDays",20)*## Many were not contacted previously

*bank_Conthist(bank_full$pcontacts,20,"Previous Contacts","PContacts",10)*## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables

*barplot(table(bank_full$job),col="red",main="JOB")*

barplot(table(bank_full$marital),col="green",main="Marital")

barplot(table(bank_full$education),col="red",main="Education")

barplot(table(bank_full$creditdefault),col="red",main="Credit Default")barplot(table(bank_full$marital),col="green",main="Marital")

barplot(table(bank_full$education),col="red",main="Education")

barplot(table(bank_full$creditdefault),col="red",main="Credit Default")

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis

*bank_full[5]<-NULL*

str(bank_full)

barplot(table(bank_full$housingloan),col="red",main="Housing Loan")

barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")

barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")

barplot(table(bank_full$lastmonth),col="violet",main="Last Month")

barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")str(bank_full)

barplot(table(bank_full$housingloan),col="red",main="Housing Loan")

barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")

barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")

barplot(table(bank_full$lastmonth),col="violet",main="Last Month")

barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")

##

**Correlation Matrix**among input (or independent) continuous variables*bank_full.cont<-data.frame(bank_full$age,bank_full$balance,bank_full$lastday,bank_full$lastduration,bank_full$numcontacts,bank_full$pdays,bank_full$pcontacts)*

str(bank_full.cont)

cor(bank_full.cont)str(bank_full.cont)

cor(bank_full.cont)

## It can be observed that No two variables are highly correlated

##

**Partitioning Data**into Train and Test datasets in 70:30*library(caret)**set.seed(1234567)*

train1<-createDataPartition(bank_full$subscribed,p=0.7,list=FALSE)

train<-bank_full[train1,]

test<-bank_full[-train1,]train1<-createDataPartition(bank_full$subscribed,p=0.7,list=FALSE)

train<-bank_full[train1,]

test<-bank_full[-train1,]

__## CLASSIFICATION USING NEURAL NETWORK__

## 1. Fit a Single Hidden Layer Neural Network using Least Squares

## Use TEST data for testing the trained model

## MisClassification Confusion Matrix

## One can maximize the Accuracy by changing the "size" while training the neural network. SIZE refers to the number of nodes in the hidden layer.

##2. Use Multinomial Log Linear models using Neural Networks

##USe TEST data for testing the trained model

##Misclassification or Confusion Matrix

##3. Training Neural Network Using BACK PROPOGATION

## Check for all Input Independent Variables to be Integer or Numeric or complex matrix or vector arguments. If they are not any one of these, then tranform them accordingly

## It can be observed that all are either integer or factor. Now these factors have to be transformed to numeric.

## One cannot use directly as.numeric() to convert factors to numeric as it has limitations.

## First, Lets convert factors having character levels to numeric levels

## Now convert these numerical factors into numeric

## Now all the variables are wither intergers or numeric

## Now we shall partition the data into train and test data

library(caret)

set.seed(1234567)

train2<-createDataPartition(bank_full_transform$subscribed,p=0.7,list=FALSE)

trainnew<-bank_full_transform[train2,]

testnew<-bank_full_transform[-train2,]

str(trainnew)

str(testnew)

## Now lets run the neuralnet model on Train dataset

## Here, Back Propogation Algorithm has been used. One can also use rprop+ (resilient BP with weight backtracking),rprop- (resilient BP without weight backtracking), "sag and "slr" as modified global convergent algorithm

## Accordingly the accuracy can be checked for each algorithm

## Plot method for the genralised weights wrt specific covariate (independent variable) and response target variable

*library(nnet)*## 1. Fit a Single Hidden Layer Neural Network using Least Squares

*train.nnet<-nnet(subscribed~.,train,size=3,rang=0.07,Hess=FALSE,decay=15e-4,maxit=250)*## Use TEST data for testing the trained model

*test.nnet<-predict(train.nnet,test,type=("class"))*## MisClassification Confusion Matrix

*table(test$subscribed,test.nnet)*## One can maximize the Accuracy by changing the "size" while training the neural network. SIZE refers to the number of nodes in the hidden layer.

*which.is.max(test.nnet)*## To Fine which row break ties at random (Maximum position in vector)##2. Use Multinomial Log Linear models using Neural Networks

*train.mlln<-multinom(subscribed~.,train)*##USe TEST data for testing the trained model

*test.mlln<-predict(train.mlln,test)*##Misclassification or Confusion Matrix

*table(test$subscribed,test.mlln)*##3. Training Neural Network Using BACK PROPOGATION

*install.packages("neuralnet")*

library(neuralnet)library(neuralnet)

## Check for all Input Independent Variables to be Integer or Numeric or complex matrix or vector arguments. If they are not any one of these, then tranform them accordingly

*str(train)*

str(test)str(test)

## It can be observed that all are either integer or factor. Now these factors have to be transformed to numeric.

## One cannot use directly as.numeric() to convert factors to numeric as it has limitations.

## First, Lets convert factors having character levels to numeric levels

*str(bank_full)*

bank_full_transform<-bank_full

bank_full_transform$marital=factor(bank_full_transform$marital,levels=c("single","married","divorced"),labels=c(1,2,3))

bank_full_transform$job=factor(bank_full_transform$job,levels=c("admin","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

bank_full_transform$education=factor(bank_full_transform$education,levels=c("primary","secondary","tertiary","unknown"),labels=c(1,2,3,4))

bank_full_transform$housingloan=factor(bank_full_transform$housingloan,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$personalloan=factor(bank_full_transform$personalloan,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$subscribed=factor(bank_full_transform$subscribed,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$lastcommtype=factor(bank_full_transform$lastcommtype,levels=c("cellular","telephone","unknown"),labels=c(1,2,3))

bank_full_transform$poutcome=factor(bank_full_transform$poutcome,levels=c("failure","other","success","unknown"),labels=c(1,2,3,4))

bank_full_transform$lastmonth=factor(bank_full_transform$lastmonth,levels=c("apr","aug","dec","feb","jan","jul","jun","mar","may","nov","oct","sep"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

str(bank_full_transform)bank_full_transform<-bank_full

bank_full_transform$marital=factor(bank_full_transform$marital,levels=c("single","married","divorced"),labels=c(1,2,3))

bank_full_transform$job=factor(bank_full_transform$job,levels=c("admin","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

bank_full_transform$education=factor(bank_full_transform$education,levels=c("primary","secondary","tertiary","unknown"),labels=c(1,2,3,4))

bank_full_transform$housingloan=factor(bank_full_transform$housingloan,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$personalloan=factor(bank_full_transform$personalloan,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$subscribed=factor(bank_full_transform$subscribed,levels=c("no","yes"),labels=c(1,2))

bank_full_transform$lastcommtype=factor(bank_full_transform$lastcommtype,levels=c("cellular","telephone","unknown"),labels=c(1,2,3))

bank_full_transform$poutcome=factor(bank_full_transform$poutcome,levels=c("failure","other","success","unknown"),labels=c(1,2,3,4))

bank_full_transform$lastmonth=factor(bank_full_transform$lastmonth,levels=c("apr","aug","dec","feb","jan","jul","jun","mar","may","nov","oct","sep"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))

str(bank_full_transform)

## Now convert these numerical factors into numeric

*bank_full_transform$subscribed<-as.numeric(as.character(bank_full_transform$subscribed))*

bank_full_transform$job<-as.numeric(as.character(bank_full_transform$job))

bank_full_transform$marital<-as.numeric(as.character(bank_full_transform$marital))

bank_full_transform$education<-as.numeric(as.character(bank_full_transform$education))

bank_full_transform$personalloan<-as.numeric(as.character(bank_full_transform$personalloan))

bank_full_transform$housingloan<-as.numeric(as.character(bank_full_transform$housingloan))

bank_full_transform$lastcommtype<-as.numeric(as.character(bank_full_transform$lastcommtype))

bank_full_transform$lastmonth<-as.numeric(as.character(bank_full_transform$lastmonth))

bank_full_transform$poutcome<-as.numeric(as.character(bank_full_transform$poutcome))

bank_full_transform$age<-as.numeric(as.character(bank_full_transform$age))

bank_full_transform$balance<-as.numeric(as.character(bank_full_transform$balance))

bank_full_transform$lastday<-as.numeric(as.character(bank_full_transform$lastday))

bank_full_transform$lastduration<-as.numeric(as.character(bank_full_transform$lastduration))

bank_full_transform$numcontacts<-as.numeric(as.character(bank_full_transform$numcontacts))

bank_full_transform$pdays<-as.numeric(as.character(bank_full_transform$pdays))

bank_full_transform$pcontacts<-as.numeric(as.character(bank_full_transform$pcontacts))

str(bank_full_transform)bank_full_transform$job<-as.numeric(as.character(bank_full_transform$job))

bank_full_transform$marital<-as.numeric(as.character(bank_full_transform$marital))

bank_full_transform$education<-as.numeric(as.character(bank_full_transform$education))

bank_full_transform$personalloan<-as.numeric(as.character(bank_full_transform$personalloan))

bank_full_transform$housingloan<-as.numeric(as.character(bank_full_transform$housingloan))

bank_full_transform$lastcommtype<-as.numeric(as.character(bank_full_transform$lastcommtype))

bank_full_transform$lastmonth<-as.numeric(as.character(bank_full_transform$lastmonth))

bank_full_transform$poutcome<-as.numeric(as.character(bank_full_transform$poutcome))

bank_full_transform$age<-as.numeric(as.character(bank_full_transform$age))

bank_full_transform$balance<-as.numeric(as.character(bank_full_transform$balance))

bank_full_transform$lastday<-as.numeric(as.character(bank_full_transform$lastday))

bank_full_transform$lastduration<-as.numeric(as.character(bank_full_transform$lastduration))

bank_full_transform$numcontacts<-as.numeric(as.character(bank_full_transform$numcontacts))

bank_full_transform$pdays<-as.numeric(as.character(bank_full_transform$pdays))

bank_full_transform$pcontacts<-as.numeric(as.character(bank_full_transform$pcontacts))

str(bank_full_transform)

## Now all the variables are wither intergers or numeric

## Now we shall partition the data into train and test data

library(caret)

set.seed(1234567)

train2<-createDataPartition(bank_full_transform$subscribed,p=0.7,list=FALSE)

trainnew<-bank_full_transform[train2,]

testnew<-bank_full_transform[-train2,]

str(trainnew)

str(testnew)

## Now lets run the neuralnet model on Train dataset

*trainnew.nnbp<-neuralnet(subscribed~age+balance+lastday+lastduration+numcontacts+pdays+pcontacts+marital+education+housingloan+personalloan+lastmonth+poutcome+lastcommtype,data=bank_full_transform,hidden=5,threshold=0.01,err.fct="sse",linear.output=FALSE,likelihood=TRUE,stepmax=1e+05,rep=1,startweights=NULL,learningrate.limit=list(0.1,1.5),learningrate.factor=list(minus=0.5,plus=1.5),learningrate=0.5,lifesign="minimal",lifesign.step=1000,algorithm="backprop",act.fct="logistic",exclude=NULL,constant.weights=NULL)*## Here, Back Propogation Algorithm has been used. One can also use rprop+ (resilient BP with weight backtracking),rprop- (resilient BP without weight backtracking), "sag and "slr" as modified global convergent algorithm

## Accordingly the accuracy can be checked for each algorithm

*summary(train.nnbp)*## Plot method for the genralised weights wrt specific covariate (independent variable) and response target variable

*gwplot(trainnew.nnbp,selected.covariate="balance")***##**

## Plot the trained Neural Network

## To check your prediction accuracy of training model

## Now use the TEST data set to test the trained model

## Make sure that target column in removed from the test data set

## MisClassification Confusion Matrix

**(Smoother the Curve- Better is the model prediction)**## Plot the trained Neural Network

*plot(trainnew.nnbp,rep="best")*## To check your prediction accuracy of training model

*prediction(trainnew.nnbp)*

print(trainnew.nnbp)print(trainnew.nnbp)

## Now use the TEST data set to test the trained model

## Make sure that target column in removed from the test data set

*columns=c("age","job","marital","education","balance","housingloan","personalloan","lastcommtype","lastday","lastmonth","lastduration","numcontacts","pdays","poutcome")*

testnew2<-subset(testnew,select=columns)

testnew.nnbp<-compute(trainnew.nnbp,testnew2,rep=1)testnew2<-subset(testnew,select=columns)

testnew.nnbp<-compute(trainnew.nnbp,testnew2,rep=1)

## MisClassification Confusion Matrix

*table(testnew$subscribed,testnew.nnbp$net.result)*

cbind(testnew$subscribed,testnew.nnbp$net.result)

print(testnew.nnbp)cbind(testnew$subscribed,testnew.nnbp$net.result)

print(testnew.nnbp)

*PLOT OF NEURAL NETWORK DESIGNED*

how to compute confusion matrix

ReplyDelete