Monday, 5 May 2014

NEURAL NETWORKS- Detailed solved Classification example - Packages using "NNET" and "NEURALNET" in R



R Code - Bank Subscription Marketing - Classification {NEURAL NETWORKS}

 R Code  for NEURAL NETWORK
Data Set:- Bank Marketing https://www.blogger.com/blogger.g?blogID=4359514443959080595#editor/target=post;postID=5426750358330739357
Source of Data Set:-  UCI Repository (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing)
The Code includes the following:-
1. Data Exploration - Missing Values, Outliers
2. Data Visualisation
3.  Correlation Matrix
4. Data Partitioning
5. Packages used "NNET" and "NEURAL NET"
6. Confusion Matrix with ROC


## The data has been imported using Import Dataset option in R Environment
## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
## DATASET UNDERSTANDING
head(bank_full) ## Displays first 6 rows for each variable
str(bank_full) ## Describes each variables
summary(bank_full) ## Provides basic statistical information of each variable

## DATA EXPLORATION - Check for Missing Data
## Option 1
is.na(bank_full) ## Displays True for a missing value
## Since it is a large dataset, graphical display of missing values will prove to be easier
##Option 2
require(Amelia)
missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)

## No red colour stripes are visible. hence no missing values.
##Option 3
summary(bank_full) ## displays missing values if any under every variable

## DATA VISUALISATION
## Use Box plots (Only for continuous variables)- To Check Ouliers
boxplot(bank_full$age~bank_full$subscribed, main=" AGE",ylab="age of customers",xlab="Subscribed")
boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")
boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")
boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")
boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")
boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")
boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not bee removed keeping their significance into consideration
## Use Histograms (For both continuous and categorical variables)
## These histograms provide details abpout Skewness, Normal Distribution etc
## Function to create histograms for continuous variables with normal curve
bank_Conthist<-function(VarName,NumBreaks,xlab,main,lengthxfit) ## xlab and main should be mentioned under quotes as they are characters
{
  hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)
  xfit<-seq(min(VarName),max(VarName),length=lengthxfit)
  yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))
  yfit<-yfit*diff(h$mids[1:2])*length(VarName)
  lines(xfit,yfit,col="red",lwd=3)
}
bank_Conthist(bank_full$age,10,"age of customers","AGE",30)
bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100)

## Balance is more skewed towards to Negative or Zero
bank_Conthist(bank_full$lastday,5,"Last Day of contact","LAst Day",10)
bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)

## Last Duration is more skewed towards 0 to 100 secs.
bank_Conthist(bank_full$numcontacts,30,"Number of Contacts","NUmContacts",20)
## NUmContacts are more skewed towards 1
bank_Conthist(bank_full$pdays,30,"Previous Days of contacts","PDays",20)
## Many were not contacted previously
bank_Conthist(bank_full$pcontacts,20,"Previous Contacts","PContacts",10)
## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables
barplot(table(bank_full$job),col="red",main="JOB")
barplot(table(bank_full$marital),col="green",main="Marital")
barplot(table(bank_full$education),col="red",main="Education")
barplot(table(bank_full$creditdefault),col="red",main="Credit Default")

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis
bank_full[5]<-NULL
str(bank_full)
barplot(table(bank_full$housingloan),col="red",main="Housing Loan")
barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")
barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")
barplot(table(bank_full$lastmonth),col="violet",main="Last Month")
barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")

## Correlation Matrix among input (or independent) continuous variables
bank_full.cont<-data.frame(bank_full$age,bank_full$balance,bank_full$lastday,bank_full$lastduration,bank_full$numcontacts,bank_full$pdays,bank_full$pcontacts)
str(bank_full.cont)
cor(bank_full.cont)

## It can be observed that No two variables are highly correlated

## Partitioning Data into Train and Test datasets in 70:30
library(caret)
set.seed(1234567)
train1<-createDataPartition(bank_full$subscribed,p=0.7,list=FALSE)
train<-bank_full[train1,]
test<-bank_full[-train1,]

  
 ## CLASSIFICATION USING NEURAL NETWORK
 

library(nnet)

## 1. Fit a Single Hidden Layer Neural Network using Least Squares
train.nnet<-nnet(subscribed~.,train,size=3,rang=0.07,Hess=FALSE,decay=15e-4,maxit=250)
## Use TEST data for testing the trained model
test.nnet<-predict(train.nnet,test,type=("class"))
## MisClassification Confusion Matrix
table(test$subscribed,test.nnet)
## One can maximize the Accuracy by changing the "size" while training the neural network. SIZE refers to the number of nodes in the hidden layer.
which.is.max(test.nnet)  ## To Fine which row break ties at random (Maximum position in vector)

##2. Use Multinomial Log Linear models using Neural Networks
train.mlln<-multinom(subscribed~.,train)
##USe TEST data for testing the trained model
test.mlln<-predict(train.mlln,test)
##Misclassification or Confusion Matrix
table(test$subscribed,test.mlln)
##3. Training Neural Network Using BACK PROPOGATION
install.packages("neuralnet")
library(neuralnet)

## Check for all Input Independent Variables to be Integer or Numeric or complex matrix or vector arguments. If they are not any one of these, then tranform them accordingly
str(train)
str(test)

## It can be observed that all are either integer or factor. Now these factors have to be transformed to numeric.
## One cannot use directly as.numeric() to convert factors to numeric as it has limitations.
## First, Lets convert factors having character levels to numeric levels
str(bank_full)
bank_full_transform<-bank_full
bank_full_transform$marital=factor(bank_full_transform$marital,levels=c("single","married","divorced"),labels=c(1,2,3))
bank_full_transform$job=factor(bank_full_transform$job,levels=c("admin","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))
bank_full_transform$education=factor(bank_full_transform$education,levels=c("primary","secondary","tertiary","unknown"),labels=c(1,2,3,4))
bank_full_transform$housingloan=factor(bank_full_transform$housingloan,levels=c("no","yes"),labels=c(1,2))
bank_full_transform$personalloan=factor(bank_full_transform$personalloan,levels=c("no","yes"),labels=c(1,2))
bank_full_transform$subscribed=factor(bank_full_transform$subscribed,levels=c("no","yes"),labels=c(1,2))
bank_full_transform$lastcommtype=factor(bank_full_transform$lastcommtype,levels=c("cellular","telephone","unknown"),labels=c(1,2,3))
bank_full_transform$poutcome=factor(bank_full_transform$poutcome,levels=c("failure","other","success","unknown"),labels=c(1,2,3,4))
bank_full_transform$lastmonth=factor(bank_full_transform$lastmonth,levels=c("apr","aug","dec","feb","jan","jul","jun","mar","may","nov","oct","sep"),labels=c(1,2,3,4,5,6,7,8,9,10,11,12))
str(bank_full_transform)

## Now convert these numerical factors into numeric
bank_full_transform$subscribed<-as.numeric(as.character(bank_full_transform$subscribed))
bank_full_transform$job<-as.numeric(as.character(bank_full_transform$job))
bank_full_transform$marital<-as.numeric(as.character(bank_full_transform$marital))
bank_full_transform$education<-as.numeric(as.character(bank_full_transform$education))
bank_full_transform$personalloan<-as.numeric(as.character(bank_full_transform$personalloan))
bank_full_transform$housingloan<-as.numeric(as.character(bank_full_transform$housingloan))
bank_full_transform$lastcommtype<-as.numeric(as.character(bank_full_transform$lastcommtype))
bank_full_transform$lastmonth<-as.numeric(as.character(bank_full_transform$lastmonth))
bank_full_transform$poutcome<-as.numeric(as.character(bank_full_transform$poutcome))
bank_full_transform$age<-as.numeric(as.character(bank_full_transform$age))
bank_full_transform$balance<-as.numeric(as.character(bank_full_transform$balance))
bank_full_transform$lastday<-as.numeric(as.character(bank_full_transform$lastday))
bank_full_transform$lastduration<-as.numeric(as.character(bank_full_transform$lastduration))
bank_full_transform$numcontacts<-as.numeric(as.character(bank_full_transform$numcontacts))
bank_full_transform$pdays<-as.numeric(as.character(bank_full_transform$pdays))
bank_full_transform$pcontacts<-as.numeric(as.character(bank_full_transform$pcontacts))
str(bank_full_transform)

## Now all the variables are wither intergers or numeric
## Now we shall partition the data into train and test data
library(caret)
set.seed(1234567)
train2<-createDataPartition(bank_full_transform$subscribed,p=0.7,list=FALSE)
trainnew<-bank_full_transform[train2,]
testnew<-bank_full_transform[-train2,]
str(trainnew)
str(testnew)

## Now lets run the neuralnet model on Train dataset
trainnew.nnbp<-neuralnet(subscribed~age+balance+lastday+lastduration+numcontacts+pdays+pcontacts+marital+education+housingloan+personalloan+lastmonth+poutcome+lastcommtype,data=bank_full_transform,hidden=5,threshold=0.01,err.fct="sse",linear.output=FALSE,likelihood=TRUE,stepmax=1e+05,rep=1,startweights=NULL,learningrate.limit=list(0.1,1.5),learningrate.factor=list(minus=0.5,plus=1.5),learningrate=0.5,lifesign="minimal",lifesign.step=1000,algorithm="backprop",act.fct="logistic",exclude=NULL,constant.weights=NULL)
## Here, Back Propogation Algorithm has been used. One can also use rprop+ (resilient BP with weight backtracking),rprop- (resilient BP without weight backtracking), "sag and "slr" as modified global convergent algorithm
## Accordingly the accuracy can be checked for each algorithm
summary(train.nnbp)
## Plot method for the genralised weights wrt specific covariate (independent variable) and response target variable
gwplot(trainnew.nnbp,selected.covariate="balance")

##(Smoother the Curve- Better is the model prediction)
## Plot the trained Neural Network
plot(trainnew.nnbp,rep="best")
## To check your prediction accuracy of training model
prediction(trainnew.nnbp)
print(trainnew.nnbp)

## Now use the TEST data set to test the trained model
## Make sure that target column in removed from the test data set
columns=c("age","job","marital","education","balance","housingloan","personalloan","lastcommtype","lastday","lastmonth","lastduration","numcontacts","pdays","poutcome")
testnew2<-subset(testnew,select=columns)
testnew.nnbp<-compute(trainnew.nnbp,testnew2,rep=1)

## MisClassification Confusion Matrix
table(testnew$subscribed,testnew.nnbp$net.result)
cbind(testnew$subscribed,testnew.nnbp$net.result)
print(testnew.nnbp)
 


 PLOT OF NEURAL NETWORK DESIGNED

 

1 comment: