### R Code - Bank Subscription Marketing - Classification {RECURSIVE PARTITIONING AND REGRESSION TREES}

**R Code for Recursive Partitioning and Regression Trees (RPART)**

**Data Set:- Bank Marketing**https://www.blogger.com/blogger.g?blogID=4359514443959080595#editor/target=post;postID=5426750358330739357

**Source of Data Set:- UCI Repository (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing)**

**The Code includes the following:-**

**1. Data Exploration - Missing Values, Outliers**

**2. Data Visualisation**

**3. Correlation Matrix**

**4. Data Partitioning**

**5. Package used "rpart"**

**6. Model Tuning and Plots**

**7. Confusion Matrix**

**## The data has been imported using Import Dataset option in R Environment**

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

##

##

## Since it is a large dataset, graphical display of missing values will prove to be easier

##

## No red colour stripes are visible. hence no missing values.

##

##

## Use Box plots (Only for continuous variables) - To Check Outliers

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not been removed, keeping their significance in consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details about Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

## Balance is more skewed towards Negative or Zero

## Last Duration is more skewed towards 0 to 100 secs.

## NUmContacts are more skewed towards 1

## Many were not contacted previously

## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis

##

## It can be observed that No two variables are highly correlated

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

## DATASET UNDERSTANDING
## (bank_full is expected to exist already; it is not created in this section.)
head(bank_full)    ## Displays first 6 rows for each variable
str(bank_full)     ## Describes each variable
summary(bank_full) ## Provides basic statistical information of each variable

##

## DATA EXPLORATION - Check for Missing Data

## Option 1
is.na(bank_full) ## Displays TRUE for a missing value
## Since it is a large dataset, graphical display of missing values will prove to be easier

## Option 2
## library() stops with an error when Amelia is not installed, whereas
## require() would only return FALSE and let missmap() fail afterwards.
library(Amelia)
missmap(bank_full, main = "Missing Data - Bank Subscription", col = c("red", "grey"), legend = FALSE)
## No red colour stripes are visible, hence no missing values.

## Option 3
summary(bank_full) ## Displays missing values, if any, under every variable

## DATA VISUALISATION
## Use Box plots (Only for continuous variables) - To Check Outliers
## Each plot splits one continuous variable by the `subscribed` column.
boxplot(bank_full$age ~ bank_full$subscribed, main = " AGE", ylab = "age of customers", xlab = "Subscribed")
boxplot(bank_full$balance ~ bank_full$subscribed, main = " BALANCE", ylab = "Balance of customers", xlab = "Subscribed")
boxplot(bank_full$lastday ~ bank_full$subscribed, main = " LAST DAY", ylab = "Last day of contact", xlab = "Subscribed")
boxplot(bank_full$lastduration ~ bank_full$subscribed, main = "LAST DURATION", ylab = "Last duration of contact", xlab = "Subscribed")
boxplot(bank_full$numcontacts ~ bank_full$subscribed, main = "NUM CONTACTS", ylab = "number of contacts", xlab = "Subscribed")
boxplot(bank_full$pdays ~ bank_full$subscribed, main = " Previous DAYS", ylab = "Previous days of contact", xlab = "Subscribed")
boxplot(bank_full$pcontacts ~ bank_full$subscribed, main = " Previous Contacts", ylab = "Previous Contacts with customers", xlab = "Subscribed")

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not been removed, keeping their significance in consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details about Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

## Draw a histogram of a continuous variable and overlay a normal curve
## with the same mean and standard deviation as the data.
##
## Arguments:
##   VarName    - numeric vector to plot
##   NumBreaks  - passed to hist() as `breaks`
##   xlab, main - axis label and plot title; pass them in quotes, as they
##                are character strings
##   lengthxfit - number of points at which the normal curve is evaluated
##
## Called for its side effect (the plot); the value is whatever lines() returns.
bank_Conthist <- function(VarName, NumBreaks, xlab, main, lengthxfit) {
  ## Keep the histogram object: its bin width is needed to scale the curve.
  ## (Previously the result was discarded and `h` was read without ever
  ## being defined inside the function.)
  h <- hist(VarName, breaks = NumBreaks, col = "yellow", xlab = xlab, main = main)

  xfit <- seq(min(VarName), max(VarName), length.out = lengthxfit)
  yfit <- dnorm(xfit, mean = mean(VarName), sd = sd(VarName))

  ## Convert density to expected counts: density * bin width * sample size.
  ## The width is taken from the first two breaks, which exist even when the
  ## histogram has a single bin (the first two mids would then include an NA).
  yfit <- yfit * diff(h$breaks[1:2]) * length(VarName)

  lines(xfit, yfit, col = "red", lwd = 3)
}

## Histograms with a normal curve for each continuous variable.
## Arguments are: variable, number of breaks, x label, title, curve points.
bank_Conthist(bank_full$age, 10, "age of customers", "AGE", 30)
bank_Conthist(bank_full$balance, 50, "Balance of customers", "Balance", 100)
## Balance is more skewed towards Negative or Zero

bank_Conthist(bank_full$lastday, 5, "Last Day of contact", "LAst Day", 10)
bank_Conthist(bank_full$lastduration, 100, "LastDuration of COntact", "Last Duration", 10)
## Last Duration is more skewed towards 0 to 100 secs.

bank_Conthist(bank_full$numcontacts, 30, "Number of Contacts", "NUmContacts", 20)
## NumContacts are more skewed towards 1

bank_Conthist(bank_full$pdays, 30, "Previous Days of contacts", "PDays", 20)
## Many were not contacted previously

bank_Conthist(bank_full$pcontacts, 20, "Previous Contacts", "PContacts", 10)
## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables
barplot(table(bank_full$job), col = "red", main = "JOB")
barplot(table(bank_full$marital), col = "green", main = "Marital")
barplot(table(bank_full$education), col = "red", main = "Education")
barplot(table(bank_full$creditdefault), col = "red", main = "Credit Default")

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis
## Dropped by name rather than by position, so re-running this section
## cannot remove a different column by accident.
bank_full$creditdefault <- NULL
str(bank_full)

barplot(table(bank_full$housingloan), col = "red", main = "Housing Loan")
barplot(table(bank_full$personalloan), col = "blue", main = "Personal Loan")
barplot(table(bank_full$lastcommtype), col = "red", main = "Last communication type")
barplot(table(bank_full$lastmonth), col = "violet", main = "Last Month")
barplot(table(bank_full$poutcome), col = "magenta", main = "Previous Outcome")

##

## Correlation Matrix among input (or independent) continuous variables
bank_full.cont <- data.frame(
  bank_full$age,
  bank_full$balance,
  bank_full$lastday,
  bank_full$lastduration,
  bank_full$numcontacts,
  bank_full$pdays,
  bank_full$pcontacts
)
str(bank_full.cont)
cor(bank_full.cont)
## It can be observed that no two variables are highly correlated

## Partitioning Data into Train and Test datasets in 70:30
library(caret)

## Fixed seed so the same partition is drawn on every run
set.seed(1234567)

## Row indices for the training set, sampled on the `subscribed` column
train2 <- createDataPartition(bank_full$subscribed, p = 0.7, list = FALSE)
train <- bank_full[train2, ]
test <- bank_full[-train2, ]

## Save the full (unpartitioned) data set as a comma-separated file
write.table(bank_full, "bank1.csv", sep = ",")

str(train)
str(test)

__## CLASSIFICATION USING Recursive Partitioning and Regression Trees (RPART) Model__

## Various parameters that control recursive partitioning and regression trees

## generally minimum no. of observations in any terminal leaf node(minbucket)=minimum no. of observations that must exist in a node in order to split (minsplit)/3

##Training the model

##Prints the rpart object

##Summarizes the model

## Mean Variance Plot - Graphical representation of deviance of a node divided by the number of observations at the node

##Shows paths to the selected nodes- It returns the names list where each element contains the splits on the path from the root to the selected node

##Plot the rpart object

##Label the plot drawn above (TREE Dendrogram)

##PostScript Presentation Plot

##Snip-Contains the nodes that remain after selected subtrees are snipped off

## R squared plots for different splits

## Prediction on TEST data set using Trained model

##Prediction with Class Probabilities

## Level Numbers- Predicted

## Factors - Predicted

## Matrix of Level numbers, class frequencies, and probabilities

## Misclassification table or Confusion Matrix

## Residuals from a fitted rpart

##Predicted values after cross validation for a set of complexity parameter values

##cp is desired list of complexity values. By default takes values from cptable component fit

##Displays cptable

## Find MSE (Mean squared Error) with cross-validated model- Cross Validated Error estimate

## More valid for regression analysis

## Visual representation of cross validation results-Plots a complexity parameter table

##Cost complexity pruning-Determines nested sequence of subtrees by recursively snipping off least important splits

## Install rpart only when it is missing, then attach it
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)

## Various parameters that control recursive partitioning and regression trees
## Argument names must be spelled exactly: rpart.control() accepts extra
## arguments through `...`, so a misspelled option is silently ignored.
## (`maxcompete` and `usesurrogate` were previously misspelled.)
control <- rpart.control(
  minsplit = 30,
  minbucket = 10,
  cp = 0.01,
  maxcompete = 6,
  maxsurrogate = 8,
  usesurrogate = 2,
  xval = 15,
  surrogatestyle = 0,
  maxdepth = 15
)
## generally minimum no. of observations in any terminal leaf node (minbucket) = minimum no. of observations that must exist in a node in order to split (minsplit)/3

##Training the model
train.rpart <- rpart(subscribed ~ ., data = train, control = control)

##Prints the rpart object
print(train.rpart)

##Summarizes the model
## NOTE(review): `save` does not look like a summary.rpart() argument (the
## documented one is `file`); it is kept as written - confirm the intent.
summary(train.rpart, digits = getOption("digits"), cp = 0, save = "train.rpart")

## Mean Variance Plot - Graphical representation of deviance of a node divided by the number of observations at the node
## Drawn for Regression plots..Not significant for Classification plots
## Guarded so that a classification fit does not stop the script here.
if (identical(train.rpart$method, "anova")) {
  meanvar(train.rpart)
}

##Shows paths to the selected nodes - It returns a named list where each element contains the splits on the path from the root to the selected node
path.rpart(train.rpart, nodes = c(10, 11, 12, 13), pretty = 0, print.it = TRUE)

##Plot the rpart object
plot(train.rpart, uniform = FALSE, branch = 1, compress = TRUE, margin = 0)

##Label the plot drawn above (TREE Dendrogram)
text(train.rpart, splits = TRUE, use.n = TRUE)

##PostScript Presentation Plot
post(train.rpart, file = "", title = "Bank Data-RPART", use.n = TRUE)

##Snip - Contains the nodes that remain after selected subtrees are snipped off
train.rpart.snip <- snip.rpart(train.rpart, toss = 5) ## trim subtree at node=5
plot(train.rpart.snip)
text(train.rpart.snip, splits = TRUE, use.n = TRUE)
post(train.rpart.snip, file = "", title = "Bank Data-RPART", use.n = TRUE)

## R squared plots for different splits
rsq.rpart(train.rpart)

## Prediction on TEST data set using Trained model

##Prediction with Class Probabilities
test.rpart.prob <- predict(train.rpart, test, type = "prob")
test.rpart.prob

## Level Numbers - Predicted
test.rpart.vector <- predict(train.rpart, test, type = "vector")
test.rpart.vector

## Factors - Predicted
test.rpart.factor <- predict(train.rpart, test, type = "class")
test.rpart.factor

## Matrix of Level numbers, class frequencies, and probabilities
test.rpart.matrix <- predict(train.rpart, test, type = "matrix")
test.rpart.matrix

## Misclassification table or Confusion Matrix
## Rows are the predicted level numbers, columns are the values of test$subscribed.
table(test.rpart.vector, test$subscribed)

## Residuals from a fitted rpart

## Residuals from a fitted rpart
## This has more significance in Regression type dataset
residuals(train.rpart, type = c("usual", "pearson", "deviance"))

##Predicted values after cross validation for a set of complexity parameter values
##cp is desired list of complexity values. By default takes values from cptable component fit

##Displays cptable
printcp(train.rpart)

## NOTE(review): cp = 4 is far above the cp = 0.01 used for fitting - confirm
## this value is intended here and in prune() below.
train.rpart.cv <- xpred.rpart(train.rpart, xval = 15, cp = 4)

## Find MSE (Mean squared Error) with cross-validated model - Cross Validated Error estimate
## More valid for regression analysis
## The response is converted to integer level codes first: subtracting a
## factor (or character) column directly yields NA or an error.
## NOTE(review): assumes xpred.rpart() returns level numbers for a
## classification fit - confirm.
train.error <- (train.rpart.cv - as.integer(as.factor(train$subscribed)))^2
apply(train.error, 2, sum)

## Visual representation of cross validation results - Plots a complexity parameter table
plotcp(train.rpart, minline = TRUE, lty = 3, col = "red", upper = c("size", "splits", "none"))

##Cost complexity pruning - Determines nested sequence of subtrees by recursively snipping off least important splits
prune(train.rpart, cp = 4)

## RPART TREE

**VISUAL REPRESENTATION OF CROSS VALIDATION RESULTS**

## No comments:

## Post a Comment