**R Code for LOGISTIC REGRESSION and C5.0 DECISION TREE**

**Data Set:- Bank Marketing**

**Source of Data Set:- UCI Repository (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing)**

**The Code includes the following:-**

**1. Data Exploration - Missing Values, Outliers**

**2. Data Visualisation**

**3. Correlation Matrix**

**4. Data Partitioning**

**5. Logistic Regression**

**6. C5.0 Decision Tree**

**6. Confusion Matrix with ROC**

## The data has been imported using Import Dataset option in R Environment

## The data set can be obtained from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

##

**DATASET UNDERSTANDING**

*head(bank_full) ## Displays first 6 rows for each variable*

str(bank_full) ## Describes each variables

summary(bank_full) ## Provides basic statistical information of each variable

str(bank_full) ## Describes each variables

summary(bank_full) ## Provides basic statistical information of each variable

##

**DATA EXPLORATION - Check for Missing Data**

##

__Option 1__

*is.na(bank_full)*## Displays True for a missing value

## Since it is a large dataset, graphical display of missing values will prove to be easier

##

__Option 2__

*require(Amelia)*

missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)

missmap(bank_full,main="Missing Data - Bank Subscription", col=c("red","grey"),legend=FALSE)

## No red colour stripes are visible. hence no missing values.

##

__Option 3__

*summary(bank_full)*## displays missing values if any under every variable

##

**DATA VISUALISATION**

## Use Box plots (Only for continuous variables)- To Check Ouliers

*boxplot(bank_full$age~bank_full$subscribed, main=" AGE",ylab="age of customers",xlab="Subscribed")*

boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")

boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")

boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")

boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")

boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")

boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")

boxplot(bank_full$balance~bank_full$subscribed, main=" BALANCE",ylab="Balance of customers",xlab="Subscribed")

boxplot(bank_full$lastday~bank_full$subscribed, main=" LAST DAY",ylab="Last day of contact",xlab="Subscribed")

boxplot(bank_full$lastduration~bank_full$subscribed, main="LAST DURATION",ylab="Last duration of contact",xlab="Subscribed")

boxplot(bank_full$numcontacts~bank_full$subscribed, main="NUM CONTACTS",ylab="number of contacts",xlab="Subscribed")

boxplot(bank_full$pdays~bank_full$subscribed, main=" Previous DAYS",ylab="Previous days of contact",xlab="Subscribed")

boxplot(bank_full$pcontacts~bank_full$subscribed, main=" Previous Contacts",ylab="Previous Contacts with customers",xlab="Subscribed")

## Though some outliers are observed in Previous contacts, NumContacts and LastDuration, they have not bee removed keeping their significance into consideration

## Use Histograms (For both continuous and categorical variables)

## These histograms provide details abpout Skewness, Normal Distribution etc

## Function to create histograms for continuous variables with normal curve

*bank_Conthist<-function(VarName,NumBreaks,xlab,main,lengthxfit) ## xlab and main should be mentioned under quotes as they are characters*

{

hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)

xfit<-seq(min(VarName),max(VarName),length=lengthxfit)

yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))

yfit<-yfit*diff(h$mids[1:2])*length(VarName)

lines(xfit,yfit,col="red",lwd=3)

}

bank_Conthist(bank_full$age,10,"age of customers","AGE",30)

bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100)

{

hist(VarName,breaks=NumBreaks,col="yellow",xlab=xlab,main=main)

xfit<-seq(min(VarName),max(VarName),length=lengthxfit)

yfit<-dnorm(xfit,mean=mean(VarName),sd=sd(VarName))

yfit<-yfit*diff(h$mids[1:2])*length(VarName)

lines(xfit,yfit,col="red",lwd=3)

}

bank_Conthist(bank_full$age,10,"age of customers","AGE",30)

bank_Conthist(bank_full$balance,50,"Balance of customers","Balance",100)

## Balance is more skewed towards to Negative or Zero

*bank_Conthist(bank_full$lastday,5,"Last Day of contact","LAst Day",10)*

bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)

bank_Conthist(bank_full$lastduration,100,"LastDuration of COntact","Last Duration",10)

## Last Duration is more skewed towards 0 to 100 secs.

*bank_Conthist(bank_full$numcontacts,30,"Number of Contacts","NUmContacts",20)*

## NUmContacts are more skewed towards 1

*bank_Conthist(bank_full$pdays,30,"Previous Days of contacts","PDays",20)*

## Many were not contacted previously

*bank_Conthist(bank_full$pcontacts,20,"Previous Contacts","PContacts",10)*

## Since many were not contacted previously, therefore Pcontacts is 0

## Barplots for Categorical Variables

*barplot(table(bank_full$job),col="red",main="JOB")*

barplot(table(bank_full$marital),col="green",main="Marital")

barplot(table(bank_full$education),col="red",main="Education")

barplot(table(bank_full$creditdefault),col="red",main="Credit Default")

barplot(table(bank_full$marital),col="green",main="Marital")

barplot(table(bank_full$education),col="red",main="Education")

barplot(table(bank_full$creditdefault),col="red",main="Credit Default")

## Since Credit Default is highly skewed towards NO, this shall be removed from further analysis

*bank_full[5]<-NULL*

str(bank_full)

barplot(table(bank_full$housingloan),col="red",main="Housing Loan")

barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")

barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")

barplot(table(bank_full$lastmonth),col="violet",main="Last Month")

barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")

str(bank_full)

barplot(table(bank_full$housingloan),col="red",main="Housing Loan")

barplot(table(bank_full$personalloan),col="blue",main="Personal Loan")

barplot(table(bank_full$lastcommtype),col="red",main="Last communication type")

barplot(table(bank_full$lastmonth),col="violet",main="Last Month")

barplot(table(bank_full$poutcome),col="magenta",main="Previous Outcome")

##

**Correlation Matrix**among input (or independent) continuous variables

*bank_full.cont<-data.frame(bank_full$age,bank_full$balance,bank_full$lastday,bank_full$lastduration,bank_full$numcontacts,bank_full$pdays,bank_full$pcontacts)*

str(bank_full.cont)

cor(bank_full.cont)

str(bank_full.cont)

cor(bank_full.cont)

## It can be observed that No two variables are highly correlated

##

**Partitioning Data**into Train and Test datasets in 70:30

*library(caret)*

*set.seed(1234567)*

train1<-createDataPartition(bank_full$subscribed,p=0.7,list=FALSE)

train<-bank_full[train1,]

test<-bank_full[-train1,]

train1<-createDataPartition(bank_full$subscribed,p=0.7,list=FALSE)

train<-bank_full[train1,]

test<-bank_full[-train1,]

## Without Any Transformations on the Variables, following are the Classification Models Applied.

##

**LOGISTIC REGRESSION**

*train.logistic=glm(subscribed~age+job+marital+education+housingloan+balance+personalloan+lastcommtype+lastday+lastmonth+lastduration+numcontacts+pdays+pcontacts+poutcome,binomial(link="logit"),train)*

summary(train.logistic) ##Display the Results. One can remove the variables which are not significant (if P value>0.05) and re-run the model to make it more effective in terms of accurate prediction.

summary(train.logistic) ##

##Deviance was reduced to 22845-18828= 4017 in 40 degrees of freedom (31648-316408)

## Significance Test

*1-pchisq(4017,df=40)*

## Since its valus is 0, the NUll hypothesis is rejected and the model developed is significant

##Model Diagnostic Plots with train data set

*plot(train.logistic)*

##Using model for testing the TEST data set

##Initial Probabilities are generated

*test.prob<-predict.glm(train.logistic,test[,1:15],type="response")*

##

**PLOT ROC**curve to finalise your cut off value

*library(pROC)*

ROC=roc(subscribed~test.prob,data=test)

plot(ROC) ## Here cut off value = 0.5 has been selected.

ROC=roc(subscribed~test.prob,data=test)

plot(ROC) ## Here cut off value = 0.5 has been selected.

## Based on the problem condition, Higher Specificity OR Higher Sensitivity can be selected.In this Scenario, higher sensitivity is preferable

##Now convert the probabilitis to classes/levels with cut off as 0.5

*test.prediction<-cut(test.prob,c(-Inf,0.5,Inf),labels=c("No","Yes"))*

test.prediction

summary(test.prediction)

test.prediction

summary(test.prediction)

##Confusion Matrix

*table(test.prediction,test$subscribed)*

**## 2. C5.0 Decision Tree**

*## Control for C5.0 Model*

library(C50)

library(C50)

*## Train Model using C5.0*

C5.0Control(subset=TRUE,bands=2,winnow=TRUE,noGlobalPruning=FALSE,CF=0.25,minCases=2,label="C5.0 Outcome")

C5.0Control(subset=TRUE,bands=2,winnow=TRUE,noGlobalPruning=FALSE,CF=0.25,minCases=2,label="C5.0 Outcome")

train.c50=C5.0(train[,1:15],train$subscribed,trials=10,rules=FALSE,control=C5.0Control(),costs=NULL)

## Variable Importance Measure

*## Lower % Usage variables can be dicarded and the Train function is re-run to get better accuracy*

C5imp(train.c50,metric="usage",pct=TRUE)

C5imp(train.c50,metric="usage",pct=TRUE)

*## Predict on the TEST data using trained Model*

summary(train.c50) ## Check the percentage of accuracy of the model

summary(train.c50) ## Check the percentage of accuracy of the model

*## CONFUSION MATRIX from C5.0*

test.c50=predict(object=train.c50,newdata=test[,1:15],trials=1,type="class")

table(test.c50,test$subscribed)

test.c50=predict(object=train.c50,newdata=test[,1:15],trials=1,type="class")

table(test.c50,test$subscribed)

Thanks for post:

ReplyDeleteship hỏa tốc sang Nepal

ship nhanh đi Nepal

ship nhanh tới Nepal

vận chuyển bưu phẩm đi Nepal

ship tốc độ đi Nepal

www.caycotacdunggi.info