Customer churn¶
For this data analysis I used the Bank Customer Churn dataset from Kaggle.
First, an introductory SQL analysis to familiarize ourselves with the data.¶
USE BankChurn
GO
-- We have customer records of a bank. First, let's determine some general information, like how many records are in there.
Select count(*) from dbo.[Raw_Customer-Churn-Records];
-- 10,000 records are present. How many of them have exited?
Select count(*) from dbo.[Raw_Customer-Churn-Records] where exited = 1;
-- 2,038 people have exited the bank. The churn percentage is therefore:
With TotalExited as
(Select Cast(exited as float) as 'Exited' From dbo.[Raw_Customer-Churn-Records] where exited = 1)
Select(Select sum(exited) from TotalExited) / (Select count(*) from dbo.[Raw_Customer-Churn-Records]) *100 as ChurnRatio;
-- Roughly 20%. Now the question is: in which groups is the churn ratio higher?
-- This will determine which group we should focus on.
-- Let's first take gender. Is there a difference in churn ratio between genders?
Select Gender,
count(Gender) as 'Total',
sum(cast(Exited as float)) as 'Exited',
sum(cast(Exited as float)) / count(Gender) as 'Percentage exited'
From dbo.[Raw_Customer-Churn-Records] group by gender
-- Although female customers make up a slightly smaller share of the total, their churn rate is about 5 percentage points higher than the average.
-- Female customers could be a focus for retention.
-- What about people that complained?
Select complain,
count(complain) as 'Total',
sum(cast(Exited as float)) as 'Exited',
sum(cast(Exited as float)) / count(complain) as 'Percentage churn'
From dbo.[Raw_Customer-Churn-Records] group by complain
-- 99.5% of the people that complained exited the bank.
-- What is the percentage of people that exited that have complained?
-- This is a bit tricky, as you first have to filter to the people that exited and then count how many of them complained.
Select sum(cast(complain as float))/count(*) as 'Percentage complained'
from dbo.[Raw_Customer-Churn-Records] where exited = 1
-- This is 99.8%, slightly higher than the 99.5% above.
-- There are therefore some complainers that have not exited the bank and some exiters that did not complain.
-- However, these groups are small: almost all clients that complain also exit.
-- Next, look at exiting per geographic location.
Select
Geography,
count(*) as 'Total clients',
sum(cast(exited as float)) as 'Total exited',
sum(cast(exited as float))/count(*) as 'Percentage exited'
from dbo.[Raw_Customer-Churn-Records] group by Geography
-- The percentage of people exiting is highest in Germany.
-- What about a continuous variable such as balance?
Select
[Balance Buckets],
MIN(Balance) as 'Min Balance',
MAX(Balance) as 'Max Balance',
AVG(BALANCE) as 'Avg balance',
SUM(cast(exited as float))/ count(cast(CustomerId as float)) as 'Percentage Exited'
FROM
(SELECT
CustomerID,
Balance,
cast(exited as float) as 'Exited',
NTILE(10) OVER (ORDER BY Balance) as 'Balance Buckets'
From dbo.[Raw_Customer-Churn-Records]) as t group by [Balance Buckets] ORDER BY [Balance Buckets];
-- You can see that:
-- 1) A large portion of the population has no balance.
-- 2) Clients with a moderate balance (within this dataset) have exited more (buckets 7 and 8).
-- Lastly we will check the age variable.
Select
[Age Buckets],
AVG(Age) as 'Avg Age',
SUM(cast(exited as float))/
count(cast(CustomerId as float)) as 'Percentage Exited'
FROM
(SELECT
CustomerID,
Age,
cast(exited as float) as 'Exited',
NTILE(10) OVER (ORDER BY Age) as 'Age Buckets'
From dbo.[Raw_Customer-Churn-Records]) as t group by [Age Buckets] ORDER BY [Age Buckets];
-- Older clients have exited relatively more often.
Continue in Python to build the model¶
In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
data = pd.read_csv(parent_path + '/static/Customer-Churn-Records.csv')
data['Geography'] = data['Geography'].astype('category').cat.codes # Encode categorical column as integer codes
data['Gender'] = data['Gender'].astype('category').cat.codes # Encode categorical column as integer codes
# I did a couple of iterations with different features, to get the best accuracy.
# x = data[['CreditScore' ]]
# # accuracy 81% precision(1) 0.43
# x = data[['CreditScore', 'Geography' ]]
# # accuracy 77% precision(1) 0.32
# x = data[['CreditScore', 'Geography', 'Gender' ]]
# # accuracy 76% precision(1) 0.29
# x = data[['CreditScore', 'Geography', 'Gender', 'Age', ]]
# # accuracy 78% precision(1) 0.42
# x = data[['CreditScore', 'Geography', 'Gender', 'Age', 'Balance', ]]
# # accuracy 80% precision(1) 0.49
# x = data[['CreditScore', 'Geography', 'Gender', 'Age', 'Balance', 'IsActiveMember' , ]]
# # accuracy 82% precision(1) 0.57
# x = data[['CreditScore', 'IsActiveMember' , 'Satisfaction Score' ]]
# # accuracy 73% precision(1) 0.23
# x = data[['CreditScore', 'IsActiveMember' , 'Satisfaction Score', 'NumOfProducts' ]]
# # accuracy 75% precision(1) 0.36
x = data[['CreditScore', 'Geography', 'Gender', 'Age',
'Balance', 'IsActiveMember' , 'Satisfaction Score' ]]
# accuracy 83% precision(1) 0.60
# I could continue this exercise for all feature combinations and determine the highest accuracy
# (a small sketch of how that could be automated follows below),
# but for this demonstration I will stop at this feature selection.
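# Illustrative sketch (hypothetical helper, not used further below): one way the manual
# trials above could be automated, scoring any candidate feature subset with the same
# split and classifier settings.
def score_feature_subset(df, feature_cols, target_col='Exited'):
    x_sub = df[feature_cols]
    y_sub = df[target_col]
    xtr, xte, ytr, yte = train_test_split(x_sub, y_sub, test_size=0.3, random_state=20)
    clf = RandomForestClassifier(n_estimators=100, random_state=42, criterion='entropy')
    clf.fit(xtr, ytr)
    return accuracy_score(yte, clf.predict(xte))
# Example call, commented out so the cell output stays the same:
# print(score_feature_subset(data, ['CreditScore', 'Age', 'Balance']))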
y = data['Exited'] # Target variable
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=20)
model = RandomForestClassifier(n_estimators=100, random_state=42, criterion='entropy')
# The 'entropy' criterion measures the impurity of a node:
# entropy = -sum(p_i * log2(p_i)) over the classes.
# A two-class node with a 50/50 split has entropy 1 (maximally impure),
# a pure node has entropy 0; splits are chosen to reduce entropy the most.
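# Tiny illustration of that formula (hypothetical helper, not used further below):
def node_entropy(p):
    # Entropy of a two-class node where a fraction p belongs to the positive class.
    if p in (0.0, 1.0):
        return 0.0  # a pure node has zero entropy
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)
# node_entropy(0.5) -> 1.0, node_entropy(0.9) -> ~0.47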
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
fig, ax = plt.subplots(3, 1, figsize=(8, 8), constrained_layout=True)
# Plot the accuracy
ax[0].text(0, 0.5, f'Accuracy: {accuracy:.2f}', fontsize=15, ha='left', va='bottom')
ax[0].set_axis_off()
# Plot the confusion matrix
cmap = plt.get_cmap('Blues')
cax = ax[1].matshow(conf_matrix, cmap=cmap)
fig.colorbar(cax, ax=ax[1], location='left')
ax[1].set_title('Confusion Matrix')
ax[1].set_xlabel('Predicted Labels')
ax[1].set_ylabel('True Labels')
ax[1].set_xticks(np.arange(conf_matrix.shape[1]))
ax[1].set_yticks(np.arange(conf_matrix.shape[0]))
ax[1].set_xticklabels(np.unique(y_pred))
ax[1].set_yticklabels(np.unique(y_test))
# Annotate confusion matrix
for (i, j), val in np.ndenumerate(conf_matrix):
ax[1].text(j, i, f'{val}', ha='center', va='center', color='black')
# Add the classification report as text
ax[2].text(0, 0, "\n\nClassification Report:\n\n" + class_report,
fontsize=12, family='monospace')
ax[2].set_axis_off()
plt.savefig(parent_path + '/static/modelparameters.png')
# Display the figure
plt.show()
Accuracy: 0.83
Confusion Matrix:
[[2294 129]
[ 381 196]]
Classification Report:
               precision    recall  f1-score   support

           0        0.86      0.95      0.90      2423
           1        0.60      0.34      0.43       577

    accuracy                            0.83      3000
   macro avg        0.73      0.64      0.67      3000
weighted avg        0.81      0.83      0.81      3000
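As a quick cross-check (an illustrative snippet, not part of the original run), the class-1 precision and recall in the report follow directly from the confusion matrix above:
# Values copied from the confusion matrix printed above.
tn, fp, fn, tp = 2294, 129, 381, 196
precision_1 = tp / (tp + fp)  # 196 / 325 ~= 0.60
recall_1 = tp / (tp + fn)     # 196 / 577 ~= 0.34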
In [0]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Fit the classifier
model.fit(x_train, y_train)
# Select one tree
chosen_tree = model.estimators_[0]
print(chosen_tree)
# Plot the tree
plt.figure(figsize=(20,10))
plot_tree(chosen_tree, rounded=True, max_depth=2,
filled=True, feature_names=x.columns, class_names=['Not Exited', 'Exited'])
# plt.savefig(parent_path + '/static/modeltree.png')
plt.show()
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure()
plt.title('Feature Importances')
# Creating the bar plot
plt.bar(range(x_train.shape[1]), importances[indices], align='center')
plt.xticks(range(x_train.shape[1]), x_train.columns[indices], rotation=90)
# plt.savefig(parent_path + '/static/features.png')
# Showing the plot
plt.show()
DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
random_state=1608637542)
In [0]:
customers_not_exited = data[data['Exited'] == 0]
x_to_predict = customers_not_exited.drop(
['RowNumber', 'CustomerId',
'Surname', 'Tenure', 'NumOfProducts', 'HasCrCard',
'EstimatedSalary', 'Exited', 'Complain',
'Card Type', 'Point Earned']
, axis=1)
current_customer_list = customers_not_exited.copy()
current_customer_list['Churn_Probability'] = model.predict_proba(x_to_predict)[:, 1]
print(current_customer_list[['CustomerId', 'Churn_Probability']]
.sort_values(by=['Churn_Probability'],ascending=False).head(25))
      CustomerId  Churn_Probability
9534    15795488               0.96
313     15797960               0.93
1253    15699523               0.92
4477    15595609               0.90
8188    15623314               0.90
4396    15691119               0.88
400     15690452               0.88
9116    15672082               0.87
5158    15789582               0.87
6183    15642726               0.85
9820    15762762               0.83
3550    15648455               0.82
4053    15785385               0.81
2895    15646859               0.81
2593    15755223               0.81
9476    15571529               0.80
7107    15767358               0.80
2763    15654495               0.80
747     15587535               0.80
8372    15811958               0.79
1698    15807595               0.78
1708    15772892               0.76
9901    15802909               0.76
9410    15734659               0.75
3671    15616666               0.74