Churn is one of the largest problems faced by most businesses. It costs between 5 and 25 times as much to acquire a new customer as to retain an existing one. Therefore, anticipating when a customer will churn is an important business objective. For the purposes of this report,
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.svm import SVC
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
%matplotlib inline
# Data collection: read the Telco customer-churn CSV into a DataFrame.
csv_path = r'C:\Users\User\Documents\Data science\Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
# Alternative when the CSV sits next to the notebook:
# df = pd.read_csv('Telco-Customer-Churn.csv')
df.head()

# Data overview: dimensions, column names, missing values and cardinality.
row_count, column_count = df.shape
print("Rows:", row_count)
print("Columns:", column_count)
print("\nFeatures:\n", df.columns.tolist())
print("\nMissingvalues:", df.isnull().sum().sum())
print("\nUnique values:\n", df.nunique())
print(df['Churn'].value_counts())
df.info()

# Parse TotalCharges as numeric; entries that cannot be converted become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Churn distribution: pie chart of class shares next to a bar chart of counts.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so use the new name whenever it is available and
# fall back to the legacy name on older Matplotlib versions.
churn_plot_styles = [
    versioned if versioned in plt.style.available else legacy
    for legacy, versioned in (
        ('seaborn-dark', 'seaborn-v0_8-dark'),
        ('seaborn-talk', 'seaborn-v0_8-talk'),
    )
]
plt.style.use(churn_plot_styles)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
df['Churn'].value_counts().plot.pie(
    explode=[0, 0.08],
    ax=ax[0],
    autopct='%1.2f%%',
    shadow=True,
    fontsize=14,
    startangle=30,
    colors=["#008000", "#a6814c"],
)
ax[0].set_title('Total Churn Percentage')

# The plotted variable must be passed by keyword: seaborn >= 0.12 no longer
# accepts it as the first positional argument.
sns.countplot(x='Churn', data=df, ax=ax[1], palette=["#008000", "#a6814c"])
ax[1].set_title('Total Number of Churn Customers')
ax[1].set_ylabel(' ')
plt.show()
The data show that 26.54% of the company's customers have terminated the telco's service and left.
# Partition the columns by role: identifier, target, categorical, numerical.
Id_col = ['customerID']
target_col = ["Churn"]

# Columns with fewer than 6 distinct values are treated as categorical
# (the target itself is excluded from that list).
distinct_counts = df.nunique()
cat_cols = [
    name
    for name in distinct_counts[distinct_counts < 6].index.tolist()
    if name not in target_col
]

# Everything that is neither categorical, target, nor identifier is numerical.
non_numeric = cat_cols + target_col + Id_col
num_cols = [name for name in df.columns if name not in non_numeric]

# Separate churned and retained customers for the comparison plots.
churn = df[df["Churn"] == "Yes"]
not_churn = df[df["Churn"] == "No"]
def plot_pie(column):
    """Show side-by-side donut charts of `column` for churned and retained customers.

    Counts the values of `column` in the module-level `churn` and `not_churn`
    frames and renders a single plotly figure inline with `py.iplot`.

    Args:
        column: Name of the column whose value distribution is plotted.
    """

    def donut(frame, trace_name, x_domain):
        # One donut (pie with a hole) built from the value counts of `frame`.
        counts = frame[column].value_counts()
        return go.Pie(
            values=counts.values.tolist(),
            labels=counts.keys().tolist(),
            hoverinfo="label+percent+name",
            domain=dict(x=x_domain),
            name=trace_name,
            marker=dict(line=dict(width=2, color="rgb(33, 75, 99)")),
            hole=.6,
        )

    def centre_label(caption, x_pos):
        # Annotation placed inside the hole of a donut.
        return dict(text=caption, font=dict(size=13), showarrow=False,
                    x=x_pos, y=.5)

    traces = [
        donut(churn, "Churn Customers", [0, .48]),
        donut(not_churn, "Non Churn Customers", [.52, 1]),
    ]
    layout = go.Layout(
        dict(
            title=column + " distribution in customer attrition ",
            plot_bgcolor="rgb(243,243,243)",
            paper_bgcolor="rgb(243,243,243)",
            annotations=[
                centre_label("churn customers", .15),
                centre_label("Non Churn Customers", .88),
            ],
        )
    )
    py.iplot(go.Figure(data=traces, layout=layout))
# Histogram of a column, split by customer attrition type.
def histogram(column):
    """Show percent-normalised histograms of `column` for churned vs retained customers.

    Reads the module-level `churn` and `not_churn` frames and renders one
    plotly figure inline with `py.iplot`.

    Args:
        column: Name of the column to plot.
    """

    def percent_trace(frame, trace_name):
        # Histogram trace normalised to percentages, with a thin black outline.
        return go.Histogram(
            x=frame[column],
            histnorm="percent",
            name=trace_name,
            marker=dict(line=dict(width=.5, color="black")),
            opacity=.9,
        )

    def styled_axis(axis_title):
        # Both axes share the same grid styling and differ only in title.
        return dict(
            gridcolor='rgb(255, 255, 255)',
            title=axis_title,
            zerolinewidth=1,
            ticklen=5,
            gridwidth=2,
        )

    traces = [
        percent_trace(churn, "Churn Customers"),
        percent_trace(not_churn, "Non Churn customers"),
    ]
    layout = go.Layout(
        dict(
            title=column + " distribution in customer attrition ",
            plot_bgcolor="rgb(243,243,243)",
            paper_bgcolor="rgb(243,243,243)",
            xaxis=styled_axis(column),
            yaxis=styled_axis("percent"),
        )
    )
    py.iplot(go.Figure(data=traces, layout=layout))
# Scatter plot matrix for the numerical columns in the data.
def scatter_matrix(df):
    """Show a scatter-plot matrix of tenure, MonthlyCharges and TotalCharges.

    Points are coloured by their "Churn" value and carry that value as hover
    text. The figure is rendered inline with `py.iplot`.

    Args:
        df: DataFrame containing the columns "Churn", "tenure",
            "MonthlyCharges" and "TotalCharges". It is not modified; a
            sorted copy is used internally.
    """
    df = df.sort_values(by="Churn", ascending=True)

    # Map every distinct Churn label to an integer colour code.
    classes = df["Churn"].unique().tolist()
    class_code = {label: code for code, label in enumerate(classes)}
    color_vals = [class_code[label] for label in df["Churn"]]
    pl_colorscale = "Viridis"

    # Hover text must follow the same (sorted) row order as the plotted
    # values. Looking rows up by label with df.loc[k] would return them in
    # the original order and mislabel the points.
    text = df["Churn"].tolist()

    trace = go.Splom(
        dimensions=[
            dict(label="tenure", values=df["tenure"]),
            dict(label='MonthlyCharges', values=df['MonthlyCharges']),
            dict(label='TotalCharges', values=df['TotalCharges']),
        ],
        text=text,
        marker=dict(
            color=color_vals,
            colorscale=pl_colorscale,
            size=3,
            showscale=False,
            line=dict(width=.1, color='rgb(230,230,230)'),
        ),
    )

    # Shared axis styling; each axis receives its own copy of the dict.
    axis = dict(showline=True, zeroline=False, gridcolor="#fff", ticklen=4)
    layout = go.Layout(
        dict(
            title="Scatter plot matrix for Numerical columns for customer attrition",
            autosize=False,
            height=800,
            width=800,
            dragmode="select",
            hovermode="closest",
            plot_bgcolor='rgba(240,240,240, 0.95)',
            xaxis1=dict(axis),
            yaxis1=dict(axis),
            xaxis2=dict(axis),
            yaxis2=dict(axis),
            xaxis3=dict(axis),
            yaxis3=dict(axis),
        )
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)
# Pie charts for every categorical column (drawn once; the loop used to be
# duplicated, which rendered every chart twice).
for i in cat_cols:
    plot_pie(i)

# Histograms for every numerical column.
for i in num_cols:
    histogram(i)

# Scatter plot matrix of the numerical columns.
scatter_matrix(df)
For a better understanding, we wanted to compare these important variables between customers who churned and customers who did not churn. Customers that churned are those who
# Tenure vs Churn Relationship Analysis
# Each statement is on its own line; collapsed onto one line they are not
# valid Python.
import seaborn as sns
sns.boxplot(x='Churn', y='tenure', data=df)
sns.set_palette("cubehelix", 3)
plt.show()
The boxplot indicates that customers who have stayed with the telco for 10 years or more are unlikely to churn compared with new customers.
# 1.2. Monthly Charges vs Churn Relationship Analysis
# Distribution of MonthlyCharges for each Churn value.
sns.boxplot(data=df, x='Churn', y='MonthlyCharges')
sns.set_palette("cubehelix", 3)
plt.show()
The boxplot also indicates that there is a positive correlation between monthly charges and customer churn.
# 1.3. Total Charges vs Churn Relationship Analysis
sns.boxplot(data=df, x='Churn', y='TotalCharges')
sns.set_palette("cubehelix", 3)
plt.show()

# Contract vs Churn, then TechSupport vs Churn: one count plot per column,
# each on its own large figure with the bars split by Churn value.
for service_column in ("Contract", "TechSupport"):
    sns.set(style="darkgrid")
    sns.set_palette("cubehelix", 3)
    fig, ax = plt.subplots(figsize=(20, 10))
    ax = sns.countplot(data=df, x=service_column, hue="Churn")
# Preprocessing for churn modeling: encode the categorical columns as integers.
yes_no = {'Yes': 1, 'No': 0}
internet_addon = {'No': 0, 'Yes': 1, 'No internet service': 0}
category_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'Partner': yes_no,
    'Dependents': yes_no,
    'PhoneService': yes_no,
    'MultipleLines': {'No phone service': 0, 'No': 0, 'Yes': 1},
    'InternetService': {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    'OnlineSecurity': internet_addon,
    'OnlineBackup': internet_addon,
    'DeviceProtection': internet_addon,
    'TechSupport': internet_addon,
    'StreamingTV': internet_addon,
    'StreamingMovies': internet_addon,
    'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    'PaperlessBilling': yes_no,
    'PaymentMethod': {
        'Electronic check': 0,
        'Mailed check': 1,
        'Bank transfer (automatic)': 2,
        'Credit card (automatic)': 3,
    },
    'Churn': yes_no,
}

# Assign the encoded column back to the frame. Calling
# df[col].replace(..., inplace=True) mutates a temporary column selection,
# which does not update df when pandas Copy-on-Write is active.
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)

# The customer identifier carries no predictive information.
df.pop('customerID')
df.info()
# Feature selection: heatmap of the pairwise correlations between all columns.
sns.set(font_scale=1)
correlations = df.corr()
plot = sns.heatmap(correlations, square=True, linewidth=2, cmap='cubehelix')
To decide which features of the data to include in the predictive churn model, we’ll examine the correlation between churn and each customer feature. To avoid unstable coefficient estimates, which would make our models difficult to interpret, we will drop the ‘TotalCharges’ variable, as it is highly correlated with both ‘Tenure’ and ‘MonthlyCharges’.
# Drop TotalCharges from the modeling frame.
df.pop('TotalCharges')

# Feature groups used for modeling.
categorical_features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    # The comma after "PhoneService" is required: without it Python joins the
    # two adjacent literals into the single name "PhoneServiceMultipleLines".
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]
numerical_features = ["tenure", "MonthlyCharges"]
target = "Churn"
df[numerical_features].describe()

# Feature engineering: examine correlations.
df.corr()
# Making predictions: baseline churn model (logistic regression).
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Hold out 25% of the rows for evaluation.
train, test = train_test_split(df, test_size=0.25)

# Build the target and feature frames without mutating the split frames.
train_y = train['Churn']
test_y = test['Churn']
train_x = train.drop(columns='Churn')
test_x = test.drop(columns='Churn')

logisticRegr = LogisticRegression()
logisticRegr.fit(X=train_x, y=train_y)
test_y_pred = logisticRegr.predict(test_x)

# Stored under its own name so the imported confusion_matrix function is not
# overwritten, which would make this cell fail when run a second time.
conf_matrix = confusion_matrix(test_y, test_y_pred)

print('Intercept: ' + str(logisticRegr.intercept_))
print('Regression: ' + str(logisticRegr.coef_))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logisticRegr.score(test_x, test_y)))
print(classification_report(test_y, test_y_pred))

# Confusion-matrix heatmap: rows are true labels, columns are predictions.
class_labels = ['No churn', 'Churn']
confusion_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
heatmap = sns.heatmap(confusion_matrix_df, annot=True, annot_kws={"size": 20}, fmt="d")
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=14)
plt.ylabel('True label', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
Classification accuracy from the logistic regression classifier is 81%; however, the precision and recall for predictions in the positive class (churn) are relatively low, which suggests our data set may be imbalanced.
# Class balance of the full data set.
df['Churn'].value_counts()
from sklearn.utils import resample

# Split BEFORE resampling. Upsampling the whole frame first would put copies
# of the same minority rows in both the training and the test set, and the
# test metrics would then be measured partly on rows the model was trained on.
# NOTE: scores are therefore computed on an untouched, imbalanced test set and
# will differ from numbers obtained with the earlier resample-then-split order.
train, test = train_test_split(df, test_size=0.3, stratify=df['Churn'])

# Upsample the minority class (churn == 1) of the training data only.
df_majority = train[train['Churn'] == 0]
df_minority = train[train['Churn'] == 1]
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),  # match the majority class size
                                 random_state=1)  # seed for the random resampling

# Combine resampled results into the balanced training frame.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled['Churn'].value_counts()

train_y_upsampled = df_upsampled['Churn']
train_x_upsampled = df_upsampled.drop(columns='Churn')
# The test frames keep their original names but hold the untouched test split.
test_y_upsampled = test['Churn']
test_x_upsampled = test.drop(columns='Churn')

logisticRegr_balanced = LogisticRegression()
logisticRegr_balanced.fit(X=train_x_upsampled, y=train_y_upsampled)
test_y_pred_balanced = logisticRegr_balanced.predict(test_x_upsampled)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logisticRegr_balanced.score(test_x_upsampled, test_y_upsampled)))
print(classification_report(test_y_upsampled, test_y_pred_balanced))
The overall accuracy of the model has decreased, but the precision and recall scores for predicting churn have improved.
# Shell command (IPython "!" syntax): convert a notebook to HTML with nbconvert.
# NOTE(review): "your_notebook_name.ipynb" looks like a placeholder — confirm
# and replace it with the actual notebook file name.
!jupyter nbconvert your_notebook_name.ipynb --to html