16  Binary Classification

16.1 Data Loading

To illustrate binary classification, we'll use the "Occupancy Detection" dataset from the UCI Machine Learning Repository.

Data Source

“Experimental data used for binary classification (room occupancy) from Temperature, Humidity, Light and CO2. Ground-truth occupancy was obtained from time stamped pictures that were taken every minute.”

from ucimlrepo import fetch_ucirepo

ds = fetch_ucirepo(id=357)
ds.variables
            name     role        type demographic                                description                units missing_values
0             id       ID     Integer        None                                       None                 None             no
1           date  Feature        Date        None                                       None                 None             no
2    Temperature  Feature     Integer        None                                       None                    C             no
3       Humidity  Feature  Continuous        None                                       None                    %             no
4          Light  Feature     Integer        None                                       None                  Lux             no
5            CO2  Feature  Continuous        None                                       None                  ppm             no
6  HumidityRatio  Feature  Continuous        None                                       None kgwater-vapor/kg-air             no
7      Occupancy   Target      Binary        None  0 for not occupied, 1 for occupied status                None             no
df = ds["data"]["original"].copy()
df.rename(columns={"date": "Date", "Occupancy": "Occupied"}, inplace=True)
df.drop(columns=["id"], inplace=True)
df.head()
                  Date Temperature Humidity  Light     CO2        HumidityRatio Occupied
0  2015-02-04 17:51:00       23.18   27.272    426  721.25  0.00479298817650529      1.0
1  2015-02-04 17:51:59       23.15  27.2675  429.5     714  0.00478344094931065      1.0
2  2015-02-04 17:53:00       23.15   27.245    426   713.5  0.00477946352442199      1.0
3  2015-02-04 17:54:00       23.15     27.2    426  708.25  0.00477150882608175      1.0
4  2015-02-04 17:55:00        23.1     27.2    426   704.5  0.00475699293331518      1.0
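
As an aside, the fetched object also exposes the features and targets directly (a sketch, using ucimlrepo's attribute-style access; this variant omits the id column, so no drop is needed):

from pandas import concat

df_alt = concat([ds.data.features, ds.data.targets], axis=1)
df_alt.head()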

Dropping null values:

print(len(df))
df.dropna(inplace=True)
print(len(df))
20562
20560
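
If we want to see which rows are being dropped, we can filter for them first (a sketch, to be run before the dropna call above):

df[df.isna().any(axis=1)]  # rows containing at least one null value
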
from pandas import to_numeric

# convert the feature columns to numeric datatypes:
numeric_features = ["Temperature", "Humidity", "Light", "CO2", "HumidityRatio"]
df[numeric_features] = df[numeric_features].apply(to_numeric)

df.head()
                  Date  Temperature  Humidity  Light     CO2  HumidityRatio  Occupied
0  2015-02-04 17:51:00        23.18   27.2720  426.0  721.25       0.004793       1.0
1  2015-02-04 17:51:59        23.15   27.2675  429.5  714.00       0.004783       1.0
2  2015-02-04 17:53:00        23.15   27.2450  426.0  713.50       0.004779       1.0
3  2015-02-04 17:54:00        23.15   27.2000  426.0  708.25       0.004772       1.0
4  2015-02-04 17:55:00        23.10   27.2000  426.0  704.50       0.004757       1.0
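
By default, to_numeric raises an error on values it cannot parse. If the raw data were messier, we could coerce unparseable values to NaN instead, which pairs naturally with the dropna step above (a sketch):

df[numeric_features] = df[numeric_features].apply(to_numeric, errors="coerce")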

16.2 Data Exploration

16.2.1 Distribution of the Target

target = "Occupied"
df[target].value_counts()
Occupied
0.0    15810
1.0     4750
Name: count, dtype: int64
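
About 23% of the observations are occupied, so the classes are imbalanced. Passing normalize=True shows the class shares directly:

df[target].value_counts(normalize=True)
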
import plotly.express as px

px.histogram(df, x=target, nbins=5, height=350,
             title="Distribution of Occupancy"
            )

16.2.2 Relationships

Comparing feature distributions across the two classes shows which features separate occupied from unoccupied rooms:

px.histogram(df, x="Light", nbins=7, height=350,
             facet_col=target, color=target
            )
px.histogram(df, x="Temperature", nbins=7, height=350,
             facet_col=target, #facet_col_wrap=2
             color=target
            )
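
A box plot offers a more compact view of the same relationship (a sketch):

px.box(df, x=target, y="Light", height=350,
       title="Light Levels by Occupancy Status")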

16.2.3 Correlation

import plotly.express as px

def plot_correlation_matrix(df, method="pearson", height=450):
    """Params: method (str): "spearman" or "pearson". """

    cor_mat = df.corr(method=method, numeric_only=True)

    title = f"{method.title()} Correlation"

    fig = px.imshow(cor_mat,
                    height=height, # title=title,
                    text_auto= ".2f", # round to two decimal places
                    color_continuous_scale="Blues",
                    color_continuous_midpoint=0,
                    labels={"x": "Variable", "y": "Variable"},
    )
    # center title (h/t: https://stackoverflow.com/questions/64571789/)
    fig.update_layout(title={'text': title, 'x':0.485, 'xanchor': 'center'})
    fig.show()
plot_correlation_matrix(df, method="spearman", height=450)

Humidity and HumidityRatio are the most highly correlated pair of features; we could consider dropping one of them due to collinearity concerns.
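
To quantify this collinearity, one option is the variance inflation factor (a sketch, assuming statsmodels is installed; VIF values above roughly 5 to 10 are commonly flagged):

from statsmodels.stats.outliers_influence import variance_inflation_factor
from pandas import Series

X_num = df[numeric_features]
vif = Series(
    [variance_inflation_factor(X_num.values, i) for i in range(len(X_num.columns))],
    index=X_num.columns,
)
vif

We can also rank each feature by its (Pearson) correlation with the target: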

corr_target = df.corr(numeric_only=True)[target].sort_values(ascending=False)
corr_target
Occupied         1.000000
Light            0.914850
Temperature      0.555610
CO2              0.501582
HumidityRatio    0.257324
Humidity         0.046240
Name: Occupied, dtype: float64

16.3 X/Y Split

We separate the features (x) from the target (y), excluding the non-numeric Date column:

df.columns.tolist()
['Date',
 'Temperature',
 'Humidity',
 'Light',
 'CO2',
 'HumidityRatio',
 'Occupied']
target = "Occupied"
y = df[target].copy()

x = df.drop(columns=[target, "Date"]).copy()
print("X:", x.shape)
print("Y:", y.shape)
X: (20560, 5)
Y: (20560,)

16.4 Feature Scaling

We standardize each feature to zero mean and unit variance (a z-score transformation) so that features measured on different scales contribute comparably and the fitted coefficients are directly comparable:

x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)
x_scaled.describe().T[["mean", "std"]]
                       mean  std
Temperature    1.868976e-15  1.0
Humidity       1.105903e-17  1.0
Light          1.935330e-17  1.0
CO2            2.432987e-16  1.0
HumidityRatio  6.082467e-16  1.0
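
For reference, scikit-learn's StandardScaler performs the same transformation (a sketch; note it divides by the population standard deviation, ddof=0, whereas pandas' .std() uses the sample standard deviation, ddof=1, so the results differ only by a negligible factor of sqrt((n-1)/n)):

from sklearn.preprocessing import StandardScaler
from pandas import DataFrame

scaler = StandardScaler()
x_scaled_sk = DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
x_scaled_sk.describe().T[["mean", "std"]]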

16.5 Train Test Split

We hold out a random portion of the rows for testing (train_test_split defaults to a 25% test split):

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, random_state=99)
print("TRAIN:", x_train.shape, y_train.shape)
print("TEST:", x_test.shape, y_test.shape)
TRAIN: (15420, 5) (15420,)
TEST: (5140, 5) (5140,)
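
Since only about a quarter of the observations are occupied, we could also pass stratify=y to preserve the class proportions in both splits (a sketch, using separate variable names so the chapter's split is unchanged):

x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x_scaled, y, stratify=y, random_state=99
)
y_train_s.value_counts(normalize=True)  # class shares now match the full dataset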

16.6 Model Training

For a binary target, a natural first model is logistic regression, which models the probability of the positive class as a function of the features:

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=99)
model.fit(x_train, y_train)
LogisticRegression(random_state=99)

Examining coefficients:

from pandas import Series

coef = Series(model.coef_[0], index=x_train.columns)
coef.sort_values(ascending=False)
Light            4.825258
HumidityRatio    1.252317
CO2              1.074644
Humidity        -0.871672
Temperature     -1.252623
dtype: float64
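
Because the features are standardized, each coefficient is the change in the log-odds of occupancy per one standard deviation increase in that feature. Exponentiating the coefficients gives odds ratios, which can be easier to interpret (a sketch):

from numpy import exp

exp(coef).sort_values(ascending=False)  # odds ratio per one std-dev increase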

16.7 Model Evaluation

We make predictions on the held-out test set and examine standard classification metrics:

y_pred = model.predict(x_test)
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      3933
         1.0       0.96      1.00      0.98      1207

    accuracy                           0.99      5140
   macro avg       0.98      0.99      0.99      5140
weighted avg       0.99      0.99      0.99      5140
from sklearn.metrics import roc_auc_score

print("ROC-AUC:", roc_auc_score(y_test, y_pred).round(3))
ROC-AUC: 0.993
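
Note that roc_auc_score is usually given predicted probabilities rather than hard class labels; when given labels, it reduces to the average of the true positive and true negative rates at a single threshold. A sketch of the probability-based variant:

y_pred_proba = model.predict_proba(x_test)[:, 1]  # P(occupied) for each test row
print("ROC-AUC (from probabilities):", roc_auc_score(y_test, y_pred_proba).round(3))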

16.7.1 Confusion Matrix

from sklearn.metrics import confusion_matrix
import plotly.express as px

def plot_confusion_matrix(y_true, y_pred, height=450, showscale=False, title=None, subtitle=None):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
    # The i-th row and j-th column of the confusion matrix count the samples
    # whose true label is the i-th class (ROW)
    # and whose predicted label is the j-th class (COLUMN).
    class_names = sorted(y_true.unique().tolist())
    cm = confusion_matrix(y_true, y_pred, labels=class_names)

    title = title or "Confusion Matrix"
    if subtitle:
        title += f"<br><sup>{subtitle}</sup>"

    fig = px.imshow(cm, x=class_names, y=class_names, height=height,
                    labels={"x": "Predicted", "y": "Actual"},
                    color_continuous_scale="Blues", text_auto=True,
    )
    fig.update_layout(title={'text': title, 'x': 0.485, 'xanchor': 'center'})
    fig.update_coloraxes(showscale=showscale)

    fig.show()
plot_confusion_matrix(y_test, y_pred, height=400)
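
scikit-learn also provides a built-in confusion matrix plot, if we prefer not to maintain a custom helper (a sketch, assuming matplotlib is available):

from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")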

16.8 Complexity vs Performance

How much performance do we give up by using fewer features? Let's retrain the logistic regression on progressively smaller feature sets and compare the resulting classification reports.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def train_eval_logistic(df, target="Occupied", features=None):
    if not features:
        # default to all numeric columns besides the target:
        features = df.drop(columns=[target]).select_dtypes("number").columns.tolist()
    print("FEATURES:", features)

    x = df[features].copy()
    print("X:", x.shape)

    y = df[target].copy()
    print("Y:", y.shape)

    # SCALING:
    x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)

    # TRAIN / TEST SPLIT:
    x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, random_state=99)

    # MODEL TRAINING:
    model = LogisticRegression(random_state=99)
    model.fit(x_train, y_train)

    # PREDICTIONS AND EVALUATION:
    y_pred = model.predict(x_test)
    print(classification_report(y_test, y_pred))
train_eval_logistic(df, features=numeric_features)
FEATURES: ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
X: (20560, 5)
Y: (20560,)
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      3933
         1.0       0.96      1.00      0.98      1207

    accuracy                           0.99      5140
   macro avg       0.98      0.99      0.99      5140
weighted avg       0.99      0.99      0.99      5140
train_eval_logistic(df, features=["Light"])
FEATURES: ['Light']
X: (20560, 1)
Y: (20560,)
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      3933
         1.0       0.95      1.00      0.97      1207

    accuracy                           0.99      5140
   macro avg       0.97      0.99      0.98      5140
weighted avg       0.99      0.99      0.99      5140
train_eval_logistic(df, features=["Temperature"])
FEATURES: ['Temperature']
X: (20560, 1)
Y: (20560,)
              precision    recall  f1-score   support

         0.0       0.84      0.92      0.88      3933
         1.0       0.64      0.45      0.53      1207

    accuracy                           0.81      5140
   macro avg       0.74      0.68      0.70      5140
weighted avg       0.80      0.81      0.80      5140
train_eval_logistic(df, features=["CO2"])
FEATURES: ['CO2']
X: (20560, 1)
Y: (20560,)
              precision    recall  f1-score   support

         0.0       0.81      0.93      0.87      3933
         1.0       0.57      0.30      0.40      1207

    accuracy                           0.78      5140
   macro avg       0.69      0.62      0.63      5140
weighted avg       0.76      0.78      0.76      5140
train_eval_logistic(df, features=["Temperature","CO2"])
FEATURES: ['Temperature', 'CO2']
X: (20560, 2)
Y: (20560,)
              precision    recall  f1-score   support

         0.0       0.86      0.91      0.89      3933
         1.0       0.64      0.54      0.58      1207

    accuracy                           0.82      5140
   macro avg       0.75      0.72      0.73      5140
weighted avg       0.81      0.82      0.81      5140

Light alone recovers nearly all of the full model's performance, while Temperature and CO2 (alone or together) are substantially weaker predictors.
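
To compare the feature subsets at a glance, we could collect a single metric per subset instead of printing full reports (a sketch; eval_accuracy is an illustrative helper, not part of the chapter's code):

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from pandas import Series

def eval_accuracy(df, features, target="Occupied"):
    # illustrative helper: scale, split, fit, and score one feature subset
    x = df[features].copy()
    x = (x - x.mean(axis=0)) / x.std(axis=0)
    x_train, x_test, y_train, y_test = train_test_split(x, df[target], random_state=99)
    model = LogisticRegression(random_state=99).fit(x_train, y_train)
    return accuracy_score(y_test, model.predict(x_test))

feature_sets = {
    "all five": numeric_features,
    "light only": ["Light"],
    "temperature only": ["Temperature"],
    "co2 only": ["CO2"],
    "temperature + co2": ["Temperature", "CO2"],
}
Series({name: eval_accuracy(df, feats) for name, feats in feature_sets.items()})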