from ucimlrepo import fetch_ucirepo
# Fetch dataset 357 from the UCI Machine Learning Repository.
ds = fetch_ucirepo(id=357)
16 Binary Classification
16.1 Data Loading
To illustrate binary classification, we’ll use an “Occupancy Detection” dataset.
Data Source
“Experimental data used for binary classification (room occupancy) from Temperature, Humidity, Light and CO2. Ground-truth occupancy was obtained from time stamped pictures that were taken every minute.”
# Display the variable metadata (name, role, type, units, ...) for the fetched dataset.
ds.variables
name | role | type | demographic | description | units | missing_values | |
---|---|---|---|---|---|---|---|
0 | id | ID | Integer | None | None | None | no |
1 | date | Feature | Date | None | None | None | no |
2 | Temperature | Feature | Integer | None | None | C | no |
3 | Humidity | Feature | Continuous | None | None | % | no |
4 | Light | Feature | Integer | None | None | Lux | no |
5 | CO2 | Feature | Continuous | None | None | ppm | no |
6 | HumidityRatio | Feature | Continuous | None | None | kgwater-vapor/kg-air | no |
7 | Occupancy | Target | Binary | None | 0 for not occupied, 1 for occupied status | None | no |
# Work on a copy of the original table so the fetched dataset stays untouched.
df = ds["data"]["original"].copy()
# Use capitalized / clearer column names, and drop the row identifier.
df.rename(columns={"date": "Date", "Occupancy": "Occupied"}, inplace=True)
df.drop(columns=["id"], inplace=True)
df.head()
Date | Temperature | Humidity | Light | CO2 | HumidityRatio | Occupied | |
---|---|---|---|---|---|---|---|
0 | 2015-02-04 17:51:00 | 23.18 | 27.272 | 426 | 721.25 | 0.00479298817650529 | 1.0 |
1 | 2015-02-04 17:51:59 | 23.15 | 27.2675 | 429.5 | 714 | 0.00478344094931065 | 1.0 |
2 | 2015-02-04 17:53:00 | 23.15 | 27.245 | 426 | 713.5 | 0.00477946352442199 | 1.0 |
3 | 2015-02-04 17:54:00 | 23.15 | 27.2 | 426 | 708.25 | 0.00477150882608175 | 1.0 |
4 | 2015-02-04 17:55:00 | 23.1 | 27.2 | 426 | 704.5 | 0.00475699293331518 | 1.0 |
Dropping null values:
# Print the row count before and after removing rows with missing values.
print(len(df))
df.dropna(inplace=True)
print(len(df))
20562
20560
from pandas import to_numeric
# clean datatypes / convert to numeric datatypes:
numeric_features = ["Temperature", "Humidity", "Light", "CO2", "HumidityRatio"]
df[numeric_features] = df[numeric_features].apply(to_numeric)

df.head()
Date | Temperature | Humidity | Light | CO2 | HumidityRatio | Occupied | |
---|---|---|---|---|---|---|---|
0 | 2015-02-04 17:51:00 | 23.18 | 27.2720 | 426.0 | 721.25 | 0.004793 | 1.0 |
1 | 2015-02-04 17:51:59 | 23.15 | 27.2675 | 429.5 | 714.00 | 0.004783 | 1.0 |
2 | 2015-02-04 17:53:00 | 23.15 | 27.2450 | 426.0 | 713.50 | 0.004779 | 1.0 |
3 | 2015-02-04 17:54:00 | 23.15 | 27.2000 | 426.0 | 708.25 | 0.004772 | 1.0 |
4 | 2015-02-04 17:55:00 | 23.10 | 27.2000 | 426.0 | 704.50 | 0.004757 | 1.0 |
16.2 Data Exploration
16.2.1 Distribution of the Target
# Name of the column we are trying to predict.
target = "Occupied"
#df[target] = df[target].map({0: False, 1: True})
# Count how many rows fall into each class.
df[target].value_counts()
Occupied
0.0 15810
1.0 4750
Name: count, dtype: int64
import plotly.express as px
# Histogram of the target column.
px.histogram(df, x=target, nbins=5, height=350,
    title="Distribution of Occupancy"
)
16.2.2 Relationships
#px.scatter(df, x="Light", y=target, height=350,
#    trendline="ols", trendline_color_override="red"
#)

# Histogram of Light, with one facet (and color) per value of the target.
px.histogram(df, x="Light", nbins=7, height=350,
    facet_col=target, color=target
)
# Histogram of Temperature, with one facet (and color) per value of the target.
px.histogram(df, x="Temperature", nbins=7, height=350,
    facet_col=target, #facet_col_wrap=2
    color=target
)
16.2.3 Correlation
import plotly.express as px
def plot_correlation_matrix(df, method="pearson", height=450):
    """Show a heatmap of the pairwise correlations between the numeric columns.

    Params:
        df (DataFrame): the data; only numeric columns are used
            (``numeric_only=True`` is passed to ``df.corr``).
        method (str): "spearman" or "pearson".
        height (int): figure height, passed through to ``px.imshow``.

    Displays the figure via ``fig.show()``; nothing is returned.
    """
    cor_mat = df.corr(method=method, numeric_only=True)

    title = f"{method.title()} Correlation"

    fig = px.imshow(cor_mat,
        height=height, # title=title,
        text_auto=".2f", # round to two decimal places
        color_continuous_scale="Blues",
        color_continuous_midpoint=0,
        labels={"x": "Variable", "y": "Variable"},
    )
    # center title (h/t: https://stackoverflow.com/questions/64571789/)
    fig.update_layout(title={'text': title, 'x':0.485, 'xanchor': 'center'})
    fig.show()
# df.drop(columns=["Hour", "Minute"])
plot_correlation_matrix(df, method="spearman", height=450)
Humidity and humidity ratio are the most highly correlated features. We can consider dropping one due to collinearity concerns.
# Correlation of each numeric column with the target, strongest positive first.
corr_target = df.corr(numeric_only=True)[target].sort_values(ascending=False)
corr_target
Occupied 1.000000
Light 0.914850
Temperature 0.555610
CO2 0.501582
HumidityRatio 0.257324
Humidity 0.046240
Name: Occupied, dtype: float64
16.3 X/Y Split
# List the available columns before choosing features and target.
df.columns.tolist()
['Date',
'Temperature',
'Humidity',
'Light',
'CO2',
'HumidityRatio',
'Occupied']
target = "Occupied"
y = df[target].copy()

# Features: everything except the target and the Date column.
x = df.drop(columns=[target, "Date"]).copy() #, "Timestamp", "Date"
print("X:", x.shape)
print("Y:", y.shape)
X: (20560, 5)
Y: (20560,)
16.4 Feature Scaling
# Standardize each column: subtract its mean and divide by its standard deviation.
x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)
# Check the result: means should be ~0 and standard deviations 1.
x_scaled.describe().T[["mean", "std"]]
mean | std | |
---|---|---|
Temperature | 1.868976e-15 | 1.0 |
Humidity | 1.105903e-17 | 1.0 |
Light | 1.935330e-17 | 1.0 |
CO2 | 2.432987e-16 | 1.0 |
HumidityRatio | 6.082467e-16 | 1.0 |
16.5 Train Test Split
from sklearn.model_selection import train_test_split
# A fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, random_state=99)
print("TRAIN:", x_train.shape, y_train.shape)
print("TEST:", x_test.shape, y_test.shape)
TRAIN: (15420, 5) (15420,)
TEST: (5140, 5) (5140,)
16.6 Model Training
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier on the training split.
model = LogisticRegression(random_state=99)
model.fit(x_train, y_train)
LogisticRegression(random_state=99)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=99)
Examining coefficients:
from pandas import Series
# Label each fitted coefficient with its feature name, largest first.
coef = Series(model.coef_[0], index=x_train.columns)
coef.sort_values(ascending=False)
Light 4.825258
HumidityRatio 1.252317
CO2 1.074644
Humidity -0.871672
Temperature -1.252623
dtype: float64
16.7 Model Evaluation
# Predicted class labels for the held-out test rows.
y_pred = model.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0.0 1.00 0.99 0.99 3933
1.0 0.96 1.00 0.98 1207
accuracy 0.99 5140
macro avg 0.98 0.99 0.99 5140
weighted avg 0.99 0.99 0.99 5140
from sklearn.metrics import roc_auc_score
# Built-in round() works whether the score is a NumPy scalar or a plain float
# (a plain float has no .round() method).
# NOTE(review): y_pred holds hard class labels from model.predict();
# roc_auc_score is documented to take scores or probabilities
# (e.g. model.predict_proba(x_test)[:, 1]). Consider switching; that would
# change the value printed below.
print("ROC-AUC:", round(roc_auc_score(y_test, y_pred), 3))
ROC-AUC: 0.993
16.7.1 Confusion Matrix
Code
from sklearn.metrics import confusion_matrix
import plotly.express as px
def plot_confusion_matrix(y_true, y_pred, height=450, showscale=False, title=None, subtitle=None):
    """Show a heatmap of the confusion matrix for a set of predictions.

    Params:
        y_true: the actual class labels (any iterable of hashable, sortable labels).
        y_pred: the predicted class labels, aligned with y_true.
        height (int): figure height, passed through to ``px.imshow``.
        showscale (bool): whether to display the color scale.
        title (str): figure title; defaults to "Confusion Matrix".
        subtitle (str): optional smaller text rendered under the title.

    Displays the figure via ``fig.show()``; nothing is returned.
    """
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
    # Confusion matrix whose i-th row and j-th column
    # ... indicates the number of samples with
    # ... true label being i-th class (ROW)
    # ... and predicted label being j-th class (COLUMN)

    # Derive the class labels from the y_true argument (not from a global),
    # so the function works for any set of labels passed in.
    class_names = sorted(set(y_true))

    # Passing labels explicitly fixes the row/column order to match class_names.
    cm = confusion_matrix(y_true, y_pred, labels=class_names)

    title = title or "Confusion Matrix"
    if subtitle:
        title += f"<br><sup>{subtitle}</sup>"

    fig = px.imshow(cm, x=class_names, y=class_names, height=height,
        labels={"x": "Predicted", "y": "Actual"},
        color_continuous_scale="Blues", text_auto=True,
    )
    fig.update_layout(title={'text': title, 'x':0.485, 'xanchor': 'center'})
    fig.update_coloraxes(showscale=showscale)

    fig.show()
plot_confusion_matrix(y_test, y_pred, height=400)
16.8 Complexity vs Performance
Code
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from pandas import Series
def train_eval_logistic(df, target="Occupied", features=None):
    """Train a logistic regression on the chosen features and print its evaluation.

    Params:
        df (DataFrame): data containing the target column and the feature columns.
        target (str): name of the column to predict.
        features (list[str]): names of the columns to use as inputs.
            When omitted or empty, all numeric columns other than the target are used.

    Prints the feature list, the X / Y shapes, and the classification report
    for the test split; nothing is returned.
    """
    # Imported locally so the function does not depend on an earlier cell's import.
    from sklearn.metrics import classification_report

    if not features:
        # Only numeric columns can be scaled and modeled, so skip the others
        # when falling back to "all features".
        features = df.drop(columns=[target]).select_dtypes(include="number").columns.tolist()
    print("FEATURES:", features)

    x = df[features].copy()
    print("X:", x.shape)

    y = df[target].copy()
    print("Y:", y.shape)

    # SCALING:
    x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)

    # TRAIN / TEST SPLIT:
    x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, random_state=99)

    # MODEL TRAINING:
    model = LogisticRegression(random_state=99)
    model.fit(x_train, y_train)

    #print("COEFS:")
    #coef = Series(model.coef_[0], index=x_train.columns)
    #print(coef.sort_values(ascending=False))

    # PREDS AND EVAL:
    y_pred = model.predict(x_test)
    print(classification_report(y_test, y_pred))
train_eval_logistic(df, features=numeric_features)
FEATURES: ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
X: (20560, 5)
Y: (20560,)
precision recall f1-score support
0.0 1.00 0.99 0.99 3933
1.0 0.96 1.00 0.98 1207
accuracy 0.99 5140
macro avg 0.98 0.99 0.99 5140
weighted avg 0.99 0.99 0.99 5140
train_eval_logistic(df, features=["Light"])
FEATURES: ['Light']
X: (20560, 1)
Y: (20560,)
precision recall f1-score support
0.0 1.00 0.98 0.99 3933
1.0 0.95 1.00 0.97 1207
accuracy 0.99 5140
macro avg 0.97 0.99 0.98 5140
weighted avg 0.99 0.99 0.99 5140
train_eval_logistic(df, features=["Temperature"])
FEATURES: ['Temperature']
X: (20560, 1)
Y: (20560,)
precision recall f1-score support
0.0 0.84 0.92 0.88 3933
1.0 0.64 0.45 0.53 1207
accuracy 0.81 5140
macro avg 0.74 0.68 0.70 5140
weighted avg 0.80 0.81 0.80 5140
train_eval_logistic(df, features=["CO2"])
FEATURES: ['CO2']
X: (20560, 1)
Y: (20560,)
precision recall f1-score support
0.0 0.81 0.93 0.87 3933
1.0 0.57 0.30 0.40 1207
accuracy 0.78 5140
macro avg 0.69 0.62 0.63 5140
weighted avg 0.76 0.78 0.76 5140
train_eval_logistic(df, features=["Temperature","CO2"])
FEATURES: ['Temperature', 'CO2']
X: (20560, 2)
Y: (20560,)
precision recall f1-score support
0.0 0.86 0.91 0.89 3933
1.0 0.64 0.54 0.58 1207
accuracy 0.82 5140
macro avg 0.75 0.72 0.73 5140
weighted avg 0.81 0.82 0.81 5140