import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
Probability
In this blog post I will discuss a few examples of probability in machine learning. If you are new to probability, I recommend one of the great textbooks that cover the topic and are available for free online, such as Think Bayes by Allen Downey and Bayes Rules! by Alicia A. Johnson, Miles Q. Ott, and Mine Dogucu.
Classification algorithms can estimate \(n \times k\) class membership probabilities for each dataset, where \(n\) is the number of data points in the dataset and \(k\) is the number of classes in the training dataset. Similarly, the Gaussian Mixtures clustering algorithm can generate \(n \times k\) cluster label probabilities.
Besides classification algorithms, Gaussian Mixtures models can estimate the cluster membership probability of a data point. Every classification algorithm can estimate the probabilities of a data point belonging to each class, especially Logistic Regression and Naive Bayes.
\(\Huge P(A\vert B)={\frac {P(B\vert A)P(A)}{P(B)}}\)
# Load the penguins example dataset bundled with seaborn and preview it.
df = sns.load_dataset("penguins")
df.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
# Target: the species label. Features: every remaining column.
y = df["species"]
X = df.drop("species", axis=1)
# One-hot encode the two categorical feature columns.
X = pd.get_dummies(X, columns=["island", "sex"])
X
# Fill the missing feature values with a k-nearest-neighbours imputer.
knni = KNNImputer()
# fit_transform returns a plain array, so remember the column names first
# and rebuild the DataFrame afterwards.
colnames = X.columns
X = knni.fit_transform(X)
X = pd.DataFrame(X, columns=colnames)
X
# https://blog.4dcu.be/programming/2021/03/19/Code-Nugget-PCA-with-loadings.html
# Standardize the features, then project them onto two principal components.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
])

# Component scores per penguin, aligned with the original frame's index so
# the species label can be attached for plotting.
pca_data = pd.DataFrame(
    pipeline.fit_transform(X),
    columns=["PC1", "PC2"],
    index=df.index,
)
pca_data["species"] = df["species"]

# The fitted PCA step is the second entry of the pipeline.
pca_step = pipeline.steps[1][1]
# Loadings: one row per feature, one column per component.
loadings = pd.DataFrame(
    pca_step.components_.T,
    columns=["PC1", "PC2"],
    index=X.columns,
)
def loading_plot(
    coeff,
    labels,
    scale=1,
    text_x=None,
    text_y=None,
    colors=None,
    visible=None,
    ax=None,
    arrow_size=0.5,
):
    """Draw one loading arrow and one text label per variable.

    Args:
        coeff: Loadings, indexed as ``coeff[i, 0]`` (x component) and
            ``coeff[i, 1]`` (y component) for variable ``i``.
        labels: Iterable of label texts, one per variable.
        scale: Factor applied to the arrow end points, the arrow-head size
            and the default label positions.
        text_x: Optional explicit x positions of the labels, one per
            variable. Read by position, so a label-indexed pandas Series
            works. When ``None`` the label is placed at
            ``coeff[i, 0] * 1.2 * scale``.
        text_y: Same as ``text_x`` for the y positions.
        colors: Optional per-variable colors; ``"#000"`` is used when
            ``None``.
        visible: Optional per-variable flags; variables with a falsy flag
            are skipped entirely.
        ax: Object providing ``arrow`` and ``text`` (an Axes or the pyplot
            module). Defaults to the module-level ``plt``.
        arrow_size: Arrow-head width and length before scaling.
    """
    if ax is None:
        # Resolved at call time so that defining the function does not
        # require pyplot to be importable.
        ax = plt

    # Convert once to plain arrays: positional access then never depends on
    # the index labels of a pandas Series, and a coordinate equal to 0 does
    # not switch the custom positions off (``.all()`` used to do that, and
    # crashed outright when the argument was left at ``None``).
    label_x = None if text_x is None else np.asarray(text_x)
    label_y = None if text_y is None else np.asarray(text_y)

    for i, label in enumerate(labels):
        if visible is not None and not visible[i]:
            continue
        color = "#000" if colors is None else colors[i]
        ax.arrow(
            0,
            0,
            coeff[i, 0] * scale,
            coeff[i, 1] * scale,
            head_width=arrow_size * scale,
            head_length=arrow_size * scale,
            color=color,
        )
        ax.text(
            coeff[i, 0] * 1.2 * scale if label_x is None else label_x[i],
            coeff[i, 1] * 1.2 * scale if label_y is None else label_y[i],
            label,
            color=color,
            ha="center",
            va="center",
        )
# Stretch the loadings so the arrows are readable next to the scores.
loadings = loadings * 3.2

# Start every label a bit beyond its arrow tip ...
text_x = loadings["PC1"] * 2.4
text_y = loadings["PC2"] * 2.4

# ... then nudge individual labels by hand (offsets are in plot units).
text_y["sex_Male"] -= .5
text_y["bill_depth_mm"] -= .4
text_x["bill_depth_mm"] -= .4
text_y["sex_Female"] += .5
text_y["island_Torgersen"] += .1
text_x["island_Dream"] -= .5
text_y["island_Dream"] -= .3
text_y["island_Biscoe"] += .3
text_x["island_Biscoe"] += .3
text_x["flipper_length_mm"] += .55
text_x["body_mass_g"] += .95
text_y["body_mass_g"] -= .05
# https://seaborn.pydata.org/generated/seaborn.jointplot.html
# Scatter of the two component scores per species, with marginal plots.
g = sns.jointplot(
    data=pca_data,
    x="PC1",
    y="PC2",
    hue="species",
    ratio=4,
    marginal_ticks=True,
    height=8,
)
# Overlay density contours on the joint axes and rug marks on the margins.
g.plot_joint(sns.kdeplot, zorder=0, levels=6)
g.plot_marginals(sns.rugplot, height=-.025, clip_on=False)
# Add loadings
loading_plot(
    loadings[["PC1", "PC2"]].values,
    loadings.index,
    text_x=text_x,
    text_y=text_y,
    scale=2,
    arrow_size=0.02,
)

# Label each axis with the share of variance its component explains
plt.xlabel(f"PC1 ({pca_step.explained_variance_ratio_[0]*100:.2f} %)")
plt.ylabel(f"PC2 ({pca_step.explained_variance_ratio_[1]*100:.2f} %)")

plt.tight_layout()
plt.savefig("PCA_with_loadings.png", dpi=300)
plt.show()
/var/folders/mn/wpqfzxsn0z10p73_8jr0v2180000gn/T/ipykernel_7153/3221505183.py:36: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
text_x[i] if text_x.all() else coeff[i, 0] * 1.2 * scale,
/var/folders/mn/wpqfzxsn0z10p73_8jr0v2180000gn/T/ipykernel_7153/3221505183.py:37: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
text_y[i] if text_y.all() else coeff[i, 1] * 1.2 * scale,
# The same bill-length histogram per species under three normalizations:
# raw counts, density, and proportion (the last one with a rug).
sns.displot(df, x="bill_length_mm", kde=True, hue="species", stat="count");

sns.displot(df, x="bill_length_mm", kde=True, hue="species", stat="density");
plt.show()

sns.displot(df, x="bill_length_mm", kde=True, rug=True, hue="species", stat="proportion");
plt.show()
# Side by side on a shared y axis: histogram with KDE (left) and the
# empirical cumulative distribution in counts (right).
fig, axes = plt.subplots(1, 2, sharey=True)
fig.suptitle('PDF and CDF comparision')
sns.histplot(df[["bill_length_mm", "flipper_length_mm"]], kde=True, ax=axes[0]);
sns.ecdfplot(df[["bill_length_mm", "flipper_length_mm"]], stat="count", ax=axes[1], legend=False)
plt.ylim((0, 375));
plt.show()
import pathlib
# Fit a logistic regression on a train split and show its confusion matrix
# on the held-out third of the data.
lr = LogisticRegression(max_iter=10000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
lr.fit(X_train, y_train)
cmd = ConfusionMatrixDisplay.from_estimator(lr, X_test, y_test)
plt.show()
Logistic function visualization
# https://github.com/ageron/handson-ml3/blob/main/04_training_linear_models.ipynb
# Evaluate the logistic (sigmoid) function on [-lim, lim].
lim = 6
t = np.linspace(-lim, lim, 100)
sig = 1 / (1 + np.exp(-t))

plt.figure(figsize=(8, 3))
# Reference lines: y = 0, y = 0.5, y = 1 and the vertical axis t = 0.
plt.plot([-lim, lim], [0, 0], "k-")
plt.plot([-lim, lim], [0.5, 0.5], "k:")
plt.plot([-lim, lim], [1, 1], "k:")
plt.plot([0, 0], [-1.1, 1.1], "k-")
# The curve itself.
plt.plot(t, sig, "b-", linewidth=2, label=r"$\sigma(t) = \dfrac{1}{1 + e^{-t}}$")
plt.xlabel("t")
plt.legend(loc="upper left")
plt.axis([-lim, lim, -0.1, 1.1])
plt.gca().set_yticks([0, 0.25, 0.5, 0.75, 1])
plt.grid()
plt.show()
Obtain the logistic function mathematically
Step 1. Write out the linear regression equation
\(\Huge y=\beta_0+\beta_1 x_1+...+\beta_n x_n\)
Step 2. The logistic regression equation is the same as above, except that the output is the log odds
\(\Huge \log(odds)=\beta_0+\beta_1 x_1+...+\beta_n x_n\)
Step 3. Exponentiate both sides of the logistic regression equation to get the odds
\(\Huge odds=e^{\beta_0+\beta_1 x_1+...+\beta_n x_n}\)
Step 4. Write out the probability equation
\(\Huge p=\frac{odds}{1+odds}\)
Step 5. Plug the odds (from step 3) into the probability equation
\(\Huge p=\frac{e^{\beta_0+\beta_1 x_1+...+\beta_n x_n}}{1+e^{\beta_0+\beta_1 x_1+...+\beta_n x_n}}\)
Step 6. Divide the numerator and denominator by the odds (from step 3)
\(\Huge p=\frac{1}{1+e^{-(\beta_0+\beta_1 x_1+...+\beta_n x_n)}}\)
# Fit Gaussian Naive Bayes on the training split and show its confusion
# matrix on the test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
cmd = ConfusionMatrixDisplay.from_estimator(gnb, X_test, y_test)
plt.show()
# Contingency table: species (rows) against "body mass above the overall
# mean" (columns), with row/column totals in the "All" margins.
ct = pd.crosstab(df["species"], df["body_mass_g"] > df["body_mass_g"].mean(), margins=True)
ct
body_mass_g | False | True | All |
---|---|---|---|
species | |||
Adelie | 127 | 25 | 152 |
Chinstrap | 61 | 7 | 68 |
Gentoo | 7 | 117 | 124 |
All | 195 | 149 | 344 |
# Likelihood P(B|A): first-row count in the False column divided by the
# first row's total (A = first species row, B = body mass not above the mean).
likelihood = ct.iloc[0, 0] / ct.iloc[0, 2]
likelihood
0.8355263157894737
# Prior P(A): the first row's total divided by the grand total.
prior = ct.iloc[0, 2] / ct.iloc[3, 2]
prior
0.4418604651162791
# Normalizing constant P(B): the False column's total divided by the
# grand total.
norm = ct.iloc[3, 0] / ct.iloc[3, 2]
norm
0.5668604651162791
# Posterior P(A|B) read directly from the table: first-row count in the
# False column divided by that column's total.
posterior = ct.iloc[0, 0] / ct.iloc[3, 0]
posterior
0.6512820512820513
\(\Huge P(A\vert B)={\frac {P(B\vert A)P(A)}{P(B)}}\)
# The same posterior obtained through Bayes' rule: P(B|A) * P(A) / P(B).
result = likelihood * prior / norm

# The boolean feature used for the table: is the body mass above the mean?
df["body_mass_g"] > df["body_mass_g"].mean()
0 False
1 False
2 False
3 False
4 False
...
339 False
340 True
341 True
342 True
343 True
Name: body_mass_g, Length: 344, dtype: bool
# gnb.fit([df["body_mass_g"] > df["body_mass_g"].mean()], y)
Citation
@online{laptev2024,
author = {Laptev, Martin},
title = {Probability},
date = {2024},
urldate = {2024},
url = {https://maptv.github.io/ml/prob/},
langid = {en}
}