In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import OrderedDict
from time import time

import numpy as np
import scipy as sp
import pandas as pd

from scipy.optimize import fmin_powell
from scipy import integrate
from scipy import linalg

from sklearn import linear_model
from sklearn.exceptions import ConvergenceWarning

# Global look-and-feel shared by every figure in the notebook.
sns.set_style("whitegrid")
sns.set_palette("colorblind")
palette = sns.color_palette()   # colour list reused for decision-boundary curves
figsize = (15,8)                # default size of the wide regression figures
legend_fontsize = 16

from matplotlib import rc
rc('font',**{'family':'sans-serif'})
rc('text', usetex=True)
# Every rc('text.latex', preamble=...) call REPLACES the stored preamble, so
# two separate calls would keep only the last package. Both packages needed
# for Cyrillic labels are therefore passed in a single preamble string.
rc('text.latex', preamble='\n'.join([
    r'\usepackage[utf8]{inputenc}',
    r'\usepackage[russian]{babel}',
]))
rc('axes', **{'titlesize': '16', 'labelsize': '16'})
rc('legend', **{'fontsize': '16'})
rc('figure', **{'dpi' : 300})

## Порождающие модели для классификации

In [None]:
def sample_mixture(N, pi, mu1, sigma1, mu2, sigma2):
    """Draw N points from a two-component Gaussian mixture.

    Parameters
    ----------
    N : int
        Number of points to draw.
    pi : float
        Probability that a point comes from the first component (label 1).
    mu1, sigma1 : array-like
        Mean vector and covariance matrix of the first component.
    mu2, sigma2 : array-like
        Mean vector and covariance matrix of the second component (label 0).

    Returns
    -------
    z : ndarray of int, shape (N,)
        Component labels: 1 for the first component, 0 for the second.
    res : ndarray of float, shape (N, d)
        Sampled points, where d is the length of ``mu1``.

    Notes
    -----
    Uses the global ``np.random`` state; labels are drawn first, then the
    points of component 1, then the points of component 2.
    """
    z = np.array(np.random.rand(N) < pi, dtype=int)
    n_first = int(np.sum(z))
    # Take the dimensionality from the mean vector instead of hard-coding 2,
    # so the sampler also works for mixtures in other dimensions.
    res = np.zeros((N, np.shape(mu1)[0]))
    res[z == 1] = np.random.multivariate_normal(mu1, sigma1, n_first)
    res[z == 0] = np.random.multivariate_normal(mu2, sigma2, N - n_first)
    return z, res

def plot_points(ax, x, z, mu1, mu2):
    """Scatter labelled 2-D points and the two reference centres on ``ax``.

    Points with label 1 are drawn in blue and points with label 0 in red
    (small dots); ``mu1`` and ``mu2`` are marked with a blue and a red star.
    """
    # Sample points: label 1 first, then label 0.
    for label, shade in ((1, 'b'), (0, 'r')):
        rows = np.where(z == label)
        ax.scatter(x[rows, 0], x[rows, 1], marker='.', color=shade)
    # Reference centres drawn on top of the samples.
    for centre, shade in ((mu1, 'b'), (mu2, 'r')):
        ax.scatter([centre[0]], [centre[1]], marker='*', s=120, color=shade)


# Seed the global NumPy RNG so the sampled train/test sets (and every figure
# built from them) are identical after "Restart Kernel -> Run All".
np.random.seed(0)

# True parameters of the two Gaussian classes: sample_mixture assigns label 1
# to points drawn with (mu1, sigma1) and label 0 to points drawn with
# (mu2, sigma2).
mu1, sigma1 = np.array([-1, -.5]),  np.array([[1, -.5], [-.5, 2]])
mu2, sigma2 = np.array([.5, 1]), np.array([[2, .5], [.5, .5]])
# Two independent samples of 100 points each with mixing weight 0.5.
z, x = sample_mixture(100, 0.5, mu1, sigma1, mu2, sigma2)
test_z, test_x = sample_mixture(100, 0.5, mu1, sigma1, mu2, sigma2)

# Scatter plot of the first sample together with the true class means.
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
plot_points(ax, x, z, mu1, mu2)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Linear discriminant analysis; store_covariance=True keeps the fitted
# covariance available as lda.covariance_ for the ellipse plots.
lda = LinearDiscriminantAnalysis(store_covariance=True)
lda.fit(x, z)
z_lda = lda.predict(x)

# Quadratic discriminant analysis, fitted and evaluated on the same sample.
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(x, z)
z_qda = qda.predict(x)

In [None]:
from matplotlib import colors
# Two-colour map for class-probability heat maps. Each channel is a list of
# (position, value, value) anchors: at 0 the colour is (1, 0.7, 0.7), a light
# red, and at 1 it is (0.7, 0.7, 1), a light blue, with linear interpolation
# in between.
cmap = colors.LinearSegmentedColormap(
    'red_blue_classes',
    {'red': [(0, 1, 1), (1, 0.7, 0.7)],
     'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)],
     'blue': [(0, 0.7, 0.7), (1, 1, 1)]})

def plot_ellipse(ax, mu, sigma, color):
    """Draw the covariance ellipse of a 2-D Gaussian on ``ax``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes that receive the ellipse patch and the centre marker.
    mu : array-like, length 2
        Centre of the ellipse.
    sigma : array-like, shape (2, 2)
        Symmetric covariance matrix.
    color : matplotlib colour
        Face colour of the ellipse and colour of the '+' centre marker.

    The full axis lengths are ``2 * sqrt(eigenvalue)``, i.e. the ellipse
    reaches one standard deviation from the centre along each principal axis.
    """
    # eigh returns eigenvalues in ascending order and the matching unit
    # eigenvectors as the COLUMNS of w, so the direction of the first
    # principal axis is w[:, 0] (not the row w[0]).
    v, w = sp.linalg.eigh(sigma)
    u = w[:, 0]
    # arctan2 handles a vertical axis (u[0] == 0) without dividing by zero.
    # The eigenvector sign is arbitrary, but an ellipse is unchanged by a
    # 180 degree rotation, so either sign gives the same picture.
    angle = np.degrees(np.arctan2(u[1], u[0]))
    # Round-off can make a (near-)singular covariance yield tiny negative
    # eigenvalues; clip them so the square root stays real.
    axis_len = 2 * np.sqrt(np.clip(v, 0, None))
    ell = mpl.patches.Ellipse((mu[0], mu[1]), axis_len[0], axis_len[1],
                              angle=angle, facecolor=color,
                              edgecolor='black', linewidth=2)
    ell.set_clip_box(ax.bbox)
    ell.set_alpha(0.2)
    ax.add_artist(ell)
    ax.scatter(mu[0], mu[1], marker='+', color=color, s=100)

def plot_colormesh(ax, model):
    """Shade ``ax`` with the class-1 probability predicted by ``model``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; its current x/y limits define the evaluated region.
    model : fitted classifier
        Must provide ``predict_proba`` for 2-D inputs; column 1 of the result
        is what gets plotted.

    Draws a heat map with the module-level ``cmap`` on a fixed [0, 1] colour
    scale, a thick white contour at probability 0.5 and thin white contours
    at 0.2, 0.4, 0.6 and 0.8.
    """
    nx, ny = 100, 100
    # Read the limits from the axes that was passed in rather than from
    # pyplot's "current axes", so the function draws where the caller asked.
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx), np.linspace(y_min, y_max, ny))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    # vmin/vmax pin the colour scale to [0, 1] without needing the
    # matplotlib `colors` module name, which a later cell may rebind.
    ax.pcolormesh(xx, yy, Z, cmap=cmap, vmin=0., vmax=1., zorder=0)
    ax.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
    ax.contour(xx, yy, Z, [0.2, 0.4, 0.6, 0.8], linewidths=.8, colors='white')

In [None]:
# Class priors stored on the fitted LDA model.
print(f"LDA priors: {lda.priors_}")

# Training points, an ellipse of the shared LDA covariance around each fitted
# class mean (index 0 in red, index 1 in blue), and the probability heat map.
fig, ax = plt.subplots(figsize=(8, 6))
plot_points(ax, x, z, mu1, mu2)
for class_idx, shade in enumerate(('r', 'b')):
    plot_ellipse(ax, lda.means_[class_idx], lda.covariance_, shade)
plot_colormesh(ax, lda)
plt.show()

In [None]:
# Class priors stored on the fitted QDA model.
print(f"QDA priors: {qda.priors_}")

# Same picture as for LDA, but every class uses its own covariance matrix.
fig, ax = plt.subplots(figsize=(8, 6))
plot_points(ax, x, z, mu1, mu2)
for class_idx, shade in enumerate(('r', 'b')):
    plot_ellipse(ax, qda.means_[class_idx], qda.covariance_[class_idx], shade)
plot_colormesh(ax, qda)
plt.show()

## Логистическая регрессия

In [None]:
# Logistic regression with class_weight='balanced', fitted on the same sample
# as LDA/QDA; z_logregr holds its predictions for the training points.
logregr = linear_model.LogisticRegression(class_weight='balanced')
logregr.fit(x, z)
z_logregr = logregr.predict(x)

In [None]:
# Training points over the probability heat map of the logistic regression.
fig, ax = plt.subplots(figsize=(8, 6))
plot_points(ax, x, z, mu1, mu2)
plot_colormesh(ax, logregr)
plt.show()

In [None]:
def plot_decision_boundary(ax, model, color, label):
    """Draw the 0.5-probability contour of ``model`` on ``ax``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; its current x/y limits define the evaluated region.
    model : fitted classifier
        Must provide ``predict_proba`` for 2-D inputs; column 1 is contoured.
    color : matplotlib colour
        Colour of the contour line.
    label : str
        Not used by the function; kept so existing call sites keep working.
        Callers build the legend themselves from the returned handles.

    Returns
    -------
    list
        Legend handles of the contour set (``legend_elements()[0]``).
    """
    nx, ny = 100, 100
    # Use the limits of the axes that was passed in, not pyplot's current
    # axes, so the boundary is always computed for the region shown on `ax`.
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx), np.linspace(y_min, y_max, ny))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    p = ax.contour(xx, yy, Z, [0.5], linewidths=1.5, colors=[color])
    handles, _ = p.legend_elements()
    return handles

# Overlay the 0.5-probability boundaries of the three fitted models on the
# training sample, one palette colour per model.
fig, ax = plt.subplots(figsize=(10, 6))
plot_points(ax, x, z, mu1, mu2)
h1 = plot_decision_boundary(ax, lda, palette[0], "LDA")
h2 = plot_decision_boundary(ax, qda, palette[1], "QDA")
h3 = plot_decision_boundary(ax, logregr, palette[2], "Логистическая регрессия")
# Each helper call returns a list of handles; the first one represents the curve.
ax.legend(
    [h1[0], h2[0], h3[0]],
    ["LDA", "QDA", "Логистическая регрессия"],
    loc="lower left",
)
plt.show()

## Метод ближайших соседей

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fallback for running this cell on its own: referencing x, z, mu1, mu2 raises
# NameError when any of them is missing, in which case a fresh 200-point
# sample is drawn with the same mixture parameters as the first section.
try:
    _ = x, z, mu1, mu2
except NameError:
    mu1, sigma1 = np.array([-1, -.5]),  np.array([[1, -.5], [-.5, 2]])
    mu2, sigma2 = np.array([.5, 1]),     np.array([[2, .5],  [.5, .5]])
    z, x = sample_mixture(200, 0.5, mu1, sigma1, mu2, sigma2)

# Fit a k-nearest-neighbours classifier with k = 5 and uniform vote weights.
k = 5
knn = KNeighborsClassifier(n_neighbors=k, weights="uniform")
knn.fit(x, z)

# Plot the training points; the decision boundary is added below with the
# plot_decision_boundary helper.
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)
plot_points(ax, x, z, mu1, mu2)

# Take the boundary colour from the shared palette; fall back to green when
# the lookup fails (for example, 'palette' is not defined).
try:
    color = palette[3]
except Exception:
    color = "green"

# The helper returns legend handles; h[0] is used as the legend entry.
h = plot_decision_boundary(ax, knn, color, f"{k}-NN")
plt.legend([h[0]], [f"{k}-NN"], loc="lower left")
plt.title(f"Метод ближайших соседей, k-NN (k={k})")
plt.show()

In [None]:
# --- Interactive k control for k-NN ---
from ipywidgets import interact, IntSlider
from sklearn.neighbors import KNeighborsClassifier

def knn_widget(X, y, title=None, k_min=1, k_max=31, step=2, default=5):
    """Show a slider-driven plot of the k-NN decision boundary.

    Parameters
    ----------
    X : array-like, shape (n, 2)
        Two-dimensional feature matrix.
    y : array-like, shape (n,)
        Binary labels; after casting to int they must lie in {0, 1}.
    title : str or None
        Fixed figure title; when None a title containing the current k is used.
    k_min, k_max, step, default : int
        Range, increment and initial value of the k slider.

    Returns
    -------
    Whatever ``ipywidgets.interact`` returns for the plotting callback.
    """
    X = np.asarray(X)
    y = np.asarray(y).astype(int)
    assert X.ndim == 2 and X.shape[1] == 2, "X must be 2D with shape (n, 2)"
    assert set(np.unique(y)) <= {0, 1}, "y must be binary {0,1}"

    # Fixed plotting window: data bounding box enlarged by a constant margin,
    # computed once so the view does not change when k is changed.
    margin = 0.75
    lower = X.min(axis=0) - margin
    upper = X.max(axis=0) + margin

    def _plot(k):
        # Refit from scratch for every slider value.
        model = KNeighborsClassifier(n_neighbors=k, weights="uniform")
        model.fit(X, y)

        fig = plt.figure(figsize=(10,6))
        ax = fig.add_subplot(111)
        for cls, shade, name in ((1, 'b', 'class 1'), (0, 'r', 'class 0')):
            member = (y == cls)
            ax.scatter(X[member, 0], X[member, 1], marker='.', alpha=0.6,
                       color=shade, label=name)
        ax.set_xlim((lower[0], upper[0]))
        ax.set_ylim((lower[1], upper[1]))

        # Boundary colour from the shared palette, black if the lookup fails.
        try:
            color = palette[5]
        except Exception:
            color = "black"

        h = plot_decision_boundary(ax, model, color, f"{k}-NN")
        ax.legend([h[0]], [f"{k}-NN"], loc="lower left")
        if title is not None:
            ax.set_title(title)
        else:
            ax.set_title(f"k-NN decision boundary (k={k})")
        plt.show()

    slider = IntSlider(min=k_min, max=k_max, step=step, value=default, description="k")
    return interact(_plot, k=slider)

# Interactive k-NN boundary on the two-Gaussian sample (x, z).
knn_widget(x, z, title="Смеси двух гауссианов")


In [None]:
# --- More complex Gaussian mixture: multiple clusters per class ---
from sklearn.neighbors import KNeighborsClassifier

def sample_multicluster(N_per_cluster=100):
    """Sample a four-cluster data set with two clusters per class.

    Class 0 ("red") has clusters centred at (-2, 0) and (2, 2); class 1
    ("blue") has clusters centred at (-2, 2) and (2, -1). Every cluster is an
    isotropic Gaussian with variance 0.5 and ``N_per_cluster`` points.

    Returns
    -------
    X : ndarray, shape (4 * N_per_cluster, 2)
        Points, stacked cluster by cluster (both red clusters first).
    y : ndarray of int, shape (4 * N_per_cluster,)
        Class labels aligned with the rows of ``X``.
    mus_red, mus_blue : list of ndarray
        Cluster centres of class 0 and class 1.
    """
    mus_red = [np.array([-2, 0]), np.array([2, 2])]
    mus_blue = [np.array([-2, 2]), np.array([2, -1])]
    sigma = np.diag([0.5, 0.5])

    # Red clusters are sampled before blue ones; this fixes both the order in
    # which the global RNG is consumed and the row layout of X / y.
    labelled_centres = [(m, 0) for m in mus_red] + [(m, 1) for m in mus_blue]
    X = np.vstack([
        np.random.multivariate_normal(m, sigma, N_per_cluster)
        for m, _ in labelled_centres
    ])
    y = np.concatenate([
        np.full(N_per_cluster, lab, dtype=int) for _, lab in labelled_centres
    ])
    return X, y, mus_red, mus_blue

# Draw the four-cluster data set: 100 points per cluster.
x_multi, z_multi, mus_red, mus_blue = sample_multicluster(100)

# Fit k-NN with k = 7 and uniform vote weights on the whole sample.
k = 7
knn_multi = KNeighborsClassifier(n_neighbors=k, weights='uniform').fit(x_multi, z_multi)

# Plot cluster centres as stars and sample points as translucent dots
# (class 1 in blue, class 0 in red).
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)
for m in mus_blue:
    ax.scatter([m[0]], [m[1]], marker='*', s=150, color='b')
for m in mus_red:
    ax.scatter([m[0]], [m[1]], marker='*', s=150, color='r')
ax.scatter(x_multi[z_multi==1,0], x_multi[z_multi==1,1], color='b', marker='.', alpha=0.5)
ax.scatter(x_multi[z_multi==0,0], x_multi[z_multi==0,1], color='r', marker='.', alpha=0.5)

# Boundary colour from the shared palette, green if the lookup fails.
try:
    color = palette[5]
except Exception:
    color = 'green'

# The helper returns legend handles; h[0] is used as the legend entry.
h = plot_decision_boundary(ax, knn_multi, color, f"{k}-NN")
plt.legend([h[0]], [f"{k}-NN"], loc='lower left')
plt.title(f"Метод ближайших соседей, k-NN (k={k})")
plt.show()

In [None]:
# Fit LDA, logistic regression and k-NN (k = 7) on the same multi-cluster
# sample so their decision boundaries can be compared in one figure.
lda2 = LinearDiscriminantAnalysis(store_covariance=True).fit(x_multi, z_multi)
logreg2 = linear_model.LogisticRegression(class_weight='balanced', max_iter=1000).fit(x_multi, z_multi)
k = 7
knn2 = KNeighborsClassifier(n_neighbors=k, weights="uniform").fit(x_multi, z_multi)

# Plot cluster centres as stars and sample points as translucent dots
# (class 1 in blue, class 0 in red).
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)
for m in mus_blue:
    ax.scatter([m[0]], [m[1]], marker='*', s=150, color='b')
for m in mus_red:
    ax.scatter([m[0]], [m[1]], marker='*', s=150, color='r')
ax.scatter(x_multi[z_multi==1,0], x_multi[z_multi==1,1], color='b', marker='.', alpha=0.5)
ax.scatter(x_multi[z_multi==0,0], x_multi[z_multi==0,1], color='r', marker='.', alpha=0.5)

# One colour per model from the shared palette; if any lookup fails, all
# three fall back to fixed named colours.
try:
    c_lda = palette[0]
    c_log = palette[2]
    c_knn = palette[5]
except Exception:
    c_lda, c_log, c_knn = "tab:orange", "tab:purple", "black"

# 0.5-probability boundary of each model; each call returns legend handles.
h_lda = plot_decision_boundary(ax, lda2, c_lda, "LDA")
h_log = plot_decision_boundary(ax, logreg2, c_log, "Logistic")
h_knn = plot_decision_boundary(ax, knn2, c_knn, f"{k}-NN")

plt.legend([h_lda[0], h_log[0], h_knn[0]], ["LDA (linear)", "Logistic (linear)", f"{k}-NN (nonlinear)"], loc="lower left")
plt.title(f"Линейные классификаторы и k-NN (k={k})")
plt.show()


In [None]:
# --- Random multi-cluster Gaussian mixture: linearly inseparable data ---
from sklearn.neighbors import KNeighborsClassifier

def sample_random_clusters(n_clusters=8, points_per_cluster=60, spread=0.4, seed=42):
    """Generate randomly placed isotropic Gaussian clusters with alternating labels.

    Parameters
    ----------
    n_clusters : int
        Number of clusters. Labels alternate 0, 1, 0, 1, ... so the classes
        get equally many clusters only when ``n_clusters`` is even.
    points_per_cluster : int
        Number of points drawn for every cluster.
    spread : float
        Variance of each coordinate (the covariance is ``spread * I``).
    seed : int or None
        Seed of the private random generator used for this call.

    Returns
    -------
    X : ndarray, shape (n_clusters * points_per_cluster, 2)
        Points, stacked cluster by cluster.
    y : ndarray of int
        Class label of every row of ``X``.
    centers : ndarray, shape (n_clusters, 2)
        Cluster centres, drawn uniformly from [-4, 4) in each coordinate.
    labels : ndarray of int, shape (n_clusters,)
        Class label of every cluster.
    """
    # A private RandomState yields exactly the samples that
    # np.random.seed(seed) followed by np.random.* calls would, but it does
    # not reseed the global generator used by the rest of the notebook.
    rng = np.random.RandomState(seed)
    centers = rng.uniform(-4, 4, size=(n_clusters, 2))
    labels = np.arange(n_clusters) % 2  # alternate red/blue clusters
    sigma = np.eye(2) * spread

    Xs, ys = [], []
    for c, lab in zip(centers, labels):
        Xs.append(rng.multivariate_normal(c, sigma, points_per_cluster))
        ys.append(np.full(points_per_cluster, lab, dtype=int))

    X = np.vstack(Xs)
    y = np.concatenate(ys)
    return X, y, centers, labels

# Generate ten random clusters (labels alternate between the two classes).
x_multi, z_multi, centers, center_labels = sample_random_clusters(
    n_clusters=10, points_per_cluster=60, spread=0.35
)

# Fit k-NN with k = 7 on the whole sample.
k = 7
knn_multi = KNeighborsClassifier(n_neighbors=k).fit(x_multi, z_multi)

# Plot points and centers
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)

# Lookup table label -> colour. It is deliberately NOT called `colors`: that
# name is the matplotlib.colors module imported earlier in the notebook, and
# rebinding it would break a re-run of the cells that use the module.
center_colors = np.array(['r', 'b'])
for c, lab in zip(centers, center_labels):
    ax.scatter([c[0]], [c[1]], marker='*', s=150, color=center_colors[lab])
ax.scatter(x_multi[z_multi==0,0], x_multi[z_multi==0,1], color='r', marker='.', alpha=0.4)
ax.scatter(x_multi[z_multi==1,0], x_multi[z_multi==1,1], color='b', marker='.', alpha=0.4)

# Boundary colour from the shared palette, green if the lookup fails.
try:
    color = palette[5]
except Exception:
    color = "green"

# The helper returns legend handles; h[0] is used as the legend entry.
h = plot_decision_boundary(ax, knn_multi, color, f"{k}-NN")
plt.legend([h[0]], [f"{k}-NN"], loc="lower left")
plt.title(f"Метод ближайших соседей, k-NN (k={k})")
plt.show()

In [None]:
# Regenerate the ten-cluster sample (positional arguments: n_clusters=10,
# points_per_cluster=60, spread=0.35) and explore it with the k slider.
x_multi, z_multi, centers, center_labels = sample_random_clusters(10, 60, 0.35)
knn_widget(x_multi, z_multi, title="Сложная смесь")

## Статистическая теория принятия решений

In [None]:
def orig(x):
    """Noise-free target function sin(2x)."""
    return np.sin(2*x)


def _power_features(values):
    """Return the matrix whose columns are values**1 ... values**num_points."""
    return np.vstack([values ** i for i in range(1, num_points+1)]).transpose()


# Fixed design points and one noisy observation of the target at each of them
# (Gaussian noise with standard deviation 0.25).
xd = np.array([-3, -2, -1, -0.5, 0, 0.5, 1, 1.5, 2.5, 3, 4]) / 2
num_points = len(xd)
data = orig(xd) + np.random.normal(0, .25, num_points)

# Dense grid for drawing curves; it extends 1.5 beyond the outermost design points.
xs = np.arange(xd[0]-1.5, xd[-1]+1.5, 0.01)

# Polynomial feature matrices (powers 1..num_points) for the grid and the design points.
xs_d = _power_features(xs)
xd_d = _power_features(xd)


def train_model(xs, ys, alpha=0, use_lasso=False):
    """Fit a linear model on (xs, ys) and return the fitted estimator.

    ``alpha == 0`` selects plain least squares (``LinearRegression``) whatever
    ``use_lasso`` is; otherwise ``use_lasso`` chooses between ``Lasso`` and
    ``Ridge`` with regularisation strength ``alpha``. All variants fit an
    intercept.
    """
    if alpha == 0:
        estimator = linear_model.LinearRegression(fit_intercept=True)
    elif use_lasso:
        estimator = linear_model.Lasso(alpha=alpha, fit_intercept=True)
    else:
        estimator = linear_model.Ridge(alpha=alpha, fit_intercept=True)
    return estimator.fit(xs, ys)

In [None]:
# Number of independently resampled training sets to fit.
N = 1500

# Regularisation settings passed to train_model: alpha == 0 gives plain least
# squares, otherwise Ridge (or Lasso when use_lasso is True). Edit to experiment.
alpha =  10.1
use_lasso = False

fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111)
ax.set_xlim((xs[0], xs[-1]))
ax.set_ylim((-3, 3))

# For every repetition: draw fresh noisy targets at the fixed design points,
# fit a model, and draw its prediction on the dense grid as a thin grey line.
res = []
for _ in range(N):
    cur_data = orig(xd) + np.random.normal(0, .25, num_points)
    cur_model = train_model(xd_d, cur_data, alpha, use_lasso)
    res.append(cur_model.predict( xs_d ))
    ax.plot(xs, res[-1], linewidth=.15, color="0.5")

# Noise-free target curve and its values at the design points.
ax.plot(xs, orig(xs), linewidth=2, label="Original function", color=palette[0])
ax.scatter(xd, orig(xd), marker='*', s=150, color=palette[0])

# Point-wise mean of all N fitted curves.
ax.plot(xs, np.mean( res, axis=0 ), linewidth=2, label="Averaged predictions", color="red")
ax.legend(loc="upper center", fontsize=legend_fontsize)
plt.show()

In [None]:
# Held-out test set: 50 inputs drawn uniformly from [-1.5, 2).
test_set_size = 50
test_set_x = np.random.rand(test_set_size) * (2 + 1.5) - 1.5
# Same polynomial features as the training matrix xd_d (powers 1..num_points).
# The upper bound is tied to num_points instead of a literal so that train and
# test feature counts cannot drift apart if the design points change.
test_set_xs = np.vstack([test_set_x ** i for i in range(1, num_points+1)]).transpose()
print(test_set_xs.shape)
# Noisy targets; column 0 of the feature matrix is the raw input x.
test_set_y = orig(test_set_xs[:,0]) + np.random.normal(0, .25, test_set_size)

# Target curve with the noisy test observations on top.
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111)
ax.plot(xs, orig(xs), linewidth=2, label="Исходная функция", color=palette[0])
ax.scatter(test_set_xs[:,0], test_set_y, marker='*', s=150, color=palette[0])
ax.set_xlim((-1.5, 2))

def test_set_error(model, d):
    """Mean squared error of ``model`` on the module-level test set.

    Only the first ``d`` columns of ``test_set_xs`` are passed to
    ``model.predict``; the result is compared with ``test_set_y``.
    """
    predictions = model.predict(test_set_xs[:, :d])
    residuals = test_set_y - predictions
    return np.mean(residuals ** 2)

In [None]:
# Number of resampled training sets per regularisation strength.
N = 5000
use_lasso=False
# 20 regularisation strengths, log-spaced between 1e-4 and 1e2.
alphas = np.logspace(-4, 2, num=20)
errors, biases, variances = [], [], []
for alpha in alphas:
    # res: test-set MSE of every fitted model;
    # res_test: that model's predictions at the test inputs.
    # (Predictions on the dense plotting grid were previously collected here
    # too but never read, so that per-iteration work has been dropped.)
    res, res_test = [], []
    for _ in range(N):
        cur_data = orig(xd) + np.random.normal(0, .25, num_points)
        cur_model = train_model(xd_d, cur_data, alpha, use_lasso)
        res.append(test_set_error(cur_model, xd_d.shape[1]))
        res_test.append(cur_model.predict(test_set_xs))
    res_test = np.array(res_test)
    # Average prediction over all resampled models at each test input.
    avg_preds = np.mean(res_test, axis=0)
    errors.append(np.mean(res))
    # Squared difference between the average prediction and the noise-free
    # target, averaged over test inputs (i.e. the squared bias).
    biases.append(np.mean((avg_preds-orig(test_set_x))**2))
    # Squared deviation of individual predictions from the average prediction,
    # averaged over models and test inputs.
    variances.append(np.mean((res_test-avg_preds)**2))
    print("alpha = %.6f\tmean error = %.6f\tbias = %.6f\tvariance = %.6f" % (alpha, errors[-1], biases[-1], variances[-1]))

In [None]:
# Bias, variance, their sum and the measured test error as functions of the
# regularisation strength (logarithmic x axis, linear y axis).
fig, ax = plt.subplots(figsize=figsize)
ax.set_xscale('log')
bias_plus_variance = np.array(biases) + np.array(variances)
ax.plot(alphas, biases, linewidth=2, label="Bias")
ax.plot(alphas, variances, linewidth=2, label="Variance")
ax.plot(alphas, bias_plus_variance, linewidth=2, label="Bias + Variance")
ax.plot(alphas, errors, linewidth=2, label="Test set error", color="black")
ax.set_ylim((0, 1))
ax.set_xlim((0.001, 100))
ax.legend(fontsize=legend_fontsize)