Support Vector Machine¶

Peter Ma | Dec 31st 2025¶

In Logistic Regression, we just wanted any line that separated the data. SVMs are greedy: they want the best line. By "best," we mean the line that creates the widest possible "street" (margin) between the two classes. We define our linear separator just like before:

$$f(x) = W^T X + \beta$$

Unlike Logistic Regression, where $y \in \{0, 1\}$, in SVMs the math becomes much easier if we use signed labels:

$$y \in \{-1, +1\}$$

We want our model to be confident. If $y_i = +1$, we want $W^T X_i + \beta \ge 1$. If $y_i = -1$, we want $W^T X_i + \beta \le -1$. We can combine these into one constraint:

$$y_i (W^T X_i + \beta) \ge 1$$

If this condition holds, the point is correctly classified and is "off the street" (outside the margin).
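As a quick sanity check, here is a minimal sketch of that combined constraint on toy data. The weights, bias, and points below are made up purely for illustration; they are not taken from anything later in this post:

import jax.numpy as jnp

# Hypothetical weights/bias and toy points, for illustration only
W = jnp.array([2.0, -1.0])
beta = 0.5
X_toy = jnp.array([[1.0, 0.0], [0.0, 1.0], [0.2, 0.1]])
y_toy = jnp.array([1.0, -1.0, 1.0])   # signed labels in {-1, +1}

# y_i (W^T x_i + beta) >= 1  <=>  correct AND outside the margin
margins = y_toy * (X_toy @ W + beta)
print(margins)        # [2.5  0.5  0.8]
print(margins >= 1)   # [ True False False]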

The Margin¶

Let us define what this margin is...

We define three parallel hyperplanes (lines in 2D):

Positive Hyperplane: $W^T x + \beta = 1$ (this touches the nearest positive points).

Negative Hyperplane: $W^T x + \beta = -1$ (this touches the nearest negative points).

Decision Boundary: $W^T x + \beta = 0$ (the centerline).

The Margin is the perpendicular distance between the Positive and Negative hyperplanes.

We can compute this margin directly. Pick two points that lie exactly on the edges of the street: let $x_+$ be a point on the Positive Hyperplane, and let $x_-$ be the point on the Negative Hyperplane that is directly "underneath" $x_+$ (perpendicular to the boundary). Since $x_-$ is directly across from $x_+$, the vector connecting them, $(x_+ - x_-)$, is perfectly parallel to the normal vector $W$. Therefore, the distance (margin width) is just the projection of $(x_+ - x_-)$ onto the unit normal vector:

$$\text{Width} = (x_+ - x_-) \cdot \frac{W}{||W||}$$

(Note: $\frac{W}{||W||}$ is the unit direction vector.)

Now for the algebra. Expanding the dot product:

$$\text{Width} = \frac{W^T(x_+ - x_-)}{||W||} = \frac{W^T x_+ - W^T x_-}{||W||}$$

Next, use the equations of the hyperplanes defined above to substitute values for $W^T x$: from $W^T x_+ + \beta = 1$ we get $W^T x_+ = 1 - \beta$, and from $W^T x_- + \beta = -1$ we get $W^T x_- = -1 - \beta$. Substituting these back into the width equation, the $\beta$'s cancel out:

$$\text{Width} = \frac{(1 - \beta) - (-1 - \beta)}{||W||} = \frac{2}{||W||}$$
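As a quick numerical check with made-up numbers (not tied to any dataset in this post): take $W = [3, 4]^T$ and $\beta = 0$, so $||W|| = 5$. The point $x_+ = (0.12, 0.16)$ sits on the Positive Hyperplane ($3 \cdot 0.12 + 4 \cdot 0.16 = 1$), the point $x_- = (-0.12, -0.16)$ sits on the Negative Hyperplane, and the distance between them is

$$||x_+ - x_-|| = ||(0.24,\ 0.32)|| = 0.4 = \frac{2}{||W||}$$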

Optimization¶

We want to do two things simultaneously:

Maximize the Margin: geometrically, the width of the margin is $\frac{2}{||W||}$, so maximizing the width is the same as minimizing $||W||$. In practice we minimize $\frac{1}{2}||W||^2$ because it is smooth and convex, which makes the gradients nicer.

Minimize Errors: if a point violates the margin (it's inside the street or on the wrong side), we pay a penalty. This leads us to the Hinge Loss:

$$\mathcal{L}_{hinge} = \max(0, 1 - y_i(W^T X_i + \beta))$$

Our total loss function (the Primal Objective) is a sum of the Regularizer (maximize margin) and the Hinge Loss (minimize error):

$$\mathcal{L} = \underbrace{\frac{1}{2} ||W||^2}_{\text{Maximize Margin}} + C \underbrace{\sum_{i=1}^N \max(0, 1 - y_i(W^T X_i + \beta))}_{\text{Minimize Violations}}$$

($C$ is a hyperparameter. High $C$ approaches a "Hard Margin" (intolerant of errors); low $C$ gives a "Soft Margin" (wider street, accepts some mistakes).)
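Here is a minimal sketch of the hinge loss on a toy batch, just to make the formula concrete. The weights and points are made up for illustration; the real training code in the next cell averages the hinge term instead of summing it, which is a common choice equivalent up to rescaling $C$:

import jax.numpy as jnp

# Hypothetical toy batch: 3 points, 2 features, signed labels
X_toy = jnp.array([[2.0, 1.0], [-1.5, 0.5], [0.1, -0.2]])
y_toy = jnp.array([[1.0], [-1.0], [1.0]])
W_toy = jnp.array([[1.0], [0.5]])
beta_toy = 0.0
C = 1.0

margins = y_toy * (X_toy @ W_toy + beta_toy)    # y_i (W^T x_i + beta)
hinge = jnp.maximum(0.0, 1.0 - margins)         # zero for safely-correct points
loss = 0.5 * jnp.sum(W_toy ** 2) + C * jnp.sum(hinge)
print(hinge.ravel())   # [0. 0. 1.]
print(loss)            # 0.625 + 1.0 = 1.625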

Gradients¶

The Hinge Loss has a sharp "kink" at 1, so strictly speaking it's not differentiable everywhere. But we don't care; we use the sub-gradient.

If $1 - y_i(W^T X_i + \beta) \le 0$ (correct and safe):

$$\nabla_W \mathcal{L} = W$$

(just decay the weights slightly).

If $1 - y_i(W^T X_i + \beta) > 0$ (wrong or unsafe):

$$\nabla_W \mathcal{L} = W - C y_i X_i$$

(decay the weights and push the boundary away from the data point).

In [2]:
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# 1. Load Data
X, y = make_blobs(n_samples=300, centers=2, random_state=6, cluster_std=1.2)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Map labels to {-1, 1}
y = jnp.array(y)
y = jnp.where(y == 0, -1, 1)[:, None]
X = jnp.array(X)

# 2. Bias Trick
ones = jnp.ones((X.shape[0], 1))
X_aug = jnp.hstack((ones, X))

# 3. Initialization
key = jax.random.PRNGKey(42)
W = jax.random.normal(key, (X_aug.shape[1], 1))

# Loss function (Used only for logging purposes now)
def loss_fn(W, X, y, C):
    # Don't regularize the bias (index 0)
    reg_loss = 0.5 * jnp.sum(W[1:] ** 2)
    # Hinge: mean(max(0, 1 - y(XW)))
    distances = 1 - y * (X @ W)
    hinge_loss = jnp.mean(jnp.maximum(0, distances))
    return reg_loss + C * hinge_loss

# MANUAL GRADIENT UPDATE
@jax.jit
def update_step(W, X, y, learning_rate, C):
    N = X.shape[0]
    
    # 1. Identify Violations (The Mask)
    # distance = 1 - y_i(w^T x_i)
    # If distance > 0, the point is inside the margin -> It contributes to gradient
    scores = X @ W
    margins = 1 - (y * scores)
    
    # mask is 1.0 where we have violations, 0.0 elsewhere
    mask = (margins > 0).astype(jnp.float32)
    
    # 2. Compute Hinge Gradient
    # For a single point: grad = -y * x
    # Vectorized sum: -X.T @ (mask * y)
    # We divide by N because the loss used jnp.mean()
    grad_hinge = -(X.T @ (mask * y)) / N
    
    # 3. Compute Regularization Gradient
    # Derivative of 0.5*||w||^2 is just w.
    # BUT we must set bias gradient to 0 (we don't regularize intercept)
    grad_reg = W
    grad_reg = grad_reg.at[0].set(0.0)
    
    # 4. Total Gradient
    total_grad = grad_reg + C * grad_hinge
    
    return W - learning_rate * total_grad

# Training Loop
learning_rate = 0.01
epochs = 1000
C_param = 1.0
loss_history = []

for i in range(epochs):
    W = update_step(W, X_aug, y, learning_rate, C_param)
    if i % 100 == 0:
        loss_history.append(loss_fn(W, X_aug, y, C_param))

print(f"Final Weights: {W.flatten()}")

# 5. Plotting
plt.figure(figsize=(12, 5))

# Plot Decision Boundary
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='coolwarm', s=50, edgecolors='k')

# Meshgrid for contours
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = jnp.linspace(xlim[0], xlim[1], 30)
yy = jnp.linspace(ylim[0], ylim[1], 30)
YY, XX = jnp.meshgrid(yy, xx)
xy = jnp.vstack([XX.ravel(), YY.ravel()]).T
xy_aug = jnp.hstack((jnp.ones((xy.shape[0], 1)), xy))
Z = (xy_aug @ W).reshape(XX.shape)

ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
plt.title(f"SVM Manual Gradients (C={C_param})")

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(loss_history)
plt.title("Loss History")
plt.xlabel("Hundreds of Iterations")
plt.grid(True)

plt.show()
Final Weights: [ 0.01390198 -0.24345854 -0.6626574 ]

Kernel Connection¶

The Kernel Trick is a brilliant mathematical "hack" in machine learning. It allows a linear classifier (like the SVM we just built) to solve non-linear problems (like separating a red circle inside a blue ring) essentially for free. From a theory-of-deep-learning perspective, it also offers nice explanations of how more complex models work. So let's understand it.

The Problem¶

Linearity is rigid. The SVM code we just wrote finds a straight line (or hyperplane) $W^T X + \beta = 0$. If your data looks like a "bullseye" (concentric circles), no straight line can ever separate the classes; you would get roughly 50% accuracy.

The Solution¶

Feature Mapping ($\phi$): if we can't separate the data in 2D, we can project it into a higher dimension where it is linearly separable. Imagine 2D data points $(x_1, x_2)$. Let's map them to 3D using a function $\phi(x)$:

$$\phi([x_1, x_2]) = [x_1, x_2, x_1^2 + x_2^2]$$

We added a "height" axis based on the squared distance from the origin. Points near the center (small radius) stay low; points in the outer ring (large radius) float up high. Now you can slide a flat sheet (a linear plane) between the low points and the high points. When you project that flat sheet back down to 2D, it looks like a circle.
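Here is a minimal sketch of that lifting on make_circles data. The dataset parameters and the height threshold of 0.5 are made up for illustration; they are not part of the SVM demo later in this post:

import jax.numpy as jnp
from sklearn.datasets import make_circles

X, y = make_circles(n_samples=300, factor=0.3, noise=0.05, random_state=0)
X = jnp.array(X)

# Lift to 3D: phi([x1, x2]) = [x1, x2, x1^2 + x2^2]
height = jnp.sum(X ** 2, axis=1)
X_lifted = jnp.column_stack([X, height])

# In the lifted space, a flat plane (a simple threshold on the height axis)
# already separates the inner circle (label 1) from the outer ring (label 0).
pred = (height < 0.5).astype(int)
acc = jnp.mean((pred == y).astype(jnp.float32))
print(f"Accuracy of a single height threshold: {float(acc):.2f}")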

The "Trick"¶

However, mapping data to higher dimensions can be computationally expensive. Mapping 2D to 3D is fine; mapping 100 features to 100,000 features is slow; mapping to infinite dimensions is impossible... This is where the Kernel Trick comes in. If you derive the Dual Form of the SVM optimization problem (using Lagrange Multipliers), there is a cool property: the optimization never uses the raw data points $X$. It only ever uses the dot products between pairs of points:

$$x_i \cdot x_j$$

So, if we want to work in the high-dimensional space $\phi(x)$, we only need to know:

$$\phi(x_i) \cdot \phi(x_j)$$
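A classic concrete example of this idea (not used in the demo below): for 2D inputs, the degree-2 polynomial kernel $K(x, z) = (x \cdot z)^2$ equals the dot product of the explicit 3D features $\phi(x) = [x_1^2,\ \sqrt{2}\,x_1 x_2,\ x_2^2]$, so we can verify the identity numerically:

import jax.numpy as jnp

def phi(x):
    # Explicit feature map for the degree-2 polynomial kernel (2D -> 3D)
    return jnp.array([x[0] ** 2, jnp.sqrt(2.0) * x[0] * x[1], x[1] ** 2])

x = jnp.array([1.0, 2.0])
z = jnp.array([3.0, -1.0])

lhs = jnp.dot(x, z) ** 2         # kernel computed in the original 2D space
rhs = jnp.dot(phi(x), phi(z))    # same value via the explicit feature map
print(lhs, rhs)                  # both print (approximately) 1.0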

Kernel Function¶

We define a Kernel Function $K(x_i, x_j)$ that computes this high-dimensional dot product directly from the low-dimensional data, without ever actually calculating the coordinates of $\phi(x)$:

$$K(x_i, x_j) = \phi(x_i) \cdot \phi(x_j)$$

Example¶

The infinite-dimensional case: the most popular kernel is the Radial Basis Function (RBF):

$$K(x_i, x_j) = e^{-\gamma ||x_i - x_j||^2}$$

This function returns a scalar between 0 and 1: it equals 1 when the points are identical and decays toward 0 as they move far apart.

Mathematically, the RBF kernel corresponds to a feature map $\phi(x)$ into an infinite-dimensional space! If you tried to actually compute the coordinates of $\phi(x)$, you would need infinite memory and infinite time. But computing $e^{-\gamma ||x_i - x_j||^2}$ is extremely fast. So the Kernel Trick allows us to fit a linear hyperplane in an infinite-dimensional universe, effectively giving us a decision boundary of unbounded complexity in our original space, all at the computational cost of a simple dot product.
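To see where the infinite dimensions come from (a sketch of the standard argument, not something the demo below needs), split the RBF kernel and Taylor-expand the cross term:

$$e^{-\gamma ||x_i - x_j||^2} = e^{-\gamma ||x_i||^2}\, e^{-\gamma ||x_j||^2}\, e^{2\gamma\, x_i \cdot x_j} = e^{-\gamma ||x_i||^2}\, e^{-\gamma ||x_j||^2} \sum_{k=0}^{\infty} \frac{(2\gamma)^k}{k!} (x_i \cdot x_j)^k$$

Each term $(x_i \cdot x_j)^k$ is a polynomial kernel of degree $k$ (like the degree-2 example above), so the RBF kernel implicitly uses polynomial features of every degree at once.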

What's the catch?¶

The catch is the "Curse of N".

Firstly, to solve the Dual Form you need to compute the kernel function for every pair of data points. This creates the Gram Matrix (or Kernel Matrix), which is an $N \times N$ grid. If $N = 1{,}000$, you need a $1{,}000 \times 1{,}000$ matrix (a million entries). Easy. If $N = 1{,}000{,}000$, you need a matrix with $10^{12}$ entries; assuming float32 (4 bytes each), that is 4 Terabytes of RAM. You simply cannot store the kernel matrix for large datasets. You have to compute it on the fly, which makes training incredibly slow.
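A quick back-of-the-envelope check of that memory claim (plain arithmetic, no SVM code involved):

# Memory needed to store an N x N float32 Gram matrix
for N in [1_000, 100_000, 1_000_000]:
    gb = N * N * 4 / 1e9      # 4 bytes per float32 entry
    print(f"N = {N:>9,}: {gb:,.3f} GB")
# N = 1,000,000 -> 4,000.000 GB, i.e. 4 TB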

Secondly, inference speed. In a Kernel SVM there is no explicit $W$; the "weight" is just a linear combination of the Support Vectors. To make a prediction, you have to evaluate the kernel between your new point and every single Support Vector you found during training.
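Concretely, the prediction for a new point $x$ takes the form (this is the standard dual/representer form; the demo below implements it with a dense $\beta$ over all training points rather than only the support vectors):

$$f(x) = \sum_{i \in \text{SV}} \beta_i\, K(x_i, x) + b$$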

Thirdly, it overfits too easily: because you are working in infinite dimensions (with the RBF kernel), you can technically separate almost any dataset perfectly, noise included.

Demo¶

In [5]:
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.preprocessing import StandardScaler

# 1. Load Data (Non-linear dataset this time!)
#    We use make_circles, which is impossible for a linear SVM.
X, y = make_circles(n_samples=300, factor=0.3, noise=0.1, random_state=42)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Map labels to {-1, 1}
y = jnp.array(y)
y = jnp.where(y == 0, -1, 1)[:, None]
X = jnp.array(X)

# --- THE KERNEL TRICK SETUP ---
# We define the RBF Kernel: exp(-gamma * ||x - y||^2)
def rbf_kernel(x1, x2, gamma=1.0):
    return jnp.exp(-gamma * jnp.sum((x1 - x2) ** 2))

# Use vmap to compute the pairwise kernel matrix (N x N)
# This creates the "Gram Matrix" where K[i, j] is similarity between data[i] and data[j]
gamma_param = 0.5
K_matrix = jax.vmap(lambda x1: jax.vmap(lambda x2: rbf_kernel(x1, x2, gamma_param))(X))(X)

# 2. Initialization
#    INSTEAD OF W (features x 1), WE LEARN BETA (samples x 1)
#    According to Representer Theorem: w = sum(beta_i * phi(x_i))
key = jax.random.PRNGKey(42)
Beta = jax.random.normal(key, (X.shape[0], 1)) * 0.1
Bias = 0.0  # Keep bias separate for Kernel methods

# Loss function
# Note: The input here is the Kernel Matrix 'K', not the features 'X'
def loss_fn(Beta, Bias, K, y, C):
    # Model prediction: f(x) = K * Beta + Bias
    logits = (K @ Beta) + Bias
    
    # Hinge Loss: mean(max(0, 1 - y * logits))
    hinge_loss = jnp.mean(jnp.maximum(0, 1 - y * logits))
    
    # Regularization: 0.5 * Beta.T @ K @ Beta
    # (This is the kernelized equivalent of 0.5 * ||w||^2)
    reg_loss = 0.5 * (Beta.T @ (K @ Beta))[0, 0]
    
    return reg_loss + C * hinge_loss

# MANUAL GRADIENT UPDATE
@jax.jit
def update_step(Beta, Bias, K, y, learning_rate, C):
    N = K.shape[0]
    
    # 1. Forward Pass
    logits = (K @ Beta) + Bias
    
    # 2. Identify Violations (The Mask)
    margins = 1 - (y * logits)
    mask = (margins > 0).astype(jnp.float32)
    
    # 3. Compute Gradients
    # Gradient of Hinge w.r.t Beta
    # Chain rule: dLoss/dBeta = dLoss/dLogits * dLogits/dBeta
    # dLogits/dBeta = K
    # dLoss/dLogits = -y * mask
    grad_hinge_beta = -(K.T @ (mask * y)) / N
    
    # Gradient of Hinge w.r.t Bias
    grad_hinge_bias = -jnp.sum(mask * y) / N
    
    # Gradient of Regularization w.r.t Beta
    # d(0.5 * B.T @ K @ B)/dB = K @ B
    grad_reg_beta = K @ Beta
    
    # 4. Total Gradients
    total_grad_beta = grad_reg_beta + C * grad_hinge_beta
    total_grad_bias = C * grad_hinge_bias # No regularization on bias
    
    # 5. Update
    new_Beta = Beta - learning_rate * total_grad_beta
    new_Bias = Bias - learning_rate * total_grad_bias
    
    return new_Beta, new_Bias

# Training Loop

# Lower the learning rate to prevent "jumping" too fast (was 0.01)
learning_rate = 0.001

# Increase C so the model cares more about accuracy than complexity (was 10.0)
C_param = 100.0

epochs = 2000
loss_history = []

for i in range(epochs):
    Beta, Bias = update_step(Beta, Bias, K_matrix, y, learning_rate, C_param)
    if i % 100 == 0:
        loss_history.append(loss_fn(Beta, Bias, K_matrix, y, C_param))

print(f"Final Bias: {Bias}")

# --- PLOTTING (Tricky part: we need to compute Kernel distance for meshgrid points) ---
plt.figure(figsize=(12, 5))

# Plot Data
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='coolwarm', s=50, edgecolors='k')

# Generate Meshgrid
ax = plt.gca()
xx = jnp.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 30)
yy = jnp.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 30)
YY, XX = jnp.meshgrid(yy, xx)
grid_points = jnp.vstack([XX.ravel(), YY.ravel()]).T

# COMPUTE KERNEL BETWEEN GRID POINTS AND TRAINING POINTS
# We need K(X_train, X_grid)
# shape: (N_train, N_grid)
K_grid = jax.vmap(lambda x_train: jax.vmap(lambda x_g: rbf_kernel(x_train, x_g, gamma_param))(grid_points))(X)

# Prediction = (Beta.T @ K_grid) + Bias
# shape: (1, N_grid)
Z = (Beta.T @ K_grid) + Bias
Z = Z.reshape(XX.shape)

ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
plt.title(f"Kernel SVM (RBF) | C={C_param}")

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(loss_history)
plt.title("Loss History")
plt.xlabel("Hundreds of Iterations")
plt.grid(True)

plt.show()
Final Bias: -1.1039990186691284