Created
February 8, 2026 01:37
Adaptive Gradient Clipping in JAX
import jax
import jax.numpy as jnp


@jax.jit
def l2_norm(tree):
    """Compute the global l2 norm of a pytree of arrays. Also useful for weight decay."""
    leaves, _ = jax.tree.flatten(tree)
    return jnp.sqrt(sum(jnp.vdot(x, x) for x in leaves))


@jax.jit
def adaptive_grad_clip(params, grads, clip_factor=0.01, eps=1e-3):
    """
    Adaptive gradient clipping (AGC), as proposed in "High-Performance
    Large-Scale Image Recognition Without Normalization":
    http://arxiv.org/abs/2102.06171

    The paper identifies AGC as a crucial component for training deep
    neural networks without batch normalization.
    """
    p_norm = l2_norm(params)
    # Clip when the gradient norm exceeds clip_factor times the parameter norm;
    # eps keeps the threshold away from zero for freshly initialized parameters.
    max_norm = jnp.maximum(p_norm, eps) * clip_factor
    g_norm = l2_norm(grads)
    normalize = lambda g: jnp.where(g_norm < max_norm, g, g * (max_norm / jnp.maximum(g_norm, 1e-6)))
    return jax.tree.map(normalize, grads)
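A quick sanity check of the clipping behavior. This sketch condenses the two functions above into one self-contained snippet; the parameter and gradient values are made up for illustration. When the global gradient norm exceeds `clip_factor` times the parameter norm, the gradients are rescaled so their norm equals that threshold; otherwise they pass through unchanged.

```python
import jax
import jax.numpy as jnp


def l2_norm(tree):
    # Global l2 norm over all leaves of a pytree.
    leaves, _ = jax.tree.flatten(tree)
    return jnp.sqrt(sum(jnp.vdot(x, x) for x in leaves))


@jax.jit
def adaptive_grad_clip(params, grads, clip_factor=0.01, eps=1e-3):
    max_norm = jnp.maximum(l2_norm(params), eps) * clip_factor
    g_norm = l2_norm(grads)
    # Scale factor is 1 below the threshold, max_norm / g_norm above it.
    scale = jnp.where(g_norm < max_norm, 1.0, max_norm / jnp.maximum(g_norm, 1e-6))
    return jax.tree.map(lambda g: g * scale, grads)


# Toy parameters and deliberately large gradients (illustrative values only).
params = {"w": jnp.ones((3, 3)), "b": jnp.zeros(3)}
grads = {"w": jnp.full((3, 3), 10.0), "b": jnp.full((3,), 10.0)}

clipped = adaptive_grad_clip(params, grads)
# Parameter norm is 3, so the threshold is 0.01 * 3 = 0.03; the gradient
# norm (~34.6) is well above it, so the clipped norm lands at the threshold.
print(float(l2_norm(clipped)))
```

In a training loop, `adaptive_grad_clip` would be applied to the output of `jax.grad` before the optimizer update, mirroring how AGC is inserted between backprop and the weight update in the paper.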