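## Parse HuggingFace-style model `config.json` files into a Nim object
## variant, dispatching on `architectures[0]`. Demonstrated here with the
## Glm4Moe, Glm4MoeLite and NemotronH configs via jsonx's streaming JsonParser.
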
import std/tables
import jsonx
import jsonx/parsejson

type
  ArchitectureKind = enum
    akGlm4Moe
    akGlm4MoeLite
    akNemotronH

  BaseConfig = ref object of RootObj
    architectures: seq[string]
    modelType: string
    vocabSize: int

  Glm4MoeConfig = ref object of BaseConfig
    headDim: int
    useQkNorm: bool
    numHiddenLayers: int

  Glm4MoeLiteConfig = ref object of BaseConfig
    topkMethod: string
    qLoraRank: int
    numHiddenLayers: int

  NemotronHConfig = ref object of BaseConfig
    autoMap: Table[string, string]
    chunkSize: int
    mambaNumHeads: int

  ModelConfig = object
    case kind: ArchitectureKind
    of akGlm4Moe:
      glm4Moe: Glm4MoeConfig
    of akGlm4MoeLite:
      glm4MoeLite: Glm4MoeLiteConfig
    of akNemotronH:
      nemotronH: NemotronHConfig
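
# Note: a Nim variant object's discriminator (`kind` above) is fixed at
# construction time, so `readJson` below has to buffer fields in locals and
# build the ModelConfig in one shot once the architecture is known.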

proc parseArchitecture(name: string; kind: var ArchitectureKind): bool =
  case name
  of "Glm4MoeForCausalLM":
    kind = akGlm4Moe
    result = true
  of "Glm4MoeLiteForCausalLM":
    kind = akGlm4MoeLite
    result = true
  of "NemotronHForCausalLM":
    kind = akNemotronH
    result = true
  else:
    result = false

proc readJson(dst: var ModelConfig; p: var JsonParser) =
  var architectures: seq[string]
  var modelType = ""
  var vocabSize = 0
  var headDim = 0
  var useQkNorm = false
  var numHiddenLayers = 0
  var topkMethod = ""
  var qLoraRank = 0
  var autoMap = initTable[string, string]()
  var chunkSize = 0
  var mambaNumHeads = 0
  eat(p, tkCurlyLe)
  while p.tok != tkCurlyRi:
    if p.tok != tkString:
      raiseParseErr(p, "string literal as key")
    case p.a
    of "architectures":
      discard getTok(p)
      eat(p, tkColon)
      readJson(architectures, p)
    of "model_type":
      discard getTok(p)
      eat(p, tkColon)
      readJson(modelType, p)
    of "vocab_size":
      discard getTok(p)
      eat(p, tkColon)
      readJson(vocabSize, p)
    of "head_dim":
      discard getTok(p)
      eat(p, tkColon)
      readJson(headDim, p)
    of "use_qk_norm":
      discard getTok(p)
      eat(p, tkColon)
      readJson(useQkNorm, p)
    of "num_hidden_layers":
      discard getTok(p)
      eat(p, tkColon)
      readJson(numHiddenLayers, p)
    of "topk_method":
      discard getTok(p)
      eat(p, tkColon)
      readJson(topkMethod, p)
    of "q_lora_rank":
      discard getTok(p)
      eat(p, tkColon)
      readJson(qLoraRank, p)
    of "auto_map":
      discard getTok(p)
      eat(p, tkColon)
      readJson(autoMap, p)
    of "chunk_size":
      discard getTok(p)
      eat(p, tkColon)
      readJson(chunkSize, p)
    of "mamba_num_heads":
      discard getTok(p)
      eat(p, tkColon)
      readJson(mambaNumHeads, p)
    else:
      # Unknown keys are skipped, so unrelated config entries don't fail the parse.
      discard getTok(p)
      eat(p, tkColon)
      skipJson(p)
    expectObjectSeparator(p)
  eat(p, tkCurlyRi)
  if architectures.len == 0:
    raiseParseErr(p, "non-empty architectures")
  var kind: ArchitectureKind
  if not parseArchitecture(architectures[0], kind):
    raiseParseErr(p, "supported architectures[0] discriminator")
  case kind
  of akGlm4Moe:
    dst = ModelConfig(
      kind: akGlm4Moe,
      glm4Moe: Glm4MoeConfig(
        architectures: architectures,
        modelType: modelType,
        vocabSize: vocabSize,
        headDim: headDim,
        useQkNorm: useQkNorm,
        numHiddenLayers: numHiddenLayers
      )
    )
  of akGlm4MoeLite:
    dst = ModelConfig(
      kind: akGlm4MoeLite,
      glm4MoeLite: Glm4MoeLiteConfig(
        architectures: architectures,
        modelType: modelType,
        vocabSize: vocabSize,
        topkMethod: topkMethod,
        qLoraRank: qLoraRank,
        numHiddenLayers: numHiddenLayers
      )
    )
  of akNemotronH:
    dst = ModelConfig(
      kind: akNemotronH,
      nemotronH: NemotronHConfig(
        architectures: architectures,
        modelType: modelType,
        vocabSize: vocabSize,
        autoMap: autoMap,
        chunkSize: chunkSize,
        mambaNumHeads: mambaNumHeads
      )
    )

const
  glm4MoeJson = """
{
  "architectures": [
    "Glm4MoeForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "pad_token_id": 151329,
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "partial_rotary_factor": 0.5,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "max_position_embeddings": 202752,
  "model_type": "glm4_moe",
  "moe_intermediate_size": 1536,
  "norm_topk_prob": true,
  "num_attention_heads": 96,
  "n_group": 1,
  "topk_group": 1,
  "n_routed_experts": 160,
  "n_shared_experts": 1,
  "routed_scaling_factor": 2.5,
  "num_experts_per_tok": 8,
  "first_k_dense_replace": 3,
  "num_hidden_layers": 92,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "num_nextn_predict_layers": 1,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.0",
  "use_cache": true,
  "use_qk_norm": true,
  "vocab_size": 151552
}
"""
  glm4MoeLiteJson = """
{
  "architectures": [
    "Glm4MoeLiteForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "pad_token_id": 154820,
  "eos_token_id": [
    154820,
    154827,
    154829
  ],
  "hidden_act": "silu",
  "hidden_size": 2048,
  "intermediate_size": 10240,
  "max_position_embeddings": 202752,
  "model_type": "glm4_moe_lite",
  "moe_intermediate_size": 1536,
  "topk_method": "noaux_tc",
  "norm_topk_prob": true,
  "num_attention_heads": 20,
  "n_group": 1,
  "topk_group": 1,
  "n_routed_experts": 64,
  "n_shared_experts": 1,
  "routed_scaling_factor": 1.8,
  "num_experts_per_tok": 4,
  "first_k_dense_replace": 1,
  "num_hidden_layers": 47,
  "num_key_value_heads": 20,
  "num_nextn_predict_layers": 1,
  "partial_rotary_factor": 1.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "tie_word_embeddings": false,
  "dtype": "bfloat16",
  "transformers_version": "5.0.0rc0",
  "q_lora_rank": 768,
  "kv_lora_rank": 512,
  "qk_nope_head_dim": 192,
  "qk_rope_head_dim": 64,
  "v_head_dim": 256,
  "vocab_size": 154880
}
"""
  nemotronHJson = """
{
  "architectures": [
    "NemotronHForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_nemotron_h.NemotronHConfig",
    "AutoModel": "modeling_nemotron_h.NemotronHForCausalLM",
    "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM"
  },
  "bos_token_id": 1,
  "chunk_size": 128,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "head_dim": 128,
  "hidden_dropout": 0.0,
  "hidden_size": 2688,
  "hybrid_override_pattern": "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
  "initializer_range": 0.02,
  "intermediate_size": 1856,
  "layer_norm_epsilon": 1e-05,
  "mamba_head_dim": 64,
  "mamba_hidden_act": "silu",
  "mamba_num_heads": 64,
  "mamba_proj_bias": false,
  "mamba_ssm_cache_dtype": "float32",
  "max_position_embeddings": 262144,
  "mlp_bias": false,
  "mlp_hidden_act": "relu2",
  "model_type": "nemotron_h",
  "moe_intermediate_size": 1856,
  "moe_shared_expert_intermediate_size": 3712,
  "n_group": 1,
  "n_groups": 8,
  "n_routed_experts": 128,
  "n_shared_experts": 1,
  "norm_eps": 1e-05,
  "norm_topk_prob": true,
  "num_attention_heads": 32,
  "num_experts_per_tok": 6,
  "num_hidden_layers": 52,
  "num_key_value_heads": 2,
  "num_logits_to_keep": 1,
  "pad_token_id": 0,
  "partial_rotary_factor": 1.0,
  "rescale_prenorm_residual": true,
  "residual_in_fp32": false,
  "rope_theta": 10000,
  "routed_scaling_factor": 2.5,
  "sliding_window": null,
  "ssm_state_size": 128,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "topk_group": 1,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.2",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "use_mamba_kernels": true,
  "vocab_size": 131072
}
"""

when isMainModule:
  let glm4MoeCfg = fromJson(glm4MoeJson, ModelConfig)
  doAssert glm4MoeCfg.kind == akGlm4Moe
  doAssert glm4MoeCfg.glm4Moe.architectures[0] == "Glm4MoeForCausalLM"
  doAssert glm4MoeCfg.glm4Moe.modelType == "glm4_moe"
  doAssert glm4MoeCfg.glm4Moe.headDim == 128
  doAssert glm4MoeCfg.glm4Moe.useQkNorm
  doAssert glm4MoeCfg.glm4Moe.numHiddenLayers == 92
  doAssert glm4MoeCfg.glm4Moe.vocabSize == 151552

  let glm4MoeLiteCfg = fromJson(glm4MoeLiteJson, ModelConfig)
  doAssert glm4MoeLiteCfg.kind == akGlm4MoeLite
  doAssert glm4MoeLiteCfg.glm4MoeLite.architectures[0] == "Glm4MoeLiteForCausalLM"
  doAssert glm4MoeLiteCfg.glm4MoeLite.modelType == "glm4_moe_lite"
  doAssert glm4MoeLiteCfg.glm4MoeLite.topkMethod == "noaux_tc"
  doAssert glm4MoeLiteCfg.glm4MoeLite.qLoraRank == 768
  doAssert glm4MoeLiteCfg.glm4MoeLite.numHiddenLayers == 47
  doAssert glm4MoeLiteCfg.glm4MoeLite.vocabSize == 154880

  let nemotronHCfg = fromJson(nemotronHJson, ModelConfig)
  doAssert nemotronHCfg.kind == akNemotronH
  doAssert nemotronHCfg.nemotronH.architectures[0] == "NemotronHForCausalLM"
  doAssert nemotronHCfg.nemotronH.modelType == "nemotron_h"
  doAssert nemotronHCfg.nemotronH.chunkSize == 128
  doAssert nemotronHCfg.nemotronH.mambaNumHeads == 64
  doAssert nemotronHCfg.nemotronH.vocabSize == 131072
  doAssert nemotronHCfg.nemotronH.autoMap.len == 3
  doAssert nemotronHCfg.nemotronH.autoMap["AutoConfig"] ==
    "configuration_nemotron_h.NemotronHConfig"