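# Decode HuggingFace-style config.json documents into an object variant
# whose branch is selected by the "architectures" discriminator, using a
# custom readJson hook over jsonx's pull-mode JsonParser.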
import std/tables
import jsonx
import jsonx/parsejson

type
  ArchitectureKind = enum
    akGlm4Moe
    akGlm4MoeLite
    akNemotronH

  BaseConfig = ref object of RootObj
    architectures: seq[string]
    modelType: string
    vocabSize: int

  Glm4MoeConfig = ref object of BaseConfig
    headDim: int
    useQkNorm: bool
    numHiddenLayers: int

  Glm4MoeLiteConfig = ref object of BaseConfig
    topkMethod: string
    qLoraRank: int
    numHiddenLayers: int

  NemotronHConfig = ref object of BaseConfig
    autoMap: Table[string, string]
    chunkSize: int
    mambaNumHeads: int

  ModelConfig = object
    case kind: ArchitectureKind
    of akGlm4Moe:
      glm4Moe: Glm4MoeConfig
    of akGlm4MoeLite:
      glm4MoeLite: Glm4MoeLiteConfig
    of akNemotronH:
      nemotronH: NemotronHConfig
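
# Map the first "architectures" entry to its enum discriminator; returns
# false for unsupported names so the caller can raise a parse error.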
proc parseArchitecture(name: string; kind: var ArchitectureKind): bool =
  case name
  of "Glm4MoeForCausalLM":
    kind = akGlm4Moe
    result = true
  of "Glm4MoeLiteForCausalLM":
    kind = akGlm4MoeLite
    result = true
  of "NemotronHForCausalLM":
    kind = akNemotronH
    result = true
  else:
    result = false
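
# Custom readJson hook for ModelConfig. The discriminator may arrive
# anywhere in the object, so every known key is buffered into locals first
# and the matching variant branch is constructed once the object is fully
# read.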
proc readJson(dst: var ModelConfig; p: var JsonParser) =
  var architectures: seq[string]
  var modelType = ""
  var vocabSize = 0
  var headDim = 0
  var useQkNorm = false
  var numHiddenLayers = 0
  var topkMethod = ""
  var qLoraRank = 0
  var autoMap = initTable[string, string]()
  var chunkSize = 0
  var mambaNumHeads = 0
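  # Single pass over the top-level object: known keys are decoded,
  # everything else is skipped with skipJson.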
  eat(p, tkCurlyLe)
  while p.tok != tkCurlyRi:
    if p.tok != tkString:
      raiseParseErr(p, "string literal as key")
    case p.a
    of "architectures":
      discard getTok(p)
      eat(p, tkColon)
      readJson(architectures, p)
    of "model_type":
      discard getTok(p)
      eat(p, tkColon)
      readJson(modelType, p)
    of "vocab_size":
      discard getTok(p)
      eat(p, tkColon)
      readJson(vocabSize, p)
    of "head_dim":
      discard getTok(p)
      eat(p, tkColon)
      readJson(headDim, p)
    of "use_qk_norm":
      discard getTok(p)
      eat(p, tkColon)
      readJson(useQkNorm, p)
    of "num_hidden_layers":
      discard getTok(p)
      eat(p, tkColon)
      readJson(numHiddenLayers, p)
    of "topk_method":
      discard getTok(p)
      eat(p, tkColon)
      readJson(topkMethod, p)
    of "q_lora_rank":
      discard getTok(p)
      eat(p, tkColon)
      readJson(qLoraRank, p)
    of "auto_map":
      discard getTok(p)
      eat(p, tkColon)
      readJson(autoMap, p)
    of "chunk_size":
      discard getTok(p)
      eat(p, tkColon)
      readJson(chunkSize, p)
    of "mamba_num_heads":
      discard getTok(p)
      eat(p, tkColon)
      readJson(mambaNumHeads, p)
    else:
      discard getTok(p)
      eat(p, tkColon)
      skipJson(p)
    expectObjectSeparator(p)
  eat(p, tkCurlyRi)
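  # All keys consumed; validate the discriminator and build the branch.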
  if architectures.len == 0:
    raiseParseErr(p, "non-empty architectures")
  var kind: ArchitectureKind
  if not parseArchitecture(architectures[0], kind):
    raiseParseErr(p, "supported architectures[0] discriminator")
  case kind
  of akGlm4Moe:
    dst = ModelConfig(
      kind: akGlm4Moe,
      glm4Moe: Glm4MoeConfig(
        architectures: architectures,
        modelType: modelType,
        vocabSize: vocabSize,
        headDim: headDim,
        useQkNorm: useQkNorm,
        numHiddenLayers: numHiddenLayers
      )
    )
  of akGlm4MoeLite:
    dst = ModelConfig(
      kind: akGlm4MoeLite,
      glm4MoeLite: Glm4MoeLiteConfig(
        architectures: architectures,
        modelType: modelType,
        vocabSize: vocabSize,
        topkMethod: topkMethod,
        qLoraRank: qLoraRank,
        numHiddenLayers: numHiddenLayers
      )
    )
  of akNemotronH:
    dst = ModelConfig(
      kind: akNemotronH,
      nemotronH: NemotronHConfig(
        architectures: architectures,
        modelType: modelType,
        vocabSize: vocabSize,
        autoMap: autoMap,
        chunkSize: chunkSize,
        mambaNumHeads: mambaNumHeads
      )
    )
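
# Sample config.json documents for the three supported architectures,
# used as fixtures by the tests below.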
const
  glm4MoeJson = """
{
  "architectures": [
    "Glm4MoeForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "pad_token_id": 151329,
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "partial_rotary_factor": 0.5,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "max_position_embeddings": 202752,
  "model_type": "glm4_moe",
  "moe_intermediate_size": 1536,
  "norm_topk_prob": true,
  "num_attention_heads": 96,
  "n_group": 1,
  "topk_group": 1,
  "n_routed_experts": 160,
  "n_shared_experts": 1,
  "routed_scaling_factor": 2.5,
  "num_experts_per_tok": 8,
  "first_k_dense_replace": 3,
  "num_hidden_layers": 92,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "num_nextn_predict_layers": 1,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.0",
  "use_cache": true,
  "use_qk_norm": true,
  "vocab_size": 151552
}
"""
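  # GLM-4 MoE Lite sample; exercises topk_method and q_lora_rank.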
  glm4MoeLiteJson = """
{
  "architectures": [
    "Glm4MoeLiteForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "pad_token_id": 154820,
  "eos_token_id": [
    154820,
    154827,
    154829
  ],
  "hidden_act": "silu",
  "hidden_size": 2048,
  "intermediate_size": 10240,
  "max_position_embeddings": 202752,
  "model_type": "glm4_moe_lite",
  "moe_intermediate_size": 1536,
  "topk_method": "noaux_tc",
  "norm_topk_prob": true,
  "num_attention_heads": 20,
  "n_group": 1,
  "topk_group": 1,
  "n_routed_experts": 64,
  "n_shared_experts": 1,
  "routed_scaling_factor": 1.8,
  "num_experts_per_tok": 4,
  "first_k_dense_replace": 1,
  "num_hidden_layers": 47,
  "num_key_value_heads": 20,
  "num_nextn_predict_layers": 1,
  "partial_rotary_factor": 1.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "tie_word_embeddings": false,
  "dtype": "bfloat16",
  "transformers_version": "5.0.0rc0",
  "q_lora_rank": 768,
  "kv_lora_rank": 512,
  "qk_nope_head_dim": 192,
  "qk_rope_head_dim": 64,
  "v_head_dim": 256,
  "vocab_size": 154880
}
"""
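  # Nemotron-H sample; exercises the auto_map table plus chunk_size and
  # mamba_num_heads.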
  nemotronHJson = """
{
  "architectures": [
    "NemotronHForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_nemotron_h.NemotronHConfig",
    "AutoModel": "modeling_nemotron_h.NemotronHForCausalLM",
    "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM"
  },
  "bos_token_id": 1,
  "chunk_size": 128,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "head_dim": 128,
  "hidden_dropout": 0.0,
  "hidden_size": 2688,
  "hybrid_override_pattern": "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
  "initializer_range": 0.02,
  "intermediate_size": 1856,
  "layer_norm_epsilon": 1e-05,
  "mamba_head_dim": 64,
  "mamba_hidden_act": "silu",
  "mamba_num_heads": 64,
  "mamba_proj_bias": false,
  "mamba_ssm_cache_dtype": "float32",
  "max_position_embeddings": 262144,
  "mlp_bias": false,
  "mlp_hidden_act": "relu2",
  "model_type": "nemotron_h",
  "moe_intermediate_size": 1856,
  "moe_shared_expert_intermediate_size": 3712,
  "n_group": 1,
  "n_groups": 8,
  "n_routed_experts": 128,
  "n_shared_experts": 1,
  "norm_eps": 1e-05,
  "norm_topk_prob": true,
  "num_attention_heads": 32,
  "num_experts_per_tok": 6,
  "num_hidden_layers": 52,
  "num_key_value_heads": 2,
  "num_logits_to_keep": 1,
  "pad_token_id": 0,
  "partial_rotary_factor": 1.0,
  "rescale_prenorm_residual": true,
  "residual_in_fp32": false,
  "rope_theta": 10000,
  "routed_scaling_factor": 2.5,
  "sliding_window": null,
  "ssm_state_size": 128,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "topk_group": 1,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.2",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "use_mamba_kernels": true,
  "vocab_size": 131072
}
"""
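
# Smoke tests: decode each fixture and check the selected branch and a few
# representative fields.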
when isMainModule:
  let glm4MoeCfg = fromJson(glm4MoeJson, ModelConfig)
  doAssert glm4MoeCfg.kind == akGlm4Moe
  doAssert glm4MoeCfg.glm4Moe.architectures[0] == "Glm4MoeForCausalLM"
  doAssert glm4MoeCfg.glm4Moe.modelType == "glm4_moe"
  doAssert glm4MoeCfg.glm4Moe.headDim == 128
  doAssert glm4MoeCfg.glm4Moe.useQkNorm
  doAssert glm4MoeCfg.glm4Moe.numHiddenLayers == 92
  doAssert glm4MoeCfg.glm4Moe.vocabSize == 151552

  let glm4MoeLiteCfg = fromJson(glm4MoeLiteJson, ModelConfig)
  doAssert glm4MoeLiteCfg.kind == akGlm4MoeLite
  doAssert glm4MoeLiteCfg.glm4MoeLite.architectures[0] == "Glm4MoeLiteForCausalLM"
  doAssert glm4MoeLiteCfg.glm4MoeLite.modelType == "glm4_moe_lite"
  doAssert glm4MoeLiteCfg.glm4MoeLite.topkMethod == "noaux_tc"
  doAssert glm4MoeLiteCfg.glm4MoeLite.qLoraRank == 768
  doAssert glm4MoeLiteCfg.glm4MoeLite.numHiddenLayers == 47
  doAssert glm4MoeLiteCfg.glm4MoeLite.vocabSize == 154880

  let nemotronHCfg = fromJson(nemotronHJson, ModelConfig)
  doAssert nemotronHCfg.kind == akNemotronH
  doAssert nemotronHCfg.nemotronH.architectures[0] == "NemotronHForCausalLM"
  doAssert nemotronHCfg.nemotronH.modelType == "nemotron_h"
  doAssert nemotronHCfg.nemotronH.chunkSize == 128
  doAssert nemotronHCfg.nemotronH.mambaNumHeads == 64
  doAssert nemotronHCfg.nemotronH.vocabSize == 131072
  doAssert nemotronHCfg.nemotronH.autoMap.len == 3
  doAssert nemotronHCfg.nemotronH.autoMap["AutoConfig"] ==
    "configuration_nemotron_h.NemotronHConfig"