{
  "_name_or_path": "/home/dinalt/ai_assets/models/walsh",
  "activation_args": {},
  "activation_cls": "torch.nn.GELU",
  "architectures": [
    "HFCausalModel"
  ],
  "attention_args": {
    "beta": 0.25,
    "dropout": 0.1
  },
  "attention_cls": ".CausalSelfAttention",
  "auto_map": {
    "AutoConfig": "modelling_walsh.Config",
    "AutoModelForCausalLM": "modelling_walsh.HFCausalModel"
  },
  "d_embed": 2048,
  "dim_feedforward": 8192,
  "dropout": 0.1,
  "embdding_cls": "torch.nn.Embedding",
  "embedding_args": {},
  "feedforward_args": {
    "beta": 0.25,
    "bias": true
  },
  "feedforward_cls": ".FeedforwardLayer",
  "head_args": {},
  "head_cls": ".Transformer",
  "init_gain": 1.0,
  "layer_args": {
    "alpha": 2.828427124746
  },
  "layer_cls": ".DeepnetLayer",
  "layer_stack_args": {},
  "layer_stack_cls": ".TransformerLayerStack",
  "loss_function": ".causal_loss",
  "max_sequence_length": 16384,
  "model_type": "walsh-causal-v1",
  "norm_args": {
    "normalized_shape": 2048
  },
  "norm_cls": "torch.nn.LayerNorm",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "output_proj_args": {},
  "output_proj_cls": "torch.nn.Linear",
  "pad_index": null,
  "positional_encoder_args": {
    "d_embed": 2048,
    "gain": 0.3333,
    "max_seq": 16384
  },
  "positional_encoder_cls": ".RSWalshPositionalEncoder",
  "torch_dtype": "bfloat16",
  "transformer_args": {},
  "transformer_cls": ".Transformer",
  "transformers_version": "4.37.2",
  "vocab_size": 32000
}