katerynaCh committed
Commit a025381 · verified · 1 Parent(s): 43a8539

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,402 @@
1
+ {
2
+ "architectures": [
3
+ "NemotronParseForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "hf_nemotron_parse_config.NemotronParseConfig",
7
+ "AutoModel": "hf_nemotron_parse_modeling.NemotronParseForConditionalGeneration",
8
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseImageProcessor",
9
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseProcessor"
10
+ },
11
+ "bos_token_id": 0,
12
+ "decoder": {
13
+ "_attn_implementation": "sdpa",
14
+ "_name_or_path": "",
15
+ "activation_dropout": 0.0,
16
+ "activation_function": "gelu",
17
+ "add_cross_attention": true,
18
+ "add_final_layer_norm": true,
19
+ "architectures": null,
20
+ "attention_dropout": 0.0,
21
+ "bad_words_ids": null,
22
+ "begin_suppress_tokens": null,
23
+ "bos_token_id": 0,
24
+ "chunk_size_feed_forward": 0,
25
+ "classifier_dropout": 0.0,
26
+ "cross_attention_hidden_size": null,
27
+ "d_model": 1024,
28
+ "decoder_attention_heads": 16,
29
+ "decoder_ffn_dim": 4096,
30
+ "decoder_layerdrop": 0.0,
31
+ "decoder_layers": 10,
32
+ "decoder_start_token_id": null,
33
+ "diversity_penalty": 0.0,
34
+ "do_sample": false,
35
+ "dropout": 0.1,
36
+ "early_stopping": false,
37
+ "encoder_attention_heads": 16,
38
+ "encoder_ffn_dim": 4096,
39
+ "encoder_layerdrop": 0.0,
40
+ "encoder_layers": 12,
41
+ "encoder_no_repeat_ngram_size": 0,
42
+ "eos_token_id": 2,
43
+ "exponential_decay_length_penalty": null,
44
+ "finetuning_task": null,
45
+ "forced_bos_token_id": null,
46
+ "forced_eos_token_id": 2,
47
+ "hidden_size": 1024,
48
+ "id2label": {
49
+ "0": "LABEL_0",
50
+ "1": "LABEL_1",
51
+ "2": "LABEL_2"
52
+ },
53
+ "init_std": 0.02,
54
+ "is_decoder": true,
55
+ "is_encoder_decoder": false,
56
+ "label2id": {
57
+ "LABEL_0": 0,
58
+ "LABEL_1": 1,
59
+ "LABEL_2": 2
60
+ },
61
+ "length_penalty": 1.0,
62
+ "max_length": 20,
63
+ "min_length": 0,
64
+ "model_type": "nemotron_parse_text",
65
+ "no_repeat_ngram_size": 0,
66
+ "num_beam_groups": 1,
67
+ "num_beams": 1,
68
+ "num_hidden_layers": 12,
69
+ "num_return_sequences": 1,
70
+ "output_attentions": false,
71
+ "output_hidden_states": false,
72
+ "output_scores": false,
73
+ "pad_token_id": 1,
74
+ "prefix": null,
75
+ "problem_type": null,
76
+ "pruned_heads": {},
77
+ "remove_invalid_values": false,
78
+ "repetition_penalty": 1.0,
79
+ "return_dict": true,
80
+ "return_dict_in_generate": false,
81
+ "scale_embedding": true,
82
+ "sep_token_id": null,
83
+ "suppress_tokens": null,
84
+ "task_specific_params": null,
85
+ "temperature": 1.0,
86
+ "tf_legacy_loss": false,
87
+ "tie_encoder_decoder": false,
88
+ "tie_word_embeddings": false,
89
+ "tokenizer_class": null,
90
+ "top_k": 50,
91
+ "top_p": 1.0,
92
+ "torch_dtype": "bfloat16",
93
+ "torchscript": false,
94
+ "transformers_version": "4.51.3",
95
+ "typical_p": 1.0,
96
+ "use_bfloat16": true,
97
+ "use_cache": true,
98
+ "vocab_size": 52352
99
+ },
100
+ "decoder_start_token_id": 2,
101
+ "encoder": {
102
+ "_attn_implementation": "eager",
103
+ "_name_or_path": "nvidia/C-RADIOv2-H",
104
+ "adaptor_configs": {},
105
+ "adaptor_names": null,
106
+ "add_cross_attention": false,
107
+ "architectures": [
108
+ "RADIOModel"
109
+ ],
110
+ "args": {
111
+ "aa": null,
112
+ "amp": true,
113
+ "amp_dtype": "bfloat16",
114
+ "amp_impl": "native",
115
+ "aug_repeats": 0,
116
+ "aug_splits": 0,
117
+ "bn_eps": null,
118
+ "bn_momentum": null,
119
+ "cache_dir": null,
120
+ "channels_last": false,
121
+ "checkpoint_hist": 10,
122
+ "chk_keep_forever": 100,
123
+ "class_map": "",
124
+ "clip_grad": null,
125
+ "clip_mode": "norm",
126
+ "cls_token_per_teacher": true,
127
+ "coco_annotations_file": "/datasets/coco2017-adlsa/annotations/captions_val2017.json",
128
+ "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
129
+ "color_jitter": 0.4,
130
+ "cooldown_epochs": 0,
131
+ "cpe_max_size": 2048,
132
+ "crd_loss": false,
133
+ "crd_loss_weight": 0.8,
134
+ "crop_pct": null,
135
+ "cutmix": 0.0,
136
+ "cutmix_minmax": null,
137
+ "dataset_download": false,
138
+ "debug_full_knn": false,
139
+ "decay_epochs": 90,
140
+ "decay_milestones": [
141
+ 90,
142
+ 180,
143
+ 270
144
+ ],
145
+ "decay_rate": 0.1,
146
+ "depchain": true,
147
+ "dist_bn": "reduce",
148
+ "dist_norm_weight": 0.0,
149
+ "distributed": true,
150
+ "drop": 0.0,
151
+ "drop_block": null,
152
+ "drop_connect": null,
153
+ "drop_path": null,
154
+ "dtype": "bfloat16",
155
+ "epoch_repeats": 0.0,
156
+ "eval": false,
157
+ "eval_metric": "knn_top1",
158
+ "eval_teacher": false,
159
+ "eval_teacher_only": false,
160
+ "eval_throughput": false,
161
+ "fast_norm": false,
162
+ "fd_loss_fn": "MSE",
163
+ "feature_normalization": "SHIP_NORM",
164
+ "feature_summarizer": "cls_token",
165
+ "feature_upscale_factor": null,
166
+ "force_new_wandb_id": false,
167
+ "force_spectral_reparam": true,
168
+ "freeze_bn": false,
169
+ "fsdp": false,
170
+ "fuser": "",
171
+ "gp": null,
172
+ "grad_accum_steps": 1,
173
+ "grad_checkpointing": false,
174
+ "head_init_bias": null,
175
+ "head_init_scale": null,
176
+ "head_warmup": 5,
177
+ "head_weight_decay": 0.001,
178
+ "hflip": 0.5,
179
+ "img_size": null,
180
+ "in_chans": null,
181
+ "initial_checkpoint": null,
182
+ "input_size": null,
183
+ "interpolation": "",
184
+ "layer_decay": null,
185
+ "local_rank": 0,
186
+ "log_interval": 50,
187
+ "log_mlflow": false,
188
+ "log_wandb": true,
189
+ "loss_auto_balance": false,
190
+ "lr_base": 0.1,
191
+ "lr_base_scale": "",
192
+ "lr_base_size": 256,
193
+ "lr_cycle_decay": 0.5,
194
+ "lr_cycle_limit": 1,
195
+ "lr_cycle_mul": 1.0,
196
+ "lr_k_decay": 1.0,
197
+ "lr_noise": null,
198
+ "lr_noise_pct": 0.67,
199
+ "lr_noise_std": 1.0,
200
+ "mean": null,
201
+ "mesa": false,
202
+ "min_lr": 0,
203
+ "mixup": 0.0,
204
+ "mixup_mode": "batch",
205
+ "mixup_off_epoch": 0,
206
+ "mixup_prob": 1.0,
207
+ "mixup_switch_prob": 0.5,
208
+ "mlp_hidden_size": 1520,
209
+ "mlp_num_inner": 3,
210
+ "mlp_version": "v2",
211
+ "model": "vit_huge_patch16_224",
212
+ "model_kwargs": {},
213
+ "model_norm": false,
214
+ "momentum": 0.9,
215
+ "no_aug": false,
216
+ "no_ddp_bb": true,
217
+ "no_prefetcher": false,
218
+ "no_resume_opt": false,
219
+ "num_classes": null,
220
+ "opt_betas": null,
221
+ "opt_eps": null,
222
+ "patience_epochs": 10,
223
+ "pin_mem": false,
224
+ "prefetcher": true,
225
+ "pretrained": false,
226
+ "rank": 0,
227
+ "ratio": [
228
+ 0.75,
229
+ 1.3333333333333333
230
+ ],
231
+ "recount": 1,
232
+ "recovery_interval": 0,
233
+ "register_multiple": 8,
234
+ "remode": "pixel",
235
+ "reprob": 0.0,
236
+ "reset_loss_state": false,
237
+ "resplit": false,
238
+ "save_images": false,
239
+ "scale": [
240
+ 0.5,
241
+ 1.0
242
+ ],
243
+ "sched": "cosine",
244
+ "seed": 42,
245
+ "smoothing": 0.1,
246
+ "spectral_heads": false,
247
+ "spectral_reparam": false,
248
+ "split_bn": false,
249
+ "start_epoch": null,
250
+ "std": null,
251
+ "stream_teachers": true,
252
+ "sync_bn": false,
253
+ "synchronize_step": false,
254
+ "teachers": [
255
+ {
256
+ "fd_normalize": false,
257
+ "feature_distillation": true,
258
+ "input_size": 378,
259
+ "model": "ViT-H-14-378-quickgelu",
260
+ "name": "clip",
261
+ "pretrained": "dfn5b",
262
+ "type": "open_clip",
263
+ "use_summary": true
264
+ },
265
+ {
266
+ "fd_normalize": false,
267
+ "feature_distillation": true,
268
+ "input_size": 378,
269
+ "model": "ViT-SO400M-14-SigLIP-384",
270
+ "name": "siglip",
271
+ "pretrained": "webli",
272
+ "type": "open_clip",
273
+ "use_summary": true
274
+ },
275
+ {
276
+ "fd_normalize": false,
277
+ "feature_distillation": true,
278
+ "input_size": 378,
279
+ "model": "dinov2_vitg14_reg",
280
+ "name": "dino_v2",
281
+ "type": "dino_v2",
282
+ "use_summary": true
283
+ },
284
+ {
285
+ "fd_normalize": false,
286
+ "feature_distillation": true,
287
+ "input_size": 1024,
288
+ "model": "vit-h",
289
+ "name": "sam",
290
+ "type": "sam",
291
+ "use_summary": false
292
+ }
293
+ ],
294
+ "torchcompile": null,
295
+ "torchscript": false,
296
+ "train_interpolation": "random",
297
+ "train_split": "train",
298
+ "tta": 0,
299
+ "use_coco": false,
300
+ "use_multi_epochs_loader": false,
301
+ "val_ema_only": false,
302
+ "val_split": "val",
303
+ "vflip": 0.0,
304
+ "vitdet_version": 1,
305
+ "wandb_entity": "",
306
+ "wandb_job_type": "",
307
+ "wandb_name": "",
308
+ "wandb_project": "",
309
+ "warmup_lr": 1e-05,
310
+ "warmup_prefix": false,
311
+ "worker_seeding": "all",
312
+ "workers": 8,
313
+ "world_size": 256
314
+ },
315
+ "auto_map": {
316
+ "AutoConfig": "nvidia/C-RADIOv2-H--hf_model.RADIOConfig",
317
+ "AutoModel": "nvidia/C-RADIOv2-H--hf_model.RADIOModel"
318
+ },
319
+ "bad_words_ids": null,
320
+ "begin_suppress_tokens": null,
321
+ "bos_token_id": null,
322
+ "chunk_size_feed_forward": 0,
323
+ "cross_attention_hidden_size": null,
324
+ "decoder_start_token_id": null,
325
+ "diversity_penalty": 0.0,
326
+ "do_sample": false,
327
+ "early_stopping": false,
328
+ "encoder_no_repeat_ngram_size": 0,
329
+ "eos_token_id": null,
330
+ "exponential_decay_length_penalty": null,
331
+ "feature_normalizer_config": null,
332
+ "finetuning_task": null,
333
+ "forced_bos_token_id": null,
334
+ "forced_eos_token_id": null,
335
+ "id2label": {
336
+ "0": "LABEL_0",
337
+ "1": "LABEL_1"
338
+ },
339
+ "inter_feature_normalizer_config": null,
340
+ "is_decoder": false,
341
+ "is_encoder_decoder": false,
342
+ "label2id": {
343
+ "LABEL_0": 0,
344
+ "LABEL_1": 1
345
+ },
346
+ "length_penalty": 1.0,
347
+ "max_length": 20,
348
+ "max_resolution": 2048,
349
+ "min_length": 0,
350
+ "model_type": "",
351
+ "no_repeat_ngram_size": 0,
352
+ "num_beam_groups": 1,
353
+ "num_beams": 1,
354
+ "num_return_sequences": 1,
355
+ "output_attentions": false,
356
+ "output_hidden_states": false,
357
+ "output_scores": false,
358
+ "pad_token_id": null,
359
+ "patch_size": 16,
360
+ "preferred_resolution": [
361
+ 768,
362
+ 768
363
+ ],
364
+ "prefix": null,
365
+ "problem_type": null,
366
+ "pruned_heads": {},
367
+ "remove_invalid_values": false,
368
+ "repetition_penalty": 1.0,
369
+ "return_dict": true,
370
+ "return_dict_in_generate": false,
371
+ "sep_token_id": null,
372
+ "suppress_tokens": null,
373
+ "task_specific_params": null,
374
+ "temperature": 1.0,
375
+ "tf_legacy_loss": false,
376
+ "tie_encoder_decoder": false,
377
+ "tie_word_embeddings": true,
378
+ "tokenizer_class": null,
379
+ "top_k": 50,
380
+ "top_p": 1.0,
381
+ "torch_dtype": "bfloat16",
382
+ "torchscript": false,
383
+ "transformers_version": "4.51.3",
384
+ "typical_p": 1.0,
385
+ "use_bfloat16": true,
386
+ "version": "radio_v2.5-h",
387
+ "vitdet_window_size": null
388
+ },
389
+ "eos_token_id": 2,
390
+ "image_size": [
391
+ 2048,
392
+ 1648
393
+ ],
394
+ "is_encoder_decoder": true,
395
+ "max_sequence_length": 9000,
396
+ "model_type": "nemotron_parse",
397
+ "pad_token_id": 1,
398
+ "tie_word_embeddings": false,
399
+ "torch_dtype": "bfloat16",
400
+ "transformers_version": "4.51.3",
401
+ "vocab_size": 52327
402
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "pad_token_id": 1,
8
+ "max_new_tokens": 9000,
9
+ "do_sample": false,
10
+ "num_beams": 1,
11
+ "repetition_penalty": 1.1,
12
+ "transformers_version": "4.51.3"
13
+ }
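Given the `auto_map` entries in config.json and the generation defaults in generation_config.json above, the checkpoint is intended to be loaded through the `trust_remote_code` path. Below is a minimal loading sketch, not an official recipe: the repo id and image path are placeholders, and the exact generation kwargs may need adjustment.

# Minimal usage sketch (placeholder repo id and image path).
from transformers import AutoModel, AutoProcessor
from PIL import Image

repo_id = "path/or/repo-id/of/this/upload"  # placeholder
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

page = Image.open("page.png").convert("RGB")  # placeholder document image
inputs = processor(images=page, return_tensors="pt")
# decoder_start_token_id=2, max_new_tokens=9000, repetition_penalty=1.1 come from generation_config.json
output_ids = model.generate(pixel_values=inputs["pixel_values"])
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])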
hf_nemotron_parse_config.py ADDED
@@ -0,0 +1,136 @@
3
+ from transformers import PretrainedConfig
4
+ from typing import List, Optional
5
+
6
+ from transformers.dynamic_module_utils import get_class_from_dynamic_module
7
+
8
+ class NemotronParseTextConfig(PretrainedConfig):
9
+ """
10
+ Configuration class for NemotronParse text decoder (mBART-based).
11
+ """
12
+ model_type = "nemotron_parse_text"
13
+
14
+ def __init__(
15
+ self,
16
+ vocab_size: int = 250027,
17
+ d_model: int = 1024,
18
+ encoder_layers: int = 12,
19
+ decoder_layers: int = 12,
20
+ encoder_attention_heads: int = 16,
21
+ decoder_attention_heads: int = 16,
22
+ decoder_ffn_dim: int = 4096,
23
+ encoder_ffn_dim: int = 4096,
24
+ activation_function: str = "gelu",
25
+ dropout: float = 0.1,
26
+ attention_dropout: float = 0.0,
27
+ activation_dropout: float = 0.0,
28
+ classifier_dropout: float = 0.0,
29
+ init_std: float = 0.02,
30
+ encoder_layerdrop: float = 0.0,
31
+ decoder_layerdrop: float = 0.0,
32
+ scale_embedding: bool = False,
33
+ use_cache: bool = True,
34
+ num_labels: int = 3,
35
+ forced_eos_token_id: int = 2,
36
+ add_cross_attention: bool = True, # Enable cross-attention for vision-encoder-decoder
37
+ is_decoder: bool = True, # This is a decoder
38
+ max_sequence_length: int = 9000,
39
+ **kwargs
40
+ ):
41
+ super().__init__(**kwargs)
42
+ self.vocab_size = vocab_size
43
+ self.d_model = d_model
44
+ self.encoder_layers = encoder_layers
45
+ self.decoder_layers = decoder_layers
46
+ self.encoder_attention_heads = encoder_attention_heads
47
+ self.decoder_attention_heads = decoder_attention_heads
48
+ self.decoder_ffn_dim = decoder_ffn_dim
49
+ self.encoder_ffn_dim = encoder_ffn_dim
50
+ self.activation_function = activation_function
51
+ self.dropout = dropout
52
+ self.attention_dropout = attention_dropout
53
+ self.activation_dropout = activation_dropout
54
+ self.classifier_dropout = classifier_dropout
55
+ self.init_std = init_std
56
+ self.encoder_layerdrop = encoder_layerdrop
57
+ self.decoder_layerdrop = decoder_layerdrop
58
+ self.scale_embedding = scale_embedding
59
+ self.use_cache = use_cache
60
+ self.num_labels = num_labels
61
+ self.add_cross_attention = add_cross_attention
62
+ self.is_decoder = is_decoder
63
+
64
+ # Add hidden_size as alias for d_model (for compatibility)
65
+ self.hidden_size = self.d_model
66
+ self.forced_eos_token_id = forced_eos_token_id
67
+ self.num_attention_heads = self.encoder_attention_heads
68
+
69
+ self.max_sequence_length = max_sequence_length
70
+
71
+
72
+ class NemotronParseConfig(PretrainedConfig):
73
+ """
74
+ Configuration class for NemotronParse model.
75
+
76
+ This configuration class is used to store the configuration of a [`NemotronParseForConditionalGeneration`] model.
77
+ It is used to instantiate a NemotronParse model according to the specified arguments, defining the vision and text model configs.
78
+ """
79
+ model_type = "nemotron_parse"
80
+ is_composition = True
81
+ max_sequence_length = 9000
82
+
83
+ def __init__(
84
+ self,
85
+ encoder: Optional[dict] = None,
86
+ decoder: Optional[dict] = None,
87
+ tie_word_embeddings: bool = False,
88
+ decoder_start_token_id: int = 2,
89
+ pad_token_id: int = 1,
90
+ eos_token_id: int = 2,
91
+ bos_token_id: int = 0,
92
+ image_size: List[int] = [2048, 1648],
93
+ is_encoder_decoder: bool = True,
94
+ max_sequence_length: int = 9000,
95
+ **kwargs
96
+ ):
97
+ super().__init__(
98
+ tie_word_embeddings=tie_word_embeddings,
99
+ decoder_start_token_id=decoder_start_token_id,
100
+ pad_token_id=pad_token_id,
101
+ eos_token_id=eos_token_id,
102
+ bos_token_id=bos_token_id,
103
+ max_sequence_length=max_sequence_length,
104
+ **kwargs
105
+ )
106
+
107
+
108
+ if decoder is None:
109
+ decoder = {}
110
+
111
+ if encoder is not None:
112
+ assert "auto_map" in encoder and "AutoConfig" in encoder["auto_map"]
113
+ vision_auto_config = get_class_from_dynamic_module(*encoder["auto_map"]["AutoConfig"].split("--")[::-1])
114
+ self.encoder = vision_auto_config(**encoder)
115
+ else:
116
+ self.encoder = PretrainedConfig()
117
+
118
+ decoder["max_sequence_length"] = max_sequence_length
119
+ self.decoder = NemotronParseTextConfig(**decoder)
120
+ self.image_size = image_size
121
+
122
+ # Initialize vocab size from text config
123
+ self.vocab_size = self.decoder.vocab_size
124
+ self.is_encoder_decoder = is_encoder_decoder
125
+ self.max_sequence_length = max_sequence_length
126
+
127
+ def to_dict(self):
128
+ """
129
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
130
+ """
131
+ output = super().to_dict()
132
+ output["encoder"] = self.encoder.to_dict()
133
+ output["decoder"] = self.decoder.to_dict()
134
+ output["model_type"] = self.model_type
135
+ output["is_encoder_decoder"] = self.is_encoder_decoder
136
+ return output
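As a quick illustration of the composition logic above, here is a hypothetical sketch that builds the config without the RADIO encoder dict (the real checkpoint passes the serialized nvidia/C-RADIOv2-H config through `encoder["auto_map"]`):

# Hypothetical sketch: composing NemotronParseConfig from a plain decoder dict.
from hf_nemotron_parse_config import NemotronParseConfig

cfg = NemotronParseConfig(
    decoder={"d_model": 1024, "decoder_layers": 10, "vocab_size": 52352},
    image_size=[2048, 1648],
)
print(cfg.decoder.hidden_size)   # 1024, aliased from d_model
print(cfg.vocab_size)            # 52352, mirrored from the decoder config
print(cfg.max_sequence_length)   # 9000 (default)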
hf_nemotron_parse_modeling.py ADDED
@@ -0,0 +1,585 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import CrossEntropyLoss
5
+ from transformers import PreTrainedModel, GenerationMixin
6
+ from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import VisionEncoderDecoderModel
7
+ from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
8
+ from transformers.modeling_outputs import Seq2SeqLMOutput
9
+ from transformers.models.mbart.modeling_mbart import MBartPreTrainedModel, MBartConfig, MBartScaledWordEmbedding, MBartDecoderLayer, BaseModelOutputWithPastAndCrossAttentions
10
+ from transformers.models.donut.modeling_donut_swin import DonutSwinModelOutput
11
+ from einops import rearrange
12
+ from typing import Optional, List, Union, Tuple
13
+ import warnings
14
+ from transformers.modeling_outputs import BaseModelOutput
15
+ from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
16
+ from hf_nemotron_parse_config import NemotronParseConfig
17
+ from transformers import AutoModel
18
+ import time
19
+ from transformers.modeling_attn_mask_utils import (
20
+ _prepare_4d_attention_mask,
21
+ _prepare_4d_attention_mask_for_sdpa,
22
+ _prepare_4d_causal_attention_mask,
23
+ _prepare_4d_causal_attention_mask_for_sdpa,
24
+ )
25
+
26
+ logger = logging.get_logger(__name__)
+
27
+ class NemotronParseDecoder(MBartPreTrainedModel):
28
+ """
29
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
30
+
31
+ Args:
32
+ config: MBartConfig
33
+ embed_tokens (nn.Embedding): output embedding
34
+ """
35
+
36
+ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
37
+ super().__init__(config)
38
+ self.dropout = config.dropout
39
+ self.layerdrop = config.decoder_layerdrop
40
+ self.padding_idx = config.pad_token_id
41
+ embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
42
+
43
+ self.embed_tokens = MBartScaledWordEmbedding(
44
+ config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
45
+ )
46
+
47
+ if embed_tokens is not None:
48
+ self.embed_tokens.weight = embed_tokens.weight
49
+
50
+ self.layers = nn.ModuleList([MBartDecoderLayer(config) for _ in range(config.decoder_layers)])
51
+ self.config = config
52
+
53
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
54
+ self.layer_norm = nn.LayerNorm(config.d_model)
55
+
56
+ self.gradient_checkpointing = False
57
+ # Initialize weights and apply final processing
58
+ self.post_init()
59
+
60
+ def get_input_embeddings(self):
61
+ return self.embed_tokens
62
+
63
+ def set_input_embeddings(self, value):
64
+ self.embed_tokens = value
65
+
66
+ def forward(
67
+ self,
68
+ input_ids: Optional[torch.LongTensor] = None,
69
+ attention_mask: Optional[torch.Tensor] = None,
70
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
71
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
72
+ head_mask: Optional[torch.Tensor] = None,
73
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
74
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
75
+ inputs_embeds: Optional[torch.FloatTensor] = None,
76
+ use_cache: Optional[bool] = None,
77
+ output_attentions: Optional[bool] = None,
78
+ output_hidden_states: Optional[bool] = None,
79
+ return_dict: Optional[bool] = None,
80
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
81
+ r"""
82
+ Args:
83
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
84
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
85
+ provide it.
86
+
87
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
88
+ [`PreTrainedTokenizer.__call__`] for details.
89
+
90
+ [What are input IDs?](../glossary#input-ids)
91
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
92
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
93
+
94
+ - 1 for tokens that are **not masked**,
95
+ - 0 for tokens that are **masked**.
96
+
97
+ [What are attention masks?](../glossary#attention-mask)
98
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
99
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
100
+ of the decoder.
101
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
102
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
103
+ selected in `[0, 1]`:
104
+
105
+ - 1 for tokens that are **not masked**,
106
+ - 0 for tokens that are **masked**.
107
+
108
+ [What are attention masks?](../glossary#attention-mask)
109
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
110
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
111
+
112
+ - 1 indicates the head is **not masked**,
113
+ - 0 indicates the head is **masked**.
114
+
115
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
116
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
117
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
118
+
119
+ - 1 indicates the head is **not masked**,
120
+ - 0 indicates the head is **masked**.
121
+
122
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
123
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
124
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
125
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
126
+
127
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
128
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
129
+
130
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
131
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
132
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
133
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
134
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
135
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
136
+ than the model's internal embedding lookup matrix.
137
+ output_attentions (`bool`, *optional*):
138
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
139
+ returned tensors for more detail.
140
+ output_hidden_states (`bool`, *optional*):
141
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
142
+ for more detail.
143
+ return_dict (`bool`, *optional*):
144
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
145
+ """
146
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
147
+ output_hidden_states = (
148
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
149
+ )
150
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
151
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
152
+
153
+ # retrieve input_ids and inputs_embeds
154
+ if input_ids is not None and inputs_embeds is not None:
155
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
156
+ elif input_ids is not None:
157
+ input = input_ids
158
+ input_shape = input.size()
159
+ input_ids = input_ids.view(-1, input_shape[-1])
160
+ elif inputs_embeds is not None:
161
+ input_shape = inputs_embeds.size()[:-1]
162
+ input = inputs_embeds[:, :, -1]
163
+ else:
164
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
165
+
166
+ # past_key_values_length
167
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
168
+
169
+ if inputs_embeds is None:
170
+ inputs_embeds = self.embed_tokens(input_ids)
171
+
172
+ if self.config._attn_implementation == "flash_attention_2":
173
+ # 2d mask is passed through the layers
174
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
175
+ elif self.config._attn_implementation == "sdpa" and not output_attentions and cross_attn_head_mask is None:
176
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
177
+ # the manual implementation that requires a 4D causal mask in all cases.
178
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
179
+ attention_mask,
180
+ input_shape,
181
+ inputs_embeds,
182
+ past_key_values_length,
183
+ )
184
+ else:
185
+ # 4d mask is passed through the layers
186
+ attention_mask = _prepare_4d_causal_attention_mask(
187
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
188
+ )
189
+
190
+ # expand encoder attention mask
191
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
192
+ if self.config._attn_implementation == "flash_attention_2":
193
+ encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
194
+ elif self.config._attn_implementation == "sdpa" and cross_attn_head_mask is None and not output_attentions:
195
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
196
+ # the manual implementation that requires a 4D causal mask in all cases.
197
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
198
+ encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
199
+ encoder_attention_mask,
200
+ inputs_embeds.dtype,
201
+ tgt_len=input_shape[-1],
202
+ )
203
+ else:
204
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
205
+ encoder_attention_mask = _prepare_4d_attention_mask(
206
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
207
+ )
208
+ hidden_states = inputs_embeds
209
+ hidden_states = self.layernorm_embedding(hidden_states)
210
+
211
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
212
+
213
+ if self.gradient_checkpointing and self.training:
214
+ if use_cache:
215
+ logger.warning_once(
216
+ "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
217
+ )
218
+ use_cache = False
219
+
220
+ # decoder layers
221
+ all_hidden_states = () if output_hidden_states else None
222
+ all_self_attns = () if output_attentions else None
223
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
224
+ next_decoder_cache = () if use_cache else None
225
+
226
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
227
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
228
+ if attn_mask is not None:
229
+ if attn_mask.size()[0] != len(self.layers):
230
+ raise ValueError(
231
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
232
+ f" {attn_mask.size()[0]}."
233
+ )
234
+ for idx, decoder_layer in enumerate(self.layers):
235
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
236
+ if output_hidden_states:
237
+ all_hidden_states += (hidden_states,)
238
+ if self.training:
239
+ dropout_probability = torch.rand([])
240
+ if dropout_probability < self.layerdrop:
241
+ continue
242
+
243
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
244
+
245
+ if self.gradient_checkpointing and self.training:
246
+ layer_outputs = self._gradient_checkpointing_func(
247
+ decoder_layer.__call__,
248
+ hidden_states,
249
+ attention_mask,
250
+ encoder_hidden_states,
251
+ encoder_attention_mask,
252
+ head_mask[idx] if head_mask is not None else None,
253
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
254
+ None,
255
+ output_attentions,
256
+ use_cache,
257
+ )
258
+ else:
259
+ layer_outputs = decoder_layer(
260
+ hidden_states,
261
+ attention_mask=attention_mask,
262
+ encoder_hidden_states=encoder_hidden_states,
263
+ encoder_attention_mask=encoder_attention_mask,
264
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
265
+ cross_attn_layer_head_mask=(
266
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
267
+ ),
268
+ past_key_value=past_key_value,
269
+ output_attentions=output_attentions,
270
+ use_cache=use_cache,
271
+ )
272
+ hidden_states = layer_outputs[0]
273
+
274
+ if use_cache:
275
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
276
+
277
+ if output_attentions:
278
+ all_self_attns += (layer_outputs[1],)
279
+
280
+ if encoder_hidden_states is not None:
281
+ all_cross_attentions += (layer_outputs[2],)
282
+
283
+ hidden_states = self.layer_norm(hidden_states)
284
+
285
+ # add hidden states from the last decoder layer
286
+ if output_hidden_states:
287
+ all_hidden_states += (hidden_states,)
288
+
289
+ next_cache = next_decoder_cache if use_cache else None
290
+ if not return_dict:
291
+ return tuple(
292
+ v
293
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
294
+ if v is not None
295
+ )
296
+ return BaseModelOutputWithPastAndCrossAttentions(
297
+ last_hidden_state=hidden_states,
298
+ past_key_values=next_cache,
299
+ hidden_states=all_hidden_states,
300
+ attentions=all_self_attns,
301
+ cross_attentions=all_cross_attentions,
302
+ )
303
+
304
+
305
+ class RadioWithNeck(nn.Module):
306
+ """Vision encoder using RADIO model with custom neck."""
307
+
308
+ def __init__(self, config):
309
+ super().__init__()
310
+ self.config = config
311
+
312
+ self.model_encoder = AutoModel.from_config(config, trust_remote_code=True)
313
+
314
+ # Neck components
315
+ last_hidden_state = 1024
316
+ self.conv1 = nn.Conv1d(1280, last_hidden_state, 1)
317
+ self.layer_norm1 = nn.LayerNorm(last_hidden_state, eps=1e-06, elementwise_affine=True)
318
+ self.conv2 = nn.Conv2d(last_hidden_state, last_hidden_state, kernel_size=(1,4), stride=(1,4), padding=0, bias=False)
319
+ self.layer_norm2 = nn.LayerNorm(last_hidden_state, eps=1e-06, elementwise_affine=True)
320
+ self.sum_proj = nn.Linear(3840, last_hidden_state)
321
+ self.layer_norm3 = nn.LayerNorm(last_hidden_state, eps=1e-06, elementwise_affine=True)
322
+
323
+ def forward(self, pixel_values, output_attentions=False, output_hidden_states=False, return_dict=False, **kwargs):
324
+ radio_output = self.model_encoder(pixel_values)
325
+ summary, feature = radio_output
326
+
327
+
328
+ output = self.conv1(feature.permute(0,2,1)).permute(0,2,1)
329
+ output = self.layer_norm1(output)
330
+
331
+ patch_size = self.config.patch_size
332
+ output = rearrange(output, 'b (h w) d -> b d h w',
333
+ h=pixel_values.shape[-2] // patch_size,
334
+ w=pixel_values.shape[-1] // patch_size)
335
+
336
+ output = self.conv2(output)
337
+ output = rearrange(output, 'b d h w -> b (h w) d')
338
+ output = self.layer_norm2(output)
339
+ summary = self.layer_norm3(self.sum_proj(summary))
340
+ output = torch.cat((output, summary.unsqueeze(1)), dim=1)
341
+
342
+ return DonutSwinModelOutput(last_hidden_state=output)
343
+
344
+
345
+ class NemotronParsePreTrainedModel(PreTrainedModel):
346
+ """
347
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
348
+ """
349
+ config_class = NemotronParseConfig
350
+ base_model_prefix = "vision_encoder_decoder" # Use VisionEncoderDecoder prefix
351
+ main_input_name = "pixel_values"
352
+ supports_gradient_checkpointing = True
353
+ _no_split_modules = ["RadioWithNeck", "MBartDecoder"]
354
+ _skip_keys_device_placement = "past_key_values"
355
+
356
+ def _init_weights(self, module):
357
+ """Initialize the weights"""
358
+ if isinstance(module, nn.Linear):
359
+ module.weight.data.normal_(mean=0.0, std=self.config.decoder.init_std)
360
+ if module.bias is not None:
361
+ module.bias.data.zero_()
362
+ elif isinstance(module, nn.Embedding):
363
+ module.weight.data.normal_(mean=0.0, std=self.config.decoder.init_std)
364
+ if module.padding_idx is not None:
365
+ module.weight.data[module.padding_idx].zero_()
366
+
367
+ # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
368
+ class NemotronParseForConditionalGeneration(NemotronParsePreTrainedModel, GenerationMixin):
369
+ """
370
+ NemotronParse model for conditional generation tasks.
371
+
372
+ This model combines a RADIO-based vision encoder with an mBART-based text decoder.
373
+ """
374
+
375
+ def __init__(self, config: NemotronParseConfig):
376
+ super().__init__(config)
377
+
378
+ self.encoder = RadioWithNeck(config.encoder)
379
+ self.encoder.main_input_name = 'pixel_values'
380
+ self.encoder = self.encoder.to(config.encoder.torch_dtype)
381
+
382
+ self.decoder = NemotronParseDecoder(config.decoder)
383
+ self.decoder = self.decoder.to(config.decoder.torch_dtype)
384
+
385
+ self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)
386
+
387
+ # Extra heads
388
+ num_extra_heads = getattr(config, 'num_extra_heads', 0)
389
+ self.decoder.extra_heads = nn.ModuleList([
390
+ nn.Linear(config.decoder.d_model, config.decoder.d_model)
391
+ for _ in range(num_extra_heads)
392
+ ])
393
+ self.decoder.extra_proj = nn.ModuleList([
394
+ nn.Linear(config.decoder.d_model, config.decoder.d_model)
395
+ for _ in range(num_extra_heads)
396
+ ])
397
+
398
+ # Class token index for loss weighting
399
+ self.class_token_indx_start = getattr(config, 'class_token_start_idx', 50000)
400
+
401
+ self.post_init()
402
+
403
+ def get_encoder(self):
404
+ return self.encoder
405
+
406
+ def get_decoder(self):
407
+ return self.decoder
408
+
409
+ def get_output_embeddings(self):
410
+ return self.lm_head
411
+
412
+ def set_output_embeddings(self, new_embeddings):
413
+ self.lm_head = new_embeddings
414
+
415
+ def get_input_embeddings(self):
416
+ return self.decoder.get_input_embeddings()
417
+
418
+ def forward(
419
+ self,
420
+ pixel_values: Optional[torch.FloatTensor] = None,
421
+ decoder_input_ids: Optional[torch.LongTensor] = None,
422
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
423
+ encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
424
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
425
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
426
+ labels: Optional[torch.LongTensor] = None,
427
+ use_cache: Optional[bool] = None,
428
+ output_attentions: Optional[bool] = None,
429
+ output_hidden_states: Optional[bool] = None,
430
+ return_dict: Optional[bool] = None,
431
+ __subflavors__: Optional[str] = None,
432
+ __keys__: Optional[List[str]] = None,
433
+ return_sample_losses: Optional[torch.FloatTensor] = None,
434
+ **kwargs,
435
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
436
+
437
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
438
+
439
+ kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
440
+
441
+ kwargs_decoder = {
442
+ argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
443
+ }
444
+
445
+ if encoder_outputs is None:
446
+ if pixel_values is None:
447
+ raise ValueError("You have to specify pixel_values")
448
+
449
+ encoder_outputs = self.encoder(
450
+ pixel_values,
451
+ output_attentions=output_attentions,
452
+ output_hidden_states=output_hidden_states,
453
+ return_dict=return_dict,
454
+ **kwargs_encoder,
455
+ )
456
+
457
+ elif isinstance(encoder_outputs, tuple):
458
+ encoder_outputs = BaseModelOutput(*encoder_outputs)
459
+
460
+ encoder_hidden_states = encoder_outputs[0]
461
+
462
+ encoder_attention_mask = None
463
+
464
+ if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
465
+ decoder_input_ids = shift_tokens_right(
466
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
467
+ )
468
+
469
+ output_hidden_states = True
470
+
471
+ decoder_outputs = self.decoder(
472
+ input_ids=decoder_input_ids,
473
+ attention_mask=decoder_attention_mask,
474
+ encoder_hidden_states=encoder_hidden_states,
475
+ encoder_attention_mask=encoder_attention_mask,
476
+ inputs_embeds=decoder_inputs_embeds,
477
+ output_attentions=output_attentions,
478
+ output_hidden_states=output_hidden_states,
479
+ use_cache=use_cache,
480
+ past_key_values=past_key_values,
481
+ return_dict=return_dict,
482
+ **kwargs_decoder,
483
+ )
484
+ loss = None
485
+
486
+ if labels is not None:
487
+ main_logits = self.lm_head(decoder_outputs.last_hidden_state)
488
+ logits = [main_logits]
489
+ decoder_inputs_embeds = decoder_outputs.inputs_embeds
490
+ for iii, head in enumerate(self.decoder.extra_heads):
491
+
492
+ decoder_input_embeds_shift = self.decoder.extra_proj[iii](torch.cat((decoder_inputs_embeds[:,1:,:], torch.zeros_like(decoder_inputs_embeds[:,0,:].unsqueeze(1))), axis=1))
493
+ hidden = head(decoder_outputs['hidden_states'][-1] + decoder_input_embeds_shift)
494
+ logits.append(self.lm_head(hidden)) # Use main lm_head, NOT decoder.lm_head
495
+
496
+ logits = torch.stack(logits, dim=-2)
497
+ loss_fct = CrossEntropyLoss(reduction="none")
498
+
499
+ losses_per_head = []
500
+ tokens_per_head = []
501
+ for head_num in range(len(self.decoder.extra_heads)+1):
502
+ logits_head = logits[:,:,head_num,:]
503
+ labels_head = torch.cat(
504
+ (labels[:, head_num:], torch.full_like(labels[:, :head_num], -100)),
505
+ 1
506
+ )
507
+ loss_full = loss_fct(logits_head.permute(0, 2, 1), labels_head)
508
+ loss_full[labels_head >= self.class_token_indx_start] *= 10
509
+ losses_per_head.append(loss_full.sum(1))
510
+ tokens_per_head.append((labels_head != -100).sum(1))
511
+
512
+ losses_per_sample = torch.stack(losses_per_head, dim=1).sum(1)
513
+ tokens_per_sample = torch.stack(tokens_per_head, dim=1).sum(1)
514
+ loss = losses_per_sample.sum() / (tokens_per_sample.sum() + 1e-6)
515
+ if return_sample_losses is not None:
516
+ return_sample_losses.copy_(losses_per_sample.detach() / (tokens_per_sample + 1e-6))
517
+
518
+ if not return_dict:
519
+ if loss is not None:
520
+ return (loss,) + decoder_outputs + encoder_outputs
521
+ else:
522
+ return decoder_outputs + encoder_outputs
523
+ output_logits = self.lm_head(decoder_outputs.last_hidden_state)
524
+ return Seq2SeqLMOutput(
525
+ loss=loss,
526
+ logits=output_logits,
527
+ past_key_values=decoder_outputs.past_key_values,
528
+ decoder_hidden_states=decoder_outputs.hidden_states,
529
+ decoder_attentions=decoder_outputs.attentions,
530
+ cross_attentions=decoder_outputs.cross_attentions,
531
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
532
+ encoder_hidden_states=encoder_outputs.hidden_states,
533
+ encoder_attentions=encoder_outputs.attentions,
534
+ )
535
+
536
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
537
+ return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
538
+
539
+
540
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None):
541
+ """Resize token embeddings and update lm_head accordingly."""
542
+ # Resize decoder embeddings
543
+ new_embeddings = self.decoder.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
544
+
545
+ # Update lm_head to match new vocab size
546
+ if new_embeddings is not None:
547
+ old_vocab_size, hidden_size = self.lm_head.weight.shape
548
+ new_vocab_size = new_embeddings.num_embeddings
549
+
550
+ if old_vocab_size != new_vocab_size:
551
+ print(f"Resizing lm_head from {old_vocab_size} to {new_vocab_size} tokens")
552
+ new_lm_head = nn.Linear(hidden_size, new_vocab_size, bias=False, device=self.lm_head.weight.device, dtype=self.lm_head.weight.dtype)
553
+
554
+ # Copy old weights to new lm_head
555
+ num_tokens_to_copy = min(old_vocab_size, new_vocab_size)
556
+ new_lm_head.weight.data[:num_tokens_to_copy] = self.lm_head.weight.data[:num_tokens_to_copy]
557
+
558
+ # Update reference
559
+ self.lm_head = new_lm_head
560
+ # DO NOT update decoder.lm_head - keep them separate
561
+
562
+ return new_embeddings
563
+
564
+ def _reorder_cache(self, past_key_values, beam_idx):
565
+ # apply decoder cache reordering here
566
+ return self.decoder._reorder_cache(past_key_values, beam_idx)
567
+
568
+
569
+ # Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
570
+ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
571
+ """
572
+ Shift input ids one token to the right.
573
+ """
574
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
575
+ shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
576
+ if decoder_start_token_id is None:
577
+ raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
578
+ shifted_input_ids[:, 0] = decoder_start_token_id
579
+
580
+ if pad_token_id is None:
581
+ raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
582
+ # replace possible -100 values in labels by `pad_token_id`
583
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
584
+
585
+ return shifted_input_ids
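A small worked example of the label shifting performed by `shift_tokens_right` above (illustrative token ids; assumes the module's dependencies are installed so it can be imported):

# -100 label positions become pad_token_id (1) and decoder_start_token_id (2) is prepended.
import torch
from hf_nemotron_parse_modeling import shift_tokens_right

labels = torch.tensor([[5, 6, -100, -100]])
print(shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2))
# tensor([[2, 5, 6, 1]])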
hf_nemotron_parse_processor.py ADDED
@@ -0,0 +1,376 @@
1
+ import numpy as np
2
+ from PIL import Image
3
+ from typing import List, Optional, Union, Dict, Any
4
+ import torch
5
+ from torchvision import transforms as T
6
+ import albumentations as A
7
+ import cv2
8
+ import json
9
+
10
+ from transformers import ProcessorMixin, BaseImageProcessor, ImageProcessingMixin
11
+ from transformers.tokenization_utils_base import BatchEncoding
12
+ from transformers.image_utils import ChannelDimension, ImageInput, PILImageResampling, infer_channel_dimension_format
13
+ from transformers.utils import TensorType
14
+
15
+
16
+ class NemotronParseImageProcessor(BaseImageProcessor, ImageProcessingMixin):
17
+ """
18
+ Image processor for NemotronParse model.
19
+
20
+ This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
21
+ """
22
+
23
+ model_input_names = ["pixel_values"]
24
+
25
+ def __init__(
26
+ self,
27
+ final_size: tuple = (2048, 1648),
28
+ **kwargs,
29
+ ):
30
+ clean_kwargs = {}
31
+ for k, v in kwargs.items():
32
+ if not k.startswith('_') and k not in ['transform', 'torch_transform']:
33
+ clean_kwargs[k] = v
34
+
35
+ if 'size' in clean_kwargs:
36
+ size_config = clean_kwargs.pop('size')
37
+ if isinstance(size_config, dict):
38
+ if 'longest_edge' in size_config:
39
+ longest_edge = size_config['longest_edge']
40
+ if isinstance(longest_edge, (list, tuple)):
41
+ final_size = tuple(int(x) for x in longest_edge)
42
+ else:
43
+ final_size = (int(longest_edge), int(longest_edge))
44
+ elif 'height' in size_config and 'width' in size_config:
45
+ final_size = (int(size_config['height']), int(size_config['width']))
46
+
47
+ super().__init__(**clean_kwargs)
48
+
49
+ if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
50
+ self.final_size = (int(final_size[0]), int(final_size[1]))
51
+ elif isinstance(final_size, (int, float)):
52
+ self.final_size = (int(final_size), int(final_size))
53
+ else:
54
+ self.final_size = (2048, 1648) # Default fallback
55
+
56
+ self._create_transforms()
57
+
58
+ def _create_transforms(self):
59
+ """Create transform objects (not serialized to JSON)."""
60
+ if isinstance(self.final_size, (list, tuple)):
61
+ self.target_height, self.target_width = int(self.final_size[0]), int(self.final_size[1])
62
+ else:
63
+ self.target_height = self.target_width = int(self.final_size)
64
+
65
+ self.transform = A.Compose([
66
+ A.PadIfNeeded(
67
+ min_height=self.target_height,
68
+ min_width=self.target_width,
69
+ border_mode=cv2.BORDER_CONSTANT,
70
+ value=[255, 255, 255],
71
+ p=1.0
72
+ ),
73
+ ])
74
+
75
+ self.torch_transform = T.Compose([
76
+ T.ToTensor(),
77
+ # Note: Normalization is done within RADIO model
78
+ ])
79
+
80
+ def to_dict(self):
81
+ """Override to exclude non-serializable transforms."""
82
+ output = super().to_dict()
83
+ output.pop('transform', None)
84
+ output.pop('torch_transform', None)
85
+ return output
86
+
87
+ @classmethod
88
+ def from_dict(cls, config_dict: dict, **kwargs):
89
+ """Override to recreate transforms after loading."""
90
+ config_dict = config_dict.copy()
91
+ config_dict.pop('transform', None)
92
+ config_dict.pop('torch_transform', None)
93
+
94
+ # Clean any problematic entries
95
+ for key in list(config_dict.keys()):
96
+ if key.startswith('_') or config_dict[key] is None:
97
+ config_dict.pop(key, None)
98
+
99
+ # Ensure numeric types are correct
100
+ if 'final_size' in config_dict:
101
+ final_size = config_dict['final_size']
102
+ if isinstance(final_size, (list, tuple)):
103
+ config_dict['final_size'] = tuple(int(x) for x in final_size)
104
+
105
+ try:
106
+ return cls(**config_dict, **kwargs)
107
+ except Exception as e:
108
+ print(f"Warning: Error in from_dict: {e}")
109
+ print("Using default parameters...")
110
+ return cls(**kwargs)
111
+
112
+ def save_pretrained(self, save_directory, **kwargs):
113
+ """Save image processor configuration."""
114
+ import os
115
+ import json
116
+
117
+ os.makedirs(save_directory, exist_ok=True)
118
+
119
+ # Save preprocessor config in standard HuggingFace format
120
+ config = {
121
+ "feature_extractor_type": "NemotronParseImageProcessor",
122
+ "image_processor_type": "NemotronParseImageProcessor",
123
+ "processor_class": "NemotronParseImageProcessor",
124
+ "size": {
125
+ "height": self.final_size[0],
126
+ "width": self.final_size[1],
127
+ "longest_edge": self.final_size
128
+ },
129
+ "final_size": self.final_size,
130
+ }
131
+
132
+ config_path = os.path.join(save_directory, "preprocessor_config.json")
133
+ with open(config_path, 'w') as f:
134
+ json.dump(config, f, indent=2)
135
+
136
+ def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
137
+ """Resize image maintaining aspect ratio (exact replica of original LongestMaxSizeHW)."""
138
+ height, width = image.shape[:2]
139
+ max_size_height = self.target_height
140
+ max_size_width = self.target_width
141
+
142
+ # Original LongestMaxSizeHW algorithm from custom_augmentations.py
143
+ aspect_ratio = width / height
144
+ new_height = height
145
+ new_width = width
146
+
147
+ if height > max_size_height:
148
+ new_height = max_size_height
149
+ new_width = int(new_height * aspect_ratio)
150
+
151
+ if new_width > max_size_width:
152
+ new_width = max_size_width
153
+ new_height = int(new_width / aspect_ratio)
154
+
155
+ return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
156
+
157
+ def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
158
+ """Pad image to target size with white padding (matches A.PadIfNeeded behavior)."""
159
+ h, w = image.shape[:2]
160
+ min_height, min_width = self.target_height, self.target_width
161
+
162
+ pad_h = max(0, min_height - h)
163
+ pad_w = max(0, min_width - w)
164
+
165
+ if pad_h == 0 and pad_w == 0:
166
+ return image
167
+
168
+ if len(image.shape) == 3:
169
+ padded = np.pad(
170
+ image,
171
+ ((0, pad_h), (0, pad_w), (0, 0)),
172
+ mode='constant',
173
+ constant_values=255
174
+ )
175
+ else:
176
+ padded = np.pad(
177
+ image,
178
+ ((0, pad_h), (0, pad_w)),
179
+ mode='constant',
180
+ constant_values=255
181
+ )
182
+
183
+ return padded
184
+
185
+ def preprocess(
186
+ self,
187
+ images: ImageInput,
188
+ return_tensors: Optional[Union[str, TensorType]] = None,
189
+ **kwargs,
190
+ ) -> Dict[str, torch.Tensor]:
191
+ """
192
+ Preprocess an image or batch of images for the NemotronParse model.
193
+
194
+ Args:
195
+ images: Input image(s)
196
+ return_tensors: Type of tensors to return
197
+ """
198
+
199
+ # Ensure images is a list
200
+ if not isinstance(images, list):
201
+ images = [images]
202
+
203
+ # Convert PIL images to numpy arrays if needed
204
+ processed_images = []
205
+ for image in images:
206
+ if isinstance(image, Image.Image):
207
+ image = np.asarray(image)
208
+ processed_images.append(image)
209
+
210
+ # Apply NemotronParse-specific transforms
211
+ pixel_values = []
212
+ for image in processed_images:
213
+ processed_image = self._resize_with_aspect_ratio(image)
214
+
215
+ if self.transform is not None:
216
+ transformed = self.transform(image=processed_image)
217
+ processed_image = transformed["image"]
218
+ else:
219
+ # Fallback: just pad to target size
220
+ processed_image = self._pad_to_size(processed_image)
221
+
222
+ pixel_values_tensor = self.torch_transform(processed_image)
223
+
224
+ if pixel_values_tensor.shape[0] == 1:
225
+ pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
226
+
227
+ pixel_values.append(pixel_values_tensor)
228
+
229
+ pixel_values = torch.stack(pixel_values)
230
+
231
+ data = {"pixel_values": pixel_values}
232
+
233
+ if return_tensors is not None:
234
+ data = self._convert_output_format(data, return_tensors)
235
+
236
+ return data
237
+
238
+ def _convert_output_format(self, data: Dict[str, torch.Tensor], return_tensors: Union[str, TensorType]) -> Dict:
239
+ """Convert output format based on return_tensors parameter."""
240
+ if return_tensors == "pt" or return_tensors == TensorType.PYTORCH:
241
+ return data
242
+ elif return_tensors == "np" or return_tensors == TensorType.NUMPY:
243
+ return {k: v.numpy() for k, v in data.items()}
244
+ else:
245
+ return data
246
+
247
+ def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> Dict[str, torch.Tensor]:
248
+ """Process images for the model (backward compatibility)."""
249
+ return self.preprocess(images, **kwargs)
250
+
251
+
252
+ class NemotronParseProcessor(ProcessorMixin):
253
+
254
+ attributes = ["image_processor", "tokenizer"]
255
+ image_processor_class = "NemotronParseImageProcessor"
256
+ tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
257
+
258
+ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
259
+ if image_processor is None:
260
+ image_processor = NemotronParseImageProcessor(**kwargs)
261
+
262
+ super().__init__(image_processor, tokenizer)
263
+
264
+
265
+ def __call__(
266
+ self,
267
+ images: Union[Image.Image, List[Image.Image]] = None,
268
+ text: Union[str, List[str]] = None,
269
+ add_special_tokens: bool = True,
270
+ padding: Union[bool, str] = False,
271
+ truncation: Union[bool, str] = False,
272
+ max_length: Optional[int] = None,
273
+ stride: int = 0,
274
+ pad_to_multiple_of: Optional[int] = None,
275
+ return_attention_mask: Optional[bool] = None,
276
+ return_overflowing_tokens: bool = False,
277
+ return_special_tokens_mask: bool = False,
278
+ return_offsets_mapping: bool = False,
279
+ return_token_type_ids: bool = False,
280
+ return_length: bool = False,
281
+ verbose: bool = True,
282
+ return_tensors: Optional[Union[str, "TensorType"]] = None,
283
+ **kwargs
284
+ ) -> BatchEncoding:
285
+ """
286
+ Main method to prepare for the model one or several text(s) and image(s).
287
+ """
288
+
289
+ # Process images
290
+ if images is not None:
291
+ image_inputs = self.image_processor(images, **kwargs)
292
+ else:
293
+ image_inputs = {}
294
+
295
+ # Process text
296
+ if text is not None:
297
+ text_inputs = self.tokenizer(
298
+ text,
299
+ add_special_tokens=add_special_tokens,
300
+ padding=padding,
301
+ truncation=truncation,
302
+ max_length=max_length,
303
+ stride=stride,
304
+ pad_to_multiple_of=pad_to_multiple_of,
305
+ return_attention_mask=return_attention_mask,
306
+ return_overflowing_tokens=return_overflowing_tokens,
307
+ return_special_tokens_mask=return_special_tokens_mask,
308
+ return_offsets_mapping=return_offsets_mapping,
309
+ return_token_type_ids=return_token_type_ids,
310
+ return_length=return_length,
311
+ verbose=verbose,
312
+ return_tensors=return_tensors,
313
+ **kwargs,
314
+ )
315
+ else:
316
+ text_inputs = {}
317
+
318
+ # Combine inputs
319
+ return BatchEncoding({**image_inputs, **text_inputs})
320
+
321
+ def decode(self, *args, **kwargs):
322
+ """Decode token ids to strings."""
323
+ return self.tokenizer.decode(*args, **kwargs)
324
+
325
+ def batch_decode(self, *args, **kwargs):
326
+ """Batch decode token ids to strings."""
327
+ return self.tokenizer.batch_decode(*args, **kwargs)
328
+
329
+ def post_process_generation(self, sequences, fix_markdown=False):
330
+ """Post-process generated sequences."""
331
+ if hasattr(self.tokenizer, 'post_process_generation'):
332
+ return self.tokenizer.post_process_generation(sequences, fix_markdown=fix_markdown)
333
+ else:
334
+ # Fallback processing
335
+ if isinstance(sequences, str):
336
+ sequences = [sequences]
337
+
338
+ processed = []
339
+ for seq in sequences:
340
+ # Basic cleaning
341
+ seq = seq.replace('<s>', '').replace('</s>', '').strip()
342
+ processed.append(seq)
343
+
344
+ return processed[0] if len(processed) == 1 else processed
345
+
346
+ @classmethod
347
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
348
+ """
349
+ Load processor from pretrained model.
350
+
351
+ This method is compatible with AutoProcessor.from_pretrained().
352
+ """
353
+ # Use the parent class's from_pretrained method which handles auto-loading
354
+ return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
355
+
356
+ def save_pretrained(self, save_directory, **kwargs):
357
+ """
358
+ Save processor to directory.
359
+
360
+ This method is compatible with AutoProcessor/AutoImageProcessor loading.
361
+ """
362
+ import os
363
+ os.makedirs(save_directory, exist_ok=True)
364
+
365
+ # Save tokenizer with proper configuration for AutoTokenizer
366
+ print("Saving tokenizer for AutoTokenizer compatibility...")
367
+ self.tokenizer.save_pretrained(save_directory, **kwargs)
368
+
369
+ # Save image processor
370
+ print("Saving image processor...")
371
+ self.image_processor.save_pretrained(save_directory, **kwargs)
372
+
373
+ # Use the parent class's save_pretrained method for processor config
374
+ super().save_pretrained(save_directory, **kwargs)
375
+ print(f"NemotronParseProcessor saved to {save_directory}")
376
+ print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81cae06dbfa407fce43e8624cc25a167340eff9e710492e890039605e2ac2570
3
+ size 3827116504
preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "feature_extractor_type": "NemotronParseImageProcessor",
3
+ "image_processor_type": "NemotronParseImageProcessor",
4
+ "processor_class": "NemotronParseProcessor",
5
+ "do_normalize": false,
6
+ "do_rescale": true,
7
+ "rescale_factor": 0.00392156862745098,
8
+ "size": {
9
+ "height": 2048,
10
+ "width": 1648,
11
+ "longest_edge": [
12
+ 2048,
13
+ 1648
14
+ ]
15
+ },
16
+ "final_size": [
17
+ 2048,
18
+ 1648
19
+ ]
20
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<predict_classes>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "bos_token": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "eos_token": {
19
+ "content": "</s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<pad>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff