from typing import List, Optional

from transformers import PretrainedConfig
from transformers.dynamic_module_utils import get_class_from_dynamic_module


class NemotronParseTextConfig(PretrainedConfig):
    """
    Configuration class for the NemotronParse text decoder (mBART-based).
    """

    model_type = "nemotron_parse_text"

    def __init__(
        self,
        vocab_size: int = 250027,
        d_model: int = 1024,
        encoder_layers: int = 12,
        decoder_layers: int = 12,
        encoder_attention_heads: int = 16,
        decoder_attention_heads: int = 16,
        decoder_ffn_dim: int = 4096,
        encoder_ffn_dim: int = 4096,
        activation_function: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.0,
        activation_dropout: float = 0.0,
        classifier_dropout: float = 0.0,
        init_std: float = 0.02,
        encoder_layerdrop: float = 0.0,
        decoder_layerdrop: float = 0.0,
        scale_embedding: bool = False,
        use_cache: bool = True,
        num_labels: int = 3,
        forced_eos_token_id: int = 2,
        add_cross_attention: bool = True,  # enable cross-attention for the vision-encoder-decoder setup
        is_decoder: bool = True,  # this config describes a decoder
        max_sequence_length: int = 9000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_ffn_dim = encoder_ffn_dim
        self.activation_function = activation_function
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.classifier_dropout = classifier_dropout
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.scale_embedding = scale_embedding
        self.use_cache = use_cache
        self.num_labels = num_labels
        self.add_cross_attention = add_cross_attention
        self.is_decoder = is_decoder
        # Expose hidden_size as an alias for d_model (for compatibility with
        # code that expects the common `hidden_size` attribute)
        self.hidden_size = self.d_model
        self.forced_eos_token_id = forced_eos_token_id
        self.num_attention_heads = self.encoder_attention_heads
        self.max_sequence_length = max_sequence_length
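
# Illustrative usage (a sketch, not part of the module): the text config can be
# built directly; the values shown are the defaults above. Note that the
# constructor keeps `hidden_size` in sync with `d_model`.
#
#     text_config = NemotronParseTextConfig(d_model=1024, decoder_layers=12)
#     assert text_config.hidden_size == text_config.d_model == 1024
#     print(text_config.to_json_string())
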
class NemotronParseConfig(PretrainedConfig):
    """
    Configuration class for the NemotronParse model.

    This configuration class stores the configuration of a
    [`NemotronParseForConditionalGeneration`] model. It is used to instantiate
    a NemotronParse model according to the specified arguments, defining the
    vision encoder and text decoder configs.
    """

    model_type = "nemotron_parse"
    is_composition = True
    max_sequence_length = 9000
    def __init__(
        self,
        encoder: Optional[dict] = None,
        decoder: Optional[dict] = None,
        tie_word_embeddings: bool = False,
        decoder_start_token_id: int = 2,
        pad_token_id: int = 1,
        eos_token_id: int = 2,
        bos_token_id: int = 0,
        image_size: Optional[List[int]] = None,
        is_encoder_decoder: bool = True,
        max_sequence_length: int = 9000,
        **kwargs,
    ):
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            decoder_start_token_id=decoder_start_token_id,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            max_sequence_length=max_sequence_length,
            **kwargs,
        )
        if decoder is None:
            decoder = {}
        if encoder is not None:
            # The vision config class is resolved dynamically from the
            # `auto_map` entry, which has the form "repo--module.ClassName";
            # splitting on "--" and reversing yields the argument order
            # expected by `get_class_from_dynamic_module(class_reference, repo)`.
            if "auto_map" not in encoder or "AutoConfig" not in encoder["auto_map"]:
                raise ValueError("`encoder` must provide an `auto_map` entry with an `AutoConfig` reference")
            vision_auto_config = get_class_from_dynamic_module(*encoder["auto_map"]["AutoConfig"].split("--")[::-1])
            self.encoder = vision_auto_config(**encoder)
        else:
            self.encoder = PretrainedConfig()
        # Propagate the parent's max sequence length to the decoder config
        decoder["max_sequence_length"] = max_sequence_length
        self.decoder = NemotronParseTextConfig(**decoder)
        # Avoid a mutable default argument: fall back to the default image size
        self.image_size = image_size if image_size is not None else [2048, 1648]
        # Mirror the decoder's vocabulary size at the top level
        self.vocab_size = self.decoder.vocab_size
        self.is_encoder_decoder = is_encoder_decoder
        self.max_sequence_length = max_sequence_length
    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default
        [`~PretrainedConfig.to_dict`] so the nested encoder and decoder configs
        are serialized as well.
        """
        output = super().to_dict()
        output["encoder"] = self.encoder.to_dict()
        output["decoder"] = self.decoder.to_dict()
        output["model_type"] = self.model_type
        output["is_encoder_decoder"] = self.is_encoder_decoder
        return output
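
# Illustrative usage (a sketch, not part of the module): building the composite
# config with a decoder override and no vision encoder dict. When `encoder` is
# supplied, it must carry an `auto_map["AutoConfig"]` reference such as
# "some-org/some-repo--configuration_vision.VisionConfig" (hypothetical names)
# so the vision config class can be loaded dynamically.
#
#     config = NemotronParseConfig(decoder={"decoder_layers": 10})
#     assert config.decoder.decoder_layers == 10
#     assert config.decoder.max_sequence_length == 9000  # propagated from the parent
#     print(config.to_dict()["model_type"])  # "nemotron_parse"
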