from typing import List, Optional

from transformers import PretrainedConfig
from transformers.dynamic_module_utils import get_class_from_dynamic_module


class NemotronParseTextConfig(PretrainedConfig):
""" |
|
|
Configuration class for NemotronParse text decoder (mBART-based). |
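
    Example (illustrative sketch; the defaults below are this module's defaults, not those of any particular
    released checkpoint):

        >>> text_config = NemotronParseTextConfig()
        >>> text_config.hidden_size == text_config.d_model
        True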
    """

    model_type = "nemotron_parse_text"

    def __init__(
        self,
        vocab_size: int = 250027,
        d_model: int = 1024,
        encoder_layers: int = 12,
        decoder_layers: int = 12,
        encoder_attention_heads: int = 16,
        decoder_attention_heads: int = 16,
        decoder_ffn_dim: int = 4096,
        encoder_ffn_dim: int = 4096,
        activation_function: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.0,
        activation_dropout: float = 0.0,
        classifier_dropout: float = 0.0,
        init_std: float = 0.02,
        encoder_layerdrop: float = 0.0,
        decoder_layerdrop: float = 0.0,
        scale_embedding: bool = False,
        use_cache: bool = True,
        num_labels: int = 3,
        forced_eos_token_id: int = 2,
        add_cross_attention: bool = True,
        is_decoder: bool = True,
        max_sequence_length: int = 9000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_ffn_dim = encoder_ffn_dim
        self.activation_function = activation_function
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.classifier_dropout = classifier_dropout
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.scale_embedding = scale_embedding
        self.use_cache = use_cache
        self.num_labels = num_labels
        self.forced_eos_token_id = forced_eos_token_id
        self.add_cross_attention = add_cross_attention
        self.is_decoder = is_decoder
        self.max_sequence_length = max_sequence_length

        # Aliases under the generic attribute names used elsewhere in `transformers`.
        self.hidden_size = self.d_model
        self.num_attention_heads = self.encoder_attention_heads


class NemotronParseConfig(PretrainedConfig):
    """
    Configuration class for the NemotronParse model.

    This configuration class is used to store the configuration of a [`NemotronParseForConditionalGeneration`]
    model. It is used to instantiate a NemotronParse model according to the specified arguments, defining the
    vision (encoder) and text (decoder) model configurations.
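
    Example (illustrative sketch; when no `encoder` dict is given the encoder config falls back to a plain
    [`PretrainedConfig`], and the decoder defaults are those defined in this module):

        >>> config = NemotronParseConfig()
        >>> config.decoder.d_model
        1024
        >>> config.to_dict()["model_type"]
        'nemotron_parse'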
    """

    model_type = "nemotron_parse"
    is_composition = True
    max_sequence_length = 9000

    def __init__(
        self,
        encoder: Optional[dict] = None,
        decoder: Optional[dict] = None,
        tie_word_embeddings: bool = False,
        decoder_start_token_id: int = 2,
        pad_token_id: int = 1,
        eos_token_id: int = 2,
        bos_token_id: int = 0,
        image_size: List[int] = [2048, 1648],
        is_encoder_decoder: bool = True,
        max_sequence_length: int = 9000,
        **kwargs,
    ):
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            decoder_start_token_id=decoder_start_token_id,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            max_sequence_length=max_sequence_length,
            **kwargs,
        )

        if decoder is None:
            decoder = {}

        if encoder is not None:
            # Resolve the vision encoder config class from the checkpoint's `auto_map` entry, expected in the
            # "<repo>--<module>.<ClassName>" form, via the dynamic-module machinery.
            assert "auto_map" in encoder and "AutoConfig" in encoder["auto_map"]
            vision_auto_config = get_class_from_dynamic_module(*encoder["auto_map"]["AutoConfig"].split("--")[::-1])
            self.encoder = vision_auto_config(**encoder)
        else:
            self.encoder = PretrainedConfig()

        decoder["max_sequence_length"] = max_sequence_length
        self.decoder = NemotronParseTextConfig(**decoder)
        self.image_size = image_size

        self.vocab_size = self.decoder.vocab_size
        self.is_encoder_decoder = is_encoder_decoder
        self.max_sequence_length = max_sequence_length

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`] so
        that the nested encoder and decoder configs are serialized as well.
        """
        output = super().to_dict()
        output["encoder"] = self.encoder.to_dict()
        output["decoder"] = self.decoder.to_dict()
        output["model_type"] = self.model_type
        output["is_encoder_decoder"] = self.is_encoder_decoder
        return output