|
|
import torch
from PIL import Image, ImageDraw
from transformers import (
    AutoConfig,
    AutoImageProcessor,
    AutoModel,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
)

from postprocessing import (
    extract_classes_bboxes,
    postprocess_text,
    transform_bbox_to_original,
)
|
|
|
|
|
|
|
|
# Checkpoint identifier passed to every `from_pretrained` call in this script.
model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
# Device the model and the processed inputs are moved to.
device = "cuda:0"

# Load the weights as bfloat16. `trust_remote_code=True` allows transformers
# to execute the model code that ships with the checkpoint.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)
model.eval()  # switch the module to evaluation mode before generating

tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
|
|
|
|
|
|
|
# Prompt built from special tokens; presumably requests bounding boxes,
# classes and markdown output -- confirm against the model documentation.
task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"

# Placeholder path: point this at the image that should be parsed.
image = Image.open("path/to/your/image.jpg")
|
|
|
|
|
|
|
|
# Run the processor on the image and prompt, then move the resulting tensors
# to the configured `device`.
inputs = processor(images=[image], text=task_prompt, return_tensors="pt").to(device)

# Token ids of the bare prompt (no additional special tokens are inserted).
# Placed with `.to(device)` rather than `.cuda()` so the tensor follows the
# configured `device` instead of the current default CUDA device.
# NOTE(review): `prompt_ids` is not consumed anywhere in the visible code.
prompt_ids = processor.tokenizer.encode(
    task_prompt,
    return_tensors="pt",
    add_special_tokens=False,
).to(device)
|
|
|
|
|
|
|
|
# Generation settings loaded from the same checkpoint location as the model.
generation_config = GenerationConfig.from_pretrained(
    model_path,
    trust_remote_code=True,
)

# Generate output token ids for the processed image/prompt inputs.
outputs = model.generate(generation_config=generation_config, **inputs)
|
|
|
|
|
|
|
|
# Decode the generated ids to text with special tokens dropped, and keep the
# first decoded sequence (a single image was submitted).
generated_text = processor.batch_decode(
    outputs,
    skip_special_tokens=True,
)[0]

# Split the decoded string into class labels, bounding boxes and texts.
classes, bboxes, texts = extract_classes_bboxes(generated_text)

# Pass each box through the helper together with the input image dimensions;
# `image.size` is (width, height), matching the helper's positional order.
# NOTE(review): presumably this rescales boxes to image pixels -- confirm.
bboxes = [transform_bbox_to_original(box, *image.size) for box in bboxes]
|
|
|
|
|
|
|
|
# Output options forwarded unchanged to `postprocess_text`.
table_format = 'latex'
text_format = 'markdown'
blank_text_in_figures = False

# Post-process every extracted text together with its class label.
texts = [
    postprocess_text(
        raw_text,
        cls=label,
        table_format=table_format,
        text_format=text_format,
        blank_text_in_figures=blank_text_in_figures,
    )
    for raw_text, label in zip(texts, classes)
]
|
|
|
|
|
# Print each detected class followed by its post-processed text. The box is
# iterated in lockstep but not printed.
for label, _box, content in zip(classes, bboxes, texts):
    print(label, ': ', content)
|
|
|
|
|
|
|
|
# Outline every bounding box in red directly on the loaded image object.
# NOTE(review): the annotated image is not saved or shown in the visible code.
draw = ImageDraw.Draw(image)
for bbox in bboxes:
    # Only the first four entries of each box are used as rectangle corners.
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    draw.rectangle((x0, y0, x1, y1), outline="red")
|
|
|
|
|
|