import re
from latex2html import convert_html_tables_to_markdown, latex_table_to_html
# One detected element: an opening corner, the element text, a closing corner
# and a class tag. The six groups are (x1, y1, text, x2, y2, class).
# NOTE(review): the pattern in this copy of the file was corrupted (unbalanced
# parenthesis, markup apparently stripped). It was rebuilt from the six values
# the loop unpacks and the surviving "]+)>" tail; the tag names
# <x_..>/<y_..>/<class_..> are an assumption - confirm against the model output.
_RE_EXTRACT_CLASS_BBOX = re.compile(
    r'<x_(\d+(?:\.\d+)?)><y_(\d+(?:\.\d+)?)>(.*?)'
    r'<x_(\d+(?:\.\d+)?)><y_(\d+(?:\.\d+)?)><class_([^>]+)>',
    re.DOTALL,
)


def extract_classes_bboxes(text: str):
    """Split tagged model output into parallel class, bbox and text lists.

    Args:
        text: Raw output string containing zero or more tagged elements.

    Returns:
        A tuple ``(classes, bboxes, texts)`` of equal-length lists. Each bbox
        is ``(x1, y1, x2, y2)`` as floats, in the order the values appear in
        the tags. Input with no tagged elements yields three empty lists.

    Raises:
        AssertionError: if any element is tagged with class "Page-number".
    """
    classes = []
    bboxes = []
    texts = []
    for match in _RE_EXTRACT_CLASS_BBOX.finditer(text):
        # `content` rather than `text`, so the parameter is not shadowed.
        x1, y1, content, x2, y2, cls = match.groups()
        # TODO: Remove when fixed
        classes.append("Formula" if cls == "Inline-formula" else cls)
        bboxes.append((float(x1), float(y1), float(x2), float(y2)))
        texts.append(content)
    assert "Page-number" not in classes, "unexpected 'Page-number' class in output"
    return classes, bboxes, texts
def transform_bbox_to_original(bbox, original_width, original_height, target_w=1648, target_h=2048):
    """Map a box from the padded target canvas back to original-image pixels.

    The resize applied before inference is re-derived here: the image is
    shrunk (never enlarged) to fit inside ``target_w`` x ``target_h`` while
    keeping its aspect ratio, then padded equally on both sides of each axis.

    Args:
        bbox: Indexable ``(x1, y1, x2, y2)``; each value is multiplied by the
            canvas size, i.e. treated as a fraction of the padded canvas.
        original_width: Width of the source image in pixels.
        original_height: Height of the source image in pixels.
        target_w: Width of the padded canvas.
        target_h: Height of the padded canvas.

    Returns:
        ``(left, top, right, bottom)`` in original-image pixel coordinates.
    """
    ratio = original_width / original_height

    # Fit to the height limit first, then to the width limit; the int()
    # truncation mirrors the resize step this function undoes.
    fitted_w, fitted_h = original_width, original_height
    if fitted_h > target_h:
        fitted_h = target_h
        fitted_w = int(fitted_h * ratio)
    if fitted_w > target_w:
        fitted_w = target_w
        fitted_h = int(fitted_w / ratio)

    # Padding on the left/top edge of the canvas.
    offset_x = (target_w - fitted_w) // 2
    offset_y = (target_h - fitted_h) // 2

    def to_original(fraction, canvas, offset, original, fitted):
        # Canvas fraction -> canvas pixels -> remove padding -> scale by the
        # actual resized dimensions rather than a single scale factor.
        return ((fraction * canvas) - offset) * original / fitted

    left = to_original(bbox[0], target_w, offset_x, original_width, fitted_w)
    right = to_original(bbox[2], target_w, offset_x, original_width, fitted_w)
    top = to_original(bbox[1], target_h, offset_y, original_height, fitted_h)
    bottom = to_original(bbox[3], target_h, offset_y, original_height, fitted_h)
    return left, top, right, bottom
def postprocess_text(text, cls = 'Text', text_format='markdown', table_format='latex', blank_text_in_figures=False):
    """Convert one element's text to the requested output format.

    Args:
        text: The element's text.
        cls: The element's class label; 'Table' and 'Picture' get special
            handling.
        text_format: 'markdown' leaves non-table text unchanged; 'plain'
            passes it through ``convert_mmd_to_plain_text_ours``.
        table_format: For 'Table' elements only. 'latex' leaves the text
            unchanged, 'HTML' applies ``latex_table_to_html``, and 'markdown'
            also applies ``convert_html_tables_to_markdown`` to that result.
        blank_text_in_figures: When truthy, 'Picture' elements return ''.

    Returns:
        The converted text.

    Raises:
        AssertionError: if ``text_format`` or ``table_format`` is unsupported.
    """
    assert text_format in ('markdown', 'plain'), 'Unknown text format. Supported: markdown | plain'
    assert table_format in ('latex', 'HTML', 'markdown'), 'Unknown table format. Supported: latex | HTML | markdown'

    if cls != 'Table':
        wants_plain = text_format == 'plain'
        converted = convert_mmd_to_plain_text_ours(text) if wants_plain else text
    elif table_format == 'HTML':
        converted = latex_table_to_html(text)
    elif table_format == 'markdown':
        as_html = latex_table_to_html(text)
        converted = convert_html_tables_to_markdown(as_html)
    else:
        # table_format == 'latex': nothing to convert.
        converted = text

    # Blanking happens after conversion, so the conversion above still runs
    # for pictures even though its result is discarded.
    suppress = blank_text_in_figures and cls == 'Picture'
    return '' if suppress else converted
def remove_nemotron_formatting(text):
    """Return *text* with model-specific marker tokens removed.

    The previous body read ``mmd_text`` before assigning it (an
    UnboundLocalError on every call) and never returned its result; all
    replacements now operate on ``text`` and the cleaned string is returned.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every occurrence of each marker deleted.
    """
    markers = (
        # NOTE(review): in this copy of the file the first search string was
        # empty (a no-op), apparently because markup was stripped from the
        # literal. '<tbc>' is a best guess at the lost token - confirm.
        '<tbc>',
        '\\<|unk|\\>',
        '\\unknown',
    )
    for marker in markers:
        text = text.replace(marker, '')
    return text
def convert_mmd_to_plain_text_ours(mmd_text):
mmd_text = re.sub(r'(.*?)', r'^{\\1}', mmd_text, flags=re.DOTALL)
mmd_text = re.sub(r'(.*?)', r'_{\\1}', mmd_text, flags=re.DOTALL)
mmd_text = mmd_text.replace('
', '\n')
# Remove headers (e.g., ##)
mmd_text = re.sub(r'#+\s', '', mmd_text)
# Remove bold (e.g., **)
mmd_text = re.sub(r'\*\*(.*?)\*\*', r'\1', mmd_text)
#mmd_text = mmd_text.replace("**","")
# Remove italic (e.g., *)
mmd_text = re.sub(r'\*(.*?)\*', r'\1', mmd_text)
# Remove emphasized text formatting (e.g., _)
mmd_text = re.sub(r'(?