|
|
import re |
|
|
from latex2html import convert_html_tables_to_markdown, latex_table_to_html |
|
|
|
|
|
def extract_classes_bboxes(text: str):
    """Parse tagged regions out of ``text``.

    A region has the form
    ``<x_X1><y_Y1>CONTENT<x_X2><y_Y2><class_NAME>``, where the coordinates
    are integer or decimal literals and CONTENT may span several lines.

    Args:
        text: String to scan for tagged regions.

    Returns:
        A ``(classes, bboxes, texts)`` tuple of three parallel lists:
        the class name of each region (``"Inline-formula"`` is reported as
        ``"Formula"``), its box as ``(x1, y1, x2, y2)`` floats, and its
        content string.

    Raises:
        AssertionError: If any region carries the class ``"Page-number"``.
    """
    region_pattern = r'<x_(\d+(?:\.\d+)?)><y_(\d+(?:\.\d+)?)>(.*?)<x_(\d+(?:\.\d+)?)><y_(\d+(?:\.\d+)?)><class_([^>]+)>'

    labels, boxes, contents = [], [], []
    # DOTALL lets the lazy content group run across newlines.
    for match in re.finditer(region_pattern, text, flags=re.DOTALL):
        x_start, y_start, content, x_end, y_end, label = match.groups()
        # Inline formulas are folded into the generic "Formula" class.
        labels.append("Formula" if label == "Inline-formula" else label)
        boxes.append((float(x_start), float(y_start), float(x_end), float(y_end)))
        contents.append(content)

    assert "Page-number" not in labels

    return labels, boxes, contents
|
|
|
|
|
def transform_bbox_to_original(bbox, original_width, original_height, target_w=1648, target_h=2048):
    """Map a box given on the target canvas back to original-image pixels.

    The function first derives the size the original image would have after
    an aspect-preserving shrink-to-fit into ``target_w`` x ``target_h``
    (images that already fit are not enlarged) and the padding that centres
    it on that canvas. Each ``bbox`` value is then multiplied by the target
    size, shifted by the padding and rescaled to the original dimensions.
    NOTE(review): this presumably mirrors the resize/pad applied before
    inference -- confirm against the preprocessing code.

    Args:
        bbox: Indexable ``(x1, y1, x2, y2)``; the values are treated as
            fractions of the target canvas.
        original_width: Width of the original image.
        original_height: Height of the original image.
        target_w: Canvas width, 1648 by default.
        target_h: Canvas height, 2048 by default.

    Returns:
        ``(left, top, right, bottom)`` in original-image coordinates.
    """
    ratio = original_width / original_height

    # Shrink-to-fit: the height limit is applied first, then the width limit.
    fitted_w, fitted_h = original_width, original_height
    if fitted_h > target_h:
        fitted_h = target_h
        fitted_w = int(fitted_h * ratio)
    if fitted_w > target_w:
        fitted_w = target_w
        fitted_h = int(fitted_w / ratio)

    # Leftover canvas space is split evenly (integer division) on each axis.
    offset_x = (target_w - fitted_w) // 2
    offset_y = (target_h - fitted_h) // 2

    def _to_original(fraction, canvas, offset, original, fitted):
        # canvas fraction -> canvas pixels -> un-padded -> original scale
        return ((fraction * canvas) - offset) * original / fitted

    left = _to_original(bbox[0], target_w, offset_x, original_width, fitted_w)
    right = _to_original(bbox[2], target_w, offset_x, original_width, fitted_w)
    top = _to_original(bbox[1], target_h, offset_y, original_height, fitted_h)
    bottom = _to_original(bbox[3], target_h, offset_y, original_height, fitted_h)

    return left, top, right, bottom
|
|
|
|
|
def postprocess_text(text, cls = 'Text', text_format='markdown', table_format='latex', blank_text_in_figures=False):
    """Convert one region's text to the requested output format.

    Args:
        text: Region content.
        cls: Region class name. ``'Table'`` selects table handling and
            ``'Picture'`` is subject to ``blank_text_in_figures``.
        text_format: ``'markdown'`` (leave non-table text untouched) or
            ``'plain'`` (run it through ``convert_mmd_to_plain_text_ours``).
        table_format: ``'latex'`` (leave table text untouched), ``'HTML'``
            (``latex_table_to_html``) or ``'markdown'`` (HTML conversion
            followed by ``convert_html_tables_to_markdown``).
        blank_text_in_figures: When true, ``'Picture'`` regions yield ``''``.

    Returns:
        The converted text.

    Raises:
        AssertionError: If ``text_format`` or ``table_format`` is not one of
            the supported values.
    """
    assert text_format in ('markdown', 'plain'), 'Unknown text format. Supported: markdown | plain'
    assert table_format in ('latex', 'HTML', 'markdown'), 'Unknown table format. Supported: latex | HTML | markdown'

    if cls == 'Table':
        # 'latex' needs no work here: the text is returned unchanged.
        if table_format == 'HTML':
            text = latex_table_to_html(text)
        elif table_format == 'markdown':
            html_table = latex_table_to_html(text)
            text = convert_html_tables_to_markdown(html_table)
    elif text_format == 'plain':
        text = convert_mmd_to_plain_text_ours(text)

    # Blanking happens after conversion, so the conversion above still runs.
    if blank_text_in_figures and cls == 'Picture':
        return ''
    return text
|
|
|
|
|
def remove_nemotron_formatting(text):
    r"""Return ``text`` with the marker tokens removed.

    Every occurrence of ``<tbc>``, ``\<|unk|\>`` and ``\unknown`` is deleted,
    in that order; everything else is left as-is.
    """
    markers = ('<tbc>', '\\<|unk|\\>', '\\unknown')
    for marker in markers:
        text = text.replace(marker, '')
    return text
|
|
def convert_mmd_to_plain_text_ours(mmd_text):
    """Strip lightweight markdown/HTML markup from ``mmd_text``.

    The following rewrites are applied in order:

    1. ``<sup>X</sup>`` becomes ``^{X}`` and ``<sub>X</sub>`` becomes
       ``_{X}``; ``X`` may span several lines.
    2. ``<br>`` becomes a newline.
    3. Any run of ``#`` followed by a whitespace character is removed.
    4. ``**X**`` and then ``*X*`` are reduced to ``X``.
    5. ``_X_`` is reduced to ``X`` when the opening underscore is not
       preceded by a word character.

    Args:
        mmd_text: Markdown-flavoured text.

    Returns:
        The rewritten text with leading and trailing whitespace stripped.
    """
    # The replacement must use the backreference \1. The previous template
    # spelled it with a doubled backslash, which re.sub reads as an escaped
    # backslash plus a literal "1" -- the captured content was dropped and
    # every match became the literal text "^{\1}" / "_{\1}".
    mmd_text = re.sub(r'<sup>(.*?)</sup>', r'^{\1}', mmd_text, flags=re.DOTALL)
    mmd_text = re.sub(r'<sub>(.*?)</sub>', r'_{\1}', mmd_text, flags=re.DOTALL)
    mmd_text = mmd_text.replace('<br>', '\n')

    # Header markers: one or more '#' and the whitespace character after them.
    mmd_text = re.sub(r'#+\s', '', mmd_text)

    # Bold must be unwrapped before italic so '**' is not consumed as two '*'.
    mmd_text = re.sub(r'\*\*(.*?)\*\*', r'\1', mmd_text)
    mmd_text = re.sub(r'\*(.*?)\*', r'\1', mmd_text)

    # Underscore emphasis; the lookbehind keeps snake_case-like words and the
    # 'a_{1}' subscripts produced above (underscore after a word char) intact.
    mmd_text = re.sub(r'(?<!\w)_([^_]+)_', r'\1', mmd_text)

    return mmd_text.strip()
|
|
|