prithivMLmods committed on
Commit
874a882
·
verified ·
1 Parent(s): 62c4995

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -6
app.py CHANGED
@@ -10,22 +10,27 @@ from typing import Optional, Tuple, Dict, Any, Iterable
10
  from gradio.themes import Soft
11
  from gradio.themes.utils import colors, fonts, sizes
12
 
 
13
  print("Downloading model snapshot to ensure all scripts are present...")
 
14
  model_dir = snapshot_download(repo_id="nvidia/NVIDIA-Nemotron-Parse-v1.1")
15
  print(f"Model downloaded to: {model_dir}")
16
 
 
17
  sys.path.append(model_dir)
18
 
19
  try:
20
  from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
21
- print("Successfully imported postprocessing functions.")
22
  except ImportError as e:
23
- print(f" Error importing postprocessing: {e}")
24
  raise e
25
 
 
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
  print(f"Using device: {device}")
28
 
 
29
  colors.steel_blue = colors.Color(
30
  name="steel_blue",
31
  c50="#EBF3F8",
@@ -92,6 +97,7 @@ css = """
92
  #output-title h2 { font-size: 2.1em !important; }
93
  """
94
 
 
95
  print("Loading Model components...")
96
 
97
  processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
@@ -107,7 +113,7 @@ except Exception as e:
107
  print(f"Warning: Could not load GenerationConfig: {e}. Using default.")
108
  generation_config = GenerationConfig(max_new_tokens=4096)
109
 
110
- print("Model loaded successfully.")
111
 
112
  @spaces.GPU
113
  def process_ocr_task(image):
@@ -124,7 +130,7 @@ def process_ocr_task(image):
124
  if device.type == 'cuda':
125
  inputs = {k: v.to(torch.bfloat16) if v.dtype == torch.float32 else v for k, v in inputs.items()}
126
 
127
- print("Running inference...")
128
  with torch.no_grad():
129
  outputs = model.generate(
130
  **inputs,
@@ -139,6 +145,7 @@ def process_ocr_task(image):
139
  print(f"Error extracting boxes: {e}")
140
  return generated_text, image
141
 
 
142
  bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
143
 
144
  table_format = 'latex'
@@ -169,8 +176,15 @@ def process_ocr_task(image):
169
  final_output_text = ""
170
 
171
  for cls, bbox, txt in zip(classes, bboxes, processed_texts):
 
 
 
 
 
 
 
172
  color = color_map.get(cls, "red")
173
- draw.rectangle([bbox[0], bbox[1], bbox[2], bbox[3]], outline=color, width=3)
174
 
175
  if cls == "Table":
176
  final_output_text += f"\n\n--- [Table] ---\n{txt}\n-----------------\n"
@@ -184,6 +198,7 @@ def process_ocr_task(image):
184
 
185
  return final_output_text, result_image
186
 
 
187
  with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
188
  gr.Markdown("# **NVIDIA Nemotron Parse v1.1 [OCR/Parsing]**", elem_id="main-title")
189
  gr.Markdown("Upload a document image to extract text, tables, and layout structures using NVIDIA's state-of-the-art Parse model.")
@@ -200,9 +215,16 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
200
  )
201
 
202
  with gr.Column(scale=2):
203
- output_text = gr.Textbox(label="Parsed Content (Markdown/LaTeX)", lines=8, show_copy_button=True)
204
  output_image = gr.Image(label="Detected Layout & Bounding Boxes", type="pil")
205
 
 
 
 
 
 
 
 
206
  submit_btn.click(
207
  fn=process_ocr_task,
208
  inputs=[image_input],
 
10
  from gradio.themes import Soft
11
  from gradio.themes.utils import colors, fonts, sizes
12
 
13
+ # --- Model & Script Download ---
14
  print("Downloading model snapshot to ensure all scripts are present...")
15
+ # Download the full model repo to ensure postprocessing.py is available locally
16
  model_dir = snapshot_download(repo_id="nvidia/NVIDIA-Nemotron-Parse-v1.1")
17
  print(f"Model downloaded to: {model_dir}")
18
 
19
+ # Add the model directory to sys.path so we can import postprocessing
20
  sys.path.append(model_dir)
21
 
22
  try:
23
  from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
24
+ print("Successfully imported postprocessing functions.")
25
  except ImportError as e:
26
+ print(f" Error importing postprocessing: {e}")
27
  raise e
28
 
29
+ # --- Device Setup ---
30
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
  print(f"Using device: {device}")
32
 
33
+ # --- Theme Definition ---
34
  colors.steel_blue = colors.Color(
35
  name="steel_blue",
36
  c50="#EBF3F8",
 
97
  #output-title h2 { font-size: 2.1em !important; }
98
  """
99
 
100
+ # --- Model Loading ---
101
  print("Loading Model components...")
102
 
103
  processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
 
113
  print(f"Warning: Could not load GenerationConfig: {e}. Using default.")
114
  generation_config = GenerationConfig(max_new_tokens=4096)
115
 
116
+ print("Model loaded successfully.")
117
 
118
  @spaces.GPU
119
  def process_ocr_task(image):
 
130
  if device.type == 'cuda':
131
  inputs = {k: v.to(torch.bfloat16) if v.dtype == torch.float32 else v for k, v in inputs.items()}
132
 
133
+ print("🏃 Running inference...")
134
  with torch.no_grad():
135
  outputs = model.generate(
136
  **inputs,
 
145
  print(f"Error extracting boxes: {e}")
146
  return generated_text, image
147
 
148
+ # Transform boxes to original image size
149
  bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
150
 
151
  table_format = 'latex'
 
176
  final_output_text = ""
177
 
178
  for cls, bbox, txt in zip(classes, bboxes, processed_texts):
179
+ # Normalize coordinates to prevent PIL ValueError (x1 >= x0)
180
+ x1, y1, x2, y2 = bbox
181
+ xmin = min(x1, x2)
182
+ ymin = min(y1, y2)
183
+ xmax = max(x1, x2)
184
+ ymax = max(y1, y2)
185
+
186
  color = color_map.get(cls, "red")
187
+ draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=3)
188
 
189
  if cls == "Table":
190
  final_output_text += f"\n\n--- [Table] ---\n{txt}\n-----------------\n"
 
198
 
199
  return final_output_text, result_image
200
 
201
+ # --- Gradio Interface ---
202
  with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
203
  gr.Markdown("# **NVIDIA Nemotron Parse v1.1 [OCR/Parsing]**", elem_id="main-title")
204
  gr.Markdown("Upload a document image to extract text, tables, and layout structures using NVIDIA's state-of-the-art Parse model.")
 
215
  )
216
 
217
  with gr.Column(scale=2):
218
+ output_text = gr.Textbox(label="Parsed Content (Markdown/LaTeX)", lines=20, show_copy_button=True)
219
  output_image = gr.Image(label="Detected Layout & Bounding Boxes", type="pil")
220
 
221
+ with gr.Accordion("Technical Details", open=False):
222
+ gr.Markdown("""
223
+ **Model:** [nvidia/NVIDIA-Nemotron-Parse-v1.1](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1)
224
+ **Architecture:** Llama-3-Vila based.
225
+ **Capabilities:** High-accuracy OCR, Table extraction (to LaTeX/HTML), Figure detection.
226
+ """)
227
+
228
  submit_btn.click(
229
  fn=process_ocr_task,
230
  inputs=[image_input],