levalencia committed on
Commit
c62bdf5
·
1 Parent(s): bf666f7

Enhance Streamlit app to support original JSON and YAML display

Browse files

- Added buttons to show original document structures in JSON and YAML formats.
- Updated the UI to handle the display and download options for JSON and YAML content.
- Modified the document processing logic to include YAML format alongside JSON and markdown.
- Improved the layout for displaying document structures and added format information for better user guidance.

src/processing/document_processor.py CHANGED
@@ -21,8 +21,10 @@ class DocumentResult:
21
  file_path: str
22
  structured_markdown: str
23
  structured_json: dict
 
24
  redacted_markdown: str
25
  redacted_json: dict
 
26
 
27
  @dataclass
28
  class ProcessingResult:
@@ -30,7 +32,9 @@ class ProcessingResult:
30
  original_document_md: str
31
  redacted_document_md: str
32
  original_document_json: dict
 
33
  redacted_document_json: dict
 
34
  removed_indices: list # Add the actual indices that were removed
35
  input_tokens: int
36
  output_tokens: int
@@ -103,7 +107,9 @@ def process_document_with_redaction(
103
  original_document_md=result.structured_markdown,
104
  redacted_document_md=result.redacted_markdown,
105
  original_document_json=result.structured_json,
 
106
  redacted_document_json=result.redacted_json,
 
107
  removed_indices=removed_indices,
108
  input_tokens=total_input_tokens,
109
  output_tokens=total_output_tokens,
@@ -149,6 +155,11 @@ class DocumentProcessor:
149
  structured_md = conv_result.document.export_to_markdown()
150
  structured_text = conv_result.document.export_to_text()
151
  doc_json = conv_result.document.export_to_dict()
 
 
 
 
 
152
  logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
153
 
154
  # Use SectionExtractor to remove target sections if provided
@@ -197,8 +208,10 @@ class DocumentProcessor:
197
  file_path=file_path,
198
  structured_markdown=structured_md,
199
  structured_json=doc_json,
 
200
  redacted_markdown=redacted_md,
201
- redacted_json=redacted_json
 
202
  )
203
  logger.info(f"Finished processing for file: {file_path}")
204
  return result
 
21
  file_path: str
22
  structured_markdown: str
23
  structured_json: dict
24
+ structured_yaml: str # Add YAML format
25
  redacted_markdown: str
26
  redacted_json: dict
27
+ raw_text: str # Add raw text without preprocessing
28
 
29
  @dataclass
30
  class ProcessingResult:
 
32
  original_document_md: str
33
  redacted_document_md: str
34
  original_document_json: dict
35
+ original_document_yaml: str # Add YAML format
36
  redacted_document_json: dict
37
+ raw_text: str # Add raw text without preprocessing
38
  removed_indices: list # Add the actual indices that were removed
39
  input_tokens: int
40
  output_tokens: int
 
107
  original_document_md=result.structured_markdown,
108
  redacted_document_md=result.redacted_markdown,
109
  original_document_json=result.structured_json,
110
+ original_document_yaml=result.structured_yaml,
111
  redacted_document_json=result.redacted_json,
112
+ raw_text=result.raw_text,
113
  removed_indices=removed_indices,
114
  input_tokens=total_input_tokens,
115
  output_tokens=total_output_tokens,
 
155
  structured_md = conv_result.document.export_to_markdown()
156
  structured_text = conv_result.document.export_to_text()
157
  doc_json = conv_result.document.export_to_dict()
158
+
159
+ # Convert JSON to YAML for display
160
+ import yaml
161
+ doc_yaml = yaml.dump(doc_json, default_flow_style=False, allow_unicode=True, sort_keys=False)
162
+
163
  logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
164
 
165
  # Use SectionExtractor to remove target sections if provided
 
208
  file_path=file_path,
209
  structured_markdown=structured_md,
210
  structured_json=doc_json,
211
+ structured_yaml=doc_yaml,
212
  redacted_markdown=redacted_md,
213
+ redacted_json=redacted_json,
214
+ raw_text=structured_text # Include the raw text
215
  )
216
  logger.info(f"Finished processing for file: {file_path}")
217
  return result
src/streamlit_app.py CHANGED
@@ -433,7 +433,7 @@ if uploaded_files:
433
  uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
434
 
435
  # Create buttons for different actions
436
- col1, col2, col3 = st.columns(3)
437
 
438
  with col1:
439
  if st.button("πŸ“„ Show Original", type="primary"):
@@ -515,6 +515,54 @@ if uploaded_files:
515
  st.session_state.show_original = True
516
  st.session_state.show_processed = False
517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  # Show current view status
519
  if st.session_state.get("show_original", False):
520
  st.info("πŸ“„ Currently viewing: **Original Document Structure**")
@@ -526,13 +574,24 @@ if uploaded_files:
526
  # Display results based on button clicked
527
  if st.session_state.get("show_original", False):
528
  st.markdown("---")
529
- st.subheader(f"Original Document Structure - {selected_file}")
 
 
 
 
 
 
 
 
 
 
530
 
531
  # Get the original structure
532
  original_json = st.session_state.original_structures[selected_file]
533
  original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
 
534
 
535
- # Display PDF viewer and original markdown side by side
536
  col1, col2 = st.columns([1, 1])
537
 
538
  with col1:
@@ -547,30 +606,86 @@ if uploaded_files:
547
  st.markdown(pdf_display, unsafe_allow_html=True)
548
 
549
  with col2:
550
- st.subheader("πŸ“‹ Original Document (Markdown)")
551
- st.caption("Docling-generated markdown from the PDF")
552
- # Use a text area for better readability and scrolling
553
- st.text_area(
554
- label="Original markdown content",
555
- value=original_markdown,
556
- height=600,
557
- key="original_markdown_display",
558
- label_visibility="collapsed"
559
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
 
561
- # Add a download button for the original markdown
562
  st.markdown("---")
563
- col1, col2 = st.columns(2)
564
  with col1:
565
- st.download_button(
566
- label="πŸ“₯ Download Original Markdown",
567
- data=original_markdown,
568
- file_name=f"{selected_file}_original.md",
569
- mime="text/markdown"
570
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  with col2:
572
- st.subheader("πŸ“Š JSON Structure")
573
- st.json(original_json)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
 
575
  elif st.session_state.get("show_processed", False):
576
  st.markdown("---")
 
433
  uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
434
 
435
  # Create buttons for different actions
436
+ col1, col2, col3, col4, col5 = st.columns(5)
437
 
438
  with col1:
439
  if st.button("πŸ“„ Show Original", type="primary"):
 
515
  st.session_state.show_original = True
516
  st.session_state.show_processed = False
517
 
518
+ with col4:
519
+ if st.button("πŸ“„ Show Original JSON", type="secondary"):
520
+ # Process the document to get original structure (without redaction)
521
+ if selected_file not in st.session_state.original_structures:
522
+ # Save uploaded file to a temporary location
523
+ temp_path = save_uploaded_file(uploaded_file, selected_file)
524
+
525
+ # Create a DocumentProcessor without section extraction (for original structure)
526
+ processor = DocumentProcessor(section_extractor=None)
527
+
528
+ # Process the document to get original structure
529
+ result = processor.process(temp_path)
530
+ st.session_state.original_structures[selected_file] = result.structured_json
531
+ # Store the original markdown for comparison
532
+ st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
533
+ # Store the original YAML for comparison
534
+ st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
535
+
536
+ # Display the original JSON structure
537
+ st.session_state.show_original = True
538
+ st.session_state.show_processed = False
539
+ st.session_state.show_json = True
540
+ st.session_state.show_yaml = False
541
+
542
+ with col5:
543
+ if st.button("πŸ“„ Show Original YAML", type="secondary"):
544
+ # Process the document to get original structure (without redaction)
545
+ if selected_file not in st.session_state.original_structures:
546
+ # Save uploaded file to a temporary location
547
+ temp_path = save_uploaded_file(uploaded_file, selected_file)
548
+
549
+ # Create a DocumentProcessor without section extraction (for original structure)
550
+ processor = DocumentProcessor(section_extractor=None)
551
+
552
+ # Process the document to get original structure
553
+ result = processor.process(temp_path)
554
+ st.session_state.original_structures[selected_file] = result.structured_json
555
+ # Store the original markdown for comparison
556
+ st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
557
+ # Store the original YAML for comparison
558
+ st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
559
+
560
+ # Display the original YAML structure
561
+ st.session_state.show_original = True
562
+ st.session_state.show_processed = False
563
+ st.session_state.show_json = False
564
+ st.session_state.show_yaml = True
565
+
566
  # Show current view status
567
  if st.session_state.get("show_original", False):
568
  st.info("πŸ“„ Currently viewing: **Original Document Structure**")
 
574
  # Display results based on button clicked
575
  if st.session_state.get("show_original", False):
576
  st.markdown("---")
577
+
578
+ # Determine what to show based on button clicked
579
+ show_json = st.session_state.get("show_json", False)
580
+ show_yaml = st.session_state.get("show_yaml", False)
581
+
582
+ if show_json:
583
+ st.subheader(f"Original Document Structure (JSON) - {selected_file}")
584
+ elif show_yaml:
585
+ st.subheader(f"Original Document Structure (YAML) - {selected_file}")
586
+ else:
587
+ st.subheader(f"Original Document Structure (Markdown) - {selected_file}")
588
 
589
  # Get the original structure
590
  original_json = st.session_state.original_structures[selected_file]
591
  original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
592
+ original_yaml = st.session_state.original_structures.get(f"{selected_file}_yaml", "")
593
 
594
+ # Display PDF viewer and content side by side
595
  col1, col2 = st.columns([1, 1])
596
 
597
  with col1:
 
606
  st.markdown(pdf_display, unsafe_allow_html=True)
607
 
608
  with col2:
609
+ if show_json:
610
+ st.subheader("πŸ“‹ Original Document (JSON)")
611
+ st.caption("Docling-generated JSON structure from the PDF")
612
+ # Use a text area for better readability and scrolling
613
+ st.text_area(
614
+ label="Original JSON content",
615
+ value=json.dumps(original_json, indent=2, ensure_ascii=False),
616
+ height=600,
617
+ key="original_json_display",
618
+ label_visibility="collapsed"
619
+ )
620
+ elif show_yaml:
621
+ st.subheader("πŸ“‹ Original Document (YAML)")
622
+ st.caption("Docling-generated YAML structure from the PDF")
623
+ # Use a text area for better readability and scrolling
624
+ st.text_area(
625
+ label="Original YAML content",
626
+ value=original_yaml,
627
+ height=600,
628
+ key="original_yaml_display",
629
+ label_visibility="collapsed"
630
+ )
631
+ else:
632
+ st.subheader("πŸ“‹ Original Document (Markdown)")
633
+ st.caption("Docling-generated markdown from the PDF")
634
+ # Use a text area for better readability and scrolling
635
+ st.text_area(
636
+ label="Original markdown content",
637
+ value=original_markdown,
638
+ height=600,
639
+ key="original_markdown_display",
640
+ label_visibility="collapsed"
641
+ )
642
 
643
+ # Add download buttons for the original content
644
  st.markdown("---")
645
+ col1, col2, col3 = st.columns(3)
646
  with col1:
647
+ if show_json:
648
+ st.download_button(
649
+ label="πŸ“₯ Download Original JSON",
650
+ data=json.dumps(original_json, indent=2, ensure_ascii=False),
651
+ file_name=f"{selected_file}_original.json",
652
+ mime="application/json"
653
+ )
654
+ elif show_yaml:
655
+ st.download_button(
656
+ label="πŸ“₯ Download Original YAML",
657
+ data=original_yaml,
658
+ file_name=f"{selected_file}_original.yaml",
659
+ mime="text/yaml"
660
+ )
661
+ else:
662
+ st.download_button(
663
+ label="πŸ“₯ Download Original Markdown",
664
+ data=original_markdown,
665
+ file_name=f"{selected_file}_original.md",
666
+ mime="text/markdown"
667
+ )
668
  with col2:
669
+ if show_json or show_yaml:
670
+ st.subheader("πŸ“Š Document Structure")
671
+ st.json(original_json)
672
+ else:
673
+ st.subheader("πŸ“Š JSON Structure")
674
+ st.json(original_json)
675
+ with col3:
676
+ if show_json or show_yaml:
677
+ # Show format information
678
+ st.subheader("πŸ“‹ Format Info")
679
+ if show_json:
680
+ st.info("**JSON Format**: Structured data representation with key-value pairs")
681
+ st.write("**Use case**: API integration, data processing, programmatic access")
682
+ elif show_yaml:
683
+ st.info("**YAML Format**: Human-readable data serialization")
684
+ st.write("**Use case**: Configuration files, documentation, easy reading")
685
+ else:
686
+ st.subheader("πŸ“‹ Markdown Info")
687
+ st.info("**Markdown Format**: Formatted text with headers, lists, and styling")
688
+ st.write("**Use case**: Documentation, readable output, web display")
689
 
690
  elif st.session_state.get("show_processed", False):
691
  st.markdown("---")