Spaces:
Sleeping
Sleeping
Commit
·
c62bdf5
1
Parent(s):
bf666f7
Enhance Streamlit app to support original JSON and YAML display
Browse files
- Added buttons to show original document structures in JSON and YAML formats.
- Updated the UI to handle the display and download options for JSON and YAML content.
- Modified the document processing logic to include YAML format alongside JSON and markdown.
- Improved the layout for displaying document structures and added format information for better user guidance.
- src/processing/document_processor.py +14 -1
- src/streamlit_app.py +138 -23
src/processing/document_processor.py
CHANGED
|
@@ -21,8 +21,10 @@ class DocumentResult:
|
|
| 21 |
file_path: str
|
| 22 |
structured_markdown: str
|
| 23 |
structured_json: dict
|
|
|
|
| 24 |
redacted_markdown: str
|
| 25 |
redacted_json: dict
|
|
|
|
| 26 |
|
| 27 |
@dataclass
|
| 28 |
class ProcessingResult:
|
|
@@ -30,7 +32,9 @@ class ProcessingResult:
|
|
| 30 |
original_document_md: str
|
| 31 |
redacted_document_md: str
|
| 32 |
original_document_json: dict
|
|
|
|
| 33 |
redacted_document_json: dict
|
|
|
|
| 34 |
removed_indices: list # Add the actual indices that were removed
|
| 35 |
input_tokens: int
|
| 36 |
output_tokens: int
|
|
@@ -103,7 +107,9 @@ def process_document_with_redaction(
|
|
| 103 |
original_document_md=result.structured_markdown,
|
| 104 |
redacted_document_md=result.redacted_markdown,
|
| 105 |
original_document_json=result.structured_json,
|
|
|
|
| 106 |
redacted_document_json=result.redacted_json,
|
|
|
|
| 107 |
removed_indices=removed_indices,
|
| 108 |
input_tokens=total_input_tokens,
|
| 109 |
output_tokens=total_output_tokens,
|
|
@@ -149,6 +155,11 @@ class DocumentProcessor:
|
|
| 149 |
structured_md = conv_result.document.export_to_markdown()
|
| 150 |
structured_text = conv_result.document.export_to_text()
|
| 151 |
doc_json = conv_result.document.export_to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
|
| 153 |
|
| 154 |
# Use SectionExtractor to remove target sections if provided
|
|
@@ -197,8 +208,10 @@ class DocumentProcessor:
|
|
| 197 |
file_path=file_path,
|
| 198 |
structured_markdown=structured_md,
|
| 199 |
structured_json=doc_json,
|
|
|
|
| 200 |
redacted_markdown=redacted_md,
|
| 201 |
-
redacted_json=redacted_json
|
|
|
|
| 202 |
)
|
| 203 |
logger.info(f"Finished processing for file: {file_path}")
|
| 204 |
return result
|
|
|
|
| 21 |
file_path: str
|
| 22 |
structured_markdown: str
|
| 23 |
structured_json: dict
|
| 24 |
+
structured_yaml: str # Add YAML format
|
| 25 |
redacted_markdown: str
|
| 26 |
redacted_json: dict
|
| 27 |
+
raw_text: str # Add raw text without preprocessing
|
| 28 |
|
| 29 |
@dataclass
|
| 30 |
class ProcessingResult:
|
|
|
|
| 32 |
original_document_md: str
|
| 33 |
redacted_document_md: str
|
| 34 |
original_document_json: dict
|
| 35 |
+
original_document_yaml: str # Add YAML format
|
| 36 |
redacted_document_json: dict
|
| 37 |
+
raw_text: str # Add raw text without preprocessing
|
| 38 |
removed_indices: list # Add the actual indices that were removed
|
| 39 |
input_tokens: int
|
| 40 |
output_tokens: int
|
|
|
|
| 107 |
original_document_md=result.structured_markdown,
|
| 108 |
redacted_document_md=result.redacted_markdown,
|
| 109 |
original_document_json=result.structured_json,
|
| 110 |
+
original_document_yaml=result.structured_yaml,
|
| 111 |
redacted_document_json=result.redacted_json,
|
| 112 |
+
raw_text=result.raw_text,
|
| 113 |
removed_indices=removed_indices,
|
| 114 |
input_tokens=total_input_tokens,
|
| 115 |
output_tokens=total_output_tokens,
|
|
|
|
| 155 |
structured_md = conv_result.document.export_to_markdown()
|
| 156 |
structured_text = conv_result.document.export_to_text()
|
| 157 |
doc_json = conv_result.document.export_to_dict()
|
| 158 |
+
|
| 159 |
+
# Convert JSON to YAML for display
|
| 160 |
+
import yaml
|
| 161 |
+
doc_yaml = yaml.dump(doc_json, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
| 162 |
+
|
| 163 |
logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
|
| 164 |
|
| 165 |
# Use SectionExtractor to remove target sections if provided
|
|
|
|
| 208 |
file_path=file_path,
|
| 209 |
structured_markdown=structured_md,
|
| 210 |
structured_json=doc_json,
|
| 211 |
+
structured_yaml=doc_yaml,
|
| 212 |
redacted_markdown=redacted_md,
|
| 213 |
+
redacted_json=redacted_json,
|
| 214 |
+
raw_text=structured_text # Include the raw text
|
| 215 |
)
|
| 216 |
logger.info(f"Finished processing for file: {file_path}")
|
| 217 |
return result
|
src/streamlit_app.py
CHANGED
|
@@ -433,7 +433,7 @@ if uploaded_files:
|
|
| 433 |
uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
|
| 434 |
|
| 435 |
# Create buttons for different actions
|
| 436 |
-
col1, col2, col3 = st.columns(
|
| 437 |
|
| 438 |
with col1:
|
| 439 |
if st.button("π Show Original", type="primary"):
|
|
@@ -515,6 +515,54 @@ if uploaded_files:
|
|
| 515 |
st.session_state.show_original = True
|
| 516 |
st.session_state.show_processed = False
|
| 517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
# Show current view status
|
| 519 |
if st.session_state.get("show_original", False):
|
| 520 |
st.info("π Currently viewing: **Original Document Structure**")
|
|
@@ -526,13 +574,24 @@ if uploaded_files:
|
|
| 526 |
# Display results based on button clicked
|
| 527 |
if st.session_state.get("show_original", False):
|
| 528 |
st.markdown("---")
|
| 529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
|
| 531 |
# Get the original structure
|
| 532 |
original_json = st.session_state.original_structures[selected_file]
|
| 533 |
original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
|
|
|
|
| 534 |
|
| 535 |
-
# Display PDF viewer and
|
| 536 |
col1, col2 = st.columns([1, 1])
|
| 537 |
|
| 538 |
with col1:
|
|
@@ -547,30 +606,86 @@ if uploaded_files:
|
|
| 547 |
st.markdown(pdf_display, unsafe_allow_html=True)
|
| 548 |
|
| 549 |
with col2:
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
|
| 561 |
-
# Add
|
| 562 |
st.markdown("---")
|
| 563 |
-
col1, col2 = st.columns(
|
| 564 |
with col1:
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
with col2:
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
|
| 575 |
elif st.session_state.get("show_processed", False):
|
| 576 |
st.markdown("---")
|
|
|
|
| 433 |
uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
|
| 434 |
|
| 435 |
# Create buttons for different actions
|
| 436 |
+
col1, col2, col3, col4, col5 = st.columns(5)
|
| 437 |
|
| 438 |
with col1:
|
| 439 |
if st.button("π Show Original", type="primary"):
|
|
|
|
| 515 |
st.session_state.show_original = True
|
| 516 |
st.session_state.show_processed = False
|
| 517 |
|
| 518 |
+
with col4:
|
| 519 |
+
if st.button("π Show Original JSON", type="secondary"):
|
| 520 |
+
# Process the document to get original structure (without redaction)
|
| 521 |
+
if selected_file not in st.session_state.original_structures:
|
| 522 |
+
# Save uploaded file to a temporary location
|
| 523 |
+
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
| 524 |
+
|
| 525 |
+
# Create a DocumentProcessor without section extraction (for original structure)
|
| 526 |
+
processor = DocumentProcessor(section_extractor=None)
|
| 527 |
+
|
| 528 |
+
# Process the document to get original structure
|
| 529 |
+
result = processor.process(temp_path)
|
| 530 |
+
st.session_state.original_structures[selected_file] = result.structured_json
|
| 531 |
+
# Store the original markdown for comparison
|
| 532 |
+
st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
|
| 533 |
+
# Store the original YAML for comparison
|
| 534 |
+
st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
|
| 535 |
+
|
| 536 |
+
# Display the original JSON structure
|
| 537 |
+
st.session_state.show_original = True
|
| 538 |
+
st.session_state.show_processed = False
|
| 539 |
+
st.session_state.show_json = True
|
| 540 |
+
st.session_state.show_yaml = False
|
| 541 |
+
|
| 542 |
+
with col5:
|
| 543 |
+
if st.button("π Show Original YAML", type="secondary"):
|
| 544 |
+
# Process the document to get original structure (without redaction)
|
| 545 |
+
if selected_file not in st.session_state.original_structures:
|
| 546 |
+
# Save uploaded file to a temporary location
|
| 547 |
+
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
| 548 |
+
|
| 549 |
+
# Create a DocumentProcessor without section extraction (for original structure)
|
| 550 |
+
processor = DocumentProcessor(section_extractor=None)
|
| 551 |
+
|
| 552 |
+
# Process the document to get original structure
|
| 553 |
+
result = processor.process(temp_path)
|
| 554 |
+
st.session_state.original_structures[selected_file] = result.structured_json
|
| 555 |
+
# Store the original markdown for comparison
|
| 556 |
+
st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
|
| 557 |
+
# Store the original YAML for comparison
|
| 558 |
+
st.session_state.original_structures[f"{selected_file}_yaml"] = result.structured_yaml
|
| 559 |
+
|
| 560 |
+
# Display the original YAML structure
|
| 561 |
+
st.session_state.show_original = True
|
| 562 |
+
st.session_state.show_processed = False
|
| 563 |
+
st.session_state.show_json = False
|
| 564 |
+
st.session_state.show_yaml = True
|
| 565 |
+
|
| 566 |
# Show current view status
|
| 567 |
if st.session_state.get("show_original", False):
|
| 568 |
st.info("π Currently viewing: **Original Document Structure**")
|
|
|
|
| 574 |
# Display results based on button clicked
|
| 575 |
if st.session_state.get("show_original", False):
|
| 576 |
st.markdown("---")
|
| 577 |
+
|
| 578 |
+
# Determine what to show based on button clicked
|
| 579 |
+
show_json = st.session_state.get("show_json", False)
|
| 580 |
+
show_yaml = st.session_state.get("show_yaml", False)
|
| 581 |
+
|
| 582 |
+
if show_json:
|
| 583 |
+
st.subheader(f"Original Document Structure (JSON) - {selected_file}")
|
| 584 |
+
elif show_yaml:
|
| 585 |
+
st.subheader(f"Original Document Structure (YAML) - {selected_file}")
|
| 586 |
+
else:
|
| 587 |
+
st.subheader(f"Original Document Structure (Markdown) - {selected_file}")
|
| 588 |
|
| 589 |
# Get the original structure
|
| 590 |
original_json = st.session_state.original_structures[selected_file]
|
| 591 |
original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
|
| 592 |
+
original_yaml = st.session_state.original_structures.get(f"{selected_file}_yaml", "")
|
| 593 |
|
| 594 |
+
# Display PDF viewer and content side by side
|
| 595 |
col1, col2 = st.columns([1, 1])
|
| 596 |
|
| 597 |
with col1:
|
|
|
|
| 606 |
st.markdown(pdf_display, unsafe_allow_html=True)
|
| 607 |
|
| 608 |
with col2:
|
| 609 |
+
if show_json:
|
| 610 |
+
st.subheader("π Original Document (JSON)")
|
| 611 |
+
st.caption("Docling-generated JSON structure from the PDF")
|
| 612 |
+
# Use a text area for better readability and scrolling
|
| 613 |
+
st.text_area(
|
| 614 |
+
label="Original JSON content",
|
| 615 |
+
value=json.dumps(original_json, indent=2, ensure_ascii=False),
|
| 616 |
+
height=600,
|
| 617 |
+
key="original_json_display",
|
| 618 |
+
label_visibility="collapsed"
|
| 619 |
+
)
|
| 620 |
+
elif show_yaml:
|
| 621 |
+
st.subheader("π Original Document (YAML)")
|
| 622 |
+
st.caption("Docling-generated YAML structure from the PDF")
|
| 623 |
+
# Use a text area for better readability and scrolling
|
| 624 |
+
st.text_area(
|
| 625 |
+
label="Original YAML content",
|
| 626 |
+
value=original_yaml,
|
| 627 |
+
height=600,
|
| 628 |
+
key="original_yaml_display",
|
| 629 |
+
label_visibility="collapsed"
|
| 630 |
+
)
|
| 631 |
+
else:
|
| 632 |
+
st.subheader("π Original Document (Markdown)")
|
| 633 |
+
st.caption("Docling-generated markdown from the PDF")
|
| 634 |
+
# Use a text area for better readability and scrolling
|
| 635 |
+
st.text_area(
|
| 636 |
+
label="Original markdown content",
|
| 637 |
+
value=original_markdown,
|
| 638 |
+
height=600,
|
| 639 |
+
key="original_markdown_display",
|
| 640 |
+
label_visibility="collapsed"
|
| 641 |
+
)
|
| 642 |
|
| 643 |
+
# Add download buttons for the original content
|
| 644 |
st.markdown("---")
|
| 645 |
+
col1, col2, col3 = st.columns(3)
|
| 646 |
with col1:
|
| 647 |
+
if show_json:
|
| 648 |
+
st.download_button(
|
| 649 |
+
label="📥 Download Original JSON",
|
| 650 |
+
data=json.dumps(original_json, indent=2, ensure_ascii=False),
|
| 651 |
+
file_name=f"{selected_file}_original.json",
|
| 652 |
+
mime="application/json"
|
| 653 |
+
)
|
| 654 |
+
elif show_yaml:
|
| 655 |
+
st.download_button(
|
| 656 |
+
label="📥 Download Original YAML",
|
| 657 |
+
data=original_yaml,
|
| 658 |
+
file_name=f"{selected_file}_original.yaml",
|
| 659 |
+
mime="text/yaml"
|
| 660 |
+
)
|
| 661 |
+
else:
|
| 662 |
+
st.download_button(
|
| 663 |
+
label="📥 Download Original Markdown",
|
| 664 |
+
data=original_markdown,
|
| 665 |
+
file_name=f"{selected_file}_original.md",
|
| 666 |
+
mime="text/markdown"
|
| 667 |
+
)
|
| 668 |
with col2:
|
| 669 |
+
if show_json or show_yaml:
|
| 670 |
+
st.subheader("π Document Structure")
|
| 671 |
+
st.json(original_json)
|
| 672 |
+
else:
|
| 673 |
+
st.subheader("π JSON Structure")
|
| 674 |
+
st.json(original_json)
|
| 675 |
+
with col3:
|
| 676 |
+
if show_json or show_yaml:
|
| 677 |
+
# Show format information
|
| 678 |
+
st.subheader("π Format Info")
|
| 679 |
+
if show_json:
|
| 680 |
+
st.info("**JSON Format**: Structured data representation with key-value pairs")
|
| 681 |
+
st.write("**Use case**: API integration, data processing, programmatic access")
|
| 682 |
+
elif show_yaml:
|
| 683 |
+
st.info("**YAML Format**: Human-readable data serialization")
|
| 684 |
+
st.write("**Use case**: Configuration files, documentation, easy reading")
|
| 685 |
+
else:
|
| 686 |
+
st.subheader("π Markdown Info")
|
| 687 |
+
st.info("**Markdown Format**: Formatted text with headers, lists, and styling")
|
| 688 |
+
st.write("**Use case**: Documentation, readable output, web display")
|
| 689 |
|
| 690 |
elif st.session_state.get("show_processed", False):
|
| 691 |
st.markdown("---")
|