Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,11 +11,73 @@ from huggingface_hub import HfApi, get_token
|
|
| 11 |
import huggingface_hub
|
| 12 |
import os
|
| 13 |
from mistralai import Mistral
|
|
|
|
| 14 |
|
| 15 |
# Configure logging
|
| 16 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# --- Mistral OCR Setup ---
|
| 20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 21 |
hf_token_global = None
|
|
@@ -286,12 +348,16 @@ def get_hf_token(explicit_token: str = None) -> str:
|
|
| 286 |
return None
|
| 287 |
|
| 288 |
def process_file_and_save(
|
| 289 |
-
file_objs:
|
| 290 |
strip_headers: bool, hf_token: str, repo_name: str
|
| 291 |
) -> str:
|
| 292 |
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
|
|
|
| 293 |
if not file_objs:
|
| 294 |
return "Error: No files uploaded."
|
|
|
|
|
|
|
|
|
|
| 295 |
if not repo_name or '/' not in repo_name:
|
| 296 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
| 297 |
|
|
@@ -443,6 +509,12 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
|
|
| 443 |
gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
|
| 444 |
|
| 445 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
initial_token = get_hf_token()
|
| 447 |
if not initial_token and not client:
|
| 448 |
print("\nWARNING: Neither Mistral API key nor HF token found.")
|
|
@@ -452,4 +524,4 @@ if __name__ == "__main__":
|
|
| 452 |
share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
|
| 453 |
debug=True,
|
| 454 |
auth_message="Provide a valid Hugging Face token if prompted"
|
| 455 |
-
)
|
|
|
|
| 11 |
import huggingface_hub
|
| 12 |
import os
|
| 13 |
from mistralai import Mistral
|
| 14 |
+
import gradio_client.utils as client_utils
|
| 15 |
|
| 16 |
# Configure logging
# Root logger emits timestamped INFO-and-above records; the module logger
# below inherits that configuration.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
| 20 |
# --- Patch Gradio's get_type function to handle boolean schemas ---
def patched_get_type(schema: Any) -> str:
    """Patched version of get_type to handle boolean schemas.

    Maps one JSON-schema fragment to a human-readable Python type string.
    Unlike Gradio's stock helper, a bare boolean schema (the JSON Schema
    ``true``/``false`` form) is reported as "bool" instead of raising.
    """
    if isinstance(schema, bool):
        # Boolean schemas have no keys, so none of the `in` probes below apply.
        return "bool"
    if "const" in schema:
        return f"Literal[{repr(schema['const'])}]"
    if "enum" in schema:
        choices = ", ".join(repr(choice) for choice in schema["enum"])
        return f"Literal[{choices}]"
    if "type" not in schema:
        return "Any"
    type_name = schema["type"]
    if isinstance(type_name, list):
        # A list of types renders as a Union, with "null" members dropped.
        non_null = [member for member in type_name if member != "null"]
        return f"Union[{', '.join(non_null)}]"
    if type_name == "array":
        item_schema = schema.get("items", {})
        return f"List[{patched_json_schema_to_python_type(item_schema, schema.get('$defs'))}]"
    if type_name == "object":
        return "Dict[str, Any]"
    # Scalar JSON types map to Python names; anything unrecognized passes
    # through unchanged (matching the original fall-through behavior).
    scalar_names = {"null": "None", "integer": "int", "number": "float", "boolean": "bool"}
    return scalar_names.get(type_name, type_name)
def patched_json_schema_to_python_type(schema: Any, defs: Dict[str, Any] = None) -> str:
    """Patched version of json_schema_to_python_type that tolerates boolean schemas.

    JSON Schema allows a schema to be the literal ``true`` (accept anything) or
    ``false`` (accept nothing). The stock implementation probes schemas with
    ``"key" in schema``, which raises ``TypeError`` for the ``true`` form; this
    version short-circuits booleans before any membership test.

    Args:
        schema: A JSON-schema fragment (a dict, or a bool for the boolean form).
        defs: Optional ``$defs`` table used to resolve ``$ref`` entries.

    Returns:
        A human-readable Python type string, e.g. ``"List[int]"``.
    """
    defs = defs or {}
    if isinstance(schema, bool):
        # Bug fix: ``True`` previously crashed on the ``in`` probes below.
        # ``False`` is falsy and keeps its legacy "Any" result; ``True`` maps
        # to "bool", consistent with patched_get_type.
        return "bool" if schema else "Any"
    if not schema:
        return "Any"
    if "$ref" in schema:
        # Resolve "#/$defs/Name"-style references; unknown refs degrade to Any.
        ref_name = schema["$ref"].split("/")[-1]
        return patched_json_schema_to_python_type(defs.get(ref_name, {}), defs)
    if "anyOf" in schema:
        union_members = [
            patched_json_schema_to_python_type(sub, defs) for sub in schema["anyOf"]
        ]
        # "None" members are dropped from the rendered Union (Optional handling).
        return f"Union[{', '.join(t for t in union_members if t != 'None')}]"
    if "type" in schema and schema["type"] == "array":
        element_type = patched_json_schema_to_python_type(schema.get("items", {}), defs)
        return f"List[{element_type}]"
    if "type" in schema and schema["type"] == "object":
        if "properties" in schema:
            described = [
                f"{name}: {patched_json_schema_to_python_type(prop, defs)}{client_utils.get_desc(prop)}"
                for name, prop in schema["properties"].items()
            ]
            return f"Dict[str, Union[{', '.join(described)}]]"
        if "additionalProperties" in schema:
            value_type = patched_json_schema_to_python_type(schema["additionalProperties"], defs)
            return f"Dict[str, {value_type}]"
        return "Dict[str, Any]"
    return patched_get_type(schema)
# Override Gradio's json_schema_to_python_type
# Monkey-patch the gradio_client helper module-wide so every schema-to-type
# rendering (e.g. the generated API docs) goes through the boolean-tolerant
# version above instead of the stock implementation.
client_utils.json_schema_to_python_type = patched_json_schema_to_python_type
# --- Mistral OCR Setup ---
# The Mistral key is read from the environment; it is None when unset
# (presumably the OCR client is then skipped downstream — confirm against
# the later `client` checks).
api_key = os.getenv("MISTRAL_API_KEY")
# NOTE(review): cache slot for a Hugging Face token; appears to be filled
# lazily elsewhere (get_hf_token) — confirm.
hf_token_global = None
|
|
|
|
| 348 |
return None
|
| 349 |
|
| 350 |
def process_file_and_save(
|
| 351 |
+
file_objs: Any, chunk_size: int, chunk_overlap: int,
|
| 352 |
strip_headers: bool, hf_token: str, repo_name: str
|
| 353 |
) -> str:
|
| 354 |
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
| 355 |
+
# Handle case where file_objs is a single file or None
|
| 356 |
if not file_objs:
|
| 357 |
return "Error: No files uploaded."
|
| 358 |
+
if not isinstance(file_objs, list):
|
| 359 |
+
file_objs = [file_objs]
|
| 360 |
+
|
| 361 |
if not repo_name or '/' not in repo_name:
|
| 362 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
| 363 |
|
|
|
|
| 509 |
gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
|
| 510 |
|
| 511 |
if __name__ == "__main__":
|
| 512 |
+
import gradio
|
| 513 |
+
logger.info(f"Using Gradio version: {gradio.__version__}")
|
| 514 |
+
if not gradio.__version__.startswith("4."):
|
| 515 |
+
logger.warning("Gradio version is not 4.x. Updating to the latest version is recommended.")
|
| 516 |
+
print("Consider running: pip install --upgrade gradio")
|
| 517 |
+
|
| 518 |
initial_token = get_hf_token()
|
| 519 |
if not initial_token and not client:
|
| 520 |
print("\nWARNING: Neither Mistral API key nor HF token found.")
|
|
|
|
| 524 |
share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
|
| 525 |
debug=True,
|
| 526 |
auth_message="Provide a valid Hugging Face token if prompted"
|
| 527 |
+
)
|