Spaces:
Sleeping
Sleeping
| import editdistance | |
| import frontmatter | |
| from hexdump2 import hexdump | |
| import gradio as gr | |
| import json | |
| import shlex | |
| import subprocess | |
| import tempfile | |
| from dist import levenshtein_with_wildcard, print_match_summary | |
# Body text of README.md as parsed by python-frontmatter; passed to the
# Gradio interface as its description.
_readme_post = frontmatter.load("README.md")
description = _readme_post.content
def trim(str, n):
    """Return ``str`` without its first ``n`` lines.

    The text is split with ``splitlines()`` and the kept lines are joined
    back with a single newline, so a trailing newline is not preserved.
    """
    kept_lines = str.splitlines()[n:]
    return "\n".join(kept_lines)
def trim_objdump(str):
    """Drop the first seven lines of ``str``.

    Used on objdump output in this file to skip the lines that precede the
    instruction listing.
    """
    leading_lines_to_skip = 7
    return trim(str, leading_lines_to_skip)
def disassemble_bytes(byte_data, architecture, options):
    """Disassemble raw machine code with ``objdump`` and return the listing.

    Args:
        byte_data: Raw bytes to disassemble.
        architecture: Value passed to ``objdump -m``.
        options: Value passed to ``objdump -M``.

    Returns:
        objdump's standard output after ``trim_objdump`` has removed its
        leading lines. objdump's stderr and exit status are not inspected.
    """
    # Keep the auto-deleting temporary file open while objdump reads it by
    # name, so nothing is left behind on disk after the call (reopening an
    # open NamedTemporaryFile by name relies on POSIX semantics, as the rest
    # of this file already does).
    with tempfile.NamedTemporaryFile(suffix=".bin") as temp_bin_file:
        temp_bin_file.write(byte_data)
        # Push buffered bytes to the OS so objdump sees the whole input.
        temp_bin_file.flush()

        completed = subprocess.run(
            [
                "objdump",
                "-D",
                "-b",
                "binary",
                "-m",
                architecture,
                "-M",
                options,
                temp_bin_file.name,
            ],
            capture_output=True,
            text=True,
        )

    return trim_objdump(completed.stdout)
def compile(compiler, flags, source):
    """Compile ``source`` and extract its code bytes, relocations and listing.

    All intermediate files live in a private temporary directory that is
    removed before returning, so repeated calls leave nothing behind.

    Args:
        compiler: Compiler executable to run (for example ``"g++"``).
        flags: Extra compiler arguments as one string; tokenised with
            ``shlex.split``.
        source: Source text; written to a ``.c`` file and compiled with ``-c``.

    Returns:
        A tuple ``(relocs, code_bytes, compile_output, disassembly)``:

        * ``relocs`` -- relocation records taken from the JSON output of
          ``llvm-readobj-19``, excluding those whose symbol is named ``.text``.
        * ``code_bytes`` -- contents of the ``.text`` / ``.text.*`` sections
          as written by ``objcopy -O binary``.
        * ``compile_output`` -- the compiler's stdout followed by its stderr,
          followed by anything objcopy printed.
        * ``disassembly`` -- ``objdump -dr`` output passed through
          ``trim_objdump``.

        When the compiler exits with a non-zero status the result is
        ``(None, None, compile_output, None)``.
    """
    # NOTE(review): ``compiler`` and ``flags`` are executed exactly as
    # supplied. If they come from an untrusted UI this allows running
    # arbitrary programs -- consider an allow-list of compilers/flags.
    with tempfile.TemporaryDirectory() as work_dir:
        # Write the source to a file the compiler can read.
        with tempfile.NamedTemporaryFile(
            suffix=".c", dir=work_dir, delete=False
        ) as temp_c_file:
            temp_c_file.write(source.encode())
            temp_c_file_name = temp_c_file.name

        # Reserve a unique name for the object file.
        with tempfile.NamedTemporaryFile(
            suffix=".o", dir=work_dir, delete=False
        ) as temp_o_file:
            temp_o_file_name = temp_o_file.name

        # Compile the source file to an object file.
        result = subprocess.run(
            [compiler, "-c", temp_c_file_name]
            + shlex.split(flags)
            + ["-o", temp_o_file_name],
            capture_output=True,
            text=True,
        )
        compile_output = result.stdout + result.stderr

        if result.returncode != 0:
            return None, None, compile_output, None

        # Reserve a unique name for the raw code bytes.
        with tempfile.NamedTemporaryFile(
            suffix=".raw", dir=work_dir, delete=False
        ) as raw_bytes_file:
            raw_bytes_file_name = raw_bytes_file.name

        objcopy_result = subprocess.run(
            [
                "objcopy",
                "--only-section",
                ".text",
                # XXX in reality we should probably look at the sections
                "--only-section",
                ".text.*",
                "-O",
                "binary",
                temp_o_file_name,
                raw_bytes_file_name,
            ],
            capture_output=True,
            text=True,
        )
        # Surface objcopy diagnostics to the caller instead of the server
        # console; this is empty when objcopy has nothing to report.
        compile_output += objcopy_result.stdout + objcopy_result.stderr

        # Read the result back by path rather than through a handle opened
        # before objcopy ran, so the bytes are seen even if objcopy replaced
        # the file instead of rewriting it in place.
        try:
            with open(raw_bytes_file_name, "rb") as raw_bytes_file:
                compiled_bytes = raw_bytes_file.read()
        except FileNotFoundError:
            # objcopy may remove its output file when it fails.
            compiled_bytes = b""

        # Disassemble the object file (with relocations interleaved).
        disassembly = subprocess.run(
            ["objdump", "-dr", temp_o_file_name], capture_output=True, text=True
        ).stdout
        disassembly = trim_objdump(disassembly)

        # Relocations, as JSON.
        json_relocs = subprocess.run(
            [
                "llvm-readobj-19",
                "--elf-output-style=JSON",
                "--relocations",
                temp_o_file_name,
            ],
            capture_output=True,
            text=True,
        ).stdout

    json_relocs = json.loads(json_relocs)
    json_relocs = json_relocs[0]["Relocations"]
    json_relocs = [r["Relocation"] for d in json_relocs for r in d["Relocs"]]
    # Filter out relocations against the .text section symbol itself.
    json_relocs = [r for r in json_relocs if r["Symbol"]["Name"] != ".text"]

    return json_relocs, compiled_bytes, compile_output, disassembly
# Width in bytes of the field patched by each supported relocation type,
# following the relocation table of the System V x86-64 psABI.
_RELOC_FIELD_SIZES = {
    "R_X86_64_8": 1,
    "R_X86_64_PC8": 1,
    "R_X86_64_16": 2,
    "R_X86_64_PC16": 2,
    "R_X86_64_32": 4,
    "R_X86_64_32S": 4,
    "R_X86_64_PC32": 4,
    "R_X86_64_PLT32": 4,
    "R_X86_64_GOTPCREL": 4,
    "R_X86_64_GOTPCRELX": 4,
    "R_X86_64_REX_GOTPCRELX": 4,
    "R_X86_64_GOTTPOFF": 4,
    "R_X86_64_TPOFF32": 4,
    "R_X86_64_DTPOFF32": 4,
    "R_X86_64_TLSGD": 4,
    "R_X86_64_TLSLD": 4,
    "R_X86_64_64": 8,
    "R_X86_64_PC64": 8,
}


def _reloc_type2size(s):
    """Return the size in bytes of the field patched by relocation type ``s``.

    Args:
        s: Relocation type name, e.g. ``"R_X86_64_PC32"``.

    Returns:
        The number of bytes the relocation covers.

    Raises:
        AssertionError: If ``s`` is not a known relocation type. Raised
            explicitly (rather than via ``assert``) so the check still fires
            when Python runs with ``-O``.
    """
    try:
        return _RELOC_FIELD_SIZES[s]
    except KeyError:
        raise AssertionError(f"Unknown reloc {s}") from None
def _compute_relocs_byte_range(json_relocs):
    """Return every byte offset covered by the given relocations.

    Each entry is read as ``entry["Offset"]`` (start offset) and
    ``entry["Type"]["Name"]`` (relocation type, sized by
    ``_reloc_type2size``). The offsets of all entries are returned as one
    flat list, in the order the relocations were given.
    """
    covered_offsets = []
    for reloc in json_relocs:
        start = reloc["Offset"]
        width = _reloc_type2size(reloc["Type"]["Name"])
        covered_offsets.extend(range(start, start + width))
    return covered_offsets
def predict(target_bytes, source, compiler, flags, disasm_arch, disasm_options):
    """Compile ``source`` and compare the produced code with the target bytes.

    Args:
        target_bytes: Target function bytes as a hex string.
        source: Source code handed to ``compile``.
        compiler: Compiler executable handed to ``compile``.
        flags: Compiler flags handed to ``compile``.
        disasm_arch: Architecture handed to ``disassemble_bytes``.
        disasm_options: Disassembler options handed to ``disassemble_bytes``.

    Returns:
        A 9-tuple: hexdump of the compiled bytes, hexdump of the target bytes,
        plain edit distance, edit distance with relocation bytes treated as
        wildcards, newline-joined edit operations, compiler output, compiled
        disassembly, compiled relocations, and target disassembly. If
        compilation failed, the first item is ``"Compilation failed"``, the
        plain distance is ``-1`` and the two relocation-aware items are
        ``None``.
    """
    target_bytes = bytes.fromhex(target_bytes)
    built_relocs, built_bytes, compiler_log, built_listing = compile(
        compiler, flags, source
    )
    target_listing = disassemble_bytes(target_bytes, disasm_arch, disasm_options)

    # Failure path first: nothing was compiled, so there is nothing to compare.
    if built_bytes is None:
        return (
            "Compilation failed",
            hexdump(target_bytes, result="return"),
            -1,
            None,
            None,
            compiler_log,
            built_listing,
            built_relocs,
            target_listing,
        )

    # Bytes covered by relocations are passed as wildcard offsets for the
    # compiled sequence.
    wildcard_offsets = _compute_relocs_byte_range(built_relocs)
    reloc_edit_distance, reloc_operations = print_match_summary(
        target_bytes,
        built_bytes,
        wildcard_offsets_seq2=wildcard_offsets,
    )
    print(f"reloc_edit_distance: {reloc_edit_distance}")
    print(f"reloc operations: {reloc_operations}")

    built_dump = hexdump(built_bytes, result="return")
    target_dump = hexdump(target_bytes, result="return")
    plain_distance = editdistance.eval(built_bytes, target_bytes)
    operations_text = "\n".join(reloc_operations)

    return (
        built_dump,
        target_dump,
        plain_distance,
        reloc_edit_distance,
        operations_text,
        compiler_log,
        built_listing,
        built_relocs,
        target_listing,
    )
def run():
    """Build the Gradio interface around ``predict`` and start serving it.

    The server listens on all interfaces (``0.0.0.0``) on port 7860 and shows
    errors in the UI.
    """
    # Input widgets, in the same order as the parameters of ``predict``.
    input_components = [
        gr.Textbox(
            lines=10,
            label="Bytes of Target Function (in hex)",
            value="b8 2a 00 00 00 c3",
        ),
        gr.Textbox(
            lines=10,
            label="Decompiled C Source Code",
            value="int x;\nint foo() { return x; }",
        ),
        gr.Textbox(label="Compiler", value="g++"),
        gr.Textbox(label="Compiler Flags", value="-O2"),
        gr.Textbox(label="Architecture (objdump -m)", value="i386"),
        gr.Textbox(label="Disassembler options (objdump -M)", value="x86-64"),
    ]

    # Output widgets, in the same order as the tuple returned by ``predict``.
    output_components = [
        gr.Textbox(label="Compiled bytes"),
        gr.Textbox(label="Target bytes"),
        gr.Number(label="Edit distance (lower is better)"),
        gr.Number(label="Edit distance (ignoring relocs; lower is better)"),
        gr.Textbox(label="Edit description (ignoring relocs)"),
        gr.Textbox(label="Compiler Output"),
        gr.Textbox(label="Compiled Disassembly"),
        gr.JSON(label="Compiled relocations", open=True),
        gr.Textbox(label="Target Disassembly"),
    ]

    demo = gr.Interface(
        fn=predict,
        description=description,
        inputs=input_components,
        outputs=output_components,
    )
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
# Start the server only when this file is executed as a script, so that
# importing the module (for example from a test) does not launch the web UI.
if __name__ == "__main__":
    run()