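<!-- Spaces: Running (presumably status text from the hosting page captured along with this file; not part of the document itself) -->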
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Repo-Level Dedupe Visualization</title>
    <link rel="stylesheet" href="style.css" />
    <!-- Vega runtime, Vega-Lite compiler and the vega-embed loader (pinned to a major version only). -->
    <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  </head>
  <body>
    <div class="container">
      <!-- Introductory copy and usage instructions shown above the chart. -->
      <div class="header">
        <h1>Visualizing Repo-Level Dedupe</h1>
        <p>
          This visualization demonstrates block-level deduplication across all
          models in
          <a
            target="_blank"
            rel="noopener"
            href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
            >bartowski/gemma-2-9b-it-GGUF</a
          >.
        </p>
        <p>
          Each row represents a file in the repository grouped into blocks of up
          to 64MB. The color of each block represents the deduplication ratio
          for the block, which is a function of how often the chunks in the
          block are shared between files. The darker the color, the more
          frequently content is shared, the better the overall upload and
          download times for a given file! The deduplication savings here take a
          191GB repo and cut it down to 97GB, helping to shave a few hours off
          the upload time.
        </p>
        <p>
          You can read more about chunks, blocks, and the nitty gritty details
          of how we make this all work in our accompanying
          <a
            target="_blank"
            rel="noopener"
            href="https://huggingface.co/blog/from-chunks-to-blocks"
            >blog post</a
          >.
        </p>
        <p>To explore the visualization:</p>
        <ul>
          <li>
            <strong>Hover</strong> over a block in an individual file to
            highlight it and see where else it appears in the repository.
          </li>
          <li>
            <strong>Click</strong> any block in a file to see all other files
            that share blocks.
          </li>
          <li>
            <strong>Double-click</strong> anywhere on any file to reset and
            continue exploring.
          </li>
        </ul>
      </div>
      <div class="heatmap-container">
        <!-- Mount point that vegaEmbed renders the chart into. -->
        <div id="vis"></div>
      </div>
    </div>
    <script>
      // Vega-Lite spec: one row of rect marks per `repo` value, one rect per
      // `xorb_id`, coloured by `dedupe_factor`. Data is loaded from xorbs.json.
      var vlSpec = {
        $schema: "https://vega.github.io/schema/vega-lite/v5.json",
        // Each facet row gets its own x scale instead of sharing one.
        resolve: { scale: { x: "independent" } },
        width: 800,
        height: 25,
        params: [
          {
            // Hover selection keyed on the block id; drives the orange highlight.
            name: "highlight",
            select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
          },
          {
            // Point selection keyed on the row's `repo` value, using the
            // default trigger event. Toggling is disabled (boolean, not the
            // string "false") so each new pick replaces the previous one.
            name: "select",
            select: { type: "point", fields: ["repo"], toggle: false },
          },
          {
            // All `repo_xorb_selected` values of the dataset named 'source_0'
            // (presumably the name Vega-Lite generates for this spec's data
            // source -- verify if more data sources are ever added).
            name: "xorbs_selected",
            expr: "pluck(data('source_0'), 'repo_xorb_selected')",
          },
          {
            // True once the list above holds at least one usable value.
            name: "any_xorbs_selected",
            expr: "extent(xorbs_selected)[0] != null",
          },
        ],
        transform: [
          {
            // Reduce `repo` to the text after its last '/'. This runs first so
            // that every later expression (and the `select` param, which is
            // keyed on `repo`) sees the same normalised value rather than
            // depending on the order in which the transforms are evaluated.
            calculate:
              "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
            as: "repo",
          },
          {
            // 1-based position of this row's `repo` in the current selection,
            // or 0 when nothing is selected / this row is not selected.
            calculate:
              "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
            as: "repo_selected",
          },
          {
            // Expose the block id only for rows belonging to a selected file.
            calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
            as: "repo_xorb_selected",
          },
        ],
        data: {
          url: "xorbs.json",
        },
        mark: "rect",
        encoding: {
          x: {
            field: "xorb_id",
            axis: null,
            sort: { field: "dedupe_factor", order: "descending" },
            stack: "normalize",
          },
          color: {
            condition: [
              // NOTE(review): loose `==` looks deliberate -- `highlight.xorb_id`
              // is presumably a one-element array that only matches a string
              // through coercion. Confirm before tightening to `===`.
              { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
            ],
            field: "dedupe_factor",
            type: "quantitative",
            scale: { scheme: "blues", domain: [0, 10] },
          },
          opacity: {
            condition: [
              {
                // Fade blocks that do not appear in any selected file.
                test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
                value: 0.2,
              },
            ],
          },
          tooltip: [
            { field: "repo", type: "nominal", title: "File" },
            { field: "xorb_id", type: "nominal", title: "Block Hash" },
            {
              field: "dedupe_factor",
              type: "quantitative",
              title: "Dedupe Factor",
            },
          ],
          row: {
            field: "repo",
            title: "",
            spacing: 1,
            header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
            sort: { field: "repo", order: "ascending" },
          },
        },
      };

      // Render into #vis; log failures (e.g. xorbs.json failing to load or an
      // invalid spec) instead of leaving the promise rejection unhandled.
      vegaEmbed("#vis", vlSpec).catch(console.error);
    </script>
  </body>
</html>