Spaces:

acmc
/

PDFuzz

Running

App Files Community

acmc committed on Aug 17

Commit

71cd7b2

verified ·

1 Parent(s): 33283ff

Update pdf_attacker.py

Browse files

Files changed (1) hide show

pdf_attacker.py +84 -80

pdf_attacker.py CHANGED Viewed

@@ -43,6 +43,9 @@ class PDFAttacker:
         except Exception:
             self.upem = 1000  # conservative default
     def create_normal_pdf(self, text: str, output_path: str):
         """Create PDF with normal text ordering using shaped cluster layout"""
         c = canvas.Canvas(output_path, pagesize=self.page_size)
@@ -53,35 +56,29 @@ class PDFAttacker:
         # shape into glyph-clusters and layout greedily into lines
         cluster_items = self._shape_into_clusters(clean_text)
-        # layout greedy by cluster widths
         max_width = self.page_size[0] - 2 * self.margin
         x = self.margin
         y = self.page_size[1] - self.margin
-        for item in cluster_items:
-            # prefer HarfBuzz advance if present
-            adv = item.get('adv_pts', item.get('width', 0))
-            width_rl = item.get('width_rl', adv)
-            offset = item.get('offset_pts', 0)
-            s = item['text']
-            # stability heuristic: if measured width differs significantly from HarfBuzz advance,
-            # prefer the ReportLab-measured width for layout to match drawString behavior (fix em-dash cases)
-            thresh = max(0.5, self.font_size * 0.1)
-            used_adv = adv
-            if abs(width_rl - adv) > thresh:
-                used_adv = width_rl
-            # clamp offset if it's unreasonably large relative to advance
-            if abs(offset) > (used_adv * 0.6):
-                offset = 0
-            if x + used_adv > self.margin + max_width:
                 x = self.margin
                 y -= self.line_height
-            # draw at x + offset to respect glyph x_offset where reasonable
-            c.drawString(x + offset, y, s)
-            x += used_adv
         c.save()
         print(f"Normal PDF saved: {output_path}")
@@ -99,41 +96,28 @@ class PDFAttacker:
         # shape text into clusters (keeps ligatures, diacritics, etc.)
         cluster_items = self._shape_into_clusters(clean_text)
-        # Layout clusters greedily into lines and record positions
         max_width = self.page_size[0] - 2 * self.margin
-        lines = []
-        cur_line = []
-        cur_w = 0.0
-        for item in cluster_items:
-            if cur_w + item['width'] > max_width and cur_line:
-                lines.append(cur_line)
-                cur_line = []
-                cur_w = 0.0
-            cur_line.append(item)
-            cur_w += item['width']
-        if cur_line:
-            lines.append(cur_line)
-        # compute absolute positions for each cluster
-        char_positions = []  # (x, y, text)
         y = self.page_size[1] - self.margin
-        for line in lines:
-            x = self.margin
-            for item in line:
                 adv = item.get('adv_pts', item.get('width', 0))
-                width_rl = item.get('width_rl', adv)
                 offset = item.get('offset_pts', 0)
-                thresh = max(0.5, self.font_size * 0.1)
-                used_adv = adv
-                if abs(width_rl - adv) > thresh:
-                    used_adv = width_rl
-                if abs(offset) > (used_adv * 0.6):
                     offset = 0
                 char_positions.append((x + offset, y, item['text']))
-                x += used_adv
-            y -= self.line_height
         # drawing order is per-cluster; attack by shuffling a subset
         drawing_order = list(range(len(char_positions)))
@@ -188,40 +172,28 @@ class PDFAttacker:
         final_extraction_order = target_seq + space_indices + remaining_indices
-        # Layout clusters visually to get positions
-        max_width = self.page_size[0] - 2 * self.margin
-        lines = []
-        cur_line = []
-        cur_w = 0.0
-        for item in cluster_items:
-            if cur_w + item['width'] > max_width and cur_line:
-                lines.append(cur_line)
-                cur_line = []
-                cur_w = 0.0
-            cur_line.append(item)
-            cur_w += item['width']
-        if cur_line:
-            lines.append(cur_line)
         positions = []
         y = self.page_size[1] - self.margin
-        for line in lines:
-            x = self.margin
-            for item in line:
                 adv = item.get('adv_pts', item.get('width', 0))
-                width_rl = item.get('width_rl', adv)
                 offset = item.get('offset_pts', 0)
-                thresh = max(0.5, self.font_size * 0.1)
-                used_adv = adv
-                if abs(width_rl - adv) > thresh:
-                    used_adv = width_rl
-                if abs(offset) > (used_adv * 0.6):
                     offset = 0
                 positions.append((x + offset, y, item['text']))
-                x += used_adv
-            y -= self.line_height
         c = canvas.Canvas(output_path, pagesize=self.page_size)
         c.setFont(self.font_name, self.font_size)
@@ -383,6 +355,38 @@ class PDFAttacker:
         return seq
 def main():
     ai_text = """

         except Exception:
             self.upem = 1000  # conservative default
+        # wrapping mode: if True, break lines on word tokens; if False, break per-cluster
+        self.wrap_on_words = True
     def create_normal_pdf(self, text: str, output_path: str):
         """Create PDF with normal text ordering using shaped cluster layout"""
         c = canvas.Canvas(output_path, pagesize=self.page_size)
         # shape into glyph-clusters and layout greedily into lines
         cluster_items = self._shape_into_clusters(clean_text)
+        # layout greedy by token (word/space) widths so we break on word boundaries
         max_width = self.page_size[0] - 2 * self.margin
         x = self.margin
         y = self.page_size[1] - self.margin
+        tokens = self._tokens_from_clusters(cluster_items)
+        for token in tokens:
+            tw = token['width']
+            # wrap at token (word) boundaries
+            if x + tw > self.margin + max_width and x != self.margin:
                 x = self.margin
                 y -= self.line_height
+            # draw clusters within the token sequentially
+            for ci in token['clusters']:
+                item = cluster_items[ci]
+                adv = item.get('adv_pts', item.get('width', 0))
+                offset = item.get('offset_pts', 0)
+                # clamp offset conservative
+                if abs(offset) > (adv * 0.6):
+                    offset = 0
+                c.drawString(x + offset, y, item['text'])
+                x += adv
         c.save()
         print(f"Normal PDF saved: {output_path}")
         # shape text into clusters (keeps ligatures, diacritics, etc.)
         cluster_items = self._shape_into_clusters(clean_text)
+        # Layout tokens and compute cluster positions (wrap on word boundaries)
+        tokens = self._tokens_from_clusters(cluster_items)
+        char_positions = []  # index -> (x,y,text)
         max_width = self.page_size[0] - 2 * self.margin
         y = self.page_size[1] - self.margin
+        x = self.margin
+        for token in tokens:
+            tw = token['width']
+            if x + tw > self.margin + max_width and x != self.margin:
+                x = self.margin
+                y -= self.line_height
+            for ci in token['clusters']:
+                item = cluster_items[ci]
                 adv = item.get('adv_pts', item.get('width', 0))
                 offset = item.get('offset_pts', 0)
+                if abs(offset) > (adv * 0.6):
                     offset = 0
                 char_positions.append((x + offset, y, item['text']))
+                x += adv
+        # end token layout
         # drawing order is per-cluster; attack by shuffling a subset
         drawing_order = list(range(len(char_positions)))
         final_extraction_order = target_seq + space_indices + remaining_indices
+        # Layout tokens and compute cluster positions (wrap on word boundaries)
+        tokens = self._tokens_from_clusters(cluster_items)
         positions = []
+        max_width = self.page_size[0] - 2 * self.margin
         y = self.page_size[1] - self.margin
+        x = self.margin
+        for token in tokens:
+            tw = token['width']
+            if x + tw > self.margin + max_width and x != self.margin:
+                x = self.margin
+                y -= self.line_height
+            for ci in token['clusters']:
+                item = cluster_items[ci]
                 adv = item.get('adv_pts', item.get('width', 0))
                 offset = item.get('offset_pts', 0)
+                if abs(offset) > (adv * 0.6):
                     offset = 0
                 positions.append((x + offset, y, item['text']))
+                x += adv
+        # end token layout
         c = canvas.Canvas(output_path, pagesize=self.page_size)
         c.setFont(self.font_name, self.font_size)
         return seq
+    def _tokens_from_clusters(self, cluster_items):
+        """Group shaped clusters into word and whitespace tokens for line wrapping.
+
+        Walks ``cluster_items`` in order. Every cluster whose text is all
+        whitespace becomes its own 'space' token; each maximal run of the
+        remaining clusters is merged into a single 'word' token. No instance
+        state is read.
+
+        Args:
+            cluster_items: indexable sequence of dicts, each with a 'text' key
+                and an advance stored under 'adv_pts' (falling back to
+                'width', then to 0 when neither key is present).
+
+        Returns:
+            List of tokens: {'kind': 'word'|'space', 'clusters':[idxs], 'text': str, 'width': float}
+            where 'clusters' holds indices into ``cluster_items`` and 'width'
+            is the summed advance of the member clusters.
+        """
+        tokens = []
+        i = 0
+        n = len(cluster_items)
+        while i < n:
+            item = cluster_items[i]
+            text = item['text']
+            # NOTE: ''.isspace() is False, so a cluster with empty text falls
+            # through to the word branch below.
+            if text.isspace():
+                # one token per whitespace cluster; consecutive spaces are not merged
+                tokens.append({'kind': 'space', 'clusters': [i], 'text': text, 'width': item.get('adv_pts', item.get('width', 0))})
+                i += 1
+                continue
+            # accumulate a word: consecutive non-space clusters
+            clusters = [i]
+            width = item.get('adv_pts', item.get('width', 0))
+            txt = text
+            i += 1
+            # extend the word until the next whitespace cluster or end of input
+            while i < n and not cluster_items[i]['text'].isspace():
+                clusters.append(i)
+                width += cluster_items[i].get('adv_pts', cluster_items[i].get('width', 0))
+                txt += cluster_items[i]['text']
+                i += 1
+            tokens.append({'kind': 'word', 'clusters': clusters, 'text': txt, 'width': width})
+        return tokens
 def main():
     ai_text = """