acmc committed on
Commit
71cd7b2
·
verified ·
1 Parent(s): 33283ff

Update pdf_attacker.py

Browse files
Files changed (1) hide show
  1. pdf_attacker.py +84 -80
pdf_attacker.py CHANGED
@@ -43,6 +43,9 @@ class PDFAttacker:
43
  except Exception:
44
  self.upem = 1000 # conservative default
45
 
 
 
 
46
  def create_normal_pdf(self, text: str, output_path: str):
47
  """Create PDF with normal text ordering using shaped cluster layout"""
48
  c = canvas.Canvas(output_path, pagesize=self.page_size)
@@ -53,35 +56,29 @@ class PDFAttacker:
53
  # shape into glyph-clusters and layout greedily into lines
54
  cluster_items = self._shape_into_clusters(clean_text)
55
 
56
- # layout greedy by cluster widths
57
  max_width = self.page_size[0] - 2 * self.margin
58
  x = self.margin
59
  y = self.page_size[1] - self.margin
60
 
61
- for item in cluster_items:
62
- # prefer HarfBuzz advance if present
63
- adv = item.get('adv_pts', item.get('width', 0))
64
- width_rl = item.get('width_rl', adv)
65
- offset = item.get('offset_pts', 0)
66
- s = item['text']
67
-
68
- # stability heuristic: if measured width differs significantly from HarfBuzz advance,
69
- # prefer the ReportLab-measured width for layout to match drawString behavior (fix em-dash cases)
70
- thresh = max(0.5, self.font_size * 0.1)
71
- used_adv = adv
72
- if abs(width_rl - adv) > thresh:
73
- used_adv = width_rl
74
-
75
- # clamp offset if it's unreasonably large relative to advance
76
- if abs(offset) > (used_adv * 0.6):
77
- offset = 0
78
-
79
- if x + used_adv > self.margin + max_width:
80
  x = self.margin
81
  y -= self.line_height
82
- # draw at x + offset to respect glyph x_offset where reasonable
83
- c.drawString(x + offset, y, s)
84
- x += used_adv
 
 
 
 
 
 
 
 
85
 
86
  c.save()
87
  print(f"Normal PDF saved: {output_path}")
@@ -99,41 +96,28 @@ class PDFAttacker:
99
  # shape text into clusters (keeps ligatures, diacritics, etc.)
100
  cluster_items = self._shape_into_clusters(clean_text)
101
 
102
- # Layout clusters greedily into lines and record positions
 
 
103
  max_width = self.page_size[0] - 2 * self.margin
104
- lines = []
105
- cur_line = []
106
- cur_w = 0.0
107
- for item in cluster_items:
108
- if cur_w + item['width'] > max_width and cur_line:
109
- lines.append(cur_line)
110
- cur_line = []
111
- cur_w = 0.0
112
- cur_line.append(item)
113
- cur_w += item['width']
114
- if cur_line:
115
- lines.append(cur_line)
116
-
117
- # compute absolute positions for each cluster
118
- char_positions = [] # (x, y, text)
119
  y = self.page_size[1] - self.margin
120
- for line in lines:
121
- x = self.margin
122
- for item in line:
 
 
 
 
 
 
 
123
  adv = item.get('adv_pts', item.get('width', 0))
124
- width_rl = item.get('width_rl', adv)
125
  offset = item.get('offset_pts', 0)
126
-
127
- thresh = max(0.5, self.font_size * 0.1)
128
- used_adv = adv
129
- if abs(width_rl - adv) > thresh:
130
- used_adv = width_rl
131
- if abs(offset) > (used_adv * 0.6):
132
  offset = 0
133
-
134
  char_positions.append((x + offset, y, item['text']))
135
- x += used_adv
136
- y -= self.line_height
137
 
138
  # drawing order is per-cluster; attack by shuffling a subset
139
  drawing_order = list(range(len(char_positions)))
@@ -188,40 +172,28 @@ class PDFAttacker:
188
 
189
  final_extraction_order = target_seq + space_indices + remaining_indices
190
 
191
- # Layout clusters visually to get positions
192
- max_width = self.page_size[0] - 2 * self.margin
193
- lines = []
194
- cur_line = []
195
- cur_w = 0.0
196
- for item in cluster_items:
197
- if cur_w + item['width'] > max_width and cur_line:
198
- lines.append(cur_line)
199
- cur_line = []
200
- cur_w = 0.0
201
- cur_line.append(item)
202
- cur_w += item['width']
203
- if cur_line:
204
- lines.append(cur_line)
205
-
206
  positions = []
 
207
  y = self.page_size[1] - self.margin
208
- for line in lines:
209
- x = self.margin
210
- for item in line:
 
 
 
 
 
 
 
211
  adv = item.get('adv_pts', item.get('width', 0))
212
- width_rl = item.get('width_rl', adv)
213
  offset = item.get('offset_pts', 0)
214
-
215
- thresh = max(0.5, self.font_size * 0.1)
216
- used_adv = adv
217
- if abs(width_rl - adv) > thresh:
218
- used_adv = width_rl
219
- if abs(offset) > (used_adv * 0.6):
220
  offset = 0
221
-
222
  positions.append((x + offset, y, item['text']))
223
- x += used_adv
224
- y -= self.line_height
225
 
226
  c = canvas.Canvas(output_path, pagesize=self.page_size)
227
  c.setFont(self.font_name, self.font_size)
@@ -383,6 +355,38 @@ class PDFAttacker:
383
 
384
  return seq
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
  def main():
388
  ai_text = """
 
43
  except Exception:
44
  self.upem = 1000 # conservative default
45
 
46
+ # wrapping mode: if True, break lines on word tokens; if False, break per-cluster
47
+ self.wrap_on_words = True
48
+
49
  def create_normal_pdf(self, text: str, output_path: str):
50
  """Create PDF with normal text ordering using shaped cluster layout"""
51
  c = canvas.Canvas(output_path, pagesize=self.page_size)
 
56
  # shape into glyph-clusters and layout greedily into lines
57
  cluster_items = self._shape_into_clusters(clean_text)
58
 
59
+ # layout greedy by token (word/space) widths so we break on word boundaries
60
  max_width = self.page_size[0] - 2 * self.margin
61
  x = self.margin
62
  y = self.page_size[1] - self.margin
63
 
64
+ tokens = self._tokens_from_clusters(cluster_items)
65
+ for token in tokens:
66
+ tw = token['width']
67
+ # wrap at token (word) boundaries
68
+ if x + tw > self.margin + max_width and x != self.margin:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  x = self.margin
70
  y -= self.line_height
71
+
72
+ # draw clusters within the token sequentially
73
+ for ci in token['clusters']:
74
+ item = cluster_items[ci]
75
+ adv = item.get('adv_pts', item.get('width', 0))
76
+ offset = item.get('offset_pts', 0)
77
+ # clamp offset conservative
78
+ if abs(offset) > (adv * 0.6):
79
+ offset = 0
80
+ c.drawString(x + offset, y, item['text'])
81
+ x += adv
82
 
83
  c.save()
84
  print(f"Normal PDF saved: {output_path}")
 
96
  # shape text into clusters (keeps ligatures, diacritics, etc.)
97
  cluster_items = self._shape_into_clusters(clean_text)
98
 
99
+ # Layout tokens and compute cluster positions (wrap on word boundaries)
100
+ tokens = self._tokens_from_clusters(cluster_items)
101
+ char_positions = [] # index -> (x,y,text)
102
  max_width = self.page_size[0] - 2 * self.margin
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  y = self.page_size[1] - self.margin
104
+ x = self.margin
105
+
106
+ for token in tokens:
107
+ tw = token['width']
108
+ if x + tw > self.margin + max_width and x != self.margin:
109
+ x = self.margin
110
+ y -= self.line_height
111
+
112
+ for ci in token['clusters']:
113
+ item = cluster_items[ci]
114
  adv = item.get('adv_pts', item.get('width', 0))
 
115
  offset = item.get('offset_pts', 0)
116
+ if abs(offset) > (adv * 0.6):
 
 
 
 
 
117
  offset = 0
 
118
  char_positions.append((x + offset, y, item['text']))
119
+ x += adv
120
+ # end token layout
121
 
122
  # drawing order is per-cluster; attack by shuffling a subset
123
  drawing_order = list(range(len(char_positions)))
 
172
 
173
  final_extraction_order = target_seq + space_indices + remaining_indices
174
 
175
+ # Layout tokens and compute cluster positions (wrap on word boundaries)
176
+ tokens = self._tokens_from_clusters(cluster_items)
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  positions = []
178
+ max_width = self.page_size[0] - 2 * self.margin
179
  y = self.page_size[1] - self.margin
180
+ x = self.margin
181
+
182
+ for token in tokens:
183
+ tw = token['width']
184
+ if x + tw > self.margin + max_width and x != self.margin:
185
+ x = self.margin
186
+ y -= self.line_height
187
+
188
+ for ci in token['clusters']:
189
+ item = cluster_items[ci]
190
  adv = item.get('adv_pts', item.get('width', 0))
 
191
  offset = item.get('offset_pts', 0)
192
+ if abs(offset) > (adv * 0.6):
 
 
 
 
 
193
  offset = 0
 
194
  positions.append((x + offset, y, item['text']))
195
+ x += adv
196
+ # end token layout
197
 
198
  c = canvas.Canvas(output_path, pagesize=self.page_size)
199
  c.setFont(self.font_name, self.font_size)
 
355
 
356
  return seq
357
 
358
+ def _tokens_from_clusters(self, cluster_items):
359
+ """Group clusters into tokens: words (one or more non-space clusters) and space tokens.
360
+
361
+ Returns list of tokens: {'kind': 'word'|'space', 'clusters':[idxs], 'text': str, 'width': float}
362
+ """
363
+ tokens = []
364
+ i = 0
365
+ n = len(cluster_items)
366
+ while i < n:
367
+ item = cluster_items[i]
368
+ text = item['text']
369
+ if text.isspace():
370
+ # space token (keep consecutive spaces separate clusters)
371
+ tokens.append({'kind': 'space', 'clusters': [i], 'text': text, 'width': item.get('adv_pts', item.get('width', 0))})
372
+ i += 1
373
+ continue
374
+
375
+ # accumulate a word: consecutive non-space clusters
376
+ clusters = [i]
377
+ width = item.get('adv_pts', item.get('width', 0))
378
+ txt = text
379
+ i += 1
380
+ while i < n and not cluster_items[i]['text'].isspace():
381
+ clusters.append(i)
382
+ width += cluster_items[i].get('adv_pts', cluster_items[i].get('width', 0))
383
+ txt += cluster_items[i]['text']
384
+ i += 1
385
+
386
+ tokens.append({'kind': 'word', 'clusters': clusters, 'text': txt, 'width': width})
387
+
388
+ return tokens
389
+
390
 
391
  def main():
392
  ai_text = """