Upload sampling.py with huggingface_hub
sampling.py  +41 -21  CHANGED
@@ -16,26 +16,41 @@ def apply_top_k_filtering(logits: torch.Tensor, k: int) -> torch.Tensor:
     """
     Apply top-k filtering to logits, with non-top-k values set to -inf
     """
+    if k is None or k <= 0:
+        return torch.full_like(logits, float("-inf"))
+    k = min(k, logits.size(-1))
+    top_k_values, top_k_indices = torch.topk(logits, k, dim=-1)
+    filtered = torch.full_like(logits, float("-inf"))
+    filtered.scatter_(-1, top_k_indices, top_k_values)
+    return filtered
 
 
-def apply_top_p_filtering(logits: torch.Tensor, p: float) -> torch.Tensor:
+def apply_top_p_filtering(logits: torch.Tensor, p: float, min_tokens_to_keep: int = 1) -> torch.Tensor:
     """
     Apply top-p (nucleus) filtering to logits, with tokens beyond threshold set to -inf
     """
+    if p <= 0:
+        p = 1e-8
+    if p >= 1:
+        return logits
+
     sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
 
+    probs = torch.softmax(sorted_logits, dim=-1)
+    cumulative_probs = torch.cumsum(probs, dim=-1)
+
     sorted_indices_to_remove = cumulative_probs > p
+
+    if min_tokens_to_keep > 0:
+        sorted_indices_to_remove[..., :min_tokens_to_keep] = False
+
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+    sorted_indices_to_remove[..., 0] = False
+
+    indices_to_remove = torch.zeros_like(sorted_indices_to_remove)
+    indices_to_remove.scatter_(-1, sorted_indices, sorted_indices_to_remove)
 
-    return logits.masked_fill(indices_to_remove, float('-inf'))
+    return logits.masked_fill(indices_to_remove, float("-inf"))
 
 
 @torch.no_grad()
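For context, a minimal usage sketch of the two filters above (assuming sampling.py is importable from the working directory; the toy logits are invented for illustration):

```python
import torch

from sampling import apply_top_k_filtering, apply_top_p_filtering

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])

# Top-k keeps the k largest logits and sets the rest to -inf.
print(apply_top_k_filtering(logits, k=2))
# tensor([[2., 1., -inf, -inf, -inf]])

# Top-p keeps tokens until their cumulative probability exceeds p; the
# right-shift of the removal mask means the token that crosses the
# threshold survives too, so at least one token is always kept.
filtered = apply_top_p_filtering(logits, p=0.9)
print(torch.softmax(filtered, dim=-1))  # mass renormalized over survivors
```

Masking with -inf rather than zeroing probabilities keeps the later softmax a proper renormalization over the surviving tokens.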
@@ -189,18 +204,23 @@ def diffusion_sample(
         # Fall back to positional argument
         model_output = model(tokens)
 
-    logits = model_output.logits
+    # Apply temperature scaling (if temperature == 0, treat as 1.0 for greedy)
+    logits = model_output.logits
+    if temperature > 0:
+        logits = logits / temperature
 
+    # Apply filtering only when not in greedy mode
+    # Order matches reference: top_p before top_k
+    if not greedy:
+        if top_p is not None and 0 < top_p < 1.0:
+            logits = apply_top_p_filtering(logits, top_p)
 
+        if top_k is not None and top_k > 0:
+            logits = apply_top_k_filtering(logits, top_k)
 
+    # Compute probabilities for sampling and metrics
+    probs = torch.softmax(logits, dim=-1)
+    logp = torch.log(probs + 1e-10)  # Add epsilon for numerical stability
 
     if greedy:
         pred_next = logp.argmax(-1)
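The hunk above scales the logits by temperature before filtering; a standalone sketch with made-up values shows the effect of that division:

```python
import torch

logits = torch.tensor([2.0, 1.0, 0.0])

for temperature in (0.5, 1.0, 2.0):
    print(temperature, torch.softmax(logits / temperature, dim=-1))

# temperature < 1 sharpens the distribution toward the argmax;
# temperature > 1 flattens it toward uniform. The `if temperature > 0`
# guard above also avoids division by zero in the greedy case.
```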
@@ -208,10 +228,10 @@ def diffusion_sample(
         # Sample from categorical distribution with proper RNG handling
         if generator is not None:
             # Use multinomial with generator for reproducible sampling
-            probs = logp.exp()
             pred_next = torch.multinomial(probs.view(-1, probs.size(-1)), 1, generator=generator).squeeze(-1).view(probs.shape[:-1])
         else:
+            # Sample from categorical using probabilities
+            pred_next = torch.distributions.Categorical(probs=probs).sample()
 
         conf_next = torch.gather(logp, -1, pred_next.unsqueeze(-1)).squeeze(-1)
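Finally, a small sketch (arbitrary shapes and seed) of why the generator branch matters for reproducibility:

```python
import torch

probs = torch.softmax(torch.randn(4, 8), dim=-1)

g1 = torch.Generator().manual_seed(0)
g2 = torch.Generator().manual_seed(0)

a = torch.multinomial(probs, 1, generator=g1).squeeze(-1)
b = torch.multinomial(probs, 1, generator=g2).squeeze(-1)
assert torch.equal(a, b)  # same seed, identical draws

# The else branch's Categorical(probs=probs).sample() draws from the same
# distribution but consumes the global RNG state instead of a local generator.
```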