DoctorChaos
/

selective-prompt-anchoring

Safetensors

Model card Files Files and versions

xet

Community

DoctorChaos committed on May 14

Commit

a2a2f0e

verified ·

1 Parent(s): d82fdf0

Upload spa_hf.py with huggingface_hub

Browse files

Files changed (1) hide show

spa_hf.py +197 -0

spa_hf.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+from transformers import AutoModel, AutoTokenizer
+# Import core SPA functionality
+from spa import SPALogitsProcessor, spa_tokenize, preprocess_anchors, create_default_attention_mask
+class SPAModel(nn.Module, PyTorchModelHubMixin):
+    """
+    Selective Prompt Anchoring (SPA) model with Hugging Face Hub integration.
+    This model wraps a base LLM and provides the SPA functionality with
+    the ability to be shared and downloaded from the Hugging Face Hub.
+    """
+    def __init__(
+        self,
+        base_model_name="Qwen/Qwen3-0.6B",
+        anchoring_strength=2,
+        modulated_by_prob=True,
+        use_attention_mask=True,
+        device_map="auto",
+        **kwargs
+    ):
+        super().__init__()
+        # Store configuration parameters
+        self.base_model_name = base_model_name
+        self.anchoring_strength = anchoring_strength
+        self.modulated_by_prob = modulated_by_prob
+        self.use_attention_mask = use_attention_mask
+        self.device_map = device_map
+        # Load the base model and tokenizer - using AutoModel to handle any model type
+        self.model = AutoModel.from_pretrained(base_model_name, device_map=device_map, **kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+        # Set default pad token if needed
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+            if hasattr(self.model, "config"):
+                self.model.config.pad_token_id = self.model.config.eos_token_id
+        # Determine device
+        if hasattr(self.model, "device"):
+            self.device = self.model.device
+        else:
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    def forward(self, input_ids, attention_mask=None, **kwargs):
+        """Pass through to the base model's forward method"""
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
+    def generate_with_spa(
+        self,
+        prompt,
+        anchors=None,
+        anchoring_strength=None,
+        modulated_by_prob=None,
+        use_attention_mask=None,
+        max_new_tokens=100,
+        min_new_tokens=1,
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.95,
+        top_k=50,
+        stream=False,
+        **kwargs
+    ):
+        """
+        Generate text using Selective Prompt Anchoring.
+        Args:
+            prompt: Text or messages to generate from
+            anchors: List of anchor strings to influence generation
+            anchoring_strength: How much to weight the anchored version
+            modulated_by_prob: Whether to modulate strength by token probability
+            use_attention_mask: Whether to use attention masking for anchor tokens
+            max_new_tokens: Maximum number of tokens to generate
+            min_new_tokens: Minimum number of tokens to generate
+            do_sample: Whether to use sampling for generation
+            temperature: Sampling temperature
+            top_p: Top-p sampling parameter
+            top_k: Top-k sampling parameter
+            stream: Whether to stream the output
+        Returns:
+            Generated text (or streamer if stream=True)
+        """
+        # Use instance defaults if parameters are not provided
+        anchoring_strength = anchoring_strength or self.anchoring_strength
+        modulated_by_prob = modulated_by_prob if modulated_by_prob is not None else self.modulated_by_prob
+        use_attention_mask = use_attention_mask if use_attention_mask is not None else self.use_attention_mask
+        # Default to empty list if anchors not provided
+        if anchors is None:
+            anchors = []
+        # Preprocess anchors
+        anchors = preprocess_anchors(anchors)
+        # Tokenize with SPA
+        main_inputs, aux_inputs, mask_token = spa_tokenize(
+            prompt_with_anchors=prompt,
+            global_anchors=anchors,
+            tokenizer=self.tokenizer,
+            device=self.device
+        )
+        # Create SPA logits processor
+        spa_processor = SPALogitsProcessor(
+            aux_model=self.model,
+            aux_input_ids=aux_inputs,
+            strength=anchoring_strength,
+            modulated_by_prob=modulated_by_prob,
+            use_attention_mask=use_attention_mask,
+            mask_token=mask_token,
+            tokenizer=self.tokenizer
+        )
+        # Get attention mask
+        attention_mask = create_default_attention_mask(main_inputs, device=self.device)
+        # Set up generation kwargs
+        generation_kwargs = {
+            "input_ids": main_inputs,
+            "attention_mask": attention_mask,
+            "logits_processor": [spa_processor],
+            "min_new_tokens": min_new_tokens,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": do_sample,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            **kwargs
+        }
+        if stream:
+            from transformers import TextIteratorStreamer
+            import threading
+            # Set up streamer
+            streamer = TextIteratorStreamer(
+                self.tokenizer,
+                skip_special_tokens=True,
+                skip_prompt=True
+            )
+            generation_kwargs["streamer"] = streamer
+            # Start generation in a separate thread
+            generation_thread = threading.Thread(
+                target=self.model.generate,
+                kwargs=generation_kwargs
+            )
+            generation_thread.start()
+            # Return streamer for token-by-token output
+            return streamer
+        else:
+            # Normal generation (non-streaming)
+            output_sequences = self.model.generate(**generation_kwargs)
+            # Decode the output
+            input_length = main_inputs.shape[1]
+            new_tokens = output_sequences[0][input_length:]
+            generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
+            return generated_text
+# Create a helper function to load models directly from hub
+def load_spa_model(
+    model_name="magic-yuantian/selective-prompt-anchoring",
+    base_model_name="meta-llama/Llama-3.1-8B-Instruct",
+    **kwargs
+):
+    """
+    Load a SPAModel from the Hugging Face Hub or create a new one.
+    Args:
+        model_name: Name or path of the SPA model in the Hub
+        base_model_name: The base model to use (if creating a new model)
+        **kwargs: Additional arguments to pass to from_pretrained or __init__
+    Returns:
+        A SPAModel instance
+    """
+    try:
+        # Try to load from hub
+        model = SPAModel.from_pretrained(model_name, **kwargs)
+        return model
+    except Exception as e:
+        print(f"Error loading model from hub: {e}")
+        print(f"Creating a new SPAModel with base model {base_model_name}")
+        # Create a new model
+        model = SPAModel(base_model_name=base_model_name, **kwargs)
+        return model