Update handler.py
handler.py CHANGED: +7 -20
@@ -19,14 +19,10 @@ class EndpointHandler:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
-        # Load processor and model
+        # Load processor and model - EXACTLY like the notebook
         self.processor = AutoProcessor.from_pretrained(path)
-        self.model = AutoModelForVision2Seq.from_pretrained(
-            path,
-            torch_dtype=self.dtype,
-            device_map="auto"
-        )
-        self.model.eval()
+        self.model = AutoModelForVision2Seq.from_pretrained(path)
+        self.model.to(self.device)
 
         # System instruction (same as training)
         self.system_instruction = (
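Note on this hunk: from_pretrained(path) with no extra kwargs loads the weights in the checkpoint's default dtype (typically float32), and .to(self.device) then moves the whole module to the single target device, so self.dtype is no longer applied to the model and model.eval() is gone. If the reduced-precision load were ever wanted back, a minimal sketch (an assumption, not part of this commit) would be:

    # Sketch only, not part of this commit: restores the old bf16/fp32 load
    # while keeping the new single-device placement. path, self.device and
    # self.dtype are the values defined in __init__ above.
    self.model = AutoModelForVision2Seq.from_pretrained(
        path,
        torch_dtype=self.dtype,  # bfloat16 on GPU, float32 on CPU
    )
    self.model.to(self.device)
    self.model.eval()  # inference mode; the committed code omits this call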
@@ -108,7 +104,7 @@ class EndpointHandler:
         # Get generation parameters
         parameters = data.get("parameters", {})
         max_new_tokens = parameters.get("max_new_tokens", 100)
-        temperature = parameters.get("temperature", 0.1)
+        temperature = parameters.get("temperature", 0.1)
 
         # Format messages (same structure as training)
         messages = [
@@ -127,7 +123,7 @@ class EndpointHandler:
             }
         ]
 
-        # Process inputs using chat template
+        # Process inputs using chat template - EXACTLY like notebook
         text_input = self.processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -143,22 +139,13 @@ class EndpointHandler:
         )
 
         # Move to device
-        model_inputs = {
-            k: v.to(self.device) if torch.is_tensor(v) else v
-            for k, v in model_inputs.items()
-        }
-
-        # Cast pixel_values to correct dtype
-        if "pixel_values" in model_inputs:
-            model_inputs["pixel_values"] = model_inputs["pixel_values"].to(self.dtype)
+        model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
 
         # Generate prediction
         with torch.no_grad():
             generated_ids = self.model.generate(
                 **model_inputs,
-                max_new_tokens=max_new_tokens
-                temperature=temperature,
-                do_sample=temperature > 0,
+                max_new_tokens=max_new_tokens
             )
 
         # Decode only the generated tokens (skip the input prompt)
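Two observations on this hunk: the device move drops both the torch.is_tensor guard and the explicit pixel_values cast, which is consistent with the model now staying in its default dtype; and the removed generate() call reads max_new_tokens=max_new_tokens with no trailing comma before temperature=temperature, which as shown would be a SyntaxError. The committed call passes only max_new_tokens, so generation falls back to greedy decoding and the temperature parsed earlier goes unused. If sampling were wanted again, a corrected sketch (an assumption, not part of this commit) would be:

    # Sketch only, not part of this commit: re-enables sampling with the
    # missing comma fixed. The committed code passes max_new_tokens alone.
    generated_ids = self.model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=temperature > 0,  # greedy when temperature is 0
    )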
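For a quick local check of the updated handler, a minimal smoke test might look like the sketch below. It assumes the standard Hugging Face Inference Endpoints custom-handler contract (EndpointHandler(path) plus __call__(data)); the "inputs" key and the base64 image payload are assumptions, since the hunks above do not show the input-parsing code.

    # Sketch only: local smoke test under the assumptions stated above.
    from handler import EndpointHandler

    handler = EndpointHandler(path=".")  # loads processor and model once
    payload = {
        "inputs": "data:image/png;base64,...",  # placeholder image payload
        "parameters": {"max_new_tokens": 100, "temperature": 0.1},
    }
    print(handler(payload))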