Update handler.py
handler.py CHANGED: +7 -20
@@ -19,14 +19,10 @@ class EndpointHandler:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
-        # Load processor and model
+        # Load processor and model - EXACTLY like the notebook
         self.processor = AutoProcessor.from_pretrained(path)
-        self.model = AutoModelForVision2Seq.from_pretrained(
-            path,
-            torch_dtype=self.dtype,
-            device_map="auto"
-        )
-        self.model.eval()
+        self.model = AutoModelForVision2Seq.from_pretrained(path)
+        self.model.to(self.device)
 
         # System instruction (same as training)
         self.system_instruction = (
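Note on this hunk: from_pretrained(path) with no extra kwargs loads the weights in the checkpoint's default dtype (typically float32), and .to(self.device) then moves the whole module to the single target device, so self.dtype is no longer applied to the model and model.eval() is gone. If the reduced-precision load were ever wanted back, a minimal sketch (an assumption, not part of this commit) would be:

    # Sketch only, not part of this commit: restores the old bf16/fp32 load
    # while keeping the new single-device placement. path, self.device and
    # self.dtype are the values defined in __init__ above.
    self.model = AutoModelForVision2Seq.from_pretrained(
        path,
        torch_dtype=self.dtype,  # bfloat16 on GPU, float32 on CPU
    )
    self.model.to(self.device)
    self.model.eval()  # inference mode; the committed code omits this call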
@@ -108,7 +104,7 @@ class EndpointHandler:
         # Get generation parameters
         parameters = data.get("parameters", {})
         max_new_tokens = parameters.get("max_new_tokens", 100)
-        temperature = parameters.get("temperature", 0.1)
+        temperature = parameters.get("temperature", 0.1)
 
         # Format messages (same structure as training)
         messages = [
@@ -127,7 +123,7 @@ class EndpointHandler:
             }
         ]
 
-        # Process inputs using chat template
+        # Process inputs using chat template - EXACTLY like notebook
         text_input = self.processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -143,22 +139,13 @@ class EndpointHandler:
         )
 
         # Move to device
-        model_inputs = {
-            k: v.to(self.device) if torch.is_tensor(v) else v
-            for k, v in model_inputs.items()
-        }
-
-        # Cast pixel_values to correct dtype
-        if "pixel_values" in model_inputs:
-            model_inputs["pixel_values"] = model_inputs["pixel_values"].to(self.dtype)
+        model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
 
         # Generate prediction
         with torch.no_grad():
             generated_ids = self.model.generate(
                 **model_inputs,
-                max_new_tokens=max_new_tokens
-                temperature=temperature,
-                do_sample=temperature > 0,
+                max_new_tokens=max_new_tokens
             )
 
         # Decode only the generated tokens (skip the input prompt)
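Two observations on this hunk: the device move drops both the torch.is_tensor guard and the explicit pixel_values cast, which is consistent with the model now staying in its default dtype; and the removed generate() call reads max_new_tokens=max_new_tokens with no trailing comma before temperature=temperature, which as shown would be a SyntaxError. The committed call passes only max_new_tokens, so generation falls back to greedy decoding and the temperature parsed earlier goes unused. If sampling were wanted again, a corrected sketch (an assumption, not part of this commit) would be:

    # Sketch only, not part of this commit: re-enables sampling with the
    # missing comma fixed. The committed code passes max_new_tokens alone.
    generated_ids = self.model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=temperature > 0,  # greedy when temperature is 0
    )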
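For a quick local check of the updated handler, a minimal smoke test might look like the sketch below. It assumes the standard Hugging Face Inference Endpoints custom-handler contract (EndpointHandler(path) plus __call__(data)); the "inputs" key and the base64 image payload are assumptions, since the hunks above do not show the input-parsing code.

    # Sketch only: local smoke test under the assumptions stated above.
    from handler import EndpointHandler

    handler = EndpointHandler(path=".")  # loads processor and model once
    payload = {
        "inputs": "data:image/png;base64,...",  # placeholder image payload
        "parameters": {"max_new_tokens": 100, "temperature": 0.1},
    }
    print(handler(payload))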