Update README.md

README.md (changed):

@@ -64,7 +64,7 @@ export OMP_NUM_THREADS=4
 
 CONTEXT_LENGTH=32768
 vllm serve \
-    __YOUR_PATH__/
+    __YOUR_PATH__/QuantTrio/DeepSeek-V3.2-Speciale-AWQ \
     --served-model-name MY_MODEL_NAME \
     --enable-auto-tool-choice \
     --tool-call-parser deepseek_v31 \
@@ -75,7 +75,7 @@ vllm serve \
     --gpu-memory-utilization 0.9 \
     --tensor-parallel-size 8 \
     --enable-expert-parallel \ # optional
-    --speculative-config '{"model": "__YOUR_PATH__/
+    --speculative-config '{"model": "__YOUR_PATH__/QuantTrio/DeepSeek-V3.2-Speciale-AWQ", "num_speculative_tokens": 1}' \ # optional, ~50% throughput increase is observed
     --trust-remote-code \
     --host 0.0.0.0 \
     --port 8000
@@ -95,7 +95,7 @@ vllm serve \
 ### 【Model Download】
 ```python
 from modelscope import snapshot_download
-snapshot_download('
+snapshot_download('QuantTrio/DeepSeek-V3.2-Speciale-AWQ', cache_dir="your_local_path")
 ```
 
 ### 【Overview】
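For context on the download hunk: the sketch below shows how the downloaded weights connect to the `vllm serve` command updated above. It assumes ModelScope's `snapshot_download` returns the local directory it populated (its documented behaviour) and reuses the `cache_dir="your_local_path"` placeholder from the README; nothing else is implied by the commit itself.

```python
# Sketch only: ties the README's download snippet to the vllm serve command above.
# Assumes modelscope's snapshot_download returns the local directory it populated.
from modelscope import snapshot_download

model_dir = snapshot_download(
    'QuantTrio/DeepSeek-V3.2-Speciale-AWQ',
    cache_dir="your_local_path",  # same placeholder cache_dir as in the README snippet
)

# model_dir is the path that stands in for __YOUR_PATH__/QuantTrio/DeepSeek-V3.2-Speciale-AWQ
# in both the positional model argument and the --speculative-config JSON.
print(model_dir)
```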
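Once the server from the diff is running, a quick smoke test through vLLM's OpenAI-compatible endpoint is the easiest check. This is a minimal sketch, assuming the flags shown above (`--served-model-name MY_MODEL_NAME`, `--host 0.0.0.0`, `--port 8000`) and the `openai` Python package; the prompt and the placeholder API key are illustrative, not part of the commit.

```python
# Minimal smoke test against the OpenAI-compatible endpoint started above.
# Assumes the server runs locally on port 8000 with served model name MY_MODEL_NAME.
from openai import OpenAI

# vLLM only validates the key if the server was started with --api-key,
# so a placeholder value is fine here.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="MY_MODEL_NAME",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
)
print(resp.choices[0].message.content)
```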