Upload folder using huggingface_hub
Browse files- .gitattributes +18 -0
- README.md +99 -3
- jina-reranker-v3-BF16.gguf +3 -0
- jina-reranker-v3-IQ1_M.gguf +3 -0
- jina-reranker-v3-IQ1_S.gguf +3 -0
- jina-reranker-v3-IQ2_M.gguf +3 -0
- jina-reranker-v3-IQ2_XXS.gguf +3 -0
- jina-reranker-v3-IQ3_M.gguf +3 -0
- jina-reranker-v3-IQ3_S.gguf +3 -0
- jina-reranker-v3-IQ3_XS.gguf +3 -0
- jina-reranker-v3-IQ3_XXS.gguf +3 -0
- jina-reranker-v3-IQ4_NL.gguf +3 -0
- jina-reranker-v3-IQ4_XS.gguf +3 -0
- jina-reranker-v3-Q2_K.gguf +3 -0
- jina-reranker-v3-Q3_K_M.gguf +3 -0
- jina-reranker-v3-Q4_K_M.gguf +3 -0
- jina-reranker-v3-Q5_K_M.gguf +3 -0
- jina-reranker-v3-Q5_K_S.gguf +3 -0
- jina-reranker-v3-Q6_K.gguf +3 -0
- jina-reranker-v3-Q8_0.gguf +3 -0
- projector.safetensors +3 -0
- rerank.py +247 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
jina-reranker-v3-BF16.gguf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
jina-reranker-v3-IQ1_M.gguf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
jina-reranker-v3-IQ1_S.gguf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
jina-reranker-v3-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
jina-reranker-v3-IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
jina-reranker-v3-IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
jina-reranker-v3-IQ3_S.gguf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
jina-reranker-v3-IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
jina-reranker-v3-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
jina-reranker-v3-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
jina-reranker-v3-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
jina-reranker-v3-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
jina-reranker-v3-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
jina-reranker-v3-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
jina-reranker-v3-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
jina-reranker-v3-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
jina-reranker-v3-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
jina-reranker-v3-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,99 @@
|
|
| 1 |
-
---
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pipeline_tag: text-ranking
|
| 3 |
+
tags:
|
| 4 |
+
- gguf
|
| 5 |
+
- reranker
|
| 6 |
+
- qwen3
|
| 7 |
+
- llama-cpp
|
| 8 |
+
language:
|
| 9 |
+
- multilingual
|
| 10 |
+
base_model: jinaai/jina-reranker-v3
|
| 11 |
+
base_model_relation: quantized
|
| 12 |
+
inference: false
|
| 13 |
+
license: cc-by-nc-4.0
|
| 14 |
+
library_name: llama.cpp
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# jina-reranker-v3-GGUF
|
| 18 |
+
|
| 19 |
+
GGUF quantizations of [jina-reranker-v3](https://huggingface.co/jinaai/jina-reranker-v3) using llama.cpp. A 0.6B parameter multilingual listwise reranker quantized for efficient inference.
|
| 20 |
+
|
| 21 |
+
## Requirements
|
| 22 |
+
|
| 23 |
+
- Python 3.9+ (`rerank.py` uses built-in generic type hints such as `list[str]`)
|
| 24 |
+
- llama.cpp binaries (`llama-embedding` and `llama-tokenize`)
|
| 25 |
+
- Hanxiao's llama.cpp fork recommended: https://github.com/hanxiao/llama.cpp
|
| 26 |
+
|
| 27 |
+
## Installation
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
pip install numpy safetensors
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Files
|
| 34 |
+
|
| 35 |
+
- `jina-reranker-v3-BF16.gguf` - Quantized model weights (BF16, 1.1GB)
|
| 36 |
+
- `projector.safetensors` - MLP projector weights (3MB)
|
| 37 |
+
- `rerank.py` - Reranker implementation
|
| 38 |
+
|
| 39 |
+
## Usage
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
from rerank import GGUFReranker
|
| 43 |
+
|
| 44 |
+
# Initialize reranker
|
| 45 |
+
reranker = GGUFReranker(
|
| 46 |
+
model_path="jina-reranker-v3-BF16.gguf",
|
| 47 |
+
projector_path="projector.safetensors",
|
| 48 |
+
llama_embedding_path="/path/to/llama-embedding"
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# Rerank documents
|
| 52 |
+
query = "What is the capital of France?"
|
| 53 |
+
documents = [
|
| 54 |
+
"Paris is the capital and largest city of France.",
|
| 55 |
+
"Berlin is the capital of Germany.",
|
| 56 |
+
"The Eiffel Tower is located in Paris."
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
results = reranker.rerank(query, documents)
|
| 60 |
+
|
| 61 |
+
for result in results:
|
| 62 |
+
print(f"Score: {result['relevance_score']:.4f}, Doc: {result['document'][:50]}...")
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## API
|
| 66 |
+
|
| 67 |
+
### `GGUFReranker.rerank(query, documents, top_n=None, return_embeddings=False, instruction=None)`
|
| 68 |
+
|
| 69 |
+
**Arguments:**
|
| 70 |
+
- `query` (str): Search query
|
| 71 |
+
- `documents` (List[str]): Documents to rerank
|
| 72 |
+
- `top_n` (int, optional): Return only top N results
|
| 73 |
+
- `return_embeddings` (bool): Include embeddings in output
|
| 74 |
+
- `instruction` (str, optional): Custom ranking instruction
|
| 75 |
+
|
| 76 |
+
**Returns:**
|
| 77 |
+
List of dicts with keys: `index`, `relevance_score`, `document`, and optionally `embedding`
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
## Citation
|
| 82 |
+
|
| 83 |
+
If you find `jina-reranker-v3` useful in your research, please cite the [original paper](https://arxiv.org/abs/2509.25085):
|
| 84 |
+
|
| 85 |
+
```bibtex
|
| 86 |
+
@misc{wang2025jinarerankerv3lateinteractiondocument,
|
| 87 |
+
title={jina-reranker-v3: Last but Not Late Interaction for Document Reranking},
|
| 88 |
+
author={Feng Wang and Yuqing Li and Han Xiao},
|
| 89 |
+
year={2025},
|
| 90 |
+
eprint={2509.25085},
|
| 91 |
+
archivePrefix={arXiv},
|
| 92 |
+
primaryClass={cs.CL},
|
| 93 |
+
url={https://arxiv.org/abs/2509.25085},
|
| 94 |
+
}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## License
|
| 98 |
+
|
| 99 |
+
This GGUF conversion follows the same CC BY-NC 4.0 license as the original model. For commercial usage inquiries, please [contact Jina AI](https://jina.ai/contact-sales/).
|
jina-reranker-v3-BF16.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e0e94dd584f84bab2a83254d38b93699ae5f40405422b4f419874aca68e6313
|
| 3 |
+
size 1198785888
|
jina-reranker-v3-IQ1_M.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ed1e761349bcabe9d91efb865996eade10731f9a5dbbd07ebd1eb8a97ebd77b
|
| 3 |
+
size 216655456
|
jina-reranker-v3-IQ1_S.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5dcad4805b2c4a8d86eb584fd14586d278a6b971b9ab89950d109a2d3a9f555
|
| 3 |
+
size 208619104
|
jina-reranker-v3-IQ2_M.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff7c65d50b7263fe213f228bd5d68629cf958cb2b1cc04896a47d2ffe4815fea
|
| 3 |
+
size 265512544
|
jina-reranker-v3-IQ2_XXS.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:557af10823f1195edbffa02404572242f53caee200199c8ef153701cfb56efc8
|
| 3 |
+
size 230049376
|
jina-reranker-v3-IQ3_M.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f072998e87864ad5475732dd747fd8e20e959fc89a3f3ca25684726696f9fbe
|
| 3 |
+
size 336630368
|
jina-reranker-v3-IQ3_S.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8733f7f0f640b9ffc1b7978fdd69d9271cba9f1d8069d2c7ce0311b52496ac2
|
| 3 |
+
size 323678816
|
jina-reranker-v3-IQ3_XS.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52d4227ffb43eeb0bceaab9441512010948aefb4be3994ffd4f132f248eebeff
|
| 3 |
+
size 313356896
|
jina-reranker-v3-IQ3_XXS.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3bbd42148f40b8d22347e8c25ecdbffd9a30025ded775c2c5953ba8bacf887d
|
| 3 |
+
size 279619168
|
jina-reranker-v3-IQ4_NL.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10a5b0f0d8ed9dcd4b5adeddfe371b12fadb584bcafa69f5ceacd5e48104c1ce
|
| 3 |
+
size 382169696
|
jina-reranker-v3-IQ4_XS.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0480feaf9b3bcff664707c7263d1249489afffc7170c558301efdec4c7cbfa70
|
| 3 |
+
size 368407136
|
jina-reranker-v3-Q2_K.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51198d403351d4596e6290b69aec04bdb9b4e9c7451b626941c6e2ac5b5d8a51
|
| 3 |
+
size 296841824
|
jina-reranker-v3-Q3_K_M.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed67b76ff409aea52355b7db49fa61959ae9376055d1f3b57812587b2469c5dc
|
| 3 |
+
size 347730528
|
jina-reranker-v3-Q4_K_M.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac307418158012c3c87a83e40835b2090cf979c5ac8e38f36322bc2e1ca43f51
|
| 3 |
+
size 397308512
|
jina-reranker-v3-Q5_K_M.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b31ac0efae5de481f6d4661d2eac03d66392439b5a4fe977a87212b5227168f
|
| 3 |
+
size 445018720
|
jina-reranker-v3-Q5_K_S.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0e024cbc5fa37a2fe5202b96357a3d834aeceb88a81497152873ec638c4e119
|
| 3 |
+
size 437219936
|
jina-reranker-v3-Q6_K.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b53bb9e0081896b8dc5e116c0978b1bc3248db4205f341ca7a9e908ed1e191ff
|
| 3 |
+
size 495710816
|
jina-reranker-v3-Q8_0.gguf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac96f5c868b78e4f369c78386f7767ea6ed92167b349cf6006944541f620991e
|
| 3 |
+
size 640050528
|
projector.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d67c38edb8f2010e26b9877aef98c0d1fd975ffb734a03597e318cb5de74a09
|
| 3 |
+
size 3145912
|
rerank.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import numpy as np
|
| 3 |
+
import subprocess
|
| 4 |
+
import tempfile
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional, List, Dict
|
| 7 |
+
from safetensors import safe_open
|
| 8 |
+
import json
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class MLPProjector:
    """Two-layer MLP (Linear -> ReLU -> Linear) that maps model hidden
    states into the reranker's embedding space."""

    def __init__(self, linear1_weight, linear2_weight):
        # Weights are stored [out_features, in_features] (PyTorch Linear
        # convention), so inputs are multiplied by the transpose.
        self.linear1_weight = linear1_weight
        self.linear2_weight = linear2_weight

    def __call__(self, x):
        """Apply the projection: relu(x @ W1.T) @ W2.T."""
        hidden = np.maximum(0, x @ self.linear1_weight.T)
        return hidden @ self.linear2_weight.T
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def load_projector(projector_path: str) -> MLPProjector:
    """Read the two linear-layer weight tensors from a safetensors file
    and wrap them in an MLPProjector."""
    with safe_open(projector_path, framework="numpy") as tensors:
        # Keys follow nn.Sequential indexing: layer 0 and layer 2
        # (layer 1 is the ReLU, which has no weights).
        first_weight = tensors.get_tensor("projector.0.weight")
        second_weight = tensors.get_tensor("projector.2.weight")

    return MLPProjector(first_weight, second_weight)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def sanitize_input(text: str, special_tokens: Dict[str, str]) -> str:
    """Strip every special-token marker out of *text* so user-supplied
    content cannot inject embedding-token positions into the prompt."""
    for marker in special_tokens.values():
        text = text.replace(marker, "")
    return text
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def format_docs_prompts_func(
    query: str,
    docs: list[str],
    instruction: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
) -> str:
    """Format query and documents into the listwise reranking prompt.

    Builds a ChatML-style prompt containing every passage (each followed by
    the document embed token) and the query (followed by the query embed
    token), so the model emits an embedding at each special-token position.

    Args:
        query: The search query; special tokens are stripped from it.
        docs: Passages to rank; special tokens are stripped from each.
        instruction: Optional custom ranking instruction, wrapped in
            <instruct> tags.
        special_tokens: Mapping that must contain "doc_embed_token" and
            "query_embed_token". Defaults to an empty mapping.

    Returns:
        The fully assembled prompt string.

    Raises:
        KeyError: If the required embed-token keys are missing from
            ``special_tokens`` (including when the default is used).
    """
    # None sentinel instead of a mutable default argument ({} shared
    # across calls is the classic Python pitfall).
    if special_tokens is None:
        special_tokens = {}

    query = sanitize_input(query, special_tokens)
    docs = [sanitize_input(doc, special_tokens) for doc in docs]

    prefix = (
        "<|im_start|>system\n"
        "You are a search relevance expert who can determine a ranking of the passages based on how relevant they are to the query. "
        "If the query is a question, how relevant a passage is depends on how well it answers the question. "
        "If not, try to analyze the intent of the query and assess how well each passage satisfies the intent. "
        "If an instruction is provided, you should follow the instruction when determining the ranking."
        "<|im_end|>\n<|im_start|>user\n"
    )
    suffix = "<|im_end|>\n<|im_start|>assistant\n"

    doc_emb_token = special_tokens["doc_embed_token"]
    query_emb_token = special_tokens["query_embed_token"]

    prompt = (
        f"I will provide you with {len(docs)} passages, each indicated by a numerical identifier. "
        f"Rank the passages based on their relevance to query: {query}\n"
    )

    if instruction:
        prompt += f'<instruct>\n{instruction}\n</instruct>\n'

    doc_prompts = [f'<passage id="{i}">\n{doc}{doc_emb_token}\n</passage>' for i, doc in enumerate(docs)]
    prompt += "\n".join(doc_prompts) + "\n"
    prompt += f"<query>\n{query}{query_emb_token}\n</query>"

    return prefix + prompt + suffix
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class GGUFReranker:
    """GGUF-based implementation of jina-reranker-v3.

    Obtains per-token hidden states from the quantized model via the
    ``llama-embedding`` CLI, extracts the hidden state at each special
    query/document embed token, projects those states through an MLP
    projector, and scores documents by cosine similarity to the query.
    """

    def __init__(self, model_path: str = "jina-reranker-v3-BF16.gguf", projector_path: str = "projector.safetensors",
                 llama_embedding_path: str = "/tmp/hanxiao-llama.cpp/build/bin/llama-embedding",
                 llama_tokenize_path: str = "llama-tokenize"):
        """Initialize GGUF-based reranker.

        Args:
            model_path: Path to the GGUF model file.
            projector_path: Path to the MLP projector safetensors file.
            llama_embedding_path: Path to the ``llama-embedding`` binary.
            llama_tokenize_path: Path to the ``llama-tokenize`` binary;
                defaults to resolving it from PATH (previously hard-coded,
                now configurable for consistency with the embedding binary).
        """
        self.model_path = model_path
        self.llama_embedding_path = llama_embedding_path
        self.llama_tokenize_path = llama_tokenize_path
        self.projector = load_projector(projector_path)

        # Special tokens inserted after each document and after the query;
        # the hidden state at each position becomes that item's embedding.
        self.special_tokens = {
            "query_embed_token": "<|rerank_token|>",
            "doc_embed_token": "<|embed_token|>"
        }
        # Vocabulary IDs of the two special tokens above.
        self.doc_embed_token_id = 151670
        self.query_embed_token_id = 151671

    def _write_prompt_file(self, prompt: str) -> str:
        """Write the prompt to a temp file and return its path.

        Both CLI tools read input from a file; the caller is responsible
        for deleting the file (see the ``finally`` blocks below).
        """
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
            f.write(prompt)
            return f.name

    def _get_hidden_states(self, prompt: str) -> np.ndarray:
        """Get per-token hidden states using llama-embedding CLI.

        Returns:
            Array of shape [num_tokens, hidden_size].

        Raises:
            subprocess.CalledProcessError: If the CLI exits non-zero.
        """
        prompt_file = self._write_prompt_file(prompt)

        try:
            result = subprocess.run(
                [
                    self.llama_embedding_path,
                    '-m', self.model_path,
                    '-f', prompt_file,
                    '--pooling', 'none',                  # per-token output, no pooling
                    '--embd-separator', '<#JINA_SEP#>',   # Preserve internal newlines
                    '--embd-normalize', '-1',             # raw hidden states, no normalization
                    '--embd-output-format', 'json',
                    '--ubatch-size', '512',
                    '--ctx-size', '8192',
                    '--flash-attn',
                    '-ngl', '99'
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )

            output = json.loads(result.stdout)
            embeddings = [item['embedding'] for item in output['data']]
            return np.array(embeddings)
        finally:
            os.unlink(prompt_file)

    def _tokenize(self, prompt: str) -> List[int]:
        """Tokenize prompt to find special token positions."""
        prompt_file = self._write_prompt_file(prompt)

        try:
            result = subprocess.run(
                [self.llama_tokenize_path, '-m', self.model_path, '-f', prompt_file],
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
                check=True
            )

            # Output lines look like "<id> -> '<piece>'"; collect ids in order.
            tokens = []
            for line in result.stdout.strip().split('\n'):
                if '->' in line:
                    token_id = int(line.split('->')[0].strip())
                    tokens.append(token_id)
            return tokens
        finally:
            os.unlink(prompt_file)

    def rerank(
        self,
        query: str,
        documents: List[str],
        top_n: Optional[int] = None,
        return_embeddings: bool = False,
        instruction: Optional[str] = None
    ) -> List[Dict]:
        """Rerank documents based on relevance to query.

        Args:
            query: Search query.
            documents: Documents to rerank.
            top_n: If given, return only the top N results.
            return_embeddings: Include each document's projected embedding.
            instruction: Optional custom ranking instruction.

        Returns:
            List of dicts sorted by descending relevance, each with keys
            ``index``, ``relevance_score``, ``document``, and optionally
            ``embedding``.

        Raises:
            ValueError: If the special embed tokens are missing from the
                tokenized prompt, or the document token count does not
                match the number of input documents.
        """
        # Format prompt
        prompt = format_docs_prompts_func(
            query,
            documents,
            instruction=instruction,
            special_tokens=self.special_tokens
        )

        # Get per-token hidden states using llama-embedding CLI
        embeddings = self._get_hidden_states(prompt)

        # Tokenize separately to locate the special embed tokens
        tokens_array = np.array(self._tokenize(prompt))

        query_embed_positions_in_tokens = np.where(tokens_array == self.query_embed_token_id)[0]
        doc_embed_positions_in_tokens = np.where(tokens_array == self.doc_embed_token_id)[0]

        if len(query_embed_positions_in_tokens) == 0:
            raise ValueError(f"Query embed token (ID {self.query_embed_token_id}) not found in input")

        if len(doc_embed_positions_in_tokens) == 0:
            raise ValueError(f"Document embed tokens (ID {self.doc_embed_token_id}) not found in input")

        # Guard against silent truncation: zip() below would quietly drop
        # documents or scores if the counts diverged.
        if len(doc_embed_positions_in_tokens) != len(documents):
            raise ValueError(
                f"Found {len(doc_embed_positions_in_tokens)} document embed tokens "
                f"for {len(documents)} documents; tokenization mismatch"
            )

        # llama-embedding strips trailing newlines but preserves internal newlines (via --embd-separator)
        # Token positions map directly to embedding indices
        query_pos = query_embed_positions_in_tokens[0]
        doc_positions = doc_embed_positions_in_tokens

        # Extract embeddings at special token positions
        query_hidden = embeddings[query_pos:query_pos+1]  # [1, hidden_size]
        doc_hidden = embeddings[doc_positions]            # [num_docs, hidden_size]

        # Project embeddings into the shared ranking space
        query_embeds = self.projector(query_hidden)  # [1, embed_dim]
        doc_embeds = self.projector(doc_hidden)      # [num_docs, embed_dim]

        # Cosine similarity: the [1, embed_dim] query row broadcasts across
        # the document axis, so no np.tile copy is needed.
        dot_product = np.sum(doc_embeds * query_embeds, axis=-1)        # [num_docs]
        doc_norm = np.sqrt(np.sum(doc_embeds * doc_embeds, axis=-1))    # [num_docs]
        query_norm = np.sqrt(np.sum(query_embeds * query_embeds, axis=-1))  # [1], broadcasts
        scores = dot_product / (doc_norm * query_norm)                  # [num_docs]

        # Create results
        results = []
        for idx, (doc, score, embed) in enumerate(zip(documents, scores, doc_embeds)):
            result = {
                "index": idx,
                "relevance_score": float(score),
                "document": doc
            }
            if return_embeddings:
                result["embedding"] = embed.tolist()
            results.append(result)

        # Sort by score descending
        results.sort(key=lambda x: x["relevance_score"], reverse=True)

        # Return top_n if specified
        if top_n is not None:
            results = results[:top_n]

        return results
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
if __name__ == "__main__":
    # Smoke test: rerank three sample passages against one query.
    demo_reranker = GGUFReranker()

    demo_query = "What is the capital of France?"
    demo_documents = [
        "Paris is the capital and largest city of France.",
        "Berlin is the capital of Germany.",
        "The Eiffel Tower is located in Paris.",
    ]

    for result in demo_reranker.rerank(demo_query, demo_documents):
        print(f"Doc {result['index']}: {result['relevance_score']:.4f} - {result['document'][:50]}...")
|