hanxiao commited on
Commit
4bbace8
·
verified ·
1 Parent(s): f4013f7

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ jina-reranker-v3-BF16.gguf filter=lfs diff=lfs merge=lfs -text
37
+ jina-reranker-v3-IQ1_M.gguf filter=lfs diff=lfs merge=lfs -text
38
+ jina-reranker-v3-IQ1_S.gguf filter=lfs diff=lfs merge=lfs -text
39
+ jina-reranker-v3-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
40
+ jina-reranker-v3-IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text
41
+ jina-reranker-v3-IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
42
+ jina-reranker-v3-IQ3_S.gguf filter=lfs diff=lfs merge=lfs -text
43
+ jina-reranker-v3-IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
44
+ jina-reranker-v3-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
45
+ jina-reranker-v3-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
46
+ jina-reranker-v3-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
47
+ jina-reranker-v3-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
48
+ jina-reranker-v3-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
49
+ jina-reranker-v3-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
50
+ jina-reranker-v3-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
51
+ jina-reranker-v3-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
52
+ jina-reranker-v3-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
53
+ jina-reranker-v3-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,99 @@
1
- ---
2
- license: cc-by-nc-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: text-ranking
3
+ tags:
4
+ - gguf
5
+ - reranker
6
+ - qwen3
7
+ - llama-cpp
8
+ language:
9
+ - multilingual
10
+ base_model: jinaai/jina-reranker-v3
11
+ base_model_relation: quantized
12
+ inference: false
13
+ license: cc-by-nc-4.0
14
+ library_name: llama.cpp
15
+ ---
16
+
17
+ # jina-reranker-v3-GGUF
18
+
19
+ GGUF quantizations of [jina-reranker-v3](https://huggingface.co/jinaai/jina-reranker-v3) using llama.cpp. A 0.6B parameter multilingual listwise reranker quantized for efficient inference.
20
+
21
+ ## Requirements
22
+
23
+ - Python 3.8+
24
+ - llama.cpp binaries (`llama-embedding` and `llama-tokenize`)
25
+ - Hanxiao's llama.cpp fork recommended: https://github.com/hanxiao/llama.cpp
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install numpy safetensors
31
+ ```
32
+
33
+ ## Files
34
+
35
+ - `jina-reranker-v3-BF16.gguf` - Quantized model weights (BF16, 1.1GB)
36
+ - `projector.safetensors` - MLP projector weights (3MB)
37
+ - `rerank.py` - Reranker implementation
38
+
39
+ ## Usage
40
+
41
+ ```python
42
+ from rerank import GGUFReranker
43
+
44
+ # Initialize reranker
45
+ reranker = GGUFReranker(
46
+ model_path="jina-reranker-v3-BF16.gguf",
47
+ projector_path="projector.safetensors",
48
+ llama_embedding_path="/path/to/llama-embedding"
49
+ )
50
+
51
+ # Rerank documents
52
+ query = "What is the capital of France?"
53
+ documents = [
54
+ "Paris is the capital and largest city of France.",
55
+ "Berlin is the capital of Germany.",
56
+ "The Eiffel Tower is located in Paris."
57
+ ]
58
+
59
+ results = reranker.rerank(query, documents)
60
+
61
+ for result in results:
62
+ print(f"Score: {result['relevance_score']:.4f}, Doc: {result['document'][:50]}...")
63
+ ```
64
+
65
+ ## API
66
+
67
+ ### `GGUFReranker.rerank(query, documents, top_n=None, return_embeddings=False, instruction=None)`
68
+
69
+ **Arguments:**
70
+ - `query` (str): Search query
71
+ - `documents` (List[str]): Documents to rerank
72
+ - `top_n` (int, optional): Return only top N results
73
+ - `return_embeddings` (bool): Include embeddings in output
74
+ - `instruction` (str, optional): Custom ranking instruction
75
+
76
+ **Returns:**
77
+ List of dicts with keys: `index`, `relevance_score`, `document`, and optionally `embedding`
78
+
79
+
80
+
81
+ ## Citation
82
+
83
+ If you find `jina-reranker-v3` useful in your research, please cite the [original paper](https://arxiv.org/abs/2509.25085):
84
+
85
+ ```bibtex
86
+ @misc{wang2025jinarerankerv3lateinteractiondocument,
87
+ title={jina-reranker-v3: Last but Not Late Interaction for Document Reranking},
88
+ author={Feng Wang and Yuqing Li and Han Xiao},
89
+ year={2025},
90
+ eprint={2509.25085},
91
+ archivePrefix={arXiv},
92
+ primaryClass={cs.CL},
93
+ url={https://arxiv.org/abs/2509.25085},
94
+ }
95
+ ```
96
+
97
+ ## License
98
+
99
+ This GGUF implementation follows the same CC BY-NC 4.0 license as the original model. For commercial usage inquiries, please [contact Jina AI](https://jina.ai/contact-sales/).
jina-reranker-v3-BF16.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e0e94dd584f84bab2a83254d38b93699ae5f40405422b4f419874aca68e6313
3
+ size 1198785888
jina-reranker-v3-IQ1_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed1e761349bcabe9d91efb865996eade10731f9a5dbbd07ebd1eb8a97ebd77b
3
+ size 216655456
jina-reranker-v3-IQ1_S.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5dcad4805b2c4a8d86eb584fd14586d278a6b971b9ab89950d109a2d3a9f555
3
+ size 208619104
jina-reranker-v3-IQ2_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff7c65d50b7263fe213f228bd5d68629cf958cb2b1cc04896a47d2ffe4815fea
3
+ size 265512544
jina-reranker-v3-IQ2_XXS.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:557af10823f1195edbffa02404572242f53caee200199c8ef153701cfb56efc8
3
+ size 230049376
jina-reranker-v3-IQ3_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f072998e87864ad5475732dd747fd8e20e959fc89a3f3ca25684726696f9fbe
3
+ size 336630368
jina-reranker-v3-IQ3_S.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8733f7f0f640b9ffc1b7978fdd69d9271cba9f1d8069d2c7ce0311b52496ac2
3
+ size 323678816
jina-reranker-v3-IQ3_XS.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d4227ffb43eeb0bceaab9441512010948aefb4be3994ffd4f132f248eebeff
3
+ size 313356896
jina-reranker-v3-IQ3_XXS.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3bbd42148f40b8d22347e8c25ecdbffd9a30025ded775c2c5953ba8bacf887d
3
+ size 279619168
jina-reranker-v3-IQ4_NL.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a5b0f0d8ed9dcd4b5adeddfe371b12fadb584bcafa69f5ceacd5e48104c1ce
3
+ size 382169696
jina-reranker-v3-IQ4_XS.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0480feaf9b3bcff664707c7263d1249489afffc7170c558301efdec4c7cbfa70
3
+ size 368407136
jina-reranker-v3-Q2_K.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51198d403351d4596e6290b69aec04bdb9b4e9c7451b626941c6e2ac5b5d8a51
3
+ size 296841824
jina-reranker-v3-Q3_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed67b76ff409aea52355b7db49fa61959ae9376055d1f3b57812587b2469c5dc
3
+ size 347730528
jina-reranker-v3-Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac307418158012c3c87a83e40835b2090cf979c5ac8e38f36322bc2e1ca43f51
3
+ size 397308512
jina-reranker-v3-Q5_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b31ac0efae5de481f6d4661d2eac03d66392439b5a4fe977a87212b5227168f
3
+ size 445018720
jina-reranker-v3-Q5_K_S.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0e024cbc5fa37a2fe5202b96357a3d834aeceb88a81497152873ec638c4e119
3
+ size 437219936
jina-reranker-v3-Q6_K.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53bb9e0081896b8dc5e116c0978b1bc3248db4205f341ca7a9e908ed1e191ff
3
+ size 495710816
jina-reranker-v3-Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac96f5c868b78e4f369c78386f7767ea6ed92167b349cf6006944541f620991e
3
+ size 640050528
projector.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d67c38edb8f2010e26b9877aef98c0d1fd975ffb734a03597e318cb5de74a09
3
+ size 3145912
rerank.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import numpy as np
3
+ import subprocess
4
+ import tempfile
5
+ import os
6
+ from typing import Optional, List, Dict
7
+ from safetensors import safe_open
8
+ import json
9
+
10
+
11
class MLPProjector:
    """Two-layer MLP (linear -> ReLU -> linear) mapping hidden states to the embedding space."""

    def __init__(self, linear1_weight, linear2_weight):
        # Weights are stored [out_features, in_features], as exported to safetensors.
        self.linear1_weight = linear1_weight
        self.linear2_weight = linear2_weight

    def __call__(self, x):
        """Project `x` through the first linear layer, a ReLU, then the second linear layer."""
        hidden = np.maximum(x @ self.linear1_weight.T, 0)
        return hidden @ self.linear2_weight.T
25
+
26
+
27
def load_projector(projector_path: str) -> MLPProjector:
    """Read the two projector weight matrices from a safetensors file and wrap them in an MLPProjector."""
    with safe_open(projector_path, framework="numpy") as tensors:
        first_layer = tensors.get_tensor("projector.0.weight")
        second_layer = tensors.get_tensor("projector.2.weight")
    return MLPProjector(first_layer, second_layer)
34
+
35
+
36
def sanitize_input(text: str, special_tokens: Dict[str, str]) -> str:
    """Strip every configured special token from `text` so user input cannot inject control tokens."""
    cleaned = text
    for marker in special_tokens.values():
        cleaned = cleaned.replace(marker, "")
    return cleaned
41
+
42
+
43
def format_docs_prompts_func(
    query: str,
    docs: List[str],
    instruction: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
) -> str:
    """Format a query and candidate documents into the listwise reranking prompt.

    Args:
        query: The search query (special tokens are stripped from it).
        docs: Documents to rank; each is wrapped in a numbered ``<passage>`` tag
            followed by the document embed token.
        instruction: Optional ranking instruction inserted in an ``<instruct>`` block.
        special_tokens: Mapping that must contain ``doc_embed_token`` and
            ``query_embed_token``; all of its values are also removed from the
            inputs. Raises KeyError if the two required keys are missing.

    Returns:
        The full chat-formatted prompt string (system + user + assistant header).
    """
    # None-sentinel instead of a mutable `{}` default (shared-default pitfall);
    # an empty mapping behaves exactly as before.
    if special_tokens is None:
        special_tokens = {}

    query = sanitize_input(query, special_tokens)
    docs = [sanitize_input(doc, special_tokens) for doc in docs]

    prefix = (
        "<|im_start|>system\n"
        "You are a search relevance expert who can determine a ranking of the passages based on how relevant they are to the query. "
        "If the query is a question, how relevant a passage is depends on how well it answers the question. "
        "If not, try to analyze the intent of the query and assess how well each passage satisfies the intent. "
        "If an instruction is provided, you should follow the instruction when determining the ranking."
        "<|im_end|>\n<|im_start|>user\n"
    )
    suffix = "<|im_end|>\n<|im_start|>assistant\n"

    doc_emb_token = special_tokens["doc_embed_token"]
    query_emb_token = special_tokens["query_embed_token"]

    prompt = (
        f"I will provide you with {len(docs)} passages, each indicated by a numerical identifier. "
        f"Rank the passages based on their relevance to query: {query}\n"
    )

    if instruction:
        prompt += f'<instruct>\n{instruction}\n</instruct>\n'

    doc_prompts = [f'<passage id="{i}">\n{doc}{doc_emb_token}\n</passage>' for i, doc in enumerate(docs)]
    prompt += "\n".join(doc_prompts) + "\n"
    prompt += f"<query>\n{query}{query_emb_token}\n</query>"

    return prefix + prompt + suffix
79
+
80
+
81
class GGUFReranker:
    """GGUF-based implementation of jina-reranker-v3.

    Runs the quantized model through the llama.cpp CLI tools — `llama-embedding`
    for per-token hidden states and `llama-tokenize` for token ids — then applies
    a small numpy MLP projector and cosine similarity to score documents against
    a query.
    """

    def __init__(self, model_path: str = "jina-reranker-v3-BF16.gguf", projector_path: str = "projector.safetensors",
                 llama_embedding_path: str = "/tmp/hanxiao-llama.cpp/build/bin/llama-embedding",
                 llama_tokenize_path: str = "llama-tokenize"):
        """Initialize GGUF-based reranker.

        Args:
            model_path: Path to the quantized GGUF model.
            projector_path: Path to the safetensors file with the MLP projector weights.
            llama_embedding_path: Path to the `llama-embedding` binary.
            llama_tokenize_path: Path to the `llama-tokenize` binary. Previously
                hard-coded inside `_tokenize`; the default preserves the old behavior.
        """
        self.model_path = model_path
        self.llama_embedding_path = llama_embedding_path
        self.llama_tokenize_path = llama_tokenize_path
        self.projector = load_projector(projector_path)

        # Markers appended after each document and after the query; the hidden
        # states at these positions are the embeddings we extract.
        self.special_tokens = {
            "query_embed_token": "<|rerank_token|>",
            "doc_embed_token": "<|embed_token|>"
        }
        # Vocabulary ids of the two special tokens above.
        self.doc_embed_token_id = 151670
        self.query_embed_token_id = 151671

    def _get_hidden_states(self, prompt: str) -> np.ndarray:
        """Run `llama-embedding` on the prompt and return per-token hidden states.

        Returns:
            Array of shape [num_tokens, hidden_size], unnormalized
            (`--embd-normalize -1`), one row per prompt token.

        Raises:
            subprocess.CalledProcessError: If the llama-embedding binary fails.
        """
        # The CLI reads the prompt from a file; write it to a temp file that is
        # always removed in the `finally` below.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
            f.write(prompt)
            prompt_file = f.name

        try:
            result = subprocess.run(
                [
                    self.llama_embedding_path,
                    '-m', self.model_path,
                    '-f', prompt_file,
                    '--pooling', 'none',
                    '--embd-separator', '<#JINA_SEP#>',  # Preserve internal newlines
                    '--embd-normalize', '-1',
                    '--embd-output-format', 'json',
                    '--ubatch-size', '512',
                    '--ctx-size', '8192',
                    '--flash-attn',
                    '-ngl', '99'
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )

            output = json.loads(result.stdout)
            embeddings = [item['embedding'] for item in output['data']]
            return np.array(embeddings)
        finally:
            os.unlink(prompt_file)

    def _tokenize(self, prompt: str) -> List[int]:
        """Tokenize the prompt with `llama-tokenize` to locate special token positions.

        Returns:
            List of token ids in prompt order.

        Raises:
            subprocess.CalledProcessError: If the llama-tokenize binary fails.
        """
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
            f.write(prompt)
            prompt_file = f.name

        try:
            result = subprocess.run(
                # Configurable binary path (was hard-coded as 'llama-tokenize',
                # inconsistent with the configurable llama_embedding_path).
                [self.llama_tokenize_path, '-m', self.model_path, '-f', prompt_file],
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
                check=True
            )

            # Output lines look like "  151670 -> '<|embed_token|>'"; keep the id.
            tokens = []
            for line in result.stdout.strip().split('\n'):
                if '->' in line:
                    token_id = int(line.split('->')[0].strip())
                    tokens.append(token_id)
            return tokens
        finally:
            os.unlink(prompt_file)

    def rerank(
        self,
        query: str,
        documents: List[str],
        top_n: Optional[int] = None,
        return_embeddings: bool = False,
        instruction: Optional[str] = None
    ) -> List[Dict]:
        """Rerank documents based on relevance to query.

        Args:
            query: Search query.
            documents: Candidate documents to rank.
            top_n: If given, return only the top N results.
            return_embeddings: If True, include the projected document embedding
                in each result under the "embedding" key.
            instruction: Optional custom ranking instruction.

        Returns:
            List of dicts with keys "index", "relevance_score", "document"
            (and optionally "embedding"), sorted by score descending.

        Raises:
            ValueError: If the special embed tokens are not found in the
                tokenized prompt.
        """
        # Format prompt
        prompt = format_docs_prompts_func(
            query,
            documents,
            instruction=instruction,
            special_tokens=self.special_tokens
        )

        # Get per-token hidden states using llama-embedding CLI
        embeddings = self._get_hidden_states(prompt)

        # Tokenize to find special token positions
        tokens = self._tokenize(prompt)
        tokens_array = np.array(tokens)

        query_embed_positions_in_tokens = np.where(tokens_array == self.query_embed_token_id)[0]
        doc_embed_positions_in_tokens = np.where(tokens_array == self.doc_embed_token_id)[0]

        if len(query_embed_positions_in_tokens) == 0:
            raise ValueError(f"Query embed token (ID {self.query_embed_token_id}) not found in input")

        if len(doc_embed_positions_in_tokens) == 0:
            raise ValueError(f"Document embed tokens (ID {self.doc_embed_token_id}) not found in input")

        # llama-embedding strips trailing newlines but preserves internal newlines (via --embd-separator)
        # Token positions map directly to embedding indices
        query_pos = query_embed_positions_in_tokens[0]
        doc_positions = doc_embed_positions_in_tokens

        # Extract hidden states at the special token positions
        query_hidden = embeddings[query_pos:query_pos+1]  # [1, hidden_size]
        doc_hidden = embeddings[doc_positions]            # [num_docs, hidden_size]

        # Project into the scoring space
        query_embeds = self.projector(query_hidden)  # [1, proj_dim]
        doc_embeds = self.projector(doc_hidden)      # [num_docs, proj_dim]

        # Cosine similarity via broadcasting (replaces the previous np.tile copy
        # of the query row; mathematically identical, no extra allocation).
        query_vec = query_embeds[0]                          # [proj_dim]
        dot_product = doc_embeds @ query_vec                 # [num_docs]
        doc_norm = np.linalg.norm(doc_embeds, axis=-1)       # [num_docs]
        query_norm = np.linalg.norm(query_vec)               # scalar
        scores = dot_product / (doc_norm * query_norm)       # [num_docs]

        # Create results
        results = []
        for idx, (doc, score, embed) in enumerate(zip(documents, scores, doc_embeds)):
            result = {
                "index": idx,
                "relevance_score": float(score),
                "document": doc
            }
            if return_embeddings:
                result["embedding"] = embed.tolist()
            results.append(result)

        # Sort by score descending
        results.sort(key=lambda x: x["relevance_score"], reverse=True)

        # Return top_n if specified
        if top_n is not None:
            results = results[:top_n]

        return results
232
+
233
+
234
if __name__ == "__main__":
    # Smoke test: rank a tiny corpus against one query.
    demo_query = "What is the capital of France?"
    demo_documents = [
        "Paris is the capital and largest city of France.",
        "Berlin is the capital of Germany.",
        "The Eiffel Tower is located in Paris."
    ]

    ranker = GGUFReranker()
    for entry in ranker.rerank(demo_query, demo_documents):
        print(f"Doc {entry['index']}: {entry['relevance_score']:.4f} - {entry['document'][:50]}...")