Update README.md

README.md (CHANGED)

@@ -12,4 +12,61 @@ tags:
- semantic_similarity
---

### ViConBERT models <a name="models2"></a>

Model | #params | Arch. | Max length | Training data | License
---|---|---|---|---|---
[`tkhangg0910/viconbert-base`](https://huggingface.co/tkhangg0910/viconbert-base) | 135M | base | 256 | ViConWSD | [MIT License](https://github.com/VinAIResearch/PhoBERT/blob/master/LICENSE)
[`tkhangg0910/viconbert-large`](https://huggingface.co/tkhangg0910/viconbert-large) | 370M | large | 256 | ViConWSD | [MIT License](https://github.com/VinAIResearch/PhoBERT/blob/master/LICENSE)
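
Both checkpoints load the same way. A minimal sketch, assuming access to the Hugging Face Hub; the 256-token limit comes from the table above:

```python
from transformers import AutoModel, PhobertTokenizerFast

# Swap in "tkhangg0910/viconbert-large" for the 370M-parameter variant.
model_id = "tkhangg0910/viconbert-base"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
tokenizer = PhobertTokenizerFast.from_pretrained(model_id)

# Truncate inputs to the 256-token max length listed in the table.
batch = tokenizer("Tôi đang khoan.", truncation=True, max_length=256, return_tensors="pt")
```
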
### Example usage <a name="usage2"></a>

`SpanExtractor` and `text_normalize` are implemented in the repository's [`utils`](https://github.com/tkhangg0910/ViConBERT/tree/main/utils) module.

```python
import torch
import torch.nn.functional as F
from transformers import AutoModel, PhobertTokenizerFast

from utils.span_extractor import SpanExtractor
from utils.process_data import text_normalize

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModel.from_pretrained("tkhangg0910/viconbert-base", trust_remote_code=True).to(device)
model.eval()
tokenizer = PhobertTokenizerFast.from_pretrained("tkhangg0910/viconbert-base")

span_ex = SpanExtractor(tokenizer)

def pipeline(query, target):
    # Normalize the sentence, tokenize it, and locate the target word's span.
    query_norm = text_normalize(query)
    tokenized_query = tokenizer(query_norm, return_tensors="pt").to(device)
    span_idx = span_ex.get_span_indices(query_norm, target)
    span = torch.Tensor(span_idx).unsqueeze(0).to(device)
    with torch.no_grad():
        query_vec = model(tokenized_query, span)
    return query_vec

# Homonymous word example: "khoan"
query_1 = "Tôi đang khoan."  # "I am drilling."
target_1 = "Khoan"
query_vec_1 = pipeline(query_1, target_1)

query_2 = "khoan này bị mất mũi khoan."  # "This drill has lost its drill bit."
target_2 = "mũi khoan"
query_vec_2 = pipeline(query_2, target_2)

query_3 = "Khoan là việc rất tiện lợi."  # "Drilling is very convenient."
target_3 = "Khoan"
query_vec_3 = pipeline(query_3, target_3)

def cosine_similarity(vec1, vec2):
    return F.cosine_similarity(vec1, vec2, dim=1).item()

sim_1 = cosine_similarity(query_vec_1, query_vec_3)
sim_2 = cosine_similarity(query_vec_2, query_vec_3)

print(f"Similarity between 1: {target_1} and 3: {target_3}: {sim_1:.4f}")
print(f"Similarity between 2: {target_2} and 3: {target_3}: {sim_2:.4f}")
```
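
The same helpers extend naturally to ranking several contexts against one occurrence. A minimal sketch, reusing the `pipeline` and `cosine_similarity` functions defined above (not an official API):

```python
# Rank candidate contexts by similarity to a query occurrence of "khoan".
contexts = [
    ("Tôi đang khoan.", "Khoan"),
    ("khoan này bị mất mũi khoan.", "mũi khoan"),
]
query_vec = pipeline("Khoan là việc rất tiện lợi.", "Khoan")

scored = [(sent, cosine_similarity(query_vec, pipeline(sent, tgt)))
          for sent, tgt in contexts]

# Contexts that use the target word in a closer sense should score higher.
for sent, score in sorted(scored, key=lambda x: x[1], reverse=True):
    print(f"{score:.4f}  {sent}")
```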