Upload processor
Browse files
- README.md +4 -4
- added_tokens.json +1 -2
- special_tokens_map.json +2 -2
- tokenizer_config.json +4 -4
- vocab.json +1 -0
README.md
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
language:
|
| 3 |
- ar
|
| 4 |
license: apache-2.0
|
| 5 |
-
base_model: tarteel-ai/whisper-base-ar-quran
|
| 6 |
tags:
|
| 7 |
- generated_from_trainer
|
|
|
|
| 8 |
datasets:
|
| 9 |
- zolfa
|
| 10 |
metrics:
|
|
@@ -13,16 +13,16 @@ model-index:
|
|
| 13 |
- name: Whisper-raghadomar
|
| 14 |
results:
|
| 15 |
- task:
|
| 16 |
-
name: Automatic Speech Recognition
|
| 17 |
type: automatic-speech-recognition
|
|
|
|
| 18 |
dataset:
|
| 19 |
name: Zolfa Dataset
|
| 20 |
type: zolfa
|
| 21 |
args: 'config: ar, split: test'
|
| 22 |
metrics:
|
| 23 |
-
- name: Wer
|
| 24 |
-
type: wer
|
| 25 |
value: 6.896551724137931
|
|
|
|
| 26 |
---
|
| 27 |
|
| 28 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
|
|
|
| 2 |
language:
|
| 3 |
- ar
|
| 4 |
license: apache-2.0
|
|
|
|
| 5 |
tags:
|
| 6 |
- generated_from_trainer
|
| 7 |
+
base_model: tarteel-ai/whisper-base-ar-quran
|
| 8 |
datasets:
|
| 9 |
- zolfa
|
| 10 |
metrics:
|
|
|
|
| 13 |
- name: Whisper-raghadomar
|
| 14 |
results:
|
| 15 |
- task:
|
|
|
|
| 16 |
type: automatic-speech-recognition
|
| 17 |
+
name: Automatic Speech Recognition
|
| 18 |
dataset:
|
| 19 |
name: Zolfa Dataset
|
| 20 |
type: zolfa
|
| 21 |
args: 'config: ar, split: test'
|
| 22 |
metrics:
|
| 23 |
+
- type: wer
|
|
|
|
| 24 |
value: 6.896551724137931
|
| 25 |
+
name: Wer
|
| 26 |
---
|
| 27 |
|
| 28 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
added_tokens.json
CHANGED
|
@@ -17,7 +17,6 @@
|
|
| 17 |
"<|da|>": 50285,
|
| 18 |
"<|de|>": 50261,
|
| 19 |
"<|el|>": 50281,
|
| 20 |
-
"<|endoftext|>": 50257,
|
| 21 |
"<|en|>": 50259,
|
| 22 |
"<|es|>": 50262,
|
| 23 |
"<|et|>": 50307,
|
|
@@ -30,6 +29,7 @@
|
|
| 30 |
"<|gu|>": 50333,
|
| 31 |
"<|haw|>": 50352,
|
| 32 |
"<|ha|>": 50354,
|
|
|
|
| 33 |
"<|hi|>": 50276,
|
| 34 |
"<|hr|>": 50291,
|
| 35 |
"<|ht|>": 50339,
|
|
@@ -38,7 +38,6 @@
|
|
| 38 |
"<|id|>": 50275,
|
| 39 |
"<|is|>": 50311,
|
| 40 |
"<|it|>": 50274,
|
| 41 |
-
"<|iw|>": 50279,
|
| 42 |
"<|ja|>": 50266,
|
| 43 |
"<|jw|>": 50356,
|
| 44 |
"<|ka|>": 50329,
|
|
|
|
| 17 |
"<|da|>": 50285,
|
| 18 |
"<|de|>": 50261,
|
| 19 |
"<|el|>": 50281,
|
|
|
|
| 20 |
"<|en|>": 50259,
|
| 21 |
"<|es|>": 50262,
|
| 22 |
"<|et|>": 50307,
|
|
|
|
| 29 |
"<|gu|>": 50333,
|
| 30 |
"<|haw|>": 50352,
|
| 31 |
"<|ha|>": 50354,
|
| 32 |
+
"<|he|>": 50279,
|
| 33 |
"<|hi|>": 50276,
|
| 34 |
"<|hr|>": 50291,
|
| 35 |
"<|ht|>": 50339,
|
|
|
|
| 38 |
"<|id|>": 50275,
|
| 39 |
"<|is|>": 50311,
|
| 40 |
"<|it|>": 50274,
|
|
|
|
| 41 |
"<|ja|>": 50266,
|
| 42 |
"<|jw|>": 50356,
|
| 43 |
"<|ka|>": 50329,
|
special_tokens_map.json
CHANGED
|
@@ -22,7 +22,7 @@
|
|
| 22 |
"<|hi|>",
|
| 23 |
"<|fi|>",
|
| 24 |
"<|vi|>",
|
| 25 |
-
"<|iw|>",
|
| 26 |
"<|uk|>",
|
| 27 |
"<|el|>",
|
| 28 |
"<|ms|>",
|
|
@@ -130,7 +130,7 @@
|
|
| 130 |
"single_word": false
|
| 131 |
},
|
| 132 |
"unk_token": {
|
| 133 |
-
"content": "",
|
| 134 |
"lstrip": false,
|
| 135 |
"normalized": true,
|
| 136 |
"rstrip": false,
|
|
|
|
| 22 |
"<|hi|>",
|
| 23 |
"<|fi|>",
|
| 24 |
"<|vi|>",
|
| 25 |
+
"<|he|>",
|
| 26 |
"<|uk|>",
|
| 27 |
"<|el|>",
|
| 28 |
"<|ms|>",
|
|
|
|
| 130 |
"single_word": false
|
| 131 |
},
|
| 132 |
"unk_token": {
|
| 133 |
+
"content": "<|endoftext|>",
|
| 134 |
"lstrip": false,
|
| 135 |
"normalized": true,
|
| 136 |
"rstrip": false,
|
tokenizer_config.json
CHANGED
|
@@ -179,7 +179,7 @@
|
|
| 179 |
"special": true
|
| 180 |
},
|
| 181 |
"50279": {
|
| 182 |
-
"content": "<|iw|>",
|
| 183 |
"lstrip": false,
|
| 184 |
"normalized": false,
|
| 185 |
"rstrip": false,
|
|
@@ -882,7 +882,7 @@
|
|
| 882 |
"<|hi|>",
|
| 883 |
"<|fi|>",
|
| 884 |
"<|vi|>",
|
| 885 |
-
"<|iw|>",
|
| 886 |
"<|uk|>",
|
| 887 |
"<|el|>",
|
| 888 |
"<|ms|>",
|
|
@@ -972,10 +972,10 @@
|
|
| 972 |
"clean_up_tokenization_spaces": true,
|
| 973 |
"eos_token": "<|endoftext|>",
|
| 974 |
"errors": "replace",
|
| 975 |
-
"model_max_length":
|
| 976 |
"pad_token": "<|endoftext|>",
|
| 977 |
"processor_class": "WhisperProcessor",
|
| 978 |
"return_attention_mask": false,
|
| 979 |
"tokenizer_class": "WhisperTokenizer",
|
| 980 |
-
"unk_token": ""
|
| 981 |
}
|
|
|
|
| 179 |
"special": true
|
| 180 |
},
|
| 181 |
"50279": {
|
| 182 |
+
"content": "<|he|>",
|
| 183 |
"lstrip": false,
|
| 184 |
"normalized": false,
|
| 185 |
"rstrip": false,
|
|
|
|
| 882 |
"<|hi|>",
|
| 883 |
"<|fi|>",
|
| 884 |
"<|vi|>",
|
| 885 |
+
"<|he|>",
|
| 886 |
"<|uk|>",
|
| 887 |
"<|el|>",
|
| 888 |
"<|ms|>",
|
|
|
|
| 972 |
"clean_up_tokenization_spaces": true,
|
| 973 |
"eos_token": "<|endoftext|>",
|
| 974 |
"errors": "replace",
|
| 975 |
+
"model_max_length": 1024,
|
| 976 |
"pad_token": "<|endoftext|>",
|
| 977 |
"processor_class": "WhisperProcessor",
|
| 978 |
"return_attention_mask": false,
|
| 979 |
"tokenizer_class": "WhisperTokenizer",
|
| 980 |
+
"unk_token": "<|endoftext|>"
|
| 981 |
}
|
vocab.json
CHANGED
|
@@ -314,6 +314,7 @@
|
|
| 314 |
";;": 35746,
|
| 315 |
"<": 27,
|
| 316 |
"</": 3433,
|
|
|
|
| 317 |
"=": 28,
|
| 318 |
"=\"": 13114,
|
| 319 |
"=\"#": 34106,
|
|
|
|
| 314 |
";;": 35746,
|
| 315 |
"<": 27,
|
| 316 |
"</": 3433,
|
| 317 |
+
"<|endoftext|>": 50257,
|
| 318 |
"=": 28,
|
| 319 |
"=\"": 13114,
|
| 320 |
"=\"#": 34106,
|