Upload Japanese CLIP model with fixed configuration
Files changed:
- README.md +23 -0
- __pycache__/modeling_japanese_clip.cpython-310.pyc +0 -0
- config.json +9 -8
- modeling_japanese_clip.py +12 -0
- pytorch_model.bin +3 -0
- test_model.py +51 -0
- usage_example.py +29 -14
README.md
CHANGED

@@ -11,6 +11,7 @@ tags:
 datasets:
 - stair-captions
 library_name: transformers
+pipeline_tag: zero-shot-image-classification
 ---
 
 # japanese-clip-stair
@@ -25,6 +26,12 @@ library_name: transformers
 - Training data: STAIR Captions
 - Embedding dimension: 512
 
+## Required libraries
+
+```bash
+pip install torch torchvision transformers pillow requests
+```
+
 ## Usage
 
 ### Basic example
@@ -121,3 +128,19 @@ Apache 2.0
 ## Usage examples
 
 See `usage_example.py` for detailed usage examples.
+
+## Troubleshooting
+
+### KeyError: 'japanese-clip'
+
+If this error occurs, first update Transformers to the latest version:
+
+```bash
+pip install --upgrade transformers
+```
+
+If that does not resolve it, pass the `trust_remote_code=True` parameter:
+
+```python
+model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)
+```
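The troubleshooting section added above covers two failure modes: an older Transformers release that cannot resolve the custom `model_type`, and a load attempt that never opts into remote code. A minimal sketch of that recovery path, assuming only the repo id `AoiNoGeso/japanese-clip-stair` from the README (the `SystemExit` message is illustrative, not repo code):

```python
# Minimal sketch of the README troubleshooting advice (not part of the repo files).
from transformers import AutoModel, AutoTokenizer

repo = "AoiNoGeso/japanese-clip-stair"
try:
    # trust_remote_code=True lets Transformers load the custom classes
    # declared in config.json's auto_map from modeling_japanese_clip.py.
    model = AutoModel.from_pretrained(repo, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(repo)
except KeyError:
    # "japanese-clip" is not a built-in model_type; very old Transformers
    # releases fail here, so upgrading is the first fix the README suggests.
    raise SystemExit("Upgrade Transformers first: pip install --upgrade transformers")
```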
__pycache__/modeling_japanese_clip.cpython-310.pyc
CHANGED

Binary files a/__pycache__/modeling_japanese_clip.cpython-310.pyc and b/__pycache__/modeling_japanese_clip.cpython-310.pyc differ
config.json
CHANGED

@@ -1,12 +1,13 @@
 {
-  "architectures": [
-    "JapaneseCLIPModel"
-  ],
-  "image_embed_dim": 512,
   "model_type": "japanese-clip",
-  "temperature": 0.07,
-  "text_embed_dim": 512,
   "text_model_name": "cl-tohoku/bert-base-japanese-v3",
+  "image_embed_dim": 512,
+  "text_embed_dim": 512,
+  "temperature": 0.07,
+  "auto_map": {
+    "AutoConfig": "modeling_japanese_clip.JapaneseCLIPConfig",
+    "AutoModel": "modeling_japanese_clip.JapaneseCLIPModel"
+  },
   "torch_dtype": "float32",
-  "transformers_version": "4.
-}
+  "transformers_version": "4.21.0"
+}
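The new `auto_map` block is what lets the standard Auto classes locate the custom implementation: with `trust_remote_code=True`, Transformers fetches `modeling_japanese_clip.py` from the repo and instantiates the classes named there. A small sketch of that resolution; the printed class names are expectations based on the config, not output captured from the repo:

```python
# Sketch: how the auto_map entries in config.json are consumed at load time.
from transformers import AutoConfig, AutoModel

repo = "AoiNoGeso/japanese-clip-stair"

# AutoConfig sees model_type "japanese-clip" plus the AutoConfig auto_map entry,
# then loads JapaneseCLIPConfig from the repo's modeling_japanese_clip.py.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)  # expected: JapaneseCLIPConfig

# AutoModel resolves the model class the same way.
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)   # expected: JapaneseCLIPModel
```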
modeling_japanese_clip.py
CHANGED

@@ -36,6 +36,12 @@ class JapaneseCLIPModel(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
 
+        # Import torchvision inside __init__ so the dependency is only checked at construction time
+        try:
+            from torchvision.models import resnet50
+        except ImportError:
+            raise ImportError("torchvision is required for this model. Install it with: pip install torchvision")
+
         # Image encoder (ResNet50-based)
         self.image_encoder = resnet50(pretrained=True)
         self.image_encoder.fc = nn.Linear(
@@ -119,3 +125,9 @@ class JapaneseCLIPModel(PreTrainedModel):
         outputs['temperature'] = temperature
 
         return outputs
+
+# Register the custom config and model with the Auto* classes
+from transformers import AutoConfig, AutoModel
+
+AutoConfig.register("japanese-clip", JapaneseCLIPConfig)
+AutoModel.register(JapaneseCLIPConfig, JapaneseCLIPModel)
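These module-level `register` calls complement `auto_map`: once the file has been imported, locally or via remote code, `"japanese-clip"` becomes a known `model_type` to the Auto* machinery. A rough sketch of how that plays out with a local checkout; exact behaviour varies across Transformers versions, so treat this as an assumption rather than a guaranteed code path:

```python
# Sketch: effect of the module-level registration (assumes the current directory is
# a local clone containing config.json, pytorch_model.bin and modeling_japanese_clip.py).
import modeling_japanese_clip  # runs AutoConfig.register / AutoModel.register on import
from transformers import AutoModel

# With the type registered, AutoModel can map model_type "japanese-clip" to
# JapaneseCLIPModel; trust_remote_code=True mirrors what test_model.py does.
model = AutoModel.from_pretrained(".", trust_remote_code=True)
print(type(model).__name__)  # expected: JapaneseCLIPModel
```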
pytorch_model.bin
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:269731ff224f67b230b1e2ebcae2a11cbe13b467626835d0ca3fea8053d0b980
+size 546098531
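The three added lines are a Git LFS pointer, not the weights themselves; the actual ~546 MB binary is stored in LFS and resolved on checkout or download. A small sketch of fetching the resolved file programmatically, assuming `huggingface_hub` is installed (it is not listed in the README's pip line):

```python
# Sketch: download the resolved pytorch_model.bin instead of the LFS pointer file.
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="AoiNoGeso/japanese-clip-stair", filename="pytorch_model.bin")
print(path)  # local cache path of the ~546 MB weight file
```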
test_model.py
ADDED

@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Japanese CLIP test script
+"""
+
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+def test_model_loading():
+    """Model loading test"""
+    print("Testing model loading...")
+
+    try:
+        # Load the model and tokenizer from the local checkout
+        model = AutoModel.from_pretrained(".", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(".")
+
+        print("✓ Model and tokenizer loaded successfully")
+
+        # Test with dummy data
+        texts = ["テスト", "犬", "猫"]
+        text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+
+        # Dummy image data
+        dummy_image = torch.randn(1, 3, 224, 224)
+
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=dummy_image,
+                input_ids=text_inputs['input_ids'],
+                attention_mask=text_inputs['attention_mask']
+            )
+
+        print(f"✓ Forward pass successful")
+        print(f"  - Image features shape: {outputs['image_features'].shape}")
+        print(f"  - Text features shape: {outputs['text_features'].shape}")
+        print(f"  - Logits shape: {outputs['logits_per_image'].shape}")
+        print(f"  - Temperature: {outputs['temperature'].item():.4f}")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ Test failed: {e}")
+        return False
+
+if __name__ == "__main__":
+    success = test_model_loading()
+    if success:
+        print("\n🎉 All tests passed!")
+    else:
+        print("\n❌ Tests failed!")
usage_example.py
CHANGED

@@ -28,15 +28,25 @@ def main():
 
     # Load the model and tokenizer
     print("Loading model and tokenizer...")
-
-
+    try:
+        model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True).to(device)
+        tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load model: {e}")
+        return
 
     # Fetch the image
     print("Loading image...")
-
-
-
-
+    try:
+        image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+        response = requests.get(image_url)
+        image = Image.open(io.BytesIO(response.content))
+        pixel_values = preprocess_image(image).to(device)
+        print("✓ Image loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load image: {e}")
+        return
 
     # Candidate texts
     texts = ["犬", "猫", "象", "鳥", "魚", "花", "車", "建物"]
@@ -45,14 +55,19 @@ def main():
 
     # Run inference
     print("Running inference...")
-
-
-
-
-
-
-
-
+    try:
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=pixel_values,
+                input_ids=text_inputs['input_ids'],
+                attention_mask=text_inputs['attention_mask']
+            )
+
+        probs = outputs['logits_per_image'].softmax(dim=-1)
+        print("✓ Inference completed successfully")
+    except Exception as e:
+        print(f"✗ Inference failed: {e}")
+        return
 
     # Print results
     print("\n" + "="*50)
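The added code calls `preprocess_image` and `text_inputs`, which are defined elsewhere in `usage_example.py` and not captured in this view (the removed lines above are likewise not shown). For orientation only, a hypothetical sketch of what such a preprocessing helper typically looks like for the 1x3x224x224 input used in `test_model.py`; the resize and ImageNet normalization constants are assumptions, not values taken from the repo:

```python
# Hypothetical preprocess_image helper; values are assumed, not from the repo.
import torch
from torchvision import transforms
from PIL import Image

_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_image(image: Image.Image) -> torch.Tensor:
    """Return a batched tensor of shape (1, 3, 224, 224)."""
    return _transform(image.convert("RGB")).unsqueeze(0)
```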