Upload Japanese CLIP model with fixed configuration
Files changed:
- README.md +23 -0
- __pycache__/modeling_japanese_clip.cpython-310.pyc +0 -0
- config.json +9 -8
- modeling_japanese_clip.py +12 -0
- pytorch_model.bin +3 -0
- test_model.py +51 -0
- usage_example.py +29 -14
README.md
CHANGED

@@ -11,6 +11,7 @@ tags:
 datasets:
 - stair-captions
 library_name: transformers
+pipeline_tag: zero-shot-image-classification
 ---
 
 # japanese-clip-stair
@@ -25,6 +26,12 @@ library_name: transformers
 - Training data: STAIR Captions
 - Embedding dimension: 512
 
+## Required libraries
+
+```bash
+pip install torch torchvision transformers pillow requests
+```
+
 ## Usage
 
 ### Basic example
@@ -121,3 +128,19 @@ Apache 2.0
 ## Usage examples
 
 See `usage_example.py` for detailed usage examples.
+
+## Troubleshooting
+
+### KeyError: 'japanese-clip'
+
+If this error occurs, first update Transformers to the latest version:
+
+```bash
+pip install --upgrade transformers
+```
+
+If that does not resolve it, pass the `trust_remote_code=True` parameter:
+
+```python
+model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)
+```
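The troubleshooting section added above covers two failure modes: an older Transformers release that cannot resolve the custom `model_type`, and a load attempt that never opts into remote code. A minimal sketch of that recovery path, assuming only the repo id `AoiNoGeso/japanese-clip-stair` from the README (the `SystemExit` message is illustrative, not repo code):

```python
# Minimal sketch of the README troubleshooting advice (not part of the repo files).
from transformers import AutoModel, AutoTokenizer

repo = "AoiNoGeso/japanese-clip-stair"
try:
    # trust_remote_code=True lets Transformers load the custom classes
    # declared in config.json's auto_map from modeling_japanese_clip.py.
    model = AutoModel.from_pretrained(repo, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(repo)
except KeyError:
    # "japanese-clip" is not a built-in model_type; very old Transformers
    # releases fail here, so upgrading is the first fix the README suggests.
    raise SystemExit("Upgrade Transformers first: pip install --upgrade transformers")
```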
__pycache__/modeling_japanese_clip.cpython-310.pyc
CHANGED

Binary files a/__pycache__/modeling_japanese_clip.cpython-310.pyc and b/__pycache__/modeling_japanese_clip.cpython-310.pyc differ
config.json
CHANGED

@@ -1,12 +1,13 @@
 {
-  "architectures": [
-    "JapaneseCLIPModel"
-  ],
-  "image_embed_dim": 512,
   "model_type": "japanese-clip",
-  "temperature": 0.07,
-  "text_embed_dim": 512,
   "text_model_name": "cl-tohoku/bert-base-japanese-v3",
+  "image_embed_dim": 512,
+  "text_embed_dim": 512,
+  "temperature": 0.07,
+  "auto_map": {
+    "AutoConfig": "modeling_japanese_clip.JapaneseCLIPConfig",
+    "AutoModel": "modeling_japanese_clip.JapaneseCLIPModel"
+  },
   "torch_dtype": "float32",
-  "transformers_version": "4.
-}
+  "transformers_version": "4.21.0"
+}
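The new `auto_map` block is what lets the standard Auto classes locate the custom implementation: with `trust_remote_code=True`, Transformers fetches `modeling_japanese_clip.py` from the repo and instantiates the classes named there. A small sketch of that resolution; the printed class names are expectations based on the config, not output captured from the repo:

```python
# Sketch: how the auto_map entries in config.json are consumed at load time.
from transformers import AutoConfig, AutoModel

repo = "AoiNoGeso/japanese-clip-stair"

# AutoConfig sees model_type "japanese-clip" plus the AutoConfig auto_map entry,
# then loads JapaneseCLIPConfig from the repo's modeling_japanese_clip.py.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)  # expected: JapaneseCLIPConfig

# AutoModel resolves the model class the same way.
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)   # expected: JapaneseCLIPModel
```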
modeling_japanese_clip.py
CHANGED

@@ -36,6 +36,12 @@ class JapaneseCLIPModel(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
 
+        # Import torchvision inside __init__ so the dependency is only checked at construction time
+        try:
+            from torchvision.models import resnet50
+        except ImportError:
+            raise ImportError("torchvision is required for this model. Install it with: pip install torchvision")
+
         # Image encoder (ResNet50-based)
         self.image_encoder = resnet50(pretrained=True)
         self.image_encoder.fc = nn.Linear(
@@ -119,3 +125,9 @@ class JapaneseCLIPModel(PreTrainedModel):
         outputs['temperature'] = temperature
 
         return outputs
+
+# Register the custom config and model with the Auto* classes
+from transformers import AutoConfig, AutoModel
+
+AutoConfig.register("japanese-clip", JapaneseCLIPConfig)
+AutoModel.register(JapaneseCLIPConfig, JapaneseCLIPModel)
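These module-level `register` calls complement `auto_map`: once the file has been imported, locally or via remote code, `"japanese-clip"` becomes a known `model_type` to the Auto* machinery. A rough sketch of how that plays out with a local checkout; exact behaviour varies across Transformers versions, so treat this as an assumption rather than a guaranteed code path:

```python
# Sketch: effect of the module-level registration (assumes the current directory is
# a local clone containing config.json, pytorch_model.bin and modeling_japanese_clip.py).
import modeling_japanese_clip  # runs AutoConfig.register / AutoModel.register on import
from transformers import AutoModel

# With the type registered, AutoModel can map model_type "japanese-clip" to
# JapaneseCLIPModel; trust_remote_code=True mirrors what test_model.py does.
model = AutoModel.from_pretrained(".", trust_remote_code=True)
print(type(model).__name__)  # expected: JapaneseCLIPModel
```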
pytorch_model.bin
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:269731ff224f67b230b1e2ebcae2a11cbe13b467626835d0ca3fea8053d0b980
+size 546098531
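The three added lines are a Git LFS pointer, not the weights themselves; the actual ~546 MB binary is stored in LFS and resolved on checkout or download. A small sketch of fetching the resolved file programmatically, assuming `huggingface_hub` is installed (it is not listed in the README's pip line):

```python
# Sketch: download the resolved pytorch_model.bin instead of the LFS pointer file.
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="AoiNoGeso/japanese-clip-stair", filename="pytorch_model.bin")
print(path)  # local cache path of the ~546 MB weight file
```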
test_model.py
ADDED

@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Japanese CLIP test script
+"""
+
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+def test_model_loading():
+    """Model loading test"""
+    print("Testing model loading...")
+
+    try:
+        # Load the model and tokenizer from the local checkout
+        model = AutoModel.from_pretrained(".", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(".")
+
+        print("✓ Model and tokenizer loaded successfully")
+
+        # Test with dummy data
+        texts = ["テスト", "犬", "猫"]
+        text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+
+        # Dummy image data
+        dummy_image = torch.randn(1, 3, 224, 224)
+
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=dummy_image,
+                input_ids=text_inputs['input_ids'],
+                attention_mask=text_inputs['attention_mask']
+            )
+
+        print(f"✓ Forward pass successful")
+        print(f"  - Image features shape: {outputs['image_features'].shape}")
+        print(f"  - Text features shape: {outputs['text_features'].shape}")
+        print(f"  - Logits shape: {outputs['logits_per_image'].shape}")
+        print(f"  - Temperature: {outputs['temperature'].item():.4f}")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ Test failed: {e}")
+        return False
+
+if __name__ == "__main__":
+    success = test_model_loading()
+    if success:
+        print("\n🎉 All tests passed!")
+    else:
+        print("\n❌ Tests failed!")
usage_example.py
CHANGED

@@ -28,15 +28,25 @@ def main():
 
     # Load the model and tokenizer
     print("Loading model and tokenizer...")
-
-
+    try:
+        model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True).to(device)
+        tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load model: {e}")
+        return
 
     # Fetch the image
     print("Loading image...")
-
-
-
-
+    try:
+        image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+        response = requests.get(image_url)
+        image = Image.open(io.BytesIO(response.content))
+        pixel_values = preprocess_image(image).to(device)
+        print("✓ Image loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load image: {e}")
+        return
 
     # Candidate texts
     texts = ["犬", "猫", "象", "鳥", "魚", "花", "車", "建物"]
@@ -45,14 +55,19 @@ def main():
 
     # Run inference
     print("Running inference...")
-
-
-
-
-
-
-
-
+    try:
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=pixel_values,
+                input_ids=text_inputs['input_ids'],
+                attention_mask=text_inputs['attention_mask']
+            )
+
+        probs = outputs['logits_per_image'].softmax(dim=-1)
+        print("✓ Inference completed successfully")
+    except Exception as e:
+        print(f"✗ Inference failed: {e}")
+        return
 
     # Print results
     print("\n" + "="*50)
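The added code calls `preprocess_image` and `text_inputs`, which are defined elsewhere in `usage_example.py` and not captured in this view (the removed lines above are likewise not shown). For orientation only, a hypothetical sketch of what such a preprocessing helper typically looks like for the 1x3x224x224 input used in `test_model.py`; the resize and ImageNet normalization constants are assumptions, not values taken from the repo:

```python
# Hypothetical preprocess_image helper; values are assumed, not from the repo.
import torch
from torchvision import transforms
from PIL import Image

_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_image(image: Image.Image) -> torch.Tensor:
    """Return a batched tensor of shape (1, 3, 224, 224)."""
    return _transform(image.convert("RGB")).unsqueeze(0)
```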