AoiNoGeso committed (verified)
Commit cb2a584 · Parent: 1457867

Upload Japanese CLIP model with fixed configuration
README.md CHANGED
@@ -11,6 +11,7 @@ tags:
 datasets:
 - stair-captions
 library_name: transformers
+pipeline_tag: zero-shot-image-classification
 ---
 
 # japanese-clip-stair
@@ -25,6 +26,12 @@ library_name: transformers
 - Training data: STAIR Captions
 - Embedding dimension: 512
 
+## Required libraries
+
+```bash
+pip install torch torchvision transformers pillow requests
+```
+
 ## Usage
 
 ### Basic usage example
@@ -121,3 +128,19 @@ Apache 2.0
 ## Usage examples
 
 See `usage_example.py` for detailed usage examples.
+
+## Troubleshooting
+
+### KeyError: 'japanese-clip'
+
+If this error occurs, update Transformers to the latest version with the following command:
+
+```bash
+pip install --upgrade transformers
+```
+
+If that still does not resolve the issue, use the `trust_remote_code=True` parameter:
+
+```python
+model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)
+```
__pycache__/modeling_japanese_clip.cpython-310.pyc CHANGED
Binary files a/__pycache__/modeling_japanese_clip.cpython-310.pyc and b/__pycache__/modeling_japanese_clip.cpython-310.pyc differ
 
config.json CHANGED
@@ -1,12 +1,13 @@
 {
-  "architectures": [
-    "JapaneseCLIPModel"
-  ],
-  "image_embed_dim": 512,
   "model_type": "japanese-clip",
-  "temperature": 0.07,
-  "text_embed_dim": 512,
   "text_model_name": "cl-tohoku/bert-base-japanese-v3",
+  "image_embed_dim": 512,
+  "text_embed_dim": 512,
+  "temperature": 0.07,
+  "auto_map": {
+    "AutoConfig": "modeling_japanese_clip.JapaneseCLIPConfig",
+    "AutoModel": "modeling_japanese_clip.JapaneseCLIPModel"
+  },
   "torch_dtype": "float32",
-  "transformers_version": "4.52.4"
-}
+  "transformers_version": "4.21.0"
+}
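
The new `auto_map` block is what lets the Auto classes resolve this repo's custom code when loading from the Hub. A minimal sketch of the loading path it enables (repo id taken from the README; requires a network connection and the dependencies listed there):

```python
# Minimal sketch: "auto_map" points AutoConfig/AutoModel at the classes defined in
# modeling_japanese_clip.py, so loading works once trust_remote_code=True is passed.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)
model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)

print(type(config).__name__)  # expected: JapaneseCLIPConfig
print(type(model).__name__)   # expected: JapaneseCLIPModel
```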
modeling_japanese_clip.py CHANGED
@@ -36,6 +36,12 @@ class JapaneseCLIPModel(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
 
+        # Do the torchvision import inside the constructor
+        try:
+            from torchvision.models import resnet50
+        except ImportError:
+            raise ImportError("torchvision is required for this model. Install it with: pip install torchvision")
+
         # Image encoder (ResNet50-based)
         self.image_encoder = resnet50(pretrained=True)
         self.image_encoder.fc = nn.Linear(
@@ -119,3 +125,9 @@ class JapaneseCLIPModel(PreTrainedModel):
         outputs['temperature'] = temperature
 
         return outputs
+
+# Register the custom config and model with the Auto classes
+from transformers import AutoConfig, AutoModel
+
+AutoConfig.register("japanese-clip", JapaneseCLIPConfig)
+AutoModel.register(JapaneseCLIPConfig, JapaneseCLIPModel)
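
The module-level `register()` calls complement `auto_map`: once `modeling_japanese_clip` has been imported in a process, the `japanese-clip` model type is known to the Auto classes directly. A hedged sketch, assuming the file is importable from a local clone and that the config accepts the keyword shown (as `config.json` suggests):

```python
# Hypothetical sketch: importing the module executes the register() calls above,
# after which the Auto classes can build the model without remote code resolution.
from modeling_japanese_clip import JapaneseCLIPConfig, JapaneseCLIPModel  # side effect: registration
from transformers import AutoConfig, AutoModel

# Assumption: the config accepts/uses text_model_name as stored in config.json.
config = AutoConfig.for_model("japanese-clip", text_model_name="cl-tohoku/bert-base-japanese-v3")
model = AutoModel.from_config(config)  # randomly initialized JapaneseCLIPModel
```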
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:269731ff224f67b230b1e2ebcae2a11cbe13b467626835d0ca3fea8053d0b980
+size 546098531
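
The added `pytorch_model.bin` is a Git LFS pointer; the roughly 546 MB weight file itself lives in LFS and is fetched automatically by `from_pretrained`. If only the weights are needed, a minimal sketch using `huggingface_hub` (an assumption; any LFS-aware clone works too):

```python
# Minimal sketch: download just the LFS-backed weights file from the Hub.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(
    repo_id="AoiNoGeso/japanese-clip-stair",  # repo id from the README
    filename="pytorch_model.bin",             # file added in this commit
)
print(weights_path)  # local cache path of the ~546 MB file
```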
test_model.py ADDED
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Japanese CLIP test script
+"""
+
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+def test_model_loading():
+    """Model loading test"""
+    print("Testing model loading...")
+
+    try:
+        # Load the model and tokenizer from the local directory
+        model = AutoModel.from_pretrained(".", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(".")
+
+        print("✓ Model and tokenizer loaded successfully")
+
+        # Test with dummy data
+        texts = ["テスト", "犬", "猫"]
+        text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+
+        # Dummy image data
+        dummy_image = torch.randn(1, 3, 224, 224)
+
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=dummy_image,
+                input_ids=text_inputs['input_ids'],
+                attention_mask=text_inputs['attention_mask']
+            )
+
+        print(f"✓ Forward pass successful")
+        print(f"  - Image features shape: {outputs['image_features'].shape}")
+        print(f"  - Text features shape: {outputs['text_features'].shape}")
+        print(f"  - Logits shape: {outputs['logits_per_image'].shape}")
+        print(f"  - Temperature: {outputs['temperature'].item():.4f}")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ Test failed: {e}")
+        return False
+
+if __name__ == "__main__":
+    success = test_model_loading()
+    if success:
+        print("\n🎉 All tests passed!")
+    else:
+        print("\n❌ Tests failed!")
usage_example.py CHANGED
@@ -28,15 +28,25 @@ def main():
 
     # Load the model and tokenizer
     print("Loading model and tokenizer...")
-    model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True).to(device)
-    tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+    try:
+        model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True).to(device)
+        tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load model: {e}")
+        return
 
     # Fetch the image
     print("Loading image...")
-    image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
-    response = requests.get(image_url)
-    image = Image.open(io.BytesIO(response.content))
-    pixel_values = preprocess_image(image).to(device)
+    try:
+        image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+        response = requests.get(image_url)
+        image = Image.open(io.BytesIO(response.content))
+        pixel_values = preprocess_image(image).to(device)
+        print("✓ Image loaded successfully")
+    except Exception as e:
+        print(f"✗ Failed to load image: {e}")
+        return
 
     # Candidate texts
     texts = ["犬", "猫", "象", "鳥", "魚", "花", "車", "建物"]
@@ -45,14 +55,19 @@ def main():
 
     # Run inference
     print("Running inference...")
-    with torch.no_grad():
-        outputs = model(
-            pixel_values=pixel_values,
-            input_ids=text_inputs['input_ids'],
-            attention_mask=text_inputs['attention_mask']
-        )
-
-    probs = outputs['logits_per_image'].softmax(dim=-1)
+    try:
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=pixel_values,
+                input_ids=text_inputs['input_ids'],
+                attention_mask=text_inputs['attention_mask']
+            )
+
+        probs = outputs['logits_per_image'].softmax(dim=-1)
+        print("✓ Inference completed successfully")
+    except Exception as e:
+        print(f"✗ Inference failed: {e}")
+        return
 
     # Display results
     print("\n" + "="*50)