# Video frame captioning with three Hugging Face image-captioning models.
import requests
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Model 1: ViT encoder + GPT-2 decoder captioner
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device1)

# Generation settings shared by the two model-1 helpers below
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

def image_to_text_model_1(image_url):
    """Caption an image fetched from a URL with model 1; returns a list of captions."""
    raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    pixel_values = feature_extractor1(images=[raw_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)
    output_ids = model1.generate(pixel_values, **gen_kwargs)
    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

def bytes_to_text_model_1(bts):
    """Caption an in-memory image (e.g. a video frame as an RGB array) with model 1."""
    pixel_values = feature_extractor1(images=[bts], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)
    output_ids = model1.generate(pixel_values, **gen_kwargs)
    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    print(preds[0])
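
# Example (sketch): both model-1 helpers can be exercised directly. The URL below is
# only a placeholder, not a resource referenced by this script, and some_rgb_array
# stands for any H x W x 3 RGB image array.
#   image_to_text_model_1("https://example.com/sample.jpg")  # returns a list of captions
#   bytes_to_text_model_1(some_rgb_array)                    # prints the top caption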

# Model 2: BLIP-based captioner (noamrot/FuseCap)
from transformers import BlipProcessor, BlipForConditionalGeneration

device2 = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device2)

def image_to_text_model_2(img_url):
    """Caption an image fetched from a URL with model 2 (prompted generation)."""
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    text = "a picture of "
    inputs = processor2(raw_image, text, return_tensors="pt").to(device2)
    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))

def bytes_to_text_model_2(byts):
    """Caption an in-memory image (e.g. a video frame as an RGB array) with model 2."""
    text = "a picture of "
    inputs = processor2(byts, text, return_tensors="pt").to(device2)
    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))
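
# Note: model 2 is used here for conditional captioning; "a picture of " is a text
# prefix that the BLIP decoder continues, so it appears at the start of each caption.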

# Model 3: Salesforce BLIP large captioning model (kept on CPU, as in the original setup)
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

def image_to_text_model_3(img_url):
    """Caption an image fetched from a URL with model 3 (unconditional captioning)."""
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    # No text prompt: let BLIP generate the caption unconditionally
    inputs = processor3(raw_image, return_tensors="pt")
    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))

def bytes_to_text_model_3(byts):
    """Caption an in-memory image (e.g. a video frame as an RGB array) with model 3."""
    inputs = processor3(byts, return_tensors="pt")
    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))
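
# Example (sketch): comparing the three URL-based captioners on one image; the URL is a
# placeholder for any publicly reachable image.
#   url = "https://example.com/sample.jpg"
#   print(image_to_text_model_1(url)[0])  # model 1 returns its captions
#   image_to_text_model_2(url)            # models 2 and 3 print their captions
#   image_to_text_model_3(url)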

import cv2

def FrameCapture(path):
    """Read a video with OpenCV and caption every 20th frame with all three models."""
    vidObj = cv2.VideoCapture(path)
    count = 0
    while True:
        success, image = vidObj.read()
        if not success:
            break  # stop at the end of the video (or on a read error)
        if count % 20 == 0:
            # OpenCV returns frames in BGR order; convert to RGB before captioning
            frame_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            print("NEW FRAME")
            print("MODEL 1")
            bytes_to_text_model_1(frame_rgb)
            print("MODEL 2")
            bytes_to_text_model_2(frame_rgb)
            print("MODEL 3")
            bytes_to_text_model_3(frame_rgb)
            print("\n\n")
        count += 1
    vidObj.release()

FrameCapture("animation.mp4")
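
# FrameCapture expects a path that cv2.VideoCapture can open; "animation.mp4" refers to a
# local file relative to the working directory and can be swapped for any other video.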