Commit 2fa2727
Parent(s): 85615cd
add initial files to deploy
- app.py +32 -0
- config.py +124 -0
- data/Dataset-Merged.json +0 -0
- data/test_dataset.json +0 -0
- data/train_dataset.json +0 -0
- data/val_dataset.json +0 -0
- datasets.py +214 -0
- inference.py +147 -0
- main.py +50 -0
- main_clip.py +59 -0
- metrics.py +65 -0
- models.py +410 -0
- modules.py +199 -0
- projections/LaBSE_best_text_projection.pt +3 -0
- projections/ParsBERT_best_poem_projection.pt +3 -0
- requirements.txt +12 -0
- train.py +202 -0
- utils.py +207 -0
app.py
ADDED
@@ -0,0 +1,32 @@
+from models import PoemTextModel
+from inference import predict_poems_from_text
+from utils import get_poem_embeddings
+import config as CFG
+import json
+import gradio as gr
+
+def greet_user(name):
+    return "Hello " + name + " Welcome to Gradio!😎"
+
+if __name__ == "__main__":
+    model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
+    model.eval()
+    # load the dataset whose beyts will be offered as predictions
+    with open(CFG.dataset_path, encoding="utf-8") as f:
+        dataset = json.load(f)
+
+    def gradio_make_predictions(text):
+        beyts = predict_poems_from_text(model, poem_embeddings, text, [data['beyt'] for data in dataset], n=10)
+        return "\n".join(beyts)
+
+    CFG.batch_size = 512
+    model, poem_embeddings = get_poem_embeddings(dataset, model)
+    # print(poem_embeddings[0])
+    # with open('poem_embeddings.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+    #     f.write(json.dumps(poem_embeddings, indent=4))
+
+    text_input = gr.Textbox(label="Enter the text to find poem beyts for")
+    output = gr.Textbox()
+
+    app = gr.Interface(fn=gradio_make_predictions, inputs=text_input, outputs=output)
+    app.launch()
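Once app.launch() is serving, the same endpoint can also be exercised programmatically. Below is a minimal sketch using the gradio_client package; the local URL and the /predict api_name are the usual Gradio defaults, assumed here rather than taken from this commit:

```python
# sketch, assuming the app above is running locally on Gradio's default port
from gradio_client import Client

client = Client("http://127.0.0.1:7860")
# a plain gr.Interface exposes its function as /predict by default
beyts = client.predict("a text to find matching beyts for", api_name="/predict")
print(beyts)  # newline-joined beyts, as returned by gradio_make_predictions
```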
config.py
ADDED
@@ -0,0 +1,124 @@
+import torch
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+from transformers import BertTokenizer, BertModel, BertConfig, BertTokenizerFast
+from transformers import XLMRobertaModel, XLMRobertaConfig
+import os
+
+"""
+Configurations
+"""
+file_dirname = os.path.dirname(__file__)  # in case it is needed for relative paths
+dataset_path = os.path.join(file_dirname, "../data/Dataset-Merged.json")  # dataset path for PoemTextModel training, validation and test
+image_path = ""  # path to prepend to the image filenames of the datasets used for CLIPModel training
+random_seed = 3  # the seed used to shuffle the dataset
+
+# what proportion of the dataset will be used for each split?
+train_propotion = 0.85
+val_propotion = 0.05
+# the remainder is used as the test set
+
+batch_size = 128
+num_workers = 0  # parameter of the torch DataLoader
+lr = 1e-3  # learning rate
+weight_decay = 1e-3
+patience = 2  # patience parameter for the lr scheduler
+factor = 0.5  # factor parameter for the lr scheduler
+epochs = 60
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# pretrained Hugging Face models chosen by poem_encoder_model
+poem_encoder_dict = {
+    "Bert": {
+        "poem_encoder_pretrained_name": 'mitra-mir/BERT-Persian-Poetry',
+    },
+    "ALBERT": {
+        "poem_encoder_pretrained_name": 'mitra-mir/ALBERT-Persian-Poetry',
+    },
+    "ParsBERT": {
+        "poem_encoder_pretrained_name": 'HooshvareLab/bert-base-parsbert-uncased',
+    },
+}
+
+poem_encoder_model = "ParsBERT"  ### Important! The base model for the poem encoder (one of "Bert", "ALBERT" and "ParsBERT")
+# keep this an empty string to use the pretrained weights from
+# huggingface (poem_encoder_dict[poem_encoder_model]) / a fresh model;
+# otherwise give the path to the encoder
+poem_encoder_load_path = ""
+# path to save the encoder to
+poem_encoder_save_path = "{}-poem-encoder".format(poem_encoder_model)
+
+if poem_encoder_load_path:
+    poem_encoder_pretrained_name = poem_encoder_load_path
+    poem_tokenizer = poem_encoder_load_path
+else:
+    poem_encoder_pretrained_name = poem_encoder_dict[poem_encoder_model]['poem_encoder_pretrained_name']
+    poem_tokenizer = poem_encoder_dict[poem_encoder_model]['poem_encoder_pretrained_name']
+
+poem_embedding = 768  # embedding dim of the poem encoder's output (for one token)
+poems_max_length = 64  # max_length parameter when padding/truncating poems with the poem tokenizer
+# keep this an empty string to use a freshly initialized projection module; otherwise give the path to the projection model
+poem_projection_load_path = os.path.join(file_dirname, "projections/{}_best_poem_projection.pt".format(poem_encoder_model))
+# path to save the projection to
+poem_projection_save_path = "{}_best_poem_projection.pt".format(poem_encoder_model)
+poem_encoder_trainable = False  # if set to False, this encoder is frozen and its weights won't be saved at all.
+
+# pretrained Hugging Face models chosen by text_encoder_model
+text_encoder_dict = {
+    "M-Bert": {
+        "text_encoder_pretrained_name": 'bert-base-multilingual-cased',
+    },
+    "XLM-RoBERTa": {
+        "text_encoder_pretrained_name": 'xlm-roberta-base',
+    },
+    "LaBSE": {
+        "text_encoder_pretrained_name": 'setu4993/LaBSE',
+    }
+}
+text_encoder_model = 'LaBSE'  ### Important! The base model for the text encoder (one of "M-Bert", "XLM-RoBERTa" and "LaBSE")
+# keep this an empty string to use the pretrained weights from huggingface / a fresh model; otherwise give the path to the encoder
+text_encoder_load_path = ""
+# path to save the encoder to
+text_encoder_save_path = "{}-text-encoder".format(text_encoder_model)
+
+if text_encoder_load_path:
+    text_encoder_pretrained_name = text_encoder_load_path
+    text_tokenizer = text_encoder_load_path
+else:
+    text_encoder_pretrained_name = text_encoder_dict[text_encoder_model]["text_encoder_pretrained_name"]
+    text_tokenizer = text_encoder_dict[text_encoder_model]["text_encoder_pretrained_name"]
+
+text_embedding = 768  # embedding dim of the text encoder's output (for one token)
+text_max_length = 200  # max_length parameter when padding/truncating text with the text tokenizer
+# keep this an empty string to use a freshly initialized projection module; otherwise give the path to the projection model
+text_projection_load_path = os.path.join(file_dirname, "projections/{}_best_text_projection.pt".format(text_encoder_model))
+# path to save the projection to
+text_projection_save_path = "{}_best_text_projection.pt".format(text_encoder_model)
+text_encoder_trainable = False  # if set to False, this encoder is frozen and its weights won't be saved at all.
+
+
+image_encoder_model = 'resnet50'  # image model name to load via the timm library
+# keep this an empty string to use the pretrained weights from huggingface / a fresh model; otherwise give the path to the encoder
+image_encoder_weights_load_path = ""
+# path to save the encoder weights to
+image_encoder_weights_save_path = "{}_best_image_encoder.pt".format(image_encoder_model)
+image_embedding = 2048  # embedding dim of the image encoder's output
+# keep this an empty string to use a freshly initialized projection module; otherwise give the path to the projection model
+image_projection_load_path = ""
+# path to save the projection to
+image_projection_save_path = "{}_best_image_projection.pt".format(image_encoder_model)
+image_encoder_trainable = False  # if set to False, this encoder is frozen and its weights won't be saved at all.
+
+# classes of Tokenizer, Model and Config to use for each text/poem encoder model
+tokenizers = {"ALBERT": AutoTokenizer, "M-Bert": BertTokenizer, "XLM-RoBERTa": AutoTokenizer, "ParsBERT": AutoTokenizer, "Bert": AutoTokenizer, "LaBSE": BertTokenizerFast}
+encoders = {"ALBERT": AutoModel, "M-Bert": BertModel, "XLM-RoBERTa": XLMRobertaModel, "ParsBERT": AutoModel, "Bert": AutoModel, "LaBSE": BertModel}
+configs = {"ALBERT": AutoConfig, "M-Bert": BertConfig, "XLM-RoBERTa": XLMRobertaConfig, "ParsBERT": AutoConfig, "Bert": AutoConfig, "LaBSE": BertConfig}
+
+
+temperature = 1.0  # temperature parameter for scaling dot similarities
+
+# image size
+size = 224
+
+# for the projection head; used for the poem, text and image encoders
+projection_dim = 1024  # projection embedding dim (output dim of the models)
+dropout = 0.1  # fraction of the output of the fc layer in the projection head to be zeroed
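Since config.py exposes plain module-level globals, callers can override them at runtime before building models or datasets, exactly as app.py does with the batch size. A minimal sketch (the override values are illustrative):

```python
import config as CFG

# illustrative override, mirroring app.py: the globals are read when datasets
# and models are constructed, so they can be changed beforehand
CFG.batch_size = 512

# caveat: values derived at import time are NOT recomputed afterwards;
# poem_encoder_pretrained_name was resolved from poem_encoder_model when
# config was first imported, so this override alone has no effect on it
CFG.poem_encoder_model = "ALBERT"
print(CFG.poem_encoder_pretrained_name)  # still the ParsBERT checkpoint name
```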
data/Dataset-Merged.json
ADDED
The diff for this file is too large to render. See raw diff.

data/test_dataset.json
ADDED
The diff for this file is too large to render. See raw diff.

data/train_dataset.json
ADDED
The diff for this file is too large to render. See raw diff.

data/val_dataset.json
ADDED
The diff for this file is too large to render. See raw diff.
datasets.py
ADDED
@@ -0,0 +1,214 @@
+import os
+import cv2
+import torch
+import albumentations as A
+import config as CFG
+
+
+class PoemTextDataset(torch.utils.data.Dataset):
+    """
+    torch Dataset for PoemTextModel.
+    ...
+    Attributes:
+    -----------
+    dataset_dict : list of dict
+        dataset containing poem-text pairs with ids
+    encoded_poems : dict
+        output of the tokenizer for the beyts found in dataset_dict. max_length specified in configs.
+        padding and truncation set to True to be truncated or padded to max length.
+    encoded_texts : dict
+        output of the tokenizer for the texts found in dataset_dict. max_length specified in configs.
+        padding and truncation set to True to be truncated or padded to max length.
+
+    Methods:
+    --------
+    __getitem__(idx)
+        returns the item with index idx.
+    __len__()
+        returns the length of the dataset
+    """
+    def __init__(self, dataset_dict):
+        """
+        Init class, save dataset_dict and compute the tokenizers' output for each text and poem using their corresponding tokenizers.
+        The tokenizers are chosen based on configs.
+
+        Parameters:
+        -----------
+        dataset_dict: list of dict
+            a list containing dictionaries which have "beyt", "text" and "id" keys.
+        """
+        self.dataset_dict = dataset_dict
+        poem_tokenizer = CFG.tokenizers[CFG.poem_encoder_model].from_pretrained(CFG.poem_tokenizer)
+        text_tokenizer = CFG.tokenizers[CFG.text_encoder_model].from_pretrained(CFG.text_tokenizer)
+        self.encoded_poems = poem_tokenizer(
+            [item['beyt'] for item in dataset_dict], padding=True, truncation=True, max_length=CFG.poems_max_length
+        )
+        self.encoded_texts = text_tokenizer(
+            [item['text'] for item in dataset_dict], padding=True, truncation=True, max_length=CFG.text_max_length
+        )
+
+    def __getitem__(self, idx):
+        """
+        returns a dict holding the data with index idx. The dict is used as an input to the PoemTextModel.
+
+        Parameters:
+        -----------
+        idx: int
+            index of the data to get
+
+        Returns:
+        --------
+        item: dict
+            a dict having the tokenizers' output for poem and text, and the id of the data with index idx
+        """
+        item = {}
+        item["beyt"] = {
+            key: torch.tensor(values[idx])
+            for key, values in self.encoded_poems.items()
+        }
+
+        item["text"] = {
+            key: torch.tensor(values[idx])
+            for key, values in self.encoded_texts.items()
+        }
+        item['id'] = self.dataset_dict[idx]['id']
+
+        return item
+
+
+    def __len__(self):
+        """
+        returns the length of the dataset
+
+        Returns:
+        --------
+        length: int
+            the length of the dataset_dict saved in the class
+        """
+        return len(self.dataset_dict)
+
+
+class CLIPDataset(torch.utils.data.Dataset):
+    """
+    torch Dataset for CLIPModel.
+    ...
+    Attributes:
+    -----------
+    dataset_dict : list of dict
+        dataset containing poem-image or text-image pairs with ids
+    encoded : dict
+        output of the tokenizer for the beyts/texts found in dataset_dict. max_length specified in configs.
+        padding and truncation set to True to be truncated or padded to max length.
+    transforms: albumentations.BasicTransform
+        transforms to apply to the images
+
+    Methods:
+    --------
+    __getitem__(idx)
+        returns the item with index idx.
+    __len__()
+        returns the length of the dataset
+    """
+    def __init__(self, dataset_dict, transforms, is_image_poem_pair=True):
+        """
+        Init class, save dataset_dict and transforms and compute the tokenizers' output for each text and poem using their corresponding tokenizers.
+        The tokenizers are chosen based on configs.
+
+        Parameters:
+        -----------
+        dataset_dict: list of dict
+            a list containing dictionaries which have "beyt", "text" and "id" keys.
+        transforms: albumentations.BasicTransform
+            transforms to apply to the images
+        is_image_poem_pair: bool, optional
+            if set to False, the dataset has text-image pairs and must use the corresponding text tokenizer;
+            otherwise it has poem-image pairs and uses the poem tokenizer.
+        """
+        self.dataset_dict = dataset_dict
+        # using the poem tokenizer to encode poems or the text tokenizer to encode text (based on configs).
+        if is_image_poem_pair:
+            poem_tokenizer = CFG.tokenizers[CFG.poem_encoder_model].from_pretrained(CFG.poem_tokenizer)
+            self.encoded = poem_tokenizer(
+                [item['beyt'] for item in dataset_dict], padding=True, truncation=True, max_length=CFG.poems_max_length
+            )
+        else:
+            text_tokenizer = CFG.tokenizers[CFG.text_encoder_model].from_pretrained(CFG.text_tokenizer)
+            self.encoded = text_tokenizer(
+                [item['text'] for item in dataset_dict], padding=True, truncation=True, max_length=CFG.text_max_length
+            )
+        self.transforms = transforms
+
+    def __getitem__(self, idx):
+        """
+        returns a dict holding the data with index idx. The dict is used as an input to the CLIPModel.
+
+        Parameters:
+        -----------
+        idx: int
+            index of the data to get
+
+        Returns:
+        --------
+        item: dict
+            a dict having the tokenizer's output for the poem/text and the transformed image of the data with index idx
+        """
+        item = {}
+        # getting text from the encoded texts
+        item["text"] = {
+            key: torch.tensor(values[idx])
+            for key, values in self.encoded.items()
+        }
+
+        # opening the image
+        image = cv2.imread(f"{CFG.image_path}{self.dataset_dict[idx]['image']}")
+        # converting BGR to RGB for the transforms
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        # apply transforms
+        image = self.transforms(image=image)['image']
+        # permute dims of the image to (channels, height, width)
+        item['image'] = torch.tensor(image).permute(2, 0, 1).float()
+
+        return item
+
+
+    def __len__(self):
+        """
+        returns the length of the dataset
+
+        Returns:
+        --------
+        length: int
+            the length of the dataset_dict saved in the class
+        """
+        return len(self.dataset_dict)
+
+
+
+def get_transforms(mode="train"):
+    """
+    returns the transforms to use on an image based on mode
+
+    Parameters:
+    -----------
+    mode: str, optional
+        to distinguish between train and val/test transforms (here they are the same!)
+
+    Returns:
+    --------
+    transforms: albumentations.Compose
+        the resize and normalize transforms to apply to an image
+    """
+    if mode == "train":
+        return A.Compose(
+            [
+                A.Resize(CFG.size, CFG.size, always_apply=True),  # resizing image to CFG.size
+                A.Normalize(max_pixel_value=255.0, always_apply=True),  # normalizing image values
+            ]
+        )
+    else:
+        return A.Compose(
+            [
+                A.Resize(CFG.size, CFG.size, always_apply=True),  # resizing image to CFG.size
+                A.Normalize(max_pixel_value=255.0, always_apply=True),  # normalizing image values
+            ]
+        )
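A minimal usage sketch for PoemTextDataset above; the two-pair dataset_dict is hypothetical:

```python
import torch
from datasets import PoemTextDataset
import config as CFG

# hypothetical two-pair dataset_dict
pairs = [
    {"beyt": "first beyt here", "text": "a short description", "id": 0},
    {"beyt": "second beyt here", "text": "another description", "id": 1},
]
ds = PoemTextDataset(pairs)
loader = torch.utils.data.DataLoader(ds, batch_size=2, num_workers=CFG.num_workers)

batch = next(iter(loader))
# nested dicts of tensors: since the tokenizers pad to the longest item,
# batch["beyt"]["input_ids"] has shape (batch_size, padded_length <= CFG.poems_max_length)
print(batch["beyt"]["input_ids"].shape, batch["id"])
```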
inference.py
ADDED
@@ -0,0 +1,147 @@
+from __future__ import annotations
+import torch
+import cv2
+import torch.nn.functional as F
+import numpy as np
+import config as CFG
+from datasets import get_transforms
+
+# for running this script as main
+from utils import get_datasets, build_loaders
+from models import PoemTextModel
+from utils import get_poem_embeddings
+import json
+import os
+
+
+def predict_poems_from_text(model, poem_embeddings, query, poems, text_tokenizer=None, n=10):
+    """
+    Returns the n poems most similar to a text query
+
+    Parameters:
+    -----------
+    model: PoemTextModel
+        model to compute the text query's embeddings
+    poem_embeddings: sequence with shape (#poems, CFG.projection_dim)
+        poem embeddings to check similarity against
+    query: str
+        text query
+    poems: list of str
+        poems corresponding to poem_embeddings
+    text_tokenizer: huggingface Tokenizer, optional
+        tokenizer to tokenize the query with. If None, a new text tokenizer is instantiated using configs.
+    n: int, optional
+        number of poems to return
+
+    Returns:
+    --------
+    A list of n poem strings whose embeddings are the most similar to the query text's embedding.
+
+    """
+    # Tokenizing and encoding the query text
+    if not text_tokenizer:
+        text_tokenizer = CFG.tokenizers[CFG.text_encoder_model].from_pretrained(CFG.text_tokenizer)
+
+    encoded_query = text_tokenizer([query])
+    batch = {
+        key: torch.tensor(values).to(CFG.device)
+        for key, values in encoded_query.items()
+    }
+
+    # getting the query text's embeddings
+    model.eval()
+    with torch.no_grad():
+        text_features = model.text_encoder(
+            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
+        )
+        text_embeddings = model.text_projection(text_features)
+
+    # normalizing and computing the dot similarity of poem and text embeddings
+    poem_embeddings_n = F.normalize(poem_embeddings, p=2, dim=-1)
+    text_embeddings_n = F.normalize(text_embeddings, p=2, dim=-1)
+
+    dot_similarity = text_embeddings_n @ poem_embeddings_n.T
+
+    # returning the top n poems based on embedding similarity
+    _, indices = torch.topk(dot_similarity.squeeze(0), n)
+    return [poems[idx] for idx in indices]
+
+
+def predict_poems_from_image(model, poem_embeddings, image_filename, poems, n=10):
+    """
+    Returns the n poems most similar to an image query
+
+    Parameters:
+    -----------
+    model: CLIPModel
+        model to compute the image query's embeddings
+    poem_embeddings: sequence with shape (#poems, CFG.projection_dim)
+        poem embeddings to check similarity against
+    image_filename: str
+        path and file name of the image query
+    poems: list of str
+        poems corresponding to poem_embeddings
+    n: int, optional
+        number of poems to return
+
+    Returns:
+    --------
+    A list of n poem strings whose embeddings are the most similar to the image query's embedding.
+
+    """
+    # Reading, processing and applying transforms to the image (all explained in datasets.py)
+    image = cv2.imread(f"{image_filename}")
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image = get_transforms(mode="test")(image=image)['image']
+    image = torch.tensor(image).permute(2, 0, 1).float()
+
+    # getting the image query's embeddings
+    model.eval()
+    with torch.no_grad():
+        image_features = model.image_encoder(torch.unsqueeze(image, 0).to(CFG.device))
+        image_embeddings = model.image_projection(image_features)
+
+    # normalizing and computing the dot similarity of poem and image embeddings
+    poem_embeddings_n = F.normalize(poem_embeddings, p=2, dim=-1)
+    image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
+    dot_similarity = image_embeddings_n @ poem_embeddings_n.T
+
+    # returning the top n poems based on embedding similarity
+    _, indices = torch.topk(dot_similarity.squeeze(0), n)
+    return [poems[idx] for idx in indices]
+
+if __name__ == "__main__":
+    """
+    Creates a PoemTextModel based on configs, and outputs some examples of its predictions.
+    """
+    # get datasets from dataset_path (the same train, val and test splits as the dataset files in the data directory are made)
+    train_dataset, val_dataset, test_dataset = get_datasets()
+
+    model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
+    model.eval()
+    # Inference: output some example predictions and write them to a file
+    print("_"*20)
+    print("Output Examples from test set")
+    model, poem_embeddings = get_poem_embeddings(test_dataset, model)
+    example = {}
+    for i, test_data in enumerate(test_dataset[:100]):
+        example[i] = {'Text': test_data["text"], 'True Beyt': test_data["beyt"], "Predicted Beyt": predict_poems_from_text(model, poem_embeddings, test_data["text"], [data['beyt'] for data in test_dataset], n=10)}
+    for i in range(10):
+        print("Text: ", example[i]['Text'])
+        print("True Beyt: ", example[i]['True Beyt'])
+        print("Predicted Beyts: \n\t", "\n\t".join(example[i]["Predicted Beyt"]))
+    with open('example_output__{}_{}.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+        f.write(json.dumps(example, ensure_ascii=False, indent=4))
+
+    print("Preparing model for user input...")
+    with open(CFG.dataset_path, encoding="utf-8") as f:
+        dataset = json.load(f)
+
+    model, poem_embeddings = get_poem_embeddings(dataset, model)
+
+    while True:
+        user_text = input("Enter a text to find poem beyts for: ")
+        beyts = predict_poems_from_text(model, poem_embeddings, user_text, [data['beyt'] for data in dataset], n=10)
+        print("Predicted Beyts: \n\t", "\n\t".join(beyts))
+        with open('{}_output__{}_{}.json'.format(user_text, CFG.poem_encoder_model, CFG.text_encoder_model), 'a+', encoding="utf-8") as f:
+            f.write(json.dumps(beyts, ensure_ascii=False, indent=4))
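Both functions above reduce retrieval to the same two steps: L2-normalize so the dot product equals cosine similarity, then take the top-k scores. A self-contained sketch with toy tensors:

```python
import torch
import torch.nn.functional as F

# toy stand-ins: 4 poem embeddings and 1 query embedding with projection_dim=3
poem_embeddings = torch.randn(4, 3)
query_embedding = torch.randn(1, 3)

# L2-normalize so that the dot product below is cosine similarity
p = F.normalize(poem_embeddings, p=2, dim=-1)
q = F.normalize(query_embedding, p=2, dim=-1)

scores = q @ p.T                             # shape (1, 4)
_, idx = torch.topk(scores.squeeze(0), k=2)  # indices of the 2 most similar poems
print(idx)
```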
main.py
ADDED
@@ -0,0 +1,50 @@
+from utils import get_datasets, build_loaders
+from models import PoemTextModel
+from train import train, test
+from metrics import calc_metrics
+from inference import predict_poems_from_text
+from utils import get_poem_embeddings
+import config as CFG
+import json
+
+def main():
+    """
+    Creates a PoemTextModel based on configs and trains, tests and outputs some examples of its predictions.
+    """
+    # get datasets from dataset_path (the same train, val and test splits as the dataset files in the data directory are made)
+    train_dataset, val_dataset, test_dataset = get_datasets()
+
+    train_loader = build_loaders(train_dataset, mode="train")
+    valid_loader = build_loaders(val_dataset, mode="valid")
+
+    # train a PoemTextModel and write its loss history to a file
+    model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
+    model, loss_history = train(model, train_loader, valid_loader)
+    with open('loss_history_{}_{}.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+        f.write(json.dumps(loss_history, indent=4))
+
+    # compute accuracy, mean rank and MRR on the test set and write them to a file
+    model.eval()
+    print("Accuracy on test set: ", test(model, test_dataset))
+    metrics = calc_metrics(test_dataset, model)
+    print('mean rank: ', metrics["mean_rank"])
+    print('mean reciprocal rank (MRR): ', metrics["mean_reciprocal_rank_(MRR)"])
+    with open('test_metrics_{}_{}.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+        f.write(json.dumps(metrics, indent=4))
+
+    # Inference: output some example predictions and write them to a file
+    print("_"*20)
+    print("Output Examples from test set")
+    model, poem_embeddings = get_poem_embeddings(test_dataset, model)
+    example = {}
+    for i, test_data in enumerate(test_dataset[:100]):
+        example[i] = {'Text': test_data["text"], 'True Beyt': test_data["beyt"], "Predicted Beyt": predict_poems_from_text(model, poem_embeddings, test_data["text"], [data['beyt'] for data in test_dataset], n=10)}
+    for i in range(10):
+        print("Text: ", example[i]['Text'])
+        print("True Beyt: ", example[i]['True Beyt'])
+        print("Predicted Beyts: \n\t", "\n\t".join(example[i]["Predicted Beyt"]))
+    with open('example_output__{}_{}.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+        f.write(json.dumps(example, ensure_ascii=False, indent=4))
+
+if __name__ == "__main__":
+    main()
main_clip.py
ADDED
@@ -0,0 +1,59 @@
+from utils import get_datasets, build_loaders  # NOTE: get_clip_datasets and build_image_loaders (used below) are presumably in utils as well; utils.py is not rendered in this view
+from models import PoemTextModel, CLIPModel
+from train import train, test
+from metrics import calc_metrics
+from inference import predict_poems_from_text, predict_poems_from_image
+from utils import get_poem_embeddings
+import config as CFG
+import json
+
+def main():
+    """
+    Creates a CLIPModel based on configs and trains, tests and outputs some examples of its predictions.
+    """
+    train_or_not = input("Train a new CLIP model using text embeddings? (needs the sajjadayobi360/cc3mfav2 and adityajn105/flickr8k datasets to be downloaded)\n[Y/N]")
+    if train_or_not == 'Y':
+        # Please download the sajjadayobi360/cc3mfav2 and adityajn105/flickr8k datasets from kaggle
+        # !kaggle datasets download -d sajjadayobi360/cc3mfav2
+        # !kaggle datasets download -d adityajn105/flickr8k
+        # .... TODO
+        clip_dataset_dict = []
+        # get datasets (the same train, val and test splits as the dataset files in the data directory are made)
+        train_dataset, val_dataset, test_dataset = get_clip_datasets(clip_dataset_dict)
+
+        train_loader = build_image_loaders(train_dataset, mode="train")
+        valid_loader = build_image_loaders(val_dataset, mode="valid")
+
+        # train a CLIPModel and write its loss history to a file
+        model = CLIPModel(image_encoder_pretrained=True,
+                          text_encoder_pretrained=True,
+                          text_projection_trainable=False,
+                          is_image_poem_pair=False
+                          ).to(CFG.device)
+        model, loss_history = train(model, train_loader, valid_loader)
+        with open('loss_history_{}_{}.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+            f.write(json.dumps(loss_history, indent=4))
+
+    # Inference: get a filename, output predictions and write them to a file
+    print("_"*20)
+    print("INFERENCE PHASE")
+    model = CLIPModel(image_encoder_pretrained=True,
+                      text_encoder_pretrained=True,
+                      text_projection_trainable=False,
+                      is_image_poem_pair=True
+                      ).to(CFG.device)
+    model.eval()
+    with open(CFG.dataset_path, encoding="utf-8") as f:
+        dataset = json.load(f)
+
+    model, poem_embeddings = get_poem_embeddings(dataset, model)  # embed the full dataset loaded above
+
+    while True:
+        image_filename = input("Enter an image filename to predict poems for: ")
+        beyts = predict_poems_from_image(model, poem_embeddings, image_filename, [data['beyt'] for data in dataset], n=10)
+        print("Predicted Beyts: \n\t", "\n\t".join(beyts))
+        with open('{}_output__{}_{}.json'.format(image_filename, CFG.poem_encoder_model, CFG.text_encoder_model), 'a+', encoding="utf-8") as f:
+            f.write(json.dumps(beyts, ensure_ascii=False, indent=4))
+
+if __name__ == "__main__":
+    main()
metrics.py
ADDED
@@ -0,0 +1,65 @@
+from __future__ import annotations
+import numpy as np
+import inference
+from utils import get_poem_embeddings
+import config as CFG
+
+# for running this script as main
+from utils import get_datasets, build_loaders
+from models import PoemTextModel
+from train import train, test
+import json
+import os
+
+def calc_metrics(test_dataset, model):
+    """
+    computes the rank of each test pair (plus the mean rank and MRR)
+
+    Parameters:
+    -----------
+    test_dataset: list of dict
+        dataset containing texts and poem beyts to compute metrics on
+    model: PoemTextModel
+        the PoemTextModel to get poem embeddings from and predict poems for each text
+    """
+    # computing all poem embeddings once (to avoid recomputing them for each test text)
+    m, embedding = get_poem_embeddings(test_dataset, model)
+    # collecting poems and texts
+    poems = []
+    meanings = []
+    for p in np.array(test_dataset):
+        poems.append(p['beyt'])
+        meanings.append(p['text'])
+    # instantiating a text tokenizer to encode texts
+    text_tokenizer = CFG.tokenizers[CFG.text_encoder_model].from_pretrained(CFG.text_tokenizer)
+    rank = []
+    for i, meaning in enumerate(meanings):
+        # predict the most similar poem beyts for each text
+        sorted_pred = inference.predict_poems_from_text(model, embedding, meaning, poems, text_tokenizer, n=len(test_dataset))
+        # find the index of this text's true beyt in the sorted predictions
+        idx = sorted_pred.index(poems[i])
+        rank.append(idx + 1)
+    rank = np.array(rank)
+    metrics = {
+        "mean_rank": np.mean(rank),
+        "mean_reciprocal_rank_(MRR)": np.mean(np.reciprocal(rank.astype(float))),
+        "rank": rank.tolist()
+    }
+    return metrics
+
+if __name__ == "__main__":
+    """
+    Creates a PoemTextModel based on configs, and computes its metrics.
+    """
+    # get datasets from dataset_path (the same train, val and test splits as the dataset files in the data directory are made)
+    train_dataset, val_dataset, test_dataset = get_datasets()
+
+    model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
+    model.eval()
+    # compute accuracy, mean rank and MRR on the test set and write them to a file
+    print("Accuracy on test set: ", test(model, test_dataset))
+    metrics = calc_metrics(test_dataset, model)
+    print('mean rank: ', metrics["mean_rank"])
+    print('mean reciprocal rank (MRR): ', metrics["mean_reciprocal_rank_(MRR)"])
+    with open('test_metrics_{}_{}.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model), 'w', encoding="utf-8") as f:
+        f.write(json.dumps(metrics, indent=4))
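For reference, the two summary statistics assembled above, with $r_i$ the 1-based rank of the true beyt for test text $i$ and $N$ the size of the test set:

```latex
\text{mean rank} = \frac{1}{N}\sum_{i=1}^{N} r_i,
\qquad
\text{MRR} = \frac{1}{N}\sum_{i=1}^{N} \frac{1}{r_i}
```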
models.py
ADDED
@@ -0,0 +1,410 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+#FIX
+import config as CFG
+from modules import TextEncoder, ProjectionHead, ImageEncoder
+
+
+class PoemTextModel(nn.Module):
+    """
+    Model predicting poem and text embeddings, and their similarities.
+    ...
+    Attributes:
+    -----------
+    poem_encoder : TextEncoder
+        encoder used for extracting poem embeddings
+    text_encoder : TextEncoder
+        encoder used for extracting text embeddings
+    poem_projection: ProjectionHead
+        projection head used for poem embeddings (projects the poem encoder output to the shared embedding space)
+    text_projection: ProjectionHead
+        projection head used for text embeddings (projects the text encoder output to the shared embedding space)
+    temperature: float
+        used to scale the dot similarities
+
+    Methods:
+    --------
+    forward(batch):
+        returns the poem and text embeddings of batch
+    similarity_scores(batch):
+        computes the dot similarities of a batch of text-poem pairs
+    predict(batch):
+        predicts the most similar poem idx for each text (using the previous methods)
+    calculate_loss(batch):
+        computes the contrastive (cross entropy) loss for both poems and texts.
+    save_current():
+        saves the current model's encoders (if trainable) and projection heads.
+    """
+    def __init__(
+        self,
+        poem_encoder_pretrained,
+        text_encoder_pretrained,
+        temperature=CFG.temperature,
+        poem_embedding=CFG.poem_embedding,
+        text_embedding=CFG.text_embedding,
+    ):
+        """
+        Initializes the model's submodules
+        Parameters:
+        -----------
+        poem_encoder_pretrained: bool
+            whether or not to load a pretrained poem encoder.
+        text_encoder_pretrained: bool
+            whether or not to load a pretrained text encoder.
+        temperature: float, optional
+            used to scale the dot similarities
+        poem_embedding: int, optional
+            dim of the poem encoder's encoding output before projection
+        text_embedding: int, optional
+            dim of the text encoder's encoding output before projection
+        """
+        super().__init__()
+        self.poem_encoder = TextEncoder(CFG.poem_encoder_model, CFG.poem_encoder_pretrained_name, pretrained=poem_encoder_pretrained, trainable=CFG.poem_encoder_trainable)
+        self.text_encoder = TextEncoder(CFG.text_encoder_model, CFG.text_encoder_pretrained_name, pretrained=text_encoder_pretrained, trainable=CFG.text_encoder_trainable)
+
+        self.poem_projection = ProjectionHead(embedding_dim=poem_embedding)
+        if CFG.poem_projection_load_path:  # if provided, load projection weights from this path
+            self.poem_projection.load_state_dict(torch.load(CFG.poem_projection_load_path, map_location=CFG.device))
+
+        self.text_projection = ProjectionHead(embedding_dim=text_embedding)
+        if CFG.text_projection_load_path:  # if provided, load projection weights from this path
+            self.text_projection.load_state_dict(torch.load(CFG.text_projection_load_path, map_location=CFG.device))
+
+        self.temperature = temperature
+
+    def forward(self, batch):
+        """
+        returns the poem and text embeddings of batch
+
+        Parameters:
+        -----------
+        batch: dict
+            input containing poem-text pairs (encoded using the encoders' tokenizers) with keys 'beyt' and 'text'
+
+        Returns:
+        --------
+        poem and text embeddings of batch (each of shape (batch_size, projection_dim))
+        """
+        beyts, texts = batch["beyt"], batch["text"]
+        # Getting beyt and text features
+        poem_features = self.poem_encoder(
+            input_ids=beyts["input_ids"], attention_mask=beyts["attention_mask"]
+        )
+        text_features = self.text_encoder(
+            input_ids=texts["input_ids"], attention_mask=texts["attention_mask"]
+        )
+        # Getting beyt and text embeddings (with the same dimension)
+        poem_embeddings = self.poem_projection(poem_features)
+        text_embeddings = self.text_projection(text_features)
+
+        return poem_embeddings, text_embeddings
+
+    def similarity_scores(self, batch):
+        """
+        computes the dot similarities of a batch of text-poem pairs
+
+        Parameters:
+        -----------
+        batch: dict
+            input containing poem-text pairs (encoded using the encoders' tokenizers) with keys 'beyt' and 'text'
+
+        Returns:
+        --------
+        dot similarity of the poem and text embeddings of batch (of shape (batch_size, batch_size))
+        """
+        # Getting beyt and text embeddings (with the same dimension)
+        poem_embeddings, text_embeddings = self.forward(batch)
+        # Normalizing embeddings
+        poem_embeddings_n = F.normalize(poem_embeddings, p=2, dim=-1)
+        text_embeddings_n = F.normalize(text_embeddings, p=2, dim=-1)
+        # Computing dot / cosine similarity of the normalized embeddings
+        dot_similarity = text_embeddings_n @ poem_embeddings_n.T
+        return dot_similarity  # (batch_size, batch_size); first dim is texts, second dim is poems for each text
+
+    def predict(self, batch):
+        """
+        predicts the most similar poem (idx) for each text (using the previous methods)
+
+        Parameters:
+        -----------
+        batch: dict
+            input containing poem-text pairs (encoded using the encoders' tokenizers) with keys 'beyt' and 'text'
+
+        Returns:
+        --------
+        index of the poem predicted for each text (of shape (batch_size))
+        """
+        dot_similarity = self.similarity_scores(batch)
+        # Taking the argmax of the dot-similarities to predict the index of the most similar poem for each text
+        return torch.argmax(dot_similarity, dim=1)
+
+    def calculate_loss(self, poem_embeddings, text_embeddings):
+        """
+        computes the contrastive (cross entropy) loss for both poems and texts.
+
+        Parameters:
+        -----------
+        poem_embeddings: of shape (batch_size, projection_dim)
+            output embeddings of the poem projection head
+        text_embeddings: of shape (batch_size, projection_dim)
+            output embeddings of the text projection head
+
+        Returns:
+        --------
+        the average of the loss computed from the inputs
+        """
+        # dot similarity of the embeddings scaled by temperature (logits)
+        logits = (text_embeddings @ poem_embeddings.T) / self.temperature
+        # computing targets for the cross entropy loss to compare with the logits:
+        # each embedding's similarity with its own modality is computed, the two are averaged,
+        # scaled by the temperature parameter, and normalized into a probability distribution via a softmax
+        poems_similarity = poem_embeddings @ poem_embeddings.T
+        texts_similarity = text_embeddings @ text_embeddings.T
+        targets = F.softmax(
+            (poems_similarity + texts_similarity) / 2 * self.temperature, dim=-1
+        )
+        # taking the cross entropy loss in both dimensions: once for texts and once for poems
+        texts_loss = cross_entropy(logits, targets, reduction='none')
+        poems_loss = cross_entropy(logits.T, targets.T, reduction='none')
+        loss = (poems_loss + texts_loss) / 2.0  # average of the losses. shape: (batch_size)
+        return loss.mean()
+
+    def save_current(self):
+        """
+        saves the current model's encoders (if trainable) and projection heads.
+        """
+        if CFG.text_encoder_trainable:
+            self.text_encoder.model.save_pretrained(CFG.text_encoder_save_path)
+        if CFG.poem_encoder_trainable:
+            self.poem_encoder.model.save_pretrained(CFG.poem_encoder_save_path)
+        torch.save(self.text_projection.state_dict(), CFG.text_projection_save_path)
+        torch.save(self.poem_projection.state_dict(), CFG.poem_projection_save_path)
+
+class CLIPModel(nn.Module):
+    """
+    Model predicting poem/text and image embeddings, and their similarities.
+    ...
+    Attributes:
+    -----------
+    encoder : TextEncoder
+        encoder used for extracting poem/text embeddings
+    image_encoder : ImageEncoder
+        encoder used for extracting image embeddings
+    text_projection: ProjectionHead
+        projection head used for poem/text embeddings (projects the text encoder output to the shared embedding space)
+    image_projection: ProjectionHead
+        projection head used for image embeddings (projects the image encoder output to the shared embedding space)
+    temperature: float
+        used to scale the dot similarities
+
+    Methods:
+    --------
+    forward(batch):
+        returns the poem/text and image embeddings of batch
+    similarity_scores(batch):
+        computes the dot similarities of a batch of text-image pairs
+    predict(batch):
+        predicts the most similar poem/text idx for each image (using the previous methods)
+    calculate_loss(batch):
+        computes the contrastive (cross entropy) loss for both poems/texts and images.
+    save_current():
+        saves the current model's encoders and projection heads (if trainable).
+    """
+    def __init__(
+        self,
+        image_encoder_pretrained,
+        text_encoder_pretrained,
+        text_projection_trainable,
+        temperature=CFG.temperature,
+        image_embedding=CFG.image_embedding,
+        text_embedding=CFG.text_embedding,
+        is_image_poem_pair=True
+    ):
+        """
+        Initializes the model's submodules
+        Parameters:
+        -----------
+        image_encoder_pretrained: bool
+            whether or not to load a pretrained image encoder.
+        text_encoder_pretrained: bool
+            whether or not to load a pretrained text encoder.
+        text_projection_trainable: bool
+            whether or not to train the text projection
+            (the text projection is frozen in our trainings, unlike the other projections of the models)
+        temperature: float, optional
+            used to scale the dot similarities
+        image_embedding: int, optional
+            dim of the image encoder's encoding output before projection
+        text_embedding: int, optional
+            dim of the text encoder's encoding output before projection
+        is_image_poem_pair: bool, optional
+            if True, the text input to this model is poems and needs one of the poem encoders to predict embeddings with;
+            otherwise it is text that needs the encoders dedicated to text.
+        """
+        super().__init__()
+        # Loading the encoders and their projections using configs
+        self.image_encoder = ImageEncoder(pretrained=image_encoder_pretrained, trainable=CFG.image_encoder_trainable)
+
+        if is_image_poem_pair:
+            self.encoder = TextEncoder(CFG.poem_encoder_model, CFG.poem_encoder_pretrained_name, pretrained=text_encoder_pretrained, trainable=CFG.poem_encoder_trainable)
+            self.text_projection = ProjectionHead(embedding_dim=text_embedding)
+            if CFG.poem_projection_load_path:
+                self.text_projection.load_state_dict(torch.load(CFG.poem_projection_load_path, map_location=CFG.device))
+        else:
+            self.encoder = TextEncoder(CFG.text_encoder_model, CFG.text_encoder_pretrained_name, pretrained=text_encoder_pretrained, trainable=CFG.text_encoder_trainable)
+            self.text_projection = ProjectionHead(embedding_dim=text_embedding)
+            if CFG.text_projection_load_path:
+                self.text_projection.load_state_dict(torch.load(CFG.text_projection_load_path, map_location=CFG.device))
+
+        self.image_projection = ProjectionHead(embedding_dim=image_embedding)
+        if CFG.image_projection_load_path:
+            self.image_projection.load_state_dict(torch.load(CFG.image_projection_load_path, map_location=CFG.device))
+
+        if not text_projection_trainable:
+            for p in self.text_projection.parameters():
+                p.requires_grad = False
+
+        self.text_projection_trainable = text_projection_trainable
+        self.is_image_poem_pair = is_image_poem_pair
+        self.temperature = temperature
+
+    def forward(self, batch):
+        """
+        returns the image and text/poem embeddings of batch
+
+        Parameters:
+        -----------
+        batch: dict
+            input containing image-text/poem pairs (text/poem encoded using the encoder's tokenizer)
+            with keys 'image' and 'text'
+
+        Returns:
+        --------
+        poem/text and image embeddings of batch (each of shape (batch_size, projection_dim))
+        """
+        image, texts = batch["image"], batch["text"]
+        # Getting image and text features
+        image_features = self.image_encoder(batch["image"])
+        text_features = self.encoder(
+            input_ids=texts["input_ids"], attention_mask=texts["attention_mask"]
+        )
+        # Getting image and text embeddings (with the same dimension)
+        image_embeddings = self.image_projection(image_features)
+        text_embeddings = self.text_projection(text_features)
+
+        return image_embeddings, text_embeddings
+
+    def similarity_scores(self, batch):
+        """
+        computes the dot similarities of a batch of text/poem-image pairs
+
+        Parameters:
+        -----------
+        batch: dict
+            input containing image-text/poem pairs (text/poem encoded using the encoder's tokenizer)
+            with keys 'image' and 'text'
+
+        Returns:
+        --------
+        dot similarity of the poem/text and image embeddings of batch (of shape (batch_size, batch_size))
+        """
+        # Getting image and text embeddings (with the same dimension)
+        image_embeddings, text_embeddings = self.forward(batch)
+        # Normalizing embeddings
+        image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
+        text_embeddings_n = F.normalize(text_embeddings, p=2, dim=-1)
+        # Computing dot / cosine similarity of the normalized embeddings
+        dot_similarity = image_embeddings_n @ text_embeddings_n.T
+        return dot_similarity  # (batch_size, batch_size); first dim is images, second dim is poems/texts for each image
+
+    def predict(self, batch):
+        """
+        predicts the most similar poem/text (idx) for each image (using the previous methods)
+
+        Parameters:
+        -----------
+        batch: dict
+            input containing image-text/poem pairs (text/poem encoded using the encoder's tokenizer)
+            with keys 'image' and 'text'
+
+        Returns:
+        --------
+        index of the poem/text predicted for each image (of shape (batch_size))
+        """
+        dot_similarity = self.similarity_scores(batch)
+        # Taking the argmax of the dot-similarities
+        # to predict the index of the most similar poem/text for each image
+        return torch.argmax(dot_similarity, dim=1)
+
+    def calculate_loss(self, image_embeddings, text_embeddings):
+        """
+        computes the contrastive (cross entropy) loss for both poems/texts and images.
+
+        Parameters:
+        -----------
+        image_embeddings: of shape (batch_size, projection_dim)
+            output embeddings of the image projection head
+        text_embeddings: of shape (batch_size, projection_dim)
+            output embeddings of the text projection head
+
+        Returns:
+        --------
+        the average of the loss computed from the inputs
+        """
+        # dot similarity of the embeddings scaled by temperature (logits)
+        logits = (text_embeddings @ image_embeddings.T) / self.temperature
+        # computing targets for the cross entropy loss to compare with the logits:
+        # each embedding's similarity with its own modality is computed, the two are averaged,
+        # scaled by the temperature parameter, and normalized into a probability distribution via a softmax
+        images_similarity = image_embeddings @ image_embeddings.T
+        texts_similarity = text_embeddings @ text_embeddings.T
+        targets = F.softmax(
+            (images_similarity + texts_similarity) / 2 * self.temperature, dim=-1
+        )
+        # taking the cross entropy loss in both dimensions: once for texts and once for images
+        texts_loss = cross_entropy(logits, targets, reduction='none')
+        images_loss = cross_entropy(logits.T, targets.T, reduction='none')
+        loss = (images_loss + texts_loss) / 2.0  # average of the losses. shape: (batch_size)
+        return loss.mean()
+
+    def save_current(self):
+        """
+        saves the current model's encoders and projection heads (if trainable).
+        """
+        if self.is_image_poem_pair:
+            if CFG.poem_encoder_trainable:
+                self.encoder.model.save_pretrained(CFG.poem_encoder_save_path)
+        else:
+            if CFG.text_encoder_trainable:
+                self.encoder.model.save_pretrained(CFG.text_encoder_save_path)
+        if CFG.image_encoder_trainable:
+            torch.save(self.image_encoder.model.state_dict(), CFG.image_encoder_weights_save_path)
+        if self.text_projection_trainable:
+            torch.save(self.text_projection.state_dict(), CFG.text_projection_save_path)
+        torch.save(self.image_projection.state_dict(), CFG.image_projection_save_path)
+
+def cross_entropy(preds, targets, reduction='none'):
+    """
+    Computes the cross entropy of logits and targets along their last dimension
+
+    Parameters:
+    -----------
+
preds: tensor/numpy array
|
| 395 |
+
logits
|
| 396 |
+
targets: tensor/ numpy array
|
| 397 |
+
reduction: str, optional
|
| 398 |
+
if set to "mean", return loss mean across all dimensions.
|
| 399 |
+
if set to "none", return loss computed using last dim.
|
| 400 |
+
|
| 401 |
+
Returns:
|
| 402 |
+
--------
|
| 403 |
+
loss or loss average
|
| 404 |
+
"""
|
| 405 |
+
log_softmax = nn.LogSoftmax(dim=-1)
|
| 406 |
+
loss = (-targets * log_softmax(preds)).sum(1) # cross entropy loss
|
| 407 |
+
if reduction == "none":
|
| 408 |
+
return loss
|
| 409 |
+
elif reduction == "mean":
|
| 410 |
+
return loss.mean()
|
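For reference, a minimal self-contained sketch (not part of the commit) of what calculate_loss and cross_entropy compute together; the batch size, projection width and temperature below are illustrative assumptions:

import torch
import torch.nn.functional as F

batch_size, projection_dim, temperature = 4, 256, 1.0
image_embeddings = torch.randn(batch_size, projection_dim)
text_embeddings = torch.randn(batch_size, projection_dim)

# logits and soft targets, exactly as in calculate_loss
logits = (text_embeddings @ image_embeddings.T) / temperature
images_similarity = image_embeddings @ image_embeddings.T
texts_similarity = text_embeddings @ text_embeddings.T
targets = F.softmax((images_similarity + texts_similarity) / 2 * temperature, dim=-1)

# cross entropy in both directions, as in cross_entropy(..., reduction='none')
log_softmax = torch.nn.LogSoftmax(dim=-1)
texts_loss = (-targets * log_softmax(logits)).sum(1)
images_loss = (-targets.T * log_softmax(logits.T)).sum(1)
print(((texts_loss + images_loss) / 2.0).mean())  # scalar contrastive loss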
modules.py
ADDED
@@ -0,0 +1,199 @@
import torch
from torch import nn
import timm
import config as CFG


class TextEncoder(nn.Module):
    """
    Text/Poem encoder used in PoemTextModel and CLIPModel
    ...
    Attributes:
    -----------
    model : a torch.nn.Module model
        The text/poem encoder model

    Methods:
    --------
    forward(x)
        returns model embeddings of x (batch of texts/poems) (of the CLS token)
    __init__()
        creates the encoder model using huggingface transformers,
        also freezes the model if it's not trainable.
    """
    def __init__(self, encoder_model, encoder_pretrained_name, pretrained, trainable):
        """
        Creates the poem or text encoder model using transformers and loads weights from a pretrained model if needed.
        Also freezes the model if it's not trainable.

        Parameters:
        -----------
        pretrained: bool
            if pretrained=True, get the pretrained model's weights. else create a fresh untrained model.
        trainable: bool
            if trainable=False, the model's weights will be frozen.
        encoder_model: str
            text/poem encoder model name used as input to get the right model from configs.
        encoder_pretrained_name: str
            text/poem encoder model to get weights from. (not used when pretrained=False)
        """
        super().__init__()

        if pretrained:
            self.model = CFG.encoders[encoder_model].from_pretrained(encoder_pretrained_name)
        else:
            self.model = CFG.encoders[encoder_model](config=CFG.configs[encoder_model]())

        for p in self.model.parameters():
            p.requires_grad = trainable

        # Using the CLS token hidden representation as the sentence's embedding
        self.target_token_idx = 0

    def forward(self, input_ids, attention_mask):
        """
        Forwards and calculates embeddings of the input using the attention mask.

        Parameters:
        -----------
        input_ids: input ids (output of tokenizer)
        attention_mask: input masks (for example for padding, pad tokens will be masked)

        Returns:
        --------
        the embedding of the CLS (or target) token of the encoder's last hidden state
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = output.last_hidden_state
        return last_hidden_state[:, self.target_token_idx, :]

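A short usage sketch (again, not part of the commit) of TextEncoder; it assumes the CFG.text_encoder_model / CFG.text_encoder_pretrained_name entries from config.py and that the matching tokenizer loads via AutoTokenizer:

import torch
from transformers import AutoTokenizer
import config as CFG
from modules import TextEncoder

tokenizer = AutoTokenizer.from_pretrained(CFG.text_encoder_pretrained_name)
encoder = TextEncoder(CFG.text_encoder_model, CFG.text_encoder_pretrained_name,
                      pretrained=True, trainable=False)
tokens = tokenizer(["a quiet night under the moon"],
                   padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    cls_embedding = encoder(tokens["input_ids"], tokens["attention_mask"])
print(cls_embedding.shape)  # (1, hidden_size): the CLS token embedding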
class ProjectionHead(nn.Module):
    """
    Projection head used to project embeddings from each encoder to a shared embedding space
    ...
    Attributes:
    -----------
    projection : torch.nn.Linear
        The main Dense projection (from the encoder's embedding dim to the shared embedding projection dim)
    gelu: torch.nn.GELU
        activation function
    fc: torch.nn.Linear
        a dense layer after projection (projection_dim to projection_dim)
    dropout: torch.nn.Dropout
        dropout after fc
    layer_norm: torch.nn.LayerNorm
        layer norm after dropout

    Methods:
    --------
    forward(x)
        returns projection embeddings from x (encoder output embeddings)
    __init__()
        creates the projection head
    """
    def __init__(
        self,
        embedding_dim,
        projection_dim=CFG.projection_dim,
        dropout=CFG.dropout
    ):
        """
        Creates the projection head used after an encoder.

        Parameters:
        -----------
        embedding_dim: int
            dimension of the output embeddings of the encoder.
        projection_dim: int, optional
            dimension to project embeddings to.
        dropout: float
            fraction of the output of the fc layer to be zeroed.
        """
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        """
        Forwards and calculates projected embeddings from encoder embeddings.

        Parameters:
        -----------
        x: input (of shape (batch_size, embedding_dim))
            the output embedding of this projection head's encoder

        Returns:
        --------
        the embeddings in a shared embedding space (of shape (batch_size, projection_dim))
        """
        projected = self.projection(x) # main projection layer
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        # the projected outputs are added to x as a residual connection
        x = x + projected
        x = self.layer_norm(x)
        return x

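A quick shape sketch (not part of the commit) for ProjectionHead; the 768 encoder width is only an example, while the output width comes from CFG.projection_dim:

import torch
from modules import ProjectionHead

head = ProjectionHead(embedding_dim=768)
encoder_output = torch.randn(4, 768)   # (batch_size, embedding_dim)
shared = head(encoder_output)          # residual MLP + LayerNorm, see forward()
print(shared.shape)                    # (4, CFG.projection_dim)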
class ImageEncoder(nn.Module):
    """
    Image encoder used in CLIPModel
    ...
    Attributes:
    -----------
    model : a torch.nn.Module model from timm (pytorch-image-models)
        The image encoder model

    Methods:
    --------
    forward(x)
        returns model embeddings of x (batch of images)
    __init__()
        creates the encoder model using timm and loads the fine-tuned model's state dict if needed.
        also freezes the model if it's not trainable.
    """
    def __init__(
        self, pretrained, trainable, model_name=CFG.image_encoder_model
    ):
        """
        Creates the encoder model using timm and loads the fine-tuned model's state dict if needed.
        Also freezes the model if it's not trainable.

        Parameters:
        -----------
        pretrained: bool
            if pretrained=True, get SOTA weights (or weights saved in image_encoder_weights_load_path).
            else create a fresh untrained model.
        trainable: bool
            if trainable=False, the model's weights will be frozen.
        model_name: str
            image encoder model name used as input to timm.create_model.
        """
        super().__init__()
        self.model = timm.create_model(
            model_name, pretrained, num_classes=0, global_pool="avg"
        )
        if pretrained and CFG.image_encoder_weights_load_path:
            self.model.load_state_dict(torch.load(CFG.image_encoder_weights_load_path, map_location=CFG.device))
        for p in self.model.parameters():
            p.requires_grad = trainable

    def forward(self, x):
        """
        Forwards and calculates embeddings of the input.

        Parameters:
        -----------
        x: input (batch of transformed images)

        Returns:
        --------
        embeddings of the model for the input (of shape (batch_size, image_embedding))
        """
        return self.model(x)
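A similar sketch (not part of the commit) for ImageEncoder; the 224x224 resolution is an assumed typical timm input size, and pretrained=False avoids any weight download:

import torch
from modules import ImageEncoder

image_encoder = ImageEncoder(pretrained=False, trainable=False)
dummy_images = torch.randn(2, 3, 224, 224)  # (batch, channels, height, width)
with torch.no_grad():
    features = image_encoder(dummy_images)
print(features.shape)  # (2, image_embedding), e.g. (2, 2048) for a ResNet50 backbone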
projections/LaBSE_best_text_projection.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42369217ef5104e0ccf452ad310b2d2dcfc81d20d6444532d70c44bb064e76d8
size 7358959
projections/ParsBERT_best_poem_projection.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:953022eab4908ab16e512446c11e7edf32a2ec8e7379de0d6748d52e7dda9773
size 7358983
requirements.txt
ADDED
@@ -0,0 +1,12 @@
numpy
pandas
Pillow
scikit_learn
torch
torchvision
tqdm
transformers
timm
opencv-python
albumentations
gradio
train.py
ADDED
@@ -0,0 +1,202 @@
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import json


import torch
from torch import nn

#FIX
import config as CFG
from models import CLIPModel
from utils import AvgMeter, get_lr
from utils import get_datasets, build_loaders

def train_epoch(model, train_loader, optimizer, lr_scheduler, step):
    """
    Performs one epoch of training.

    Parameters:
    -----------
    model: PoemTextModel or CLIPModel
        model to train
    train_loader: torch.utils.data.DataLoader
        dataloader to get batches from
    optimizer: torch.optim.Optimizer
        optimizer used for training
    lr_scheduler: torch.optim.lr_scheduler.LRScheduler
        scheduler used for training
    step: str ("batch" or "epoch")
        if "batch", lr_scheduler will step (update) for each batch of the loader.
        else lr_scheduler only steps and updates after finishing each epoch.

    Returns:
    --------
    loss_meter: AvgMeter
        the class containing the average loss of this epoch's training
    """
    loss_meter = AvgMeter() # to track average of loss
    tqdm_object = tqdm(train_loader, total=len(train_loader))
    for batch_cpu in tqdm_object:
        # put batch data on device
        batch = {k: {dict_k: dict_v.to(CFG.device) for dict_k, dict_v in v.items()} for k, v in batch_cpu.items() if not k in ["id", "image"]}
        if "image" in batch_cpu:
            batch["image"] = batch_cpu["image"].to(CFG.device)

        # get model's embeddings and calculate loss
        poem_or_img_embeddings, text_embeddings = model(batch)
        loss = model.calculate_loss(poem_or_img_embeddings, text_embeddings)

        # backpropagate and step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step == "batch":
            lr_scheduler.step()

        # update training info
        count = batch["text"]["input_ids"].size(0)
        loss_meter.update(loss.item(), count)
        tqdm_object.set_postfix(train_loss=loss_meter.avg, lr=get_lr(optimizer))
        # print('train loss: ', loss_meter.avg)
    return loss_meter


def valid_epoch(model, valid_loader):
    """
    Performs one epoch of validation.

    Parameters:
    -----------
    model: PoemTextModel or CLIPModel
        model to validate
    valid_loader: torch.utils.data.DataLoader
        dataloader to get batches from.

    Returns:
    --------
    loss_meter: AvgMeter
        the class containing the average loss of this epoch's validation
    """
    loss_meter = AvgMeter() # to track average of loss
    tqdm_object = tqdm(valid_loader, total=len(valid_loader))
    for batch_cpu in tqdm_object:
        # put batch data on device
        batch = {k: {dict_k: dict_v.to(CFG.device) for dict_k, dict_v in v.items()} for k, v in batch_cpu.items() if not k in ["id", "image"]}
        if "image" in batch_cpu:
            batch["image"] = batch_cpu["image"].to(CFG.device)

        # get model's embeddings and calculate loss
        poem_or_img_embeddings, text_embeddings = model(batch)
        loss = model.calculate_loss(poem_or_img_embeddings, text_embeddings)

        # update validation info
        count = batch["text"]["input_ids"].size(0)
        loss_meter.update(loss.item(), count)
        tqdm_object.set_postfix(valid_loss=loss_meter.avg)
        # print('validation loss: ', loss_meter.avg)
    return loss_meter

def test(model, test_dataset):
    """
    Calculates accuracy on the test set.
    This method is used for the PoemTextModel, since the other model (CLIPModel) does not have a test set containing pairs of image-poem.

    Parameters:
    -----------
    model: PoemTextModel
        model to test
    test_dataset: list of dict
        the list containing dicts of data to perform the test on (must have "text" and "poem" keys)

    Returns:
    --------
    accuracy: float
        the accuracy of the model on the given test set
    """
    test_loader = build_loaders(test_dataset, mode="test")
    accuracy = 0
    tqdm_object = tqdm(test_loader, total=len(test_loader))
    model.eval()
    with torch.no_grad():
        for batch_cpu in tqdm_object:
            # put batch data on device
            batch = {k: {dict_k: dict_v.to(CFG.device) for dict_k, dict_v in v.items()} for k, v in batch_cpu.items() if not k in ["id", "image"]}
            if "image" in batch_cpu:
                batch["image"] = batch_cpu["image"].to(CFG.device)

            # get model's prediction for each text (a numpy array of index/labels showing which poem belongs to which text)
            pred = model.predict(batch).cpu().numpy()

            count = batch["text"]["input_ids"].size(0)
            # since each text is associated with the poem with the same index as it, np.arange(count) is the real labels.
            acc = np.sum(pred == np.arange(count))
            accuracy += acc

            tqdm_object.set_postfix(accuracy=acc / count)
    accuracy /= len(test_dataset)
    return accuracy

def train(model, train_loader, valid_loader, epochs=CFG.epochs):
    """
    Performs training and validation for (epochs) epochs.

    Parameters:
    -----------
    model: PoemTextModel or CLIPModel
        model to train
    train_loader: torch.utils.data.DataLoader
        train dataloader to get batches from
    valid_loader: torch.utils.data.DataLoader
        validation dataloader to get batches from
    epochs: int, optional
        the number of epochs to train

    Returns:
    --------
    model: PoemTextModel or CLIPModel
        trained model
    loss_history: dict
        a dict containing train and validation average loss for each epoch.
    """
    # Using AdamW optimizer and ReduceLROnPlateau lr-scheduler with settings from config
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay
    )
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", patience=CFG.patience, factor=CFG.factor
    )

    # if step="batch", lr_scheduler will step (update) for each batch of the loader.
    # else lr_scheduler only steps and updates after finishing each epoch. (this case)
    step = "epoch"
    loss_history = {"train":[], "valid":[]}

    # to keep track of best validation loss
    best_loss = float('inf')
    for epoch in range(epochs):
        print(f"Epoch: {epoch + 1}")
        # train for one epoch
        model.train()
        train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler, step)
        loss_history["train"].append(train_loss.avg)

        # validate the trained model
        model.eval()
        with torch.no_grad():
            valid_loss = valid_epoch(model, valid_loader)
        loss_history["valid"].append(valid_loss.avg)

        # if this epoch's avg validation loss is lower than the best loss, save and keep this model.
        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            model.save_current()
            print("Saved Best Model!")

        if step == "epoch":
            lr_scheduler.step(valid_loss.avg)
    return model, loss_history
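A minimal sketch (not part of the commit) of how train.py's pieces are wired together for the text-poem model; the project's real entry point is main.py, which may differ in details:

import config as CFG
from models import PoemTextModel
from utils import get_datasets, build_loaders
from train import train, test

train_ds, val_ds, test_ds = get_datasets()
model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
model, history = train(model, build_loaders(train_ds, mode="train"),
                       build_loaders(val_ds, mode="valid"))
print("test accuracy:", test(model, test_ds))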
utils.py
ADDED
@@ -0,0 +1,207 @@
import config as CFG
import json
from models import PoemTextModel
import torch
import random
from datasets import PoemTextDataset, get_transforms, CLIPDataset
from tqdm import tqdm
import numpy as np

class AvgMeter:
    """
    Used to keep track of batch losses during training / validation.
    ...
    Attributes:
    -----------
    name : str
        name of this meter (used in its string representation)
    count : int
        number of data whose train/val loss has been metered
    sum: int or float
        sum of all losses metered
    avg: int or float
        average of metered losses

    Methods:
    --------
    reset():
        Sets count, sum and avg to 0.
    update(val, count=1):
        Updates loss sum, count and avg.
    __repr__():
        string representation of this class.
    """
    def __init__(self, name="Metric"):
        """Sets the name of the avg meter. Sets avg, sum & count to 0."""
        self.name = name
        self.reset()

    def reset(self):
        """Sets avg, sum & count to 0."""
        self.avg, self.sum, self.count = [0] * 3

    def update(self, val, count=1):
        """Updates loss sum, count and avg using val and count (count of the val input)"""
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        """String representation of this class"""
        text = f"{self.name}: {self.avg:.4f}"
        return text

def get_lr(optimizer):
    """Returns the learning rate of the input optimizer"""
    for param_group in optimizer.param_groups:
        return param_group["lr"]

def get_datasets():
    """
    Returns train, validation & test splits from a dataset json file specified by CFG.dataset_path.
    This function first loads the file into a list of dicts and shuffles them with the CFG.random_seed seed,
    then splits them using CFG.train_propotion & CFG.val_propotion.

    Returns:
    --------
    train_dataset: list of dict
        Train split
    val_dataset: list of dict
        Validation split
    test_dataset: list of dict
        Test split
    """
    with open(CFG.dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    random.Random(CFG.random_seed).shuffle(dataset)
    # https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
    train_dataset, val_dataset, test_dataset = np.split(dataset,
                        [int(CFG.train_propotion*len(dataset)), int((CFG.train_propotion + CFG.val_propotion)*len(dataset))])
    return train_dataset, val_dataset, test_dataset


def build_loaders(dataset_dict, mode):
    """
    Returns a torch Dataloader from a list of dictionaries (dataset_dict).
    First makes a PoemTextDataset, which is a torch Dataset object, from dataset_dict and then instantiates a Dataloader.

    Parameters:
    -----------
    dataset_dict: list of dict
        the dataset to return a dataloader of.
    mode: str ("train" or any other word)
        if the mode is "train", the dataloader will activate shuffling.

    Returns:
    --------
    dataloader: torch.utils.data.DataLoader
        the torch Dataloader created from dataset_dict using PoemTextDataset and configs.
    """
    dataset = PoemTextDataset(
        dataset_dict
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=True if mode == "train" else False,
    )
    return dataloader

def get_clip_datasets(dataset_dict):
    """
    (Used for CLIP model training) Returns train, validation & test splits from the input.
    This function takes a list of dicts as the dataset and shuffles them with the CFG.random_seed seed,
    then splits them using CFG.train_propotion & CFG.val_propotion.

    Parameters:
    -----------
    dataset_dict: list of dict
        the input dataset
    Returns:
    --------
    train_dataset: list of dict
        Train split
    val_dataset: list of dict
        Validation split
    test_dataset: list of dict
        Test split
    """
    random.Random(CFG.random_seed).shuffle(dataset_dict)
    # https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
    train_dataset, val_dataset, test_dataset = np.split(dataset_dict,
                        [int(CFG.train_propotion*len(dataset_dict)), int((CFG.train_propotion + CFG.val_propotion)*len(dataset_dict))])
    return train_dataset, val_dataset, test_dataset


def build_image_loaders(dataset_dict, mode):
    """
    (Used for CLIP model training) Returns a torch Dataloader from a list of dictionaries (dataset_dict).
    First makes a CLIPDataset, which is a torch Dataset object, from dataset_dict and then instantiates a Dataloader.

    Parameters:
    -----------
    dataset_dict: list of dict
        the dataset to return a dataloader of.
    mode: str ("train" or any other word)
        if the mode is "train", the dataloader will activate shuffling.

    Returns:
    --------
    dataloader: torch.utils.data.DataLoader
        the torch Dataloader created from dataset_dict using CLIPDataset and configs.
    """
    transforms = get_transforms(mode=mode)
    dataset = CLIPDataset(
        dataset_dict, transforms, is_image_poem_pair=False
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=True if mode == "train" else False,
    )
    return dataloader

def get_poem_embeddings(test_dataset, model=None):
    """
    Returns embeddings of the poems existing in test_dataset.

    Parameters:
    -----------
    test_dataset: list of dict
        dataset to get poems from. each of its dictionaries must have a "beyt" key.
    model: PoemTextModel or CLIPModel, optional
        the model to get poem embeddings from.
        If None is given, instantiates a new PoemTextModel (with all of its parts in pretrained settings) using the configurations provided in config.py.

    Returns:
    --------
    model: PoemTextModel or CLIPModel
        the model used for creating the poem embeddings
    poem_embeddings: torch.Tensor
        embeddings of all poems in test_dataset, concatenated along the batch dimension
    """
    test_loader = build_loaders(test_dataset, mode="test") # building a dataloader (which also tokenizes the poems)

    if model is None:
        model = PoemTextModel(True, False, True, False, poem_projection_pretrained=True, text_projection_pretrained=True).to(CFG.device)
        model.eval()

    poem_embeddings = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            # get poem embeddings by passing tokenizer output of the poems
            # to the model's poem encoder and projection
            beyts = {
                key: values.to(CFG.device)
                for key, values in batch["beyt"].items()
            }
            if model.__class__.__name__ == "PoemTextModel":
                poem_features = model.poem_encoder(input_ids=beyts["input_ids"], attention_mask=beyts["attention_mask"])
                poem_emb = model.poem_projection(poem_features)
                poem_embeddings.append(poem_emb)
            elif model.__class__.__name__ == "CLIPModel":
                poem_features = model.encoder(input_ids=beyts["input_ids"], attention_mask=beyts["attention_mask"])
                poem_emb = model.text_projection(poem_features)
                poem_embeddings.append(poem_emb)
            else:
                raise ValueError("not a right model to use!")

    return model, torch.cat(poem_embeddings)
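Finally, a minimal retrieval sketch (not part of the commit) built on get_poem_embeddings; predict_poems_from_text in inference.py is the real implementation, and this only illustrates the cosine-similarity lookup with a random stand-in query:

import json
import torch
import torch.nn.functional as F
import config as CFG
from utils import get_poem_embeddings

with open(CFG.dataset_path, encoding="utf-8") as f:
    dataset = json.load(f)

model, poem_embeddings = get_poem_embeddings(dataset)
# stand-in for a projected text embedding of the same width, on the same device
query = torch.randn(1, poem_embeddings.shape[1], device=poem_embeddings.device)
sims = F.normalize(query, dim=-1) @ F.normalize(poem_embeddings, dim=-1).T
top = torch.topk(sims.squeeze(0), k=5).indices
print([dataset[i]["beyt"] for i in top.tolist()])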