Spaces:

mojtaba-nafez
/

persian-poem-recommender-based-on-text

Runtime error

App Files Files Community

mojtaba-nafez committed on Mar 9, 2023

Commit

fd6aade

1 Parent(s): 1385d75

fix app.py to read from saved poem_embeddings.json

Browse files

Files changed (2) hide show

app.py +8 -4
inference.py +64 -8

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from inference import predict_poems_from_text
 from utils import get_poem_embeddings
 import config as CFG
 import json
 import gradio as gr
 def greet_user(name):
@@ -12,15 +13,18 @@ if __name__ == "__main__":
     model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
     model.eval()
     # Inference: Output some example predictions and write them in a file
-    with open(CFG.dataset_path, encoding="utf-8") as f:
-        dataset = json.load(f)
     def gradio_make_predictions(text):
-        beyts = predict_poems_from_text(model, poem_embeddings, text, [data['beyt'] for data in dataset], n=10)
         return "\n".join(beyts)
     CFG.batch_size = 512
-    model, poem_embeddings = get_poem_embeddings(dataset, model)
     # print(poem_embeddings[0])
     # with open('poem_embeddings.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model),'w', encoding="utf-8") as f:
     #     f.write(json.dumps(poem_embeddings, indent= 4))

 from utils import get_poem_embeddings
 import config as CFG
 import json
+import torch
 import gradio as gr
 def greet_user(name):
     model = PoemTextModel(poem_encoder_pretrained=True, text_encoder_pretrained=True).to(CFG.device)
     model.eval()
     # Inference: Output some example predictions and write them in a file
+    with open('poem_embeddings.json', encoding="utf-8") as f:
+        pe = json.load(f)
+    poem_embeddings = torch.Tensor([p['embeddings'] for p in pe]).to(CFG.device)
+    print(poem_embeddings.shape)
+    poems = [p['beyt'] for p in pe]
     def gradio_make_predictions(text):
+        beyts = predict_poems_from_text(model, poem_embeddings, text, poems, n=10)
         return "\n".join(beyts)
     CFG.batch_size = 512
     # print(poem_embeddings[0])
     # with open('poem_embeddings.json'.format(CFG.poem_encoder_model, CFG.text_encoder_model),'w', encoding="utf-8") as f:
     #     f.write(json.dumps(poem_embeddings, indent= 4))

inference.py CHANGED Viewed

@@ -12,9 +12,10 @@ from models import PoemTextModel
 from utils import get_poem_embeddings
 import json
 import os
-def predict_poems_from_text(model, poem_embeddings, query, poems, text_tokenizer=None, n=10):
     """
     Returns n poems which are the most similar to a text query
@@ -32,6 +33,8 @@ def predict_poems_from_text(model, poem_embeddings, query, poems, text_tokenizer
                 tokenizer to tokenize query with. if none, will instantiate a new text tokenizer using configs.
             n: int, optional
                 number of poems to return
         Returns:
         --------
@@ -63,11 +66,36 @@ def predict_poems_from_text(model, poem_embeddings, query, poems, text_tokenizer
     dot_similarity = text_embeddings_n @ poem_embeddings_n.T
     # returning top n poems based on embedding similarity
-    _, indices = torch.topk(dot_similarity.squeeze(0), n)
-    return [poems[idx] for idx in indices]
-def predict_poems_from_image(model, poem_embeddings, image_filename, poems, n=10):
     """
     Returns n poems which are the most similar to an image query
@@ -83,6 +111,8 @@ def predict_poems_from_image(model, poem_embeddings, image_filename, poems, n=10
                 poems corresponding to poem_embeddings
             n: int, optional
                 number of poems to return
         Returns:
         --------
@@ -107,8 +137,34 @@ def predict_poems_from_image(model, poem_embeddings, image_filename, poems, n=10
     dot_similarity = image_embeddings_n @ poem_embeddings_n.T
     # returning top n poems based on embedding similarity
-    _, indices = torch.topk(dot_similarity.squeeze(0), n)
-    return [poems[idx] for idx in indices]
 if __name__ == "__main__":
     """

 from utils import get_poem_embeddings
 import json
 import os
+import regex
+def predict_poems_from_text(model, poem_embeddings, query, poems, text_tokenizer=None, n=10, return_similarities=False):
     """
     Returns n poems which are the most similar to a text query
                 tokenizer to tokenize query with. if none, will instantiate a new text tokenizer using configs.
             n: int, optional
                 number of poems to return
+            return_similarities: bool, optional
+                if True, a list of dictionaries will be returned, each holding a poem beyt and its similarity to the text
         Returns:
         --------
     dot_similarity = text_embeddings_n @ poem_embeddings_n.T
     # returning top n poems based on embedding similarity
+    values, indices = torch.topk(dot_similarity.squeeze(0), len(poems))
+    # since we collected poems from many sources, some of them are equal (the same beyt with different meanings),
+    # so we must check the poems added to result not to be duplicates
+    def is_poem_duplicate(poem, poems):
+        poem = regex.findall(r'\p{L}+', poem.replace('\u200c', ''))
+        for other_poem in poems:
+            other_poem = regex.findall(r'\p{L}+', other_poem.replace('\u200c', ''))
+            if poem == other_poem:
+                return True
+        return False
+    results = []
+    computed_k = 0
+    for i in range(len(poems)):
+        if computed_k == n:
+            break
+        if not is_poem_duplicate(poems[indices[i]], [res['beyt'] for res in results]):
+            results.append({
+                'beyt': poems[indices[i]].replace(' * * ', ' * ').replace('*** * ', ''),
+                'similarity': values[i]
+            })
+            computed_k += 1
+    if return_similarities:
+        return results
+    else:
+        return [res['beyt'] for res in results]
+def predict_poems_from_image(model, poem_embeddings, image_filename, poems, n=10, return_similarities=False):
     """
     Returns n poems which are the most similar to an image query
                 poems corresponding to poem_embeddings
             n: int, optional
                 number of poems to return
+            return_similarities: bool, optional
+                if True, a list of dictionaries will be returned, each holding a poem beyt and its similarity to the image
         Returns:
         --------
     dot_similarity = image_embeddings_n @ poem_embeddings_n.T
     # returning top n poems based on embedding similarity
+    values, indices = torch.topk(dot_similarity.squeeze(0), len(poems))
+    # since we collected poems from many sources, some of them are equal (the same beyt with different meanings),
+    # so we must check the poems added to result not to be duplicates
+    def is_poem_duplicate(poem, poems):
+        poem = regex.findall(r'\p{L}+', poem.replace('\u200c', ''))
+        for other_poem in poems:
+            other_poem = regex.findall(r'\p{L}+', other_poem.replace('\u200c', ''))
+            if poem == other_poem:
+                return True
+        return False
+    results = []
+    computed_k = 0
+    for i in range(len(poems)):
+        if computed_k == n:
+            break
+        if not is_poem_duplicate(poems[indices[i]], [res['beyt'] for res in results]):
+            results.append({
+                'beyt': poems[indices[i]].replace(' * * ', ' * ').replace('*** * ', ''),
+                'similarity': values[i]
+            })
+            computed_k += 1
+    if return_similarities:
+        return results
+    else:
+        return [res['beyt'] for res in results]
 if __name__ == "__main__":
     """