File size: 13,205 Bytes
dcacefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/python
# -*- coding:utf-8 -*-
from copy import copy, deepcopy
import math
import os
from typing import Dict, List, Tuple
import requests

import numpy as np
from Bio.PDB import PDBParser, PDBIO
from Bio.PDB.Structure import Structure as BStructure
from Bio.PDB.Model import Model as BModel
from Bio.PDB.Chain import Chain as BChain
from Bio.PDB.Residue import Residue as BResidue
from Bio.PDB.Atom import Atom as BAtom


class AminoAcid:
    def __init__(self, symbol: str, abrv: str, sidechain: List[str], idx=0):
        self.symbol = symbol
        self.abrv = abrv
        self.idx = idx
        self.sidechain = sidechain

    def __str__(self):
        return f'{self.idx} {self.symbol} {self.abrv} {self.sidechain}'


class AminoAcidVocab:

    MAX_ATOM_NUMBER = 14   # 4 backbone atoms + up to 10 sidechain atoms

    def __init__(self):
        self.backbone_atoms = ['N', 'CA', 'C', 'O']
        self.PAD, self.MASK = '#', '*'
        self.BOA, self.BOH, self.BOL = '&', '+', '-' # begin of antigen, heavy chain, light chain
        specials = [# special added
                (self.PAD, 'PAD'), (self.MASK, 'MASK'), # mask for masked / unknown residue
                (self.BOA, '<X>'), (self.BOH, '<H>'), (self.BOL, '<L>')
            ]

        aas = [
            ('A', 'ALA'), ('R', 'ARG'), ('N', 'ASN'), ('D', 'ASP'),
            ('C', 'CYS'), ('Q', 'GLN'), ('E', 'GLU'), ('G', 'GLY'),
            ('H', 'HIS'), ('I', 'ILE'), ('L', 'LEU'), ('K', 'LYS'),
            ('M', 'MET'), ('F', 'PHE'), ('P', 'PRO'), ('S', 'SER'),
            ('T', 'THR'), ('W', 'TRP'), ('Y', 'TYR'), ('V', 'VAL'),   # 20 aa
            # ('U', 'SEC') # 21 aa for eukaryote
        ]

        # max number of sidechain atoms: 10        
        self.atom_pad, self.atom_mask = 'p', 'm'
        self.atom_pos_mask, self.atom_pos_bb, self.atom_pos_pad = 'm', 'b', 'p'
        sidechain_map = {
            'G': [],   # -H
            'A': ['CB'],  # -CH3
            'V': ['CB', 'CG1', 'CG2'],  # -CH-(CH3)2
            'L': ['CB', 'CG', 'CD1', 'CD2'],  # -CH2-CH(CH3)2
            'I': ['CB', 'CG1', 'CG2', 'CD1'], # -CH(CH3)-CH2-CH3
            'F': ['CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'],  # -CH2-C6H5
            'W': ['CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],  # -CH2-C8NH6
            'Y': ['CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH'],  # -CH2-C6H4-OH
            'D': ['CB', 'CG', 'OD1', 'OD2'],  # -CH2-COOH
            'H': ['CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2'],  # -CH2-C3H3N2
            'N': ['CB', 'CG', 'OD1', 'ND2'],  # -CH2-CONH2
            'E': ['CB', 'CG', 'CD', 'OE1', 'OE2'],  # -(CH2)2-COOH
            'K': ['CB', 'CG', 'CD', 'CE', 'NZ'],  # -(CH2)4-NH2
            'Q': ['CB', 'CG', 'CD', 'OE1', 'NE2'],  # -(CH2)-CONH2
            'M': ['CB', 'CG', 'SD', 'CE'],  # -(CH2)2-S-CH3
            'R': ['CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2'],  # -(CH2)3-NHC(NH)NH2
            'S': ['CB', 'OG'],  # -CH2-OH
            'T': ['CB', 'OG1', 'CG2'],  # -CH(CH3)-OH
            'C': ['CB', 'SG'],  # -CH2-SH
            'P': ['CB', 'CG', 'CD'],  # -C3H6
        }

        self.chi_angles_atoms = {
            "ALA": [],
            # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
            "ARG": [
                ["N", "CA", "CB", "CG"],
                ["CA", "CB", "CG", "CD"],
                ["CB", "CG", "CD", "NE"],
                ["CG", "CD", "NE", "CZ"],
            ],
            "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
            "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
            "CYS": [["N", "CA", "CB", "SG"]],
            "GLN": [
                ["N", "CA", "CB", "CG"],
                ["CA", "CB", "CG", "CD"],
                ["CB", "CG", "CD", "OE1"],
            ],
            "GLU": [
                ["N", "CA", "CB", "CG"],
                ["CA", "CB", "CG", "CD"],
                ["CB", "CG", "CD", "OE1"],
            ],
            "GLY": [],
            "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]],
            "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]],
            "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
            "LYS": [
                ["N", "CA", "CB", "CG"],
                ["CA", "CB", "CG", "CD"],
                ["CB", "CG", "CD", "CE"],
                ["CG", "CD", "CE", "NZ"],
            ],
            "MET": [
                ["N", "CA", "CB", "CG"],
                ["CA", "CB", "CG", "SD"],
                ["CB", "CG", "SD", "CE"],
            ],
            "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
            "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]],
            "SER": [["N", "CA", "CB", "OG"]],
            "THR": [["N", "CA", "CB", "OG1"]],
            "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
            "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
            "VAL": [["N", "CA", "CB", "CG1"]],
        }

        self.sidechain_bonds = {
            "ALA": { "CA": ["CB"] },
            "GLY": {},
            "VAL": {
                "CA": ["CB"],
                "CB": ["CG1", "CG2"]
            },
            "LEU": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD2", "CD1"]
            },
            "ILE": {
                "CA": ["CB"],
                "CB": ["CG1", "CG2"],
                "CG1": ["CD1"]
            },
            "MET": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["SD"],
                "SD": ["CE"],
            },
            "PHE": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD1", "CD2"],
                "CD1": ["CE1"],
                "CD2": ["CE2"],
                "CE1": ["CZ"]
            },
            "TRP": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD1", "CD2"],
                "CD1": ["NE1"],
                "CD2": ["CE2", "CE3"],
                "CE2": ["CZ2"],
                "CZ2": ["CH2"],
                "CE3": ["CZ3"]
            },
            "PRO": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD"]
            },
            "SER": {
                "CA": ["CB"],
                "CB": ["OG"]
            },
            "THR": {
                "CA": ["CB"],
                "CB": ["OG1", "CG2"]
            },
            "CYS": {
                "CA": ["CB"],
                "CB": ["SG"]
            },
            "TYR": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD1", "CD2"],
                "CD1": ["CE1"],
                "CD2": ["CE2"],
                "CE1": ["CZ"],
                "CZ": ["OH"]
            },
            "ASN": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["OD1", "ND2"]
            },
            "GLN": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD"],
                "CD": ["OE1", "NE2"]
            },
            "ASP": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["OD1", "OD2"]
            },
            "GLU": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD"],
                "CD": ["OE1", "OE2"]
            },
            "LYS": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD"],
                "CD": ["CE"],
                "CE": ["NZ"]
            },
            "ARG": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["CD"],
                "CD": ["NE"],
                "NE": ["CZ"],
                "CZ": ["NH1", "NH2"]
            },
            "HIS": {
                "CA": ["CB"],
                "CB": ["CG"],
                "CG": ["ND1", "CD2"],
                "ND1": ["CE1"],
                "CD2": ["NE2"]
            }
        }
        

        _all = aas + specials
        self.amino_acids = [AminoAcid(symbol, abrv, sidechain_map.get(symbol, [])) for symbol, abrv in _all]
        self.symbol2idx, self.abrv2idx = {}, {}
        for i, aa in enumerate(self.amino_acids):
            self.symbol2idx[aa.symbol] = i
            self.abrv2idx[aa.abrv] = i
            aa.idx = i
        self.special_mask = [0 for _ in aas] + [1 for _ in specials]

        # atom level vocab
        self.idx2atom = [self.atom_pad, self.atom_mask, 'C', 'N', 'O', 'S']
        self.idx2atom_pos = [self.atom_pos_pad, self.atom_pos_mask, self.atom_pos_bb, 'B', 'G', 'D', 'E', 'Z', 'H']
        self.atom2idx, self.atom_pos2idx = {}, {}
        for i, atom in enumerate(self.idx2atom):
            self.atom2idx[atom] = i
        for i, atom_pos in enumerate(self.idx2atom_pos):
            self.atom_pos2idx[atom_pos] = i
    
    def abrv_to_symbol(self, abrv):
        idx = self.abrv_to_idx(abrv)
        return None if idx is None else self.amino_acids[idx].symbol

    def symbol_to_abrv(self, symbol):
        idx = self.symbol_to_idx(symbol)
        return None if idx is None else self.amino_acids[idx].abrv

    def abrv_to_idx(self, abrv):
        abrv = abrv.upper()
        return self.abrv2idx.get(abrv, None)

    def symbol_to_idx(self, symbol):
        symbol = symbol.upper()
        return self.symbol2idx.get(symbol, None)
    
    def idx_to_symbol(self, idx):
        return self.amino_acids[idx].symbol

    def idx_to_abrv(self, idx):
        return self.amino_acids[idx].abrv

    def get_pad_idx(self):
        return self.symbol_to_idx(self.PAD)

    def get_mask_idx(self):
        return self.symbol_to_idx(self.MASK)
    
    def get_special_mask(self):
        return copy(self.special_mask)

    def get_atom_type_mat(self):
        atom_pad = self.get_atom_pad_idx()
        mat = []
        for i, aa in enumerate(self.amino_acids):
            atoms = [atom_pad for _ in range(self.MAX_ATOM_NUMBER)]
            if aa.symbol == self.PAD:
                pass
            elif self.special_mask[i] == 1:  # specials
                atom_mask = self.get_atom_mask_idx()
                atoms = [atom_mask for _ in range(self.MAX_ATOM_NUMBER)]
            else:
                for aidx, atom in enumerate(self.backbone_atoms + aa.sidechain):
                    atoms[aidx] = self.atom_to_idx(atom[0])
            mat.append(atoms)
        return mat

    def get_atom_pos_mat(self):
        atom_pos_pad = self.get_atom_pos_pad_idx()
        mat = []
        for i, aa in enumerate(self.amino_acids):
            aps = [atom_pos_pad for _ in range(self.MAX_ATOM_NUMBER)]
            if aa.symbol == self.PAD:
                pass
            elif self.special_mask[i] == 1:
                atom_pos_mask = self.get_atom_pos_mask_idx()
                aps = [atom_pos_mask for _ in range(self.MAX_ATOM_NUMBER)]
            else:
                aidx = 0
                for _ in self.backbone_atoms:
                    aps[aidx] = self.atom_pos_to_idx(self.atom_pos_bb)
                    aidx += 1
                for atom in aa.sidechain:
                    aps[aidx] = self.atom_pos_to_idx(atom[1])
                    aidx += 1
            mat.append(aps)
        return mat

    def get_sidechain_info(self, symbol):
        idx = self.symbol_to_idx(symbol)
        return copy(self.amino_acids[idx].sidechain)
    
    def get_sidechain_geometry(self, symbol):
        abrv = self.symbol_to_abrv(symbol)
        chi_angles_atoms = copy(self.chi_angles_atoms[abrv])
        sidechain_bonds = self.sidechain_bonds[abrv]
        return (chi_angles_atoms, sidechain_bonds)
    
    def get_atom_pad_idx(self):
        return self.atom2idx[self.atom_pad]
    
    def get_atom_mask_idx(self):
        return self.atom2idx[self.atom_mask]
    
    def get_atom_pos_pad_idx(self):
        return self.atom_pos2idx[self.atom_pos_pad]

    def get_atom_pos_mask_idx(self):
        return self.atom_pos2idx[self.atom_pos_mask]
    
    def idx_to_atom(self, idx):
        return self.idx2atom[idx]

    def atom_to_idx(self, atom):
        return self.atom2idx[atom]

    def idx_to_atom_pos(self, idx):
        return self.idx2atom_pos[idx]
    
    def atom_pos_to_idx(self, atom_pos):
        return self.atom_pos2idx[atom_pos]

    def get_num_atom_type(self):
        return len(self.idx2atom)
    
    def get_num_atom_pos(self):
        return len(self.idx2atom_pos)

    def get_num_amino_acid_type(self):
        return len(self.special_mask) - sum(self.special_mask)

    def __len__(self):
        return len(self.symbol2idx)


VOCAB = AminoAcidVocab()


def format_aa_abrv(abrv):  # special cases
    if abrv == 'MSE':
        return 'MET' # substitue MSE with MET
    return abrv


def fetch_from_pdb(identifier):
    # example identifier: 1FBI
    url = 'https://data.rcsb.org/rest/v1/core/entry/' + identifier
    res = requests.get(url)
    if res.status_code != 200:
        return None
    url = f'https://files.rcsb.org/download/{identifier}.pdb'
    text = requests.get(url)
    data = res.json()
    data['pdb'] = text.text
    return data