What it does:
(TM-align is an algorithm for sequence-independent protein structure comparisons)
When to use:
•
When you want to align two protein chains using only structural information, without sequence/residue alignment.
•
If you’re not sure whether two chains have equal sequence or at least equal sequence length, use TM-align to find the best structural match.
•
When comparing protein structures with potentially different evolutionary origins but similar folds
Python code
from Bio.PDB import PDBParser
from Bio.Data.PDBData import protein_letters_3to1
from tmtools import tm_align
import numpy as np
def perform_tm_align(pdb_fn1: str, chain_id1: str, pdb_fn2: str, chain_id2: str, missing_coord_placeholder: float = 99.999):
"""
Performs TM-align between specific chains in two PDB files.
Parameters:
-----------
pdb_fn1 : str
Path to the first PDB file
chain_id1 : str
Chain ID in the first PDB file
pdb_fn2 : str
Path to the second PDB file
chain_id2 : str
Chain ID in the second PDB file
missing_coord_placeholder : float
Value used as a placeholder for missing coordinates
Returns:
--------
result : dict
Dictionary containing TM-align results including TM-score, rotation matrix, etc.
Raises:
-------
KeyError: If specified chain IDs don't exist in the structures
ValueError: If structures have multiple models or insufficient CA atoms
"""
parser = PDBParser(QUIET=True)
structure1 = parser.get_structure('X', pdb_fn1)
structure2 = parser.get_structure('Y', pdb_fn2)
# Ensure we're working with single models
if len(structure1) != 1:
raise ValueError(f"First structure contains {len(structure1)} models, expected 1")
if len(structure2) != 1:
raise ValueError(f"Second structure contains {len(structure2)} models, expected 1")
# Check if chains exist
if chain_id1 not in structure1[0]:
raise KeyError(f"Chain '{chain_id1}' not found in {pdb_fn1}")
if chain_id2 not in structure2[0]:
raise KeyError(f"Chain '{chain_id2}' not found in {pdb_fn2}")
chain1 = structure1[0][chain_id1]
chain2 = structure2[0][chain_id2]
# Extract sequences
seq1 = ''.join([protein_letters_3to1.get(residue.get_resname(), 'X') for residue in chain1.get_residues()])
seq2 = ''.join([protein_letters_3to1.get(residue.get_resname(), 'X') for residue in chain2.get_residues()])
# Extract CA coordinates and filter out missing atoms
ca_coords1 = []
for residue in chain1.get_residues():
if 'CA' in residue:
ca_coords1.append(residue['CA'].coord)
if not ca_coords1:
raise ValueError(f"No CA atoms found in chain '{chain_id1}' of {pdb_fn1}")
ca_coords1 = np.array(ca_coords1)
ca_mask1 = ca_coords1[:, 0] != missing_coord_placeholder
ca_coords1 = ca_coords1[ca_mask1]
if len(ca_coords1) == 0:
raise ValueError(f"No valid CA atoms found in chain '{chain_id1}' after filtering missing coordinates")
seq1 = ''.join(list(np.array(list(seq1))[ca_mask1]))
ca_coords2 = []
for residue in chain2.get_residues():
if 'CA' in residue:
ca_coords2.append(residue['CA'].coord)
if not ca_coords2:
raise ValueError(f"No CA atoms found in chain '{chain_id2}' of {pdb_fn2}")
ca_coords2 = np.array(ca_coords2)
ca_mask2 = ca_coords2[:, 0] != missing_coord_placeholder
ca_coords2 = ca_coords2[ca_mask2]
if len(ca_coords2) == 0:
raise ValueError(f"No valid CA atoms found in chain '{chain_id2}' after filtering missing coordinates")
seq2 = ''.join(list(np.array(list(seq2))[ca_mask2]))
# Perform TM-align
result = tm_align(ca_coords1, ca_coords2, seq1, seq2)
return result
Python
복사
Example usage
pdb_fn1 = "1mbn.pdb"
chain_1 = "A"
pdb_fn2 = "1pmb.pdb"
chain_2 = "A"
missing_coord_placeholder = 99.999
result = perform_tm_align(pdb_fn1, chain_id1, pdb_fn2, chain_id2, missing_coord_placeholder)
Python
복사
output
Tips and Tricks
•
TM-score has the value in (0, 1], with values >0.5 generally indicating the same fold
•
•
The function checks for CA atoms in each residue, making it more robust against incomplete structures