What it does:
Performs Kabsch alignment between two different domains
When to use:
•
When you want to find the optimal rotation matrix and translation matrix that minimizes the RMSD between two coordinate arrays with same shape
•
When you need to align specific residue ranges in two different protein chains
•
For structural comparison of protein domains with the same number of atoms
Python code
from Bio.PDB import PDBParser
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
import numpy as np
from typing import List, Tuple
def perform_kabsch_align(pdb_fn1: str, chain_id1: str, residue_idx_range1: List[int],
pdb_fn2: str, chain_id2: str, residue_idx_range2: List[int],
missing_coord_placeholder: float, verbose: bool = False) -> Tuple[Chain, np.ndarray, np.ndarray]:
"""
Performs Kabsch alignment between specific residue ranges in two different chains.
Parameters:
-----------
pdb_fn1 : str
Path to the first PDB file
chain_id1 : str
Chain ID in the first PDB file
residue_idx_range1 : List[int]
List of residue indices to use from first chain
pdb_fn2 : str
Path to the second PDB file
chain_id2 : str
Chain ID in the second PDB file
residue_idx_range2 : List[int]
List of residue indices to use from second chain
missing_coord_placeholder : float
Value used as a placeholder for missing coordinates
verbose : bool
Whether to print alignment information
Returns:
--------
chain2_aligned : Bio.PDB.Chain.Chain
Aligned version of chain2
t : np.ndarray
Translation vector
R : np.ndarray
Rotation matrix
Raises:
-------
KeyError: If specified chain IDs don't exist in the structures
ValueError: If residue ranges don't match in length or CA atoms are missing
"""
try:
parser = PDBParser(QUIET=True)
structure1 = parser.get_structure('X', pdb_fn1)
structure2 = parser.get_structure('Y', pdb_fn2)
# Ensure we're working with single models
if len(structure1) != 1:
raise ValueError(f"First structure contains {len(structure1)} models, expected 1")
if len(structure2) != 1:
raise ValueError(f"Second structure contains {len(structure2)} models, expected 1")
# Check if chains exist
if chain_id1 not in structure1[0]:
raise KeyError(f"Chain '{chain_id1}' not found in {pdb_fn1}")
if chain_id2 not in structure2[0]:
raise KeyError(f"Chain '{chain_id2}' not found in {pdb_fn2}")
chain1 = structure1[0][chain_id1]
chain2 = structure2[0][chain_id2]
# Check if residue ranges have the same length
if len(residue_idx_range1) != len(residue_idx_range2):
raise ValueError("Residue index ranges must have the same length")
# Extract CA coordinates for the specified residue ranges
residue_list1 = list(chain1.get_residues())
residue_list2 = list(chain2.get_residues())
ca_coords1 = []
for i in residue_idx_range1:
if i >= len(residue_list1):
raise ValueError(f"Residue index {i} out of range for chain {chain_id1}")
if 'CA' not in residue_list1[i]:
raise ValueError(f"CA atom missing in residue {i} of chain {chain_id1}")
ca_coords1.append(residue_list1[i]['CA'].coord)
ca_coords1 = np.array(ca_coords1)
ca_coords2 = []
for i in residue_idx_range2:
if i >= len(residue_list2):
raise ValueError(f"Residue index {i} out of range for chain {chain_id2}")
if 'CA' not in residue_list2[i]:
raise ValueError(f"CA atom missing in residue {i} of chain {chain_id2}")
ca_coords2.append(residue_list2[i]['CA'].coord)
ca_coords2 = np.array(ca_coords2)
if verbose:
print("RMSD between domains before alignment:", calc_rmsd(ca_coords1, ca_coords2))
# Create mask for missing coordinates
mask = np.ones(ca_coords1.shape[0], dtype=bool)
mask[ca_coords1[:, 0] == missing_coord_placeholder] = False
mask[ca_coords2[:, 0] == missing_coord_placeholder] = False
# Perform Kabsch alignment
t, R = kabsch_align(ca_coords1, ca_coords2, mask)
if verbose:
print("Translation:", t)
print("Rotation:", R)
aligned_coords2 = transform_array(ca_coords2, R, t)
print("RMSD between domains after alignment:", calc_rmsd(ca_coords1, aligned_coords2))
# Transform the entire second chain
chain2_aligned = transform_chain(chain2, R, t, missing_coord_placeholder)
return chain2_aligned, t, R
except FileNotFoundError as e:
raise FileNotFoundError(f"PDB file not found: {str(e)}")
except Exception as e:
raise Exception(f"Error performing Kabsch alignment: {str(e)}")
def kabsch_align(target: np.ndarray, source: np.ndarray, mask: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray]:
"""
Performs Kabsch alignment to find optimal rotation and translation.
Parameters:
-----------
target : np.ndarray
Target coordinates (Nx3)
source : np.ndarray
Source coordinates to be aligned to target (Nx3)
mask : np.ndarray
Boolean mask for valid coordinates
Returns:
--------
t : np.ndarray
Translation vector (1x3)
R : np.ndarray
Rotation matrix (3x3)
"""
if mask is not None:
if len(mask) != target.shape[0]:
raise ValueError("Mask length must match the number of points in target/source.")
target = target[mask]
source = source[mask]
if target.shape[0] == 0 or source.shape[0] == 0:
raise ValueError("Target and source arrays must have at least one point.")
# Center the coordinates
mean_target = np.mean(target, axis=0, keepdims=True)
mean_source = np.mean(source, axis=0, keepdims=True)
center_target = target - mean_target
center_source = source - mean_source
# Calculate covariance matrix
H = center_source.T @ center_target
# Singular value decomposition
U, S, Vt = np.linalg.svd(H)
# Calculate rotation matrix
R = Vt.T @ U.T
# Ensure proper rotation (det(R) = 1)
if np.linalg.det(R) < 0:
Vt[-1, :] *= -1
R = Vt.T @ U.T
# Calculate translation
t = mean_target - mean_source @ R.T
return t, R
def transform_array(array: np.ndarray, rot: np.ndarray, tr: np.ndarray) -> np.ndarray:
"""
Transforms coordinates using rotation matrix and translation vector.
Parameters:
-----------
array : np.ndarray
Coordinates to transform (Nx3)
rot : np.ndarray
Rotation matrix (3x3)
tr : np.ndarray
Translation vector (1x3)
Returns:
--------
transformed : np.ndarray
Transformed coordinates
"""
return array @ rot.T + tr
def transform_chain(chain: Chain, rot: np.ndarray, tr: np.ndarray, missing_coord_place_holder: float) -> Chain:
"""
Transforms an entire chain using rotation matrix and translation vector.
Parameters:
-----------
chain : Bio.PDB.Chain.Chain
Chain to transform
rot : np.ndarray
Rotation matrix (3x3)
tr : np.ndarray
Translation vector (1x3)
missing_coord_place_holder : float
Value used as a placeholder for missing coordinates
Returns:
--------
new_chain : Bio.PDB.Chain.Chain
Transformed chain
"""
new_chain = Chain(chain.id)
for residue in chain:
new_residue = Residue(residue.get_id(), residue.get_resname(), residue.get_segid())
for atom in residue:
new_atom = atom.copy()
if not np.any(atom.get_coord() == missing_coord_place_holder):
new_coord = atom.get_coord() @ rot.T + tr
new_atom.set_coord(new_coord[0] if new_coord.ndim > 1 else new_coord)
new_residue.add(new_atom)
new_chain.add(new_residue)
return new_chain
def calc_rmsd(coords1: np.ndarray, coords2: np.ndarray, mask: np.ndarray = None) -> float:
"""
Calculates RMSD between two sets of coordinates.
Parameters:
-----------
coords1 : np.ndarray
First coordinate set (Nx3)
coords2 : np.ndarray
Second coordinate set (Nx3)
mask : np.ndarray
Boolean mask for valid coordinates
Returns:
--------
rmsd : float
Root mean square deviation
"""
if mask is not None:
coords1 = coords1[mask]
coords2 = coords2[mask]
return np.sqrt(np.mean(np.sum((coords1 - coords2) ** 2, axis=1)))
Python
복사
Example usage
pdb_fn1 = "5xxy.pdb"
chain_id1 = 'H'
residue_idx_range1 = list(range(3, 17)) # Residues 3-16 (0-indexed)
pdb_fn2 = "5x8l.pdb"
chain_id2 = 'H'
residue_idx_range2 = list(range(3, 17)) # Matching residue range
missing_coord_placeholder = 99.999
chain2_aligned, translation, rotation = perform_kabsch_align(
pdb_fn1, chain_id1, residue_idx_range1,
pdb_fn2, chain_id2, residue_idx_range2,
missing_coord_placeholder, verbose=True
)
# You can use the aligned chain for further analysis or visualization
Python
복사
Output
Tips and Tricks
•
The Kabsch algorithm provides the optimal rigid-body alignment that minimizes RMSD between two sets of points
•
Make sure the residue indices in both ranges correspond to structurally equivalent positions
•
Remember that the residue indices are 0-indexed in this implementation
•
Unlike TM-align, Kabsch requires the same number of atoms in both structures being compared
•
For domains with insertions or deletions, TM-align might be more appropriate
•
The transformed chain can be saved to a PDB file using Bio.PDB.PDBIO for visualization