Search

Kabsch-align

Status
Done
What it does:
Performs Kabsch alignment between two different domains
When to use:
When you want to find the optimal rotation matrix and translation matrix that minimizes the RMSD between two coordinate arrays with same shape
When you need to align specific residue ranges in two different protein chains
For structural comparison of protein domains with the same number of atoms

Python code

from Bio.PDB import PDBParser from Bio.PDB.Chain import Chain from Bio.PDB.Residue import Residue import numpy as np from typing import List, Tuple def perform_kabsch_align(pdb_fn1: str, chain_id1: str, residue_idx_range1: List[int], pdb_fn2: str, chain_id2: str, residue_idx_range2: List[int], missing_coord_placeholder: float, verbose: bool = False) -> Tuple[Chain, np.ndarray, np.ndarray]: """ Performs Kabsch alignment between specific residue ranges in two different chains. Parameters: ----------- pdb_fn1 : str Path to the first PDB file chain_id1 : str Chain ID in the first PDB file residue_idx_range1 : List[int] List of residue indices to use from first chain pdb_fn2 : str Path to the second PDB file chain_id2 : str Chain ID in the second PDB file residue_idx_range2 : List[int] List of residue indices to use from second chain missing_coord_placeholder : float Value used as a placeholder for missing coordinates verbose : bool Whether to print alignment information Returns: -------- chain2_aligned : Bio.PDB.Chain.Chain Aligned version of chain2 t : np.ndarray Translation vector R : np.ndarray Rotation matrix Raises: ------- KeyError: If specified chain IDs don't exist in the structures ValueError: If residue ranges don't match in length or CA atoms are missing """ try: parser = PDBParser(QUIET=True) structure1 = parser.get_structure('X', pdb_fn1) structure2 = parser.get_structure('Y', pdb_fn2) # Ensure we're working with single models if len(structure1) != 1: raise ValueError(f"First structure contains {len(structure1)} models, expected 1") if len(structure2) != 1: raise ValueError(f"Second structure contains {len(structure2)} models, expected 1") # Check if chains exist if chain_id1 not in structure1[0]: raise KeyError(f"Chain '{chain_id1}' not found in {pdb_fn1}") if chain_id2 not in structure2[0]: raise KeyError(f"Chain '{chain_id2}' not found in {pdb_fn2}") chain1 = structure1[0][chain_id1] chain2 = structure2[0][chain_id2] # Check if residue ranges have the same length if len(residue_idx_range1) != len(residue_idx_range2): raise ValueError("Residue index ranges must have the same length") # Extract CA coordinates for the specified residue ranges residue_list1 = list(chain1.get_residues()) residue_list2 = list(chain2.get_residues()) ca_coords1 = [] for i in residue_idx_range1: if i >= len(residue_list1): raise ValueError(f"Residue index {i} out of range for chain {chain_id1}") if 'CA' not in residue_list1[i]: raise ValueError(f"CA atom missing in residue {i} of chain {chain_id1}") ca_coords1.append(residue_list1[i]['CA'].coord) ca_coords1 = np.array(ca_coords1) ca_coords2 = [] for i in residue_idx_range2: if i >= len(residue_list2): raise ValueError(f"Residue index {i} out of range for chain {chain_id2}") if 'CA' not in residue_list2[i]: raise ValueError(f"CA atom missing in residue {i} of chain {chain_id2}") ca_coords2.append(residue_list2[i]['CA'].coord) ca_coords2 = np.array(ca_coords2) if verbose: print("RMSD between domains before alignment:", calc_rmsd(ca_coords1, ca_coords2)) # Create mask for missing coordinates mask = np.ones(ca_coords1.shape[0], dtype=bool) mask[ca_coords1[:, 0] == missing_coord_placeholder] = False mask[ca_coords2[:, 0] == missing_coord_placeholder] = False # Perform Kabsch alignment t, R = kabsch_align(ca_coords1, ca_coords2, mask) if verbose: print("Translation:", t) print("Rotation:", R) aligned_coords2 = transform_array(ca_coords2, R, t) print("RMSD between domains after alignment:", calc_rmsd(ca_coords1, aligned_coords2)) # Transform the entire second chain chain2_aligned = transform_chain(chain2, R, t, missing_coord_placeholder) return chain2_aligned, t, R except FileNotFoundError as e: raise FileNotFoundError(f"PDB file not found: {str(e)}") except Exception as e: raise Exception(f"Error performing Kabsch alignment: {str(e)}") def kabsch_align(target: np.ndarray, source: np.ndarray, mask: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray]: """ Performs Kabsch alignment to find optimal rotation and translation. Parameters: ----------- target : np.ndarray Target coordinates (Nx3) source : np.ndarray Source coordinates to be aligned to target (Nx3) mask : np.ndarray Boolean mask for valid coordinates Returns: -------- t : np.ndarray Translation vector (1x3) R : np.ndarray Rotation matrix (3x3) """ if mask is not None: if len(mask) != target.shape[0]: raise ValueError("Mask length must match the number of points in target/source.") target = target[mask] source = source[mask] if target.shape[0] == 0 or source.shape[0] == 0: raise ValueError("Target and source arrays must have at least one point.") # Center the coordinates mean_target = np.mean(target, axis=0, keepdims=True) mean_source = np.mean(source, axis=0, keepdims=True) center_target = target - mean_target center_source = source - mean_source # Calculate covariance matrix H = center_source.T @ center_target # Singular value decomposition U, S, Vt = np.linalg.svd(H) # Calculate rotation matrix R = Vt.T @ U.T # Ensure proper rotation (det(R) = 1) if np.linalg.det(R) < 0: Vt[-1, :] *= -1 R = Vt.T @ U.T # Calculate translation t = mean_target - mean_source @ R.T return t, R def transform_array(array: np.ndarray, rot: np.ndarray, tr: np.ndarray) -> np.ndarray: """ Transforms coordinates using rotation matrix and translation vector. Parameters: ----------- array : np.ndarray Coordinates to transform (Nx3) rot : np.ndarray Rotation matrix (3x3) tr : np.ndarray Translation vector (1x3) Returns: -------- transformed : np.ndarray Transformed coordinates """ return array @ rot.T + tr def transform_chain(chain: Chain, rot: np.ndarray, tr: np.ndarray, missing_coord_place_holder: float) -> Chain: """ Transforms an entire chain using rotation matrix and translation vector. Parameters: ----------- chain : Bio.PDB.Chain.Chain Chain to transform rot : np.ndarray Rotation matrix (3x3) tr : np.ndarray Translation vector (1x3) missing_coord_place_holder : float Value used as a placeholder for missing coordinates Returns: -------- new_chain : Bio.PDB.Chain.Chain Transformed chain """ new_chain = Chain(chain.id) for residue in chain: new_residue = Residue(residue.get_id(), residue.get_resname(), residue.get_segid()) for atom in residue: new_atom = atom.copy() if not np.any(atom.get_coord() == missing_coord_place_holder): new_coord = atom.get_coord() @ rot.T + tr new_atom.set_coord(new_coord[0] if new_coord.ndim > 1 else new_coord) new_residue.add(new_atom) new_chain.add(new_residue) return new_chain def calc_rmsd(coords1: np.ndarray, coords2: np.ndarray, mask: np.ndarray = None) -> float: """ Calculates RMSD between two sets of coordinates. Parameters: ----------- coords1 : np.ndarray First coordinate set (Nx3) coords2 : np.ndarray Second coordinate set (Nx3) mask : np.ndarray Boolean mask for valid coordinates Returns: -------- rmsd : float Root mean square deviation """ if mask is not None: coords1 = coords1[mask] coords2 = coords2[mask] return np.sqrt(np.mean(np.sum((coords1 - coords2) ** 2, axis=1)))
Python
복사

Example usage

pdb_fn1 = "5xxy.pdb" chain_id1 = 'H' residue_idx_range1 = list(range(3, 17)) # Residues 3-16 (0-indexed) pdb_fn2 = "5x8l.pdb" chain_id2 = 'H' residue_idx_range2 = list(range(3, 17)) # Matching residue range missing_coord_placeholder = 99.999 chain2_aligned, translation, rotation = perform_kabsch_align( pdb_fn1, chain_id1, residue_idx_range1, pdb_fn2, chain_id2, residue_idx_range2, missing_coord_placeholder, verbose=True ) # You can use the aligned chain for further analysis or visualization
Python
복사
Output

Tips and Tricks

The Kabsch algorithm provides the optimal rigid-body alignment that minimizes RMSD between two sets of points
Make sure the residue indices in both ranges correspond to structurally equivalent positions
Remember that the residue indices are 0-indexed in this implementation
Unlike TM-align, Kabsch requires the same number of atoms in both structures being compared
For domains with insertions or deletions, TM-align might be more appropriate
The transformed chain can be saved to a PDB file using Bio.PDB.PDBIO for visualization