Source code for molecupy.converters.pdbdatafile2model

"""This module handles the logic of converting a :py:class:`.PdbDataFile` to a
:py:class:`.Model`"""

from ..structures import Model, Atom, GhostAtom, SmallMolecule, Residue, Chain
from ..structures import BindSite, AlphaHelix, BetaStrand, Complex
from ..pdb.pdbdatafile import PdbDataFile
from ..pdb import residues as residues_dict

[docs]def model_from_pdb_data_file(data_file, model_id=1): """Takes a :py:class:`.PdbDataFile`, converts it to a :py:class:`.Model`, and returns it. :py:class:`.PdbDataFile` objects can contain multiple models. By default, model 1 will be used, but you can specify specific models with the ``model_id`` argument. :param PdbDataFile data_file: The :py:class:`.PdbDataFile` to convert. :param int model_id: The ID of the model in the data fileto be used for\ conversion. :rtype: :py:class:`.Model`""" if not isinstance(data_file, PdbDataFile): raise TypeError("model_from_pdb_data_file can only convert PdbDataFiles") for model_dict in data_file.models(): if model_dict["model_id"] == model_id: model = Model() model._source = data_file add_small_molecules_to_model(model, data_file, model_id) add_chains_to_model(model, data_file, model_id) connect_atoms(model, data_file, model_id) bond_residue_atoms(model, data_file, model_id) bond_residues_together(model, data_file, model_id) make_disulphide_bonds(model, data_file, model_id) make_link_bonds(model, data_file, model_id) give_model_sites(model, data_file, model_id) map_sites_to_ligands(model, data_file, model_id) give_model_alpha_helices(model, data_file, model_id) give_model_beta_strands(model, data_file, model_id) give_model_complexes(model, data_file, model_id) return model raise ValueError("There is no model with ID %i" % model_id)
[docs]def add_small_molecules_to_model(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.SmallMolecule` objects in it based on the ``heteroatoms`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" heteroatoms = data_file.heteroatoms() molecule_names = set([a["residue_name"] for a in heteroatoms]) molecule_ids = set([_mol_id_from_atom(a) for a in heteroatoms]) for molecule_name in molecule_names: for molecule_id in molecule_ids: relevant_atoms = [a for a in heteroatoms if a["model_id"] == model_id and a["residue_name"] == molecule_name and _mol_id_from_atom(a) == molecule_id] if relevant_atoms: atoms = [Atom( a["x"], a["y"], a["z"], a["element"], a["atom_id"], a["atom_name"] ) for a in relevant_atoms] small_molecule = SmallMolecule( molecule_id, molecule_name, *atoms ) model.add_small_molecule(small_molecule)
[docs]def add_chains_to_model(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.Chain` objects in it based on the ``atoms`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" atoms = [a for a in data_file.atoms() if a["model_id"] == model_id] heteroatoms = [a for a in data_file.heteroatoms() if a["model_id"] == model_id] highest_id = _get_top_atom_id(atoms, heteroatoms) chain_ids = set([a["chain_id"] for a in atoms]) for chain_id in sorted(chain_ids): residues = [] _add_residues_to_chain(residues, chain_id, atoms) missing_residue_info = _get_missing_residue_info(data_file, chain_id) if missing_residue_info: _add_missing_residues_to_chain(residues, missing_residue_info, highest_id) chain = Chain(chain_id, *residues) highest_id = max( [atom.atom_id() for atom in chain.atoms(atom_type="all")] ) model.add_chain(chain)
[docs]def connect_atoms(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.Bond` objects between atoms in it based on the ``connections`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for connection in data_file.connections(): atom = model.get_atom_by_id(connection["atom_id"]) if atom: for bonded_atom_id in connection["bonded_atoms"]: bonded_atom = model.get_atom_by_id(bonded_atom_id) if bonded_atom and atom is not bonded_atom: atom.bond_to(bonded_atom)
[docs]def bond_residue_atoms(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.Bond` objects within the residues of the Model, based on a pre-defined dictionary of how residues are connected internally. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for chain in model.chains(): for residue in chain.residues(include_missing=False): lookup = residues_dict.connection_data.get(residue.residue_name()) if lookup: for atom in residue.atoms(): atom_lookup = lookup.get(atom.atom_name()) if atom_lookup: for other_atom_name in atom_lookup: other_atom = residue.get_atom_by_name(other_atom_name) if other_atom: atom.bond_to(other_atom)
[docs]def bond_residues_together(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.Bond` objects between the residues of chains in the model. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for chain in model.chains(): for index, residue in enumerate(chain.residues()[:-1]): next_residue = chain.residues()[index + 1] residue.connect_to(next_residue) carboxy_atom = residue.get_atom_by_name("C",) amino_nitrogen = next_residue.get_atom_by_name("N") if carboxy_atom and amino_nitrogen: carboxy_atom.bond_to(amino_nitrogen)
[docs]def make_disulphide_bonds(model, data_file, model_id): """Takes a :py:class:`.Model` and creates disulphide :py:class:`.Bond` objects in it based on the ``ss_bonds`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for disulphide_bond in data_file.ss_bonds(): chain1 = model.get_chain_by_id(disulphide_bond["chain_id_1"]) chain2 = model.get_chain_by_id(disulphide_bond["chain_id_2"]) if chain1 and chain2: residue1 = chain1.get_residue_by_id( disulphide_bond["chain_id_1"] + str(disulphide_bond["residue_id_1"]) + disulphide_bond["insert_code_1"] ) residue2 = chain2.get_residue_by_id( disulphide_bond["chain_id_2"] + str(disulphide_bond["residue_id_2"]) + disulphide_bond["insert_code_2"] ) if residue1 and residue2: atom1 = residue1.get_atom_by_element("S") atom2 = residue2.get_atom_by_element("S") if atom1 and atom2 and atom1 is not atom2: atom1.bond_to(atom2)
[docs]def give_model_sites(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.BindSite` objects in it based on the ``sites`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for site in data_file.sites(): residues = [model.get_chain_by_id(residue["chain_id"]).get_residue_by_id( str(residue["chain_id"]) + str(residue["residue_id"]) + residue["insert_code"] ) if residue["chain_id"] in [ chain.chain_id() for chain in model.chains() ] else None for residue in site["residues"]] residues = [r for r in residues if r] if residues: site = BindSite( site["site_id"], *residues ) model.add_bind_site(site)
[docs]def map_sites_to_ligands(model, data_file, model_id): """Takes a :py:class:`.Model` and assocated ligands and binding sites to each other based on 800-remarks in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" remark_800 = data_file.get_remark_by_number(800) if remark_800: remark_lines = [ line for line in remark_800["content"].split("\n") if line != "SITE" ] for index, line in enumerate(remark_lines): if line.startswith("SITE_IDENTIFIER"): site_id = line.split(":")[1].strip() if ":" in line else None if site_id: for trailing_line in remark_lines[index+1:]: if trailing_line.startswith("SITE_IDENTIFIER"): break if trailing_line.startswith("SITE_DESCRIPTION"): site = model.get_bind_site_by_id(site_id) if site: ligand_id = trailing_line.split()[-1] if not ligand_id[0].isalpha(): ligand_id = trailing_line.split()[-2] + ligand_id ligand = model.get_small_molecule_by_id(ligand_id) site.ligand(ligand)
[docs]def give_model_alpha_helices(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.AlphaHelix` objects in it based on the ``helices`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for helix in data_file.helices(): chain = model.get_chain_by_id(helix["start_residue_chain_id"]) if chain: start_residue = chain.get_residue_by_id( helix["start_residue_chain_id"] + str(helix["start_residue_id"]) + helix["start_residue_insert"] ) end_residue = chain.get_residue_by_id( helix["end_residue_chain_id"] + str(helix["end_residue_id"]) + helix["end_residue_insert"] ) if start_residue and end_residue: start_index = chain.residues().index(start_residue) end_index = chain.residues().index(end_residue) if end_index > start_index: AlphaHelix( helix["helix_name"], *chain.residues()[start_index:end_index + 1], comment=helix["comment"], helix_class=HELIX_CLASSES.get(helix["helix_class"], HELIX_CLASSES[1]) )
[docs]def give_model_beta_strands(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.BetaStrand` objects in it based on the ``sheets`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for sheet in data_file.sheets(): for strand in sheet["strands"]: chain = model.get_chain_by_id(strand["start_residue_chain_id"]) if chain: start_residue = chain.get_residue_by_id( strand["start_residue_chain_id"] + str(strand["start_residue_id"]) + strand["start_residue_insert"] ) end_residue = chain.get_residue_by_id( strand["end_residue_chain_id"] + str(strand["end_residue_id"]) + strand["end_residue_insert"] ) if start_residue and end_residue: start_index = chain.residues().index(start_residue) end_index = chain.residues().index(end_residue) BetaStrand( str(strand["strand_id"]), strand["sense"], *chain.residues()[start_index:end_index + 1] )
[docs]def give_model_complexes(model, data_file, model_id): """Takes a :py:class:`.Model` and creates :py:class:`.Complex` objects in it based on the ``compounds`` in the provided :py:class:`.PdbDataFile`. :param Model model: the model to update. :param PdbDataFile data_file: The source Pdb Data File :param int model_id: The ID of the model in the data fileto be used for\ conversion.""" for compound in data_file.compounds(): chains = [] for chain in model.chains(): if chain.chain_id() in compound["CHAIN"]: chains.append(chain) if chains: complex_ = Complex(str(compound["MOL_ID"]), compound["MOLECULE"], *chains) model.add_complex(complex_)
def _get_top_atom_id(atoms=None, heteroatoms=None): atom_id = max([atom["atom_id"] for atom in atoms]) if atoms else -1 hetero_id = max([atom["atom_id"] for atom in heteroatoms]) if heteroatoms else -1 return max((atom_id, hetero_id)) def _mol_id_from_atom(atom): return atom["chain_id"] + str(atom["residue_id"]) + atom["insert_code"] def _add_residues_to_chain(residues, chain_id, atoms): relevant_atoms = [a for a in atoms if a["chain_id"] == chain_id] residue_ids = [] for a in relevant_atoms: id_ = _mol_id_from_atom(a) if id_ not in residue_ids: residue_ids.append(id_) for residue_id in residue_ids: residue_atoms = [ a for a in relevant_atoms if _mol_id_from_atom(a) == residue_id ] pdb_atoms = [Atom( a["x"], a["y"], a["z"], a["element"], a["atom_id"], a["atom_name"] ) for a in residue_atoms] residue = Residue( residue_id, residue_atoms[0]["residue_name"], *pdb_atoms ) residues.append(residue) def _get_missing_residue_info(data_file, chain_id): missing_residues_remark = data_file.get_remark_by_number(465) if missing_residues_remark: missing_residues = missing_residues_remark["content"].split("\n")[6:] missing_residues = [line.split() for line in missing_residues if line] missing_residues = [ res for res in missing_residues if len(res) == 3 or int(res[0]) == model_id ] missing_residues = [ res for res in missing_residues if res[-2] == chain_id ] missing_residue_ids = [(res[0], res[-2] + res[-1]) for res in missing_residues] # Remove duplicate residue IDs - I can't remember why this is needed... while len(set([res[1] for res in missing_residue_ids])) < len(missing_residue_ids): ids = [res[1] for res in missing_residue_ids] duplicates = [id_ for id_ in ids if ids.count(id_) > 1] id_to_remove = duplicates[0] for res in missing_residue_ids[::-1]: if res[1] == id_to_remove: missing_residue_ids.remove(res) break return missing_residue_ids def _add_missing_residues_to_chain(residues, missing_residue_info, highest_id): missing_residues = [] atom_id = highest_id + 1 for missing_id in missing_residue_info: missing_residues.append(_create_missing_residue_from_id(missing_id, atom_id)) atom_id += len(missing_residues[-1].atoms(atom_type="all")) for missing_residue in missing_residues: closest_smaller_id = None if not _residue_id_is_greater_than_residue_id(residues[0].residue_id(), missing_residue.residue_id()): closest_smaller_id = sorted( residues, key=lambda k: _residue_id_to_int(missing_residue.residue_id()) - _residue_id_to_int( k.residue_id() ) if _residue_id_to_int(missing_residue.residue_id()) > _residue_id_to_int( k.residue_id() ) else float("inf"))[0] residues.insert( residues.index(closest_smaller_id) + 1 if closest_smaller_id else 0, missing_residue ) def _create_missing_residue_from_id(missing_id, atom_id): lookup = residues_dict.connection_data.get(missing_id[0]) residue = None if lookup: empty_atoms = [] for atom_name in lookup: atom = GhostAtom(atom_name[0], atom_id, atom_name) atom_id += 1 empty_atoms.append(atom) residue = Residue(missing_id[1], missing_id[0], *empty_atoms) else: empty_atoms = [] for atom_name in ["C", "CA", "N"]: atom = GhostAtom(atom_name[0], atom_id, atom_name) atom_id += 1 empty_atoms.append(atom) residue = Residue(missing_id[1], missing_id[0], *empty_atoms) return residue def _residue_id_to_int(residue_id): int_component = int( "".join([char for char in residue_id if char in "-0123456789"]) ) char_component = residue_id[-1] if residue_id[-1] not in "-0123456789" else "" int_component *= 100 return int_component + (ord(char_component) - 64 if char_component else 0) def _residue_id_is_greater_than_residue_id(residue_id1, residue_id2): residues = [] for residue_id in (residue_id1, residue_id2): chain_id = residue_id[0] if residue_id[0].isalpha() else "" number = int("".join([char for char in residue_id if char.isnumeric() or char == "-"])) insert = ord(residue_id[-1]) if residue_id[-1].isalpha() else 0 residues.append((chain_id, number, insert)) if residues[0][1] > residues[1][1]: return True elif residues[0][1] == residues[1][1] and residues[0][2] > residues[1][2]: return True else: return False HELIX_CLASSES = { 1: "Right-handed alpha", 2: "Right-handed omega", 3: "Right-handed pi", 4: "Right-handed gamma", 5: "Right-handed 3 - 10", 6: "Left-handed alpha", 7: "Left-handed omega", 8: "Left-handed gamma", 9: "2 - 7 ribbon/helix", 10: "Polyproline", }