Source code for molecupy.converters.pdbdatafile2model

"""This module handles the logic of converting a :py:class:`.PdbDataFile` to a
:py:class:`.Model`"""

from ..structures import Model, Atom, GhostAtom, SmallMolecule, Residue, Chain
from ..structures import BindSite, AlphaHelix, BetaStrand, Complex
from ..pdb.pdbdatafile import PdbDataFile
from ..pdb import residues as residues_dict

[docs]def model_from_pdb_data_file(data_file, model_id=1):
    """Takes a :py:class:`.PdbDataFile`, converts it to a :py:class:`.Model`,
    and returns it.

    :py:class:`.PdbDataFile` objects can contain multiple models. By default,
    model 1 will be used, but you can specify specific models with the
    ``model_id`` argument.

    :param PdbDataFile data_file: The :py:class:`.PdbDataFile` to convert.
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion.
    :rtype: :py:class:`.Model`"""

    if not isinstance(data_file, PdbDataFile):
        raise TypeError("model_from_pdb_data_file can only convert PdbDataFiles")
    for model_dict in data_file.models():
        if model_dict["model_id"] == model_id:
            model = Model()
            model._source = data_file

            add_small_molecules_to_model(model, data_file, model_id)
            add_chains_to_model(model, data_file, model_id)
            connect_atoms(model, data_file, model_id)
            bond_residue_atoms(model, data_file, model_id)
            bond_residues_together(model, data_file, model_id)
            make_disulphide_bonds(model, data_file, model_id)
            make_link_bonds(model, data_file, model_id)
            give_model_sites(model, data_file, model_id)
            map_sites_to_ligands(model, data_file, model_id)
            give_model_alpha_helices(model, data_file, model_id)
            give_model_beta_strands(model, data_file, model_id)
            give_model_complexes(model, data_file, model_id)

            return model
    raise ValueError("There is no model with ID %i" % model_id)


[docs]def add_small_molecules_to_model(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.SmallMolecule`
    objects in it based on the ``heteroatoms`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    heteroatoms = data_file.heteroatoms()
    molecule_names = set([a["residue_name"] for a in heteroatoms])
    molecule_ids = set([_mol_id_from_atom(a) for a in heteroatoms])
    for molecule_name in molecule_names:
        for molecule_id in molecule_ids:
            relevant_atoms = [a for a in heteroatoms if a["model_id"] == model_id
             and a["residue_name"] == molecule_name and _mol_id_from_atom(a) == molecule_id]
            if relevant_atoms:
                atoms = [Atom(
                 a["x"], a["y"], a["z"],
                 a["element"],
                 a["atom_id"],
                 a["atom_name"]
                ) for a in relevant_atoms]
                small_molecule = SmallMolecule(
                 molecule_id, molecule_name, *atoms
                )
                model.add_small_molecule(small_molecule)


[docs]def add_chains_to_model(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.Chain`
    objects in it based on the ``atoms`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    atoms = [a for a in data_file.atoms() if a["model_id"] == model_id]
    heteroatoms = [a for a in data_file.heteroatoms() if a["model_id"] == model_id]
    highest_id = _get_top_atom_id(atoms, heteroatoms)
    chain_ids = set([a["chain_id"] for a in atoms])
    for chain_id in sorted(chain_ids):
        residues = []
        _add_residues_to_chain(residues, chain_id, atoms)
        missing_residue_info = _get_missing_residue_info(data_file, chain_id)
        if missing_residue_info:
            _add_missing_residues_to_chain(residues, missing_residue_info, highest_id)
        chain = Chain(chain_id, *residues)
        highest_id = max(
         [atom.atom_id() for atom in chain.atoms(atom_type="all")]
        )
        model.add_chain(chain)


[docs]def connect_atoms(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.Bond`
    objects between atoms in it based on the ``connections`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for connection in data_file.connections():
        atom = model.get_atom_by_id(connection["atom_id"])
        if atom:
            for bonded_atom_id in connection["bonded_atoms"]:
                bonded_atom = model.get_atom_by_id(bonded_atom_id)
                if bonded_atom and atom is not bonded_atom:
                    atom.bond_to(bonded_atom)


[docs]def bond_residue_atoms(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.Bond`
    objects within the residues of the Model, based on a pre-defined
    dictionary of how residues are connected internally.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for chain in model.chains():
        for residue in chain.residues(include_missing=False):
            lookup = residues_dict.connection_data.get(residue.residue_name())
            if lookup:
                for atom in residue.atoms():
                    atom_lookup = lookup.get(atom.atom_name())
                    if atom_lookup:
                        for other_atom_name in atom_lookup:
                            other_atom = residue.get_atom_by_name(other_atom_name)
                            if other_atom:
                                atom.bond_to(other_atom)


[docs]def bond_residues_together(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.Bond`
    objects between the residues of chains in the model.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for chain in model.chains():
        for index, residue in enumerate(chain.residues()[:-1]):
            next_residue = chain.residues()[index + 1]
            residue.connect_to(next_residue)
            carboxy_atom = residue.get_atom_by_name("C",)
            amino_nitrogen = next_residue.get_atom_by_name("N")
            if carboxy_atom and amino_nitrogen:
                carboxy_atom.bond_to(amino_nitrogen)


[docs]def make_disulphide_bonds(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates disulphide :py:class:`.Bond`
    objects in it based on the ``ss_bonds`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for disulphide_bond in data_file.ss_bonds():
        chain1 = model.get_chain_by_id(disulphide_bond["chain_id_1"])
        chain2 = model.get_chain_by_id(disulphide_bond["chain_id_2"])
        if chain1 and chain2:
            residue1 = chain1.get_residue_by_id(
             disulphide_bond["chain_id_1"] +
             str(disulphide_bond["residue_id_1"]) +
             disulphide_bond["insert_code_1"]
            )
            residue2 = chain2.get_residue_by_id(
             disulphide_bond["chain_id_2"] +
             str(disulphide_bond["residue_id_2"]) +
             disulphide_bond["insert_code_2"]
            )
            if residue1 and residue2:
                atom1 = residue1.get_atom_by_element("S")
                atom2 = residue2.get_atom_by_element("S")
                if atom1 and atom2 and atom1 is not atom2:
                    atom1.bond_to(atom2)


[docs]def make_link_bonds(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates specified :py:class:`.Bond`
    objects in it based on the ``links`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for link in data_file.links():
        chain1 = model.get_chain_by_id(link["chain_id_1"])
        chain2 = model.get_chain_by_id(link["chain_id_2"])
        if chain1 and chain2:
            molecule1, molecule2 = [chain1.get_residue_by_id(
             link["chain_id_%i" % n] +
             str(link["residue_id_%i" % n]) +
             link["insert_code_%i" % n]
            ) for n in (1, 2)]
            if not molecule1:
                molecule1 = model.get_small_molecule_by_id(
                 link["chain_id_1"] +
                 str(link["residue_id_1"]) +
                 link["insert_code_1"]
                )
            if not molecule2:
                molecule2 = model.get_small_molecule_by_id(
                 link["chain_id_2"] +
                 str(link["residue_id_2"]) +
                 link["insert_code_2"]
                )
            if molecule1 and molecule2:
                atom1 = molecule1.get_atom_by_name(link["atom_1"])
                atom2 = molecule2.get_atom_by_name(link["atom_2"])
                if atom1 and atom2:
                    atom1.bond_to(atom2)


[docs]def give_model_sites(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.BindSite`
    objects in it based on the ``sites`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for site in data_file.sites():
        residues = [model.get_chain_by_id(residue["chain_id"]).get_residue_by_id(
         str(residue["chain_id"]) + str(residue["residue_id"]) + residue["insert_code"]
        ) if residue["chain_id"] in [
         chain.chain_id() for chain in model.chains()
        ] else None for residue in site["residues"]]
        residues = [r for r in residues if r]
        if residues:
            site = BindSite(
             site["site_id"],
             *residues
            )
            model.add_bind_site(site)


[docs]def map_sites_to_ligands(model, data_file, model_id):
    """Takes a :py:class:`.Model` and assocated ligands and binding sites to
    each other based on 800-remarks in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    remark_800 = data_file.get_remark_by_number(800)
    if remark_800:
        remark_lines = [
         line for line in remark_800["content"].split("\n") if line != "SITE"
        ]
        for index, line in enumerate(remark_lines):
            if line.startswith("SITE_IDENTIFIER"):
                site_id = line.split(":")[1].strip() if ":" in line else None
                if site_id:
                    for trailing_line in remark_lines[index+1:]:
                        if trailing_line.startswith("SITE_IDENTIFIER"): break
                        if trailing_line.startswith("SITE_DESCRIPTION"):
                            site = model.get_bind_site_by_id(site_id)
                            if site:
                                ligand_id = trailing_line.split()[-1]
                                if not ligand_id[0].isalpha():
                                    ligand_id = trailing_line.split()[-2] + ligand_id
                                ligand = model.get_small_molecule_by_id(ligand_id)
                                site.ligand(ligand)


[docs]def give_model_alpha_helices(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.AlphaHelix`
    objects in it based on the ``helices`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for helix in data_file.helices():
        chain = model.get_chain_by_id(helix["start_residue_chain_id"])
        if chain:
            start_residue = chain.get_residue_by_id(
             helix["start_residue_chain_id"] +
             str(helix["start_residue_id"]) +
             helix["start_residue_insert"]
            )
            end_residue = chain.get_residue_by_id(
             helix["end_residue_chain_id"] +
             str(helix["end_residue_id"]) +
             helix["end_residue_insert"]
            )
            if start_residue and end_residue:
                start_index = chain.residues().index(start_residue)
                end_index = chain.residues().index(end_residue)
                if end_index > start_index:
                    AlphaHelix(
                     helix["helix_name"],
                     *chain.residues()[start_index:end_index + 1],
                     comment=helix["comment"],
                     helix_class=HELIX_CLASSES.get(helix["helix_class"], HELIX_CLASSES[1])
                    )


[docs]def give_model_beta_strands(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.BetaStrand`
    objects in it based on the ``sheets`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""

    for sheet in data_file.sheets():
        for strand in sheet["strands"]:
            chain = model.get_chain_by_id(strand["start_residue_chain_id"])
            if chain:
                start_residue = chain.get_residue_by_id(
                 strand["start_residue_chain_id"] +
                 str(strand["start_residue_id"]) +
                 strand["start_residue_insert"]
                )
                end_residue = chain.get_residue_by_id(
                 strand["end_residue_chain_id"] +
                 str(strand["end_residue_id"]) +
                 strand["end_residue_insert"]
                )
                if start_residue and end_residue:
                    start_index = chain.residues().index(start_residue)
                    end_index = chain.residues().index(end_residue)
                    BetaStrand(
                     str(strand["strand_id"]),
                     strand["sense"],
                     *chain.residues()[start_index:end_index + 1]
                    )


[docs]def give_model_complexes(model, data_file, model_id):
    """Takes a :py:class:`.Model` and creates :py:class:`.Complex`
    objects in it based on the ``compounds`` in the provided
    :py:class:`.PdbDataFile`.

    :param Model model: the model to update.
    :param PdbDataFile data_file: The source Pdb Data File
    :param int model_id: The ID of the model in the data fileto be used for\
    conversion."""
    
    for compound in data_file.compounds():
        chains = []
        for chain in model.chains():
            if chain.chain_id() in compound["CHAIN"]:
                chains.append(chain)
        if chains:
            complex_ = Complex(str(compound["MOL_ID"]), compound["MOLECULE"], *chains)
            model.add_complex(complex_)


def _get_top_atom_id(atoms=None, heteroatoms=None):
    atom_id = max([atom["atom_id"] for atom in atoms]) if atoms else -1
    hetero_id = max([atom["atom_id"] for atom in heteroatoms]) if heteroatoms else -1
    return max((atom_id, hetero_id))


def _mol_id_from_atom(atom):
    return atom["chain_id"] + str(atom["residue_id"]) + atom["insert_code"]


def _add_residues_to_chain(residues, chain_id, atoms):
    relevant_atoms = [a for a in atoms if a["chain_id"] == chain_id]
    residue_ids = []
    for a in relevant_atoms:
        id_ = _mol_id_from_atom(a)
        if id_ not in residue_ids:
            residue_ids.append(id_)
    for residue_id in residue_ids:
        residue_atoms = [
         a for a in relevant_atoms if _mol_id_from_atom(a) == residue_id
        ]
        pdb_atoms = [Atom(
         a["x"], a["y"], a["z"],
         a["element"],
         a["atom_id"],
         a["atom_name"]
        ) for a in residue_atoms]
        residue = Residue(
         residue_id, residue_atoms[0]["residue_name"], *pdb_atoms
        )
        residues.append(residue)


def _get_missing_residue_info(data_file, chain_id):
    missing_residues_remark = data_file.get_remark_by_number(465)
    if missing_residues_remark:
        missing_residues = missing_residues_remark["content"].split("\n")[6:]
        missing_residues = [line.split() for line in missing_residues if line]
        missing_residues = [
         res for res in missing_residues if len(res) == 3 or int(res[0]) == model_id
        ]
        missing_residues = [
         res for res in missing_residues if res[-2] == chain_id
        ]
        missing_residue_ids = [(res[0], res[-2] + res[-1]) for res in missing_residues]

        # Remove duplicate residue IDs - I can't remember why this is needed...
        while len(set([res[1] for res in missing_residue_ids])) < len(missing_residue_ids):
            ids = [res[1] for res in missing_residue_ids]
            duplicates = [id_ for id_ in ids if ids.count(id_) > 1]
            id_to_remove = duplicates[0]
            for res in missing_residue_ids[::-1]:
                if res[1] == id_to_remove:
                    missing_residue_ids.remove(res)
                    break
        return missing_residue_ids


def _add_missing_residues_to_chain(residues, missing_residue_info, highest_id):
    missing_residues = []
    atom_id = highest_id + 1
    for missing_id in missing_residue_info:
        missing_residues.append(_create_missing_residue_from_id(missing_id, atom_id))
        atom_id += len(missing_residues[-1].atoms(atom_type="all"))

    for missing_residue in missing_residues:
        closest_smaller_id = None
        if not _residue_id_is_greater_than_residue_id(residues[0].residue_id(), missing_residue.residue_id()):
            closest_smaller_id = sorted(
             residues,
             key=lambda k: _residue_id_to_int(missing_residue.residue_id()) - _residue_id_to_int(
              k.residue_id()
             ) if _residue_id_to_int(missing_residue.residue_id()) > _residue_id_to_int(
              k.residue_id()
             ) else float("inf"))[0]
        residues.insert(
         residues.index(closest_smaller_id) + 1 if closest_smaller_id else 0,
         missing_residue
        )


def _create_missing_residue_from_id(missing_id, atom_id):
    lookup = residues_dict.connection_data.get(missing_id[0])
    residue = None
    if lookup:
        empty_atoms = []
        for atom_name in lookup:
            atom = GhostAtom(atom_name[0], atom_id, atom_name)
            atom_id += 1
            empty_atoms.append(atom)
        residue = Residue(missing_id[1], missing_id[0], *empty_atoms)
    else:
        empty_atoms = []
        for atom_name in ["C", "CA", "N"]:
            atom = GhostAtom(atom_name[0], atom_id, atom_name)
            atom_id += 1
            empty_atoms.append(atom)
        residue = Residue(missing_id[1], missing_id[0], *empty_atoms)
    return residue


def _residue_id_to_int(residue_id):
    int_component = int(
     "".join([char for char in residue_id if char in "-0123456789"])
    )
    char_component = residue_id[-1] if residue_id[-1] not in "-0123456789" else ""
    int_component *= 100
    return int_component + (ord(char_component) - 64 if char_component else 0)


def _residue_id_is_greater_than_residue_id(residue_id1, residue_id2):
    residues = []
    for residue_id in (residue_id1, residue_id2):
        chain_id = residue_id[0] if residue_id[0].isalpha() else ""
        number = int("".join([char for char in residue_id if char.isnumeric() or char == "-"]))
        insert = ord(residue_id[-1]) if residue_id[-1].isalpha() else 0
        residues.append((chain_id, number, insert))
    if residues[0][1] > residues[1][1]:
        return True
    elif residues[0][1] == residues[1][1] and residues[0][2] > residues[1][2]:
        return True
    else:
        return False


HELIX_CLASSES = {
 1: "Right-handed alpha",
 2: "Right-handed omega",
 3: "Right-handed pi",
 4: "Right-handed gamma",
 5: "Right-handed 3 - 10",
 6: "Left-handed alpha",
 7: "Left-handed omega",
 8: "Left-handed gamma",
 9: "2 - 7 ribbon/helix",
 10: "Polyproline",
}