Skip to content
177 changes: 162 additions & 15 deletions pdbtools/pdb_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,18 @@
"""
Merges several PDB files into one.

The contents are not sorted and no lines are deleted (e.g. END, TER
statements) so we recommend piping the results through `pdb_tidy.py`.
Use `pdb_mkensemble` if you with to make an ensemble of multiple
conformation states of the same protein.

Follows the criteria:

* The merged PDB file will represent a single MODEL.
* Non-coordinate lines in input PDBs will be ignored.
* Atom numbers are restarted from 1.
* CONECT lines are yield at the end. CONECT numbers are updated to
the new atom numbers.
* Missing TER and END statements are placed accordingly. Original
TER and END statements are maintained.

Usage:
python pdb_merge.py <pdb file> <pdb file>
Expand All @@ -41,23 +51,34 @@
__email__ = "joaomcteixeira@gmail.com"


# Python 2.7 compatibility
try:
FileNotFoundError
except NameError:
FileNotFoundError = IOError


def check_input(args):
"""Checks whether to read from stdin/file and validates user input/options.
"""

# Defaults
fl = [] # file list

if len(args) >= 1:
if len(args) == 1:
sys.stderr.write('ERROR!! Please provide more than one input file.')
sys.stderr.write(__doc__)
sys.exit(1)

elif len(args) >= 1:
for fn in args:
if not os.path.isfile(fn):
emsg = 'ERROR!! File not found or not readable: \'{}\'\n'
emsg = 'ERROR!! File not found or not readable: \'{}\''
sys.stderr.write(emsg.format(fn))
sys.stderr.write(__doc__)
sys.exit(1)

fh = open(fn, 'r')
fl.append(fh)
fl.append(fn)

else: # Whatever ...
sys.stderr.write(__doc__)
Expand All @@ -66,25 +87,151 @@ def check_input(args):
return fl


def run(flist):
# TER 606 LEU A 75
_fmt_TER = "TER {:>5d} {:3s} {:1s}{:>4s}{:1s}" + " " * 53 + os.linesep


def make_TER(prev_line, fmt_TER=_fmt_TER):
"""Creates a TER statement based on the last ATOM/HETATM line."""
# Add last TER statement
serial = int(prev_line[6:11]) + 1
rname = prev_line[17:20]
chain = prev_line[21]
resid = prev_line[22:26]
icode = prev_line[26]

return fmt_TER.format(serial, rname, chain, resid, icode)


def _get_lines_from_input(pinput, i=0):
"""Decide wheter input is file or lines."""
try:
return open(pinput, 'r')
except (FileNotFoundError, TypeError):
return pinput


def _update_atom_number(line, number, anisou=('ANISOU',)):
if line.startswith(anisou):
number -= 1
return line[:6] + str(number).rjust(5) + line[11:]


def run(input_list):
"""
Iterate over a list of files and yields each line sequentially.
Merges PDB files into a single file.

Follows the criteria:

* The merged PDB file will represent a single MODEL.
* Non-coordinate lines will be ignored.
* Atom numbers are restarted from 1.
* CONECT lines are yield at the end. CONECT numbers are updated
to the new atom numbers.
* TER and END statements are placed accordingly.

Use `pdb_mkensemble` if you with to make an ensemble of multiple
conformation states of the same protein.

Parameters
----------
flist : list of file-like objects
Must handle `.close()` attribute.
input_list : iterator of iterators
`input_list` can be:
* a list of file paths
* a list of file handlers
* a list of lists of lines, the latter representing the
content of the different input PDB files

Yields
------
str (line-by-line)
Lines from the concatenated PDB files.
Lines from the merged PDB files.
"""
records = ('ATOM', 'HETATM', 'ANISOU', 'CONECT', 'MODEL', 'ENDMDL')
atom_anisou = ('ATOM', 'ANISOU')
atom_hetatm = ('ATOM', 'HETATM')
hetatm = ('HETATM',)
conect = ('CONECT',)
prev_chain = None
chain = None
prev_line = ''
conect_lines = []

# CONECT logic taken from pdb_preatom
fmt_CONECT = "CONECT{:>5s}{:>5s}{:>5s}{:>5s}{:>5s}" + " " * 49 + os.linesep
char_ranges = (
slice(6, 11),
slice(11, 16),
slice(16, 21),
slice(21, 26),
slice(26, 31),
)
atom_number = 1

for input_item in input_list:

lines = _get_lines_from_input(input_item)

# store for CONECT statements
# restart at each PDB. Read docs above
serial_equiv = {'': ''}

for line in lines:

if not line.startswith(records):
continue

chain = line[21]

if line.startswith(atom_hetatm):
serial_equiv[line[6:11].strip()] = atom_number

if \
line.startswith(hetatm) \
and prev_line.startswith(atom_anisou):

yield _update_atom_number(make_TER(prev_line), atom_number)
atom_number += 1

elif \
prev_chain is not None \
and chain != prev_chain \
and prev_line.startswith(atom_anisou):

yield _update_atom_number(make_TER(prev_line), atom_number)
atom_number += 1

elif line.startswith(conect):

# 6:11, 11:16, 16:21, 21:26, 26:31
serials = (line[cr].strip() for cr in char_ranges)

# If not found, return default
new_serials = (str(serial_equiv.get(s, s)) for s in serials)
conect_line = fmt_CONECT.format(*new_serials)

conect_lines.append(conect_line)

continue

elif not line.strip(os.linesep).strip():
continue

yield _update_atom_number(line, atom_number)
atom_number += 1

prev_line = line
prev_chain = chain

try:
lines.close()
except AttributeError:
pass

for line in conect_lines:
yield line

for fhandle in flist:
for line in fhandle:
yield line
fhandle.close()
yield 'END' + os.linesep


concatenate_files = run
Expand Down
15 changes: 15 additions & 0 deletions tests/data/dummy_merge_A.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
HEADER CHAIN A FOR pdb_merge
ATOM 1 N ASN A 1 22.066 40.557 0.420 1.00 0.00 N
ATOM 2 CA BASN A 1 20.000 30.000 0.005 0.60 0.00 C
ATOM 3 CA AASN A 1 21.411 39.311 0.054 0.40 0.00 C
ATOM 4 C ASN A 1 22.143 38.629 -1.102 1.00 0.00 C
ATOM 5 O ASN A 1 21.581 38.297 -2.176 1.00 0.00 O
ATOM 6 N ARG A 2 23.408 38.395 -0.829 1.00 0.00 N
ATOM 7 CA ARG A 2 24.384 37.823 -1.757 1.00 0.00 C
ATOM 8 C ARG A 2 24.061 36.421 -2.189 1.00 0.00 C
ATOM 9 O ARG A 2 24.411 36.106 -3.325 1.00 0.00 O
ATOM 10 N GLU A 3 23.456 35.570 -1.408 1.00 0.00 N
ATOM 11 CA GLU A 3 23.064 34.219 -1.780 1.00 0.00 C
ATOM 12 C GLU A 3 21.682 34.241 -2.380 1.00 0.00 C
ATOM 13 O GLU A 3 21.239 33.190 -2.786 1.00 0.00 O
TER 17 HOH A 6
14 changes: 14 additions & 0 deletions tests/data/dummy_merge_B.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
HEADER CHAIN B FOR pdb_merge
ATOM 1 N ARG B 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 2 CA ARG B 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 3 C ARG B 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 4 O ARG B 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 5 N GLU B 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 6 CA GLU B 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 7 C GLU B 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 8 O GLU B 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 9 N ALA B 3 34.725 44.679 -0.087 1.00 0.00 N
ATOM 10 CA ALA B 3 35.081 45.036 1.305 1.00 0.00 C
ATOM 11 C ALA B 3 36.213 46.067 1.258 1.00 0.00 C
ATOM 12 O ALA B 3 36.287 47.046 2.028 1.00 0.00 O
HETATM 13 O HOH B 4 11.052 -12.419 29.700 1.00 73.70
21 changes: 21 additions & 0 deletions tests/data/dummy_merge_C.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
ATOM 1 N ARG C 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 2 CA ARG C 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 3 C ARG C 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 4 O ARG C 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 5 N GLU C 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 6 CA GLU C 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 7 C GLU C 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 8 O GLU C 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 9 N MET C 3 43.010 -16.998 71.911 1.00 54.34
ATOM 10 CA MET C 3 42.850 -16.494 70.506 1.00 52.98
ATOM 11 C MET C 3 41.752 -17.205 69.684 1.00 52.05
ATOM 12 O MET C 3 41.560 -18.418 69.777 1.00 54.00
TER 13 MET C 3
HETATM 14 O HOH C 4 -8.172 -22.003 57.197 1.00 70.53
HETATM 15 O HOH C 5 36.020 -23.583 73.186 1.00 24.82
HETATM 16 O HOH C 6 41.203 -28.852 57.698 1.00 53.16
HETATM 17 O HOH C 7 -4.491 -9.687 56.752 1.00 55.08
HETATM 18 O HOH C 8 24.561 0.532 70.565 1.00 44.77
CONECT 16 17
CONECT 1 2 4 5

49 changes: 49 additions & 0 deletions tests/data/dummy_merged.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
ATOM 1 N ASN A 1 22.066 40.557 0.420 1.00 0.00 N
ATOM 2 CA BASN A 1 20.000 30.000 0.005 0.60 0.00 C
ATOM 3 CA AASN A 1 21.411 39.311 0.054 0.40 0.00 C
ATOM 4 C ASN A 1 22.143 38.629 -1.102 1.00 0.00 C
ATOM 5 O ASN A 1 21.581 38.297 -2.176 1.00 0.00 O
ATOM 6 N ARG A 2 23.408 38.395 -0.829 1.00 0.00 N
ATOM 7 CA ARG A 2 24.384 37.823 -1.757 1.00 0.00 C
ATOM 8 C ARG A 2 24.061 36.421 -2.189 1.00 0.00 C
ATOM 9 O ARG A 2 24.411 36.106 -3.325 1.00 0.00 O
ATOM 10 N GLU A 3 23.456 35.570 -1.408 1.00 0.00 N
ATOM 11 CA GLU A 3 23.064 34.219 -1.780 1.00 0.00 C
ATOM 12 C GLU A 3 21.682 34.241 -2.380 1.00 0.00 C
ATOM 13 O GLU A 3 21.239 33.190 -2.786 1.00 0.00 O
TER 14 GLU A 3
ATOM 15 N ARG B 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 16 CA ARG B 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 17 C ARG B 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 18 O ARG B 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 19 N GLU B 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 20 CA GLU B 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 21 C GLU B 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 22 O GLU B 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 23 N ALA B 3 34.725 44.679 -0.087 1.00 0.00 N
ATOM 24 CA ALA B 3 35.081 45.036 1.305 1.00 0.00 C
ATOM 25 C ALA B 3 36.213 46.067 1.258 1.00 0.00 C
ATOM 26 O ALA B 3 36.287 47.046 2.028 1.00 0.00 O
TER 27 ALA B 3
HETATM 28 O HOH B 4 11.052 -12.419 29.700 1.00 73.70
ATOM 29 N ARG C 1 36.898 42.175 -2.688 1.00 0.00 N
ATOM 30 CA ARG C 1 37.080 43.455 -3.421 1.00 0.00 C
ATOM 31 C ARG C 1 36.102 44.524 -2.998 1.00 0.00 C
ATOM 32 O ARG C 1 36.577 45.677 -2.879 1.00 0.00 O
ATOM 33 N GLU C 2 34.849 44.167 -2.710 1.00 0.00 N
ATOM 34 CA GLU C 2 33.861 45.127 -2.233 1.00 0.00 C
ATOM 35 C GLU C 2 34.180 45.629 -0.820 1.00 0.00 C
ATOM 36 O GLU C 2 33.914 46.775 -0.464 1.00 0.00 O
ATOM 37 N MET C 3 43.010 -16.998 71.911 1.00 54.34
ATOM 38 CA MET C 3 42.850 -16.494 70.506 1.00 52.98
ATOM 39 C MET C 3 41.752 -17.205 69.684 1.00 52.05
ATOM 40 O MET C 3 41.560 -18.418 69.777 1.00 54.00
TER 41 MET C 3
HETATM 42 O HOH C 4 -8.172 -22.003 57.197 1.00 70.53
HETATM 43 O HOH C 5 36.020 -23.583 73.186 1.00 24.82
HETATM 44 O HOH C 6 41.203 -28.852 57.698 1.00 53.16
HETATM 45 O HOH C 7 -4.491 -9.687 56.752 1.00 55.08
HETATM 46 O HOH C 8 24.561 0.532 70.565 1.00 44.77
CONECT 44 45
CONECT 29 30 32 33
END
Loading
Loading