Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 43 additions & 21 deletions featurize_pdbbind_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,42 @@ def parse_args(input_args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--pdbbind-dir', required=1,
help='Directory containing pdbbind data.')
parser.add_argument("--script-dir", required=1,
help="Directory to generate qsub scripts.")
parser.add_argument("--tmp-dir", required=1, type=str,
help="Directory to save intermediate files.")
parser.add_argument("--script-template", default="script%d.pbs",
help="Template for script name. Must have one "
"string-substitutable entry for job number.")
parser.add_argument("--num-jobs", required=1, type=int,
help='Number of PBS jobs to launch.')
parser.add_argument('--pickle-dir', required=1,
parser.add_argument('--pickle-dir', required=1, type=str,
help='Directory to output pickled featured vectors.')
parser.add_argument('--queue-system', required=1,
help='Choose slurm or pbs')
parser.add_argument('--featurization-type', required=1,
help='Choose fingerprint or 3d_grid')
parser.add_argument('--box-width', required=0,
help='Input box width in Angstroms, default=16')
parser.add_argument('--voxel-width', required=0,
parser.add_argument('--box-width', required=0, default=str(16.0),
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should say type=float, default=16.0 instead.

help='Input box width in Angstroms, default=16.0')
parser.add_argument('--voxel-width', required=0, default=str(0.5),
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as above.

help='Input voxel width in Angstroms, default=0.5')
parser.add_argument('--tmp-dir', required=0,
help='Temporary directory for saving files')
parser.add_argument('--nb-rotations', required=0, default=str(0),
help='Number of times to rotate grid, integer')
parser.add_argument('--nb-reflections', required=0, default=str(0),
help='Number of times to reflect grid, integer')
parser.add_argument('--save-intermediates', action='store_true',
help="Save intermediate files.")
parser.add_argument('--grid-featurization-type', required=0, default="ecfp",
type=str, help="Which type of 3d_grid? options: ecfp, splif")
parser.add_argument('--fingerprint-bit-size', required=0, default=str(10),
help="Choose n in 2^n to define number of bits for ECFP fingerprint array")
parser.add_argument('--fingerprint-degree', required=0, default=str(2),
help="Choose degree to which fingerprints are computed, i.e. ECFP2 vs ECFP4")
parser.add_argument('--ligand-only', action='store_true',
help="Featurize only the ligands?")
return parser.parse_args(input_args)

def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
pickle_dir, queue_system, featurization_type, box_width, voxel_width):
def featurize_pdbbind(pdbbind_dir, tmp_dir, script_template, num_jobs,
pickle_dir, queue_system, featurization_type, box_width, voxel_width, nb_rotations, nb_reflections,
save_intermediates, grid_featurization_type, fingerprint_bit_size, fingerprint_degree, ligand_only):
"""Featurize all entries in pdbbind_dir and write features to pickle_out

pdbbind_dir should be a dir, with K subdirs, one for each protein-ligand
Expand All @@ -46,6 +59,9 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
pickle_out: string
Path to write pickle output.
"""
if not os.path.exists(tmp_dir): os.makedirs(tmp_dir)
if not os.path.exists(pickle_dir): os.makedirs(pickle_dir)

assert os.path.isdir(pdbbind_dir)

# Extract the subdirectories in pdbbind_dir
Expand All @@ -62,18 +78,23 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
print job_dirs

# TODO(rbharath): This is horrible. Clean this script up...
pickle_out = os.path.join(pickle_dir, "features%d.p" % job)
pickle_out = os.path.join(pickle_dir, "features%d.pkl.gz" % job)
#TODO(enf): the path needs to be user-independent
command = " ".join(["python", "~/software/pbs_utils/featurize_pdbbind_cluster_job.py",

command = " ".join(["python", "/scratch/users/enf/deep-docking/cluster_utils/featurize_pdbbind_cluster_job.py",
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make this a flag? We should fix this now since I'm sure this will cause us headaches.

"--pdb-directories"] + job_dirs + ["--featurization-type", featurization_type]
+ ["--box-width", box_width] + ["--voxel-width", voxel_width] + ["--tmp-dir", script_dir] + ["--pickle-out", pickle_out, "\n"])

+ ["--box-width", box_width] + ["--voxel-width", voxel_width] + ["--nb-rotations", nb_rotations]
+ ["--nb-reflections", nb_reflections] + (["--save-intermediates"] if save_intermediates else []) + ["--grid-featurization-type", grid_featurization_type]
+ ["--fingerprint-bit-size", fingerprint_bit_size] + ["--fingerprint-degree", fingerprint_degree] + (["--ligand-only"] if ligand_only else [])
+ ["--tmp-dir", tmp_dir] + ["--pickle-out", pickle_out, "\n"])

print "command: "
print command
script_loc = os.path.join(script_dir, script_template % job)

script_loc = os.path.join(tmp_dir, script_template % job)
print "script_loc: "
print script_loc


if queue_system == "pbs":
print "Writing pbs script!"
Expand All @@ -89,13 +110,13 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
f.write("#!/bin/bash\n")
f.write("#SBATCH --output=out%d.out\n" %i)
f.write("#SBATCH --error=out%d.err\n" %i)
f.write("#SBATCH -p owners\n")
f.write("#SBATCH -p normal\n")
f.write("#SBATCH --mail-type=ALL")
f.write("#SBATCH --mail=enf@stanford.edu")
f.write("#SBATCH --job-name=deep%d\n" %i)
f.write("#SBATCH -n 1\n")
f.write("#SBATCH --time=4:00:00\n")
f.write("#SBATCH --qos=normal\n")
f.write("#SBATCH --time=12:00:00\n")
f.write("##SBATCH --qos=normal\n")
f.write(command)
slurm_command = ["sbatch", script_loc]
print slurm_command
Expand All @@ -107,5 +128,6 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,

if __name__ == '__main__':
args = parse_args()
featurize_pdbbind(args.pdbbind_dir, args.script_dir,
args.script_template, args.num_jobs, args.pickle_dir, args.queue_system, args.featurization_type, args.box_width, args.voxel_width)
featurize_pdbbind(args.pdbbind_dir, args.tmp_dir, args.script_template, args.num_jobs,
args.pickle_dir, args.queue_system, args.featurization_type, args.box_width, args.voxel_width, args.nb_rotations, args.nb_reflections,
args.save_intermediates, args.grid_featurization_type, args.fingerprint_bit_size, args.fingerprint_degree, args.ligand_only)
81 changes: 59 additions & 22 deletions featurize_pdbbind_cluster_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
import cPickle as pickle
import mdtraj as md
from vs_utils.features.nnscore import Binana
from vs_utils.utils.nnscore_pdb import PDB
from vs_utils.utils.PDBTransformer import PDBTransformer
from vs_utils.utils.grid_factory import GridGenerator
from vs_utils.utils.grid_featurizer import grid_featurizer
from rdkit import Chem
import gzip

def parse_args(input_args=None):
"""Parse command-line arguments."""
Expand All @@ -25,8 +24,22 @@ def parse_args(input_args=None):
help='Input box width in Angstroms, default=16')
parser.add_argument('--voxel-width', required=1,
help='Input voxel width in Angstroms, default=0.5')
parser.add_argument('--nb-rotations', required=1,
help='Number of times to rotate grid, integer')
parser.add_argument('--nb-reflections', required=1,
help='Number of times to reflect grid, integer')
parser.add_argument('--tmp-dir', required=1,
help='Directory for saving all intermediate files')
parser.add_argument('--save-intermediates', action='store_true',
help="Save intermediate files.")
parser.add_argument('--grid-featurization-type', required=0, default=None,
type=str, help="Which type of 3d_grid? options: ecfp, splif")
parser.add_argument('--fingerprint-bit-size', required=0, default=None, type=int,
help="Choose n in 2^n to define number of bits for ECFP fingerprint array")
parser.add_argument('--fingerprint-degree', required=0, default=None, type=int,
help="Choose degree to which fingerprints are computed, i.e. ECFP2 vs ECFP4")
parser.add_argument('--ligand-only', action='store_true',
help="Featurize only the ligands?")
return parser.parse_args(input_args)

def featurize_fingerprint(pdb_directories, pickle_out):
Expand Down Expand Up @@ -93,10 +106,11 @@ def featurize_fingerprint(pdb_directories, pickle_out):
# Write the computed quantities
feature_vectors[pdb_dir] = (features, smiles, seq)
print "About to write pickle to " + pickle_out
with open(pickle_out, "wb") as f:
with gzip.open(pickle_out, "wb") as f:
pickle.dump(feature_vectors, f)

def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_dir):
def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, nb_rotations, nb_reflections,
feature_types, ecfp_degree, ecfp_power, save_intermediates, ligand_only, tmp_dir):
feature_vectors = {}
for count, pdb_dir in enumerate(pdb_directories):

Expand All @@ -116,33 +130,51 @@ def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_d
protein_pdb = f
elif re.search("_protein_hyd.pdbqt$", f):
protein_pdbqt = f
elif re.search("_ligand.mol2$", f):
ligand_mol2 = f

print "Extracted Input Files:"
print (ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt)
print (ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt, ligand_mol2)
if (not ligand_pdb or not ligand_pdbqt or not protein_pdb or not
protein_pdbqt):
protein_pdbqt or not ligand_mol2):
raise ValueError("Required files not present for %s" % pdb_dir)

ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt)
protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt)
ligand_mol2_path = os.path.join(pdb_dir, ligand_mol2)


tmp_dir = os.path.join(tmp_dir, protein_pdb)
if not os.path.exists(tmp_dir): os.makedirs(tmp_dir)
system_pdb = os.path.join(tmp_dir, "system.pdb")
box_pdb = os.path.join(tmp_dir, "box.pdb")
box_pickle = os.path.join(tmp_dir, "box.pickle")
grid_pickle = os.path.join(tmp_dir, "grid.pickle")

p = PDBTransformer()
p.transform(protein_pdb_path, protein_pdbqt_path, ligand_pdb_path, ligand_pdbqt_path,
system_pdb, box_pdb, box_pickle, box_x = box_width, box_y = box_width, box_z = box_width)

g = GridGenerator()
grid = g.transform(box_pickle, box_width, box_width, box_width, voxel_width, grid_pickle, num_features=3)

print "About to compute sequence."
g = grid_featurizer()


kwargs = {}
if box_width is not None: kwargs["box_width"] = box_width
if voxel_width is not None: kwargs["voxel_width"] = voxel_width
if nb_rotations is not None: kwargs["nb_rotations"] = nb_rotations
if nb_reflections is not None: kwargs["nb_reflections"] = nb_reflections
if feature_types is not None: kwargs["feature_types"] = feature_types
if ecfp_degree is not None: kwargs["ecfp_degree"] = ecfp_degree
if ecfp_power is not None: kwargs["ecfp_power"] = ecfp_power
if save_intermediates is not None: kwargs["save_intermediates"] = save_intermediates
if ligand_only is not None: kwargs["ligand_only"] = ligand_only
print(kwargs)

features = g.transform(protein_pdb_path, ligand_mol2_path,
tmp_dir, **kwargs)

pdb_name = str(pdb_dir).split("/")[len(str(pdb_dir).split("/"))-1]
#for box_id, box in boxes.iteritems():
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove this commented out code?

#g = GridGenerator()
#grid_pkl_dir = "%s/%s_%d_%d_grid.pkl" %(tmp_dir, pdb_name, box_id[0], box_id[1])
#grid = g.transform(box, box_width, box_width, box_width, voxel_width, grid_pkl_dir, num_features=3)
#print "About to compute sequence."
protein = md.load(protein_pdb_path)
seq = [r.name for r in protein.top.residues]

Expand All @@ -152,22 +184,27 @@ def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_d
if ligand_mol is None:
continue
smiles = Chem.MolToSmiles(ligand_mol)
for system_id, features in features.iteritems():
print(system_id)
feature_vectors["%s_rotation%d_reflection%d" %(pdb_name, system_id[0], system_id[1])] = (features, smiles, seq)

feature_vectors[pdb_dir] = (grid, smiles, seq)
print "About to write pickle to " + pickle_out
with open(pickle_out, "wb") as f:
with gzip.open(pickle_out, "wb") as f:
pickle.dump(feature_vectors, f)


def featurize_job(pdb_directories, pickle_out, featurization_type, box_width, voxel_width, tmp_dir):
def featurize_job(pdb_directories, pickle_out, featurization_type, box_width, voxel_width, tmp_dir, nb_rotations, nb_reflections,
grid_featurization_type, fingerprint_degree, fingerprint_bit_size, save_intermediates, ligand_only):
if featurization_type=="fingerprint":
featurize_fingerprint(pdb_directories, pickle_out)
elif featurization_type=="3d_grid":
featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_dir)
featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, nb_rotations, nb_reflections,
grid_featurization_type, fingerprint_degree, fingerprint_bit_size, save_intermediates, ligand_only, tmp_dir)
else:
print("Didn't understand featurization type. options are fingerprint or 3d_grid")


if __name__ == '__main__':
args = parse_args()
featurize_job(args.pdb_directories, args.pickle_out, args.featurization_type, args.box_width, args.voxel_width, args.tmp_dir)
featurize_job(args.pdb_directories, args.pickle_out, args.featurization_type, args.box_width, args.voxel_width, args.tmp_dir, args.nb_rotations, args.nb_reflections,
args.grid_featurization_type, args.fingerprint_degree, args.fingerprint_bit_size, args.save_intermediates, args.ligand_only)
2 changes: 2 additions & 0 deletions featurize_whole_pdbbind.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1

2 changes: 2 additions & 0 deletions featurize_whole_pdbbind_vanilla.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_0rot_0ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0

56 changes: 42 additions & 14 deletions generate_pdbbind_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import cPickle as pickle
import gzip
import pandas as pd
import multiprocessing as mp

def parse_args(input_args=None):
"""Parse command-line arguments."""
Expand All @@ -22,6 +23,9 @@ def parse_args(input_args=None):
parser.add_argument("--output_type", default="csv",
choices=["csv", "pkl.gz"],
help="Type of output file.")
parser.add_argument("--parallel", default=False,
type=bool,
help="Do processing in parallel")
return parser.parse_args(input_args)

def extract_labels(pdbbind_label_file):
Expand Down Expand Up @@ -58,25 +62,49 @@ def write_pkl_gz(feature_dict, labels, out):
with gzip.open(out, "wb") as f:
outputs = []
for key in feature_dict:
label = labels[key]
features = feature_dict[key]
# TODO(rbharath): Once smiles/sequences are added into 3D grid data,
# remove this line
smiles, sequence = None, None
labels_key = key.split("_")[0]
label = labels[labels_key]
features, smiles, sequence = feature_dict[key]
outputs.append({"smiles": smiles, "sequence": sequence, "label": label, "features": features})
df = pd.DataFrame(outputs)
pickle.dump(df, f)

def generate_dataset(pdbbind_label_file, feature_files, out, output_type):
def read_feature_file(feature_file, feature_dict = {}):
print("Currently reading %s" %feature_file)
with gzip.open(feature_file, "rb") as features:
contents = pickle.load(features)
for index, (key, value) in enumerate(contents.iteritems()):
name = os.path.basename(key)
feature_dict[name] = value
return(feature_dict)

'''
http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
'''
def merge_dicts(*dict_args):
'''
Given any number of dicts, shallow copy and merge into a new dict,
precedence goes to key value pairs in latter dicts.
'''
result = {}
for dictionary in dict_args:
result.update(dictionary)
return result

def generate_dataset(pdbbind_label_file, feature_files, out, output_type, parallel):
"""Generate dataset file."""
labels = extract_labels(pdbbind_label_file)
feature_dict = {}
for feature_file in feature_files:
with open(feature_file, "rb") as features:
contents = pickle.load(features)
for key, value in contents.iteritems():
name = os.path.basename(key)
feature_dict[name] = value
if not parallel:
feature_dict = {}
for feature_file in feature_files:
feature_dict = read_feature_file(feature_file, feature_dict)
else:
pool = mp.Pool(mp.cpu_count())
feature_dicts = pool.map(read_feature_file, feature_files)
feature_dict = merge_dicts(*feature_dicts)
pool.terminate()
print(len(feature_dict))
print(len(feature_dict.keys()))
if output_type == "csv":
write_csv(feature_dict, labels, out)
elif output_type == "pkl.gz":
Expand All @@ -85,4 +113,4 @@ def generate_dataset(pdbbind_label_file, feature_files, out, output_type):

if __name__ == '__main__':
args = parse_args()
generate_dataset(args.pdbbind_label_file, args.feature_files, args.out, args.output_type)
generate_dataset(args.pdbbind_label_file, args.feature_files, args.out, args.output_type, args.parallel)
1 change: 1 addition & 0 deletions generate_pdbbind_sample.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python generate_pdbbind_data.py --pdbbind-label-file /scratch/users/enf/deep-docking/datasets/pdbbind/INDEX_general_PL_data.2014 --feature-files /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1/*.pkl.gz --out /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1.pkl.gz --output_type pkl.gz --parallel parallel
3 changes: 3 additions & 0 deletions sample_input.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref_t4 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1
python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0 --ligand-only