-
Notifications
You must be signed in to change notification settings - Fork 2
Added new featurization capabilities #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,29 +11,42 @@ def parse_args(input_args=None): | |
| parser = argparse.ArgumentParser() | ||
| parser.add_argument('--pdbbind-dir', required=1, | ||
| help='Directory containing pdbbind data.') | ||
| parser.add_argument("--script-dir", required=1, | ||
| help="Directory to generate qsub scripts.") | ||
| parser.add_argument("--tmp-dir", required=1, type=str, | ||
| help="Directory to save intermediate files.") | ||
| parser.add_argument("--script-template", default="script%d.pbs", | ||
| help="Template for script name. Must have one " | ||
| "string-substitutable entry for job number.") | ||
| parser.add_argument("--num-jobs", required=1, type=int, | ||
| help='Number of PBS jobs to launch.') | ||
| parser.add_argument('--pickle-dir', required=1, | ||
| parser.add_argument('--pickle-dir', required=1, type=str, | ||
| help='Directory to output pickled featured vectors.') | ||
| parser.add_argument('--queue-system', required=1, | ||
| help='Choose slurm or pbs') | ||
| parser.add_argument('--featurization-type', required=1, | ||
| help='Choose fingerprint or 3d_grid') | ||
| parser.add_argument('--box-width', required=0, | ||
| help='Input box width in Angstroms, default=16') | ||
| parser.add_argument('--voxel-width', required=0, | ||
| parser.add_argument('--box-width', required=0, default=str(16.0), | ||
| help='Input box width in Angstroms, default=16.0') | ||
| parser.add_argument('--voxel-width', required=0, default=str(0.5), | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
| help='Input voxel width in Angstroms, default=0.5') | ||
| parser.add_argument('--tmp-dir', required=0, | ||
| help='Temporary directory for saving files') | ||
| parser.add_argument('--nb-rotations', required=0, default=str(0), | ||
| help='Number of times to rotate grid, integer') | ||
| parser.add_argument('--nb-reflections', required=0, default=str(0), | ||
| help='Number of times to reflect grid, integer') | ||
| parser.add_argument('--save-intermediates', action='store_true', | ||
| help="Save intermediate files.") | ||
| parser.add_argument('--grid-featurization-type', required=0, default="ecfp", | ||
| type=str, help="Which type of 3d_grid? options: ecfp, splif") | ||
| parser.add_argument('--fingerprint-bit-size', required=0, default=str(10), | ||
| help="Choose n in 2^n to define number of bits for ECFP fingerprint array") | ||
| parser.add_argument('--fingerprint-degree', required=0, default=str(2), | ||
| help="Choose degree to which fingerprints are computed, i.e. ECFP2 vs ECFP4") | ||
| parser.add_argument('--ligand-only', action='store_true', | ||
| help="Featurize only the ligands?") | ||
| return parser.parse_args(input_args) | ||
|
|
||
| def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs, | ||
| pickle_dir, queue_system, featurization_type, box_width, voxel_width): | ||
| def featurize_pdbbind(pdbbind_dir, tmp_dir, script_template, num_jobs, | ||
| pickle_dir, queue_system, featurization_type, box_width, voxel_width, nb_rotations, nb_reflections, | ||
| save_intermediates, grid_featurization_type, fingerprint_bit_size, fingerprint_degree, ligand_only): | ||
| """Featurize all entries in pdbbind_dir and write features to pickle_out | ||
|
|
||
| pdbbind_dir should be a dir, with K subdirs, one for each protein-ligand | ||
|
|
@@ -46,6 +59,9 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs, | |
| pickle_out: string | ||
| Path to write pickle output. | ||
| """ | ||
| if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) | ||
| if not os.path.exists(pickle_dir): os.makedirs(pickle_dir) | ||
|
|
||
| assert os.path.isdir(pdbbind_dir) | ||
|
|
||
| # Extract the subdirectories in pdbbind_dir | ||
|
|
@@ -62,18 +78,23 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs, | |
| print job_dirs | ||
|
|
||
| # TODO(rbharath): This is horrible. Clean this script up... | ||
| pickle_out = os.path.join(pickle_dir, "features%d.p" % job) | ||
| pickle_out = os.path.join(pickle_dir, "features%d.pkl.gz" % job) | ||
| #TODO(enf): the path needs to be user-independent | ||
| command = " ".join(["python", "~/software/pbs_utils/featurize_pdbbind_cluster_job.py", | ||
|
|
||
| command = " ".join(["python", "/scratch/users/enf/deep-docking/cluster_utils/featurize_pdbbind_cluster_job.py", | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you make this a flag? We should fix this now since I'm sure this will cause us headaches. |
||
| "--pdb-directories"] + job_dirs + ["--featurization-type", featurization_type] | ||
| + ["--box-width", box_width] + ["--voxel-width", voxel_width] + ["--tmp-dir", script_dir] + ["--pickle-out", pickle_out, "\n"]) | ||
|
|
||
| + ["--box-width", box_width] + ["--voxel-width", voxel_width] + ["--nb-rotations", nb_rotations] | ||
| + ["--nb-reflections", nb_reflections] + (["--save-intermediates"] if save_intermediates else []) + ["--grid-featurization-type", grid_featurization_type] | ||
| + ["--fingerprint-bit-size", fingerprint_bit_size] + ["--fingerprint-degree", fingerprint_degree] + (["--ligand-only"] if ligand_only else []) | ||
| + ["--tmp-dir", tmp_dir] + ["--pickle-out", pickle_out, "\n"]) | ||
|
|
||
| print "command: " | ||
| print command | ||
| script_loc = os.path.join(script_dir, script_template % job) | ||
|
|
||
| script_loc = os.path.join(tmp_dir, script_template % job) | ||
| print "script_loc: " | ||
| print script_loc | ||
|
|
||
|
|
||
| if queue_system == "pbs": | ||
| print "Writing pbs script!" | ||
|
|
@@ -89,13 +110,13 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs, | |
| f.write("#!/bin/bash\n") | ||
| f.write("#SBATCH --output=out%d.out\n" %i) | ||
| f.write("#SBATCH --error=out%d.err\n" %i) | ||
| f.write("#SBATCH -p owners\n") | ||
| f.write("#SBATCH -p normal\n") | ||
| f.write("#SBATCH --mail-type=ALL") | ||
| f.write("#SBATCH --mail=enf@stanford.edu") | ||
| f.write("#SBATCH --job-name=deep%d\n" %i) | ||
| f.write("#SBATCH -n 1\n") | ||
| f.write("#SBATCH --time=4:00:00\n") | ||
| f.write("#SBATCH --qos=normal\n") | ||
| f.write("#SBATCH --time=12:00:00\n") | ||
| f.write("##SBATCH --qos=normal\n") | ||
| f.write(command) | ||
| slurm_command = ["sbatch", script_loc] | ||
| print slurm_command | ||
|
|
@@ -107,5 +128,6 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs, | |
|
|
||
| if __name__ == '__main__': | ||
| args = parse_args() | ||
| featurize_pdbbind(args.pdbbind_dir, args.script_dir, | ||
| args.script_template, args.num_jobs, args.pickle_dir, args.queue_system, args.featurization_type, args.box_width, args.voxel_width) | ||
| featurize_pdbbind(args.pdbbind_dir, args.tmp_dir, args.script_template, args.num_jobs, | ||
| args.pickle_dir, args.queue_system, args.featurization_type, args.box_width, args.voxel_width, args.nb_rotations, args.nb_reflections, | ||
| args.save_intermediates, args.grid_featurization_type, args.fingerprint_bit_size, args.fingerprint_degree, args.ligand_only) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,10 +7,9 @@ | |
| import cPickle as pickle | ||
| import mdtraj as md | ||
| from vs_utils.features.nnscore import Binana | ||
| from vs_utils.utils.nnscore_pdb import PDB | ||
| from vs_utils.utils.PDBTransformer import PDBTransformer | ||
| from vs_utils.utils.grid_factory import GridGenerator | ||
| from vs_utils.utils.grid_featurizer import grid_featurizer | ||
| from rdkit import Chem | ||
| import gzip | ||
|
|
||
| def parse_args(input_args=None): | ||
| """Parse command-line arguments.""" | ||
|
|
@@ -25,8 +24,22 @@ def parse_args(input_args=None): | |
| help='Input box width in Angstroms, default=16') | ||
| parser.add_argument('--voxel-width', required=1, | ||
| help='Input voxel width in Angstroms, default=0.5') | ||
| parser.add_argument('--nb-rotations', required=1, | ||
| help='Number of times to rotate grid, integer') | ||
| parser.add_argument('--nb-reflections', required=1, | ||
| help='Number of times to reflect grid, integer') | ||
| parser.add_argument('--tmp-dir', required=1, | ||
| help='Directory for saving all intermediate files') | ||
| parser.add_argument('--save-intermediates', action='store_true', | ||
| help="Save intermediate files.") | ||
| parser.add_argument('--grid-featurization-type', required=0, default=None, | ||
| type=str, help="Which type of 3d_grid? options: ecfp, splif") | ||
| parser.add_argument('--fingerprint-bit-size', required=0, default=None, type=int, | ||
| help="Choose n in 2^n to define number of bits for ECFP fingerprint array") | ||
| parser.add_argument('--fingerprint-degree', required=0, default=None, type=int, | ||
| help="Choose degree to which fingerprints are computed, i.e. ECFP2 vs ECFP4") | ||
| parser.add_argument('--ligand-only', action='store_true', | ||
| help="Featurize only the ligands?") | ||
| return parser.parse_args(input_args) | ||
|
|
||
| def featurize_fingerprint(pdb_directories, pickle_out): | ||
|
|
@@ -93,10 +106,11 @@ def featurize_fingerprint(pdb_directories, pickle_out): | |
| # Write the computed quantities | ||
| feature_vectors[pdb_dir] = (features, smiles, seq) | ||
| print "About to write pickle to " + pickle_out | ||
| with open(pickle_out, "wb") as f: | ||
| with gzip.open(pickle_out, "wb") as f: | ||
| pickle.dump(feature_vectors, f) | ||
|
|
||
| def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_dir): | ||
| def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, nb_rotations, nb_reflections, | ||
| feature_types, ecfp_degree, ecfp_power, save_intermediates, ligand_only, tmp_dir): | ||
| feature_vectors = {} | ||
| for count, pdb_dir in enumerate(pdb_directories): | ||
|
|
||
|
|
@@ -116,33 +130,51 @@ def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_d | |
| protein_pdb = f | ||
| elif re.search("_protein_hyd.pdbqt$", f): | ||
| protein_pdbqt = f | ||
| elif re.search("_ligand.mol2$", f): | ||
| ligand_mol2 = f | ||
|
|
||
| print "Extracted Input Files:" | ||
| print (ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt) | ||
| print (ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt, ligand_mol2) | ||
| if (not ligand_pdb or not ligand_pdbqt or not protein_pdb or not | ||
| protein_pdbqt): | ||
| protein_pdbqt or not ligand_mol2): | ||
| raise ValueError("Required files not present for %s" % pdb_dir) | ||
|
|
||
| ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb) | ||
| ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt) | ||
| protein_pdb_path = os.path.join(pdb_dir, protein_pdb) | ||
| protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt) | ||
| ligand_mol2_path = os.path.join(pdb_dir, ligand_mol2) | ||
|
|
||
|
|
||
| tmp_dir = os.path.join(tmp_dir, protein_pdb) | ||
| if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) | ||
| system_pdb = os.path.join(tmp_dir, "system.pdb") | ||
| box_pdb = os.path.join(tmp_dir, "box.pdb") | ||
| box_pickle = os.path.join(tmp_dir, "box.pickle") | ||
| grid_pickle = os.path.join(tmp_dir, "grid.pickle") | ||
|
|
||
| p = PDBTransformer() | ||
| p.transform(protein_pdb_path, protein_pdbqt_path, ligand_pdb_path, ligand_pdbqt_path, | ||
| system_pdb, box_pdb, box_pickle, box_x = box_width, box_y = box_width, box_z = box_width) | ||
|
|
||
| g = GridGenerator() | ||
| grid = g.transform(box_pickle, box_width, box_width, box_width, voxel_width, grid_pickle, num_features=3) | ||
|
|
||
| print "About to compute sequence." | ||
| g = grid_featurizer() | ||
|
|
||
|
|
||
| kwargs = {} | ||
| if box_width is not None: kwargs["box_width"] = box_width | ||
| if voxel_width is not None: kwargs["voxel_width"] = voxel_width | ||
| if nb_rotations is not None: kwargs["nb_rotations"] = nb_rotations | ||
| if nb_reflections is not None: kwargs["nb_reflections"] = nb_reflections | ||
| if feature_types is not None: kwargs["feature_types"] = feature_types | ||
| if ecfp_degree is not None: kwargs["ecfp_degree"] = ecfp_degree | ||
| if ecfp_power is not None: kwargs["ecfp_power"] = ecfp_power | ||
| if save_intermediates is not None: kwargs["save_intermediates"] = save_intermediates | ||
| if ligand_only is not None: kwargs["ligand_only"] = ligand_only | ||
| print(kwargs) | ||
|
|
||
| features = g.transform(protein_pdb_path, ligand_mol2_path, | ||
| tmp_dir, **kwargs) | ||
|
|
||
| pdb_name = str(pdb_dir).split("/")[len(str(pdb_dir).split("/"))-1] | ||
| #for box_id, box in boxes.iteritems(): | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you remove this commented out code? |
||
| #g = GridGenerator() | ||
| #grid_pkl_dir = "%s/%s_%d_%d_grid.pkl" %(tmp_dir, pdb_name, box_id[0], box_id[1]) | ||
| #grid = g.transform(box, box_width, box_width, box_width, voxel_width, grid_pkl_dir, num_features=3) | ||
| #print "About to compute sequence." | ||
| protein = md.load(protein_pdb_path) | ||
| seq = [r.name for r in protein.top.residues] | ||
|
|
||
|
|
@@ -152,22 +184,27 @@ def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_d | |
| if ligand_mol is None: | ||
| continue | ||
| smiles = Chem.MolToSmiles(ligand_mol) | ||
| for system_id, features in features.iteritems(): | ||
| print(system_id) | ||
| feature_vectors["%s_rotation%d_reflection%d" %(pdb_name, system_id[0], system_id[1])] = (features, smiles, seq) | ||
|
|
||
| feature_vectors[pdb_dir] = (grid, smiles, seq) | ||
| print "About to write pickle to " + pickle_out | ||
| with open(pickle_out, "wb") as f: | ||
| with gzip.open(pickle_out, "wb") as f: | ||
| pickle.dump(feature_vectors, f) | ||
|
|
||
|
|
||
| def featurize_job(pdb_directories, pickle_out, featurization_type, box_width, voxel_width, tmp_dir): | ||
| def featurize_job(pdb_directories, pickle_out, featurization_type, box_width, voxel_width, tmp_dir, nb_rotations, nb_reflections, | ||
| grid_featurization_type, fingerprint_degree, fingerprint_bit_size, save_intermediates, ligand_only): | ||
| if featurization_type=="fingerprint": | ||
| featurize_fingerprint(pdb_directories, pickle_out) | ||
| elif featurization_type=="3d_grid": | ||
| featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_dir) | ||
| featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, nb_rotations, nb_reflections, | ||
| grid_featurization_type, fingerprint_degree, fingerprint_bit_size, save_intermediates, ligand_only, tmp_dir) | ||
| else: | ||
| print("Didn't understand featurization type. options are fingerprint or 3d_grid") | ||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| args = parse_args() | ||
| featurize_job(args.pdb_directories, args.pickle_out, args.featurization_type, args.box_width, args.voxel_width, args.tmp_dir) | ||
| featurize_job(args.pdb_directories, args.pickle_out, args.featurization_type, args.box_width, args.voxel_width, args.tmp_dir, args.nb_rotations, args.nb_reflections, | ||
| args.grid_featurization_type, args.fingerprint_degree, args.fingerprint_bit_size, args.save_intermediates, args.ligand_only) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1 | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_0rot_0ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0 | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| python generate_pdbbind_data.py --pdbbind-label-file /scratch/users/enf/deep-docking/datasets/pdbbind/INDEX_general_PL_data.2014 --feature-files /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1/*.pkl.gz --out /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1.pkl.gz --output_type pkl.gz --parallel parallel |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| #python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref_t4 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1 | ||
| python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0 --ligand-only | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should say
type=float, default=16.0instead.