rbharath · evanfeinberg · Nov 3, 2015 · Nov 3, 2015 · Nov 13, 2015 · rbharath
diff --git a/featurize_pdbbind_cluster.py b/featurize_pdbbind_cluster.py
@@ -11,29 +11,42 @@ def parse_args(input_args=None):
   parser = argparse.ArgumentParser()
   parser.add_argument('--pdbbind-dir', required=1,
                       help='Directory containing pdbbind data.')
-  parser.add_argument("--script-dir", required=1,
-                      help="Directory to generate qsub scripts.")
+  parser.add_argument("--tmp-dir", required=1, type=str,
+                      help="Directory to save intermediate files.")
   parser.add_argument("--script-template", default="script%d.pbs",
                       help="Template for script name. Must have one "
                       "string-substitutable entry for job number.")
   parser.add_argument("--num-jobs", required=1, type=int,
                       help='Number of PBS jobs to launch.')
-  parser.add_argument('--pickle-dir', required=1,
+  parser.add_argument('--pickle-dir', required=1, type=str,
                       help='Directory to output pickled featured vectors.')
   parser.add_argument('--queue-system', required=1,
                       help='Choose slurm or pbs')
   parser.add_argument('--featurization-type', required=1,
                       help='Choose fingerprint or 3d_grid')
-  parser.add_argument('--box-width', required=0,
-                      help='Input box width in Angstroms, default=16')
-  parser.add_argument('--voxel-width', required=0,
+  parser.add_argument('--box-width', required=0, default=str(16.0),
+                      help='Input box width in Angstroms, default=16.0')
+  parser.add_argument('--voxel-width', required=0, default=str(0.5),
                       help='Input voxel width in Angstroms, default=0.5')
-  parser.add_argument('--tmp-dir', required=0,
-                      help='Temporary directory for saving files')
+  parser.add_argument('--nb-rotations', required=0, default=str(0),
+                      help='Number of times to rotate grid, integer')
+  parser.add_argument('--nb-reflections', required=0, default=str(0),
+                      help='Number of times to reflect grid, integer')
+  parser.add_argument('--save-intermediates', action='store_true',
+                      help="Save intermediate files.")
+  parser.add_argument('--grid-featurization-type', required=0, default="ecfp",
+                      type=str, help="Which type of 3d_grid? options: ecfp, splif")
+  parser.add_argument('--fingerprint-bit-size', required=0, default=str(10),
+                      help="Choose n in 2^n to define number of bits for ECFP fingerprint array")
+  parser.add_argument('--fingerprint-degree', required=0, default=str(2),
+                      help="Choose degree to which fingerprints are computed, i.e. ECFP2 vs ECFP4")
+  parser.add_argument('--ligand-only', action='store_true',
+                      help="Featurize only the ligands?")
   return parser.parse_args(input_args)
 
-def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
-    pickle_dir, queue_system, featurization_type, box_width, voxel_width):
+def featurize_pdbbind(pdbbind_dir, tmp_dir, script_template, num_jobs,
+    pickle_dir, queue_system, featurization_type, box_width, voxel_width, nb_rotations, nb_reflections,
+    save_intermediates, grid_featurization_type, fingerprint_bit_size, fingerprint_degree, ligand_only):
   """Featurize all entries in pdbbind_dir and write features to pickle_out
 
   pdbbind_dir should be a dir, with K subdirs, one for each protein-ligand
@@ -46,6 +59,9 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
   pickle_out: string
     Path to write pickle output.
   """
+  if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) 
+  if not os.path.exists(pickle_dir): os.makedirs(pickle_dir)
+
   assert os.path.isdir(pdbbind_dir)
 
   # Extract the subdirectories in pdbbind_dir
@@ -62,18 +78,23 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
     print job_dirs
 
     # TODO(rbharath): This is horrible. Clean this script up...
-    pickle_out = os.path.join(pickle_dir, "features%d.p" % job)
+    pickle_out = os.path.join(pickle_dir, "features%d.pkl.gz" % job)
     #TODO(enf): the path needs to be user-independent 
-    command = " ".join(["python", "~/software/pbs_utils/featurize_pdbbind_cluster_job.py",
+
+    command = " ".join(["python", "/scratch/users/enf/deep-docking/cluster_utils/featurize_pdbbind_cluster_job.py",
         "--pdb-directories"] + job_dirs + ["--featurization-type", featurization_type] 
-        + ["--box-width", box_width] + ["--voxel-width", voxel_width] + ["--tmp-dir", script_dir] + ["--pickle-out", pickle_out, "\n"])
-
+        + ["--box-width", box_width] + ["--voxel-width", voxel_width] + ["--nb-rotations", nb_rotations]
+        + ["--nb-reflections", nb_reflections] + (["--save-intermediates"] if save_intermediates else []) + ["--grid-featurization-type", grid_featurization_type]
+        + ["--fingerprint-bit-size", fingerprint_bit_size] + ["--fingerprint-degree", fingerprint_degree] + (["--ligand-only"] if ligand_only else [])
+        + ["--tmp-dir", tmp_dir] + ["--pickle-out", pickle_out, "\n"])
+
     print "command: "
     print command
-    script_loc = os.path.join(script_dir, script_template % job)
+
+    script_loc = os.path.join(tmp_dir, script_template % job)
     print "script_loc: "
     print script_loc
-
+    
 
     if queue_system == "pbs":
       print "Writing pbs script!"
@@ -89,13 +110,13 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
         f.write("#!/bin/bash\n")
         f.write("#SBATCH --output=out%d.out\n" %i)
         f.write("#SBATCH --error=out%d.err\n" %i)
-        f.write("#SBATCH  -p owners\n")
+        f.write("#SBATCH  -p normal\n")
         f.write("#SBATCH --mail-type=ALL")
         f.write("#SBATCH --mail=enf@stanford.edu")
         f.write("#SBATCH --job-name=deep%d\n" %i)
         f.write("#SBATCH -n 1\n")
-        f.write("#SBATCH --time=4:00:00\n")
-        f.write("#SBATCH --qos=normal\n")
+        f.write("#SBATCH --time=12:00:00\n")
+        f.write("##SBATCH --qos=normal\n")
         f.write(command)
         slurm_command = ["sbatch", script_loc]
         print slurm_command
@@ -107,5 +128,6 @@ def featurize_pdbbind(pdbbind_dir, script_dir, script_template, num_jobs,
 
 if __name__ == '__main__':
   args = parse_args()
-  featurize_pdbbind(args.pdbbind_dir, args.script_dir,
-      args.script_template, args.num_jobs, args.pickle_dir, args.queue_system, args.featurization_type, args.box_width, args.voxel_width)
+  featurize_pdbbind(args.pdbbind_dir, args.tmp_dir, args.script_template, args.num_jobs, 
+    args.pickle_dir, args.queue_system, args.featurization_type, args.box_width, args.voxel_width, args.nb_rotations, args.nb_reflections, 
+    args.save_intermediates, args.grid_featurization_type, args.fingerprint_bit_size, args.fingerprint_degree, args.ligand_only)
diff --git a/featurize_pdbbind_cluster_job.py b/featurize_pdbbind_cluster_job.py
@@ -7,10 +7,9 @@
 import cPickle as pickle
 import mdtraj as md
 from vs_utils.features.nnscore import Binana
-from vs_utils.utils.nnscore_pdb import PDB
-from vs_utils.utils.PDBTransformer import PDBTransformer
-from vs_utils.utils.grid_factory import GridGenerator
+from vs_utils.utils.grid_featurizer import grid_featurizer
 from rdkit import Chem
+import gzip
 
 def parse_args(input_args=None):
   """Parse command-line arguments."""
@@ -25,8 +24,22 @@ def parse_args(input_args=None):
                       help='Input box width in Angstroms, default=16')
   parser.add_argument('--voxel-width', required=1,
                       help='Input voxel width in Angstroms, default=0.5')
+  parser.add_argument('--nb-rotations', required=1,
+                      help='Number of times to rotate grid, integer')
+  parser.add_argument('--nb-reflections', required=1,
+                      help='Number of times to reflect grid, integer')
   parser.add_argument('--tmp-dir', required=1,
                       help='Directory for saving all intermediate files')
+  parser.add_argument('--save-intermediates', action='store_true',
+                      help="Save intermediate files.")
+  parser.add_argument('--grid-featurization-type', required=0, default=None,
+                      type=str, help="Which type of 3d_grid? options: ecfp, splif")
+  parser.add_argument('--fingerprint-bit-size', required=0, default=None, type=int,
+                      help="Choose n in 2^n to define number of bits for ECFP fingerprint array")
+  parser.add_argument('--fingerprint-degree', required=0, default=None, type=int,
+                      help="Choose degree to which fingerprints are computed, i.e. ECFP2 vs ECFP4")
+  parser.add_argument('--ligand-only', action='store_true',
+                      help="Featurize only the ligands?")
   return parser.parse_args(input_args)
 
 def featurize_fingerprint(pdb_directories, pickle_out):
@@ -93,10 +106,11 @@ def featurize_fingerprint(pdb_directories, pickle_out):
     # Write the computed quantities
     feature_vectors[pdb_dir] = (features, smiles, seq)
   print "About to write pickle to " + pickle_out
-  with open(pickle_out, "wb") as f:
+  with gzip.open(pickle_out, "wb") as f:
     pickle.dump(feature_vectors, f)
 
-def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_dir):
+def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, nb_rotations, nb_reflections, 
+  feature_types, ecfp_degree, ecfp_power, save_intermediates, ligand_only, tmp_dir):
   feature_vectors = {}
   for count, pdb_dir in enumerate(pdb_directories):
 
@@ -116,33 +130,51 @@ def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_d
         protein_pdb = f
       elif re.search("_protein_hyd.pdbqt$", f):
         protein_pdbqt = f
+      elif re.search("_ligand.mol2$", f):
+        ligand_mol2 = f
 
     print "Extracted Input Files:"
-    print (ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt)
+    print (ligand_pdb, ligand_pdbqt, protein_pdb, protein_pdbqt, ligand_mol2)
     if (not ligand_pdb or not ligand_pdbqt or not protein_pdb or not
-        protein_pdbqt):
+        protein_pdbqt or not ligand_mol2):
         raise ValueError("Required files not present for %s" % pdb_dir)
-
     ligand_pdb_path = os.path.join(pdb_dir, ligand_pdb)
     ligand_pdbqt_path = os.path.join(pdb_dir, ligand_pdbqt)
     protein_pdb_path = os.path.join(pdb_dir, protein_pdb)
     protein_pdbqt_path = os.path.join(pdb_dir, protein_pdbqt)
+    ligand_mol2_path = os.path.join(pdb_dir, ligand_mol2)
+
 
-    tmp_dir = os.path.join(tmp_dir, protein_pdb)
     if not os.path.exists(tmp_dir): os.makedirs(tmp_dir)
     system_pdb = os.path.join(tmp_dir, "system.pdb")
     box_pdb = os.path.join(tmp_dir, "box.pdb")
     box_pickle = os.path.join(tmp_dir, "box.pickle")
     grid_pickle = os.path.join(tmp_dir, "grid.pickle")
 
-    p = PDBTransformer()
-    p.transform(protein_pdb_path, protein_pdbqt_path, ligand_pdb_path, ligand_pdbqt_path, 
-                system_pdb, box_pdb, box_pickle, box_x = box_width, box_y = box_width, box_z = box_width)
-
-    g = GridGenerator()
-    grid = g.transform(box_pickle, box_width, box_width, box_width, voxel_width, grid_pickle, num_features=3)
-
-    print "About to compute sequence."
+    g = grid_featurizer()
+
+
+    kwargs = {}
+    if box_width is not None: kwargs["box_width"] = box_width
+    if voxel_width is not None: kwargs["voxel_width"] = voxel_width
+    if nb_rotations is not None: kwargs["nb_rotations"] = nb_rotations
+    if nb_reflections is not None: kwargs["nb_reflections"] = nb_reflections
+    if feature_types is not None: kwargs["feature_types"] = feature_types
+    if ecfp_degree is not None: kwargs["ecfp_degree"] = ecfp_degree
+    if ecfp_power is not None: kwargs["ecfp_power"] = ecfp_power
+    if save_intermediates is not None: kwargs["save_intermediates"] = save_intermediates
+    if ligand_only is not None: kwargs["ligand_only"] = ligand_only
+    print(kwargs)
+
+    features = g.transform(protein_pdb_path, ligand_mol2_path, 
+                tmp_dir, **kwargs)
+
+    pdb_name = str(pdb_dir).split("/")[len(str(pdb_dir).split("/"))-1]
+    #for box_id, box in boxes.iteritems():
+      #g = GridGenerator()
+      #grid_pkl_dir = "%s/%s_%d_%d_grid.pkl" %(tmp_dir, pdb_name, box_id[0], box_id[1])
+      #grid = g.transform(box, box_width, box_width, box_width, voxel_width, grid_pkl_dir, num_features=3)
+      #print "About to compute sequence."
     protein = md.load(protein_pdb_path)
     seq = [r.name for r in protein.top.residues] 
 
@@ -152,22 +184,27 @@ def featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_d
     if ligand_mol is None:
       continue
     smiles = Chem.MolToSmiles(ligand_mol)
+    for system_id, features in features.iteritems():
+      print(system_id)
+      feature_vectors["%s_rotation%d_reflection%d" %(pdb_name, system_id[0], system_id[1])] = (features, smiles, seq)
 
-    feature_vectors[pdb_dir] = (grid, smiles, seq)
   print "About to write pickle to " + pickle_out
-  with open(pickle_out, "wb") as f:
+  with gzip.open(pickle_out, "wb") as f:
     pickle.dump(feature_vectors, f)
 
 
-def featurize_job(pdb_directories, pickle_out, featurization_type, box_width, voxel_width, tmp_dir):
+def featurize_job(pdb_directories, pickle_out, featurization_type, box_width, voxel_width, tmp_dir, nb_rotations, nb_reflections,
+  grid_featurization_type, fingerprint_degree, fingerprint_bit_size, save_intermediates, ligand_only):
   if featurization_type=="fingerprint":
     featurize_fingerprint(pdb_directories, pickle_out)
   elif featurization_type=="3d_grid":
-    featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, tmp_dir)
+    featurize_3d_grid(pdb_directories, pickle_out, box_width, voxel_width, nb_rotations, nb_reflections,
+      grid_featurization_type, fingerprint_degree, fingerprint_bit_size, save_intermediates, ligand_only, tmp_dir)
   else:
     print("Didn't understand featurization type. options are fingerprint or 3d_grid")
 
 
 if __name__ == '__main__':
   args = parse_args()
-  featurize_job(args.pdb_directories, args.pickle_out, args.featurization_type, args.box_width, args.voxel_width, args.tmp_dir)
+  featurize_job(args.pdb_directories, args.pickle_out, args.featurization_type, args.box_width, args.voxel_width, args.tmp_dir, args.nb_rotations, args.nb_reflections,
+    args.grid_featurization_type, args.fingerprint_degree, args.fingerprint_bit_size, args.save_intermediates, args.ligand_only)
diff --git a/featurize_whole_pdbbind.sh b/featurize_whole_pdbbind.sh
@@ -0,0 +1,2 @@
+python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1
+
diff --git a/featurize_whole_pdbbind_vanilla.sh b/featurize_whole_pdbbind_vanilla.sh
@@ -0,0 +1,2 @@
+python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_0rot_0ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0
+
diff --git a/generate_pdbbind_data.py b/generate_pdbbind_data.py
@@ -9,6 +9,7 @@
 import cPickle as pickle
 import gzip
 import pandas as pd
+import multiprocessing as mp
 
 def parse_args(input_args=None):
   """Parse command-line arguments."""
@@ -22,6 +23,9 @@ def parse_args(input_args=None):
   parser.add_argument("--output_type", default="csv",
                       choices=["csv", "pkl.gz"],
                       help="Type of output file.")
+  parser.add_argument("--parallel", default=False,
+                      type=bool,
+                      help="Do processing in parallel")
   return parser.parse_args(input_args)
 
 def extract_labels(pdbbind_label_file):
@@ -58,25 +62,49 @@ def write_pkl_gz(feature_dict, labels, out):
   with gzip.open(out, "wb") as f:
     outputs = []
     for key in feature_dict:
-      label = labels[key]
-      features = feature_dict[key]
-      # TODO(rbharath): Once smiles/sequences are added into 3D grid data,
-      # remove this line
-      smiles, sequence = None, None
+      labels_key = key.split("_")[0]
+      label = labels[labels_key]
+      features, smiles, sequence = feature_dict[key]
       outputs.append({"smiles": smiles, "sequence": sequence, "label": label, "features": features})
     df = pd.DataFrame(outputs)
     pickle.dump(df, f)
 
-def generate_dataset(pdbbind_label_file, feature_files, out, output_type):
+def read_feature_file(feature_file, feature_dict = {}):
+  print("Currently reading %s" %feature_file)
+  with gzip.open(feature_file, "rb") as features:
+    contents = pickle.load(features)
+    for index, (key, value) in enumerate(contents.iteritems()):
+      name = os.path.basename(key)
+      feature_dict[name] = value
+  return(feature_dict)
+
+'''
+http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
+'''
+def merge_dicts(*dict_args):
+    '''
+    Given any number of dicts, shallow copy and merge into a new dict,
+    precedence goes to key value pairs in latter dicts.
+    '''
+    result = {}
+    for dictionary in dict_args:
+        result.update(dictionary)
+    return result
+
+def generate_dataset(pdbbind_label_file, feature_files, out, output_type, parallel):
   """Generate dataset file."""
   labels = extract_labels(pdbbind_label_file)
-  feature_dict = {}
-  for feature_file in feature_files:
-    with open(feature_file, "rb") as features:
-      contents = pickle.load(features)
-      for key, value in contents.iteritems():
-        name = os.path.basename(key)
-        feature_dict[name] = value
+  if not parallel:
+    feature_dict = {}
+    for feature_file in feature_files:
+      feature_dict = read_feature_file(feature_file, feature_dict)
+  else:
+    pool = mp.Pool(mp.cpu_count())
+    feature_dicts = pool.map(read_feature_file, feature_files)
+    feature_dict = merge_dicts(*feature_dicts)
+    pool.terminate()
+    print(len(feature_dict))
+    print(len(feature_dict.keys()))
   if output_type == "csv":
     write_csv(feature_dict, labels, out)
   elif output_type == "pkl.gz":
@@ -85,4 +113,4 @@ def generate_dataset(pdbbind_label_file, feature_files, out, output_type):
 
 if __name__ == '__main__':
   args = parse_args()
-  generate_dataset(args.pdbbind_label_file, args.feature_files, args.out, args.output_type)
+  generate_dataset(args.pdbbind_label_file, args.feature_files, args.out, args.output_type, args.parallel)
diff --git a/generate_pdbbind_sample.sh b/generate_pdbbind_sample.sh
@@ -0,0 +1 @@
+python generate_pdbbind_data.py --pdbbind-label-file /scratch/users/enf/deep-docking/datasets/pdbbind/INDEX_general_PL_data.2014 --feature-files /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1/*.pkl.gz --out /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1.pkl.gz --output_type pkl.gz --parallel parallel
diff --git a/sample_input.sh b/sample_input.sh
@@ -0,0 +1,3 @@
+#python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref_t4 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1
+python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0 --ligand-only
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --script-dir /scratch/users/enf/deep-docking/shallow/pdbbind_full_tmp/ --script-template "job%d.sbatch" --num-jobs 128 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref-3dgrid-t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		python generate_pdbbind_data.py --pdbbind-label-file /scratch/users/enf/deep-docking/datasets/pdbbind/INDEX_general_PL_data.2014 --feature-files /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1/*.pkl.gz --out /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1.pkl.gz --output_type pkl.gz --parallel parallel
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_10rot_1ref_t4 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 10 --nb-reflections 1
		python featurize_pdbbind_cluster.py --pdbbind-dir /scratch/users/enf/deep-docking/datasets/pdbbind/website --tmp-dir /scratch/users/enf/deep-docking/shallow/tmp/ --script-template "job%d.sbatch" --num-jobs 10 --pickle-dir /scratch/users/enf/deep-docking/shallow/pdbbind_smiles_ecfp_ligonly_nohyd_t1 --queue-system slurm --featurization-type 3d_grid --box-width 16.0 --voxel-width 0.5 --nb-rotations 0 --nb-reflections 0 --ligand-only