EMIRGE/plot_blast_hit_stats.R at master · DenefLab/EMIRGE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# Script written by Robin Rohwer
# This is step 6 in the Taxonomy Assignment Workflow

# Take in the blast table generated by find_seqIDs_with_pident.R
# Analyze the blast hits, are the best full-length corrected hits being returned as the highest scoring pair (1st hit)?

# purpose: are blast settings OK, or are they not returning full length hits that would make our cutoff.

# The script generates bar plots and csv tables of the results and saves them in the plots.folder.path

# Note: it will make plots with data for other cutoffs not supplied by the user.
# The script is calculating those for the given table, it's not a hard-coding mistake.

# terminal command line syntax:
# RScript plot_blast_hit_stats.R otus.custom.blast.table.modified 98 plots

# ####
# Receive arguments from terminal command line
# ####

userprefs <- commandArgs(trailingOnly = TRUE)
blast.file.path <- userprefs[1]
pident.cutoff <- as.numeric(userprefs[2])
plots.folder.path <- userprefs[3]
if (length(userprefs) > 3){
  mirror.location <- userprefs[4]
}else{
  mirror.location <- "https://cran.mtu.edu"
}

# cat("fuck you forgot to comment out the file paths in plot_blast_hit_stats.R!")
# blast.file.path <- "../../take19/otus.custom.blast.table.modified"
# pident.cutoff <- 98
# plots.folder.path <- "../../take19/plots"
# mirror.location <- "https://cran.mtu.edu"

# ####
# Install Necessary Packages
# ####

# The package reshape is needed, must specify cran mirror and library path with Rscript
library.path <- cat(.libPaths()) # this is the default installation path
get.necessary.packages <- function(){
  if (library(package = "reshape", logical.return = TRUE, lib.loc = library.path) == FALSE){
    install.packages("reshape", repos = mirror.location)
    library("reshape", lib.loc = library.path)
  }else{
    library("reshape", lib.loc = library.path)
  }
}

# ####
# Define Functions for Handling Data
# ####

# import the blast table generated in find_seqIDs_with_pident
import.BLAST.hits.table <- function(FilePath){
  blast.file <- FilePath
  blast <- read.table(file = blast.file, header = F, sep = "", colClasses = "character")
  colnames(blast) <- c("qseqid", "pident", "length", "qlen", "q.align", "true.pids", "hit.num")
  blast[ ,2:7] <- apply(blast[ ,2:7], 2, as.numeric)
  return(blast)
}

# Trim blast table to only contain hits above the pident cutoff
apply.pident.cutoff <- function(BlastTable, PidentCutoff){
  blast <- BlastTable
  cutoff <- PidentCutoff

  index <- which(blast$true.pids >= cutoff)
  blast.cutoff <- blast[index,]

  return(blast.cutoff)
}

# find number of seqIDs from each hit #
reformat.fract.ids.vs.hit.num <- function(BlastTable){
  blast <- BlastTable

  # Only look at seqID and hit.num
  blast <- blast[ ,c(1,7)]
  blast[ ,2] <- as.numeric(blast[ ,2])

  # Use the reshape package to rearrange your data tables
  # id is hit.num because that is the category we're arranging by
  # measured is qseqid because that is the variable we're applying the aggregate funciton to- what's the total #?
  blast.melted <- melt(data = blast, id.vars = "hit.num", measure.vars = "qseqid")
  blast.casted <- cast(data = blast.melted, formula = hit.num ~ variable, fun.aggregate = length)

  # Convert the qseqid length column to % total length/numbers
  blast.casted$qseqid <- blast.casted$qseqid / sum(blast.casted$qseqid) * 100
  colnames(blast.casted)[2] <- "perc.of.qseqids"
  return(blast.casted)
}

# ####
# Define Functions for Plotting Data
# ####

# bar plot blast hit number results, and also export a table
bar.plot.blast.results <- function(BlastHitsTable, OutputFolder, NumBars, Cutoff = 0){
  hits <- BlastHitsTable
  plot.folder <- OutputFolder
  hits.reported <- NumBars

  # at higher cutoffs some hits appear unreported, make sure these say zero so the bars don't disappear
  hits.matrix <- matrix(data = 0,nrow = hits.reported, ncol = 2)
  colnames(hits.matrix) <- c("hit.num", "perc.of.qseqids")
  row.names(hits.matrix) <- paste("hit",1:hits.reported, sep="")
  hits.matrix[,1] <- 1:hits.reported

  for (h in 1:hits.reported){
    index <- which(hits[,1] == h)
    if (length(index) > 0){
      hits.matrix[h,2] <- hits[index,2]
    }else{
      hits.matrix[h,2] <- 0
    }
  }

  if (Cutoff == 0){

    # export the plot
    plot.name <- paste(plot.folder, "/BLAST_hits_used_overall.png", sep="")
    png(filename = plot.name, width = 5.5, height = 5, units = "in", res = 100)
    barplot(height = hits.matrix[,2], names.arg = hits.matrix[,1], ylim = c(0,100),
            main = "Which BLAST hit gave the best \"full length\" pident?\n(no cutoff pident applied)",
            xlab = "Hit Number", ylab = "Percent of Best Hits (%)", col = "lightsalmon")
    unnecessary.comment <- dev.off()
    cat("made plot: ", plot.name, "\n")

    # export the table
    file.name <- paste(plot.folder, "/BLAST_hits_used_overall.csv", sep = "")
    write.csv(x = hits.matrix, file = file.name, quote = FALSE, row.names = FALSE)
    cat("made datafile: ", file.name, "\n")

  }else{

    # export the plot
    plot.name <- paste(plot.folder, "/BLAST_hits_used_for_pident_", Cutoff, ".png", sep="")
    png(filename = plot.name, width = 5.5, height = 5, units = "in", res = 100)
    barplot(height = hits.matrix[,2], names.arg = hits.matrix[,1], ylim = c(0,100),
            main = paste("Which BLAST hit gave the best \"full length\" pident?\n(out of seqIDs above cutoff: >=",Cutoff,")",sep=""),
            xlab = "Hit Number", ylab = "Percent of Best Hits (%)", col = "lightsalmon")
    unnecessary.comment <- dev.off()
    cat("made plot: ", plot.name, "\n")

    #export the table
    file.name <- paste(plot.folder, "/BLAST_hits_used_for_pident_", Cutoff, ".csv", sep = "")
    write.csv(x = hits.matrix, file = file.name, quote = FALSE, row.names = FALSE)
    cat("made datafile: ", file.name, "\n")
  }
}

# Make a stacked bar chart showing how the proportions change with different pidents
bar.plot.stacked.blast.results <- function(BlastTable, CutoffVector, OutputFolder, Reverse = FALSE){
  blast <- BlastTable
  cutoffs <- CutoffVector
  plot.folder <- OutputFolder
  num.hits.reported <- length(unique(blast$hit.num)) # the highest blast hit number output into the table

  # create a matrix from which to plot a stacked bar chart
  hits.matrix <- matrix(data = 0,nrow = num.hits.reported, ncol = length(cutoffs))
  colnames(hits.matrix) <- paste("", cutoffs, sep="")
  row.names(hits.matrix) <- paste("hit",1:num.hits.reported, sep="")

  # get hit stats for each cutoff
  for (c in 1:length(cutoffs)){
    trimmed.blast <- apply.pident.cutoff(BlastTable = blast, PidentCutoff = cutoffs[c])
    hits <- reformat.fract.ids.vs.hit.num(BlastTable = trimmed.blast)

    # at higher cutoffs some hits appear unreported, make sure these say zero so they fit in the matrix
    for (h in 1:num.hits.reported){
      index <- which(hits[,1] == h)
      if (length(index) > 0){
        hits.matrix[h,c] <- hits[index,2]
      }else{
        hits.matrix[h,c] <- 0
      }
    }
  }

  # Plot stacked bar that doesn't have hit #1 so that it's easier to see:
  if (Reverse == TRUE){
    missed.best.hits <- hits.matrix[-1, ]
    plot.name <- paste(plot.folder, "/BLAST_hits_used_for_pidents_", min(CutoffVector), "-", max(CutoffVector), "_only_incorrect_hits.png", sep = "")
    png(filename = plot.name, width = 8, height = 5, units = "in", res = 100)
    par(mar = c(8.5,5,4,.4))
    barplot(hits.matrix[-1, ], ylim = c(0,max(colSums(missed.best.hits))+2), col=rainbow(num.hits.reported),
            main = "Which BLAST hits missed the best \"full length\" pident?",
            xlab = "Full length pident cutoff used to filter seqIDs (%)",
            ylab = "SeqIDs corresponding to each blast hit number (%)")
    legend(x = num.hits.reported+1, y = max(colSums(missed.best.hits)), legend = row.names(missed.best.hits),
           text.col = rainbow(num.hits.reported-1), bty = "n")
    unnecessary.comment <- dev.off()
    cat("made plot: ", plot.name, "\n")
  }else{
    # export stacked bar plot with all hit numbers (the origional one)
    plot.name <- paste(plot.folder, "/BLAST_hits_used_for_pidents_", min(CutoffVector), "-", max(CutoffVector), ".png", sep = "")
    png(filename = plot.name, width = 8, height = 5, units = "in", res = 100)
    par(mar = c(8.5,5,4,.4))
    barplot(hits.matrix, ylim = c(0,100), col=c("grey",rainbow(num.hits.reported-1)),
            main = "Which BLAST hit gave the best \"full length\" pident?",
            xlab = "Full length pident cutoff used to filter seqIDs (%)",
            ylab = "SeqIDs corresponding to each blast hit number (%)")
    description <- paste("The grey bar is blast hit #1. These are the percent of seqIDs where both you and blast calculated the same hit to have the best pident.\n",
                         "The colored bars are blast hits #2 - ", num.hits.reported, ". These are the percent of seqIDs where the calculated full length pident was better for a different hit.\n",
                         "At a more stringent pident filter (x axis), fewer poor matches exist so BLAST reports longer matches leading to fewer discrepancies.\n",
                         "However, if there is a lot of color in your chosen pident's bar you must adjust the BLAST settings to correct that!\n",
                         sep = "")
    mtext(text = description, side = 1, line = 7.5, at = -2, cex = .75, adj = 0)
    unnecessary.comment <- dev.off()
    cat("made plot: ", plot.name, "\n")
  }

  #export .csv table of results
  file.name <- paste(plot.folder, "/BLAST_hits_used_for_pidents_", min(CutoffVector), "-", max(CutoffVector), ".csv", sep = "")
  write.csv(x = hits.matrix, file = file.name, quote = FALSE)
  cat("Made datafile: ", file.name, "\n")
}

# plot number of seqIDs at each hit #
line.plot.overlay.blast.results <- function(BlastTable, CutoffVector, OutputFolder){
  blast <- BlastTable
  cutoffs <- CutoffVector
  plots.folder <- OutputFolder

  # Set up a blank plot:
  png(filename = paste(plots.folder, "/BLAST_hits_line_plot.png", sep = ""), width = 5.5, height = 5, units = "in", res = 100)
  plot(0, type = "n", xlim = c(1,5), ylim = c(0,100), main = "Which BLAST hit gave the best \"full length\" pident?",
       xlab = "Hit Number", ylab = "Percent of Best Hits (%)" )

  # Plot results for each cutoff
  for (c in 1:length(cutoffs)){
    trimmed.blast <- apply.pident.cutoff(BlastTable = blast, PidentCutoff = cutoffs[c])
    hits <- reformat.fract.ids.vs.hit.num(BlastTable = trimmed.blast)
    lines(hits, col = rainbow(length(cutoffs))[c])
    points(hits, col = rainbow(length(cutoffs))[c], pch = 19)
  }

  # Add a legend:
  legend(x = 4, y = 100, legend = sort(cutoffs, decreasing=T), text.col = rainbow(length(cutoffs))[length(cutoffs):1])
  unnecessary.comment <- dev.off()
}

# ####
# Use Functions
# ####

get.necessary.packages()

blast <- import.BLAST.hits.table(FilePath = blast.file.path)
hits.reported <- length(unique(blast$hit.num))

# # View a table, 'hit.stats', that shows the percent of times each hit was best
# hit.stats <- reformat.fract.ids.vs.hit.num(BlastTable = blast)
# bar.plot.blast.results(BlastHitsTable = hit.stats, OutputFolder = plots.folder.path, NumBars = hits.reported)

# # View a table, 'cutoff.hit.stats', that shows the percent of time each hit was best
# # after the blast results have been sorted for users chosen cutoff (the one supplied in command line)
# blast.cutoff <- apply.pident.cutoff(BlastTable = blast, PidentCutoff = pident.cutoff)
# cutoff.hit.stats <- reformat.fract.ids.vs.hit.num(BlastTable = blast.cutoff)
# bar.plot.blast.results(BlastHitsTable = cutoff.hit.stats, Cutoff = pident.cutoff, OutputFolder = plots.folder.path, NumBars = hits.reported)

# # View a plot of overlayed lines showing how the proportion of best hits changes with different cutoffs
# line.plot.overlay.blast.results(BlastTable = blast, CutoffVector = 90:100, OutputFolder = plots.folder.path)

# View stacked bar chart to see how the proportion of best hits changes with different cutoffs
bar.plot.stacked.blast.results(BlastTable = blast, CutoffVector = c(0,90:100), OutputFolder = plots.folder.path)
# bar.plot.stacked.blast.results(BlastTable = blast, CutoffVector = 70:100, OutputFolder = plots.folder.path)
bar.plot.stacked.blast.results(BlastTable = blast, CutoffVector = c(0,90:100), OutputFolder = plots.folder.path, Reverse = TRUE)


# Note how that plot levels off as you look way lower on the pidents.
# That's probably because that's where blast's built-in e-value cutoff
# stops reporting worse values, so worse hits aren't included in output.