diff --git a/.github/workflows/check-bioc.yaml b/.github/workflows/check-bioc.yaml index 80e8721..946a33e 100644 --- a/.github/workflows/check-bioc.yaml +++ b/.github/workflows/check-bioc.yaml @@ -9,7 +9,7 @@ ## ## Using GitHub Actions exposes you to many details about how R packages are ## compiled and installed in several operating system.s -## If you need help, please follow the steps listed at +### If you need help, please follow the steps listed at ## https://github.com/r-lib/actions#where-to-find-help ## ## If you found an issue specific to biocthis's GHA workflow, please report it @@ -38,7 +38,8 @@ env: run_covr: 'false' run_pkgdown: 'true' has_RUnit: 'false' - cache-version: 'cache-v7' + cache-version: 'cache-v2' + run_docker: 'false' jobs: build-check: @@ -51,9 +52,11 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.2', bioc: '3.15', cont: "bioconductor/bioconductor_docker:RELEASE_3_15", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: '4.2', bioc: '3.15'} - - { os: windows-latest, r: '4.2', bioc: '3.15'} + - { os: ubuntu-latest, r: '4.5.1', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel-R-4.5.1"} + - { os: macOS-latest, r: '4.5', bioc: '3.21'} + - { os: windows-latest, r: '4.5', bioc: '3.21'} + ## Check https://github.com/r-lib/actions/tree/master/examples + ## for examples using the http-user-agent env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} @@ -66,21 +69,23 @@ jobs: ## Set the R library to the directory matching the ## R packages cache step further below when running on Docker (Linux). 
- - name: Set R Library home on Linux - if: runner.os == 'Linux' - run: | - mkdir /__w/_temp/Library - echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile + ## - name: Set R Library home on Linux + ## if: runner.os == 'Linux' + ## uses: r-lib/actions/setup-r@v2 + ## with: + ## r-version: ${{ matrix.config.r }} + ## run: | + ## mkdir -p /__w/_temp/Library + ## echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile ## Most of these steps are the same as the ones in ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml ## If they update their steps, we will also need to update ours. - name: Checkout Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 ## R is already included in the Bioconductor docker images - name: Setup R from r-lib - if: runner.os != 'Linux' uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} @@ -96,28 +101,28 @@ jobs: saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) shell: Rscript {0} - - name: Cache R packages + - name: Restore R package cache if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_21-r-4.5-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_21-r-4.5- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2--${{ hashFiles('.github/depends.Rds') 
}} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_21-r-4.5-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_21-r-4.5- - - name: Install Linux system dependencies - if: runner.os == 'Linux' - run: | - sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') - echo $sysreqs - sudo -s eval "$sysreqs" + # - name: Install Linux system dependencies + # if: runner.os == 'Linux' + # run: | + # sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') + # echo $sysreqs + # sudo -s eval "$sysreqs" - name: Install macOS system dependencies if: matrix.config.os == 'macOS-latest' @@ -125,17 +130,15 @@ jobs: ## Enable installing XML from source if needed brew install libxml2 echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV - ## Required to install magick as noted at ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 brew install imagemagick@6 - ## For textshaping, required by ragg, and required by pkgdown brew install harfbuzz fribidi - ## For installing usethis's dependency gert brew install libgit2 - + ## Required for tcltk + brew install xquartz --cask - name: Install Windows system dependencies if: runner.os == 'Windows' run: | @@ -150,7 +153,7 @@ jobs: - name: Set BiocVersion run: | - BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE) + BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE, force = TRUE) shell: Rscript {0} - name: Install dependencies pass 1 @@ -161,10 +164,12 @@ jobs: ## 
https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html ## https://github.com/r-lib/remotes/issues/296 ## Ideally, all dependencies should get installed in the first pass. - + ## For running the checks + message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) + install.packages(c("rcmdcheck", "BiocCheck"), repos = BiocManager::repositories()) ## Pass #1 at installing dependencies message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) - remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE) continue-on-error: true shell: Rscript {0} @@ -172,12 +177,7 @@ jobs: run: | ## Pass #2 at installing dependencies message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) - remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) - - ## For running the checks - message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) - remotes::install_cran("rcmdcheck") - BiocManager::install("BiocCheck") + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE, force = TRUE) shell: Rscript {0} - name: Install BiocGenerics @@ -188,15 +188,15 @@ jobs: shell: Rscript {0} - name: Install covr - if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux' run: | remotes::install_cran("covr") shell: Rscript {0} - name: Install pkgdown - if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: 
| - remotes::install_cran("pkgdown") + remotes::install_version("pkgdown", version="2.1.2") shell: Rscript {0} - name: Session info @@ -209,10 +209,12 @@ jobs: - name: Run CMD check env: _R_CHECK_CRAN_INCOMING_: false + DISPLAY: 99.0 run: | + options(crayon.enabled = TRUE) rcmdcheck::rcmdcheck( - args = c("--no-build-vignettes", "--no-manual", "--timings"), - build_args = c("--no-manual", "--no-resave-data"), + args = c("--no-manual", "--no-vignettes", "--timings"), + build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), error_on = "warning", check_dir = "check" ) @@ -230,6 +232,8 @@ jobs: shell: Rscript {0} - name: Run BiocCheck + env: + DISPLAY: 99.0 run: | BiocCheck::BiocCheck( dir('check', 'tar.gz$', full.names = TRUE), @@ -240,31 +244,41 @@ jobs: shell: Rscript {0} - name: Test coverage - if: github.ref == 'refs/heads/master' && env.run_covr == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_covr == 'true' && runner.os == 'Linux' run: | - covr::codecov() + covr::codecov(coverage = covr::package_coverage(type = "all")) shell: Rscript {0} - name: Install package - if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: R CMD INSTALL . 
- - name: Deploy package - if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' - run: | - git config --global --add safe.directory '*' - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" - + - name: Build pkgdown site + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: | + remotes::install_version("pkgdown", version="2.1.1") + pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) ## at least one locally before this will work. This creates the gh-pages ## branch (erasing anything you haven't version controlled!) and ## makes the git history recognizable by pkgdown. + - name: Install deploy dependencies + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: | + apt-get update && apt-get -y install rsync + - name: Deploy pkgdown site to GitHub pages 🚀 + if: github.ref == 'refs/heads/devel' && env.run_pkgdown == 'true' && runner.os == 'Linux' + uses: JamesIves/github-pages-deploy-action@v4.5.0 + with: + clean: false + branch: gh-pages + folder: docs + - name: Upload check results if: failure() uses: actions/upload-artifact@master with: - name: ${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-results + name: ${{ runner.os }}-biocversion-RELEASE_3_21-r-4.5-results path: check diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 83976a3..db2b71d 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -1,3 +1,5 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: pull_request: @@ -6,37 +8,26 @@ name: test-coverage jobs: test-coverage: - runs-on: macOS-latest + runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 - - - uses: r-lib/actions/setup-pandoc@v2 - - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Cache R packages - uses: actions/cache@v3 with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + use-public-rspm: true - - name: Install dependencies - run: | - install.packages(c("remotes")) - remotes::install_deps(dependencies = TRUE) - remotes::install_cran("covr", force = TRUE) - shell: Rscript {0} + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::covr + needs: coverage - name: Test coverage - run: covr::codecov() + run: covr::codecov(quiet = FALSE) shell: Rscript {0} + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + diff --git a/DESCRIPTION b/DESCRIPTION index 8734d9b..3aee5f8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: CNVMetrics Type: Package -Version: 1.5.1 +Version: 1.15.1 Date: 2021-11-23 Title: Copy Number Variant Metrics Description: The CNVMetrics package calculates similarity metrics to @@ -49,4 +49,4 @@ VignetteBuilder: knitr URL: https://github.com/krasnitzlab/CNVMetrics, https://krasnitzlab.github.io/CNVMetrics/ BugReports: https://github.com/krasnitzlab/CNVMetrics/issues -RoxygenNote: 7.1.2 +RoxygenNote: 7.3.3 diff --git 
a/NEWS.md b/NEWS.md index 144afa7..aa61517 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# CNVMetrics 1.15.1 + +NEW FEATURES + +* The documentation has been updated. + + # CNVMetrics 1.5.1 NEW FEATURES diff --git a/R/CNVMetrics.R b/R/CNVMetrics.R index 417cdd8..1af06a4 100644 --- a/R/CNVMetrics.R +++ b/R/CNVMetrics.R @@ -11,8 +11,6 @@ #' the log2 ratio values. #' Finally, a visualization tool is provided to explore resulting metrics. #' -#' @docType package -#' #' @name CNVMetrics-package #' #' @aliases CNVMetrics-package CNVMetrics @@ -25,14 +23,15 @@ #' #' @seealso #' \itemize{ -#' \item \code{\link{calculateOverlapMetric}} {for calculating metric +#' \item{\code{\link{calculateOverlapMetric}} for calculating metric #' using overlapping amplified/deleted regions} -#' \item \code{\link{calculateLog2ratioMetric}} {for calculating metric +#' \item{\code{\link{calculateLog2ratioMetric}} for calculating metric #' using log2ratio values} -#' \item \code{\link{processSim}} {for generating simulations} -#' \item \code{\link{plotMetric}} {for plotting metrics} +#' \item{\code{\link{processSim}} for generating simulations} +#' \item{\code{\link{plotMetric}} for plotting metrics} #' } #' #' @encoding UTF-8 #' @keywords package -NULL +"_PACKAGE" + diff --git a/R/CNVMetricsLog2ratioInternalMethods.R b/R/CNVMetricsLog2ratioInternalMethods.R index cbd4162..da3963f 100644 --- a/R/CNVMetricsLog2ratioInternalMethods.R +++ b/R/CNVMetricsLog2ratioInternalMethods.R @@ -1,151 +1,149 @@ #' @title Calculate metric using the log2ratio values between two samples. -#' -#' @description Calculate a specific metric using the level of -#' amplification/deletion, in log2 ratio, between -#' two samples. -#' -#' @param entry a \code{list} which contains the row and column indexes -#' (always in this order) of -#' the metric in the final matrix. 
Those values correspond to the positions +#' +#' @description Calculate a specific metric using the level of +#' amplification/deletion, in log2 ratio, between two samples. +#' +#' @param entry a \code{list} which contains the row and column indexes +#' (always in this order) of +#' the metric in the final matrix. Those values correspond to the positions #' of the two samples used #' to calculate the metric in the \code{GRangesList} (\code{segmentData}). -#' -#' @param segmentData a \code{GRangesList} that contains a collection of -#' genomic ranges representing copy number events, including amplified/deleted -#' status, from at least 2 samples. All samples must have a metadata column +#' +#' @param segmentData a \code{GRangesList} that contains a collection of +#' genomic ranges representing copy number events, including amplified/deleted +#' status, from at least 2 samples. All samples must have a metadata column #' called '\code{log2ratio}' with the log2ratio values. #' @param method a \code{character} string representing the metric to be #' used ('\code{weightedEuclideanDistance}'). -#' -#' @param minThreshold a single \code{numeric} setting the minimum value -#' to consider two segments as different during the metric calculation. If the -#' absolute difference is below or equal to threshold, the difference will be -#' replaced by zero. -#' -#' @param bedExclusion an optional \code{GRanges} containing the regions -#' that have to be excluded for the metric calculation or code{NULL}. -#' -#' @details -#' +#' +#' @param minThreshold a single \code{numeric} setting the minimum value +#' to consider two segments as different during the metric calculation. If the +#' absolute difference is below or equal to threshold, the difference will be +#' replaced by zero. +#' +#' @param bedExclusion an optional \code{GRanges} containing the regions +#' that have to be excluded for the metric calculation or \code{NULL}. 
+#' +#' @details +#' #' The method calculates a specified metric using overlapping #' regions between the samples. Only regions corresponding to the type -#' specified by user are used in the calculation of the metric. The strand of +#' specified by user are used in the calculation of the metric. The strand of #' the regions is not taken into account while #' calculating the metric. -#' -#' The Sorensen metric is calculated by dividing twice the size of +#' +#' The Sorensen metric is calculated by dividing twice the size of #' the intersection by the sum of the size of the two sets. If the sum of #' the size of the two sets is zero; the value \code{NA} is -#' returned instead. -#' -#' +#' returned instead. +#' +#' #' @return a \code{list} containing 1 entry: #' \itemize{ -#' \item{\code{metric}}{ a \code{data.frame}, which contains 3 columns. The 2 -#' first columns, called \code{row} and \code{column} correspond to the +#' \item{\code{metric} a \code{data.frame}, which contains 3 columns. The 2 +#' first columns, called \code{row} and \code{column} correspond to the #' indexes of the metric in the final matrix. Those -#' 2 first columns match to the \code{entry} parameter. The third column, -#' called \code{metric}, -#' contains the values of the specified metric for each combination. -#' If the metric cannot be calculated, \code{NA} is present. -#' } +#' 2 first columns match to the \code{entry} parameter. The third column, +#' called \code{metric}, +#' contains the values of the specified metric for each combination. +#' If the metric cannot be calculated, \code{NA} is present. 
} #' } -#' +#' #' @examples -#' +#' #' ## Load required package to generate the two samples #' require(GenomicRanges) -#' +#' #' ## Create a GRangesList object with 3 samples #' ## The stand of the regions doesn't affect the calculation of the metric #' demo <- GRangesList() -#' +#' #' ## Generate two samples with log2value information as a metadata column -#' demo[["sample01"]] <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(100, 201, 400), +#' demo[["sample01"]] <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(100, 201, 400), #' end=c(200, 350, 500)), strand="*", #' log2ratio=c(1.1111, 2.2222, -0.9999)) -#' demo[["sample02"]] <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(150, 200, 450), +#' demo[["sample02"]] <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(150, 200, 450), #' end=c(250, 350, 500)), strand="*", #' log2ratio=c(2.2121, 1.1212, -1.3939)) -#' +#' #' ## The 2 samples used to calculate the metric -#' entries <- data.frame(row=c(2), col=c(1)) -#' -#' ## Calculate weighted Euclidean distance -#' CNVMetrics:::calculateOneLog2valueMetricT(entry=entries, -#' segmentData=demo, method="weightedEuclideanDistance", +#' entries <- data.frame(row=c(2), col=c(1)) +#' +#' ## Calculate weighted Euclidean distance +#' CNVMetrics:::calculateOneLog2valueMetricT(entry=entries, +#' segmentData=demo, method="weightedEuclideanDistance", #' minThreshold=0.2, bedExclusion=NULL) -#' -#' +#' +#' #' @author Astrid DeschĂȘnes #' @encoding UTF-8 #' @keywords internal -calculateOneLog2valueMetricT <- function(entry, segmentData, method, +calculateOneLog2valueMetricT <- function(entry, segmentData, method, minThreshold, bedExclusion) { entries <- split(entry, entry$col) - + results <- list() - + for (data in entries) { - + result <- data result$metric <- rep(NA, nrow(result)) - + sample02 <- segmentData[[data$col[1]]] - + for(i in seq_len(nrow(data))) { sample01 <- segmentData[[data$row[i]]] - - # Obtain the disjoint segments with log2ratio values in + + 
# Obtain the disjoint segments with log2ratio values in # metadata columns disjoinR <- createDisjoinSegmentsForTwoSamples( segmentDataSample1=sample01, - segmentDataSample2=sample02, + segmentDataSample2=sample02, bedExclusion=bedExclusion) - if (length(sample01) > 0 && length(sample02) > 0) { + if (length(sample01) > 0 && length(sample02) > 0) { result$metric[i] <- switch(method, - weightedEuclideanDistance= + weightedEuclideanDistance= calculateWeightedEuclideanDistanceFor2Samples( - segmentData=disjoinR, + segmentData=disjoinR, minThreshold=minThreshold)) } } - + results[[length(results) + 1]] <- result } - + results <- do.call(rbind, results) - + return(result <- list(metric=results)) } -#' @title Generate common segments to enable calculation of metrics on +#' @title Generate common segments to enable calculation of metrics on #' two segmented samples. -#' -#' @description The two segments are gathered together, including excluded -#' regions when specified, and a disjoint operation is done to create a -#' collection of non-overlapping ranges. The ranges overlapping the excluded +#' +#' @description The two segments are gathered together, including excluded +#' regions when specified, and a disjoint operation is done to create a +#' collection of non-overlapping ranges. The ranges overlapping the excluded #' regions are marked as so to be removed from future analysis. The log2value #' of each samples are assigned to the new disjointed segments for each sample #' in the metadata columns. -#' -#' @param segmentDataSample1 a \code{GRanges}, the segments from the first +#' +#' @param segmentDataSample1 a \code{GRanges}, the segments from the first #' sample. -#' -#' @param segmentDataSample2 a \code{GRanges}, the segments from the second +#' +#' @param segmentDataSample2 a \code{GRanges}, the segments from the second #' sample. -#' +#' #' @param bedExclusion a \code{GRanges}, the regions that must be #' excluded from the analysis. Default: \code{NULL}. 
-#' +#' #' @return a \code{GRanges} containing the common segment information for the -#' two samples. The log2ration value are present, for the two samples, in -#' the metadata columns. When there is not log2ratio value for one sample, +#' two samples. The log2ration value are present, for the two samples, in +#' the metadata columns. When there is not log2ratio value for one sample, #' NA is the assigned value. A metadata column also specifies if the segments #' should be included in the analysis. #' @@ -153,45 +151,45 @@ calculateOneLog2valueMetricT <- function(entry, segmentData, method, #' #' ## Load required package to generate the two samples #' require(GenomicRanges) -#' +#' #' # Create first Granges representing first sample #' sample01 <- GRanges(seqnames="chr1", #' ranges=IRanges(start=c(100, 201, 400), end=c(200, 350, 500)), #' strand="*", log2ratio=c(0.3091175, 0.4582058, -0.3798390)) -#' +#' #' # Create second Granges representing second sample #' sample02 <- GRanges(seqnames="chr1", #' ranges=IRanges(start=c(150, 200, 450), end=c(250, 350, 500)), #' strand="*", log2ratio=c(0.222174, 0.3282156, -0.2728292)) -#' +#' #' # Create disjoint segment using the 2 samples and without any region #' # excluded from the analysis (parameter bedExclusion set to null) -#' CNVMetrics:::createDisjoinSegmentsForTwoSamples(segmentDataSample1=sample01, +#' CNVMetrics:::createDisjoinSegmentsForTwoSamples(segmentDataSample1=sample01, #' segmentDataSample2=sample02, bedExclusion=NULL) -#' +#' #' @author Astrid DeschĂȘnes #' @importFrom GenomicRanges disjoin findOverlaps elementMetadata #' @importFrom S4Vectors queryHits subjectHits values<- #' @importFrom magrittr %>% #' @keywords internal -createDisjoinSegmentsForTwoSamples <- function(segmentDataSample1, - segmentDataSample2, +createDisjoinSegmentsForTwoSamples <- function(segmentDataSample1, + segmentDataSample2, bedExclusion=NULL) { - + results <- disjoin(c(segmentDataSample1, segmentDataSample2)) results$included <- 
TRUE - + ## Add information about excluded regions - ## When a segment overlaps with an excluded region, it is marked as + ## When a segment overlaps with an excluded region, it is marked as ## excluded if (!is.null(bedExclusion) && (length(bedExclusion) > 0)) { olaps <- findOverlaps(results, bedExclusion) - + if (length(olaps) > 0) { results[queryHits(olaps)]$included <- FALSE } } - + ## Assign the log2value of the 2 samples for each new segment as ## metadata columns segList <- list(segmentDataSample1, segmentDataSample2) @@ -200,102 +198,102 @@ createDisjoinSegmentsForTwoSamples <- function(segmentDataSample1, temp <- elementMetadata(results) sampleName <- paste0("sample_", i) temp[, sampleName] <- NA - temp[queryHits(olaps), sampleName] <- + temp[queryHits(olaps), sampleName] <- segList[[i]]$log2ratio[subjectHits(olaps)] values(results) <- temp } - + return(results) } #' @title Calculate Weighted Euclidean distance-based metric between samples. -#' +#' #' @description The weighted Euclidean distance-based metric corresponds to the -#' euclidean distance between 2 samples multiplied by the natural logarithm +#' euclidean distance between 2 samples multiplied by the natural logarithm #' of the number of bases of the analyzed segment. The final metric is 1 over -#' 1 added to the -#' squared sum of the values obtained for all segments that are not +#' 1 added to the +#' squared sum of the values obtained for all segments that are not #' excluded of the analysis. -#' -#' @param segmentData a \code{list} marked as a \code{preMetricSegments} -#' \code{class} that contains the disjoint segment information from 2 +#' +#' @param segmentData a \code{list} marked as a \code{preMetricSegments} +#' \code{class} that contains the disjoint segment information from 2 #' samples and the log2ratio values of the samples in the metadata columns. 
-#' -#' @param minThreshold a single \code{numeric} setting the minimum value -#' to consider two segments as different for the metric calculation. If the -#' absolute difference is below or equal to threshold, the value will be +#' +#' @param minThreshold a single \code{numeric} setting the minimum value +#' to consider two segments as different for the metric calculation. If the +#' absolute difference is below or equal to threshold, the value will be #' replaced by zero. -#' -#' @return a \code{numeric} representing the weighted euclidean distance +#' +#' @return a \code{numeric} representing the weighted euclidean distance #' between the two samples. If the distance cannot be calculated as the two #' samples don't share any segments with log2ratio value, the value NA is #' assigned. -#' -#' @details -#' -#' The weighted euclidean distance is -#' \eqn{1/(1 + (\sum((x_i - y_i)^2 * log2(nbrBases_i))^0.5)} +#' +#' @details +#' +#' The weighted euclidean distance is +#' \eqn{1/(1 + (\sum((x_i - y_i)^2 * log2(nbrBases_i))^0.5)} #' where \code{x} and \code{y} are the -#' values of 2 samples for a specific segment \code{i} and \code{nbrBases} the +#' values of 2 samples for a specific segment \code{i} and \code{nbrBases} the #' number of bases of the segment \code{i}. 
-#' -#' +#' +#' #' @examples #' #' ## Load required package to generate the two samples #' require(GenomicRanges) -#' +#' #' # Create first Granges representing first sample #' sample01 <- GRanges(seqnames="chr1", #' ranges=IRanges(start=c(100, 201, 400), end=c(200, 350, 500)), #' strand="*", log2ratio=c(0.3091175, 0.4582058, -0.3798390)) -#' +#' #' # Create second Granges representing second sample #' sample02 <- GRanges(seqnames="chr1", #' ranges=IRanges(start=c(150, 200, 450), end=c(250, 350, 500)), #' strand="*", log2ratio=c(0.222174, 0.3282156, -0.2728292)) -#' +#' #' # Create disjoint segment using the 2 samples and without any region #' # excluded from the analysis (parameter bedExclusion set to null) #' disjoinGRange <- CNVMetrics:::createDisjoinSegmentsForTwoSamples( -#' segmentDataSample1=sample01, segmentDataSample2=sample02, +#' segmentDataSample1=sample01, segmentDataSample2=sample02, #' bedExclusion=NULL) -#' +#' #' ## Calculate the weighted ecucidean distance between the two samples #' CNVMetrics:::calculateWeightedEuclideanDistanceFor2Samples( #' segmentData=disjoinGRange, minThreshold=0.2) -#' +#' #' @author Astrid DeschĂȘnes #' @importFrom GenomicRanges elementMetadata #' @importFrom IRanges ranges width #' @keywords internal -calculateWeightedEuclideanDistanceFor2Samples <- function(segmentData, +calculateWeightedEuclideanDistanceFor2Samples <- function(segmentData, minThreshold) { - + names <- colnames(elementMetadata(segmentData)) names <- names[names != "included"] - + incResults <- elementMetadata(segmentData[segmentData$included, ]) temp01 <- incResults[, c(names[1])] - incResults[, c(names[2])] - + final <- NA - + ## Only calculate when at least one value is present if (!all(is.na(temp01))) { incWidth <- width(ranges(segmentData[segmentData$included, ])) - + ## Set values to zero when lower than threshold - tempPos <- which(abs(temp01) <= minThreshold) + tempPos <- which(abs(temp01) <= minThreshold) if (length(tempPos) > 0) { 
temp01[tempPos] <- 0.0 } - + ## Calculate metric temp01 <- temp01 * temp01 * log2(incWidth) final <- 1/(1 + sum(temp01, na.rm=TRUE) ^ (1/2)) } - - return(final) -} + + return(final) +} diff --git a/R/CNVMetricsMethods.R b/R/CNVMetricsMethods.R index 3735f7a..14213d6 100644 --- a/R/CNVMetricsMethods.R +++ b/R/CNVMetricsMethods.R @@ -58,7 +58,7 @@ #' metric. This object is a list where each entry corresponds to one state #' specified in the '\code{states}' parameter. Each entry is a \code{matrix}: #' \itemize{ -#' \item{\code{state}}{ a lower-triangular \code{matrix} with the +#' \item{\code{state} a lower-triangular \code{matrix} with the #' results of the selected metric on the amplified regions for each paired #' samples. The value \code{NA} is present when the metric cannot be #' calculated. The value \code{NA} is also present in the top-triangular @@ -69,11 +69,10 @@ #' The object has the following attributes (besides "class" equal #' to "CNVMetric"): #' \itemize{ -#' \item{\code{metric}}{ the metric used for the calculation. +#' \item{\code{metric} the metric used for the calculation.} +#' \item{\code{names} the names of the two matrix containing the metrics +#' for the amplified and deleted regions.} #' } -#' \item{\code{names}}{ the names of the two matrix containing the metrics for -#' the amplified and deleted regions. -#' }} #' #' #' @references @@ -261,7 +260,7 @@ calculateOverlapMetric <- function(segmentData, #' @return an object of class "\code{CNVMetric}" which contains the calculated #' metric. This object is a list with the following components: #' \itemize{ -#' \item{\code{LOG2RATIO}}{ a lower-triangular \code{matrix} with the +#' \item{\code{LOG2RATIO} a lower-triangular \code{matrix} with the #' results of the selected metric on the log2ratio values for each paired #' samples. The value \code{NA} is present when the metric cannot be #' calculated. 
The value \code{NA} is also present in the top-triangular @@ -271,11 +270,10 @@ calculateOverlapMetric <- function(segmentData, #' The object has the following attributes (besides "class" equal #' to "CNVMetric"): #' \itemize{ -#' \item{\code{metric}}{ the metric used for the calculation. +#' \item{\code{metric} the metric used for the calculation. } +#' \item{\code{names} the names of the two matrix containing the metrics for +#' the amplified and deleted regions. } #' } -#' \item{\code{names}}{ the names of the two matrix containing the metrics for -#' the amplified and deleted regions. -#' }} #' #' #' @examples diff --git a/R/CNVMetricsOverlapInternalMethods.R b/R/CNVMetricsOverlapInternalMethods.R index b8b563b..fd83239 100644 --- a/R/CNVMetricsOverlapInternalMethods.R +++ b/R/CNVMetricsOverlapInternalMethods.R @@ -1,253 +1,253 @@ -#' @title Calculate metric using overlapping amplified/deleted regions between +#' @title Calculate metric using overlapping amplified/deleted regions between #' two samples. -#' -#' @description Calculate a specific metric using overlapping -#' amplified/deleted regions between two samples. -#' -#' @param entry a \code{list} which contains the row and column indexes -#' (always in this order) of -#' the metric in the final matrix. Those values correspond to the positions +#' +#' @description Calculate a specific metric using overlapping +#' amplified/deleted regions between two samples. +#' +#' @param entry a \code{list} which contains the row and column indexes +#' (always in this order) of +#' the metric in the final matrix. Those values correspond to the positions #' of the two samples used #' to calculate the metric in the \code{GRangesList} (\code{segmentData}). -#' -#' @param segmentData a \code{GRangesList} that contains a collection of -#' genomic ranges representing copy number events, including amplified/deleted -#' status, from at least 2 samples. 
All samples must have a metadata column -#' called '\code{state}' with a state, in an character string format, +#' +#' @param segmentData a \code{GRangesList} that contains a collection of +#' genomic ranges representing copy number events, including amplified/deleted +#' status, from at least 2 samples. All samples must have a metadata column +#' called '\code{state}' with a state, in an character string format, #' specified for each region (ex: DELETION, LOH, AMPLIFICATION, NEUTRAL, etc.). -#' +#' #' @param method a \code{character} string representing the metric to be #' used ('\code{sorensen}' or '\code{szymkiewicz}'. -#' -#' @param type a \code{character} string representing the type of +#' +#' @param type a \code{character} string representing the type of #' copy number events to be used ('\code{AMPLIFICATION}' or '\code{DELETION}'). -#' +#' #' @return a \code{list} containing 1 entry: #' \itemize{ -#' \item{\code{metric}}{ a \code{data.frame}, which contains 3 columns. The 2 -#' first columns, called \code{row} and \code{column} correspond to the +#' \item{\code{metric} a \code{data.frame}, which contains 3 columns. The 2 +#' first columns, called \code{row} and \code{column} correspond to the #' indexes of the metric in the final matrix. Those -#' 2 first columns match to the \code{entry} parameter. The third column, -#' called \code{metric}, -#' contains the values of the specified metric for each combination. -#' If the metric cannot be calculated, \code{NA} is present. +#' 2 first columns match to the \code{entry} parameter. The third column, +#' called \code{metric}, +#' contains the values of the specified metric for each combination. +#' If the metric cannot be calculated, \code{NA} is present. 
#' } #' } -#' +#' #' #' @examples -#' +#' #' ## Load required package to generate the samples #' require(GenomicRanges) -#' +#' #' ## Create a GRangesList object with 3 samples #' ## The stand of the regions doesn't affect the calculation of the metric #' demo <- GRangesList() -#' demo[["sample01"]] <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1905048, 4554832, 31686841, 32686222), +#' demo[["sample01"]] <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1905048, 4554832, 31686841, 32686222), #' end=c(2004603, 4577608, 31695808, 32689222)), strand="*", #' state=c("AMPLIFICATION", "AMPLIFICATION", "DELETION", "LOH")) -#' -#' demo[["sample02"]] <- GRanges(seqnames="chr1", -#' ranges= IRanges(start=c(1995066, 31611222, 31690000, 32006222), -#' end=c(2204505, 31689898, 31895666, 32789233)), +#' +#' demo[["sample02"]] <- GRanges(seqnames="chr1", +#' ranges= IRanges(start=c(1995066, 31611222, 31690000, 32006222), +#' end=c(2204505, 31689898, 31895666, 32789233)), #' strand=c("-", "+", "+", "+"), #' state=c("AMPLIFICATION", "AMPLIFICATION", "DELETION", "LOH")) -#' -#' ## The amplified region in sample03 is a subset of the amplified regions +#' +#' ## The amplified region in sample03 is a subset of the amplified regions #' ## in sample01 -#' demo[["sample03"]] <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1906069, 4558838), +#' demo[["sample03"]] <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1906069, 4558838), #' end=c(1909505, 4570601)), strand="*", #' state=c("AMPLIFICATION", "DELETION")) -#' +#' #' ## The 2 samples used to calculate the metric -#' entries <- data.frame(row=c(2, 3), col=c(1, 1)) -#' -#' ## Calculate Sorensen metric for the amplified regions on samples 2 and 3 +#' entries <- data.frame(row=c(2, 3), col=c(1, 1)) +#' +#' ## Calculate Sorensen metric for the amplified regions on samples 2 and 3 #' CNVMetrics:::calculateOneOverlapMetricT(entry=entries, segmentData=demo, #' method="sorensen", type="AMPLIFICATION") -#' -#' ## 
Calculate Szymkiewicz-Simpson metric for the amplified regions -#' ## in samples 1 and 2 -#' ## Amplified regions of sample02 are a subset of the amplified +#' +#' ## Calculate Szymkiewicz-Simpson metric for the amplified regions +#' ## in samples 1 and 2 +#' ## Amplified regions of sample02 are a subset of the amplified #' ## regions in sample01 #' CNVMetrics:::calculateOneOverlapMetricT(entry=entries, segmentData=demo, #' method="szymkiewicz", type="AMPLIFICATION") -#' -#' ## Calculate Sorensen metric for the deleted regions in samples 1 and 2 +#' +#' ## Calculate Sorensen metric for the deleted regions in samples 1 and 2 #' CNVMetrics:::calculateOneOverlapMetricT(entry=entries, segmentData=demo, #' method="sorensen", type="DELETION") -#' +#' #' @author Astrid Deschênes #' @encoding UTF-8 #' @keywords internal calculateOneOverlapMetricT <- function(entry, segmentData, method, type) { - + entries <- split(entry, entry$col) - + results <- list() - + for (data in entries) { - + result <- data result$metric <- rep(NA, nrow(result)) - + sample02 <- segmentData[[data$col[1]]] sample02 <- sample02[sample02$state == type,] - + for(i in seq_len(nrow(data))) { sample01 <- segmentData[[data$row[i]]] sample01 <- sample01[sample01$state == type,] - - if (length(sample01) > 0 && length(sample02) > 0) { + + if (length(sample01) > 0 && length(sample02) > 0) { result$metric[i] <- switch(method, sorensen=calculateSorensen(sample01, sample02), szymkiewicz=calculateSzymkiewicz(sample01, sample02), jaccard=calculateJaccard(sample01, sample02)) } } - + results[[length(results) + 1]] <- result } - + results <- do.call(rbind, results) return(result <- list(metric=results)) } #' @title Calculate Sorensen metric -#' -#' @description Calculate Sorensen metric using overlapping regions between -#' two samples. -#' -#' @param sample01 a \code{GRanges} which contains a collection of -#' genomic ranges representing copy number events for the first sample.
-#' @param sample02 a \code{GRanges} which contains a collection of +#' +#' @description Calculate Sorensen metric using overlapping regions between +#' two samples. +#' +#' @param sample01 a \code{GRanges} which contains a collection of +#' genomic ranges representing copy number events for the first sample. +#' @param sample02 a \code{GRanges} which contains a collection of #' genomic ranges representing copy number events for the second sample. -#' -#' @details -#' +#' +#' @details +#' #' The method calculates the Sorensen metric using overlapping #' regions between the samples. All regions present in both samples are used #' for the calculation of the metric. -#' -#' The Sorensen metric is calculated by dividing twice the size of +#' +#' The Sorensen metric is calculated by dividing twice the size of #' the intersection by the sum of the size of the two sets. If the sum of #' the size of the two sets is zero; the value \code{NA} is #' returned instead. The strand of the regions is not taken into account while #' calculating the intersection. -#' -#' +#' +#' #' @return a \code{numeric}, the value of the Sorensen metric. If #' the metric cannot be calculated, \code{NA} is returned. -#' -#' @references -#' -#' Sørensen, Thorvald. n.d. “A Method of Establishing Groups of Equal -#' Amplitude in Plant Sociology Based on Similarity of Species and Its -#' Application to Analyses of the Vegetation on Danish Commons.” +#' +#' @references +#' +#' Sørensen, Thorvald. n.d. “A Method of Establishing Groups of Equal +#' Amplitude in Plant Sociology Based on Similarity of Species and Its +#' Application to Analyses of the Vegetation on Danish Commons.” #' Biologiske Skrifter, no. 5: 1–34.
-#' +#' #' @examples #' #' ## Load required package to generate the two samples #' require(GenomicRanges) #' #' ## Generate two samples with identical sequence levels -#' sample01 <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1905048, 4554832, 31686841), +#' sample01 <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1905048, 4554832, 31686841), #' end=c(2004603, 4577608, 31695808)), strand="*") -#' sample02 <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1995066, 31611222), +#' sample02 <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1995066, 31611222), #' end=c(2204505, 31689898)), strand="*") -#' -#' ## Calculate Sorensen metric +#' +#' ## Calculate Sorensen metric #' CNVMetrics:::calculateSorensen(sample01=sample01, sample02=sample02) -#' +#' #' @author Astrid Deschênes #' @importFrom GenomicRanges intersect width #' @encoding UTF-8 #' @keywords internal calculateSorensen <- function(sample01, sample02) { - - ## Calculate intersection between the two sets as well as the + + ## Calculate intersection between the two sets as well as the ## total size of each set - inter <- sum(as.numeric(width(intersect(sample01, sample02, + inter <- sum(as.numeric(width(intersect(sample01, sample02, ignore.strand=TRUE)))) widthSample01 <- sum(as.numeric(width(sample01))) widthSample02 <- sum(as.numeric(width(sample02))) - + ## Calculate Sorensen metric if possible; otherwise NA - result <- ifelse((widthSample01 + widthSample02) > 0, + result <- ifelse((widthSample01 + widthSample02) > 0, (2.0 * inter)/(widthSample01 + widthSample02), NA) return(result) } #' @title Calculate Szymkiewicz-Simpson metric -#' -#' @description Calculate Szymkiewicz-Simpson metric using overlapping -#' regions between two samples. -#' -#' @param sample01 a \code{GRanges} which contains a collection of -#' genomic ranges representing copy number events for the first sample.
-#' @param sample02 a \code{GRanges} which contains a collection of +#' +#' @description Calculate Szymkiewicz-Simpson metric using overlapping +#' regions between two samples. +#' +#' @param sample01 a \code{GRanges} which contains a collection of +#' genomic ranges representing copy number events for the first sample. +#' @param sample02 a \code{GRanges} which contains a collection of #' genomic ranges representing copy number events for the second sample. -#' -#' @details -#' +#' +#' @details +#' #' The method calculates the Szymkiewicz-Simpson metric using overlapping #' regions between the samples. All regions present in both samples all used #' for the calculation of the metric. -#' -#' The Szymkiewicz-Simpson metric is calculated by dividing the size of +#' +#' The Szymkiewicz-Simpson metric is calculated by dividing the size of #' the intersection by the smaller of the size of the two sets. If one sample #' has a size of zero, the metric is not calculated; the value \code{NA} is #' returned instead. The strand of the regions is not taken into account while #' calculating the intersection. -#' +#' #' @return a \code{numeric}, the value of the Szymkiewicz-Simpson metric. If #' the metric cannot be calculated, \code{NA} is returned. -#' -#' @references -#' -#' Vijaymeena, M. K, and Kavitha K. 2016. “A Survey on Similarity Measures in -#' Text Mining.” Machine Learning and Applications: An International +#' +#' @references +#' +#' Vijaymeena, M. K, and Kavitha K. 2016. “A Survey on Similarity Measures in +#' Text Mining.” Machine Learning and Applications: An International #' Journal 3 (1): 19–28.
doi: \url{https://doi.org/10.5121/mlaij.2016.3103} -#' +#' #' @examples #' #' ## Load required package to generate the two samples #' require(GenomicRanges) #' #' ## Generate two samples with identical sequence levels -#' sample01 <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1905048, 4554832, 31686841), +#' sample01 <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1905048, 4554832, 31686841), #' end=c(2004603, 4577608, 31695808)), strand="*") -#' sample02 <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1995066, 31611222), +#' sample02 <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1995066, 31611222), #' end=c(2204505, 31689898)), strand=c("+", "-")) -#' +#' #' ## Calculate Szymkiewicz-Simpson metric #' CNVMetrics:::calculateSzymkiewicz(sample01=sample01, sample02=sample02) -#' +#' #' @author Astrid Deschênes #' @importFrom GenomicRanges intersect width #' @encoding UTF-8 #' @keywords internal calculateSzymkiewicz <- function(sample01, sample02) { - - ## Calculate intersection between the two sets as well as the + + ## Calculate intersection between the two sets as well as the ## total size of each set - inter <- sum(as.numeric(width(intersect(sample01, sample02, + inter <- sum(as.numeric(width(intersect(sample01, sample02, ignore.strand=TRUE)))) widthSample01 <- sum(as.numeric(width(sample01))) widthSample02 <- sum(as.numeric(width(sample02))) - + ## Calculate Szymkiewicz-Simpson metric if possible; otherwise NA - result <- ifelse(min(widthSample01,widthSample02) > 0, + result <- ifelse(min(widthSample01,widthSample02) > 0, inter/min(widthSample01,widthSample02), NA) return(result) @@ -256,68 +256,68 @@ calculateSzymkiewicz <- function(sample01, sample02) { #' @title Calculate Jaccard metric -#' -#' @description Calculate Jaccard metric using overlapping regions between -#' two samples. -#' -#' @param sample01 a \code{GRanges} which contains a collection of -#' genomic ranges representing copy number events for the first sample.
-#' @param sample02 a \code{GRanges} which contains a collection of +#' +#' @description Calculate Jaccard metric using overlapping regions between +#' two samples. +#' +#' @param sample01 a \code{GRanges} which contains a collection of +#' genomic ranges representing copy number events for the first sample. +#' @param sample02 a \code{GRanges} which contains a collection of #' genomic ranges representing copy number events for the second sample. -#' -#' @details -#' +#' +#' @details +#' #' The method calculates the Jaccard metric using overlapping #' regions between the samples. All regions present in both samples are used #' for the calculation of the metric. -#' -#' The Jaccard metric is calculated by dividing the size of +#' +#' The Jaccard metric is calculated by dividing the size of #' the intersection by the size of the union of the two sets. If the #' the size of the union of the two sets is zero; the value \code{NA} is #' returned instead. The strand of the regions is not taken into account while #' calculating the intersection. -#' -#' +#' +#' #' @return a \code{numeric}, the value of the Jaccard metric. If #' the metric cannot be calculated, \code{NA} is returned. -#' -#' @references -#' -#' Jaccard, P. (1912), The Distribution of the Flora in the Alpine Zone. -#' New Phytologist, 11: 37-50. +#' +#' @references +#' +#' Jaccard, P. (1912), The Distribution of the Flora in the Alpine Zone. +#' New Phytologist, 11: 37-50. 
#' DOI: \url{https://doi.org/10.1111/j.1469-8137.1912.tb05611.x} -#' +#' #' @examples #' #' ## Load required package to generate the two samples #' require(GenomicRanges) #' #' ## Generate two samples with identical sequence levels -#' sample01 <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1905048, 4554832, 31686841), +#' sample01 <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1905048, 4554832, 31686841), #' end=c(2004603, 4577608, 31695808)), strand="*") -#' sample02 <- GRanges(seqnames="chr1", -#' ranges=IRanges(start=c(1995066, 31611222), +#' sample02 <- GRanges(seqnames="chr1", +#' ranges=IRanges(start=c(1995066, 31611222), #' end=c(2204505, 31689898)), strand="*") -#' -#' ## Calculate Sorensen metric +#' +#' ## Calculate Sorensen metric #' CNVMetrics:::calculateJaccard(sample01=sample01, sample02=sample02) -#' +#' #' @author Astrid Deschênes #' @importFrom GenomicRanges intersect width #' @encoding UTF-8 #' @keywords internal calculateJaccard <- function(sample01, sample02) { - - ## Calculate intersection between the two sets as well as the + + ## Calculate intersection between the two sets as well as the ## total size of each set - inter <- sum(as.numeric(width(intersect(sample01, sample02, + inter <- sum(as.numeric(width(intersect(sample01, sample02, ignore.strand=TRUE)))) widthSample01 <- sum(as.numeric(width(sample01))) widthSample02 <- sum(as.numeric(width(sample02))) - + ## Calculate Jaccard metric if possible; otherwise NA - result <- ifelse((widthSample01 + widthSample02 - inter) > 0, + result <- ifelse((widthSample01 + widthSample02 - inter) > 0, (inter)/(widthSample01 + widthSample02 - inter), NA) return(result) diff --git a/R/CNVMetricsSimulations.R b/R/CNVMetricsSimulations.R index abdb22e..3e9f679 100644 --- a/R/CNVMetricsSimulations.R +++ b/R/CNVMetricsSimulations.R @@ -29,23 +29,20 @@ #' @param nbSim a single positive \code{integer} which is corresponding to #' the number of simulations that will be generated.
#' -#' @details TODO -#' -#' -#' @return a code{list} containing one entry per simulation. Each entry is +#' @return a \code{list} containing one entry per simulation. Each entry is #' a \code{data.frame} containing shuffled segments with 6 columns: #' \itemize{ -#' \item{\code{ID}}{ The name of the simulation. } -#' \item{\code{chr}}{ The name fo the chromosome. } -#' \item{\code{start}}{ The starting position of the segment; the positions +#' \item{\code{ID} The name of the simulation. } +#' \item{\code{chr} The name of the chromosome. } +#' \item{\code{start} The starting position of the segment; the positions #' are between zero and one. The segment width is representing the #' proportional size of the segment relative to the global segment size.} -#' \item{\code{end}}{ The ending position of the segment; the positions +#' \item{\code{end} The ending position of the segment; the positions #' are between zero and one. The segment width is representing the #' proportional size of the segment relative to the global segment size. } -#' \item{\code{log2ratio}} { The log2 copy number ratio assigned to +#' \item{\code{log2ratio} The log2 copy number ratio assigned to #' the segment. } -#' \item{\code{state}} { The state of the region (ex: DELETION, LOH, +#' \item{\code{state} The state of the region (ex: DELETION, LOH, #' AMPLIFICATION, NEUTRAL, etc.). } #' } #' @@ -67,7 +64,6 @@ #' "DELETION", "NEUTRAL", "NEUTRAL"), #' log2ratio=(c(0.5849625, 0, -1, -1, -0.87777, 0, 0))) #' -#' #' ## Generates 10 simulated chromosomes (one chromosome per simulated sample) #' ## based on chromosome 2 from the input sample.
#' ## The shuffled chromosomes have a start and an end between 0 an 1 @@ -142,7 +138,6 @@ simChr <- function(curSample, chrCur, nbSim) { partEvents <- matrix(rep(listEvents, nbSim), ncol=1) } - ## Final returned list with all simulated samples res <- list() @@ -164,7 +159,7 @@ simChr <- function(curSample, chrCur, nbSim) { } -#' @title TODO +#' @title Generates a simulation for one chromosome #' #' @description TODO #' @@ -180,18 +175,14 @@ simChr <- function(curSample, chrCur, nbSim) { #' simulated chromosome (shuffled segments). The starting position and the #' ending position of the segments should be between zero and one. The segment #' width is representing the proportional size of the segment relative to the -#' global segment size for the chromosome.The \code{data.frame} columns names +#' global segment size for the chromosome. The \code{data.frame} columns names #' should be: 'ID', 'chr', 'start', 'end', 'log2ratio', 'state'. #' #' @param chrCur a \code{character} string representing the name of the #' chromosome. #' -#' @details TODO -#' -#' -#' @return df TODO -#' -#' +#' @return a \code{data.frame} containing the simulation for the specified +#' chromosome. #' #' @examples #' @@ -253,7 +244,6 @@ processChr <- function(curSample, simChr, chrCur) { end=listStart[-1]) listHole <- listHole[listHole$end - listHole$start > 1,] - ## Calculates the number of bases that each segment in the simulated ## chromosome should occupy in the final chromosome ## The proportion of each simulated segment is preserved when @@ -325,26 +315,21 @@ processChr <- function(curSample, simChr, chrCur) { #' @param nbSim a single positive \code{integer} which is corresponding to the #' number of simulations that will be generated. #' -#' @details TODO -#' -#' #' @return a \code{data.frame} containing the segments for each #' simulated sample. 
The \code{data.frame} has 6 columns: #' \itemize{ -#' \item{\code{ID}}{ a \code{character} string, the name of the simulated +#' \item{\code{ID} a \code{character} string, the name of the simulated #' sample } -#' \item{\code{chr}}{ a \code{character} string, the name fo the chromosome } -#' \item{\code{start}}{ a \code{integer}, the starting position of the +#' \item{\code{chr} a \code{character} string, the name of the chromosome } +#' \item{\code{start} a \code{integer}, the starting position of the #' segment } -#' \item{\code{end}}{ a \code{integer}, the ending position of the segment } -#' \item{\code{log2ratio}} { a \code{numerical}, the log2 copy number +#' \item{\code{end} a \code{integer}, the ending position of the segment } +#' \item{\code{log2ratio} a \code{numerical}, the log2 copy number #' ratio assigned to the segment } -#' \item{\code{state}} { a \code{character} string, the state of the segment +#' \item{\code{state} a \code{character} string, the state of the segment #' (ex: DELETION, AMPLIFICATION, NEUTRAL, etc.) } #' } #' -#' -#' #' @examples #' #' ## Load required package to generate the sample diff --git a/man/CNVMetrics-package.Rd b/man/CNVMetrics-package.Rd index a2bc153..2f5dc11 100644 --- a/man/CNVMetrics-package.Rd +++ b/man/CNVMetrics-package.Rd @@ -20,12 +20,12 @@ Finally, a visualization tool is provided to explore resulting metrics.
} \seealso{ \itemize{ - \item \code{\link{calculateOverlapMetric}} {for calculating metric + \item{\code{\link{calculateOverlapMetric}} for calculating metric using overlapping amplified/deleted regions} - \item \code{\link{calculateLog2ratioMetric}} {for calculating metric + \item{\code{\link{calculateLog2ratioMetric}} for calculating metric using log2ratio values} - \item \code{\link{processSim}} {for generating simulations} - \item \code{\link{plotMetric}} {for plotting metrics} + \item{\code{\link{processSim}} for generating simulations} + \item{\code{\link{plotMetric}} for plotting metrics} } } \author{ diff --git a/man/calculateJaccard.Rd b/man/calculateJaccard.Rd index fb1a0fb..bc8958a 100644 --- a/man/calculateJaccard.Rd +++ b/man/calculateJaccard.Rd @@ -8,10 +8,10 @@ calculateJaccard(sample01, sample02) } \arguments{ -\item{sample01}{a \code{GRanges} which contains a collection of +\item{sample01}{a \code{GRanges} which contains a collection of genomic ranges representing copy number events for the first sample.} -\item{sample02}{a \code{GRanges} which contains a collection of +\item{sample02}{a \code{GRanges} which contains a collection of genomic ranges representing copy number events for the second sample.} } \value{ @@ -19,7 +19,7 @@ a \code{numeric}, the value of the Jaccard metric. If the metric cannot be calculated, \code{NA} is returned. } \description{ -Calculate Jaccard metric using overlapping regions between +Calculate Jaccard metric using overlapping regions between two samples. } \details{ @@ -27,7 +27,7 @@ The method calculates the Jaccard metric using overlapping regions between the samples. All regions present in both samples are used for the calculation of the metric. -The Jaccard metric is calculated by dividing the size of +The Jaccard metric is calculated by dividing the size of the intersection by the size of the union of the two sets. If the the size of the union of the two sets is zero; the value \code{NA} is returned instead. 
The strand of the regions is not taken into account while @@ -39,20 +39,20 @@ calculating the intersection. require(GenomicRanges) ## Generate two samples with identical sequence levels -sample01 <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1905048, 4554832, 31686841), +sample01 <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1905048, 4554832, 31686841), end=c(2004603, 4577608, 31695808)), strand="*") -sample02 <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1995066, 31611222), +sample02 <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1995066, 31611222), end=c(2204505, 31689898)), strand="*") -## Calculate Sorensen metric +## Calculate Sorensen metric CNVMetrics:::calculateJaccard(sample01=sample01, sample02=sample02) - + } \references{ -Jaccard, P. (1912), The Distribution of the Flora in the Alpine Zone. -New Phytologist, 11: 37-50. +Jaccard, P. (1912), The Distribution of the Flora in the Alpine Zone. +New Phytologist, 11: 37-50. DOI: \url{https://doi.org/10.1111/j.1469-8137.1912.tb05611.x} } \author{ diff --git a/man/calculateLog2ratioMetric.Rd b/man/calculateLog2ratioMetric.Rd index af8dde1..c37a980 100644 --- a/man/calculateLog2ratioMetric.Rd +++ b/man/calculateLog2ratioMetric.Rd @@ -39,7 +39,7 @@ Default: \code{1} and always \code{1} for Windows.} an object of class "\code{CNVMetric}" which contains the calculated metric. This object is a list with the following components: \itemize{ -\item{\code{LOG2RATIO}}{ a lower-triangular \code{matrix} with the +\item{\code{LOG2RATIO} a lower-triangular \code{matrix} with the results of the selected metric on the log2ratio values for each paired samples. The value \code{NA} is present when the metric cannot be calculated. The value \code{NA} is also present in the top-triangular @@ -49,11 +49,10 @@ metric. 
This object is a list with the following components: The object has the following attributes (besides "class" equal to "CNVMetric"): \itemize{ -\item{\code{metric}}{ the metric used for the calculation. +\item{\code{metric} the metric used for the calculation. } +\item{\code{names} the names of the two matrix containing the metrics for +the amplified and deleted regions. } } -\item{\code{names}}{ the names of the two matrix containing the metrics for -the amplified and deleted regions. -}} } \description{ This function calculates a specific metric, as specified diff --git a/man/calculateOneLog2valueMetricT.Rd b/man/calculateOneLog2valueMetricT.Rd index b69e131..5b289e8 100644 --- a/man/calculateOneLog2valueMetricT.Rd +++ b/man/calculateOneLog2valueMetricT.Rd @@ -14,54 +14,52 @@ calculateOneLog2valueMetricT( ) } \arguments{ -\item{entry}{a \code{list} which contains the row and column indexes -(always in this order) of -the metric in the final matrix. Those values correspond to the positions +\item{entry}{a \code{list} which contains the row and column indexes +(always in this order) of +the metric in the final matrix. Those values correspond to the positions of the two samples used to calculate the metric in the \code{GRangesList} (\code{segmentData}).} -\item{segmentData}{a \code{GRangesList} that contains a collection of -genomic ranges representing copy number events, including amplified/deleted -status, from at least 2 samples. All samples must have a metadata column +\item{segmentData}{a \code{GRangesList} that contains a collection of +genomic ranges representing copy number events, including amplified/deleted +status, from at least 2 samples. 
All samples must have a metadata column called '\code{log2ratio}' with the log2ratio values.} \item{method}{a \code{character} string representing the metric to be used ('\code{weightedEuclideanDistance}').} -\item{minThreshold}{a single \code{numeric} setting the minimum value -to consider two segments as different during the metric calculation. If the -absolute difference is below or equal to threshold, the difference will be +\item{minThreshold}{a single \code{numeric} setting the minimum value +to consider two segments as different during the metric calculation. If the +absolute difference is below or equal to threshold, the difference will be replaced by zero.} -\item{bedExclusion}{an optional \code{GRanges} containing the regions -that have to be excluded for the metric calculation or code{NULL}.} +\item{bedExclusion}{an optional \code{GRanges} containing the regions +that have to be excluded for the metric calculation or \code{NULL}.} } \value{ a \code{list} containing 1 entry: \itemize{ -\item{\code{metric}}{ a \code{data.frame}, which contains 3 columns. The 2 -first columns, called \code{row} and \code{column} correspond to the + \item{\code{metric} a \code{data.frame}, which contains 3 columns. The 2 +first columns, called \code{row} and \code{column} correspond to the indexes of the metric in the final matrix. Those -2 first columns match to the \code{entry} parameter. The third column, -called \code{metric}, -contains the values of the specified metric for each combination. -If the metric cannot be calculated, \code{NA} is present. -} +2 first columns match to the \code{entry} parameter. The third column, +called \code{metric}, +contains the values of the specified metric for each combination. +If the metric cannot be calculated, \code{NA} is present. } } } \description{ -Calculate a specific metric using the level of -amplification/deletion, in log2 ratio, between -two samples. 
+Calculate a specific metric using the level of +amplification/deletion, in log2 ratio, between two samples. } \details{ The method calculates a specified metric using overlapping regions between the samples. Only regions corresponding to the type -specified by user are used in the calculation of the metric. The strand of +specified by user are used in the calculation of the metric. The strand of the regions is not taken into account while calculating the metric. -The Sorensen metric is calculated by dividing twice the size of +The Sorensen metric is calculated by dividing twice the size of the intersection by the sum of the size of the two sets. If the sum of the size of the two sets is zero; the value \code{NA} is returned instead. @@ -76,21 +74,21 @@ require(GenomicRanges) demo <- GRangesList() ## Generate two samples with log2value information as a metadata column -demo[["sample01"]] <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(100, 201, 400), +demo[["sample01"]] <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(100, 201, 400), end=c(200, 350, 500)), strand="*", log2ratio=c(1.1111, 2.2222, -0.9999)) -demo[["sample02"]] <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(150, 200, 450), +demo[["sample02"]] <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(150, 200, 450), end=c(250, 350, 500)), strand="*", log2ratio=c(2.2121, 1.1212, -1.3939)) ## The 2 samples used to calculate the metric -entries <- data.frame(row=c(2), col=c(1)) +entries <- data.frame(row=c(2), col=c(1)) -## Calculate weighted Euclidean distance -CNVMetrics:::calculateOneLog2valueMetricT(entry=entries, - segmentData=demo, method="weightedEuclideanDistance", +## Calculate weighted Euclidean distance +CNVMetrics:::calculateOneLog2valueMetricT(entry=entries, + segmentData=demo, method="weightedEuclideanDistance", minThreshold=0.2, bedExclusion=NULL) diff --git a/man/calculateOneOverlapMetricT.Rd b/man/calculateOneOverlapMetricT.Rd index d9d6d75..4950c19 100644 --- 
a/man/calculateOneOverlapMetricT.Rd +++ b/man/calculateOneOverlapMetricT.Rd @@ -3,45 +3,45 @@ \encoding{UTF-8} \name{calculateOneOverlapMetricT} \alias{calculateOneOverlapMetricT} -\title{Calculate metric using overlapping amplified/deleted regions between +\title{Calculate metric using overlapping amplified/deleted regions between two samples.} \usage{ calculateOneOverlapMetricT(entry, segmentData, method, type) } \arguments{ -\item{entry}{a \code{list} which contains the row and column indexes -(always in this order) of -the metric in the final matrix. Those values correspond to the positions +\item{entry}{a \code{list} which contains the row and column indexes +(always in this order) of +the metric in the final matrix. Those values correspond to the positions of the two samples used to calculate the metric in the \code{GRangesList} (\code{segmentData}).} -\item{segmentData}{a \code{GRangesList} that contains a collection of -genomic ranges representing copy number events, including amplified/deleted -status, from at least 2 samples. All samples must have a metadata column -called '\code{state}' with a state, in an character string format, +\item{segmentData}{a \code{GRangesList} that contains a collection of +genomic ranges representing copy number events, including amplified/deleted +status, from at least 2 samples. All samples must have a metadata column +called '\code{state}' with a state, in an character string format, specified for each region (ex: DELETION, LOH, AMPLIFICATION, NEUTRAL, etc.).} \item{method}{a \code{character} string representing the metric to be used ('\code{sorensen}' or '\code{szymkiewicz}'.} -\item{type}{a \code{character} string representing the type of +\item{type}{a \code{character} string representing the type of copy number events to be used ('\code{AMPLIFICATION}' or '\code{DELETION}').} } \value{ a \code{list} containing 1 entry: \itemize{ -\item{\code{metric}}{ a \code{data.frame}, which contains 3 columns. 
The 2 -first columns, called \code{row} and \code{column} correspond to the +\item{\code{metric} a \code{data.frame}, which contains 3 columns. The 2 +first columns, called \code{row} and \code{column} correspond to the indexes of the metric in the final matrix. Those -2 first columns match to the \code{entry} parameter. The third column, -called \code{metric}, -contains the values of the specified metric for each combination. -If the metric cannot be calculated, \code{NA} is present. +2 first columns match to the \code{entry} parameter. The third column, +called \code{metric}, +contains the values of the specified metric for each combination. +If the metric cannot be calculated, \code{NA} is present. } } } \description{ -Calculate a specific metric using overlapping +Calculate a specific metric using overlapping amplified/deleted regions between two samples. } \examples{ @@ -52,39 +52,39 @@ require(GenomicRanges) ## Create a GRangesList object with 3 samples ## The stand of the regions doesn't affect the calculation of the metric demo <- GRangesList() -demo[["sample01"]] <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1905048, 4554832, 31686841, 32686222), +demo[["sample01"]] <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1905048, 4554832, 31686841, 32686222), end=c(2004603, 4577608, 31695808, 32689222)), strand="*", state=c("AMPLIFICATION", "AMPLIFICATION", "DELETION", "LOH")) -demo[["sample02"]] <- GRanges(seqnames="chr1", - ranges= IRanges(start=c(1995066, 31611222, 31690000, 32006222), - end=c(2204505, 31689898, 31895666, 32789233)), +demo[["sample02"]] <- GRanges(seqnames="chr1", + ranges= IRanges(start=c(1995066, 31611222, 31690000, 32006222), + end=c(2204505, 31689898, 31895666, 32789233)), strand=c("-", "+", "+", "+"), state=c("AMPLIFICATION", "AMPLIFICATION", "DELETION", "LOH")) -## The amplified region in sample03 is a subset of the amplified regions +## The amplified region in sample03 is a subset of the amplified regions ## in sample01 
-demo[["sample03"]] <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1906069, 4558838), +demo[["sample03"]] <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1906069, 4558838), end=c(1909505, 4570601)), strand="*", state=c("AMPLIFICATION", "DELETION")) ## The 2 samples used to calculate the metric -entries <- data.frame(row=c(2, 3), col=c(1, 1)) +entries <- data.frame(row=c(2, 3), col=c(1, 1)) -## Calculate Sorensen metric for the amplified regions on samples 2 and 3 +## Calculate Sorensen metric for the amplified regions on samples 2 and 3 CNVMetrics:::calculateOneOverlapMetricT(entry=entries, segmentData=demo, method="sorensen", type="AMPLIFICATION") -## Calculate Szymkiewicz-Simpson metric for the amplified regions -## in samples 1 and 2 -## Amplified regions of sample02 are a subset of the amplified +## Calculate Szymkiewicz-Simpson metric for the amplified regions +## in samples 1 and 2 +## Amplified regions of sample02 are a subset of the amplified ## regions in sample01 CNVMetrics:::calculateOneOverlapMetricT(entry=entries, segmentData=demo, method="szymkiewicz", type="AMPLIFICATION") -## Calculate Sorensen metric for the deleted regions in samples 1 and 2 +## Calculate Sorensen metric for the deleted regions in samples 1 and 2 CNVMetrics:::calculateOneOverlapMetricT(entry=entries, segmentData=demo, method="sorensen", type="DELETION") diff --git a/man/calculateOverlapMetric.Rd b/man/calculateOverlapMetric.Rd index 6edd5f9..dc44706 100644 --- a/man/calculateOverlapMetric.Rd +++ b/man/calculateOverlapMetric.Rd @@ -36,7 +36,7 @@ an object of class "\code{CNVMetric}" which contains the calculated metric. This object is a list where each entry corresponds to one state specified in the '\code{states}' parameter. Each entry is a \code{matrix}: \itemize{ -\item{\code{state}}{ a lower-triangular \code{matrix} with the +\item{\code{state} a lower-triangular \code{matrix} with the results of the selected metric on the amplified regions for each paired samples. 
The value \code{NA} is present when the metric cannot be calculated. The value \code{NA} is also present in the top-triangular @@ -47,11 +47,10 @@ specified in the '\code{states}' parameter. Each entry is a \code{matrix}: The object has the following attributes (besides "class" equal to "CNVMetric"): \itemize{ -\item{\code{metric}}{ the metric used for the calculation. + \item{\code{metric} the metric used for the calculation.} + \item{\code{names} the names of the two matrices containing the metrics + for the amplified and deleted regions.} } -\item{\code{names}}{ the names of the two matrix containing the metrics for -the amplified and deleted regions. -}} } \description{ This function calculates a specific metric, as specified by diff --git a/man/calculateSorensen.Rd b/man/calculateSorensen.Rd index 0b11d49..f5cc94f 100644 --- a/man/calculateSorensen.Rd +++ b/man/calculateSorensen.Rd @@ -8,10 +8,10 @@ calculateSorensen(sample01, sample02) } \arguments{ -\item{sample01}{a \code{GRanges} which contains a collection of +\item{sample01}{a \code{GRanges} which contains a collection of genomic ranges representing copy number events for the first sample.} -\item{sample02}{a \code{GRanges} which contains a collection of +\item{sample02}{a \code{GRanges} which contains a collection of genomic ranges representing copy number events for the second sample.} } \value{ @@ -19,7 +19,7 @@ a \code{numeric}, the value of the Sorensen metric. If the metric cannot be calculated, \code{NA} is returned. } \description{ -Calculate Sorensen metric using overlapping regions between +Calculate Sorensen metric using overlapping regions between two samples. } \details{ @@ -27,7 +27,7 @@ The method calculates the Sorensen metric using overlapping regions between the samples. All regions present in both samples are used for the calculation of the metric. 
-The Sorensen metric is calculated by dividing twice the size of +The Sorensen metric is calculated by dividing twice the size of the intersection by the sum of the size of the two sets. If the sum of the size of the two sets is zero; the value \code{NA} is returned instead. The strand of the regions is not taken into account while @@ -39,21 +39,21 @@ calculating the intersection. require(GenomicRanges) ## Generate two samples with identical sequence levels -sample01 <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1905048, 4554832, 31686841), +sample01 <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1905048, 4554832, 31686841), end=c(2004603, 4577608, 31695808)), strand="*") -sample02 <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1995066, 31611222), +sample02 <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1995066, 31611222), end=c(2204505, 31689898)), strand="*") -## Calculate Sorensen metric +## Calculate Sorensen metric CNVMetrics:::calculateSorensen(sample01=sample01, sample02=sample02) - + } \references{ -SĂžrensen, Thorvald. n.d. “A Method of Establishing Groups of Equal -Amplitude in Plant Sociology Based on Similarity of Species and Its -Application to Analyses of the Vegetation on Danish Commons.” +Sørensen, Thorvald. n.d. “A Method of Establishing Groups of Equal +Amplitude in Plant Sociology Based on Similarity of Species and Its +Application to Analyses of the Vegetation on Danish Commons.” Biologiske Skrifter, no. 5: 1–34. 
} \author{ diff --git a/man/calculateSzymkiewicz.Rd b/man/calculateSzymkiewicz.Rd index dd45d89..7f68f62 100644 --- a/man/calculateSzymkiewicz.Rd +++ b/man/calculateSzymkiewicz.Rd @@ -8,10 +8,10 @@ calculateSzymkiewicz(sample01, sample02) } \arguments{ -\item{sample01}{a \code{GRanges} which contains a collection of +\item{sample01}{a \code{GRanges} which contains a collection of genomic ranges representing copy number events for the first sample.} -\item{sample02}{a \code{GRanges} which contains a collection of +\item{sample02}{a \code{GRanges} which contains a collection of genomic ranges representing copy number events for the second sample.} } \value{ @@ -19,7 +19,7 @@ a \code{numeric}, the value of the Szymkiewicz-Simpson metric. If the metric cannot be calculated, \code{NA} is returned. } \description{ -Calculate Szymkiewicz-Simpson metric using overlapping +Calculate Szymkiewicz-Simpson metric using overlapping regions between two samples. } \details{ @@ -27,7 +27,7 @@ The method calculates the Szymkiewicz-Simpson metric using overlapping regions between the samples. All regions present in both samples all used for the calculation of the metric. -The Szymkiewicz-Simpson metric is calculated by dividing the size of +The Szymkiewicz-Simpson metric is calculated by dividing the size of the intersection by the smaller of the size of the two sets. If one sample has a size of zero, the metric is not calculated; the value \code{NA} is returned instead. The strand of the regions is not taken into account while @@ -39,20 +39,20 @@ calculating the intersection. 
require(GenomicRanges) ## Generate two samples with identical sequence levels -sample01 <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1905048, 4554832, 31686841), +sample01 <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1905048, 4554832, 31686841), end=c(2004603, 4577608, 31695808)), strand="*") -sample02 <- GRanges(seqnames="chr1", - ranges=IRanges(start=c(1995066, 31611222), +sample02 <- GRanges(seqnames="chr1", + ranges=IRanges(start=c(1995066, 31611222), end=c(2204505, 31689898)), strand=c("+", "-")) ## Calculate Szymkiewicz-Simpson metric CNVMetrics:::calculateSzymkiewicz(sample01=sample01, sample02=sample02) - + } \references{ -Vijaymeena, M. K, and Kavitha K. 2016. “A Survey on Similarity Measures in -Text Mining.” Machine Learning and Applications: An International +Vijaymeena, M. K, and Kavitha K. 2016. “A Survey on Similarity Measures in +Text Mining.” Machine Learning and Applications: An International Journal 3 (1): 19–28. doi: \url{https://doi.org/10.5121/mlaij.2016.3103} } \author{ diff --git a/man/calculateWeightedEuclideanDistanceFor2Samples.Rd b/man/calculateWeightedEuclideanDistanceFor2Samples.Rd index 2ee52e8..a51cbfd 100644 --- a/man/calculateWeightedEuclideanDistanceFor2Samples.Rd +++ b/man/calculateWeightedEuclideanDistanceFor2Samples.Rd @@ -7,34 +7,34 @@ calculateWeightedEuclideanDistanceFor2Samples(segmentData, minThreshold) } \arguments{ -\item{segmentData}{a \code{list} marked as a \code{preMetricSegments} -\code{class} that contains the disjoint segment information from 2 +\item{segmentData}{a \code{list} marked as a \code{preMetricSegments} +\code{class} that contains the disjoint segment information from 2 samples and the log2ratio values of the samples in the metadata columns.} -\item{minThreshold}{a single \code{numeric} setting the minimum value -to consider two segments as different for the metric calculation. 
If the -absolute difference is below or equal to threshold, the value will be +\item{minThreshold}{a single \code{numeric} setting the minimum value +to consider two segments as different for the metric calculation. If the +absolute difference is below or equal to threshold, the value will be replaced by zero.} } \value{ -a \code{numeric} representing the weighted euclidean distance +a \code{numeric} representing the weighted euclidean distance between the two samples. If the distance cannot be calculated as the two samples don't share any segments with log2ratio value, the value NA is assigned. } \description{ The weighted Euclidean distance-based metric corresponds to the -euclidean distance between 2 samples multiplied by the natural logarithm +euclidean distance between 2 samples multiplied by the natural logarithm of the number of bases of the analyzed segment. The final metric is 1 over -1 added to the -squared sum of the values obtained for all segments that are not +1 added to the +squared sum of the values obtained for all segments that are not excluded of the analysis. } \details{ -The weighted euclidean distance is -\eqn{1/(1 + (\sum((x_i - y_i)^2 * log2(nbrBases_i))^0.5)} +The weighted euclidean distance is +\eqn{1/(1 + (\sum((x_i - y_i)^2 * log2(nbrBases_i)))^0.5)} where \code{x} and \code{y} are the -values of 2 samples for a specific segment \code{i} and \code{nbrBases} the +values of 2 samples for a specific segment \code{i} and \code{nbrBases} the number of bases of the segment \code{i}. 
} \examples{ @@ -55,7 +55,7 @@ sample02 <- GRanges(seqnames="chr1", # Create disjoint segment using the 2 samples and without any region # excluded from the analysis (parameter bedExclusion set to null) disjoinGRange <- CNVMetrics:::createDisjoinSegmentsForTwoSamples( - segmentDataSample1=sample01, segmentDataSample2=sample02, + segmentDataSample1=sample01, segmentDataSample2=sample02, bedExclusion=NULL) ## Calculate the weighted ecucidean distance between the two samples diff --git a/man/createDisjoinSegmentsForTwoSamples.Rd b/man/createDisjoinSegmentsForTwoSamples.Rd index af569fa..4cb1d3c 100644 --- a/man/createDisjoinSegmentsForTwoSamples.Rd +++ b/man/createDisjoinSegmentsForTwoSamples.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/CNVMetricsLog2ratioInternalMethods.R \name{createDisjoinSegmentsForTwoSamples} \alias{createDisjoinSegmentsForTwoSamples} -\title{Generate common segments to enable calculation of metrics on +\title{Generate common segments to enable calculation of metrics on two segmented samples.} \usage{ createDisjoinSegmentsForTwoSamples( @@ -12,10 +12,10 @@ createDisjoinSegmentsForTwoSamples( ) } \arguments{ -\item{segmentDataSample1}{a \code{GRanges}, the segments from the first +\item{segmentDataSample1}{a \code{GRanges}, the segments from the first sample.} -\item{segmentDataSample2}{a \code{GRanges}, the segments from the second +\item{segmentDataSample2}{a \code{GRanges}, the segments from the second sample.} \item{bedExclusion}{a \code{GRanges}, the regions that must be @@ -23,15 +23,15 @@ excluded from the analysis. Default: \code{NULL}.} } \value{ a \code{GRanges} containing the common segment information for the -two samples. The log2ration value are present, for the two samples, in -the metadata columns. When there is not log2ratio value for one sample, +two samples. The log2ratio values are present, for the two samples, in +the metadata columns. When there is no log2ratio value for one sample, NA is the assigned value. 
A metadata column also specifies if the segments should be included in the analysis. } \description{ -The two segments are gathered together, including excluded -regions when specified, and a disjoint operation is done to create a -collection of non-overlapping ranges. The ranges overlapping the excluded +The two segments are gathered together, including excluded +regions when specified, and a disjoint operation is done to create a +collection of non-overlapping ranges. The ranges overlapping the excluded regions are marked as so to be removed from future analysis. The log2value of each samples are assigned to the new disjointed segments for each sample in the metadata columns. @@ -53,7 +53,7 @@ sample02 <- GRanges(seqnames="chr1", # Create disjoint segment using the 2 samples and without any region # excluded from the analysis (parameter bedExclusion set to null) -CNVMetrics:::createDisjoinSegmentsForTwoSamples(segmentDataSample1=sample01, +CNVMetrics:::createDisjoinSegmentsForTwoSamples(segmentDataSample1=sample01, segmentDataSample2=sample02, bedExclusion=NULL) } diff --git a/man/processChr.Rd b/man/processChr.Rd index e02ec4d..1aea287 100644 --- a/man/processChr.Rd +++ b/man/processChr.Rd @@ -3,7 +3,7 @@ \encoding{UTF-8} \name{processChr} \alias{processChr} -\title{TODO} +\title{Generates a simulation for one chromosome} \usage{ processChr(curSample, simChr, chrCur) } @@ -20,21 +20,19 @@ number ratios.} simulated chromosome (shuffled segments). The starting position and the ending position of the segments should be between zero and one. The segment width is representing the proportional size of the segment relative to the -global segment size for the chromosome.The \code{data.frame} columns names +global segment size for the chromosome. 
The \code{data.frame} columns names should be: 'ID', 'chr', 'start', 'end', 'log2ratio', 'state'.} \item{chrCur}{a \code{character} string representing the name of the chromosome.} } \value{ -df TODO +a \code{data.frame} containing the simulation for the specified +chromosome. } \description{ TODO } -\details{ -TODO -} \examples{ ## Load required package to generate the samples diff --git a/man/processSim.Rd b/man/processSim.Rd index 0db5d00..586c4db 100644 --- a/man/processSim.Rd +++ b/man/processSim.Rd @@ -24,15 +24,15 @@ number of simulations that will be generated.} a \code{data.frame} containing the segments for each simulated sample. The \code{data.frame} has 6 columns: \itemize{ -\item{\code{ID}}{ a \code{character} string, the name of the simulated +\item{\code{ID} a \code{character} string, the name of the simulated sample } -\item{\code{chr}}{ a \code{character} string, the name fo the chromosome } -\item{\code{start}}{ a \code{integer}, the starting position of the +\item{\code{chr} a \code{character} string, the name of the chromosome } +\item{\code{start} an \code{integer}, the starting position of the segment } -\item{\code{end}}{ a \code{integer}, the ending position of the segment } -\item{\code{log2ratio}} { a \code{numerical}, the log2 copy number +\item{\code{end} an \code{integer}, the ending position of the segment } +\item{\code{log2ratio} a \code{numeric}, the log2 copy number ratio assigned to the segment } -\item{\code{state}} { a \code{character} string, the state of the segment +\item{\code{state} a \code{character} string, the state of the segment (ex: DELETION, AMPLIFICATION, NEUTRAL, etc.) } } } @@ -44,9 +44,6 @@ To generate realistic simulations, the specified sample must contain segments covering the majority of the genome. Most importantly, the NEUTRAL segments should be present. 
} -\details{ -TODO -} \examples{ ## Load required package to generate the sample diff --git a/man/simChr.Rd b/man/simChr.Rd index 85d2cf1..9a0b123 100644 --- a/man/simChr.Rd +++ b/man/simChr.Rd @@ -23,20 +23,20 @@ chromosome that is used as reference for the simulation.} the number of simulations that will be generated.} } \value{ -a code{list} containing one entry per simulation. Each entry is +a \code{list} containing one entry per simulation. Each entry is a \code{data.frame} containing shuffled segments with 6 columns: \itemize{ -\item{\code{ID}}{ The name of the simulation. } -\item{\code{chr}}{ The name fo the chromosome. } -\item{\code{start}}{ The starting position of the segment; the positions +\item{\code{ID} The name of the simulation. } +\item{\code{chr} The name of the chromosome. } +\item{\code{start} The starting position of the segment; the positions are between zero and one. The segment width is representing the proportional size of the segment relative to the global segment size.} -\item{\code{end}}{ The ending position of the segment; the positions +\item{\code{end} The ending position of the segment; the positions are between zero and one. The segment width is representing the proportional size of the segment relative to the global segment size. } -\item{\code{log2ratio}} { The log2 copy number ratio assigned to +\item{\code{log2ratio} The log2 copy number ratio assigned to the segment. } -\item{\code{state}} { The state of the region (ex: DELETION, LOH, +\item{\code{state} The state of the region (ex: DELETION, LOH, AMPLIFICATION, NEUTRAL, etc.). } } } @@ -55,9 +55,6 @@ To ensure valuable results, the reference sample should have segments covering a good proportion of the chromosome; those should include NEUTRAL segments. 
} -\details{ -TODO -} \examples{ ## Load required package to generate the samples @@ -76,7 +73,6 @@ sample01 <- GRanges(seqnames=c(rep("chr1", 4), rep("chr2", 3)), "DELETION", "NEUTRAL", "NEUTRAL"), log2ratio=(c(0.5849625, 0, -1, -1, -0.87777, 0, 0))) - ## Generates 10 simulated chromosomes (one chromosome per simulated sample) ## based on chromosome 2 from the input sample. ## The shuffled chromosomes have a start and an end between 0 an 1