# readMSfile
#' Read .mzXML files and initiate msobject
#'
#' Read .mzXML files and initiate msobject
#'
#' @param file file path for a .mzXML file
#'
#' @return msobject
#'
#' @keywords internal
#'
#' @author M Isabel Alcoriza-Balaguer <maribel_alcoriza@iislafe.es>
readMSfile <- function(file){
  # 1. read data with readMzXmlFile
  ms <- readMzXmlData::readMzXmlFile(file.path(file))

  # 2. Extract metaData (general and scan by scan)
  generalMetadata <- list(file = file, scans = length(ms),
                          startTime = ms[[1]]$metaData$startTime,
                          endTime = ms[[1]]$metaData$endTime,
                          collisionEnergies = unique(unlist(lapply(ms, function(x) if(!is.null(x$metaData$collisionEnergy)){x$metaData$collisionEnergy} else {0}))))
  scansMetadata <-
    data.frame(msLevel = unlist(lapply(ms, function(x) x$metaData$msLevel)),
               polarity = unlist(lapply(ms, function(x) x$metaData$polarity)),
               scanType = unlist(lapply(ms, function(x) x$metaData$scanType)),
               centroided = unlist(lapply(ms, function(x) x$metaData$centroided)),
               retentionTime = unlist(lapply(ms, function(x) x$metaData$retentionTime)),
               peaksCount = unlist(lapply(ms, function(x) x$metaData$peaksCount)),
               lowMz = unlist(lapply(ms, function(x) x$metaData$lowMz)),
               highMz = unlist(lapply(ms, function(x) x$metaData$highMz)),
               basePeakMz = unlist(lapply(ms, function(x) x$metaData$basePeakMz)),
               basePeakInt = unlist(lapply(ms, function(x) x$metaData$basePeakInt)),
               totIonCurrent = unlist(lapply(ms, function(x) x$metaData$totIonCurrent)),
               precursor = unlist(lapply(ms, function(x) if(!is.null(x$metaData$precursorMz)){x$metaData$precursorMz} else {NA})),
               collisionEnergy = unlist(lapply(ms, function(x) if(!is.null(x$metaData$collisionEnergy)){x$metaData$collisionEnergy} else {0})),
               stringsAsFactors = FALSE)
  scanOrder <- rep(0,nrow(scansMetadata))
  for (l in unique(scansMetadata$msLevel)){
    for (c in unique(scansMetadata$collisionEnergy))
      scanOrder[scansMetadata$msLevel == l & scansMetadata$collisionEnergy == c] <-
        as.numeric(factor(scansMetadata$retentionTime[scansMetadata$msLevel == l & scansMetadata$collisionEnergy == c]))
  }
  scansMetadata$Scan <- scanOrder

  # 3. Generate msobject
  msobject <- list()
  msobject$metaData <- list(generalMetadata = generalMetadata,
                            scansMetadata = scansMetadata)
  msobject$processing <- list()


  # 4. Extract scans
  mz <- unlist(lapply(ms, function(x) x$spectrum$mass))
  int <- unlist(lapply(ms, function(x) x$spectrum$intensity))
  rt <- unlist(mapply(rep, scansMetadata$retentionTime, scansMetadata$peaksCount))
  mslevel <- unlist(mapply(rep, scansMetadata$msLevel, scansMetadata$peaksCount))
  collisionEnergy <- unlist(mapply(rep, scansMetadata$collisionEnergy, scansMetadata$peaksCount))
  scannum <- unlist(mapply(rep, scansMetadata$Scan, scansMetadata$peaksCount))
  scans <- data.frame(mz = mz, int = int, rt = rt, mslevel = mslevel,
                      collisionEnergy = collisionEnergy,
                      part = 0, clust = 0, peak = 0, Scan = scannum)

  if (any(scansMetadata$collisionEnergy == 0)){
    MS1 <-  scans[scans$collisionEnergy == 0,]
    MS1 <- split(MS1, MS1$collisionEnergy)
    # Add MS1 to msobject
    msobject$MS1 <- MS1
  }
  if (any(scans$collisionEnergy > 0)){
    MS2 <- scans[scans$collisionEnergy > 0,]
    MS2 <- split(MS2, MS2$collisionEnergy)
    # Add MS2 to msobject
    msobject$MS2 <- MS2
  }
  return(msobject)
}

# partitioning
#' agglomarative partitioning for LC-HRMS data based on enviPick algorithm
#'
#' agglomarative partitioning for LC-HRMS data based on enviPick algorithm
#'
#' @param msobject msobject generated by \link{readMSfile}
#' @param dmzagglom mz tolerance for partitions
#' @param drtagglom rt window for partitions
#' @param minpeak minimum number of measures to define a peak
#' @param mslevel MS level information to access msobject
#' @param cE collision energy information to access msobject
#'
#' @return msobject
#'
#' @keywords internal
#'
#' @author M Isabel Alcoriza-Balaguer <maribel_alcoriza@iislafe.es>
partitioning <- function(msobject,
                         dmzagglom,
                         drtagglom,
                         minpeak,
                         mslevel,
                         cE){
  # save parameters
  msobject$processing[[mslevel]]$parameters$maxint <- max(msobject[[mslevel]][[cE]]$int)
  msobject$processing[[mslevel]]$parameters$dmzagglom <- dmzagglom
  msobject$processing[[mslevel]]$parameters$drtagglom <- drtagglom
  msobject$processing[[mslevel]]$parameters$minpeak <- minpeak

  # order ms measures by increasing mz
  msobject[[mslevel]][[cE]] <- msobject[[mslevel]][[cE]][order(msobject[[mslevel]][[cE]]$mz,
                                                   decreasing = FALSE),]

  # Agglomerative partitioning: agglom function from enviPick package
  part <- .Call("agglom", as.numeric(msobject[[mslevel]][[cE]]$mz),
                as.numeric(msobject[[mslevel]][[cE]]$rt), as.integer(1),
                as.numeric(dmzagglom), as.numeric(drtagglom),
                PACKAGE = "LipidMS")

  # Index of partitions: indexed function from enviPick package
    # order ms measures by partition order
  msobject[[mslevel]][[cE]] <- msobject[[mslevel]][[cE]][order(part,
                                                               decreasing = FALSE),]
  part <- part[order(part, decreasing = FALSE)]
  maxit <- max(part)
  index <- .Call("indexed", as.integer(part), as.numeric(msobject[[mslevel]][[cE]]$int),
                 as.integer(minpeak),
                 as.numeric(msobject$processing[[mslevel]]$parameters$maxint),
                 as.integer(maxit),
                 PACKAGE = "LipidMS")
  index <- index[index[,2] != 0,,drop = FALSE]
  colnames(index) <- c("start", "end", "length")

  # Assign partition ID: partID function from enviPick
  partID <- .Call("partID", as.integer(index),
                  as.integer(nrow(msobject[[mslevel]][[cE]])),
                  PACKAGE = "LipidMS")

  # save partitions
  msobject[[mslevel]][[cE]]$part <- partID
  msobject$processing[[mslevel]]$partIndex[[cE]] <- index

  return(msobject)
}

# clustering
#' EIC extraction based on previous partitions generated by \link{partitioning}
#'
#' EIC extraction based on previous partitions generated by \link{partitioning}
#'
#' @param msobject msobject generated by \link{partitioning}
#' @param dmzagglom mz tolerance for clusters
#' @param drtclust rt window for clusters
#' @param minpeak minimum number of measures to define a peak
#' @param mslevel info to access msobject
#' @param cE info to access msobject
#'
#' @return msobject
#'
#' @keywords internal
#'
#' @author M Isabel Alcoriza-Balaguer <maribel_alcoriza@iislafe.es>
clustering <- function(msobject,
                       dmzagglom,
                       drtclust,
                       minpeak,
                       mslevel,
                       cE){

  # save parameters
  msobject$processing[[mslevel]]$parameters$drtclust <- drtclust

  startat <- 0
  roworder <- 1:nrow(msobject[[mslevel]][[cE]])
  for (k in 1:nrow(msobject$processing[[mslevel]]$partIndex[[cE]])) {
    # get EICs using getEIC function from enviPick
    start <- msobject$processing[[mslevel]]$partIndex[[cE]][k,1]
    end <- msobject$processing[[mslevel]]$partIndex[[cE]][k,2]
    if ((end - (start+1)) > 1){
      clusters <- .Call("getEIC",
                        as.numeric(msobject[[mslevel]][[cE]]$mz[start:end]),
                        as.numeric(msobject[[mslevel]][[cE]]$rt[start:end]),
                        as.numeric(msobject[[mslevel]][[cE]]$int[start:end]),
                        as.integer(order(msobject[[mslevel]][[cE]]$int[start:end], decreasing = TRUE)),
                        as.integer(order(msobject[[mslevel]][[cE]]$rt[start:end], decreasing = FALSE)),
                        as.numeric(dmzagglom), as.integer(1), as.numeric(drtclust),
                        as.integer(1), PACKAGE = "LipidMS")
      clust <- clusters[, 10] + startat
      msobject[[mslevel]][[cE]]$clust[start:end] <- clust
      roworder[start:end] <- roworder[start:end][order(clust, decreasing = FALSE)]
      startat <- max(clust)
    } else {
      msobject[[mslevel]][[cE]]$clust[start:end] <- startat
      startat <- startat + 1
    }
  }
  msobject[[mslevel]][[cE]] <- msobject[[mslevel]][[cE]][roworder,]

  # Index of clusters: indexed function from enviPick
  maxit <- max(msobject[[mslevel]][[cE]]$clust)
  index <- .Call("indexed",
                 as.integer(msobject[[mslevel]][[cE]]$clust),
                 as.numeric(msobject[[mslevel]][[cE]]$int),
                 as.integer(minpeak),
                 as.numeric(msobject$processing[[mslevel]]$parameters$maxint),
                 as.integer(maxit),
                 PACKAGE = "LipidMS")
  index <- index[index[,2] != 0,,drop = FALSE]
  colnames(index) <- c("start", "end", "length")

  # Assign cluster ID: partID function from enviPick
  clustID <- .Call("partID", as.integer(index),
                   as.integer(nrow(msobject[[mslevel]][[cE]])),
                   PACKAGE = "LipidMS")

  # save clusters
  msobject[[mslevel]][[cE]]$clust <- clustID
  msobject$processing[[mslevel]]$clustIndex[[cE]] <- index

  return(msobject)
}

# peakdetection
#' peak-pick based on previous EIC clusters generated by \link{clustering}
#'
#' peak-pick based on previous EIC clusters generated by \link{clustering}
#'
#' @param msobject msobject generated by \link{clustering}
#' @param minpeak minimum number of measures to define a peak
#' @param drtminpeak minimum rt length of a peak
#' @param drtmaxpeak maximum rt length of a peak
#' @param drtgap maximum rt gap to be filled
#' @param recurs maximum number of peaks for a EIC
#' @param weight weight for assigning measurements to a peak
#' @param ended number of failures allowed when detecting peaks
#' @param sb signal-to-base ration
#' @param sn signal-to-noise ratio
#' @param minint minimum intensity
#' @param mslevel info to access msobject
#' @param cE info to access msobject
#'
#' @return msobject
#'
#' @keywords internal
#'
#' @author M Isabel Alcoriza-Balaguer <maribel_alcoriza@iislafe.es>
peakdetection <- function(msobject,
                          minpeak,
                          drtminpeak,
                          drtmaxpeak,
                          drtgap,
                          recurs,
                          weight,
                          ended,
                          sb,
                          sn,
                          minint,
                          mslevel,
                          cE){

  # save parameters
  msobject$processing[[mslevel]]$parameters$drtminpeak <- drtminpeak
  msobject$processing[[mslevel]]$parameters$drtmaxpeak <- drtmaxpeak
  msobject$processing[[mslevel]]$parameters$drtgap <- drtgap
  msobject$processing[[mslevel]]$parameters$recurs <- recurs
  msobject$processing[[mslevel]]$parameters$weight <- weight
  msobject$processing[[mslevel]]$parameters$ended <- ended
  msobject$processing[[mslevel]]$parameters$sb <- sb
  msobject$processing[[mslevel]]$parameters$sn <- sn
  msobject$processing[[mslevel]]$parameters$minint <- minint

  msobject[[mslevel]][[cE]]$id <- 1:nrow(msobject[[mslevel]][[cE]])
  level <- as.numeric(gsub("MS", "", mslevel))

  startat <- 0
  npeaks <- 0
  roworder <- 1:nrow(msobject[[mslevel]][[cE]])
  for (k in 1:nrow(msobject$processing[[mslevel]]$clustIndex[[cE]])) {
    if (msobject$processing[[mslevel]]$clustIndex[[cE]][k, 3] >= minpeak){
      start <- msobject$processing[[mslevel]]$clustIndex[[cE]][k,1]
      end <- msobject$processing[[mslevel]]$clustIndex[[cE]][k,2]
      # Fill rt gaps < drtgap: gapfill function from enviPick
      out1 <- .Call("gapfill",
                    as.numeric(msobject[[mslevel]][[cE]]$rt[start:end]),
                    as.numeric(msobject[[mslevel]][[cE]]$int[start:end]),
                    as.integer(order(msobject[[mslevel]][[cE]]$rt[start:end], decreasing = FALSE)),
                    as.numeric(msobject[[mslevel]][[cE]]$mz[start:end]),
                    as.numeric(msobject[[mslevel]][[cE]]$id[start:end]),
                    as.numeric(msobject$metaData$scansMetadata$retentionTime[msobject$metaData$scansMetadata$msLevel == level & msobject$metaData$scansMetadata$collisionEnergy == cE]),
                    as.numeric(drtgap),
                    PACKAGE = "LipidMS")
      out1 <- matrix(out1,ncol=10)
      colnames(out1)<-c("m/z","intens","RT","index","intens_filt","1pick","pickcrit","baseline","intens_corr","2pick")
      # Filter step: yet to be implemented
      out1[, 5] <- out1[, 2]
      # Peak detection, baseline substraction and 2nd peak detection
      out2 <- .Call("pickpeak",
                    as.numeric(out1),
                    as.numeric(drtminpeak),
                    as.numeric(drtmaxpeak),
                    as.integer(minpeak),
                    as.integer(recurs),
                    as.numeric(weight),   # weight
                    as.numeric(sb),       # sb
                    as.numeric(sn),       # sn
                    as.numeric(minint),   # minimum intensity
                    as.numeric(msobject$processing[[mslevel]]$parameters$maxint),   # maximum intensity threshold
                    as.integer(ended),
                    as.integer(2),
                    PACKAGE = "LipidMS")
      out2 <- matrix(out2, ncol = 10)
      colnames(out2)<-c("m/z","intens","RT","index","intens_filt","1pick","pickcrit","baseline","intens_corr","2pick");
      if(!all(out2[,10] == 0)){
        npeaks <- npeaks + length(unique(out2[,10]))
        out2[,10] <- out2[,10] + startat
        out2 <- out2[out2[,10] != startat,]
        peak <- as.numeric(sapply(msobject[[mslevel]][[cE]]$id[start:end], function(x) if(x %in% out2[,4]){out2[out2[,4] == x,10]} else {0}))
        msobject[[mslevel]][[cE]]$peak[start:end] <- peak
        roworder[start:end] <- roworder[start:end][order(peak, decreasing = FALSE)]
        startat <- c(max(out2[,10]))
      }
    }
  }
  msobject[[mslevel]][[cE]] <- msobject[[mslevel]][[cE]][roworder,]

  # assign peakID
  # Index of peaks: indexed function from enviPick
  maxit <- max(msobject[[mslevel]][[cE]]$peak)
  if(maxit > 0){
    index <- .Call("indexed",
                   as.integer(msobject[[mslevel]][[cE]]$peak),
                   as.numeric(msobject[[mslevel]][[cE]]$int),
                   as.integer(minpeak),
                   as.numeric(msobject$processing[[mslevel]]$parameters$maxint),
                   as.integer(maxit),
                   PACKAGE="LipidMS")
    if(any(index[,2]!=0)){
      index <- index[index[,2] != 0,,drop=FALSE];
      # Assign peakID: partID function from enviPick
      peakID <- .Call("partID",
                      as.integer(index),
                      as.integer(length(msobject[[mslevel]][[cE]]$peak)),
                      PACKAGE = "LipidMS")
      colnames(index) <- c("start","end","length")

      # save peaks
      msobject[[mslevel]][[cE]]$peak <- peakID
      msobject$processing[[mslevel]]$peakIndex[[cE]] <- index
    }
  }

  # create peaklist
  maxit <- max(msobject[[mslevel]][[cE]]$peak)
  peaklist <- data.frame()
  if (maxit > 0){
    for (p in 1:nrow( msobject$processing[[mslevel]]$peakIndex[[cE]])){
      start <- msobject$processing[[mslevel]]$peakIndex[[cE]][p,1]
      end <- msobject$processing[[mslevel]]$peakIndex[[cE]][p,2]

      mz <- mean(msobject[[mslevel]][[cE]]$mz[start:end])
      # mz <- weighted.mean(msobject[[mslevel]][[cE]]$mz[start:end],
      #                     msobject[[mslevel]][[cE]]$int[start:end])
      max_int <- max(msobject[[mslevel]][[cE]]$int[start:end])
      sumint <- sum(msobject[[mslevel]][[cE]]$int[start:end])
      rt <- msobject[[mslevel]][[cE]]$rt[start:end][msobject[[mslevel]][[cE]]$int[start:end] == max_int]
      minrt <- min(msobject[[mslevel]][[cE]]$rt[start:end])
      maxrt <- max(msobject[[mslevel]][[cE]]$rt[start:end])
      peakid <- p

      peaklist <- rbind(peaklist,
                        data.frame(m.z = mz, RT = rt, int = sumint,
                                   minRT = minrt, maxRT = maxrt,
                                   peakID = p, stringsAsFactors = FALSE))
    }
    peaklist <- peaklist[order(peaklist$int, decreasing = TRUE),]
    peaklist$peakID <- paste(paste(mslevel, cE, sep="_"), peaklist$peakID, sep="_")
    msobject[[mslevel]][[cE]]$peak <- paste(paste(mslevel, cE, sep="_"), msobject[[mslevel]][[cE]]$peak, sep="_")
  } else {
    stop("No peaks found")
  }
  msobject$peaklist[[mslevel]][[cE]] <- peaklist
  return(msobject)
}

# annotateIsotopes
#' Annotate isotopes
#'
#' Annotate isotopes based on mass differences, retention time and peak
#' correlation if required.
#'
#' @param peaklist extracted peaks. Data.frame with 4 columns (m.z, RT, int
#' and peakID).
#' @param rawScans raw scan data. Data.frame with 5 columns (m.z, RT, int,
#' peakID and Scan).
#' @param dmzIso mass tolerance in ppm.
#' @param drtIso rt windows with the same units used in peaklist.
#'
#' @return peaklist with 6 columns (m.z, RT, int, peakID, isotope and group).
#'
#' @keywords internal
#'
#' @author M Isabel Alcoriza-Balaguer <maialba@iislafe.es>
annotateIsotopes <- function(peaklist,
                             rawScans,
                             dmzIso,
                             drtIso){
  peaklist <- peaklist[order(peaklist[,"m.z"], decreasing = F),]
  anno_peaklist <- peaklist[c(),]
  cluster <- 1 # contador
  while (nrow(peaklist) > 0){ # iniciamos un bucle para pasar los clusters agrupados
    # a una nueva peaklist, hasta que la anterior se vacie
    a <- peaklist[1,]
    ss <- peaklist[which(abs(peaklist[,"RT"] - a[,"RT"]) < drtIso &
                           peaklist[,"m.z"] >= a[,"m.z"]),]
    ss <- ss[order(ss[,"m.z"]),]
    if (nrow(ss) > 1){
      dm <- (ss[,"m.z"]-a[,"m.z"])
      ss[,"iso"] <- round(dm,0)
      errors <- abs(dm-ss[,"iso"]*1.003355)*1e6/(a[,"m.z"]+round(dm, 0)*1.003355)
      group <- cbind(ss[errors < dmzIso, c("m.z", "RT", "int", "minRT", "maxRT", "peakID", "iso"),])
      if (nrow(group) > 1){
        keep <- c(TRUE, rep(FALSE, nrow(group)-1))
        for (i in 2:nrow(group)){
          prev <- which(group[,"iso"] - group[i,"iso"] == -1 & keep[i-1] == TRUE)
          if(length(prev) > 0 & group[i,"iso"] == 1){
            intcheck <- any((group[i,"int"]/group[prev, "int"]) > 0.011 &
                              (group[i,"int"]/group[prev, "int"]) <
                              group[prev[1],"m.z"]*0.011/12)
            keep[i] <- intcheck
          } else if(length(prev) > 0 & group[i,"iso"] > 1){
            keep[i] <- any(group[,"iso"] - group[i,"iso"] == -1 & keep[i-1] == TRUE)
          } else {
            break
          }
        }
        group <- group[keep,]
        keep <- c(TRUE, rep(TRUE, nrow(group)-1))
        check_overlaps <- which(table(group[,"iso"]) > 1)
        if (length(check_overlaps) > 0){
          repeated <- as.numeric(names(check_overlaps))
          cors <- sapply(group[group[,"iso"] %in% repeated,"peakID"],
                         coelutionScore, group[1,"peakID"], rawScans)
          remove <- which(cors < 0.8)
          if (length(remove) > 0 & length(remove) < length(repeated)){
            keep[which(group[,"iso"] %in% repeated)[remove]] <- FALSE
          } else if (length(remove) > 0 & length(remove) == length(repeated)){
            remove[which.max(cors)] <- FALSE
            keep[which(group[,"iso"] %in% repeated)[remove]] <- FALSE
          }
        }
        group <- group[keep,]
      }
      if ( nrow(group) > 1){
        group[,"cluster"] <- cluster
        cluster <- cluster + 1
      } else {
        group[,"cluster"] <- 0
      }
    } else {
      ss[,"iso"] <- 0
      group <- ss
      group[,"cluster"] <- 0
    }
    anno_peaklist <- rbind(anno_peaklist, group)
    peaklist <- peaklist[which(!peaklist[,"peakID"] %in% group[,"peakID"]),]
  }
  anno_peaklist[,"iso"] <- paste("[M+", anno_peaklist[,"iso"], "]", sep="")
  anno_peaklist[,"iso"][anno_peaklist[,"cluster"] == 0] <- ""
  colnames(anno_peaklist) <- c("m.z", "RT", "int", "minRT", "maxRT", "peakID",
                               "isotope", "group")
  return(anno_peaklist)
}
