#####################################################################
#
# Peaks_R_1.3.R - Version 1.3
#
# Date released: 7/20/2005
#
# Version History:
#
# Version 1.3: released 7/20/2005
# - spectra.dir now accepts a vector of directories.
# - Spectra with too few reproducible peaks no longer cause an error during normalization.
# - Eliminated spectra appear in the output files as columns of NA instead of being left out completely.
# - Spectra marked by 0 in a Find.Peaks column of the metadata are no longer included in calculation of the chi-squared test
# They still receive scores so that the difference between control and experimental spectra can be measured.
# - Leaving peakPlotMinMZ blank defaults to the lowest value in m.z.regions instead of giving an error.
#
# Version 1.2.1: released 6/16/2005
# - Changing the number of regions no longer gives "Error in 1:widthSteps : NA/NaN argument".
# - Large negative values no longer occur for very narrow peaks.
# - getPeakAreas respects the m.z.step parameter.
# - An error is now generated if the metadata contains duplicate entries.
# - Fixed a bug in the fraction of the AUC kernel sometimes being used for the negative regions.
# - Added the peakWidthSteps parameter so that users can control the graininess with which peak edges are detected.
# - Added peakPlotMinMZ and peakPlotMaxMZ parameters to control the region that is plotted in the peak plot.
#
# Version 1.2: released 5/27/2005
# - The Memory.Conserve=FALSE flag has been added to the parameter list of getPeaks. Setting this flag to TRUE
# will cause spectra to be read from the hard-drive on demand instead of residing in memory. This eliminates
# the major memory bottleneck but each spectrum will have to be read several times from the hard-drive. Use this option
# if you are hitting the 4 GB limit on addressable memory or otherwise running out of memory while analyzing your dataset.
# - The Peaks.Show=TRUE flag has been added to getPeaks. If the flag is TRUE then R will generate a plot
# showing the averaged spectrum and the positions of all the peaks that passed the F-test.
# - Peak positions are now rounded to m.z.step in the output file of peak magnitudes, so that peak names will not be absurdly long.
# (does not affect the internal representation, and the peak bounds output file has positions to full precision so that the bounds
# remain exactly the same when that peak bounds file is loaded in the future).
# - The output list from getPeaks contains an element "spectrum.average" which is a numeric vector of the mean spectrum for the dataset.
# The M/Z axis is uniform with a spacing between points equal to m.z.step.
# - A peak.bounds.source parameter has been added to getPeaks to allow peak positions to be shared across analyses. This should point
# to the peak bounds output file produced by an earlier analysis. If this parameter is specified then peaks will not be detected and
# the peak bounds from the file will be used instead. (Please be careful comparing peaks values from samples that were not normalized together)
# - Added a column F.Confidence to the output file of peak positions. This is the p-value of each peak in the F-test.
# - Fixed the annoying uptick that could occur at the end of the mean spectrum due to a circular filter.
# - Fixed a bug when the F-test confidence threshold was set to zero.
# - Function named "preclassification" changed to "TestAndNormalize" to better describe its function.
# - The code has been tested without incident in the R environment version 2.0.1
#
# Version 1.1: released 1/31/2005
# - The estimate.em function now tests for convergence to a singularity by calling rescale.weights
# on each iteration. This checks if any peak has been assigned a weight greater than the w.limit parameter
# (by default it equals 0.2 out of total weight of 1). If any peak has too much weight then a warning is printed
# and the weights are rescaled so that w.limit is the highest weight.
# - An error is now generated if the normalization fails for any reason.
#
#
# Email Scott Carlson (scottmc@stanford.edu) with any questions or if there are features that would make SSA more useful for your particular data.
# Also send an email if you find any bugs and I will release a fix as quickly as possible.
#
# COPYRIGHT NOTICE: This software, related source code, compiled code, and example data are all copyright of Harvey Cohen Research Group in the Department
# of Pediatrics in the Stanford University School of Medicine. They are freely available for non-commercial purposes with citation. Commercial use
# allowed only with express permission from Harvey Cohen or an appropriate authority within Stanford University.
#
# LIABILITY NOTICE: This software is provided as-is and the Harvey Cohen lab and its affiliates are not responsible for damages occurring
# from or through the use of this software, including but not limited to normal operations, improper or illegal use, or errors in the software itself.
#
#####################################################################
#####################################################################
#
# getPeaks This is the main function used to extract peaks from spectra.
#
# Parameters:
# Either spectra.dir or spectra.paths must be specified. If spectra.dir is specified then all files matching the regular expression given by
# files.pattern will be loaded as spectra. Alternatively, spectra.paths may be used to specify specific files to load as spectra. spectra.paths is
# ignored if spectra.dir is given.
#
# metadata.path (required) - the complete path of a file in a table format recognized by R. It must include at least three columns, in order: the file name
# of each spectrum, the sample name for that spectrum (used to average replicates), and the group membership of that spectrum. Additional columns may appear
# in the metadata, and names of these columns should be listed as a character vector in the "additional.metadata" parameter. If one of the
# additional.metadata entries is "Find.Peaks" then that column will be used as a bit mask specifying which spectra to use when determining
# peak location and edges (peak areas will still be calculated for all spectra).
#
# output.path.screened (optional) - the complete path of an output file that will contain normalized peak magnitudes for every peak and spectrum that
# passes the F-test and Chi-squared test respectively.
#
# output.path.all (optional) - the complete path of an output file that will contain normalized peak magnitudes for every peak and in every spectrum that
# passes the Chi-squared test (including peaks that fail the Chi-squared test).
#
# output.path.averaged (optional) - the complete path of an output file that will contain normalized peak magnitudes for every peak averaged for every unique
# sample name (using only peaks and spectra that passed their respective quality tests, samples names with no passing spectra are assigned NA at every peak).
#
# output.path.peak.bounds (optional) - the complete path of an output file that will contain a table of the peak positions and edges, including peaks that do not
# pass the F-test.
#
# min.peak.widths (optional) - sets the minimum spacing between peaks as a fraction of the current m/z (so 0.01 means minimum spacing of 1%). This can be given as a numeric
# vector, as in c(0.004, 0.008), when different peak widths are to be applied to different m/z ranges. The default is set for a Ciphergen PBS-IIC SELDI-TOF-MS using
# fractionated plasma with SPA (all default settings are for this instrument).
#
# max.peak.widths (optional) - sets the largest peak width that will be tested when determining peak edges. Specified in the same manner as min.peak.widths.
#
# peakWidthSteps (default = 6) - sets the number of steps that are taken while searching for the left and right edge of each peak.
#
# m.z.regions (optional) - specifies where m/z regions start and end for each of the peak widths that was given. For example, c(3000, 18000) applies the first entry in
# each peak width vector to the m/z region from 3000 to 18000, and the second entry in each width vector to the range from 18000 to the end.
#
# peak.bounds.source (optional) - specifies the path of a file containing peak bounds (the peak bounds file that is output to the path in output.path.peak.bounds).
# If this parameter is specified then the peaks from the file will be used and no other peaks will be detected.
#
# m.z.step (default = 0.5) - gives the m/z data resolution used when calculating peaks. Allocating the array of uniform m/z spectra is a major memory bottleneck. If you are
# having memory problems then try increasing the m.z.step so that fewer points per spectrum will be calculated.
#
# F.test.threshold (default = 0.95) - this parameter sets the minimum confidence that peaks' magnitude exceeds their noise. Peaks with lower confidence on the F-test are not included
# in output files specified by output.path.screened or output.path.averaged.
#
# chi.sq.threshold (default = 0.99) - specifies the confidence needed to exclude a bad spectrum. Lowering the value will result in a larger number of excluded spectra.
#
# additional.metadata (required if metadata column present) - a character vector giving names to each column after the 3rd that appears in the metadata file.
#
# Memory.Conserve (default = FALSE) - a boolean flag that will cause spectra to be loaded from the hard-drive on demand instead of residing in memory. Use this option if you are hitting the maximum on allocatable memory.
#
# Peaks.Show (default = TRUE) - a boolean flag that will cause R to generate a plot showing the averaged spectrum and the position of peaks on that spectrum. Only peaks that pass the F-test are shown.
#
# peakPlotMinMZ (default = 0) - a numeric value giving the lowest m/z value to plot when drawing the average spectrum on the peak plot.
#
# peakPlotMaxMZ (default = highest m/z) - like peakPlotMinMZ for the highest m/z value in the peak plot.
#
#
# Output: returns a list with the peak magnitudes at several stages of processing. Elements of the list are
# return.code - contains a zero if the function returned correctly, a one if the function detected an error.
# peak.mask - a boolean vector with length equal to the number of peaks detected before F-test screening. Specifies if each peak passed the F-test with the specified confidence.
# spectrum.mask - a boolean vector with length equal to the number of spectra. Specifies whether each spectrum passed the chi-squared plausibility test.
# exclude.confidence - numeric vector with one entry per spectrum, giving the p-value that each spectrum is "bad"
# peaks.normalized - an n x m matrix where n is the number of peaks, m is the number of spectra. Peak values are given after EM-normalization and every peak is included
# regardless of its performance on the F-test. Each entry corresponds to the magnitude that peak in each spectrum (technically the AUC divided by peak width).
# peaks.screened - similar to peaks.normalized, but including only peaks that passed the F-test with given confidence.
# peaks.averaged - an i x k matrix where i is the number of peaks passing the F-test and k is the number of individuals in the experiment (not the number of spectra).
# Replicate measurements of individuals have been averaged.
# identities.screened - Vector of identities for columns (spectra) retained in of peaks.normalized and peaks.screened.
# identities.averaged - Vector of identities for columns (samples) appearing in peaks.averaged.
# scale.factors - Numeric vector of constants by which each spectrum was multiplied during normalization. These values should be checked for bias between groups.
# spectrum.average - a numeric vector containing the mean spectrum on which peaks are detected. The names of the vector elements contain the m/z for each point.
# peaks.bounds - A data frame with three columns, giving the center, left width, and right width of every peak.
# status - A vector containing group memberships for spectra in the order that they appear in the output matrices.
# status.averaged - A vector containing group memberships for columns of averaged replicates in the order that they appear in the output matrix with averaged replicates.
# metadata - A data frame containing the metadata file. Row names are the file names taken from the first column of the file, the first column contains sample
# names, the second has group membership, and additional columns are as specified
#
#####################################################################
getPeaks <- function( spectra.dir="", files.pattern="", spectra.paths="", metadata.path, output.path.screened="", output.path.all="", output.path.averaged="", output.path.peak.bounds="",
min.peak.widths=c(0.004, 0.008), max.peak.widths=c(0.01, 0.02), peakWidthSteps=6, m.z.regions=c(3000, 18000), peak.bounds.source="", m.z.step=0.5,
F.test.threshold=0.95, chi.sq.threshold=0.99, additional.metadata=c(""), Memory.Conserve=FALSE, Peaks.Show=TRUE, peakPlotMinMZ=m.z.regions[1], peakPlotMaxMZ="")
{
## Read the metadata file ##
print(noquote("Loading spectra"))
# Check that the metadata file exists and can be read
# NOTE(review): the "next" after each stop() below is dead code - stop() halts execution,
# and "next" is only valid inside a loop. Harmless here, but confirm it can be removed.
if ( file.access( metadata.path, 0) != 0 ) { stop( paste("Metadata file", metadata.path, "does not exist.", sep=" ")); next }
if ( file.access( metadata.path, 4) != 0 ) { stop( paste("Metadata file", metadata.path, "exists but can not be read.", sep=" ")); next }
# Load the metadata file
# Expected columns, in order: File.Names, Identities (sample name), Groups (class label),
# followed by any extra columns named in additional.metadata.
if ( additional.metadata[1] == "") {
metadata = read.table(metadata.path, sep="\t", col.names=c("File.Names", "Identities", "Groups"), row.names=NULL)
} else {
metadata = read.table(metadata.path, sep="\t", col.names=c("File.Names", "Identities","Groups", additional.metadata), row.names=NULL)
}
# Turn the first column into dimnames for the rows
# Duplicate file-name entries would make row-name indexing ambiguous, so treat them as fatal.
if (any(duplicated(metadata[,1]))) {
print( noquote("Some metadata entries appear more than once in the metadata file:") )
print( noquote(as.character(unique(metadata[duplicated(metadata[,1]),1]))) )
stop()
}
dimnames(metadata)[[1]] = metadata[,1]
# After dropping the file-name column, column 1 is Identities and column 2 is Groups;
# all metadata[file.names, 1] / metadata[file.names, 2] references below rely on this.
metadata = metadata[,-1]
## Load spectra ##
if (spectra.paths[1] == "") {
spectra.data = load.spectra(spectra.dir = spectra.dir, files.pattern = files.pattern, Memory.Conserve = Memory.Conserve)
} else {
spectra.data = load.spectra(spectra.paths = spectra.paths, Memory.Conserve = Memory.Conserve)
}
if ( length( dim(spectra.data$data)[[2]] ) == 0 ) { stop("No spectrum files loaded. Quitting analysis.") }
# Remove paths from the file names
# The regex ".*[\\]" strips everything up to the last backslash (Windows-style paths).
file.names = sapply(spectra.data$file.names, FUN=function(full.path) {substring(full.path, attr(regexpr(".*[\\]", full.path), "match.length")+1 )})
if (any(duplicated(file.names))) { stop("All files must have unique names, even if they are stored in different directories.") }
# Remove extensions from the file names. This is an ugly line of code because it needs to work correctly in the presence of any number of . characters.
file.names = unlist(lapply( strsplit(file.names,"\\."), FUN=function(charvect){return(paste(charvect[1:(length(charvect)-1)], collapse="."))}))
# Check that all spectra have metadata entries
metadata.file.names = dimnames(metadata)[[1]]
file.matches = match(file.names, metadata.file.names, nomatch=NA)
if ( any(is.na(file.matches)) ) {
print(noquote("Some files do not have metadata entries: (check that the metadata file DOES NOT include directory paths or file extentions)"))
print(noquote(paste("\tMissing metadata:", file.names[is.na(file.matches)], sep=" ")))
# NOTE(review): this is an error path but return.code is 0; the header documentation says
# 1 indicates an error - confirm whether this should be return.code=1.
return(list(return.code=0))
}
# Assign names to the spectra in each data matrix
# Columns are named by sample Identity (metadata column 1) rather than by file name.
dimnames(spectra.data$data)[[2]] = as.character(as.matrix(metadata[file.names, 1]))
dimnames(spectra.data$M.Z)[[2]] = as.character(as.matrix(metadata[file.names, 1]))
# Create the boolean mask that determines which spectra are used in determining canonical peaks
# (peak magnitudes will still be calculated for all spectra)
if ( any(additional.metadata == "Find.Peaks") ) {
use.for.peaks = (as.character(as.matrix(metadata[file.names,"Find.Peaks"])) == "1")
} else {
use.for.peaks = rep(TRUE, length(file.names))
}
## Locate peaks in the data or load the peak list from a file ##
if (peak.bounds.source == "") {
# Calculate peak locations from the data
print(noquote("Identifying peak locations"))
peak.bounds = canonicalPeaks( spectra.data$data[,use.for.peaks], spectra.data$M.Z[,use.for.peaks], groups = metadata[file.names, 2][use.for.peaks], regions=m.z.regions, minPeakWidths=min.peak.widths, maxPeakWidths=max.peak.widths, peakWidthSteps=rep(peakWidthSteps, length(m.z.regions)), M.Z.Step=m.z.step, smoothingSpan=11, Memory.Conserve = Memory.Conserve )
} else {
# Load peak bounds from a file
print(noquote("Loading peak locations"))
peak.bounds = load.peak.bounds(peak.bounds.source)
if (peak.bounds$success == FALSE) { stop("Error: failure loading peaks from the file.") }
# Determine what is the lowest final M/Z among the spectra
# (load.peak.bounds returns M.Z.max=0 as a flag that this still needs to be computed)
peak.bounds$M.Z.max = get.M.Z.Max(spectra.data$file.names)
}
# Check for errors in peak finding
if (is.null(peak.bounds$center)) { stop("Error: no peaks found.") }
if (length(peak.bounds$center)==0) { stop("Error: no peaks found.") }
print(noquote(paste( "Total peaks found:", length(peak.bounds$center), sep=" ")))
## Get peak magnitudes for each peak in each spectrum ##
print(noquote("Determining peak area in each spectrum"))
peak.magnitudes = getPeakAreas( spectra.data$data, spectra.data$M.Z, peak.bounds$M.Z.max, peak.bounds, smoothingSpan=11, M.Z.Step=m.z.step, Memory.Conserve = Memory.Conserve)
## Normalize spectra, select reproducible peaks, and remove implausible spectra ##
print(noquote("Calculating normalization coefficients and reproducibility checks"))
status.averaged = list()
# To keep all peaks set F.test.threshold = 0, and to keep all spectra set spectrum.confidence >= 1
peaks.processed = TestAndNormalize(peak.magnitudes, as.matrix(metadata[file.names, 1]), peak.confidence=F.test.threshold, spectrum.confidence=chi.sq.threshold, standards.mask=!use.for.peaks)
if (peaks.processed$return.code == 1) {
print(noquote("Error normalizing spectra: an error was returned while normalizing spectra. Please check the format of spectra files and the metadata file. Email details of the problem to Scott Carlson (scottmc@stanford.edu) if this appears to be a glitch with the code."))
return(list(return.code=1))
}
# Determine group assignments for columns of the replicate averaged peak matrix
# For each averaged identity, take the group of the first spectrum with that identity
# (replicates of one sample are assumed to share a group).
status.averaged = sapply( peaks.processed$identities.averaged, FUN=function(ID, groups, IDs){
return( groups[ (IDs==ID) ][[1]] )
}, metadata[file.names, 2], metadata[file.names, 1])
## Write the output files and build the return list ##
print(noquote("Writing output files, if requested"))
if ( output.path.screened != "" ) { write.table(peaks.processed$peaks.screened, output.path.screened, sep="\t") }
if ( output.path.all != "" ) { write.table(peaks.processed$peaks.normalized, output.path.all, sep="\t") }
if ( output.path.averaged != "" ) { write.table(peaks.processed$peaks.averaged, output.path.averaged, sep="\t") }
# Generate a data.frame of peak positions and peak widths. These values are all rounded to the nearest m.z.step
peak.frame = data.frame(Center=peak.bounds$center, Left=peak.bounds$left, Right=peak.bounds$right, Passed = peaks.processed$peak.mask, F.Confidence = peaks.processed$F.confidence)
if ( output.path.peak.bounds != "" ) { write.table(peak.frame, output.path.peak.bounds, sep="\t", row.names=FALSE) }
## Print a summary ##
print(noquote(""))
print(noquote("Data Summary:"))
print(noquote(paste( "Spectra loaded:", dim(spectra.data$data)[[2]], sep=" ")))
spectra.kept = sum(peaks.processed$spectrum.mask)
print(noquote(paste( "Spectra kept:", spectra.kept, sep=" ")))
if ( any(!peaks.processed$spectrum.mask) ) { print(noquote(paste("\tSpectra removed:", file.names[peaks.processed$spectrum.mask==F],sep=" "))) }
print(noquote(paste( "Unique samples:", length(unique(as.matrix(metadata[,1]))), sep=" ")))
print(noquote(paste( "Peaks kept with", F.test.threshold, "confidence:", sum(peaks.processed$peak.mask), sep=" ")))
print(noquote(""))
print(noquote("Done"))
## Show a plot of the peak locations ##
if (Peaks.Show) {
if (peak.bounds.source=="") {
# Generate a plot showing the average spectrum
if (peakPlotMaxMZ == "") { peakPlotMaxMZ = peak.bounds$M.Z.max }
# spectrum.average is indexed on a uniform axis, so m/z maps to index via floor(mz / m.z.step)
plot(m.z.step * (floor(peakPlotMinMZ/m.z.step):floor(peakPlotMaxMZ/m.z.step)), peak.bounds$spectrum.average[floor(peakPlotMinMZ/m.z.step):floor(peakPlotMaxMZ/m.z.step)], type="l", xlab="M/Z", ylab="Mean Intensity", main="Peaks With Reproducible Measurements")
# Add peak markers to the plot
points(peak.frame$Center[peak.frame$Passed], peak.bounds$spectrum.average[floor(peak.frame$Center[peak.frame$Passed] / m.z.step)], pch=20, col="red")
# These plot peak boundaries but can give a very messy plot.
#points(peak.frame$Center[peak.frame$Passed]-peak.frame$Left[peak.frame$Passed], peak.bounds$spectrum.average[floor(peak.frame$Center[peak.frame$Passed] / m.z.step)], pch=20, col="blue")
#points(peak.frame$Center[peak.frame$Passed]+peak.frame$Right[peak.frame$Passed], peak.bounds$spectrum.average[floor(peak.frame$Center[peak.frame$Passed] / m.z.step)], pch=20, col="blue")
} else {
print(noquote("A plot of the average spectrum could not be drawn because the average spectrum is not calculated if peak locations are loaded from a file."))
}
}
return(c(peaks.processed, list(spectrum.average=peak.bounds$spectrum.average, peak.bounds=peak.frame, status=metadata[file.names, 2], status.averaged=status.averaged, metadata = metadata)))
}
#####################################################################
#
# load.spectra Load all spectra in the directory specified by spectra.path, or alternatively load each file specified in the character vector spectra.paths
#
# Parameters: either spectra.dir or spectra.paths must be specified. If spectra.dir is specified then all files matching the regular expression given by
# files.pattern will be loaded as spectra. Alternatively, spectra.paths may be used to specify specific files to load as spectra. spectra.paths is
# ignored if spectra.dir is given. If Memory.Conserve is set to true then the function will fill out the returnList object using 1-row data frames
# for the data and M.Z list elements where those elements will contain the path of the spectrum that would occupy that column.
#
# Output: returns a list with elements "data" and "M.Z". Both are numeric matrices with one spectrum in each column unless the Memory.Conserve flag is true (see above).
#
#####################################################################
load.spectra <- function(spectra.dir="", files.pattern="", spectra.paths="", Memory.Conserve)
{
# Load every spectrum named in spectra.paths, or every file matching files.pattern
# in the directory (or directories) given by spectra.dir. spectra.paths is ignored
# when spectra.dir is supplied.
#
# Returns a list with elements:
# file.names - character vector of paths of the spectra that were actually loaded
# data - numeric matrix with one spectrum of intensities per column
# M.Z - numeric matrix of the m/z value for each point of each spectrum
# When Memory.Conserve is TRUE, data and M.Z are instead 1-row character matrices
# holding the path of the spectrum that would occupy each column.
# Get the list of files in the directories specified by spectra.dir if no vector of specific files was given in spectra.paths
if (spectra.paths[1] == "") {
spectra.paths = paste(spectra.dir[1], list.files(spectra.dir[1], pattern=files.pattern), sep="\\")
if (length(spectra.dir) > 1)
for (curDir in (2:length(spectra.dir)))
spectra.paths = c( spectra.paths, paste(spectra.dir[curDir], list.files(spectra.dir[curDir], pattern=files.pattern), sep="\\") )
}
# Drop files that do not exist or cannot be read BEFORE allocating the data matrices.
# (Bug fix: previously an unreadable file was skipped inside the loading loop without
# advancing the column counter, which shifted every later spectrum into the wrong
# column relative to file.names and left trailing columns full of zeros.)
readable = vapply(spectra.paths, FUN=function(curFile) {
if ( file.access( curFile, 0) != 0 ) { warning( paste("File", curFile, "does not exist.", sep=" ")); return(FALSE) }
if ( file.access( curFile, 4) != 0 ) { warning( paste("File", curFile, "exists but cannot be read.", sep=" ")); return(FALSE) }
return(TRUE)
}, FUN.VALUE=logical(1))
spectra.paths = spectra.paths[readable]
sampleCount = length(spectra.paths)
returnList = list(file.names = spectra.paths)
# In memory-conservation mode the spectra stay on disk: record only their paths
if (Memory.Conserve == TRUE) {
returnList$data = matrix(spectra.paths, nrow=1)
returnList$M.Z = matrix(spectra.paths, nrow=1)
return(returnList)
}
curFileNum = 1
for (curFile in spectra.paths)
{
# Each spectrum file is CSV with one header line: column 1 is m/z, column 2 is intensity
new.data = scan( curFile, what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," )
new.length = length(new.data[[1]])
if (curFileNum == 1) {
# Allocate the data matrices using the length of the first spectrum
returnList$data = matrix(rep(0, new.length * sampleCount), ncol = sampleCount)
returnList$M.Z = matrix(rep(0, new.length * sampleCount), ncol = sampleCount)
returnList$data[,1] = new.data[[2]]
returnList$M.Z[ ,1] = new.data[[1]]
} else {
curLength = dim(returnList$data)[[1]]
# Expand the data matrices if the new spectrum is longer than any previous
if ( new.length > curLength) {
returnList$data = rbind(returnList$data, matrix( rep(0, (new.length - curLength) * sampleCount ), nrow = new.length - curLength ) )
returnList$M.Z = rbind(returnList$M.Z, matrix( rep(0, (new.length - curLength) * sampleCount ), nrow = new.length - curLength ) )
}
returnList$data[1:length(new.data[[2]]),curFileNum] = new.data[[2]]
returnList$M.Z[1:length(new.data[[1]]),curFileNum] = new.data[[1]]
# If this spectrum was shorter than the longest read so far then fill out any extra space in the matrix with zeros
if (new.length < curLength) {
returnList$data[ (new.length+1):curLength ,curFileNum] = 0
returnList$M.Z[ (new.length+1):curLength ,curFileNum] = 0
}
}
curFileNum = curFileNum + 1
}
return(returnList)
}
#####################################################################
#
# load.peak.bounds Loads peak positions from the file specified by "path." This file should have the format of the peak bounds file
# produced as output from SSA.
#
# Parameters:
# path - the path of the file containing peak bounds to be loaded.
#
# Output:
# Returns a list with elements "left", "center", and "right", which are identical to the values returned by canonicalPeaks.
# The list also has an M.Z.max element which is set to zero to retain symmetry to the output from canonicalPeaks. The zero value in
# M.Z.max serves as a flag later to indicate that the value of M.Z.Max still needs to be determined. There is also a "success" field,
# which is a boolean value that is TRUE if the file loaded, FALSE otherwise.
#
#####################################################################
load.peak.bounds <- function(path)
{
# Load peak positions from the peak bounds file written out by a previous run
# (see output.path.peak.bounds in getPeaks). The file must be readable by
# read.table and contain at least the columns Center, Left, and Right.
#
# path - the complete path of the file containing peak bounds to be loaded.
#
# Returns a list with numeric vectors "left", "center", and "right" (matching
# the layout returned by canonicalPeaks), an "M.Z.max" element set to zero as
# a flag that the maximum M/Z still needs to be determined from the spectra,
# and a boolean "success" that is TRUE only if the file was loaded.
# Check that the peak bounds file exists and is readable
if ( file.access( path, 0) != 0 ) {
warning( paste("Peak bounds file", path, "does not exist.", sep=" ") )
return(list(success=FALSE))
}
if ( file.access( path, 4) != 0 ) {
# (fixed: removed the stray trailing space that produced a double space in this message)
warning( paste("Peak bounds file", path, "exists but cannot be read.", sep=" ") )
return(list(success=FALSE))
}
# Load the peak bounds
bounds = read.table(path, header=TRUE)
# Build the list of return data
return(list(left = bounds$Left, center=bounds$Center, right=bounds$Right, M.Z.max=0, success=TRUE))
}
#####################################################################
#
# canonicalPeaks locates peaks in the data set and their left and right edges.
#
# Parameters:
# Data parameters
# data - A matrix of raw spectra, with each spectrum in a column.
# M.Z - A matrix of m/z values corresponding to each point in the "data" matrix. This allows each spectrum to have a different m/z axis. If no "M.Z" is provided then the m/z values for each spectrum must be the same and they must appear as the dimnames of the "data" matrix.
# groups - A vector of class identities for each sample. If no "groups" vector is provided then all spectra are treated as belonging to the same group.
#
# Peak width parameters
# regions - Numeric vector specifying the starting point of m/z region of the spectrum to which the values in "minPeakWidth" and "maxPeakWidths" should be applied. For example, to start analysis at 3000 m/z and use different peak parameters after 18000 m/z then the regions parameter should be c(3000, 18000).
# minPeakWidths - A numeric vector specifying the size of the expected peak window in fraction of current m/z. For example 0.004 is a window width of 0.4% of the current m/z.
# maxPeakWidths - A numeric vector specifying the largest peak window that will be tested in determining peak edges as a fraction of current m/z.
#
# Parameters below only need to be modified for applications very different from protein profiling on a Ciphergen PBSIIc instrument in the m/z range from 3000 to 75000.
# peakWidthSteps - The number of different peak edges to test in determining the left and right edges of each peak.
# M.Z.Step - A numeric value that specifies the resolution of the common m/z axis onto which all spectra are mapped. The default value of 0.5 is appropriate for the m/z range from 3000 to 75000 on the Ciphergen PBSIIc instrument. For higher m/z the m/z step should be increased and for peptide mapping it should be reduced.
# smoothingSpan - The width in data points to apply a simple smoothing filter before determining peak locations. Filtering increases the range of support when local baseline is calculated for each peak.
#
# Memory.Conserve - tells the function that the data and M.Z parameters do not contain values, but that each column contains instead the path to the spectrum that would occupy that column.
#
# Output:
# The output is a list with three numeric vectors "left", "center", "right".
# "center" is the m/z position of the area under the curve maxima, "left" and "right" are the left and right width of the peak in m/z units.
# Two other elements of the list are M.Z.max which contains the largest M/Z value located in the data set, spectrum.average which contains the averaged spectrum on which peaks are detected,
# and F.Confidence, which is set to zero unless the F-test scores are loaded from a file.
#
#
#####################################################################
canonicalPeaks <- function(data, M.Z="", groups=rep(1, dim(data)[[2]]), regions = 3000, minPeakWidths, maxPeakWidths, peakWidthSteps, M.Z.Step = 0.5, smoothingSpan = 11, Memory.Conserve)
{
# Calculate the number of samples from the data
if (Memory.Conserve) {
numSamples = length(data)
} else {
numSamples = dim(data)[[2]]
}
# Determine the number of groups from the groups factor
numGroups = length(unique(groups))
groups = as.numeric(as.factor(as.numeric(as.factor(groups))))
# Generate an M/Z table from the data if one is not provided
# Also determine the maximum M/Z value - this is the smallest maximum M/Z among the spectra
# NOTE: Using canonicalPeaks without providing an M/Z table is not a supported use, it is left here to provide flexibility in future file formats.
if (is.null(dim(M.Z)) & (class(M.Z) != "character")) {
tmpM.Z = as.numeric(dimnames(data)[[1]])
M.Z = matrix( rep(tmpM.Z, numSamples), ncol=numSamples, byrow=FALSE )
M.Z.max = max(tmpM.Z)
} else {
# Find the highest M/Z value in the dataset
if (Memory.Conserve == FALSE) {
M.Z.max = min(apply(M.Z, MARGIN=2, FUN=max))
} else {
M.Z.max = get.M.Z.Max(M.Z)
}
}
# Map all spectra to a common m/z axis and calculate the average spectrum for the data set
# This axis starts at m/z = 0 and steps by the value of M.Z.Step up to M.Z.Max
# Initialize the mean spectrum to zero
meanSpectrum = rep(0, M.Z.max / M.Z.Step + 1)
# Determine the mean spectrum for each group and add it to the mean spectrum for data set
for (curGroup in (1:numGroups)) {
groupVector = (groups == groups[curGroup])
groupMean = rep(0, length(meanSpectrum))
for ( curSample in ((1:numSamples)[groupVector]) ) {
if (Memory.Conserve == FALSE) {
groupMean = groupMean + (approx( M.Z[,curSample], data[,curSample], xout = ( 0:(M.Z.max / M.Z.Step) ) * M.Z.Step, rule=2 ))$y
} else {
next.data = scan( data[curSample], what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," )
groupMean = groupMean + (approx( next.data[[1]], next.data[[2]], xout = ( 0:(M.Z.max / M.Z.Step) ) * M.Z.Step, rule=2 ))$y
}
}
# Add the current group mean spectrum to the data set mean spectrum
meanSpectrum = meanSpectrum + groupMean / (sum(groupVector) * numGroups)
}
# Apply a small smoothing filter to the mean spectrum. This increases the support base when finding the local baseline
meanSpectrum = smoothingFilter(meanSpectrum, smoothingSpan)
# Locate peaks in the mean spectrum
if (M.Z.max > max(regions)) { regions = c(regions, M.Z.max) }
peakInfo = list()
peakInfo$M.Z.max = M.Z.max
peakInfo$spectrum.average = meanSpectrum
names(peakInfo$spectrum.average) = (0:(length(meanSpectrum)-1)) * M.Z.Step
peakInfo$left = vector()
peakInfo$center = vector()
peakInfo$right = vector()
peakInfo$F.Confidence = ""
# Calculate the area under the curve (AUC) at every point in the mean spectrum
areaMeanData = rep(0, length(meanSpectrum))
for (curRegion in (1:(length(regions) - 1)))
{
leftEdge = floor(regions[curRegion ] / M.Z.Step)
rightEdge = floor(regions[curRegion+1] / M.Z.Step)
# Calculate a running peak area across the composite spectrum
areaMeanData[leftEdge:rightEdge] <- (peakArea.M.Z(meanSpectrum, regions[curRegion], regions[curRegion+1], 0, max(regions), minPeakWidths[curRegion]))[leftEdge:rightEdge]
}
# Locate peaks in the AUC of the mean spectrum
for (curRegion in (1:(length(regions) - 1)))
{
# Find peaks in the AUC for this region of the spectrum
peakVector <- peaks.M.Z(areaMeanData, regions[curRegion], regions[curRegion+1], 0, max(regions), minPeakWidths[curRegion])
# Determine the left and right edges of each peak
edges <- findPeakEdges(meanSpectrum, maxPeakWidths[curRegion], peakWidthSteps[curRegion], peakVector, 0, max(regions))
# Append this set of peak edges to the list object being returned. Round peak positions to the nearest M.Z.Step
peakInfo$left = c(peakInfo$left , edges$left)
peakInfo$center = c(peakInfo$center, edges$center)
peakInfo$right = c(peakInfo$right, edges$right)
}
# Round values to three digits after the decimal because that is how R will round when the numbers are converted to a data.frame
peakInfo$center = round(peakInfo$center, 3)
peakInfo$left = round(peakInfo$left, 3)
peakInfo$right = round(peakInfo$right, 3)
# Set the names of entries in the peak edge vectors
names(peakInfo$center) = peakInfo$center
names(peakInfo$left) = peakInfo$center
names(peakInfo$right) = peakInfo$center
# Return the list of peaks
return(peakInfo)
}
#####################################################################
#
# findPeakEdges determines the left and right edges of each peak identified by canonicalPeaks.
# This function is meant to be called only from canonicalPeaks.
#
# Parameters:
# Data parameters
# data - Average spectrum for the data set, which was computed by the canonicalPeaks function.
#
# Peak width parameters
# maxPeakWidth - Specifies the largest peak window that will be tested in determining peak edges as a fraction of current m/z.
# widthSteps - The number of steps to take when locating the left and right edges of a peak. For example, widthSteps = 6 will try six positions for each edge moving outward from the center.
#
# Parameters from the calling function
# peakVector - A boolean vector specifying which spots in the average spectrum have a maxima in the AUC.
# M.Z.first - The lowest m/z included in the "data" vector.
# M.Z.last - The highest m/z included in the "data" vector.
#
# Output:
# A list with three vectors "left", "center", "right".
# "center" is the m/z position of the area under the curve maxima, "left" and "right" are the left and right width of the peak in m/z units.
#####################################################################
findPeakEdges <- function(data, maxPeakWidth, widthSteps, peakVector, M.Z.first, M.Z.last)
{
returnList = list()
# Step size between successive candidate edge positions, as a fraction of the peak's m/z
widthDelta = maxPeakWidth / widthSteps
# Indices of the AUC maxima flagged by the caller (peakVector is 0/1)
peakPositions = (1:length(peakVector))[as.logical(peakVector)]
# m/z spacing of one index step
# NOTE(review): denominator is length(peakVector) + 1 here, while the center
# calculation below divides by length(data) -- confirm the off-by-one is intended.
M.Z.step = (M.Z.last - M.Z.first) / (length(peakVector) + 1)
# Convert each peak index to its m/z position
returnList$center = M.Z.first + peakPositions / length(data) * (M.Z.last - M.Z.first)
# Fix the LEFT edge at half the maximum width and scan candidate RIGHT edges outward.
# NOTE(review): the result of this right-edge scan is stored in returnList$left
# (and the left-edge scan below in returnList$right). Downstream getPeakAreas
# consumes the labels at face value (center - $left, center + $right), so the
# widths appear mirrored -- verify intent before changing either site.
returnList$left = sapply(peakPositions, FUN = function(position, data, span, spanStep, widthSteps, firstMZ, lastMZ, stepMZ) {
# Calculate the m/z of this peak
M.Z = firstMZ + position / length(data) * (lastMZ - firstMZ)
# Fixed left edge half the maximum width below the peak; widthSteps candidate
# right edges moving outward from the center. Indices may be fractional; R
# truncates fractional subscripts when the vectors are indexed below.
leftEdge = position - (M.Z * span * 0.5) / stepMZ
rightEdge = position + (M.Z * spanStep * (1:widthSteps)) / stepMZ
# Clamp edges to the data range
rightEdge[rightEdge > length(data)] = length(data)
if (leftEdge < 1) { leftEdge = 1 }
# Score each candidate window: mean height over the interior minus a local
# baseline estimated from the outer 10% (negSpan points) on each side.
# The right edge selected is the first maximum moving out from the center
# (or the maximum peak width if no maximum is found).
if (rightEdge[1] - leftEdge > 8) {
areas = apply( cbind(rep(leftEdge, widthSteps), rightEdge), MARGIN=1, FUN=function(edges, data) {
negSpan = (edges[2] - edges[1]) * 0.1
if (negSpan < 1) { negSpan = 1 }
posSpan = edges[2] - edges[1] - negSpan * 2 + 1
return( sum(data[edges[1]:(edges[1]+negSpan-1)]) * -0.5 / negSpan + sum(data[(edges[1]+negSpan):(edges[2]-negSpan)]) / posSpan + sum(data[(edges[2]-negSpan+1):edges[2]]) * -0.5 / negSpan )
}, data)
} else { areas = c(1,0) } # Choose the first option by default if the window is not wide enough to calculate a local baseline
# Distance (in m/z units) from the center to the selected edge
return(firstMax(areas) * spanStep * M.Z)
}, data, maxPeakWidth, widthDelta, widthSteps, M.Z.first, M.Z.last, M.Z.step )
# Fix the RIGHT edge at half the maximum width and scan candidate LEFT edges outward
returnList$right = sapply(peakPositions, FUN = function(position, data, span, spanStep, widthSteps, firstMZ, lastMZ, stepMZ) {
# Calculate the current M/Z
M.Z = firstMZ + position / length(data) * (lastMZ - firstMZ)
# Fixed right edge; widthSteps candidate left edges moving outward from the center
rightEdge = position + (M.Z * span * 0.5) / stepMZ
leftEdge = position - (M.Z * spanStep * (1:widthSteps)) / stepMZ
# Clamp edges to the data range
if (rightEdge > length(data)) { rightEdge = length(data) }
leftEdge[leftEdge < 1] = 1
# Score each candidate window exactly as in the right-edge scan above.
# NOTE(review): leftEdge is already a length-widthSteps vector here, so
# rep(leftEdge, widthSteps) yields widthSteps^2 candidate rows (the candidate
# set repeated widthSteps times). firstMax still lands inside the first
# repetition, so this looks redundant rather than wrong -- confirm.
if (rightEdge - leftEdge[1] > 8) {
areas = apply( cbind(rep(leftEdge, widthSteps), rightEdge), MARGIN=1, FUN=function(edges, data) {
negSpan = (edges[2] - edges[1]) * 0.1
if (negSpan < 1) { negSpan = 1 }
posSpan = edges[2] - edges[1] - negSpan * 2 + 1
return( sum(data[edges[1]:(edges[1]+negSpan-1)]) * -0.5 / negSpan + sum(data[(edges[1]+negSpan):(edges[2]-negSpan)]) / posSpan + sum(data[(edges[2]-negSpan+1):edges[2]]) * -0.5 / negSpan )
}, data)
} else { areas = c(1,0) } # Choose the first option by default if the window is not wide enough to calculate a local baseline
# Distance (in m/z units) from the center to the selected edge
return(firstMax(areas) * spanStep * M.Z)
}, data, maxPeakWidth, widthDelta, widthSteps, M.Z.first, M.Z.last, M.Z.step )
return(returnList)
}
#####################################################################
#
# GetPeakAreas Takes the same "data" and "M.Z" parameters as canonicalPeaks. It takes the output of canonicalPeaks as the "peaks" parameter and determines the magnitude of each peak in each spectrum.
#
# Parameters:
# Data parameters
# data - The matrix of raw spectra with each spectrum in a column (same as in canonicalPeaks).
# M.Z - A matrix of m/z values corresponding to each point in the "data" matrix. This allows each spectrum to have a different m/z axis. If no "M.Z" is provided then the m/z values for each spectrum must be the same and they must appear as the dimnames of the "data" matrix.
# peaks - The list of center, left width, and right width assigned to every peak, as produced by the canonicalPeaks function.
#
# Parameters below only need to be modified for applications very different from protein profiling on a Ciphergen PBSIIc instrument in the m/z range from 3000 to 75000.
# smoothingSpan - The width in data points to apply a simple smoothing filter before determine peak areas. Filtering increases the range of support when local baseline is calculated for each peak.
# M.Z.Step - The M/Z spacing to use when integrating the AUC for each spectrum.
#
# Memory.Conserve - tells the function that the data and M.Z parameters do not contain values, but that each column contains instead the path to the spectrum that would occupy that column.
#
# Output:
# A matrix of peak magnitudes (AUC above the local baseline divided by the peak width), with one peak per row and one spectrum per column.
#
#####################################################################
getPeakAreas <- function(data, M.Z="", M.Z.Max, peaks, smoothingSpan=11, M.Z.Step=1, Memory.Conserve)
{
# Determine the number of spectra and their labels.
# With Memory.Conserve=TRUE the "data" argument is a vector of file paths, not a matrix,
# so the count comes from length() and the labels from names().
if (Memory.Conserve) {
numSamples = length(data)
spectrumNames = names(data)
} else {
numSamples = dim(data)[[2]]
# colnames() safely returns NULL when the matrix has no dimnames
spectrumNames = colnames(data)
}
# Generate an M/Z table from the data if one is not provided
# NOTE: Using this function without providing an M/Z table is not a supported use, it is left here to provide flexibility in future file formats.
if (is.null(dim(M.Z)) & (class(M.Z) != "character")) {
tmpM.Z = as.numeric(dimnames(data)[[1]])
M.Z = matrix( rep(tmpM.Z, numSamples), ncol=numSamples, byrow=FALSE )
}
# Initialize a matrix to hold peak magnitudes: one row per peak, one column per spectrum
peakAreas = matrix(rep(NA, length(peaks$center) * numSamples), ncol=numSamples, dimnames = list( peaks$center, spectrumNames))
# Iterate through each spectrum and get the peak areas.
# BUG FIX: this loop previously ran over 1:dim(data)[[2]], which fails when
# Memory.Conserve is TRUE because the path vector has no dim().
for (curSpectrum in seq_len(numSamples)) {
# Map the spectrum onto the common M/Z axis (m/z = 0 to M.Z.Max in increments of M.Z.Step)
if (Memory.Conserve == FALSE) {
mappedData = approx( M.Z[,curSpectrum], data[,curSpectrum], xout = 0:(M.Z.Max / M.Z.Step) * M.Z.Step, rule=2 )$y
} else {
# Read the spectrum from disk on demand (two comma-separated numeric columns, one header line)
curData = scan( data[curSpectrum], what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," )
mappedData = approx( curData[[1]], curData[[2]], xout = 0:(M.Z.Max / M.Z.Step) * M.Z.Step, rule=2 )$y
}
# Apply a smoothing filter to increase the support of the AUC calculation
mappedData = filter(mappedData, rep(1/smoothingSpan, smoothingSpan), circular = TRUE)
# Iterate through the peaks and determine the magnitude of each
peakAreas[,curSpectrum] = sapply(1:length(peaks$center), FUN=function(index, data, peaks) {
# Determine the left and right M/Z values of the peak (center minus/plus the stored widths)
left = peaks$center[index] - peaks$left[ index]
right = peaks$center[index] + peaks$right[index]
# Map left and right M/Z to indices, assuming the axis starts at zero with spacing M.Z.Step
left = floor(left / M.Z.Step)
right = floor(right / M.Z.Step)
# Check bounds on the left and right indices of the peak
if (left < 1) { left = 1 }
if (right > length(data)) { right = length(data) }
# Size of the flanking regions (outer ~10% each side) used for local baseline subtraction
negSpan = floor((right - left) * 0.1)
if (negSpan < 1) { negSpan = 1 }
posSpan = right - left - 2 * negSpan
# Return the total area over the local baseline divided by the peak width
return( (sum(data[left:(left+negSpan-1)]) * -0.5 / negSpan + sum(data[(left+negSpan):(right-negSpan)]) / posSpan + sum(data[(right-negSpan+1):right]) * -0.5 / negSpan) )
}, mappedData, peaks)
}
return(peakAreas)
}
#####################################################################
#
# TestAndNormalize Determine normalization factors for each spectrum, compute
# F-test scores for the reproducibility of each peak, compute chi-squared scores
# for the quality of each spectrum, then apply the requested thresholds and average
# peaks for replicates of the same sample.
#
# Parameters:
# peakValues - a matrix of peak magnitudes for each spectrum, baseline subtracted. NA is acceptable.
# identities - a character vector of spectrum labels, with the same label denoting the same biological source
# peak.confidence - a numeric value between 0 and 1 to threshold peaks for reproducibility by the F-test ( <0 specifying that all peaks should be accepted)
# spectrum.confidence - a numeric value between 0 and 1 specifying how much confidence is required to eliminate bad spectra by a chi-squared test (>1 specifying that all spectra should be accepted)
# standards.mask - a boolean vector where TRUE designates a spectrum that should not be included in the chi-sq test
#
# Output: a list with the following fields:
#
# peak.mask - Boolean vector of peaks that passed the F-test
# F.confidence - A numerical vector of the p-value from the F-test of each peak
# spectrum.mask - Boolean vector of spectra that passed the chi-squared test
# exclude.confidence - Numeric vector giving the p-value that each spectrum is "bad"
# peaks.normalized - Matrix of rescaled peak magnitudes, including all peaks but excluding eliminated spectra
# peaks.screened - Matrix of rescaled peak magnitudes, excluding peaks and spectra that did not pass their respective tests
# peaks.averaged - Matrix of rescaled peak magnitudes with values averaged among replicates of each sample
# identities.screened - Vector of identities for columns (spectra) retained in peaks.normalized and peaks.screened
# identities.averaged - Vector of identities for columns (samples) appearing in peaks.averaged
# scale.factors - Numeric vector of constants by which each spectrum was multiplied during normalization. These values should be checked for bias between groups.
#
#####################################################################
TestAndNormalize <- function(peakValues, identities, peak.confidence=0.95, spectrum.confidence=0.99, standards.mask)
{
nSpectra = ncol(peakValues)
# Check that the data matrix and identities vector match
if (nSpectra != length(identities)) {
stop("Mismatch between ncol(peakValues) and length(identities)")
}
# Phase 1 of significant peak calculation
# From the unnormalized peak values, find those which seem clearly significant (95% confident).
# If fewer than 10 peaks pass, relax the cutoff in 0.05 steps until enough pass or it reaches zero.
norm.cutoff = 0.95
peak.mask = FALSE
while ( (sum(peak.mask) < 10) & (norm.cutoff >= 0) ) {
peak.mask <- (findSignificantPeaks(peakValues, identities) > norm.cutoff)
norm.cutoff = norm.cutoff - 0.05
}
# Check that enough peaks are detected to do meaningful normalization
if (sum(peak.mask) >= 10) {
# Perform EM-normalization to obtain an estimate of the scaling factor per spectrum and outliers
est.scale.factor <- estimate.em(peakValues[peak.mask,])$est.scale
# return.code=1 signals the caller that normalization failed
if (any(is.na(est.scale.factor))) return(list(return.code=1))
} else {
print(noquote("Not enough peaks found for reliable normalization. Spectra will not be normalized."))
est.scale.factor = rep(1, nSpectra)
}
# Normalize the peak values: divide each spectrum (column) by its scale factor
peakValues.normalized <- t(t(peakValues)/est.scale.factor)
# Now determine which spectra are outliers in the normalized data
chisq.results <- estimate.chisq.fit(peakValues.normalized[peak.mask,], confidence=spectrum.confidence, standards.mask)
spectrum.mask <- chisq.results$mask
print(noquote(paste("Number of spectra eliminated = ", sum(!spectrum.mask))))
# If there are implausible spectra, recompute the scale factor without them
if (sum(!spectrum.mask) > 0) {
print(noquote("Spectra were eliminated so scale factors and peak reproducibility are being recalculated."))
if (sum(peak.mask) >= 10) {
# Perform EM-normalization again excluding the eliminated spectra;
# eliminated spectra keep an NA scale factor, so their normalized columns become NA
est.scale.factor <- rep(NA, nSpectra)
est.scale.factor[spectrum.mask] <- estimate.em(peakValues[peak.mask, spectrum.mask])$est.scale
if (any(is.na(est.scale.factor[spectrum.mask]))) return(list(return.code=1))
} else {
est.scale.factor = rep(1, nSpectra)
est.scale.factor[!spectrum.mask] = NA
}
# Re-normalize the peak heights with the recomputed factors
peakValues.normalized <- t(t(peakValues)/est.scale.factor)
}
# Phase 2 of significant peak calculation
# Recalculate the F-test using the normalized peaks and excluding implausible spectra
F.confidence <- findSignificantPeaks(peakValues.normalized[,spectrum.mask], identities[spectrum.mask])
peak.mask <- (F.confidence > peak.confidence)
# Calculate average peak values among replicates of each sample.
# NOTE(review): IDs is bound to the full "identities" vector, so any(IDs == ID)
# is always TRUE for every ID drawn from unique(identities) and the else branch
# below appears unreachable; eliminated samples instead surface as NaN columns
# (mean of all-NA with na.rm=TRUE). Confirm whether identities[spectrum.mask]
# was intended here before changing.
peaks.averaged = sapply( unique(identities), FUN=function(ID, data, IDs){
if (any(IDs == ID)) {
return(apply(as.matrix(data[,IDs==ID]), MARGIN=1, FUN=mean, na.rm=TRUE))
} else {
return(rep(NA, dim(data)[[1]]))
}
}, peakValues.normalized[peak.mask,], identities )
# Label the output columns
dimnames(peaks.averaged)[[2]] = unique(identities)
dimnames(peakValues.normalized)[[2]] = identities
# return.code=0 indicates success; see the header comment for the meaning of each field
return(list(return.code=0, peak.mask=peak.mask, F.confidence=F.confidence, spectrum.mask=spectrum.mask, exclude.confidence=chisq.results$exclude.confidence,
peaks.normalized=peakValues.normalized, peaks.screened=peakValues.normalized[peak.mask,],
peaks.averaged=peaks.averaged, identities.screened=identities[spectrum.mask], identities.averaged=unique(identities),
scale.factors=est.scale.factor
))
}
#####################################################################
#
# Functions below assist "TestAndNormalize"
#
#####################################################################
# Reject those spectra which have a statistically implausible set of peaks
# confidence is the level of confidence we need in order to dismiss a spectrum
# Thus, higher confidence means less likely to call a spectrum an outlier
# Spectra marked as TRUE in standards.mask are not included in the chi-squared calculation,
# although they still receive a score.
# Reject spectra whose set of peak values is statistically implausible.
# "confidence" is how confident we must be before dismissing a spectrum, so a
# higher value eliminates fewer spectra. Spectra flagged TRUE in standards.mask
# are excluded from the reference statistics and always kept, but still scored.
estimate.chisq.fit <- function(peakValues, confidence=0.99, standards.mask)
{
  # Reference distribution per peak, built only from the non-standard spectra
  reference <- peakValues[, !standards.mask]
  row.means <- apply(reference, 1, mean, na.rm = TRUE)
  row.vars <- apply(reference, 1, var, na.rm = TRUE)
  usable.counts <- apply(!is.na(reference), 1, sum)
  # Squared z-score of every measurement against its peak's reference distribution
  squared.z <- ((peakValues - row.means)^2) / row.vars
  # Use only peaks with sufficient non-NA support (more than 10 values)
  well.supported <- usable.counts > 10
  per.spectrum.stat <- apply(squared.z[well.supported, ], 2, sum, na.rm = TRUE)
  per.spectrum.df <- apply(!is.na(squared.z[well.supported, ]), 2, sum, na.rm = TRUE)
  # Probability that each spectrum is an outlier under the chi-squared model
  badness <- pchisq(per.spectrum.stat, df = per.spectrum.df)
  list(mask = ((badness <= confidence) | standards.mask), exclude.confidence = badness)
}
# Find all the peaks for whom the signal is greater than the measurement noise
# at the specified confidence. findSignificantPeaks was used in the paper.
# findSignificantPeaks.new performs a more complete F-test, and is able to handle
# any number of replicates (the original handles only duplicates).
# Sensitivity thresholds for the two functions are not interchangeable.
# Score each peak's reproducibility with an F-ratio built from duplicate spectra:
# variance of the pair sums over mean squared pair differences. Returns one
# p-value per peak (untestable peaks score 0). Only the first two replicates of
# each sample are used (see find.pairs).
findSignificantPeaks <- function(peakHeights, identities)
{
  replicate.pairs <- find.pairs(identities)$pairs
  # Sum and difference of each duplicate pair, per peak
  pair.sums <- peakHeights[,replicate.pairs[1,]] + peakHeights[,replicate.pairs[2,]]
  pair.diffs <- peakHeights[,replicate.pairs[1,]] - peakHeights[,replicate.pairs[2,]]
  # Signal variance relative to measurement noise
  f.ratio <- apply(pair.sums, 1, var, na.rm=TRUE)/apply(pair.diffs^2, 1, mean, na.rm=TRUE)
  usable <- apply(!is.na(pair.sums), 1, sum)
  numerator.df <- usable - 1
  denominator.df <- usable
  # A peak is testable only with at least two usable pairs and a finite ratio
  testable <- (numerator.df > 0) & (!is.na(f.ratio))
  significance <- rep(NA, length(testable))
  significance[testable] <- pf(f.ratio[testable], numerator.df[testable], denominator.df[testable])
  # Untestable peaks get zero confidence
  significance[is.na(significance)] <- 0
  return(significance)
}
#Finds all pairs of spectra from the same sample.
#In cases when there are more than two spectra, only selects the first two copies
# Find one pair of spectrum indices per sample that has at least two replicates.
# When a sample has more than two spectra, only the first two are used.
# Returns a list with "pairs" (2-row matrix, one column per paired sample) and
# "identities.paired" (the sample label for each column).
find.pairs <- function(identities)
{
  # Group spectrum indices by sample label
  by.sample <- tapply(seq_along(identities), identities, function(idx) idx)
  sample.names <- names(by.sample)
  paired.names <- vector()
  pair.columns <- vector()
  for (g in seq_along(by.sample)) {
    members <- by.sample[[g]]
    if (length(members) > 1) {
      # Keep only the first two replicates of this sample
      pair.columns <- cbind(pair.columns, members[1:2])
      paired.names <- c(paired.names, sample.names[g])
    }
  }
  return(list(pairs = pair.columns, identities.paired = paired.names))
}
# This function is not currently used. It will allow for correct handling of replicate numbers higher than two.
# Per-peak F-test across any number of replicates. Q1/Q2 below are the
# between-group and within-group sums of squares of a one-way decomposition,
# computed per peak (row) with NA-tolerant accumulation.
findSignificantPeaks.new <- function(peakHeights, identities, confidence=0.95)
{
# Group spectrum indices by sample label
groups <- tapply(1:length(identities), identities, function(idx){idx})
group.identities <- names(groups)
# T.j2: squared per-group totals for each peak (one column per group)
T.j2 <- sapply(groups,
function(idx, ph)
{
#If there is only one sample, the total is the value itself
#This special case is needed otherwise S-plus apply does not work
if (length(idx) < 2) {
return(ph[,idx]^2)
} else {
return(apply(ph[,idx], 1, sum, na.rm=TRUE)^2)
}
},
peakHeights,
simplify=TRUE)
# nj: number of non-NA replicates per group for each peak
nj <- sapply(groups,
function(idx, ph)
{
#If there is only one sample, the count is 0 or 1
#This special case is needed otherwise apply does not work
if (length(idx) < 2) {
return(!is.na(ph[,idx]))
} else {
return(apply(!is.na(ph[,idx]), 1, sum, na.rm=TRUE))
}
},
peakHeights,
simplify=TRUE)
T.j2[is.na(T.j2)] <- 0
T.j2.over.nj <- T.j2/(nj + (nj == 0)) #Prevent divide by zero
sum.T.j2.over.nj <- apply(T.j2.over.nj, 1, sum)
# n.count: total replicate count per peak; k.count: number of non-empty groups
n.count <- apply(nj, 1, sum)
k.count <- apply((nj > 0), 1, sum)
# Grand-total term of the sum-of-squares decomposition
T..2.over.n <- (apply(peakHeights, 1, sum, na.rm=TRUE)^2)/n.count
sum.Yij2 <- apply(peakHeights^2, 1, sum, na.rm=TRUE)
# Q1: between-group sum of squares; Q2: within-group sum of squares
Q1 <- sum.T.j2.over.nj - T..2.over.n
Q2 <- sum.Yij2 - sum.T.j2.over.nj
f.degrees.1 <- k.count-1
f.degrees.2 <- n.count-k.count
# A peak is testable only when both degrees of freedom are positive
valid <- (f.degrees.1 > 0) & (f.degrees.2 > 0)
f.ratio.valid <- (Q1[valid]/f.degrees.1[valid])/(Q2[valid]/f.degrees.2[valid])
f.degrees.1.valid <- f.degrees.1[valid]
f.degrees.2.valid <- f.degrees.2[valid]
f.significance.valid <- pf(f.ratio.valid, f.degrees.1.valid, f.degrees.2.valid)
# Untestable peaks keep NA significance
f.significance <- rep(NA, length(valid))
f.significance[valid] <- f.significance.valid
# confidence == 0 returns raw p-values; otherwise a boolean pass/fail mask
if (confidence == 0)
return(f.significance)
else
return((f.significance > confidence) & (!is.na(f.significance)))
}
# Use the EM algorithm to find the maximum likelihood estimates of scale factors for each spectrum
# y is an array of peak heights
# Each peak is a row, each sample is a column
# returned value est.a is the estimate of scale factor
estimate.em <- function(y, trim=0, maxIter=20, maxDeltaRatio=0.001, a.max=5.0)
{
# Scale factors are constrained to the interval [1/a.max, a.max]
a.min <- 1/a.max
a <- rep(1, ncol(y)) #Set initial estimates of a to be all equal
loga.var <- 1 #Works out nicely as first calculation (equal weighting)
for (j in 1:maxIter) {
a.old <- a
# Rescale every spectrum (column) by its current scale estimate
z <- t(t(y)/a)
# Per-peak mean and variance of the rescaled data
peak.means <- apply(z, 1, mean, na.rm=TRUE, trim=trim)
peak.vars <- apply(z^2, 1, mean, na.rm=TRUE, trim=trim) - peak.means^2
# Check for excessive weight assigned to one peak, this may indicate that the
# calculation is converging to a singularity; inflate such peaks' variances
peak.weights <- (peak.means^2)/peak.vars
peak.vars <- peak.vars * rescaleWeights(peak.weights)
# Per-spectrum coefficients of the update equation for beta = 1/a
coeff.a <- apply(y^2/peak.vars, 2, sum, na.rm=TRUE)
coeff.b <- -(1/loga.var + apply(y*peak.means/peak.vars, 2, sum, na.rm=TRUE))
coeff.c <- (1/loga.var - 1)
# NOTE(review): these expressions look like approximate (series) roots of the
# quadratic a*beta^2 + b*beta + c rather than the exact quadratic formula --
# confirm against the original derivation before altering.
beta.root.1 <- (-coeff.b/coeff.a + coeff.c/coeff.b)
beta.root.2 <- -coeff.c/coeff.b
a <- 1/pmax(beta.root.1, beta.root.2)
# Bound the values of a
a <- pmax(a.min, a)
a <- pmin(a.max, a)
loga.mean <- mean(log(a))
a <- a/exp(loga.mean) #Renormalize a to have E[log(a)] = 0
loga.var <- mean(log(a)^2, trim=trim)
# Check for convergence criteria of less than 1% change in scale factor
delta <- max(abs((a - a.old)/a.old))
#print(paste("Iteration", j, " delta =", delta))
# Stop early if the estimates went singular or the relative change is small enough
if (any(is.na(a))) break;
if (delta < maxDeltaRatio) break;
}
return(list(est.scale=a, est.loga.var=loga.var))
}
#
# Rescale returns a vector by which to multiply the denominator of the original weights
# By doing so, the rescaling will bring the largest weight down to w.limit if needed.
#
rescaleWeights <- function(w, w.limit=0.2)
{
  # Normalize the weights so they sum to one
  w <- w/sum(w)
  max.w <- max(w)
  if (max.w > w.limit) {
    # Warn that the EM calculation is approaching a singularity
    print(paste("Got close to singularity with weight = ",max.w))
    # Threshold chosen so the largest rescaled weight comes down to w.limit
    w.star <- w.limit * (1-max.w)/(1-w.limit)
    rescale <- w/pmin(w, w.star)
  } else {
    # All weights are within bounds; no rescaling needed
    rescale <- rep(1, length(w))
  }
  # BUG FIX: the function previously ended on an assignment, relying on the
  # assignment's (invisible) value as the implicit return. Return explicitly.
  return(rescale)
}
######################
#
# Helper functions
#
######################
# Returns the minimum of the highest M/Z value from each file in the "paths" parameter
# Return the minimum, over the files in "paths", of each file's highest m/z value.
# Each file is expected to have one header line followed by comma-separated
# "m/z,intensity" rows.
get.M.Z.Max <- function(paths)
{
  # 0 doubles as a "nothing read yet" sentinel
  smallest.max <- 0
  for (spectrum.file in paths) {
    columns <- scan( spectrum.file, what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," )
    file.max <- max(columns[[1]])
    # Keep the smallest per-file maximum (the first file always replaces the sentinel)
    if ((file.max < smallest.max) | (smallest.max == 0)) {
      smallest.max <- file.max
    }
  }
  return(smallest.max)
}
# Used in "findPeakEdges". Returns the index of the first value in "data" that is not smaller than the next value or else the index of the last value.
# Used in "findPeakEdges". Returns the index of the first element of "data"
# that is not smaller than its successor, or the last index if the values
# never decrease.
firstMax <- function(data)
{
  # Flag every position whose value dropped relative to its predecessor;
  # the appended TRUE selects the final index when values keep increasing.
  dropped <- c(FALSE, diff(data) < 0, TRUE)
  # Logical subscripting (not which()) so any NA comparisons propagate as before
  flagged <- (seq_along(dropped))[dropped]
  # The element just before the first drop is the first local maximum
  return(flagged[1] - 1)
}
# Apply a uniform smoothing window to the data
# Apply a uniform (moving-average) smoothing window of width "span" to the data.
# The non-circular filter leaves NA at both ends; those are replaced with zeros.
smoothingFilter <- function(data, span = 11)
{
  kernel <- rep(1/span, span)
  smoothed <- as.vector(filter(data, kernel, circular = FALSE))
  # Zero out the NA edge values produced by the centered filter
  smoothed[is.na(smoothed)] <- 0
  return(smoothed)
}
## Wrapper functions for calls to the C code
# Wrapper for the compiled routine "cPeaksMZ": locate AUC maxima in "data"
# between M.Z.start and M.Z.end (the data vector spans M.Z.first..M.Z.last).
# "ret" is initialized to zeros and filled by the C code; callers treat it as a
# 0/1 flag vector (see findPeakEdges' as.logical(peakVector)).
# NOTE(review): DUP=FALSE was deprecated in R 3.1.0 and ignored/defunct in
# later versions -- confirm the targeted R version before relying on it.
peaks.M.Z <- function(data, M.Z.start, M.Z.end, M.Z.first, M.Z.last, fSpan) {
return( .C("cPeaksMZ", data, as.integer(M.Z.start), as.integer(M.Z.end), as.integer(M.Z.first), as.integer(M.Z.last), as.integer(length(data)), fSpan, ret=as.integer(rep(0, length(data))), DUP=FALSE)$ret )
}
# Wrapper for the compiled routine "cPeakAreaMZ": compute a running peak-area
# trace over "data" between M.Z.start and M.Z.end (the data vector spans
# M.Z.first..M.Z.last). "ret" is a numeric buffer filled by the C code.
# NOTE(review): DUP=FALSE was deprecated in R 3.1.0 and ignored/defunct in
# later versions -- confirm the targeted R version before relying on it.
peakArea.M.Z <- function(data, M.Z.start, M.Z.end, M.Z.first, M.Z.last, fSpan) {
return( (.C("cPeakAreaMZ", data, as.integer(M.Z.start), as.integer(M.Z.end), as.integer(M.Z.first), as.integer(M.Z.last), as.integer(length(data)), fSpan, ret=numeric(length(data)), DUP=FALSE) )$ret )
}
##