##################################################################### # # Peaks_R_1.3.R - Version 1.3 # # Date released: 7/20/2005 # # Version History: # # Version 1.3: released 7/20/2005 # - spectra.dir now accepts a vector of directories. # - Spectra with too few reproducible peaks no longer cause an error during normalization. # - Eliminated spectra appear in the output files as columns of NA instead of being left out completely. # - Spectra marked by 0 in a Find.Peaks column of the metadata are no longer included in calculation of the chi-squared test # They still receive scores so that the difference between control and experimental spectra can be measured. # - Leaving peakPlotMinMZ blank defaults to the lowest value in m.z.regions instead of giving an error. # # Version 1.2.1: released 6/16/2005 # - Changing the number of regions no longer gives "Error in 1:widthSteps : NA/NaN argument". # - Large negative values no longer occur for very narrow peaks. # - getPeakAreas respects the m.z.step parameter. # - An error is now generated if the metadata contains duplicate entries. # - Fixed a bug in the fraction of the AUC kernel sometimes being used for the negative regions. # - Added the peakWidthSteps parameter so that users can control the graininess with which peak edges are detected. # - Added peakPlotMinMZ and peakPlotMaxMZ parameters to control the region that is plotted in the peak plot. # # Version 1.2: released 5/27/2005 # - The Memory.Conserve=FALSE flag has been added to the parameter list of getPeaks. Setting this flag to TRUE # will cause spectra to be read from the hard-drive on demand instead of residing in memory. This eliminates # the major memory bottleneck but each spectrum will have to be read several times from the hard-drive. Use this option # if you are hitting the 4 GB limit on addressable memory or otherwise running out of memory while analyzing your dataset. # - The Peaks.Show=TRUE flag has been added to getPeaks. If the flag is TRUE then R will generate a plot # showing the averaged spectrum and the positions of all the peaks that passed the F-test. # - Peak positions are now rounded to m.z.step in the output file of peak magnitudes, so that peak names will not be absurdly long. # (does not affect the internal representation, and the peak bounds output file has positions to full precision so that the bounds # remain exactly the same when that peak bounds file is loaded in the future). # - The output list from getPeaks contains an element "spectrum.average" which is a numeric vector of the mean spectrum for the dataset. # The M/Z axis is uniform with a spacing between points equal to m.z.step. # - A peak.bounds.source parameter has been added to getPeaks to allow peak positions to be shared across analyses. This should point # to the peak bounds output file produced by an earlier analysis. If this parameter is specified then peaks will not be detected and # the peak bounds from the file will be used instead. (Please be careful comparing peaks values from samples that were not normalized together) # - Added a column F.Confidence to the output file of peak positions. This is the p-value of each peak in the F-test. # - Fixed the annoying uptick that could occur at the end of the mean spectrum due to a circular filter. # - Fixed a bug when the F-test confidence threshold was set to zero. # - Function named "preclassification" changed to "TestAndNormalize" to better describe its function. # - The code has been tested without incident in the R environment version 2.0.1 # # Version 1.1: released 1/31/2005 # - The estimate.em function now tests for convergence to a singularity by calling rescale.weights # on each interation. This checks if any peak has been assigned a weight greater than the w.limit parameter # (by default it equals 0.2 out of total weight of 1). If any peak has too much weight then a warning is printed # and the weights are rescaled so that w.limit is the highest weight. # - An error is now generated if the normalization fails for any reason. # # # Email Scott Carlson (scottmc@stanford.edu) with any questions or if there are features that would make SSA more useful for your particular data. # Also send an email if you find any bugs and I will release a fix as quickly as possible. # # COPYRIGHT NOTICE: This software, related source code, compiled code, and example data are all copyright of Harvey Cohen Research Group in the Department # of Pediatrics in the Stanford University School of Medicine. They are freely available for non-commercial purposes with citation. Commercial use # allowed only with express permission from Harvey Cohen or appropriate an authority within Stanford University. # # LIABILITY NOTICE: This software is provided as-is and the Harvey Cohen lab and its affiliates are not for responsible for damages occuring # from or through the use of this software, including but not limited to normal operations, improper or illegal use, or errors in the software itself. # ##################################################################### ##################################################################### # # getPeaks This is the main function used to extract peaks from spectra. # # Parameters: # Either spectra.dir or spectra.paths must be specified. If spectra.dir is specified then all files matching the regular expression given by # files.pattern will be loaded as spectra. Alternatively, spectra.paths may be used to specify specific files to load as spectra. spectra.paths is # ignored if spectra.dir is given. # # metadata.path (required) - the complete path of a file in a table format recognized by R. It must include at least three columns, in order: the file name # of each spectrum, the sample name for that spectrum (used to average replicates), and the group membership of that spectrum. Additional columns may appear # in the metadata, and names of these columns should be listed as a character vector in the "additional.metadata" parameter. If one of the # additional.metadata entries is "Find.Peaks" then that column will be used as a bit mask specifying which spectra to use when determining # peak location and edges (peak areas will still be calculated for all spectra). # # output.path.screened (optional) - the complete path of an output file that will contain normalized peak magnitudes for every peak and spectrum that # passes the F-test and Chi-squared test respectively. # # output.path.all (optional) - the complete path of an output file that will contain normalized peak magnitudes for every peak and in every spectrum that # passes the Chi-squared test (including peaks that fail the Chi-squared test). # # output.path.averaged (optional) - the complete path of an output file that will contain normalized peak magnitudes for every peak averaged for every unique # sample name (using only peaks and spectra that passed their respective quality tests, samples names with no passing spectra are assigned NA at every peak). # # output.path.peak.bounds (optional) - the complete path of an output file that will contain a table of the peak positions and edges, including peaks that do not # pass the F-test. # # min.peak.widths (optional) - sets the minimum spacing between peaks as a fraction of the current m/z (so 0.01 means minimum spacing of 1%). This can be given as a numeric # vector, as in c(0.004, 0.008), when different peak widths are to be applied to different m/z ranges. The default is set for a Ciphergen PBS-IIC SELDI-TOF-MS using # fractionated plasma with SPA (all default settings are for this instrument). # # max.peak.widths (optional) - sets the largest peak width that will be tested when determining peak edges. Specified in the same manner as min.peak.widths. # # peakWidthSteps (default = 6) - sets the number of steps that are taken while searching for the left and right edge of each peak. # # m.z.regions (optional) - specifies where m/z regions start and end for each of the peak widths that was given. For example, c(3000, 18000) applies the first entry in # each peak width vector to the m/z region from 3000 to 18000, and the second entry in each width vector to the range from 18000 to the end. # # peak.bounds.source (optional) - specifies the path of a file containing peak bounds (the peak bounds file that is output to the path in output.path.peak.bounds). # If this parameter is specified then the peaks from the file will be used and no other peaks will be detected. # # m.z.step (default = 0.5) - gives the m/z data resolution used when calculating peaks. Allocating the array of uniform m/z spectra is a major memory bottleneck. If you are # having memory problems then try increasing the m.z.step so that fewer points per spectrum will be calculated. # # F.test.threshold (default = 0.95) - this parameter sets the minimum confidence that peaks' magnitude exceeds their noise. Peaks with lower confidence on the F-test are not included # in output files specified by output.path.screened or output.path.averaged. # # chi.sq.threshold (default = 0.99) - specifies the confidence needed to exclude a bad spectrum. Lowing the value will result in a larger number of excluded spectra. # # additional.metadata (required if metadata column present) - a character vector giving names to each column after the 3rd that appears in the metadata file. # # Memory.Conserve (default = FALSE) - a boolean flag that will cause spectra to be loaded from the hard-drive on demand instead of residing in memory. Use this option if you are hitting the maximum on allocatable memory. # # Peaks.Show (default = TRUE) - a boolean flag that will cause R to generate a plot showing the averaged spectrum and the position of peaks on that spectrum. Only peaks that pass the F-test are shown. # # peakPlotMinMZ (default = 0) - a numeric value giving the lowest m/z value to plot when drawing the average spectrum on the peak plot. # # peakPlotMaxMZ (default = highest m/z) - like peakPlotMinMZ for the highest m/z value in the peak plot. # # # Output: returns a list with the peak magnitudes at several stages of processing. Elements of the list are # return.code - contains a zero if the function returned correctly, a one if the function detected an error. # peak.mask - a boolean vector with length equal to the number of peaks detected before F-test screening. Specifies if each peak passed the F-test with the specified confidence. # spectrum.mask - a boolean vector with length equal to the number of spectra. Specifies whether each spectrum passed the chi-squared plausibility test. # exclude.confidence - numeric vector length one entry per spectrum, giving the p-value that each spectrum is "bad" # peaks.normalized - an n x m matrix where n is the number of peaks, m is the number of spectra. Peak values are given after EM-normalization and every peak is included # regardless of its performance on the F-test. Each entry corresponds to the magnitude that peak in each spectrum (technically the AUC divided by peak width). # peaks.screened - similar to peaks.normalized, but including only peaks that passed the F-test with given confidence. # peaks.averaged - an i x k matrix where i is the number of peaks passing the F-test and k is the number of individuals in the experiment (not the number of spectra). # Replicate measurements of individuals have been averaged. # identities.screened - Vector of identities for columns (spectra) retained in of peaks.normalized and peaks.screened. # identities.averaged - Vector of identities for columns (samples) appearing in peaks.averaged. # scale.factors - Numeric vector of constants by which each spectrum was multiplied during normalization. These values should be checked for bias between groups. # spectrum.average - a numeric vector containing the mean spectrum on which peaks are detected. The names of the vector elements contain the m/z for each point. # peaks.bounds - A data frame with three columns, giving the center, left width, and right width of every peak. # status - A vector containing group memberships for spectra in the order that they appear in the output matrices. # status.averaged - A vector containing group memberships for columns of averaged replicates in the order that they appear in the output matrix with averaged replicates. # metadata - A data frame containing the metadata file. Row names are the file names taken from the first column of the file, the first column contains sample # names, the second has group membership, and additional columns are as specified # ##################################################################### getPeaks <- function( spectra.dir="", files.pattern="", spectra.paths="", metadata.path, output.path.screened="", output.path.all="", output.path.averaged="", output.path.peak.bounds="", min.peak.widths=c(0.004, 0.008), max.peak.widths=c(0.01, 0.02), peakWidthSteps=6, m.z.regions=c(3000, 18000), peak.bounds.source="", m.z.step=0.5, F.test.threshold=0.95, chi.sq.threshold=0.99, additional.metadata=c(""), Memory.Conserve=FALSE, Peaks.Show=TRUE, peakPlotMinMZ=m.z.regions[1], peakPlotMaxMZ="") { ## Read the metadata file ## print(noquote("Loading spectra")) # Check that the metadata file exists and can be read if ( file.access( metadata.path, 0) != 0 ) { stop( paste("Metadata file", metadata.path, "does not exist.", sep=" ")); next } if ( file.access( metadata.path, 4) != 0 ) { stop( paste("Metadata file", metadata.path, "exists but can not be read.", sep=" ")); next } # Load the metadata file if ( additional.metadata[1] == "") { metadata = read.table(metadata.path, sep="\t", col.names=c("File.Names", "Identities", "Groups"), row.names=NULL) } else { metadata = read.table(metadata.path, sep="\t", col.names=c("File.Names", "Identities","Groups", additional.metadata), row.names=NULL) } # Turn the first column into dimnames for the rows if (any(duplicated(metadata[,1]))) { print( noquote("Some metadata entries appear more than once in the metadata file:") ) print( noquote(as.character(unique(metadata[duplicated(metadata[,1]),1]))) ) stop() } dimnames(metadata)[[1]] = metadata[,1] metadata = metadata[,-1] ## Load spectra ## if (spectra.paths[1] == "") { spectra.data = load.spectra(spectra.dir = spectra.dir, files.pattern = files.pattern, Memory.Conserve = Memory.Conserve) } else { spectra.data = load.spectra(spectra.paths = spectra.paths, Memory.Conserve = Memory.Conserve) } if ( length( dim(spectra.data$data)[[2]] ) == 0 ) { stop("No spectrum files loaded. Quitting analysis.") } # Remove paths from the file names file.names = sapply(spectra.data$file.names, FUN=function(full.path) {substring(full.path, attr(regexpr(".*[\\]", full.path), "match.length")+1 )}) if (any(duplicated(file.names))) { stop("All files must have unique names, even if they are stored in different directories.") } # Remove extensions from the file names. This is an ugly line of code because it needs to work correctly in the presence of any number of . characters. file.names = unlist(lapply( strsplit(file.names,"\\."), FUN=function(charvect){return(paste(charvect[1:(length(charvect)-1)], collapse="."))})) # Check that all spectra have metadata entries metadata.file.names = dimnames(metadata)[[1]] file.matches = match(file.names, metadata.file.names, nomatch=NA) if ( any(is.na(file.matches)) ) { print(noquote("Some files do not have metadata entries: (check that the metadata file DOES NOT include directory paths or file extentions)")) print(noquote(paste("\tMissing metadata:", file.names[is.na(file.matches)], sep=" "))) return(list(return.code=0)) } # Assign names to the spectra in each data matrix dimnames(spectra.data$data)[[2]] = as.character(as.matrix(metadata[file.names, 1])) dimnames(spectra.data$M.Z)[[2]] = as.character(as.matrix(metadata[file.names, 1])) # Create the boolean mask that determines which spectra are used in determining canonical peaks # (peak magnitudes will still be calculated for all spectra) if ( any(additional.metadata == "Find.Peaks") ) { use.for.peaks = (as.character(as.matrix(metadata[file.names,"Find.Peaks"])) == "1") } else { use.for.peaks = rep(TRUE, length(file.names)) } ## Locate peaks in the data or load the peak list from a file ## if (peak.bounds.source == "") { # Calculate peak locations from the data print(noquote("Identifying peak locations")) peak.bounds = canonicalPeaks( spectra.data$data[,use.for.peaks], spectra.data$M.Z[,use.for.peaks], groups = metadata[file.names, 2][use.for.peaks], regions=m.z.regions, minPeakWidths=min.peak.widths, maxPeakWidths=max.peak.widths, peakWidthSteps=rep(peakWidthSteps, length(m.z.regions)), M.Z.Step=m.z.step, smoothingSpan=11, Memory.Conserve = Memory.Conserve ) } else { # Load peak bounds from a file print(noquote("Loading peak locations")) peak.bounds = load.peak.bounds(peak.bounds.source) if (peak.bounds$success == FALSE) { stop("Error: failure loading peaks from the file.") } # Determine what is the lowest final M/Z among the spectra peak.bounds$M.Z.max = get.M.Z.Max(spectra.data$file.names) } # Check for errors in peak finding if (is.null(peak.bounds$center)) { stop("Error: no peaks found.") } if (length(peak.bounds$center)==0) { stop("Error: no peaks found.") } print(noquote(paste( "Total peaks found:", length(peak.bounds$center), sep=" "))) ## Get peak magnitudes for each peak in each spectrum ## print(noquote("Determining peak area in each spectrum")) peak.magnitudes = getPeakAreas( spectra.data$data, spectra.data$M.Z, peak.bounds$M.Z.max, peak.bounds, smoothingSpan=11, M.Z.Step=m.z.step, Memory.Conserve = Memory.Conserve) ## Normalize spectra, select reproducible peaks, and remove implausible spectra ## print(noquote("Calculating normalization coefficients and reproducibility checks")) status.averaged = list() # To keep all peaks set F.test.threshold = 0, and to keep all spectra set spectrum.confidence >= 1 peaks.processed = TestAndNormalize(peak.magnitudes, as.matrix(metadata[file.names, 1]), peak.confidence=F.test.threshold, spectrum.confidence=chi.sq.threshold, standards.mask=!use.for.peaks) if (peaks.processed$return.code == 1) { print(noquote("Error normalizing spectra: an error was returned while normalizing spectra. Please check the format of spectra files and the metadata file. Email details of the problem to Scott Carlson (scottmc@stanford.edu) if this appears to be a glitch with the code.")) return(list(return.code=1)) } # Determine group assignments for columns of the replicate averaged peak matrix status.averaged = sapply( peaks.processed$identities.averaged, FUN=function(ID, groups, IDs){ return( groups[ (IDs==ID) ][[1]] ) }, metadata[file.names, 2], metadata[file.names, 1]) ## Write the output files and build the return list ## print(noquote("Writing output files, if requested")) if ( output.path.screened != "" ) { write.table(peaks.processed$peaks.screened, output.path.screened, sep="\t") } if ( output.path.all != "" ) { write.table(peaks.processed$peaks.normalized, output.path.all, sep="\t") } if ( output.path.averaged != "" ) { write.table(peaks.processed$peaks.averaged, output.path.averaged, sep="\t") } # Generate a data.frame of peak positions and peak widths. These values are all rounded to the nearest m.z.step peak.frame = data.frame(Center=peak.bounds$center, Left=peak.bounds$left, Right=peak.bounds$right, Passed = peaks.processed$peak.mask, F.Confidence = peaks.processed$F.confidence) if ( output.path.peak.bounds != "" ) { write.table(peak.frame, output.path.peak.bounds, sep="\t", row.names=FALSE) } ## Print a summary ## print(noquote("")) print(noquote("Data Summary:")) print(noquote(paste( "Spectra loaded:", dim(spectra.data$data)[[2]], sep=" "))) spectra.kept = sum(peaks.processed$spectrum.mask) print(noquote(paste( "Spectra kept:", spectra.kept, sep=" "))) if ( any(!peaks.processed$spectrum.mask) ) { print(noquote(paste("\tSpectra removed:", file.names[peaks.processed$spectrum.mask==F],sep=" "))) } print(noquote(paste( "Unique samples:", length(unique(as.matrix(metadata[,1]))), sep=" "))) print(noquote(paste( "Peaks kept with", F.test.threshold, "confidence:", sum(peaks.processed$peak.mask), sep=" "))) print(noquote("")) print(noquote("Done")) ## Show a plot of the peak locations ## if (Peaks.Show) { if (peak.bounds.source=="") { # Generate a plot showing the average spectrum if (peakPlotMaxMZ == "") { peakPlotMaxMZ = peak.bounds$M.Z.max } plot(m.z.step * (floor(peakPlotMinMZ/m.z.step):floor(peakPlotMaxMZ/m.z.step)), peak.bounds$spectrum.average[floor(peakPlotMinMZ/m.z.step):floor(peakPlotMaxMZ/m.z.step)], type="l", xlab="M/Z", ylab="Mean Intensity", main="Peaks With Reproducible Measurements") # Add peak markers to the plot points(peak.frame$Center[peak.frame$Passed], peak.bounds$spectrum.average[floor(peak.frame$Center[peak.frame$Passed] / m.z.step)], pch=20, col="red") # These plot peak boundaries but can give a very messy plot. #points(peak.frame$Center[peak.frame$Passed]-peak.frame$Left[peak.frame$Passed], peak.bounds$spectrum.average[floor(peak.frame$Center[peak.frame$Passed] / m.z.step)], pch=20, col="blue") #points(peak.frame$Center[peak.frame$Passed]+peak.frame$Right[peak.frame$Passed], peak.bounds$spectrum.average[floor(peak.frame$Center[peak.frame$Passed] / m.z.step)], pch=20, col="blue") } else { print(noquote("A plot of the average spectrum could not be drawn because the average spectrum is not calculated if peak locations are loaded from a file.")) } } return(c(peaks.processed, list(spectrum.average=peak.bounds$spectrum.average, peak.bounds=peak.frame, status=metadata[file.names, 2], status.averaged=status.averaged, metadata = metadata))) } ##################################################################### # # load.spectra Load all spectra in the directory specified by spectra.path, or alternatively load each file specified in the charater vector spectra.paths # # Parameters: either spectra.dir or spectra.paths must be specified. If spectra.dir is specified then all files matching the regular expression given by # files.pattern will be loaded as spectra. Alternatively, spectra.paths may be used to specify specific files to load as spectra. spectra.paths is # ignored if spectra.dir is given. If Memory.Conserve is set to true then the function will fill out the returnList object using 1-row data frames # for the data and M.Z list elements where those elements will contain the path of the spectrum that would occupy that column. # # Output: returns a list with elements "data" and "M.Z". Both are numeric matrices with one spectrum in each column unless the Memory.Conserve flag is true (see above). # ##################################################################### load.spectra <- function(spectra.dir="", files.pattern="", spectra.paths="", Memory.Conserve) { # Get the list of files in the directory specified by spectrum.path if no vector of specific files was given in spectra.paths if (spectra.paths[1] == "") { spectra.paths = paste(spectra.dir[1], list.files(spectra.dir[1], pattern=files.pattern), sep="\\") if (length(spectra.dir) > 1) for (curDir in (2:length(spectra.dir))) spectra.paths = c( spectra.paths, paste(spectra.dir[curDir], list.files(spectra.dir[curDir], pattern=files.pattern), sep="\\") ) } curFileNum = 1 sampleCount = length(spectra.paths) returnList = list(file.names = spectra.paths) for (curFile in spectra.paths) { # Check that the requested file exists and can be read if ( file.access( curFile, 0) != 0 ) { warning( paste("File", curFile, "does not exist.", sep=" ")); next } if ( file.access( curFile, 4) != 0 ) { warning( paste("File", curFile, "exists but cannot be read.", sep=" ")); next } # Load the spectra into active memory if memory conservation has not been turned on # Otherwise fill out returnList$data and returnList$metadata with the paths of the files if ( Memory.Conserve == FALSE) { new.data = scan( curFile, what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," ) new.length = length(new.data[[1]]) if (curFileNum > 1) { curLength = dim(returnList$data)[[1]] # Expand the data matrices if the new spectrum is longer than any previous if ( new.length > curLength) { returnList$data = rbind(returnList$data, matrix( rep(0, (new.length - curLength) * sampleCount ), nrow = new.length - curLength ) ) returnList$M.Z = rbind(returnList$M.Z, matrix( rep(0, (new.length - curLength) * sampleCount ), nrow = new.length - curLength ) ) } returnList$data[1:length(new.data[[2]]),curFileNum] = new.data[[2]] returnList$M.Z[1:length(new.data[[1]]),curFileNum] = new.data[[1]] # If this spectrum was shorter than the longest read so far then fill out any extra space in the matrix with zeros if (new.length < curLength) { returnList$data[ (new.length+1):curLength ,curFileNum] = 0 returnList$M.Z[ (new.length+1):curLength ,curFileNum] = 0 } curFileNum = curFileNum + 1 } else { returnList$data = matrix(rep(0, new.length * sampleCount), ncol = sampleCount) returnList$M.Z = matrix(rep(0, new.length * sampleCount), ncol = sampleCount) returnList$data[,1] = new.data[[2]] returnList$M.Z[ ,1] = new.data[[1]] curFileNum = curFileNum + 1 } } } if (Memory.Conserve == TRUE) { returnList$data = matrix(spectra.paths, nrow=1) returnList$M.Z = matrix(spectra.paths, nrow=1) } return(returnList) } ##################################################################### # # load.peak.bounds Loads peak positions from the file specified by "path." This file should have the format of the peak bounds file # produced as output from SSA> # # Parameters: # path - the path of the file containing peak bounds to be loaded. # # Output: # Returns a list with elements "left", "center", and "right", which are identical to the values returned by canonicalPeaks. # The list also has an M.Z.max element which is set to zero to retain symmetry to the output from canonicalPeaks. The zero value in # M.Z.max serves as a flag later to indicate that the value of M.Z.Max still needs to be determined. There is also a "success" field, # which is a boolean value that is TRUE if the file loaded, FALSE otherwise. # ##################################################################### load.peak.bounds <- function(path) { # Check that the peak bounds file exists and is readable if ( file.access( path, 0) != 0 ) { warning( paste("Peak bounds file", path, "does not exist.", sep=" ") ) return(list(success=FALSE)) } if ( file.access( path, 4) != 0 ) { warning( paste("Peak bounds file ", path, "exists but cannot be read.", sep=" ") ) return(list(success=FALSE)) } # Load the peak bounds bounds = read.table(path, header=TRUE) # Build the list of return data return(list(left = bounds$Left, center=bounds$Center, right=bounds$Right, M.Z.max=0, success=TRUE)) } ##################################################################### # # canonicalPeaks locates peaks in the data set and their left and right edges. # # Parameters: # Data parameters # data - A matrix of raw spectra, with each spectrum in a column. # M.Z - A matrix of m/z values corresponding to each point in the "data" matrix. This allows each spectrum to have a different m/z axis. If no "M.Z" is provided then the m/z values for each spectrum must be the same and they must appear as the dimnames of the "data" matrix. # groups - A vector of class identities for each sample. If no "groups" vector is provided then all spectra are treated as belonging to the same group. # # Peak width parameters # regions - Numeric vector specifying the starting point of m/z region of the spectrum to which the values in "minPeakWidth" and "maxPeakWidths" should be applied. For example, to start analysis at 3000 m/z and use different peak parameters after 18000 m/z then the regions parameter should be c(3000, 18000). # minPeakWidths - A numeric vector specifying the size of the expected peak window in fraction of current m/z. For example 0.004 is a window width of 0.4% of the current m/z. # maxPeakWidths - A numeric vector specifying the largest peak window that will be tested in determining peak edges as a fraction of current m/z. # # Parameters below only need to be modified for applications very different from protein profiling on a Ciphergen PBSIIc instrument in the m/z range from 3000 to 75000. # peakWidthSteps - The number of different peak edges to test in determining the left and right edges of each peak. # M.Z.Step - A numeric value that specifies the resolution of the common m/z axis onto which all spectra are mapped. The default value of 0.5 is appropriate for the m/z range from 3000 to 75000 on the Ciphergen PBSIIc instrument. For higher m/z the m/z step should be increased and for peptide mapping it should be reduced. # smoothingSpan - The width in data points to apply a simple smoothing filter before determine peak locations. Filtering increases the range of support when local baseline is calculated for each peak. # # Memory.Conserve - tells the function that the data and M.Z parameters do not contain values, but that each column contains instead the path to the spectrum that would occupy that column. # # Output: # The output is a list with three numeric vectors "left", "center", "right". # "center" is the m/z position of the area under the curve maxima, "left" and "right" are the left and right width of the peak in m/z units. # Two other elements of the list are M.Z.max which contains the largest M/Z value located in the data set, spectrum.average which contains the averaged spectrum on which peaks are detected, # and F.Confidence, which is set to zero unless the F-test scores are loaded from a file. # # ##################################################################### canonicalPeaks <- function(data, M.Z="", groups=rep(1, dim(data)[[2]]), regions = 3000, minPeakWidths, maxPeakWidths, peakWidthSteps, M.Z.Step = 0.5, smoothingSpan = 11, Memory.Conserve) { # Calculate the number of samples from the data if (Memory.Conserve) { numSamples = length(data) } else { numSamples = dim(data)[[2]] } # Determine the number of groups from the groups factor numGroups = length(unique(groups)) groups = as.numeric(as.factor(as.numeric(as.factor(groups)))) # Generate an M/Z table from the data if one is not provided # Also determine the maximum M/Z value - this is the smallest maximum M/Z among the spectra # NOTE: Using canonicalPeaks without providing an M/Z table is not a supported use, it is left here to provide flexibility in future file formats. if (is.null(dim(M.Z)) & (class(M.Z) != "character")) { tmpM.Z = as.numeric(dimnames(data)[[1]]) M.Z = matrix( rep(tmpM.Z, numSamples), ncol=numSamples, byrow=FALSE ) M.Z.max = max(tmpM.Z) } else { # Find the highest M/Z value in the dataset if (Memory.Conserve == FALSE) { M.Z.max = min(apply(M.Z, MARGIN=2, FUN=max)) } else { M.Z.max = get.M.Z.Max(M.Z) } } # Map all spectra to a common m/z axis and calculate the average spectrum for the data set # This axis starts at m/z = 0 and steps by the value of M.Z.Step up to M.Z.Max # Initialize the mean spectrum to zero meanSpectrum = rep(0, M.Z.max / M.Z.Step + 1) # Determine the mean spectrum for each group and add it to the mean spectrum for data set for (curGroup in (1:numGroups)) { groupVector = (groups == groups[curGroup]) groupMean = rep(0, length(meanSpectrum)) for ( curSample in ((1:numSamples)[groupVector]) ) { if (Memory.Conserve == FALSE) { groupMean = groupMean + (approx( M.Z[,curSample], data[,curSample], xout = ( 0:(M.Z.max / M.Z.Step) ) * M.Z.Step, rule=2 ))$y } else { next.data = scan( data[curSample], what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," ) groupMean = groupMean + (approx( next.data[[1]], next.data[[2]], xout = ( 0:(M.Z.max / M.Z.Step) ) * M.Z.Step, rule=2 ))$y } } # Add the current group mean spectrum to the data set mean spectrum meanSpectrum = meanSpectrum + groupMean / (sum(groupVector) * numGroups) } # Apply a small smoothing filter to the mean spectrum. This increases the support base when finding the local baseline meanSpectrum = smoothingFilter(meanSpectrum, smoothingSpan) # Locate peaks in the mean spectrum if (M.Z.max > max(regions)) { regions = c(regions, M.Z.max) } peakInfo = list() peakInfo$M.Z.max = M.Z.max peakInfo$spectrum.average = meanSpectrum names(peakInfo$spectrum.average) = (0:(length(meanSpectrum)-1)) * M.Z.Step peakInfo$left = vector() peakInfo$center = vector() peakInfo$right = vector() peakInfo$F.Confidence = "" # Calculate the area under the curve (AUC) at every point in the mean spectrum areaMeanData = rep(0, length(meanSpectrum)) for (curRegion in (1:(length(regions) - 1))) { leftEdge = floor(regions[curRegion ] / M.Z.Step) rightEdge = floor(regions[curRegion+1] / M.Z.Step) # Calculate a running peak area across the composite spectrum areaMeanData[leftEdge:rightEdge] <- (peakArea.M.Z(meanSpectrum, regions[curRegion], regions[curRegion+1], 0, max(regions), minPeakWidths[curRegion]))[leftEdge:rightEdge] } # Locate peaks in the AUC of the mean spectrum for (curRegion in (1:(length(regions) - 1))) { # Find peaks in the AUC for this region of the spectrum peakVector <- peaks.M.Z(areaMeanData, regions[curRegion], regions[curRegion+1], 0, max(regions), minPeakWidths[curRegion]) # Determine the left and right edges of each peak edges <- findPeakEdges(meanSpectrum, maxPeakWidths[curRegion], peakWidthSteps[curRegion], peakVector, 0, max(regions)) # Append this set of peak edges to the list object being returned. Round peak positions to the nearest M.Z.Step peakInfo$left = c(peakInfo$left , edges$left) peakInfo$center = c(peakInfo$center, edges$center) peakInfo$right = c(peakInfo$right, edges$right) } # Round values to three digits after the decimal because that is how R will round when the numbers are converted to a data.frame peakInfo$center = round(peakInfo$center, 3) peakInfo$left = round(peakInfo$left, 3) peakInfo$right = round(peakInfo$right, 3) # Set the names of entries in the peak edge vectors names(peakInfo$center) = peakInfo$center names(peakInfo$left) = peakInfo$center names(peakInfo$right) = peakInfo$center # Return the list of peaks return(peakInfo) } ##################################################################### # # findPeakEdges determines the left and right edges of each peak identified by canonicalPeaks. # This function is meant to be called only from canonicalPeaks. # # Parameters: # Data parameters # data - Average spectrum for the data set, which was computed by the canonicalPeaks function. # # Peak width parameters # maxPeakWidth - Specifies the largest peak window that will be tested in determining peak edges as a fraction of current m/z. # widthSteps - The number of steps to take when locating the left and right edges of a peak. For example, widthSteps = 6 will try six positions for each edge moving outward from the center. # # Parameters from the calling function # peakVector - A boolean vector specifying which spots in the average spectrum have a maxima in the AUC. # M.Z.first - The lowest m/z included in the "data" vector. # M.Z.last - The highest m/z included in the "data" vector. # # Output: # A list with three vectors "left", "center", "right". # "center" is the m/z position of the area under the curve maxima, "left" and "right" are the left and right width of the peak in m/z units. ##################################################################### findPeakEdges <- function(data, maxPeakWidth, widthSteps, peakVector, M.Z.first, M.Z.last) { returnList = list() widthDelta = maxPeakWidth / widthSteps peakPositions = (1:length(peakVector))[as.logical(peakVector)] M.Z.step = (M.Z.last - M.Z.first) / (length(peakVector) + 1) returnList$center = M.Z.first + peakPositions / length(data) * (M.Z.last - M.Z.first) # Iterate through peaks and determine the right edge of each returnList$left = sapply(peakPositions, FUN = function(position, data, span, spanStep, widthSteps, firstMZ, lastMZ, stepMZ) { # Calculate the m/z of this peak M.Z = firstMZ + position / length(data) * (lastMZ - firstMZ) # Calculate the left and right edges of the peak in terms of indices # This determines a single, fixed, left edge and a range of possible right edges to be tested leftEdge = position - (M.Z * span * 0.5) / stepMZ rightEdge = position + (M.Z * spanStep * (1:widthSteps)) / stepMZ # Check bounds on peak edges rightEdge[rightEdge > length(data)] = length(data) if (leftEdge < 1) { leftEdge = 1 } # Determine which of the possible edge positions to use # This involves calculating the AUC divided by width for a fixed left edge and a variety of right edges. # The right edge that is selected is the first maximum moving out from the center (or the maximum peak width if no maximum is found) if (rightEdge[1] - leftEdge > 8) { areas = apply( cbind(rep(leftEdge, widthSteps), rightEdge), MARGIN=1, FUN=function(edges, data) { negSpan = (edges[2] - edges[1]) * 0.1 if (negSpan < 1) { negSpan = 1 } posSpan = edges[2] - edges[1] - negSpan * 2 + 1 return( sum(data[edges[1]:(edges[1]+negSpan-1)]) * -0.5 / negSpan + sum(data[(edges[1]+negSpan):(edges[2]-negSpan)]) / posSpan + sum(data[(edges[2]-negSpan+1):edges[2]]) * -0.5 / negSpan ) }, data) } else { areas = c(1,0) } # Choose the first option by default if the window is not wide enough to calculate a local baseline # Return the distance to the right edge return(firstMax(areas) * spanStep * M.Z) }, data, maxPeakWidth, widthDelta, widthSteps, M.Z.first, M.Z.last, M.Z.step ) # Iterate through peaks and determine the left edge of each returnList$right = sapply(peakPositions, FUN = function(position, data, span, spanStep, widthSteps, firstMZ, lastMZ, stepMZ) { # Calculate the current M/Z M.Z = firstMZ + position / length(data) * (lastMZ - firstMZ) # Calculate the left and right edges of the peak in terms of indices # This determines a single, fixed, right edge and a range of possible left edges to be tested rightEdge = position + (M.Z * span * 0.5) / stepMZ leftEdge = position - (M.Z * spanStep * (1:widthSteps)) / stepMZ # Do bounds checking on peak edges if (rightEdge > length(data)) { rightEdge = length(data) } leftEdge[leftEdge < 1] = 1 # Determine which of the possible edge positions to use # This involves calculating the AUC divided by width for a fixed right edge and a variety of left edges. # The left edge that is selected is the first maximum moving out from the center (or the maximum peak width if no maximum is found) if (rightEdge - leftEdge[1] > 8) { areas = apply( cbind(rep(leftEdge, widthSteps), rightEdge), MARGIN=1, FUN=function(edges, data) { negSpan = (edges[2] - edges[1]) * 0.1 if (negSpan < 1) { negSpan = 1 } posSpan = edges[2] - edges[1] - negSpan * 2 + 1 return( sum(data[edges[1]:(edges[1]+negSpan-1)]) * -0.5 / negSpan + sum(data[(edges[1]+negSpan):(edges[2]-negSpan)]) / posSpan + sum(data[(edges[2]-negSpan+1):edges[2]]) * -0.5 / negSpan ) }, data) } else { areas = c(1,0) } # Choose the first option by default if the window is not wide enough to calculate a local baseline # Return the distance to the left edge return(firstMax(areas) * spanStep * M.Z) }, data, maxPeakWidth, widthDelta, widthSteps, M.Z.first, M.Z.last, M.Z.step ) return(returnList) } ##################################################################### # # GetPeakAreas Takes the same "data" and "M.Z" parameters as canonicalPeaks. It takes the output of canonicalPeaks as the "peaks" parameter and determines the magnitude of each peak in each spectrum. # # Parameters: # Data parameters # data - The matrix of raw spectra with each spectrum in a column (same as in canonicalPeaks). # M.Z - A matrix of m/z values corresponding to each point in the "data" matrix. This allows each spectrum to have a different m/z axis. If no "M.Z" is provided then the m/z values for each spectrum must be the same and they must appear as the dimnames of the "data" matrix. # peaks - The list of center, left width, and right width assigned to every peak, as produced by the canoncalPeaks function. # # Parameters below only need to be modified for applications very different from protein profiling on a Ciphergen PBSIIc instrument in the m/z range from 3000 to 75000. # smoothingSpan - The width in data points to apply a simple smoothing filter before determine peak areas. Filtering increases the range of support when local baseline is calculated for each peak. # M.Z.Step - The M/Z spacing to use when integrating the AUC for each spectrum. # # Memory.Conserve - tells the function that the data and M.Z parameters do not contain values, but that each column contains instead the path to the spectrum that would occupy that column. # # Output: # A matrix of peak magnitudes (AUC above the local baseline divided by the peak width), with one peak per row and one spectrum per column. # ##################################################################### getPeakAreas <- function(data, M.Z="", M.Z.Max, peaks, smoothingSpan=11, M.Z.Step=1, Memory.Conserve) { # Calculate the number of samples from the data if (Memory.Conserve) { numSamples = length(data) } else { numSamples = dim(data)[[2]] } # Generate an M/Z table from the data if one is not provided # Also determine the maximum M/Z value - this is the smallest maximum M/Z among the spectra # NOTE: Using canonicalPeaks without providing an M/Z table is not a supported use, it is left here to provide flexibility in future file formats. if (is.null(dim(M.Z)) & (class(M.Z ) != "character")) { tmpM.Z = as.numeric(dimnames(data)[[1]]) M.Z = matrix( rep(tmpM.Z, numSamples), ncol=numSamples, byrow=FALSE ) } # Initialize a matrix to hold peak magnitudes peakAreas = matrix(rep(NA, length(peaks$center) * numSamples), ncol=numSamples, dimnames = list( peaks$center, dimnames(data)[[2]])) # Iterate through each spectrum and get the peak areas for (curSpectrum in (1:dim(data)[[2]])) { # Map the spectrum onto the common M/Z axis. This axis goes from m/z = 0 to m/z = M.Z.Max in increments of M.Z.Step. if (Memory.Conserve == FALSE) { mappedData = approx( M.Z[,curSpectrum], data[,curSpectrum], xout = 0:(M.Z.Max / M.Z.Step) * M.Z.Step, rule=2 )$y } else { curData = scan( data[curSpectrum], what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," ) mappedData = approx( curData[[1]], curData[[2]], xout = 0:(M.Z.Max / M.Z.Step) * M.Z.Step, rule=2 )$y } # Apply a smoothing filter to increase the support of the AUC calculation mappedData = filter(mappedData, rep(1/smoothingSpan, smoothingSpan), circular = TRUE) # Iterate through the peaks and determine the magnitude of each peakAreas[,curSpectrum] = sapply(1:length(peaks$center), FUN=function(index, data, peaks) { # Determine the left and right M/Z values of the peak left = peaks$center[index] - peaks$left[ index] right = peaks$center[index] + peaks$right[index] # Map left and right M/Z to left and right indices # Assume that the spectrum begins at zero, ends at M.Z.Max, and that each data point is separated by M.Z.Step left = floor(left / M.Z.Step) right = floor(right / M.Z.Step) # Check bounds on the left and right indices of the peak if (left < 1) { left = 1 } if (right > length(data)) { right = length(data) } # Determine the size of the regions used for local baseline subtraction negSpan = floor((right - left) * 0.1) if (negSpan < 1) { negSpan = 1 } posSpan = right - left - 2 * negSpan # Return the total area over the local baseline divided by the peak width return( (sum(data[left:(left+negSpan-1)]) * -0.5 / negSpan + sum(data[(left+negSpan):(right-negSpan)]) / posSpan + sum(data[(right-negSpan+1):right]) * -0.5 / negSpan) ) }, mappedData, peaks) } return(peakAreas) } ##################################################################### # # TestAndNormalize Determine normalization factors for each spectrum, compute # F-test scores for the reproducibility of each peak, computer chi-squared scores # for the quality of each spectrum, then apply the requested thresholds and average # peaks for replicates of the same sample. # # Parameters: # peakHeights - a matrix of peak magnitudes for each spectrum, baseline subtracted. NA is acceptable. # identities - a character vector of spectrum labels, with the same label denoting the same biological source # peak.confidence - a numeric value between 0 and 1 to threshold peaks for reproducibility by the F-test ( <0 specifying that all peaks should be accepted) # spectrum.confidence - a numeric value between 0 and 1 specifying how much confidence is required to eliminate bad spectra by a chi-squared test (>1 specifying that all spectra should be accepted) # standards.mask - a boolean vector where TRUE designates a spectrum that should not be included in the chi-sq test # # Output: a list with the following fields: # # peak.mask - Boolean vector of peaks that passed the F-test # F.confidence - A numerical vector of the p-value from the F-test of each peak # spectrum.mask - Boolean vector of spectra that passed the chi-squared test # exclude.confidence - Numeric vector giving the p-value that each spectrum is "bad" # peaks.normalized - Matrix of rescaled peak magnitudes, including all peaks but excluding eliminated spectra # peaks.screened - Matrix of rescaled peak magnitudes, excluding peaks and spectra that did not pass their respective tests # peaks.averaged - Matrix of rescaled peak magnitudes with values averaged among replicates of each sample # identities.screened - Vector of identities for columns (spectra) retained in of peaks.normalized and peaks.screened # identities.averaged - Vector of identities for columns (samples) appearing in peaks.averaged # scale.factors - Numeric vector of constants by which each spectrum was multiplied during normalization. These values should be checked for bias between groups. # ##################################################################### TestAndNormalize <- function(peakValues, identities, peak.confidence=0.95, spectrum.confidence=0.99, standards.mask) { nSpectra = ncol(peakValues) # Check that the data matrix and identities vector match if (nSpectra != length(identities)) { stop("Mismatch between ncol(peakValues) and length(identities)") } # Phase 1 of significant peak calculation # From the unnormalized peak values, find those which seem clearly significant (95% confident) # Calculate F-test scores and compare them to the 95% confidence limit norm.cutoff = 0.95 peak.mask = FALSE while ( (sum(peak.mask) < 10) & (norm.cutoff >= 0) ) { peak.mask <- (findSignificantPeaks(peakValues, identities) > norm.cutoff) norm.cutoff = norm.cutoff - 0.05 } # Check that enough peaks are detected to do meaningful normalization if (sum(peak.mask) >= 10) { # Perform EM-normalization to obtain an estimate of the scaling factor per spectrum and outliers est.scale.factor <- estimate.em(peakValues[peak.mask,])$est.scale if (any(is.na(est.scale.factor))) return(list(return.code=1)) } else { print(noquote("Not enough peaks found for reliable normalization. Spectra will not be normalized.")) est.scale.factor = rep(1, nSpectra) } # Normalize the peak values peakValues.normalized <- t(t(peakValues)/est.scale.factor) # Now determine which spectra are outliers in the normalized data chisq.results <- estimate.chisq.fit(peakValues.normalized[peak.mask,], confidence=spectrum.confidence, standards.mask) spectrum.mask <- chisq.results$mask print(noquote(paste("Number of spectra eliminated = ", sum(!spectrum.mask)))) # If there are implausible spectra, recompute the scale factor if (sum(!spectrum.mask) > 0) { print(noquote("Spectra were eliminated so scale factors and peak reproducibility are being recalculated.")) if (sum(peak.mask) >= 10) { # Perform EM-normalization to obtain an estimate of the scaling factor per spectrum and outliers est.scale.factor <- rep(NA, nSpectra) est.scale.factor[spectrum.mask] <- estimate.em(peakValues[peak.mask, spectrum.mask])$est.scale if (any(is.na(est.scale.factor[spectrum.mask]))) return(list(return.code=1)) } else { est.scale.factor = rep(1, nSpectra) est.scale.factor[!spectrum.mask] = NA } #Re-normalize the peak heights peakValues.normalized <- t(t(peakValues)/est.scale.factor) } # Phase 2 of significant peak calculation # Recalculate the F-test use the normalized peaks and excluding implausbile spectra F.confidence <- findSignificantPeaks(peakValues.normalized[,spectrum.mask], identities[spectrum.mask]) peak.mask <- (F.confidence > peak.confidence) # Calculate average peak values among replicates of each sample # Returns NA for any sample that has no replicates remaining after the chi-test screen peaks.averaged = sapply( unique(identities), FUN=function(ID, data, IDs){ if (any(IDs == ID)) { return(apply(as.matrix(data[,IDs==ID]), MARGIN=1, FUN=mean, na.rm=TRUE)) } else { return(rep(NA, dim(data)[[1]])) } }, peakValues.normalized[peak.mask,], identities ) dimnames(peaks.averaged)[[2]] = unique(identities) dimnames(peakValues.normalized)[[2]] = identities return(list(return.code=0, peak.mask=peak.mask, F.confidence=F.confidence, spectrum.mask=spectrum.mask, exclude.confidence=chisq.results$exclude.confidence, peaks.normalized=peakValues.normalized, peaks.screened=peakValues.normalized[peak.mask,], peaks.averaged=peaks.averaged, identities.screened=identities[spectrum.mask], identities.averaged=unique(identities), scale.factors=est.scale.factor )) } ##################################################################### # # Functions below assist "TestAndNormalize" # ##################################################################### # Reject those spectra which have a statistically implausible set of peaks # confidence is the level of confidence we need in order to dismiss a spectrum # Thus, higher confidence means less likely to call a spectrum an outlier # Spectra marked as TRUE in standards.mask are not included in the chi-squared calculation, # although they still recieve a score. estimate.chisq.fit <- function(peakValues, confidence=0.99, standards.mask) { peak.means <- apply(peakValues[, !standards.mask], 1, mean, na.rm=TRUE) peak.vars <- apply(peakValues[, !standards.mask], 1, var, na.rm=TRUE) counts <- apply(!is.na(peakValues[, !standards.mask]), 1, sum) z2.score <- ((peakValues - peak.means)^2)/peak.vars # Use only those peaks which have sufficient non-zero values (>=10) chisq.stat <- apply(z2.score[counts>10,], 2, sum, na.rm=TRUE) chisq.df <- apply(!is.na(z2.score[counts>10,]), 2, sum, na.rm=TRUE) return( list(mask = ((pchisq(chisq.stat, df=chisq.df) <= confidence) | standards.mask), exclude.confidence = pchisq(chisq.stat, df=chisq.df) ) ) } # Find all the peaks for whom the signal is greater than the measurement noise # at the specified confidence. findSignificantPeaks was used in the paper. # findSignficantPeaks.new performs a more complete F-test, and is able to handle # any number of replicates (the original handles only duplicates). # Sensitivity thresholds for the two functions are not interchangeable. findSignificantPeaks <- function(peakHeights, identities) { pairs <- find.pairs(identities)$pairs z <- peakHeights[,pairs[1,]] + peakHeights[,pairs[2,]] w <- peakHeights[,pairs[1,]] - peakHeights[,pairs[2,]] f.ratio <- apply(z, 1, var, na.rm=TRUE)/apply(w^2, 1, mean, na.rm=TRUE) n.count <- apply(!is.na(z), 1, sum) f.degrees.1 <- n.count-1 f.degrees.2 <- n.count valid <- (f.degrees.1 > 0) & (!is.na(f.ratio)) f.significance.valid <- pf(f.ratio[valid], f.degrees.1[valid], f.degrees.2[valid]) f.significance <- rep(NA, length(valid)) f.significance[valid] <- f.significance.valid f.significance[is.na(f.significance)] = 0 return(f.significance) #return((f.significance > confidence) & (!is.na(f.significance))) } #Finds all pairs of spectra from the same sample. #In cases when there are more than two spectra, only selects the first two copies find.pairs <- function(identities) { groups <- tapply(1:length(identities), identities, function(x){x}) group.identities <- names(groups) identities.paired <- vector() pairs <- vector() for (j in 1:length(groups)) { if (length(groups[[j]]) >= 2) { pairs <- cbind(pairs, groups[[j]][1:2]) identities.paired <- c(identities.paired, group.identities[j]) } } return(list(pairs=pairs, identities.paired=identities.paired)) } # This function is not currently used. It will allow for correct handling of replicate numbers higher than two. findSignificantPeaks.new <- function(peakHeights, identities, confidence=0.95) { groups <- tapply(1:length(identities), identities, function(idx){idx}) group.identities <- names(groups) T.j2 <- sapply(groups, function(idx, ph) { #If there is only one sample, the total is the value itself #This special case is needed otherwise S-plus apply does not work if (length(idx) < 2) { return(ph[,idx]^2) } else { return(apply(ph[,idx], 1, sum, na.rm=TRUE)^2) } }, peakHeights, simplify=TRUE) nj <- sapply(groups, function(idx, ph) { #If there is only one sample, the count is 0 or 1 #This special case is needed otherwise apply does not work if (length(idx) < 2) { return(!is.na(ph[,idx])) } else { return(apply(!is.na(ph[,idx]), 1, sum, na.rm=TRUE)) } }, peakHeights, simplify=TRUE) T.j2[is.na(T.j2)] <- 0 T.j2.over.nj <- T.j2/(nj + (nj == 0)) #Prevent divide by zero sum.T.j2.over.nj <- apply(T.j2.over.nj, 1, sum) n.count <- apply(nj, 1, sum) k.count <- apply((nj > 0), 1, sum) T..2.over.n <- (apply(peakHeights, 1, sum, na.rm=TRUE)^2)/n.count sum.Yij2 <- apply(peakHeights^2, 1, sum, na.rm=TRUE) Q1 <- sum.T.j2.over.nj - T..2.over.n Q2 <- sum.Yij2 - sum.T.j2.over.nj f.degrees.1 <- k.count-1 f.degrees.2 <- n.count-k.count valid <- (f.degrees.1 > 0) & (f.degrees.2 > 0) f.ratio.valid <- (Q1[valid]/f.degrees.1[valid])/(Q2[valid]/f.degrees.2[valid]) f.degrees.1.valid <- f.degrees.1[valid] f.degrees.2.valid <- f.degrees.2[valid] f.significance.valid <- pf(f.ratio.valid, f.degrees.1.valid, f.degrees.2.valid) f.significance <- rep(NA, length(valid)) f.significance[valid] <- f.significance.valid if (confidence == 0) return(f.significance) else return((f.significance > confidence) & (!is.na(f.significance))) } # Use the EM algorithm to find the maximum likelihood estimates of scale factors for each spectrum # y is an array of peak heights # Each peak is a row, each sample is a column # returned value est.a is the estimate of scale factor estimate.em <- function(y, trim=0, maxIter=20, maxDeltaRatio=0.001, a.max=5.0) { a.min <- 1/a.max a <- rep(1, ncol(y)) #Set initial estimates of a to be all equal loga.var <- 1 #Works out nicely as first calculation (equal weighting) for (j in 1:maxIter) { a.old <- a z <- t(t(y)/a) peak.means <- apply(z, 1, mean, na.rm=TRUE, trim=trim) peak.vars <- apply(z^2, 1, mean, na.rm=TRUE, trim=trim) - peak.means^2 # Check for excessive weight assigned to one peak, this may indicate that the # calculation is converging to a singularity peak.weights <- (peak.means^2)/peak.vars peak.vars <- peak.vars * rescaleWeights(peak.weights) coeff.a <- apply(y^2/peak.vars, 2, sum, na.rm=TRUE) coeff.b <- -(1/loga.var + apply(y*peak.means/peak.vars, 2, sum, na.rm=TRUE)) coeff.c <- (1/loga.var - 1) beta.root.1 <- (-coeff.b/coeff.a + coeff.c/coeff.b) beta.root.2 <- -coeff.c/coeff.b a <- 1/pmax(beta.root.1, beta.root.2) # Bound the values of a a <- pmax(a.min, a) a <- pmin(a.max, a) loga.mean <- mean(log(a)) a <- a/exp(loga.mean) #Renormalize a to have E[log(a)] = 0 loga.var <- mean(log(a)^2, trim=trim) # Check for convergence criteria of less than 1% change in scale factor delta <- max(abs((a - a.old)/a.old)) #print(paste("Iteration", j, " delta =", delta)) if (any(is.na(a))) break; if (delta < maxDeltaRatio) break; } return(list(est.scale=a, est.loga.var=loga.var)) } # # Rescale returns a vector by which to multiply the denominator of the original weights # By doing so, the rescaling will bring the largest weight down to w.limit if needed. # rescaleWeights <- function(w, w.limit=0.2) { w <- w/sum(w) max.w <- max(w) if (max.w > w.limit) { print(paste("Got close to singularity with weight = ",max.w)) w.star <- w.limit * (1-max.w)/(1-w.limit) rescale <- w/pmin(w, w.star) } else { rescale <- rep(1, length(w)) } } ###################### # # Helper functions # ###################### # Returns the minimum of the highest M/Z value from each file in the "paths" parameter get.M.Z.Max <- function(paths) { M.Z.max = 0 for (curFile in paths) { next.max = max(as.vector(scan( curFile, what=list(numeric(0), numeric(0)), quiet=TRUE, skip=1, sep="," )[[1]])) if ((next.max < M.Z.max) | (M.Z.max == 0)) {M.Z.max = next.max} } return(M.Z.max) } # Used in "findPeaksEdges". Returns the index of the first value in "data" that is not smaller than the next value or else the index of the last value. firstMax <- function(data) { dir = c(1, data[-1] - data[-length(data)]) dir = c((dir < 0), TRUE) # Append a termination signal so that the last width is selected if the peak size continues to increase # Get the index of the first TRUE minus one return( ((1:length(dir))[dir])[1] - 1) } # Apply a uniform smoothing window to the data smoothingFilter = function(data, span = 11) { result = as.vector(filter(data, rep(1/span, span), circular = FALSE)) result[is.na(result)] = 0 return(result) } ## Wrapper functions for calls to the C code peaks.M.Z <- function(data, M.Z.start, M.Z.end, M.Z.first, M.Z.last, fSpan) { return( .C("cPeaksMZ", data, as.integer(M.Z.start), as.integer(M.Z.end), as.integer(M.Z.first), as.integer(M.Z.last), as.integer(length(data)), fSpan, ret=as.integer(rep(0, length(data))), DUP=FALSE)$ret ) } peakArea.M.Z <- function(data, M.Z.start, M.Z.end, M.Z.first, M.Z.last, fSpan) { return( (.C("cPeakAreaMZ", data, as.integer(M.Z.start), as.integer(M.Z.end), as.integer(M.Z.first), as.integer(M.Z.last), as.integer(length(data)), fSpan, ret=numeric(length(data)), DUP=FALSE) )$ret ) } ##