## The following variables were specified as input arguments when calling the rendering function.
## They will be used in the workflow below.
experimentInfo <-
list()
species <-
"human"
pdOutputFolder <-
"/Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/pr2c00390_si_002/TMT16_PD3"
pdResultName <-
"TMT16plex_chimerys_intensity_nocontam_220914"
inputLevel <-
"Proteins"
pdAnalysisFile <-
NULL
idCol <-
function (df)
combineIds(df, combineCols = c("Gene.Symbol", "Accession"))
labelCol <-
function (df)
combineIds(df, combineCols = c("Gene.Symbol", "Accession"))
geneIdCol <-
function (df)
getFirstId(df, colName = "Gene.Symbol")
proteinIdCol <-
"Accession"
stringIdCol <-
function (df)
combineIds(df, combineCols = c("Gene.Symbol", "Accession"), combineWhen = "missing",
makeUnique = FALSE)
modificationsCol <-
"Modifications"
excludeUnmodifiedPeptides <-
FALSE
keepModifications <-
NULL
reportTitle <-
"He et al 2022"
reportAuthor <-
"Proteome Discoverer / TMT 16-plex"
iColPattern <-
"^Abundance."
sampleAnnot <-
structure(list(sample = c("F1.127C.Sample", "F1.127N.Sample",
"F1.128C.Sample", "F1.128N.Sample", "F1.129C.Sample", "F1.129N.Sample",
"F1.130C.Sample", "F1.130N.Sample", "F1.131C.Sample", "F1.131N.Sample",
"F1.132C.Sample", "F1.132N.Sample", "F1.133N.Sample", "F2.127C.Sample",
"F2.127N.Sample", "F2.128C.Sample", "F2.128N.Sample", "F2.129C.Sample",
"F2.129N.Sample", "F2.130C.Sample", "F2.130N.Sample", "F2.131C.Sample",
"F2.131N.Sample", "F2.132C.Sample", "F2.132N.Sample", "F2.133N.Sample",
"F3.127C.Sample", "F3.127N.Sample", "F3.128C.Sample", "F3.128N.Sample",
"F3.129C.Sample", "F3.129N.Sample", "F3.130C.Sample", "F3.130N.Sample",
"F3.131C.Sample", "F3.131N.Sample", "F3.132C.Sample", "F3.132N.Sample",
"F3.133C.Sample", "F3.133N.Sample", "F4.127C.Sample", "F4.127N.Sample",
"F4.128C.Sample", "F4.128N.Sample", "F4.129C.Sample", "F4.129N.Sample",
"F4.130C.Sample", "F4.130N.Sample", "F4.131C.Sample", "F4.131N.Sample",
"F4.132C.Sample", "F4.132N.Sample", "F4.133N.Sample"), group = c("Medulla_COVID",
"Cortex_COVID", "Medulla_COVID", "Cortex_COVID", "Medulla_COVID",
"Cortex_COVID", "Medulla_COVID", "Cortex_COVID", "Medulla_Control",
"Cortex_COVID", "Cortex_Control", "Cortex_Control", "Medulla_Control",
"Medulla_COVID", "Cortex_COVID", "Medulla_COVID", "Cortex_COVID",
"Medulla_COVID", "Cortex_COVID", "Medulla_COVID", "Cortex_COVID",
"Medulla_Control", "Cortex_Control", "Medulla_Control", "Cortex_Control",
"Cortex_Control", "Medulla_COVID", "Cortex_COVID", "Medulla_COVID",
"Cortex_COVID", "Medulla_COVID", "Cortex_COVID", "Medulla_COVID",
"Cortex_COVID", "Medulla_COVID", "Cortex_COVID", "Medulla_Control",
"Cortex_Control", "Medulla_Control", "Cortex_Control", "Medulla_COVID",
"Cortex_COVID", "Medulla_COVID", "Cortex_COVID", "Cortex_COVID",
"Cortex_COVID", "Cortex_Control", "Medulla_COVID", "Cortex_Control",
"Medulla_Control", "Cortex_Control", "Medulla_Control", "Medulla_Control"
), batch = c("b1", "b1", "b1", "b1", "b1", "b1", "b1", "b1",
"b1", "b1", "b1", "b1", "b1", "b2", "b2", "b2", "b2", "b2", "b2",
"b2", "b2", "b2", "b2", "b2", "b2", "b2", "b3", "b3", "b3", "b3",
"b3", "b3", "b3", "b3", "b3", "b3", "b3", "b3", "b3", "b3", "b4",
"b4", "b4", "b4", "b4", "b4", "b4", "b4", "b4", "b4", "b4", "b4",
"b4")), class = "data.frame", row.names = c(NA, -53L))
includeOnlySamples <-
c("F1.127C.Sample", "F1.127N.Sample", "F1.128C.Sample", "F1.128N.Sample",
"F1.129C.Sample", "F1.129N.Sample", "F1.130C.Sample", "F1.130N.Sample",
"F1.131C.Sample", "F1.131N.Sample", "F1.132C.Sample", "F1.132N.Sample",
"F1.133N.Sample", "F2.127C.Sample", "F2.127N.Sample", "F2.128C.Sample",
"F2.128N.Sample", "F2.129C.Sample", "F2.129N.Sample", "F2.130C.Sample",
"F2.130N.Sample", "F2.131C.Sample", "F2.131N.Sample", "F2.132C.Sample",
"F2.132N.Sample", "F2.133N.Sample", "F3.127C.Sample", "F3.127N.Sample",
"F3.128C.Sample", "F3.128N.Sample", "F3.129C.Sample", "F3.129N.Sample",
"F3.130C.Sample", "F3.130N.Sample", "F3.131C.Sample", "F3.131N.Sample",
"F3.132C.Sample", "F3.132N.Sample", "F3.133C.Sample", "F3.133N.Sample",
"F4.127C.Sample", "F4.127N.Sample", "F4.128C.Sample", "F4.128N.Sample",
"F4.129C.Sample", "F4.129N.Sample", "F4.130C.Sample", "F4.130N.Sample",
"F4.131C.Sample", "F4.131N.Sample", "F4.132C.Sample", "F4.132N.Sample",
"F4.133N.Sample")
excludeSamples <-
""
minScore <-
2
minDeltaScore <-
0.20000000000000001
minPeptides <-
1
minPSMs <-
2
masterProteinsOnly <-
FALSE
imputeMethod <-
"MinProb"
assaysForExport <-
NULL
mergeGroups <-
list()
comparisons <-
list(c("Medulla_Control", "Medulla_COVID"), c("Cortex_Control",
"Cortex_COVID"))
ctrlGroup <-
""
allPairwiseComparisons <-
TRUE
singleFit <-
FALSE
subtractBaseline <-
FALSE
baselineGroup <-
""
normMethod <-
"center.median"
spikeFeatures <-
NULL
stattest <-
"limma"
minNbrValidValues <-
2
minlFC <-
0
samSignificance <-
FALSE
nperm <-
250
volcanoAdjPvalThr <-
0.050000000000000003
volcanoLog2FCThr <-
0
volcanoMaxFeatures <-
25
volcanoLabelSign <-
"both"
volcanoS0 <-
0.10000000000000001
volcanoFeaturesToLabel <-
""
addInteractiveVolcanos <-
TRUE
interactiveDisplayColumns <-
c(Label = "einprotLabel", adjP = "adj.P.Val", logFC = "logFC"
)
interactiveGroupColumn <-
NULL
complexFDRThr <-
0.10000000000000001
maxNbrComplexesToPlot <-
10
seed <-
42
includeFeatureCollections <-
"GO"
minSizeToKeepSet <-
2
customComplexes <-
list()
complexSpecies <-
"all"
complexDbPath <-
NULL
stringVersion <-
"11.5"
stringDir <-
NULL
linkTableColumns <-
c("Description", "Master", "[^se]\\.logFC$")
This report describes a reproducible end-to-end analysis of a
proteomics dataset quantified with Proteome
Discoverer (Orsburn 2021). Most of the
code is hidden by default, but can be displayed by clicking on the
Code
buttons (or by selecting
Code -> Show All Code
in the top right corner of the
report). Navigation between the different sections can be done via the
table of contents in the left sidebar. In the first part of the report,
the quantified data is read into R and passed through a range of
processing and quality control steps. These are followed by statistical analysis to find and visualize
differentially abundant features. A summary
table provides direct links to external resources, and an additional
global overview of the data is provided via principal
component analysis.
## Get species info and define STRINGdb object
speciesInfo <- getSpeciesInfo(species)
if (inputLevel == "Proteins") {
if (is.null(stringDir)) stringDir <- ""
if (is.null(stringIdCol)) {
## If no STRING IDs are extracted, don't do STRING analysis
string_db <- NULL
} else {
string_db <- tryCatch({
tmp <- STRINGdb$new(version = stringVersion, species = speciesInfo$taxId,
score_threshold = 400, input_directory = stringDir)
if (!exists("tmp")) {
warning("The STRINGdb object can not be created. ",
"No STRING analysis will be performed.",
call. = FALSE)
tmp <- NULL
} else {
print(tmp)
}
tmp
}, error = function(e) {
warning("The STRINGdb object can not be created. ",
"No STRING analysis will be performed.",
call. = FALSE)
NULL
})
}
} else {
string_db <- NULL
}
## *********** STRING - https://string-db.org ***********
## (Search Tool for the Retrieval of Interacting Genes/Proteins)
## version: 11.5
## species: 9606
## ............please wait............
## proteins: 19566
## interactions: 1795134
## If needed and not provided, define path to complex DB
## (will be added to summary table below)
if ("complexes" %in% includeFeatureCollections && is.null(complexDbPath)) {
complexDbPath <- system.file(EINPROT_COMPLEXES_FILE,
package = "einprot")
}
## Get conversion tables for PomBase and WormBase IDs
pbconv <- readRDS(system.file(EINPROT_POMBASE_CONVTABLE,
package = "einprot"))
wbconv <- readRDS(system.file(EINPROT_WORMBASE_CONVTABLE,
package = "einprot"))
makeTableFromList(c(experimentInfo,
list(
"Species" = speciesInfo$species,
"Species (common)" = speciesInfo$speciesCommon,
"Taxonomic ID" = speciesInfo$taxId
)))
Species | Homo sapiens |
Species (common) | human |
Taxonomic ID | 9606 |
pd <- readProteomeDiscovererInfo(pdOutputFolder, pdResultName,
pdAnalysisFile)
## Missing files: /Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/pr2c00390_si_002/TMT16_PD3/TMT16plex_chimerys_intensity_nocontam_220914_InputFiles.txt, /Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/pr2c00390_si_002/TMT16_PD3/TMT16plex_chimerys_intensity_nocontam_220914_StudyInformation.txt
pdFile <- file.path(pdOutputFolder, paste0(pdResultName, "_", inputLevel, ".txt"))
pd <- c(list("PD quantification file" = pdFile), pd)
makeTableFromList(pd)
PD quantification file | /Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/pr2c00390_si_002/TMT16_PD3/TMT16plex_chimerys_intensity_nocontam_220914_Proteins.txt |
settingsList <- list(
"Column pattern" = iColPattern,
"Include only samples (if applicable)" = paste(includeOnlySamples,
collapse = ", "),
"Exclude samples" = paste(excludeSamples, collapse = ", "),
"Input type" = inputLevel,
"Min. number of peptides" = minPeptides,
"Min. number of PSMs" = minPSMs,
"Min. protein score" = minScore,
"Min. delta score" = minDeltaScore,
"Only retain master proteins" = masterProteinsOnly,
"Imputation method" = imputeMethod,
"Assays(s) to use for exported values" = paste(assaysForExport, collapse = ", "),
"Min. nbr valid values required for testing" = minNbrValidValues,
"Model fit" = ifelse(singleFit, "Single (one model fit for all samples)",
"Separate model fit for each comparison"),
"Groups to merge" = paste(unlist(
lapply(names(mergeGroups),
function(nm) paste0(nm, ":", paste(mergeGroups[[nm]], collapse = ",")))),
collapse = "; "),
"Comparisons" = paste(unlist(lapply(comparisons,
function(x) paste(x, collapse = " vs "))),
collapse = "; "),
"Control group" = ctrlGroup,
"Do all pairwise comparisons" = allPairwiseComparisons,
"Batch correction via baseline subtraction" = subtractBaseline,
"Baseline group" = baselineGroup,
"Normalization method" = normMethod,
"Spike features" = paste(spikeFeatures, collapse = ","),
"Statistical test" = stattest,
"Minimal fold change (limma/treat)" = minlFC,
"Adjusted p-value threshold for volcano plots" = volcanoAdjPvalThr,
"Log2 FC threshold for volcano plots" = volcanoLog2FCThr,
"Max nbr features to indicate in volcano plots" = volcanoMaxFeatures,
"Sign of features to indicate in volcano plots" = volcanoLabelSign,
"Use SAM statistic for significance" = samSignificance,
"s0" = volcanoS0,
"Features to always label in volcano plots" = paste(volcanoFeaturesToLabel,
collapse = ", "),
"Feature collections for enrichment testing" = paste(includeFeatureCollections, collapse = "; "),
"Minimal required size for feature sets" = minSizeToKeepSet,
"Complexes file" = gsub(".+\\/(.+.rds)", "\\1", complexDbPath),
"Complexes from species" = complexSpecies,
"Custom complexes" = paste(names(customComplexes), collapse = ";"),
"FDR Threshold for complexes" = complexFDRThr,
"Max nbr complexes to plot" = maxNbrComplexesToPlot,
"Number of permutations" = nperm,
"Random seed" = seed,
"Modifications column" = modificationsCol,
"Exclude unmodified peptides" = excludeUnmodifiedPeptides,
"Modifications to keep" = paste(keepModifications, collapse = ", "),
"Columns to add in link table" = paste(linkTableColumns, collapse = ";"),
"Interactive display columns" = paste(interactiveDisplayColumns, collapse = ";"),
"Interactive group column" = interactiveGroupColumn
)
if (stattest == "ttest") {
settingsList <- settingsList[!(names(settingsList) %in%
c("Minimal fold change (limma/treat)",
"Log2 FC threshold for volcano plots"))]
}
if (stattest %in% c("limma", "proDA")) {
settingsList <- settingsList[!(names(settingsList) %in%
c("s0", "Number of permutations",
"Use SAM statistic for significance"))]
}
if (inputLevel == "Proteins") {
settingsList <- settingsList[!(names(settingsList) %in%
c("Min. number of PSMs",
"Min. delta score"))]
}
if (inputLevel == "PeptideGroups") {
settingsList <- settingsList[!(names(settingsList) %in%
c("Min. number of peptides",
"Min. protein score",
"Only retain master proteins"))]
}
makeTableFromList(settingsList)
Column pattern | ^Abundance. |
Include only samples (if applicable) | F1.127C.Sample, F1.127N.Sample, F1.128C.Sample, F1.128N.Sample, F1.129C.Sample, F1.129N.Sample, F1.130C.Sample, F1.130N.Sample, F1.131C.Sample, F1.131N.Sample, F1.132C.Sample, F1.132N.Sample, F1.133N.Sample, F2.127C.Sample, F2.127N.Sample, F2.128C.Sample, F2.128N.Sample, F2.129C.Sample, F2.129N.Sample, F2.130C.Sample, F2.130N.Sample, F2.131C.Sample, F2.131N.Sample, F2.132C.Sample, F2.132N.Sample, F2.133N.Sample, F3.127C.Sample, F3.127N.Sample, F3.128C.Sample, F3.128N.Sample, F3.129C.Sample, F3.129N.Sample, F3.130C.Sample, F3.130N.Sample, F3.131C.Sample, F3.131N.Sample, F3.132C.Sample, F3.132N.Sample, F3.133C.Sample, F3.133N.Sample, F4.127C.Sample, F4.127N.Sample, F4.128C.Sample, F4.128N.Sample, F4.129C.Sample, F4.129N.Sample, F4.130C.Sample, F4.130N.Sample, F4.131C.Sample, F4.131N.Sample, F4.132C.Sample, F4.132N.Sample, F4.133N.Sample |
Exclude samples | |
Input type | Proteins |
Min. number of peptides | 1 |
Min. protein score | 2 |
Only retain master proteins | FALSE |
Imputation method | MinProb |
Assays(s) to use for exported values | |
Min. nbr valid values required for testing | 2 |
Model fit | Separate model fit for each comparison |
Groups to merge | |
Comparisons | Medulla_Control vs Medulla_COVID; Cortex_Control vs Cortex_COVID |
Control group | |
Do all pairwise comparisons | TRUE |
Batch correction via baseline subtraction | FALSE |
Baseline group | |
Normalization method | center.median |
Spike features | |
Statistical test | limma |
Minimal fold change (limma/treat) | 0 |
Adjusted p-value threshold for volcano plots | 0.05 |
Log2 FC threshold for volcano plots | 0 |
Max nbr features to indicate in volcano plots | 25 |
Sign of features to indicate in volcano plots | both |
Features to always label in volcano plots | |
Feature collections for enrichment testing | GO |
Minimal required size for feature sets | 2 |
Complexes from species | all |
Custom complexes | |
FDR Threshold for complexes | 0.1 |
Max nbr complexes to plot | 10 |
Random seed | 42 |
Modifications column | Modifications |
Exclude unmodified peptides | FALSE |
Modifications to keep | |
Columns to add in link table | Description;Master;[^se].logFC$ |
Interactive display columns | einprotLabel;adj.P.Val;logFC |
The input to this workflow is a Proteins.txt file from Proteome
Discoverer (see path in the table above). We read the PD intensities
into R
and store them in a SingleCellExperiment
object. This object will later be expanded with additional data, such as
transformed and imputed abundances.
At this point, the SingleCellExperiment object holds assays with the different types of intensities and annotations from the ProteomeDiscoverer file.
## Read Proteome Discoverer output
tmp <- importExperiment(inFile = pdFile, iColPattern = iColPattern,
includeOnlySamples = includeOnlySamples,
excludeSamples = excludeSamples)
sce <- tmp$sce
aName <- tmp$aName
sce
## class: SingleCellExperiment
## dim: 11724 53
## metadata(1): colList
## assays(1): Abundance
## rownames(11724): 1 2 ... 11723 11724
## rowData names(97): Checked Protein.FDR.Confidence.Combined ...
## TMTpro.N.term.Positions Modifications
## colnames(53): F1.127N.Sample F1.127C.Sample ... F4.132C.Sample
## F4.133N.Sample
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
SingleCellExperiment
object.
S4Vectors::metadata(sce)$colList
## $Abundance
## [1] "Abundance.F1.127N.Sample" "Abundance.F1.127C.Sample"
## [3] "Abundance.F1.128N.Sample" "Abundance.F1.128C.Sample"
## [5] "Abundance.F1.129N.Sample" "Abundance.F1.129C.Sample"
## [7] "Abundance.F1.130N.Sample" "Abundance.F1.130C.Sample"
## [9] "Abundance.F1.131N.Sample" "Abundance.F1.131C.Sample"
## [11] "Abundance.F1.132N.Sample" "Abundance.F1.132C.Sample"
## [13] "Abundance.F1.133N.Sample" "Abundance.F2.127N.Sample"
## [15] "Abundance.F2.127C.Sample" "Abundance.F2.128N.Sample"
## [17] "Abundance.F2.128C.Sample" "Abundance.F2.129N.Sample"
## [19] "Abundance.F2.129C.Sample" "Abundance.F2.130N.Sample"
## [21] "Abundance.F2.130C.Sample" "Abundance.F2.131N.Sample"
## [23] "Abundance.F2.131C.Sample" "Abundance.F2.132N.Sample"
## [25] "Abundance.F2.132C.Sample" "Abundance.F2.133N.Sample"
## [27] "Abundance.F3.127N.Sample" "Abundance.F3.127C.Sample"
## [29] "Abundance.F3.128N.Sample" "Abundance.F3.128C.Sample"
## [31] "Abundance.F3.129N.Sample" "Abundance.F3.129C.Sample"
## [33] "Abundance.F3.130N.Sample" "Abundance.F3.130C.Sample"
## [35] "Abundance.F3.131N.Sample" "Abundance.F3.131C.Sample"
## [37] "Abundance.F3.132N.Sample" "Abundance.F3.132C.Sample"
## [39] "Abundance.F3.133N.Sample" "Abundance.F3.133C.Sample"
## [41] "Abundance.F4.127N.Sample" "Abundance.F4.127C.Sample"
## [43] "Abundance.F4.128N.Sample" "Abundance.F4.128C.Sample"
## [45] "Abundance.F4.129N.Sample" "Abundance.F4.129C.Sample"
## [47] "Abundance.F4.130N.Sample" "Abundance.F4.130C.Sample"
## [49] "Abundance.F4.131N.Sample" "Abundance.F4.131C.Sample"
## [51] "Abundance.F4.132N.Sample" "Abundance.F4.132C.Sample"
## [53] "Abundance.F4.133N.Sample"
Next, we compile the sample annotations. The sample IDs have been
extracted from the column names in the ProteomeDiscoverer
file, by removing the provided iColPattern
from the main
intensity columns. The group
column will be used to define
groups for the statistical testing later. If a batch
column
is present, that will also be accounted for in the limma
model. See the Comparisons and
design section below for more details about the fitted model(s).
Please check that the table below correspond to your expectations.
sce <- addSampleAnnots(sce, sampleAnnot = sampleAnnot)
DT::datatable(as.data.frame(colData(sce)),
options = list(scrollX = TRUE, pageLength = 20))
We already now define the names of the assays we will be generating and using later in the workflow.
aNames <- defineAssayNames(aName = aName, normMethod = normMethod,
doBatchCorr = "batch" %in% colnames(colData(sce)))
makeTableFromList(aNames)
assayInput | Abundance |
assayLog2WithNA | log2_Abundance_withNA |
assayImputIndic | imputed_Abundance |
assayLog2NormWithNA | log2_Abundance_withNA_norm |
assayImputed | log2_Abundance_norm |
assayBatchCorr | log2_Abundance_norm_batchCorr |
The figure below provides a high-level overview of the workflow, and where the assays defined above will be generated. Note that depending on the settings specified by the user, not all steps may be performed (this is typically indicated by multiple assay names in the table above being equal).
knitr::include_graphics(system.file("extdata", "einprot_workflow.png",
package = "einprot"))
The box plot below displays the distribution of values in the input assay defined above in each sample, on a log scale (excluding any missing values).
makeIntensityBoxplots(sce = sce, assayName = aNames$assayInput, doLog = TRUE,
ylab = aNames$assayInput)
Next, we filter out any proteins classified by PD as potential contaminants, and we may remove protein identifications based on score and number of peptides (i.e. to exclude one-hit wonders).
The excluded features, together with the available annotations, are written to a text file. The UpSet plot below illustrates the overlaps between the sets of features filtered out based on the different criteria (vertical bars).
nbrFeaturesBefore <- nrow(sce)
sce <- filterPDTMT(sce = sce, inputLevel = inputLevel, minScore = minScore,
minPeptides = minPeptides, minDeltaScore = minDeltaScore,
minPSMs = minPSMs, masterProteinsOnly = masterProteinsOnly,
modificationsCol = modificationsCol,
excludeUnmodifiedPeptides = excludeUnmodifiedPeptides,
keepModifications = keepModifications, plotUpset = TRUE,
exclFile = sub("\\.Rmd$", paste0("_filtered_out_features.txt"), knitr::current_input(dir = TRUE)))
nbrFeaturesAfter <- nrow(sce)
if (nbrFeaturesAfter == 0) {
stop("No features left after filtering!")
}
This filtering removed 0 features. The new sce
object
has 11724 features.
The feature ID used when reading the data above are numeric indices.
We replace these IDs with more interpretable ones, corresponding to
idCol
argument. We also add columns holding gene IDs (if
applicable), protein IDs, IDs for matching to STRING (if applicable) and
IDs for labelling points in plots.
sce <- fixFeatureIds(
sce,
colDefs = list(einprotId = idCol, einprotLabel = labelCol,
einprotGene = geneIdCol, einprotProtein = proteinIdCol,
IDsForSTRING = stringIdCol)
)
if (any(duplicated(rowData(sce)[["einprotId"]]))) {
stop("The 'einprotId' column cannot have duplicated entries.")
}
rownames(sce) <- rowData(sce)[["einprotId"]]
## einprotId einprotGene einprotProtein einprotLabel IDsForSTRING
## 1 TTN TTN Q8WZ42 TTN TTN
## 2 AHNAK AHNAK Q09666 AHNAK AHNAK
## 3 PLEC PLEC Q15149 PLEC PLEC
## 4 SPTAN1 SPTAN1 Q13813 SPTAN1 SPTAN1
## 5 MYH9 MYH9 P35579 MYH9 MYH9
## 6 ... ... ... ... ...
## 7 FOXL1 FOXL1 Q12952 FOXL1 FOXL1
## 8 TTC23 TTC23 Q5W5X9 TTC23 TTC23
## 9 GH2 GH2 P01242 GH2 GH2
## 10 CCN5 CCN5 O76076 CCN5 CCN5
## 11 Q14654 <NA> Q14654 Q14654 Q14654
In addition to testing individual features for differential abundance between groups, we can also test collections of features. Here we define the collections that will be used.
if (is.null(geneIdCol)) {
## If no gene IDs are extracted, don't compare to feature collections
featureCollections <- list()
} else {
(featureCollections <- prepareFeatureCollections(
sce = sce, idCol = "einprotGene",
includeFeatureCollections = includeFeatureCollections,
complexDbPath = complexDbPath, speciesInfo = speciesInfo,
complexSpecies = complexSpecies, customComplexes = customComplexes,
minSizeToKeep = minSizeToKeepSet))
}
## $GO
## CharacterList of length 15200
## [["GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS"]] ALDH1L1 ... MTHFD2L
## [["GOBP_2_OXOGLUTARATE_METABOLIC_PROCESS"]] OGDH OGDHL IDH2 ... AADAT TAT
## [["GOBP_2FE_2S_CLUSTER_ASSEMBLY"]] NFS1 GLRX3 GLRX5 HSCB BOLA2
## [["GOBP_3_PHOSPHOADENOSINE_5_PHOSPHOSULFATE_BIOSYNTHETIC_PROCESS"]] PAPSS2 ...
## [["GOBP_3_PHOSPHOADENOSINE_5_PHOSPHOSULFATE_METABOLIC_PROCESS"]] PAPSS2 ...
## [["GOBP_3_UTR_MEDIATED_MRNA_DESTABILIZATION"]] UPF1 KHSRP ... ZFP36 RBM24
## [["GOBP_3_UTR_MEDIATED_MRNA_STABILIZATION"]] HNRNPC RBM10 ... ELAVL4 RBM24
## [["GOBP_5_PHOSPHORIBOSE_1_DIPHOSPHATE_METABOLIC_PROCESS"]] PYGL ... PRPS1L1
## [["GOBP_5S_CLASS_RRNA_TRANSCRIPTION_BY_RNA_POLYMERASE_III"]] GTF3C1 ... GTF3C6
## [["GOBP_ABSCISSION"]] SPART VPS4A IST1 ZFYVE19 CHMP4C AURKB
## ...
## <15190 more elements>
## Remove collections without any sets
featureCollections <- featureCollections[lapply(featureCollections, length) > 0]
Before the downstream analysis, we log2-transform the measured intensities. We also add an additional assay to keep track of the position of the missing values (which will be imputed later).
assay(sce, aNames$assayLog2WithNA) <- log2(assay(sce, aNames$assayInput))
## Add assay indicating missing values, which will be imputed
assay(sce, aNames$assayImputIndic) <- !is.finite(assay(sce, aNames$assayLog2WithNA))
sce
## class: SingleCellExperiment
## dim: 11724 53
## metadata(1): colList
## assays(3): Abundance log2_Abundance_withNA imputed_Abundance
## rownames(11724): TTN AHNAK ... CCN5 Q14654
## rowData names(102): Checked Protein.FDR.Confidence.Combined ...
## einprotProtein IDsForSTRING
## colnames(53): F1.127N.Sample F1.127C.Sample ... F4.132C.Sample
## F4.133N.Sample
## colData names(3): sample group batch
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
The plot below shows the fraction of the total set of features that are detected (with a non-missing value) in each of the samples.
## Replace zeros/-Inf values by explicit NA values in the assays
assay(sce, aNames$assayInput)[assay(sce, aNames$assayInput) == 0] <- NA
assay(sce, aNames$assayLog2WithNA)[!is.finite(assay(sce, aNames$assayLog2WithNA))] <- NA
## Count number of NA values and add to SCE
colData(sce)$nNA <- colSums(is.na(assay(sce, aNames$assayInput)))
colData(sce)$pNA <- 100 * sce$nNA/nrow(sce)
rowData(sce)$nNA <- rowSums(is.na(assay(sce, aNames$assayInput)))
rowData(sce)$pNA <- 100 * rowData(sce)$nNA/ncol(sce)
plotFractionDetectedPerSample(dfNA = as.data.frame(colData(sce)[, c("sample", "nNA", "pNA")]))
We also plot the number of features that are detected in a given number of samples.
plotDetectedInSamples(dfNA = as.data.frame(rowData(sce)[, c("nNA", "pNA")]))
The log2 intensities are next normalized across samples using the center.median method.
sce <- doNormalization(sce, method = normMethod,
assayName = aNames$assayLog2WithNA,
normalizedAssayName = aNames$assayLog2NormWithNA,
spikeFeatures = spikeFeatures)
makeIntensityBoxplots(sce = sce, assayName = aNames$assayLog2NormWithNA,
doLog = FALSE,
ylab = aNames$assayLog2NormWithNA)
## Warning: Removed 99731 rows containing non-finite values (`stat_boxplot()`).
Next, we apply the MinProb method to perform imputation of the log2-transformed data, and plot the distribution of imputed and non-imputed values in each sample.
set.seed(seed)
sce <- doImputation(sce, method = imputeMethod,
assayName = aNames$assayLog2NormWithNA,
imputedAssayName = aNames$assayImputed)
## [1] 0.6858832
plotImputationDistribution(sce, assayToPlot = aNames$assayImputed,
assayImputation = aNames$assayImputIndic,
xlab = aNames$assayImputed)
Next we consider the overall distribution of log2-intensities among the samples (after imputation).
makeIntensityBoxplots(sce = sce, assayName = aNames$assayImputed,
doLog = FALSE,
ylab = aNames$assayImputed)
if ("batch" %in% colnames(colData(sce))) {
if (subtractBaseline) {
assay(sce, aNames$assayBatchCorr) <-
getMatSubtractedBaseline(sce, assayName = aNames$assayImputed,
baselineGroup = baselineGroup,
sceFull = sce)
} else {
assay(sce, aNames$assayBatchCorr) <-
removeBatchEffect(assay(sce, aNames$assayImputed),
batch = sce$batch,
design = model.matrix(~ sce$group))
}
}
For each feature, we then compare the (possibly imputed) log2 intensities between groups. For this, we use the limma R/Bioconductor package (Ritchie et al. 2015; Phipson et al. 2016). For more information about the df.prior, representing the amount of extra information that is borrowed from the full set of features in order to improve the inference for each feature, see section 13.2 in the limma user guide. In addition to the feature-wise tests, we apply the camera method (Wu and Smyth 2012) to test for significance of each included feature collection. These tests are based on the t-statistics returned from limma.
## Set the assay to use for tests later
if (stattest == "proDA") {
assayForTests <- aNames$assayLog2NormWithNA
} else {
assayForTests <- aNames$assayImputed
}
The log2_Abundance_norm assay will be used for the tests. The following pairwise comparisons will be performed (in each case, the first listed group will be the ‘baseline’ group):
if (subtractBaseline) {
discardGroup <- baselineGroup
} else {
discardGroup <- NULL
}
if (stattest == "none") {
comparisonList <- list(comparisons = list(),
groupComposition = list())
} else {
comparisonList <- makeListOfComparisons(
allGroups = unique(sce$group), comparisons = comparisons,
mergeGroups = mergeGroups,
allPairwiseComparisons = allPairwiseComparisons,
ctrlGroup = ctrlGroup, discardGroup = discardGroup)
}
comparisonList$comparisons
## $Medulla_COVID_vs_Medulla_Control
## [1] "Medulla_Control" "Medulla_COVID"
##
## $Cortex_COVID_vs_Cortex_Control
## [1] "Cortex_Control" "Cortex_COVID"
comparisonList$groupComposition
## $Cortex_COVID
## [1] "Cortex_COVID"
##
## $Medulla_COVID
## [1] "Medulla_COVID"
##
## $Medulla_Control
## [1] "Medulla_Control"
##
## $Cortex_Control
## [1] "Cortex_Control"
if (any(assaysForExport %in% assayNames(sce))) {
assaysForExport <- intersect(assaysForExport, assayNames(sce))
} else {
assaysForExport <- aNames$assayInput
}
testres <- runTest(sce = sce, comparisons = comparisonList$comparisons,
groupComposition = comparisonList$groupComposition,
testType = stattest,
assayForTests = assayForTests,
assayImputation = aNames$assayImputIndic,
minNbrValidValues = minNbrValidValues,
minlFC = minlFC, featureCollections = featureCollections,
complexFDRThr = complexFDRThr,
volcanoAdjPvalThr = volcanoAdjPvalThr,
volcanoLog2FCThr = volcanoLog2FCThr,
baseFileName = sub("\\.Rmd$", "", knitr::current_input()),
seed = seed, samSignificance = samSignificance,
nperm = nperm, volcanoS0 = volcanoS0,
aName = assaysForExport, addAbundanceValues = TRUE,
singleFit = singleFit,
subtractBaseline = subtractBaseline,
baselineGroup = baselineGroup,
extraColumns = union(interactiveDisplayColumns,
interactiveGroupColumn))
for (cmp in names(comparisonList$comparisons)) {
## Add WormBase/PomBase IDs if applicable
if (speciesInfo$speciesCommon == "roundworm") {
testres$tests[[cmp]]$WormBaseID <-
vapply(testres$tests[[cmp]][["einprotProtein"]],
function(mpds) {
wbids <- unlist(lapply(strsplit(mpds, ";")[[1]], function(mpd) {
wbconv$WormBaseID[wbconv$UniProtKB.ID == mpd |
wbconv$UniProtID == mpd]
}))
if (length(wbids[!is.na(wbids)]) != 0 &&
length(setdiff(wbids[!is.na(wbids)], "")) != 0) {
wbids <- setdiff(wbids, "")
wbids <- wbids[!is.na(wbids)]
paste(wbids, collapse = ";")
} else {
""
}
}, "NA")
} else if (speciesInfo$speciesCommon == "fission yeast") {
testres$tests[[cmp]]$PomBaseID <-
vapply(testres$tests[[cmp]][["einprotProtein"]],
function(mpds) {
pbids <- unlist(lapply(strsplit(mpds, ";")[[1]], function(mpd) {
pbconv$PomBaseID[pbconv$UniProtID == mpd]
}))
if (length(pbids[!is.na(pbids)]) != 0 &&
length(setdiff(pbids[!is.na(pbids)], "")) != 0) {
pbids <- setdiff(pbids, "")
pbids <- pbids[!is.na(pbids)]
paste(pbids, collapse = ";")
} else {
""
}
}, "NA")
}
}
The plots below illustrate the experimental design used for the
linear model(s) and contrasts by limma
. The plot to the
right shows the number of samples for each combination of factor levels
across the predictors, and is useful for detecting imbalances between
group sizes for different conditions. The plot to the left summarizes
the expected response value for each combination of predictor levels,
expressed in terms of the linear model coefficients. For more details on
how to interpret the plots, we refer to Soneson
et al. (2020) or Law et al. (2020).
Clicking on the arrow below the plots will reveal the design matrix used
by limma, as well as the contrasts that were fit for each
comparison.
if ("design" %in% names(testres$design)) {
cat("\n### Overall design \n")
vd <- VisualizeDesign(
testres$design$sampleData, designFormula = NULL,
designMatrix = testres$design$design)
print(cowplot::plot_grid(
plotlist = c(vd$plotlist, vd$cooccurrenceplots), nrow = 1)
)
cat("\n\n")
cat("<details>\n<summary><b>\nClick to display design matrix and contrast(s)\n</b></summary>\n")
cat("\n````\n")
print(testres$design$design)
cat("\n````\n")
cat("\n````\n")
cat("Contrast(s): \n")
print(testres$design$contrasts)
cat("\n````\n")
cat("\n````\n")
cat("Sample weights: \n")
print(testres$design$sampleWeights)
cat("\n````\n")
cat("\n</details>\n\n")
} else {
for (nm in names(testres$design)) {
cat("\n###", nm, "\n")
vd <- VisualizeDesign(
testres$design[[nm]]$sampleData, designFormula = NULL,
designMatrix = testres$design[[nm]]$design)
print(cowplot::plot_grid(
plotlist = c(vd$plotlist, vd$cooccurrenceplots), nrow = 1)
)
cat("\n\n")
cat("<details>\n<summary><b>\nClick to display design matrix and contrast(s)\n</b></summary>\n")
cat("\n````\n")
print(testres$design[[nm]]$design)
cat("\n````\n")
cat("\n````\n")
cat("Contrast: \n")
print(testres$design[[nm]]$contrast)
cat("\n````\n")
cat("\n````\n")
cat("Sample weights: \n")
print(testres$design[[nm]]$sampleWeights)
cat("\n````\n")
cat("\n</details>\n\n")
}
}
(Intercept) bcb2 bcb3 bcb4 fcMedulla_COVID
F1.127C.Sample 1 0 0 0 1
F1.128C.Sample 1 0 0 0 1
F1.129C.Sample 1 0 0 0 1
F1.130C.Sample 1 0 0 0 1
F1.131C.Sample 1 0 0 0 0
F1.133N.Sample 1 0 0 0 0
F2.127C.Sample 1 1 0 0 1
F2.128C.Sample 1 1 0 0 1
F2.129C.Sample 1 1 0 0 1
F2.130C.Sample 1 1 0 0 1
F2.131C.Sample 1 1 0 0 0
F2.132C.Sample 1 1 0 0 0
F3.127C.Sample 1 0 1 0 1
F3.128C.Sample 1 0 1 0 1
F3.129C.Sample 1 0 1 0 1
F3.130C.Sample 1 0 1 0 1
F3.131C.Sample 1 0 1 0 1
F3.132C.Sample 1 0 1 0 0
F3.133C.Sample 1 0 1 0 0
F4.127C.Sample 1 0 0 1 1
F4.128C.Sample 1 0 0 1 1
F4.130N.Sample 1 0 0 1 1
F4.131N.Sample 1 0 0 1 0
F4.132N.Sample 1 0 0 1 0
F4.133N.Sample 1 0 0 1 0
attr(,"assign")
[1] 0 1 1 1 2
attr(,"contrasts")
attr(,"contrasts")$bc
[1] "contr.treatment"
attr(,"contrasts")$fc
[1] "contr.treatment"
Contrast:
[1] 0 0 0 0 1
Sample weights:
NULL
(Intercept) bcb2 bcb3 bcb4 fcCortex_COVID
F1.127N.Sample 1 0 0 0 1
F1.128N.Sample 1 0 0 0 1
F1.129N.Sample 1 0 0 0 1
F1.130N.Sample 1 0 0 0 1
F1.131N.Sample 1 0 0 0 1
F1.132N.Sample 1 0 0 0 0
F1.132C.Sample 1 0 0 0 0
F2.127N.Sample 1 1 0 0 1
F2.128N.Sample 1 1 0 0 1
F2.129N.Sample 1 1 0 0 1
F2.130N.Sample 1 1 0 0 1
F2.131N.Sample 1 1 0 0 0
F2.132N.Sample 1 1 0 0 0
F2.133N.Sample 1 1 0 0 0
F3.127N.Sample 1 0 1 0 1
F3.128N.Sample 1 0 1 0 1
F3.129N.Sample 1 0 1 0 1
F3.130N.Sample 1 0 1 0 1
F3.131N.Sample 1 0 1 0 1
F3.132N.Sample 1 0 1 0 0
F3.133N.Sample 1 0 1 0 0
F4.127N.Sample 1 0 0 1 1
F4.128N.Sample 1 0 0 1 1
F4.129N.Sample 1 0 0 1 1
F4.129C.Sample 1 0 0 1 1
F4.130C.Sample 1 0 0 1 0
F4.131C.Sample 1 0 0 1 0
F4.132C.Sample 1 0 0 1 0
attr(,"assign")
[1] 0 1 1 1 2
attr(,"contrasts")
attr(,"contrasts")$bc
[1] "contr.treatment"
attr(,"contrasts")$fc
[1] "contr.treatment"
Contrast:
[1] 0 0 0 0 1
Sample weights:
NULL
We first show a diagnostic plot for each comparison. These plots
display the square root of the residual standard deviation (y-axis)
versus the mean abundance (across all the groups used to perform the
model fit, x-axis). The curve indicated in the plots show the
mean-variance trend inferred by limma
.
for (i in seq.int(ceiling(length(testres$tests) / 3))) {
tmplist <- testres$tests[(seq_along(testres$tests) - 1) %/% 3 == (i - 1)]
print(makeSAPlot(tmplist))
}
Below we display a volcano plot for each comparison. These plots are also saved to pdf files. In each plot, a subset of (up to 25) significant hits are indicated by name (selected as the ones with the largest Manhattan distance to the origin). These proteins are also used to generate STRING networks (Szklarczyk et al. 2021) (separately for the up- and downregulated ones), which are included in the pdf file. Any features explicitly requested (see the table above) are also labeled in the volcano plots. In addition to these pdf files, if “complexes” is specified to be included in the feature collections (and tested for significance using camera), we also generate a multi-page pdf file showing the position of the features of each significantly differentially abundant complex in the volcano plot, as well as bar plots of the features’ abundance values in the compared samples. This pdf file is only generated if there is at least one significant complex (with adjusted p-value below the specified complexFDRThr=0.1).
interactiveVolcanos <- htmltools::tagList()
for (nm in names(testres$tests)) {
plots <- plotVolcano(
sce = sce, res = testres$tests[[nm]], testType = stattest,
xv = NULL, yv = NULL, xvma = NULL, volcind = NULL,
plotnote = testres$plotnotes[[nm]],
plottitle = testres$plottitles[[nm]],
plotsubtitle = testres$plotsubtitles[[nm]],
volcanoFeaturesToLabel = volcanoFeaturesToLabel,
volcanoMaxFeatures = volcanoMaxFeatures,
volcanoLabelSign = volcanoLabelSign,
baseFileName = paste0(sub("\\.Rmd$", "", knitr::current_input()),
"_volcano_", nm),
comparisonString = nm,
groupComposition = comparisonList$groupComposition[comparisonList$comparisons[[nm]]],
stringDb = string_db,
featureCollections = testres$featureCollections,
complexFDRThr = complexFDRThr,
maxNbrComplexesToPlot = maxNbrComplexesToPlot,
curveparam = testres$curveparams[[nm]],
abundanceColPat = assaysForExport,
xlab = "log2(fold change)",
ylab = "-log10(p-value)",
xlabma = "Average abundance",
labelOnlySignificant = TRUE,
interactiveDisplayColumns = interactiveDisplayColumns,
interactiveGroupColumn = interactiveGroupColumn,
maxTextWidthBarplot = 5.1)
if (!is.null(plots$ggma) && !is.null(plots$ggwf) && !is.null(plots$ggbar)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
print(plots$ggma)
for (ggb in plots$ggbar) print(ggb)
print(plots$ggwf)
cat("\n\n")
} else if (!is.null(plots$ggma) && !is.null(plots$ggwf)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
print(plots$ggma)
print(plots$ggwf)
cat("\n\n")
} else if (!is.null(plots$ggma) && !is.null(plots$ggbar)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
print(plots$ggma)
for (ggb in plots$ggbar) print(ggb)
cat("\n\n")
} else if (!is.null(plots$ggbar) && !is.null(plots$ggwf)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
for (ggb in plots$ggbar) print(ggb)
print(plots$ggwf)
cat("\n\n")
} else if (!is.null(plots$ggma)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
print(plots$ggma)
cat("\n\n")
} else if (!is.null(plots$ggbar)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
for (ggb in plots$ggbar) print(ggb)
cat("\n\n")
} else if (!is.null(plots$ggwf)) {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
print(plots$ggwf)
cat("\n\n")
} else {
cat("\n\n### ", nm, " \n\n\n")
print(plots$gg)
cat("\n\n")
}
interactiveVolcanos[[nm]] <- plots$ggint
}
Warning: we couldn’t map to STRING 4% of your identifiers
For each comparison, we also save a text file with the “significant” features (defined as those colored in the volcano plots above). The features are ordered by the logFC value.
## Merge results from all tests and add to rowData(sce)
tests <- testres$tests
for (nm in names(tests)) {
idx <- which(colnames(tests[[nm]]) != "pid")
colnames(tests[[nm]])[idx] <- paste0(nm, ".", colnames(tests[[nm]])[idx])
}
all_tests <- as.data.frame(Reduce(function(...) dplyr::full_join(..., by = "pid"),
tests), optional = TRUE)
rownames(all_tests) <- all_tests$pid
all_tests$pid <- NULL
stopifnot(rownames(sce) == rownames(all_tests))
rowData(sce) <- cbind(rowData(sce), all_tests)
The UpSet plot below shows the overlap among the “significant” features (defined as the ones that are colored in the volcano plots above, based on the user specifications) from the different comparisons. Note that if there are many comparisons, not all combinations may be displayed in the plot (only the 50 combinations with the largest number of features are shown, for interpretability reasons). Moreover, the UpSet plot will only be shown if at least two comparisons have been made, and there are at least two comparisons where any features were deemed significant.
tmpsign <- all_tests %>% dplyr::select(contains("showInVolcano")) + 0
tmpsign[is.na(tmpsign)] <- 0
## Only features that are significant in at least one comparison
tmpsign <- tmpsign[rowSums(tmpsign) > 0, , drop = FALSE]
colnames(tmpsign) <- gsub("\\.showInVolcano$", "", colnames(tmpsign))
if (length(tests) > 1 && sum(colSums(tmpsign) > 0) > 1) {
ComplexUpset::upset(tmpsign, intersect = colnames(tmpsign),
sort_intersections_by = "cardinality")
}
Finally, we display the top significant feature sets in each of the tested collections, for each comparison. We recommend that the (adjusted) p-values for the feature sets is interpreted with caution, especially in situations where the feature abundances are measured on an isoform level and the feature sets are defined on the protein level, since there will be many (sometimes strongly correlated) features corresponding to a single gene or protein annotated to a feature set.
for (nm in names(testres$topsets)) {
if (length(testres$topsets[[nm]]) > 0) {
plts <- lapply(names(testres$topsets[[nm]]), function(snm) {
df <- testres$topsets[[nm]][[snm]]
if (nrow(df) > maxNbrComplexesToPlot) {
df <- df[seq_len(maxNbrComplexesToPlot), ]
}
if (nrow(df) > 0) {
ggplot(df %>% dplyr::mutate(set = factor(.data$set, levels = rev(.data$set))),
aes(x = .data$set, y = -log10(.data[[paste0(nm, "_FDR")]]),
fill = .data[[paste0(nm, "_Direction")]])) +
geom_bar(stat = "identity") +
coord_flip() + theme_bw() +
labs(x = "", title = snm) +
scale_fill_manual(values = c(Up = scales::muted("blue"),
Down = scales::muted("red")),
name = "Direction") +
theme(axis.text = element_text(size = 6))
} else {
NULL
}
})
if (sum(sapply(plts, is.null)) < length(plts)) {
cat("\n\n### ", nm, " \n\n\n")
print(cowplot::plot_grid(plotlist = plts, ncol = 1, align = "v"))
cat("\n\n")
}
}
}
The table below provides autogenerated links to the UniProt and
AlphaFold pages (as well as selected organism-specific databases) for
the protein IDs corresponding to each feature in the data set. The ‘pid’
column represents the unique feature ID used by einprot
,
and the einprotLabel
column contains the user-defined
feature labels. UniProt is a resource of protein sequence and functional
information hosted by EMBL-EBI, PIR and SIB. The AlphaFold Protein
Structure Database, developed by DeepMind and EMBL-EBI, provides open
access to protein structure predictions for the human proteome and other
key proteins of interest. Note that (depending on the species) many
proteins are not yet covered in AlphaFold (in this case, the link below
will lead to a non-existent page), and that numeric values are rounded
to four significant digits to increase readability.
linkTable <- makeDbLinkTable(
df = as.data.frame(rowData(sce)) %>%
rownames_to_column("pid") %>%
dplyr::select("pid", "einprotProtein",
matches(setdiff(c("^einprotLabel$", linkTableColumns), ""), perl = TRUE)),
idCol = "einprotProtein",
speciesCommon = speciesInfo$speciesCommon,
addSpeciesSpecificColumns = TRUE,
convTablePomBase = pbconv,
convTableWormBase = wbconv,
removeSuffix = TRUE, signifDigits = 4
) %>%
dplyr::select(-.data$einprotProtein) %>%
dplyr::relocate(any_of("Description"), .after = dplyr::last_col())
DT::datatable(
as.data.frame(linkTable), escape = FALSE,
filter = list(position = "top", clear = FALSE),
extensions = "Buttons",
options = list(scrollX = TRUE, pageLength = 20,
search = list(regex = FALSE, caseInsensitive = TRUE),
dom = "Bfrltip", buttons =
list(list(extend = "csv",
filename = paste0(sub("\\.Rmd$", "",
knitr::current_input()), "_linktable")),
list(extend = "excel", title = "",
filename = paste0(sub("\\.Rmd$", "",
knitr::current_input()), "_linktable"))))
)
We assemble all the information calculated above in a SingleCellExperiment object, which can later be used e.g. for exploration with iSEE (Rue-Albrecht et al. 2018).
sce <- prepareFinalSCE(
sce = sce, baseFileName = sub("\\.Rmd$", "", knitr::current_input()),
featureCollections = testres$featureCollections, expType = "ProteomeDiscoverer")
## Add experiment metadata
S4Vectors::metadata(sce) <- c(
S4Vectors::metadata(sce),
list(
pdFile = pdFile,
aNames = aNames,
aName = aNames$assayInput,
iColPattern = iColPattern,
imputeMethod = imputeMethod,
ctrlGroup = ctrlGroup,
allPairwiseComparisons = allPairwiseComparisons,
normMethod = normMethod,
stattest = stattest,
minlFC = minlFC,
analysisDate = as.character(Sys.Date()),
rmdFile = knitr::current_input(dir = TRUE),
testres = testres,
comparisonList = comparisonList,
modificationsCol = modificationsCol,
keepModifications = keepModifications
)
)
sce
## class: SingleCellExperiment
## dim: 11724 53
## metadata(18): colList iSEE ... modificationsCol keepModifications
## assays(6): Abundance log2_Abundance_withNA ... log2_Abundance_norm
## log2_Abundance_norm_batchCorr
## rownames(11724): TTN AHNAK ... CCN5 Q14654
## rowData names(145): Checked Protein.FDR.Confidence.Combined ...
## Cortex_COVID_vs_Cortex_Control.log2_Abundance.Cortex_Control.sd
## Cortex_COVID_vs_Cortex_Control.IDsForSTRING
## colnames(53): F1.127N.Sample F1.127C.Sample ... F4.132C.Sample
## F4.133N.Sample
## colData names(5): sample group batch nNA pNA
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
pcafeatures <- which(rowSums(!assay(sce, aNames$assayImputIndic)) >= minNbrValidValues)
We run a principal component analysis to obtain a reduced dimensionality representation of the data, in order to visualize the samples in two dimensions. The PCA is based on features with at least 2 non-imputed values (11153 of the 11724 features). The figure below shows the sample representation in the first two principal components, the fraction of variance explained by each of the principal components, and the features with the highest positive and negative loadings in the displayed components.
interactivePCAs <- htmltools::tagList()
for (a in unique(c(aNames$assayImputed, aNames$assayBatchCorr))) {
pcares <- doPCA(sce = sce, assayName = a, ncomponents = 10, ntop = Inf,
plotpairs = list(c(1, 2)), maxTextWidthBarplot = 1.9,
subset_row = pcafeatures)
sce <- pcares$sce
interactivePCAs[[a]] <- ggplotly(pcares$plotcoord[[1]],
width = 750, height = 500)
for (plc in pcares$plotcombined) {
print(plc)
}
}
For another birds-eye view of the data, we represent it using a hierarchical clustering of the samples, based on Euclidean distance of feature-centered data, and Ward linkage.
plotassay <- assay(sce, aNames$assayImputed)
colnames(plotassay) <- sub(iColPattern, "", colnames(plotassay))
sampledists <- dist(scale(t(plotassay), center = TRUE, scale = FALSE))
plot(hclust(sampledists, method = "ward.D2"), hang = -1, xlab = "", sub = "")
The plot below shows the pairwise Pearson correlations between all pairs of samples, based on the log2_Abundance_norm assay.
plotassay <- assay(sce, aNames$assayImputed)
ggplot(data = as.data.frame(cor(plotassay, method = "pearson")) %>%
rownames_to_column("sample1") %>%
tidyr::pivot_longer(names_to = "sample2", values_to = "correlation",
-"sample1"),
aes(x = .data$sample1, y = .data$sample2, fill = .data$correlation)) +
geom_tile(color = "grey95", linewidth = 0.1) +
scale_fill_gradient(low = "grey95", high = "darkblue") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
labs(x = "", y = "") +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0))
The SingleCellExperiment
object created above is saved
in the following location:
sceFile <- sub("\\.Rmd$", paste0("_sce.rds"), knitr::current_input(dir = TRUE))
saveRDS(sce, file = sceFile)
sceFile
## [1] "/Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/he_2022_pd_einprot_0.7.4/he_2022_pd_einprot_0.7.4_sce.rds"
In addition, all feature information (the rowData
of the
SingleCellExperiment
) is written to a text file:
textFile <- sub("\\.Rmd$", paste0("_feature_info.txt"),
knitr::current_input(dir = TRUE))
write.table(as.data.frame(rowData(sce)) %>%
rownames_to_column("FeatureID"),
file = textFile, row.names = FALSE, col.names = TRUE,
quote = FALSE, sep = "\t")
textFile
## [1] "/Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/he_2022_pd_einprot_0.7.4/he_2022_pd_einprot_0.7.4_feature_info.txt"
For interactive exploration of your results, we generate a script to launch an adapted iSEE interface. The script can be sourced from an R console:
source('/Users/charlottesoneson/Documents/Rpackages/einprot/einprot_examples/he_2022/he_2022_pd_einprot_0.7.4/he_2022_pd_einprot_0.7.4_iSEE.R')
That will open up an iSEE session where you can interactively explore your data.
This report was compiled with the following package versions:
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] einprot_0.7.4 ComplexHeatmap_2.16.0
## [3] plotly_4.10.2 ggalt_0.4.0
## [5] BiocSingular_1.16.0 scater_1.28.0
## [7] scuttle_1.10.1 SingleCellExperiment_1.22.0
## [9] tibble_3.2.1 ggplot2_3.4.2
## [11] ComplexUpset_1.3.3 dplyr_1.1.2
## [13] htmltools_0.5.5 cowplot_1.1.1
## [15] ExploreModelMatrix_1.12.0 limma_3.56.2
## [17] DT_0.28 SummarizedExperiment_1.30.2
## [19] Biobase_2.60.0 GenomicRanges_1.52.0
## [21] GenomeInfoDb_1.36.1 IRanges_2.34.1
## [23] S4Vectors_0.38.1 BiocGenerics_0.46.0
## [25] MatrixGenerics_1.12.2 matrixStats_1.0.0
## [27] STRINGdb_2.12.0
##
## loaded via a namespace (and not attached):
## [1] ProtGenerics_1.32.0 bitops_1.0-7
## [3] webshot_0.5.5 httr_1.4.6
## [5] ash_1.0-15 RColorBrewer_1.1-3
## [7] doParallel_1.0.17 tools_4.3.1
## [9] utf8_1.2.3 R6_2.5.1
## [11] lazyeval_0.2.2 mgcv_1.8-42
## [13] GetoptLong_1.0.5 withr_2.5.0
## [15] iSEEhex_1.2.0 GGally_2.1.2
## [17] gridExtra_2.3 cli_3.6.1
## [19] shinyjs_2.1.0 sandwich_3.0-2
## [21] labeling_0.4.2 sass_0.4.6
## [23] robustbase_0.99-0 mvtnorm_1.2-2
## [25] readr_2.1.4 genefilter_1.82.1
## [27] systemfonts_1.0.4 svglite_2.1.1
## [29] stringdist_0.9.10 rrcov_1.7-4
## [31] plotrix_3.8-2 maps_3.4.1
## [33] rstudioapi_0.15.0 impute_1.74.1
## [35] RSQLite_2.3.1 generics_0.1.3
## [37] shape_1.4.6 crosstalk_1.2.0
## [39] gtools_3.9.4 Matrix_1.5-4.1
## [41] ggbeeswarm_0.7.2 fansi_1.0.4
## [43] imputeLCMD_2.1 lifecycle_1.0.3
## [45] yaml_2.3.7 iSEEu_1.12.0
## [47] gplots_3.1.3 blob_1.2.4
## [49] promises_1.2.0.1 crayon_1.5.2
## [51] shinydashboard_0.7.2 miniUI_0.1.1.1
## [53] lattice_0.21-8 msigdbr_7.5.1
## [55] beachmat_2.16.0 annotate_1.78.0
## [57] KEGGREST_1.40.0 pillar_1.9.0
## [59] knitr_1.43.1 rjson_0.2.21
## [61] codetools_0.2-19 glue_1.6.2
## [63] ggiraph_0.8.7 pcaMethods_1.92.0
## [65] data.table_1.14.8 MultiAssayExperiment_1.26.0
## [67] vctrs_0.6.3 png_0.1-8
## [69] gtable_0.3.3 gsubfn_0.7
## [71] cachem_1.0.8 xfun_0.39
## [73] S4Arrays_1.0.4 mime_0.12
## [75] pcaPP_2.0-3 survival_3.5-5
## [77] rrcovNA_0.5-0 iterators_1.0.14
## [79] gmm_1.8 iSEE_2.12.0
## [81] ellipsis_0.3.2 nlme_3.1-162
## [83] bit64_4.0.5 bslib_0.5.0
## [85] tmvtnorm_1.5 irlba_2.3.5.1
## [87] vipor_0.4.5 KernSmooth_2.23-21
## [89] colorspace_2.1-0 DBI_1.1.3
## [91] proDA_1.14.0 tidyselect_1.2.0
## [93] bit_4.0.5 compiler_4.3.1
## [95] extrafontdb_1.0 chron_2.3-61
## [97] rvest_1.0.3 BiocNeighbors_1.18.0
## [99] xml2_1.3.5 DelayedArray_0.26.6
## [101] colourpicker_1.2.0 scales_1.2.1
## [103] caTools_1.18.2 DEoptimR_1.0-14
## [105] proj4_1.0-12 hexbin_1.28.3
## [107] stringr_1.5.0 digest_0.6.33
## [109] rmarkdown_2.23 XVector_0.40.0
## [111] pkgconfig_2.0.3 extrafont_0.19
## [113] sparseMatrixStats_1.12.0 highr_0.10
## [115] fastmap_1.1.1 rlang_1.1.1
## [117] GlobalOptions_0.1.2 htmlwidgets_1.6.2
## [119] shiny_1.7.4 DelayedMatrixStats_1.22.0
## [121] farver_2.1.1 jquerylib_0.1.4
## [123] zoo_1.8-12 jsonlite_1.8.7
## [125] mclust_6.0.0 BiocParallel_1.34.2
## [127] RCurl_1.98-1.12 magrittr_2.0.3
## [129] kableExtra_1.3.4 GenomeInfoDbData_1.2.10
## [131] patchwork_1.1.2 munsell_0.5.0
## [133] Rcpp_1.0.11 babelgene_22.9
## [135] viridis_0.6.3 proto_1.0.0
## [137] MsCoreUtils_1.13.1 sqldf_0.4-11
## [139] stringi_1.7.12 rintrojs_0.3.2
## [141] zlibbioc_1.46.0 MASS_7.3-60
## [143] plyr_1.8.8 ggseqlogo_0.1
## [145] parallel_4.3.1 ggrepel_0.9.3
## [147] forcats_1.0.0 Biostrings_2.68.1
## [149] splines_4.3.1 hash_2.2.6.2
## [151] hms_1.1.3 circlize_0.4.15
## [153] igraph_1.5.0 uuid_1.1-0
## [155] QFeatures_1.10.0 ScaledMatrix_1.8.1
## [157] XML_3.99-0.14 evaluate_0.21
## [159] tzdb_0.4.0 foreach_1.5.2
## [161] httpuv_1.6.11 Rttf2pt1_1.3.12
## [163] tidyr_1.3.0 purrr_1.0.1
## [165] reshape_0.8.9 clue_0.3-64
## [167] norm_1.0-11.1 rsvd_1.0.5
## [169] xtable_1.8-4 AnnotationFilter_1.24.0
## [171] later_1.3.1 viridisLite_0.4.2
## [173] memoise_2.0.1 beeswarm_0.4.0
## [175] AnnotationDbi_1.62.2 writexl_1.4.2
## [177] cluster_2.1.4 shinyWidgets_0.7.6
## [179] shinyAce_0.4.2