R calculate how many values used to calculate mean in aggregate function - r

I have a dataframe of daily observations dating from 1963-2022. I want to calculate the mean of the observation for each month. However, some months don't have data for each day and some only have one datapoint for one month. This skews some of the data points. How do I calculate how many observations have been used to calculate the mean for a given month.
Head of Data frame
structure(list(prcp_amt = c(0, 1.8, 6.4, 5.1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 4.3, 0, 0, 0, 0, 4.6, 0, 0, 0, 0, 0, 0, 0, 0.3,
4.8, 0, 0, 4.1, 0, 0, 0, 0.3, 3.6, 6.6, 0, 0, 0, 0, 0, 0, 0.8,
0, 0, 0, 0, 0), ob_date = structure(c(-220838400, -220752000,
-220665600, -220579200, -220492800, -220406400, -220320000, -220233600,
-220147200, -220060800, -219974400, -219888000, -219801600, -219715200,
-219628800, -219542400, -219456000, -219369600, -219283200, -219196800,
-219110400, -219024000, -218937600, -218851200, -218764800, -218678400,
-218592000, -218505600, -218419200, -218332800, -218246400, -218160000,
-218073600, -217987200, -217900800, -217814400, -217728000, -217641600,
-217555200, -217468800, -217382400, -217296000, -217209600, -217123200,
-217036800, -216950400, -216864000, -216777600, -216691200, -216604800
), class = c("POSIXct", "POSIXt"), tzone = "GMT")), row.names = c(NA,
50L), class = "data.frame")
Existing code
# historic monthly rainfall
rainHist$month <- as.numeric(format(rainHist$ob_date, '%m'))
rainHist$year <- as.numeric(format(rainHist$ob_date, '%Y'))
rainHistMean <- aggregate(prcp_amt ~ month + year, rainHist, FUN=mean)
rainHistMean$day <- 01
rainHistMean <-
rainHistMean %>%
mutate(rainHistMean, Date=paste(year, month, day, sep='-'))
rainHistMean[['Date']] <- as.POSIXct(rainHistMean[['Date']],
format='%Y-%m-%d',
tz='GMT'
)
Updated Code
rainHist$month <- as.numeric(format(rainHist$ob_date, '%m'))
rainHist$year <- as.numeric(format(rainHist$ob_date, '%Y'))
rainHistMean <- aggregate(prcp_amt ~ month + year, rainHist, FUN=function(x) c(mean(x), length(x)))
names(rainHistMean) <- c('month', 'year', 'prcp_amt', 'n')
How do I get there to be 4 columns not 3 with a matrix?
Solution
rainHist$month <- as.numeric(format(rainHist$ob_date, '%m'))
rainHist$year <- as.numeric(format(rainHist$ob_date, '%Y'))
rainHistMean <- aggregate(prcp_amt ~ month + year, rainHist, FUN=function(x) c(mean(x), length(x)))
rainHistMean <- data.frame(rainHistMean[1:2], rainHistMean[[3]])
names(rainHistMean) <- c('month', 'year', 'prcp_amt', 'n')

There may be more elegant solutions, but you can use dplyr to group by month and year, then get the count and mean in summarize:
df %>%
group_by(month(ob_date), year(ob_date)) %>%
summarize(mean_prcp = mean(prcp_amt),
count = n())
Output:
# # Groups: month(ob_date) [2]
# `month(ob_date)` `year(ob_date)` mean_prcp count
# <dbl> <dbl> <dbl> <int>
# 1 1 1963 0.91 30
# 2 2 1963 0.77 20

Related

R: How do I sort a dataframe based on a numeric vector?

I want to calculate the fold change between thyroid and testes dataframe using TPM values and provide the top 10 genes overexpressed in testes tissue (testes$gene_id in the testes dataframe).
In my code below, I first calculated the fold change and store it as a numeric vector tpm.foldchange but then I don't know how to sort the gene_id column of the testes dataframe based on the sorted fold-change values tpm.foldchange.
# Parse the gene results file from the testes and thyroid output
thyroid <- read.table("thyroid.genes.results", header=T, sep="\t")
testes <- read.table("testes.genes.results", header=T, sep="\t")
# Extract the TPM values
# Add one to each value and log them (base 2)
library(tidyverse)
thyroid.tpm <- log(thyroid %>% pull(TPM) + 1)
testes.tpm <- log(testes %>% pull(TPM) + 1)
# Pearson's correlation coefficient between thyroid and testes using TPM
cor(thyroid.tpm, testes.tpm, method="pearson")
# Calculate fold change between the testes and thyroid tissue TPM values and provide top 10 genes that are overexpressed in testes
library(gtools)
tpm.foldchange <- foldchange(testes.tpm, thyroid.tpm)
#tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.foldchange, decreasing=T)
tpm.sortedgenes <- testes[order(factor(testes$TPM, levels=tpm.sorted)),]
tpm.top10genes <- head(tpm.sortedgenes, 10)
testes[order(factor(testes$TPM, levels=tpm.sorted)),]
I initially wanted to sort after merging like this:
tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.df$tpm.foldchange, decreasing=T)
but it raised an error:
Error: cannot allocate vector of size 8.0 Gb
thyroid dataframe:
# Show only the first 20 rows, first column, and 6th column of thyroid dataframe
dput(thyroid[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(0, 45.96,
2.72, 2.4, 1.67, 5.14, 4.33, 47.68, 81.1, 10.12, 0.96, 45.21,
0, 19.63, 15.06, 0.49, 21.76, 12.16, 19.37, 5.3)), row.names = c(NA,
20L), class = "data.frame")
testes dataframe:
# Show only the first 20 rows, first column, and 6th column of testes dataframe
dput(testes[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(2.33, 47.56,
9.45, 2.03, 3.09, 0.11, 3.73, 28.52, 120.65, 6.89, 1.38, 30.89,
0, 20.39, 13.66, 0.59, 9.62, 22.04, 7.42, 2.53)), row.names = c(NA,
20L), class = "data.frame")
Based on Akrun's comment, I've attempted:
library(gtools)
tpm.foldchange <- foldchange(thyroid.tpm, testes.tpm)
testes.sorted <- testes %>%
left_join(thyroid, by="gene_id") %>%
mutate(TPM=testes.tpm, tpm.foldchange, .keep="unused") %>%
slice_max(n=10, order_by=tpm.foldchange)
Output:
> dim(testes.sorted)
[1] 304 15
> dput(testes.sorted[1:10,])
structure(list(gene_id = c("gene10075_LOC101927056", "gene10311_A4GNT",
"gene10394_SLC9A9-AS1", "gene10504_SUCNR1", "gene10511_TMEM14E",
"gene10798_LOC102724550", "gene10990_FLJ42393", "gene11054_DPPA2P3",
"gene11065_GP5", "gene11400_USP17L12"), transcript_id.s..x = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.x = c(659, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.x = c(413.57, 1525.5, 272.62, 1404.5,
1047.5, 2711.5, 2020.5, 900.5, 3247.5, 1347.5), expected_count.x = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0.12), TPM.x = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), FPKM.x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), transcript_id.s..y = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.y = c(796, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.y = c(535.05, 1510.04, 257.15,
1389.04, 1032.04, 2696.04, 2005.04, 885.04, 3232.04, 1332.04),
expected_count.y = c(9, 3, 2, 233, 2, 2, 36, 2, 35, 1.91),
TPM.y = c(0.58, 0.07, 0.27, 5.8, 0.07, 0.03, 0.62, 0.08,
0.37, 0.05), FPKM.y = c(0.29, 0.03, 0.14, 2.94, 0.03, 0.01,
0.31, 0.04, 0.19, 0.03), TPM = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), tpm.foldchange = c(Inf, Inf, Inf, Inf, Inf, Inf, Inf,
Inf, Inf, Inf)), row.names = c(NA, 10L), class = "data.frame")
This code returns a dataframe with (304, 15) dimensions. But I'm only looking for the top ten genes. Also, please note that thyroid.tpm is the log2-transformed TPM values.
If we want to order by the foldchange, do a join first, and arrange based on the foldchange between the 'TPM' columns
library(dplyr)
library(gtools)
testes2 <- testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(across(starts_with("TPM"), ~ log(.x + 1),
.names = "tpm_{.col}")) %>%
mutate(foldchange = foldchange(tpm_TPM.x, tpm_TPM.y)) %>%
filter(is.finite(foldchange)) %>%
arrange(tpm_TPM.x) %>%
dplyr::select(gene_id, TPM = TPM.x, foldchange) %>%
slice_head(n = 10)
If we want to select top 10 foldchange rows, use slice_max
testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(TPM = TPM.x, foldchange = foldchange(log(TPM.x + 1), log(TPM.y + 1)),
.keep = "unused") %>%
filter(is.finite(foldchange)) %>%
slice_max(n = 10, order_by = foldchange, with_ties = FALSE)
-output
gene_id TPM foldchange
1 gene100_C1orf233 9.45 1.786222
2 gene10000_CD86 3.09 1.434249
3 gene10007_LOC102723582 1.38 1.288517
4 gene10016_HSPBAP1 22.04 1.217311
5 gene10012_PARP15 0.59 1.162893
6 gene10005_FAM162A 120.65 1.089205
7 gene10010_PARP9 20.39 1.011953
8 gene1_WASH7P 47.56 1.008704
9 gene10011_DTX3L 13.66 -1.033968
10 gene10003_CSTA 3.73 -1.076854
The merge results in memory error because it was done on two vectors creating a cartesian join

Getting the last number in a series before lowering it below threshold in R

I have the following data:
dat<- structure(list(Pentad = 1:73, RR = c(0, 0.014285714, 0, 0.088571429,
0.071428571, 0, 0.065714286, 0.028571429, 0.094285714, 0.011428571,
0, 0, 0, 0, 0, 0, 0.04, 0, 0.814285714, 0.285714286, 1.14, 5.334285714,
2.351428571, 1.985714286, 1.494285714, 2.005714286, 20.04857143,
25.00857143, 16.32, 11.06857143, 8.965714286, 3.985714286, 5.202857143,
7.802857143, 4.451428571, 9.22, 32.04857143, 19.50571429, 3.148571429,
2.434285714, 9.057142857, 28.70857143, 34.15142857, 33.02571429,
46.50571429, 70.61714286, 3.168571429, 1.928571429, 7.031428571,
0.902857143, 5.377142857, 11.35714286, 15.04571429, 11.66285714,
21.24, 11.43714286, 11.69428571, 2.977142857, 4.337142857, 0.871428571,
1.391428571, 0.871428571, 1.145714286, 2.317142857, 0.182857143,
0.282857143, 0.348571429, 0, 0.345714286, 0.142857143, 0.18,
4.894285714, 0.037142857), YY = c(0.577142857, 0, 1.282857143,
1.445714286, 0.111428571, 0.36, 0, 0, 0, 1, 0.011428571, 0.008571429,
0.305714286, 0, 0, 0, 0, 0.8, 0.062857143, 0, 0, 0, 0, 0.013333333,
0.043333333, 1.486666667, 0, 2.486666667, 1.943333333, 0.773333333,
8.106666667, 7.733333333, 0.5, 4.356666667, 2.66, 6.626666667,
4.404285714, 7.977142857, 12.94285714, 18.49428571, 7.357142857,
11.08285714, 9.034285714, 14.29142857, 34.61428571, 45.30285714,
6.66, 6.702857143, 5.962857143, 14.85428571, 2.1, 2.837142857,
7.391428571, 32.03714286, 9.005714286, 3.525714286, 12.32, 2.32,
7.994285714, 6.565714286, 4.771428571, 2.354285714, 0.005714286,
2.508571429, 0.817142857, 2.885714286, 0.897142857, 0, 0, 0,
0, 0.145714286, 0.434285714)), class = "data.frame", row.names = c(NA,
-73L))
There are three columns: Pentad, RR, and YY.
I would like to get the following:
(a) Get the first pentad when the precipitation exceeds the "annual mean" in "at least three consecutive pentads"
(b) Get the last pentad when the precipitation exceeds the "annual mean" in at least three consecutive pentads BEFORE lowering it below the annual mean.
I was able to do (a) using the following script:
first_exceed_seq <- function(x, thresh = mean(x), len = 3)
{
# Logical vector, does x exceed the threshold
exceed_thresh <- x > thresh
# Indices of transition points; where exceed_thresh[i - 1] != exceed_thresh[i]
transition <- which(diff(c(0, exceed_thresh)) != 0)
# Reference index, grouping observations after each transition
index <- vector("numeric", length(x))
index[transition] <- 1
index <- cumsum(index)
# Break x into groups following the transitions
exceed_list <- split(exceed_thresh, index)
# Get the number of values exceeded in each index period
num_exceed <- vapply(exceed_list, sum, numeric(1))
# Get the starting index of the first sequence where more then len exceed thresh
transition[as.numeric(names(which(num_exceed >= len))[1])]
}
first_exceed_seq(dat$RR)
Here's the plot of the time series:
The correct answer in (a) is 27.
I would like to ask how can I do this for (b). The correct answer for (b) should be 57.
I'll appreciate any help on in this in R.
I don't know if I got your problem right.
This is what I tried:
dat %>%
mutate(
anual_mean = mean(RR),
exceed_thresh = RR > anual_mean,
lag1 = lag(exceed_thresh, 1),
lag2 = lag(exceed_thresh, 2),
pick_3 = ifelse(exceed_thresh & lag1 & lag2, RR, NA)
)

Count number of events in the previous time period

I trying to create a variable (the made up one "events60" in the data below, that keeps a "running" count of the number of events in the past (in this example it's 60 minutes, but it could be any arbitrary value). So, it keeps a tally "how many events occurred in the previous hour".
I'm making slow headway with cumsum, rle, diff etc. and whatnot but I'm certain there is a more elegant and quicker solution. It will be applied to a dataset of a minimum 30 million rows so a loop is probably not very efficient.
Example data below in R format
structure(list(Performed_DT_TM = structure(c(1508310211, 1508312843,
1508322697, 1508331061, 1508331161, 1508331452, 1508332222, 1508332900,
1508333781, 1508334349, 1508337531, 1508341065, 1508343542, 1508346756,
1508363905, 1508371639, 1508388245, 1508402001, 1508413612, 1508430173,
1508445426, 1508453675), class = c("POSIXct", "POSIXt"), tzone = ""),
time_since_prev_obs = c(0, 43.8666666666667, 164.233333333333,
139.4, 1.66666666666667, 4.85, 12.8333333333333, 11.3, 14.6833333333333,
9.46666666666667, 53.0333333333333, 58.9, 41.2833333333333,
53.5666666666667, 285.816666666667, 128.9, 276.766666666667,
229.266666666667, 193.516666666667, 276.016666666667, 254.216666666667,
137.483333333333), events60 = c(0, 1, 0, 0, 1, 2, 3, 4, 5,
6, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-22L), class = "data.frame")
Any help greatly appreciated of course
Cheers
Norm
in Base R you could do:
m <- outer(df$Performed_DT_TM,df$Performed_DT_TM,"-")
c(0,rowsum(as.numeric(m[lower.tri(m)]<3600),row(m)[lower.tri(m)]))
[1] 0 1 0 0 1 2 3 4 5 6 1 1 1 1 0 0 0 0 0 0 0 0
Since your dataset is huge, you can try a rolling join and then an non-equi join from data.table for speed:
setDT(DT)[, Performed_DT_TM := as.POSIXct(Performed_DT_TM, format="%Y-%-%d %T")]
DT[, c("rn", "endtime") := .(.I, Performed_DT_TM - 60L*60L)]
DT[, Last60mins :=
DT[DT, on=.(Performed_DT_TM=endtime), roll=Inf, i.rn - x.rn - 1L]
]
DT[is.na(Last60mins), Last60mins := fcoalesce(Last60mins,
DT[.SD, on=.(Performed_DT_TM>=endtime, Performed_DT_TM<Performed_DT_TM), .N, by=.EACHI]$N)
]
DT
data:
library(data.table)
DT <- structure(list(Performed_DT_TM = structure(c(1508310211, 1508312843,
1508322697, 1508331061, 1508331161, 1508331452, 1508332222, 1508332900,
1508333781, 1508334349, 1508337531, 1508341065, 1508343542, 1508346756,
1508363905, 1508371639, 1508388245, 1508402001, 1508413612, 1508430173,
1508445426, 1508453675), class = c("POSIXct", "POSIXt"), tzone = ""),
time_since_prev_obs = c(0, 43.8666666666667, 164.233333333333,
139.4, 1.66666666666667, 4.85, 12.8333333333333, 11.3, 14.6833333333333,
9.46666666666667, 53.0333333333333, 58.9, 41.2833333333333,
53.5666666666667, 285.816666666667, 128.9, 276.766666666667,
229.266666666667, 193.516666666667, 276.016666666667, 254.216666666667,
137.483333333333), events60 = c(0, 1, 0, 0, 1, 2, 3, 4, 5,
6, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-22L), class = "data.frame")

Create faceted xy scatters using vectors of column names in R

I have two character vectors of equal length; where position one in vector.x matches position one in vector.y and so on. The elements refer to column names in a data frame (wide format). I would like to somehow loop through these vectors to produce xy scatter graphs for each pair in the vector, preferably in a faceted plot. Here is a (hopefully) reproducible example. To be clear, with this example, I would end up with 10 scatter graphs.
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
structure(list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176,
0.0108639420589757), Marinobacter = c(0, 0.00219023779724656,
0, 0.00137867647058824, 0.00310398344542162), Neptuniibacter = c(0.00945829750644884,
0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393
), Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856
), Pseudomonas = c(0.00466773123694878, 0.00782227784730914,
0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856
), Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529,
0.147697878944646), unclassified_GpIIa = c(0, 0.000730079265748853,
0, 0.003125, 0.00103466114847387), unclassified_Porticoccus = c(0,
0, 0, 0.00119485294117647, 0.00569063631660631), Aplanochytrium = c(0,
0, 0, 0.000700770847932726, 0.0315839846865529), Bathycoccus = c(0.000388802488335925,
0, 0, 0.0227750525578136, 0.00526399744775881), Brockmanniella = c(0,
0.00383141762452107, 0, 0.000875963559915907, 0), Caecitellus_paraparvulus = c(0,
0, 0, 0.000875963559915907, 0.00797575370872547)), row.names = c("B11",
"B13", "B22", "DI5", "FF6"), class = "data.frame")
As Rui Barradas shows, it's possible to get a very nice plot from ggplot and gridExta. If you wanted to stick to base R, here's how you'd do that (assuming your data set is called df1):
# set plot sizes
par(mfcol = c(floor(sqrt(length(vector.x))), ceiling(sqrt(length(vector.x)))))
# loop through plots
for (i in 1:length(vector.x)) {
plot(df1[[vector.x[i]]], df1[[vector.y[i]]], xlab = vector.x[i], ylab = vector.y[i])
}
# reset plot size
par(mfcol = c(1,1))
This is a bit long and convoluted but it works.
library(tidyverse)
library(gridExtra)
df_list <- apply(data.frame(vector.x, vector.y), 1, function(x){
DF <- df1[which(names(df1) %in% x)]
i <- which(names(DF) %in% vector.x)
if(i == 2) DF[2:1] else DF
})
gg_list <- lapply(df_list, function(DF){
ggplot(DF, aes(x = get(names(DF)[1]), y = get(names(DF)[2]))) +
geom_point() +
xlab(label = names(DF)[1]) +
ylab(label = names(DF)[2])
})
g <- do.call(grid.arrange, gg_list)
g
Not too elegant, but should get you going:
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
df1 = structure(
list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176, 0.0108639420589757),
Marinobacter = c(0, 0.00219023779724656, 0, 0.00137867647058824, 0.00310398344542162),
Neptuniibacter = c(0.00945829750644884, 0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393),
Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856),
Pseudomonas = c(0.00466773123694878, 0.00782227784730914, 0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856),
Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529, 0.147697878944646),
unclassified_GpIIa = c(0, 0.000730079265748853, 0, 0.003125, 0.00103466114847387),
unclassified_Porticoccus = c(0, 0, 0, 0.00119485294117647, 0.00569063631660631),
Aplanochytrium = c(0, 0, 0, 0.000700770847932726, 0.0315839846865529),
Bathycoccus = c(0.000388802488335925, 0, 0, 0.0227750525578136, 0.00526399744775881),
Brockmanniella = c(0, 0.00383141762452107, 0, 0.000875963559915907, 0),
Caecitellus_paraparvulus = c(0, 0, 0, 0.000875963559915907, 0.00797575370872547)),
row.names = c("B11", "B13", "B22", "DI5", "FF6"),
class = "data.frame"
)
df2 = NULL
for(i in 1:10) {
df.tmp = data.frame(
plot = paste0(vector.x[i], ":", vector.y[i]),
x = df1[[vector.x[i]]],
y = df1[[vector.y[i]]]
)
if(is.null(df2)) df2=df.tmp else df2 = rbind(df2, df.tmp)
}
ggplot(data=df2, aes(x, y)) +
geom_point() +
facet_grid(cols = vars(plot))

R dataframe aggregate conditionally based on column value, per quarter

I am running this command:
aggregated_quarterly_realised <- aggregate(merged_dataset$dependent_variable, list(merged_dataset$qy), mean)
which gives me the total amount per quarter. But I would like to get separately the sums in case the merged_dataset$dependent_variable is equal to 0, 1, and the total. Thus I would like to get three values per quarter. How can I do that?
EDIT:
> dput(head(merged_dataset$dependent_variable,10))
c(0, 0, 0, 0, 1, 0, 0, 0, 1, 0)
> dput(head(merged_dataset$qy,10))
structure(c(2008.25, 2008.25, 2008.50, 2008.75, 2009.25, 2009.50,
2008.25, 2008.25, 2008.25, 2008.25), class = "yearqtr")
> dput(head(merged_dataset$test,10))
c(7101273.07, 6855586.59, 800585.78, 8029604.44, 6707122.59,
646079.46, 14598.96, 1303978, 15244705, 322058.74)
What I want is the aggregated values per quarter (quarters are in the merged_dataset$qy variable) for the test variable (merged_dataset$test) separately for the values 0 of the dependent variable, the value 1, and the total.
Using data.table:
Code
dtf = dt[, .(Dep1sum = sum(test[depvar == 1]),
Dep0sum = sum(test[depvar == 0]),
Sum = sum(test)), .(qy)]
Result
> dtf
qy Dep1sum Dep0sum Sum
1: 2008.25 15244705 15597495.4 30842200.4
2: 2008.50 0 800585.8 800585.8
3: 2008.75 0 8029604.4 8029604.4
4: 2009.25 6707123 0.0 6707122.6
5: 2009.50 0 646079.5 646079.5
Data
dt = data.table(
depvar = c(0, 0, 0, 0, 1, 0, 0, 0, 1, 0),
qy = c(2008.25, 2008.25, 2008.50, 2008.75, 2009.25, 2009.50, 2008.25, 2008.25, 2008.25, 2008.25),
test = c(7101273.07, 6855586.59, 800585.78, 8029604.44, 6707122.59, 646079.46, 14598.96, 1303978, 15244705, 322058.74)
)

Resources