Add rows to a data-frame based on values in one of the columns - r

Currently the data-frame looks something like this:
Scenario Month A B C
1 1 -0.593186301 1.045550808 -0.593816304
1 2 0.178626141 2.043084432 0.111370583
1 3 1.205779717 -0.324083723 -1.397716949
2 1 0.933615199 0.052647056 -0.656486153
2 2 1.647291688 -1.065793671 0.799040546
2 3 1.613663101 -1.955567231 -1.817457972
3 1 -0.621991775 1.634069402 -1.404981646
3 2 -1.899326887 -0.836322394 -1.826351541
3 3 0.164235141 -1.160701812 1.238246459
I'd like to add rows on top of the row where Month = 1 as below. I know dplyr has an add_rows function but I'd like to add rows based on a condition. Any help is hugely appreciated.
Scenario Month A B C
0
1 1 -0.593186301 1.045550808 -0.593816304
1 2 0.178626141 2.043084432 0.111370583
1 3 1.205779717 -0.324083723 -1.397716949
0
2 1 0.933615199 0.052647056 -0.656486153
2 2 1.647291688 -1.065793671 0.799040546
2 3 1.613663101 -1.955567231 -1.817457972
0
3 1 -0.621991775 1.634069402 -1.404981646
3 2 -1.899326887 -0.836322394 -1.826351541
3 3 0.164235141 -1.160701812 1.238246459

A solution using tidyverse.
library(tidyverse)
dat2 <- dat %>%
split(f = .$Scenario) %>%
map_dfr(~bind_rows(tibble(Scenario = 0), .x))
dat2
# # A tibble: 12 x 5
# Scenario Month A B C
# <dbl> <int> <dbl> <dbl> <dbl>
# 1 0 NA NA NA NA
# 2 1 1 -0.593 1.05 -0.594
# 3 1 2 0.179 2.04 0.111
# 4 1 3 1.21 -0.324 -1.40
# 5 0 NA NA NA NA
# 6 2 1 0.934 0.0526 -0.656
# 7 2 2 1.65 -1.07 0.799
# 8 2 3 1.61 -1.96 -1.82
# 9 0 NA NA NA NA
# 10 3 1 -0.622 1.63 -1.40
# 11 3 2 -1.90 -0.836 -1.83
# 12 3 3 0.164 -1.16 1.24
DATA
dat <- read.table(text = "Scenario Month A B C
1 1 -0.593186301 1.045550808 -0.593816304
1 2 0.178626141 2.043084432 0.111370583
1 3 1.205779717 -0.324083723 -1.397716949
2 1 0.933615199 0.052647056 -0.656486153
2 2 1.647291688 -1.065793671 0.799040546
2 3 1.613663101 -1.955567231 -1.817457972
3 1 -0.621991775 1.634069402 -1.404981646
3 2 -1.899326887 -0.836322394 -1.826351541
3 3 0.164235141 -1.160701812 1.238246459 ",
header = TRUE)

Somehow add_row doesn't take multiple values to its .before parameter.
One way is to split the dataframe wherever Month = 1 and then for each dataframe add a row using add_row above Month = 1.
library(tidyverse)
map_df(split(df, cumsum(df$Month == 1)),
~ add_row(., Scenario = 0, .before = which(.$Month == 1)))
# Scenario Month A B C
#1 0 NA NA NA NA
#2 1 1 -0.5931863 1.04555081 -0.5938163
#3 1 2 0.1786261 2.04308443 0.1113706
#4 1 3 1.2057797 -0.32408372 -1.3977169
#5 0 NA NA NA NA
#6 2 1 0.9336152 0.05264706 -0.6564862
#7 2 2 1.6472917 -1.06579367 0.7990405
#8 2 3 1.6136631 -1.95556723 -1.8174580
#9 0 NA NA NA NA
#10 3 1 -0.6219918 1.63406940 -1.4049816
#11 3 2 -1.8993269 -0.83632239 -1.8263515
#12 3 3 0.1642351 -1.16070181 1.2382465

Here is one option with data.table
library(data.table)
setDT(df1)[, .SD[c(.N+1, seq_len(.N))], Scenario][
!duplicated(Scenario), Scenario := 0][]
# Scenario Month A B C
# 1: 0 NA NA NA NA
# 2: 1 1 -0.5931863 1.04555081 -0.5938163
# 3: 1 2 0.1786261 2.04308443 0.1113706
# 4: 1 3 1.2057797 -0.32408372 -1.3977169
# 5: 0 NA NA NA NA
# 6: 2 1 0.9336152 0.05264706 -0.6564862
# 7: 2 2 1.6472917 -1.06579367 0.7990405
# 8: 2 3 1.6136631 -1.95556723 -1.8174580
# 9: 0 NA NA NA NA
#10: 3 1 -0.6219918 1.63406940 -1.4049816
#11: 3 2 -1.8993269 -0.83632239 -1.8263515
#12: 3 3 0.1642351 -1.16070181 1.2382465
Or as #chinsoon12 mentioned in the comments
setDT(df1)[, rbindlist(.(.(Scenario=0L), c(.(Scenario=rep(Scenario, .N)),
.SD)), use.names=TRUE, fill=TRUE), by=.(Scenario)][, -1L]
data
df1 <- structure(list(Scenario = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L
), Month = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), A = c(-0.593186301,
0.178626141, 1.205779717, 0.933615199, 1.647291688, 1.613663101,
-0.621991775, -1.899326887, 0.164235141), B = c(1.045550808,
2.043084432, -0.324083723, 0.052647056, -1.065793671, -1.955567231,
1.634069402, -0.836322394, -1.160701812), C = c(-0.593816304,
0.111370583, -1.397716949, -0.656486153, 0.799040546, -1.817457972,
-1.404981646, -1.826351541, 1.238246459)), class = "data.frame",
row.names = c(NA,
-9L))

Here's a simple way (without loops) using base R -
df1 <- df[rep(1:nrow(df), (df$Month == 1)+1), ]
df1[duplicated(df1, fromLast = T), ] <- NA
df1$Scenario[is.na(df1$Scenario)] <- 0
df1
Scenario Month A B C
1 0 NA NA NA NA
1.1 1 1 -0.5931863 1.04555081 -0.5938163
2 1 2 0.1786261 2.04308443 0.1113706
3 1 3 1.2057797 -0.32408372 -1.3977169
4 0 NA NA NA NA
4.1 2 1 0.9336152 0.05264706 -0.6564862
5 2 2 1.6472917 -1.06579367 0.7990405
6 2 3 1.6136631 -1.95556723 -1.8174580
7 0 NA NA NA NA
7.1 3 1 -0.6219918 1.63406940 -1.4049816
8 3 2 -1.8993269 -0.83632239 -1.8263515
9 3 3 0.1642351 -1.16070181 1.2382465
Data -
df <- structure(list(Scenario = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L
), Month = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), A = c(-0.593186301,
0.178626141, 1.205779717, 0.933615199, 1.647291688, 1.613663101,
-0.621991775, -1.899326887, 0.164235141), B = c(1.045550808,
2.043084432, -0.324083723, 0.052647056, -1.065793671, -1.955567231,
1.634069402, -0.836322394, -1.160701812), C = c(-0.593816304,
0.111370583, -1.397716949, -0.656486153, 0.799040546, -1.817457972,
-1.404981646, -1.826351541, 1.238246459)), class = "data.frame", row.names = c(NA,
-9L))

Related

Moving average with grouped data

I would like to calculate the Moving Average of my dataset that is composed by a column representing an index of grouped patients and a second column representing some measurements of a blood circulating molecule. Patients are grouped according to consecutive measurements of the molecule of interest.
Moreover I would like to plot the output in which the measurement per group is plotted against the patients group number.
I tried to code this analysis but I'm not sure I'm doing well.
SURG_DATE VES_2A Index
21/05/2013 1 1
10/06/2013 1 1
06/01/2014 1 1
29/01/2014 0 1
11/03/2014 3 2
05/04/2014 1 2
06/04/2014 1 2
14/05/2014 1 2
28/05/2014 3 3
02/09/2014 2 3
16/09/2014 2 3
17/09/2014 0 3
21/10/2014 2 5
05/12/2014 0 5
19/12/2014 2 5
11/01/2015 1 5
15/01/2015 1 6
17/01/2015 2 6
24/01/2015 1 6
19/02/2015 1 6
The code I tried:
tapply(test$VES_2A,
test$Index,
function(x) rollmean(x, 12, na.pad=TRUE))
It's a little ambiguous, but I think you want this:
test <- cbind(time=rownames(test), test) # first add a time variable
# then create a list with rolling mean for each id and time
ls1 <- lapply(seq_along(test$time),
function(x) cbind(time=x, # time variable
with(test[test$time %in% 1:x, ],
aggregate(list(VES_2A=VES_2A),
list(Index=Index), mean)) # rolling mean
))
tot <- transform(t(sapply(ls1, colMeans)), Index="total") # occasionally add a total column
long <- rbind(do.call(rbind, ls1), tot) # bind all rows together into long format data frame
wide <- reshape2::dcast(long, time ~ Index) # reshape to wide w/ e.g. reshape2::dcast()
rm(ls1, tot) # clean up
Yielding
> wide
time 1 2 3 5 6 total
1 1 1.00 NA NA NA NA 1.000000
2 2 1.00 NA NA NA NA 1.000000
3 3 1.00 NA NA NA NA 1.000000
4 4 0.75 NA NA NA NA 0.750000
5 5 0.75 3.000000 NA NA NA 1.875000
6 6 0.75 2.000000 NA NA NA 1.375000
7 7 0.75 1.666667 NA NA NA 1.208333
8 8 0.75 1.500000 NA NA NA 1.125000
9 9 0.75 1.500000 3.000000 NA NA 1.750000
10 10 0.75 1.500000 2.500000 NA NA 1.583333
11 11 0.75 1.500000 2.333333 NA NA 1.527778
12 12 0.75 1.500000 1.750000 NA NA 1.333333
13 13 0.75 1.500000 1.750000 2.000000 NA 1.500000
14 14 0.75 1.500000 1.750000 1.000000 NA 1.250000
15 15 0.75 1.500000 1.750000 1.333333 NA 1.333333
16 16 0.75 1.500000 1.750000 1.250000 NA 1.312500
17 17 0.75 1.500000 1.750000 1.250000 1.000000 1.250000
18 18 0.75 1.500000 1.750000 1.250000 1.500000 1.350000
19 19 0.75 1.500000 1.750000 1.250000 1.333333 1.316667
20 20 0.75 1.500000 1.750000 1.250000 1.250000 1.300000
Plot
library(ggplot2)
ggplot(long, aes(time, VES_2A, color=Index)) +
geom_line()
Tell me what you think, hope that's what you've wanted.
Data
test <- structure(list(VES_2A = c(1L, 1L, 1L, 0L, 3L, 1L, 1L, 1L, 3L,
2L, 2L, 0L, 2L, 0L, 2L, 1L, 1L, 2L, 1L, 1L), Index = c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L,
6L, 6L)), class = "data.frame", row.names = c(NA, -20L))
Using the data shown reproducibly in the Note at the end this takes the rolling mean of the current and prior two observations separately for each Index value and adds a sequence number. Since every value of Index takes up 4 rows we use 1:4.
It's not clear from the question what is to be plotted but we plot the rolling mean vs. seq for each Index on a single panel. For classic grpahics, replace screen = 1 with screen = colnames(wide) if you want separate panels. For ggplot2 to get separate panels omit facet=NULL.
library(zoo)
roll <- function(x) rollmeanr(x, 3, fill = NA)
df3 <- transform(df, mean3 = ave(VES_2A, Index, FUN = roll), seq = 1:4)
wide <- na.omit(read.zoo(df3[-1], index = "seq", split = "Index"))
# classic graphics
plot(wide, screen = 1, type = "o", pch = colnames(wide))
# ggplot2 gtraphics
library(ggplot2)
autoplot(wide[-3], facet = NULL)
Note
Lines <- " VES_2A Index
1 1
1 1
1 1
0 1
3 2
1 2
1 2
1 2
3 3
2 3
2 3
0 3
2 5
0 5
2 5
1 5
1 6
2 6
1 6
1 6"
df <- read.table(text = Lines, header = TRUE)

Set value of a column to NA based on conditions in R

I have a data frame, a reproducible example is as follows:
structure(list(subscriberid = c(1177460837L, 1177460837L, 1177460837L,
1146526049L, 1146526049L, 1146526049L), variable = c("3134",
"4550", "4550", "5160", "2530", "2530"), value = c(1, 2, 2, 1,
2, 2), gender = c(2, 2, 2, 1, 2, 2), cwe = c(NA, 50L, 50L, NA,
30L, 30L), hw = c(NA, 48L, 48L, NA, 26L, 26L), resp = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), cna = c(3L, 1L, 1L, 3L, 1L, 1L)), .Names = c("subscriberid",
"variable", "value", "gender", "cwe", "hw", "resp", "cna"), row.names = c(4L,
5L, 6L, 9L, 10L, 11L), class = "data.frame")
The actual data frame looks like this:
subscriberid variable value gender cwe hw resp cna
4 1177460837 3134 1 2 NA NA NA 3
5 1177460837 4550 2 2 50 48 NA 1
6 1177460837 4550 2 2 50 48 NA 1
9 1146526049 5160 1 1 NA NA NA 3
10 1146526049 2530 2 2 30 26 NA 1
11 1146526049 2530 2 2 30 26 NA 1
In the above df, row 5 and 6 are exactly the same. From row 5, I want to remove 48 and row 6 I want to remove 50. Essentially, I want to retain only one age in a row and set the other to NA. I tried using a for loop but that sets column values in the column that I refer in both the rows to NA.
for (i in 1:nrow(test)) {
test$hw[i] <- ifelse(!is.na(test$cwe[i]) & !is.na(test$hw[i]), NA, test$hw[i])
}
I am trying to set an if condition to identify if both the rows are same, then I want to iteratively remove one of the values from the first row and remove the other from the second.
The desired output is as follows:
subscriberid variable value gender cwe hw resp cna
4 1177460837 3134 1 2 NA NA NA 3
5 1177460837 4550 2 2 50 NA NA 1
6 1177460837 4550 2 2 NA 48 NA 1
9 1146526049 5160 1 1 NA NA NA 3
10 1146526049 2530 2 2 30 NA NA 1
11 1146526049 2530 2 2 NA 26 NA 1
You can use a combination of which() and duplicated() to receive duplicated rows.
Because you need to change values twice of the rows, you have to create a copy of the dataframe. Note that this will only work if the identical rows are always consecutive.
dfNA <- df
dfNA$hw[which(duplicated(df))-1] <- NA
dfNA$cwe[which(duplicated(df))] <- NA
dfNA
# subscriberid variable value gender cwe hw resp cna
#4 1177460837 3134 1 2 NA NA NA 3
#5 1177460837 4550 2 2 50 NA NA 1
#6 1177460837 4550 2 2 NA 48 NA 1
#9 1146526049 5160 1 1 NA NA NA 3
#10 1146526049 2530 2 2 30 NA NA 1
#11 1146526049 2530 2 2 NA 26 NA 1
A possible solution :
# create a logical vector indicating if current row is identical to previous one
# N.B.: do.call("paste",c(DF,sep="\r")) is used internally by "duplicated.data.frame" function
rowStrings <- do.call("paste", c(DF, sep = "\r"))
currRowIsEqualToPrev <- rowStrings[-1] == rowStrings[-length(rowStrings)]
# set first row hw = NA and second identical row cwe = NA
DF[c(FALSE,currRowIsEqualToPrev),'hw'] <- NA
DF[c(currRowIsEqualToPrev,FALSE),'cwe'] <- NA
> DF
subscriberid variable value gender cwe hw resp cna
4 1177460837 3134 1 2 NA NA NA 3
5 1177460837 4550 2 2 NA 48 NA 1
6 1177460837 4550 2 2 50 NA NA 1
9 1146526049 5160 1 1 NA NA NA 3
10 1146526049 2530 2 2 NA 26 NA 1
11 1146526049 2530 2 2 30 NA NA 1
Using lead and lag from dplyr package:
library(dplyr)
df1 %>%
group_by(subscriberid, variable) %>%
mutate(cwe = if_else(lead(cwe) == cwe, cwe, NA_integer_),
hw = if_else(lag(hw) == hw, hw, NA_integer_)) %>%
ungroup()
# # A tibble: 6 x 8
# subscriberid variable value gender cwe hw resp cna
# <int> <int> <int> <int> <int> <int> <lgl> <int>
# 1 1177460837 3134 1 2 NA NA NA 3
# 2 1177460837 4550 2 2 50 NA NA 1
# 3 1177460837 4550 2 2 NA 48 NA 1
# 4 1146526049 5160 1 1 NA NA NA 3
# 5 1146526049 2530 2 2 30 NA NA 1
# 6 1146526049 2530 2 2 NA 26 NA 1
I took a shot at it. This relies on using group_by from dplyr to find duplicate rows. This method assumes that rows can be reliably be identified as identical by using the subscriberid, variable, value, gender, resp, and cna columns alone.
Because it is operating within groups only, it will work even if a preceding non-identical row contains the same value for cwe (I did check this, but I would also confirm it for yourself if I were you).
library(dplyr)
ndf <- df %>%
group_by(subscriberid, variable, value, gender, resp, cna) %>%
mutate(cwe = na_if(cwe, lag(cwe)),
hw = na_if(hw, lead(hw))) %>%
ungroup()
Output:
# A tibble: 6 x 8
subscriberid variable value gender cwe hw resp cna
<int> <chr> <dbl> <dbl> <int> <int> <int> <int>
1 1177460837 3134 1. 2. NA NA NA 3
2 1177460837 4550 2. 2. 50 NA NA 1
3 1177460837 4550 2. 2. NA 48 NA 1
4 1146526049 5160 1. 1. NA NA NA 3
5 1146526049 2530 2. 2. 30 NA NA 1
6 1146526049 2530 2. 2. NA 26 NA 1

lapply alternative to for loop to append to data frame

I have a data frame:
df<-structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
pos = c(10L, 200L, 134L, 400L, 600L, 1000L, 20L, 33L, 40L,
45L, 50L, 55L, 100L, 123L)), .Names = c("chrom", "pos"), row.names = c(NA, -14L), class = "data.frame")
> head(df)
chrom pos
1 1 10
2 1 200
3 1 134
4 1 400
5 1 600
6 1 1000
And I want to calculate pos[i+1] - pos[i] on the sample chromosome (chrom)
By using a for loop over each chrom level, and another over each row I get the expected results:
for (c in levels(df$chrom)){
df_chrom<-filter(df, chrom == c)
df_chrom<-arrange(df_chrom, df_chrom$pos)
for (i in 1:nrow(df_chrom)){
dist<-(df_chrom$pos[i+1] - df_chrom$pos[i])
logdist<-log10(dist)
cat(c, i, df_chrom$pos[i], dist, logdist, "\n")
}
}
However, I want to save this to a data frame, and think that lapply or apply is the right way to go about this. I can't work out how to make the pos[i+1] - pos[i] calculation though (seeing as lapply works on each row/column.
Any pointers would be appreciated
Here's the output from my solution:
chrom index pos dist log10dist
1 1 10 124 2.093422
1 2 134 66 1.819544
1 3 200 200 2.30103
1 4 400 200 2.30103
1 5 600 400 2.60206
1 6 1000 NA NA
2 1 20 13 1.113943
2 2 33 NA NA
3 1 40 5 0.69897
3 2 45 NA NA
4 1 50 5 0.69897
4 2 55 45 1.653213
4 3 100 23 1.361728
4 4 123 NA NA
We could do this using a group by difference. Convert the 'data.frame' to 'data.table' (setDT(df)), grouped by 'chrom', order the 'pos', get the difference of 'pos' (diff) and also log of the difference
library(data.table)
setDT(df)[order(pos), {v1 <- diff(pos)
.(index = seq_len(.N), pos = pos,
dist = c(v1, NA), logdiff = c(log10(v1), NA))}
, by = chrom]
# chrom index pos dist logdiff
# 1: 1 1 10 124 2.093422
# 2: 1 2 134 66 1.819544
# 3: 1 3 200 200 2.301030
# 4: 1 4 400 200 2.301030
# 5: 1 5 600 400 2.602060
# 6: 1 6 1000 NA NA
# 7: 2 1 20 13 1.113943
# 8: 2 2 33 NA NA
# 9: 3 1 40 5 0.698970
#10: 3 2 45 NA NA
#11: 4 1 50 5 0.698970
#12: 4 2 55 45 1.653213
#13: 4 3 100 23 1.361728
#14: 4 4 123 NA NA
Upon running the OP's code the output printed are
#1 1 10 124 2.093422
#1 2 134 66 1.819544
#1 3 200 200 2.30103
#1 4 400 200 2.30103
#1 5 600 400 2.60206
#1 6 1000 NA NA
#2 1 20 13 1.113943
#2 2 33 NA NA
#3 1 40 5 0.69897
#3 2 45 NA NA
#4 1 50 5 0.69897
#4 2 55 45 1.653213
#4 3 100 23 1.361728
#4 4 123 NA NA
We split df by df$chrom (Note that we reorder both df and df$chrom before splitting). Then we go through each of the subgroups (the subgroups are called a in this example) using lapply. On the pos column of each subgroup, we calculate difference (diff) of consecutive elements and take log10. Since diff decreases the number of elements by 1, we add a NA to the end. Finally, we rbind all the subgroups together using do.call.
do.call(rbind, lapply(split(df[order(df$chrom, df$pos),], df$chrom[order(df$chrom, df$pos)]),
function(a) data.frame(a, dist = c(log10(diff(a$pos)), NA))))
# chrom pos dist
#1.1 1 10 2.093422
#1.3 1 134 1.819544
#1.2 1 200 2.301030
#1.4 1 400 2.301030
#1.5 1 600 2.602060
#1.6 1 1000 NA
#2.7 2 20 1.113943
#2.8 2 33 NA
#3.9 3 40 0.698970
#3.10 3 45 NA
#4.11 4 50 0.698970
#4.12 4 55 1.653213
#4.13 4 100 1.361728
#4.14 4 123 NA

How can I use merge so that I have data for all times?

I'm trying to change a data into which all entities have value for all possible times(months). Here's what I'm trying;
Class Value month
A 10 1
A 12 3
A 9 12
B 11 1
B 10 8
From the data above, I want to get the following data;
Class Value month
A 10 1
A NA 2
A 12 3
A NA 4
....
A 9 12
B 11 1
B NA 2
....
B 10 8
B NA 9
....
B NA 12
So I want to have all possible cells with through month from 1 to 12;
How can I do this? I'm right now trying it with merge function, but appreciate any other ways to approach.
We can use tidyverse
library(tidyverse)
df1 %>%
complete(Class, month = min(month):max(month)) %>%
select_(.dots = names(df1)) %>% #if we need to be in the same column order
as.data.frame() #if needed to convert to 'data.frame'
In base R using merge (where df is your data):
res <- data.frame(Class=rep(levels(df$Class), each=12), value=NA, month=1:12)
merge(df, res, by = c("Class", "month"), all.y = TRUE)[,c(1,3,2)]
# Class Value month
# 1 A 10 1
# 2 A NA 2
# 3 A 12 3
# 4 A NA 4
# 5 A NA 5
# 6 A NA 6
# 7 A NA 7
# 8 A NA 8
# 9 A NA 9
# 10 A NA 10
# 11 A NA 11
# 12 A 9 12
# 13 B 11 1
# 14 B NA 2
# 15 B NA 3
# 16 B NA 4
# 17 B NA 5
# 18 B NA 6
# 19 B NA 7
# 20 B 10 8
# 21 B NA 9
# 22 B NA 10
# 23 B NA 11
# 24 B NA 12
df <- structure(list(Class = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), Value = c(10L, 12L, 9L, 11L, 10L), month = c(1L,
3L, 12L, 1L, 8L)), .Names = c("Class", "Value", "month"), class = "data.frame", row.names = c(NA,
-5L))
To add to #akrun's answer, if you want to replace the NA values with 0, you can do the following:
library(dplyr)
library(tidyr)
df1 %>%
complete(Class, month = min(month):max(month)) %>%
mutate(Value = ifelse(is.na(Value),0,Value))

Writing code for calculating Cmax and Tmax of Concentration_Time data

I have a concentration-time data of many individuals. I want to find out the Cmax (maximum concentration) and Tmax (the time at Cmax) for each individual. I want to retain the results in R by adding a new "Cmax" and "Tmax" columns to the original dataset.
The data frame looks like this:
#df <-
ID TIME CONC
1 0 0
1 1 10
1 2 15
1 5 12
2 1 5
2 2 10
2 5 20
2 6 10
Ans so on. I started with something to find Cmax for an individual but its not getting me any where. Any help in fixing the code or an easier way of finding both (Cmax, and Tmax) is highly appreciable !
Cmax=function(df) {
n = length(df$CONC)
c_temp=0 # this is a temporary counter
c_max=0
for(i in 2:n){
if(df$CONC[i] > df$CONC[i-1]{
c_temp= c_temp+1
if(c_temp > c_max) c_max=c_temp # check
}
}
return(c_max)
}
Try
library(dplyr)
df %>%
group_by(ID) %>%
mutate(Cmax= max(CONC), Tmax=TIME[which.max(CONC)])
# ID TIME CONC Cmax Tmax
#1 1 0 0 15 2
#2 1 1 10 15 2
#3 1 2 15 15 2
#4 1 5 12 15 2
#5 2 1 5 20 5
#6 2 2 10 20 5
#7 2 5 20 20 5
#8 2 6 10 20 5
Or using data.table
library(data.table)
setDT(df)[, c("Cmax", "Tmax") := list(max(CONC),
TIME[which.max(CONC)]), by=ID]
Or using split from base R
unsplit(lapply(split(df, df$ID), function(x)
within(x, {Cmax <- max(CONC)
Tmax <- TIME[which.max(CONC)] })),
df$ID)
# ID TIME CONC Tmax Cmax
#1 1 0 0 2 15
#2 1 1 10 2 15
#3 1 2 15 2 15
#4 1 5 12 2 15
#5 2 1 5 5 20
#6 2 2 10 5 20
#7 2 5 20 5 20
#8 2 6 10 5 20
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), TIME = c(0L,
1L, 2L, 5L, 1L, 2L, 5L, 6L), CONC = c(0L, 10L, 15L, 12L, 5L,
10L, 20L, 10L)), .Names = c("ID", "TIME", "CONC"), class = "data.frame",
row.names = c(NA, -8L))

Resources