Add a new row in each group (Day)

I am trying to make a function with this data and would really appreciate help with this!
example <- data.frame(
  # Day has 18 values; R recycles them against the 54-row columns below
  Day = c(2, 4, 8, 16, 32, 44, 2, 4, 8, 16, 32, 44, 2, 4, 8, 16, 32, 44),
  Replicate = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
                1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
                1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3),
  # note: "Treament" (sic) is the column name used throughout
  Treament = c(rep("CC", 18), rep("HP", 18), rep("LL", 18)),
  AFDM = c(94.669342, 94.465752, 84.897023, 81.435993, 86.556221, 75.328294,
           94.262162, 88.791240, 75.735474, 81.232403, 67.050593, 76.346244,
           95.076522, 88.968823, 83.879073, 73.958836, 70.645724, 67.184695,
           99.763156, 92.022673, 92.245362, 74.513934, 50.083136, 36.979418,
           94.872932, 86.353037, 81.843173, 67.795465, 46.622106, 18.323099,
           95.089932, 93.244212, 81.679814, 65.352385, 18.286525, 7.517794,
           99.559972, 86.759404, 84.693433, 79.196504, 67.456961, 54.765706,
           94.074014, 87.543693, 82.492548, 72.333367, 51.304676, 51.304676,
           98.340870, 86.322153, 87.950873, 84.693433, 63.316485, 63.723665))
Example:
I want to insert a new row with an AFDM value (e.g., 0.9823666) that was calculated with another function.
This new row must come before each Day 2 row (labelled Day 0), and I want to preserve the Replicate and Treatment of each group.
Thus, each new row must be: Day = 0, Replicate = same, Treatment = same, AFDM = 0.9823666.
This is so I can later run a regression on the data (from Day 0 to 44, with 3 replicates per Treatment).
I would prefer a dplyr solution.
Thanks in advance

We can create a grouping column with cumsum, then expand the dataset with complete and fill the remaining columns:
library(dplyr)
library(tidyr)
example %>%
  group_by(grp = cumsum(Day == 2)) %>%
  complete(Day = c(0, unique(Day)), fill = list(AFDM = 0.9823666)) %>%
  fill(Replicate, Treament, .direction = 'updown')
# A tibble: 63 x 5
# Groups: grp [9]
# grp Day Replicate Treament AFDM
# <int> <dbl> <dbl> <chr> <dbl>
# 1 1 0 1 CC 0.982
# 2 1 2 1 CC 94.7
# 3 1 4 1 CC 94.5
# 4 1 8 1 CC 84.9
# 5 1 16 1 CC 81.4
# 6 1 32 1 CC 86.6
# 7 1 44 1 CC 75.3
# 8 2 0 2 CC 0.982
# 9 2 2 2 CC 94.3
#10 2 4 2 CC 88.8
# … with 53 more rows
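If the helper grp column is not wanted in the result, it can be dropped at the end (a small follow-up sketch using the same pipeline):
example %>%
  group_by(grp = cumsum(Day == 2)) %>%
  complete(Day = c(0, unique(Day)), fill = list(AFDM = 0.9823666)) %>%
  fill(Replicate, Treament, .direction = 'updown') %>%
  ungroup() %>%
  select(-grp)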

You can use distinct to get the unique Replicate/Treament combinations, add the Day and AFDM columns with the default values, and bind the rows back to the original dataframe.
library(dplyr)
example %>%
distinct(Replicate, Treament) %>%
mutate(Day = 0, AFDM = 0.9823666) %>%
bind_rows(example) %>%
arrange(Replicate, Treament)
# Replicate Treament Day AFDM
#1 1 CC 0 0.9823666
#2 1 CC 2 94.6693420
#3 1 CC 4 94.4657520
#4 1 CC 8 84.8970230
#5 1 CC 16 81.4359930
#6 1 CC 32 86.5562210
#7 1 CC 44 75.3282940
#8 1 HP 0 0.9823666
#9 1 HP 2 99.7631560
#10 1 HP 4 92.0226730
#.....
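With the Day 0 rows in place, the regression the asker mentions can then be fit per Treatment. A sketch only: the model form AFDM ~ Day and the object names example0/fits are assumptions for illustration, not part of the question.
library(dplyr)
example0 <- example %>%
  distinct(Replicate, Treament) %>%
  mutate(Day = 0, AFDM = 0.9823666) %>%
  bind_rows(example)
# one hypothetical linear fit per Treatment; adjust the formula as needed
fits <- example0 %>%
  group_by(Treament) %>%
  group_map(~ lm(AFDM ~ Day, data = .x))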

Related

Infill missing variables of a df from a list

I have missing categorical variables in a list. I would like to add all the combinations of these classifications to the data frame using complete. I can do this for a single variable using mutate.
Simplified example:
library(tidyverse)
df <- tibble(a1 = 1:6,
             b1 = rep(c(1, 2), 3),
             c1 = rep(c(1:3), 2))
missing_cols <- list(d1 = c(7:8),
                     e1 = c(12:14))
# Use the first classification of d1 for mutate, then complete with all classifications
df %>%
  mutate(!!names(missing_cols)[1] := missing_cols[[1]][1]) %>%
  complete(nesting(a1, b1, c1), d1 = missing_cols[[1]])
Desired output
df %>%
  mutate(!!names(missing_cols)[1] := missing_cols[[1]][1]) %>%
  mutate(!!names(missing_cols)[2] := missing_cols[[2]][1]) %>%
  complete(nesting(a1, b1, c1), d1 = missing_cols[[1]], e1 = missing_cols[[2]])
This will get the correct output for d1. How can I do this for all variables in my list?
We can use crossing with cross_df:
library(tidyr)
library(purrr)  # cross_df() comes from purrr
crossing(df, cross_df(missing_cols))
# a1 b1 c1 d1 e1
# <int> <dbl> <int> <int> <int>
# 1 1 1 1 7 12
# 2 1 1 1 7 13
# 3 1 1 1 7 14
# 4 1 1 1 8 12
# 5 1 1 1 8 13
# 6 1 1 1 8 14
# 7 2 2 2 7 12
# 8 2 2 2 7 13
# 9 2 2 2 7 14
#10 2 2 2 8 12
# … with 26 more rows
cross_df creates all possible combinations of missing_cols, while crossing takes that output and creates all possible combinations with df.
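To see the intermediate step on its own (illustrative; the exact row order may differ):
cross_df(missing_cols)
# a 6 x 2 tibble holding every (d1, e1) pair: 2 values of d1 times 3 values of e1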
Using expand.grid
library(tidyr)
crossing(df, expand.grid(missing_cols))
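A base-R-only variant is also possible, since merge() with by = NULL performs a cross join (sketch; returns a plain data.frame rather than a tibble):
merge(df, expand.grid(missing_cols), by = NULL)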

Remove if unit only has one observation

I have a long form of clinical data that looks something like this:
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
outcome <- c(1,1,1,1,1,NA,1,NA,NA,NA,NA,NA)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
Data <- data.frame(patientid=patientid, outcome=outcome, time=time)
A patient should be kept in the database only if they have 2 or 3 observations; patients that have complete data for 0 or only 1 time point should be thrown out. So for this example my desired result is this:
patientid <- c(100,100,100,101,101,101)
outcome <- c(1,1,1,1,1,NA)
time <- c(1,2,3,1,2,3)
Data <- data.frame(patientid=patientid, outcome=outcome, time=time)
Hence patients 102 and 104 are thrown out of the database because they were missing the outcome variable at 2 or 3 of the time points.
We can count the non-NA elements by summing the logical vector !is.na(outcome), grouped by 'patientid', and keep only patients having more than one non-NA 'outcome':
library(dplyr)
Data %>%
  group_by(patientid) %>%
  filter(sum(!is.na(outcome)) > 1) %>%
  ungroup
Output:
# A tibble: 6 x 3
# patientid outcome time
# <dbl> <dbl> <dbl>
#1 100 1 1
#2 100 1 2
#3 100 1 3
#4 101 1 1
#5 101 1 2
#6 101 NA 3
A base R option using subset + ave
subset(
  Data,
  ave(!is.na(outcome), patientid, FUN = sum) > 1  # per-patient count of non-NA outcomes
)
giving
patientid outcome time
1 100 1 1
2 100 1 2
3 100 1 3
4 101 1 1
5 101 1 2
6 101 NA 3
A data.table option
setDT(Data)[, Y := sum(!is.na(outcome)), patientid][Y > 1, ][, Y := NULL][]
or a simpler one (thanks @akrun), where .I collects the row indices of the groups that qualify:
setDT(Data)[Data[, .I[sum(!is.na(outcome)) > 1], .(patientid)]$V1]
which gives
patientid outcome time
1: 100 1 1
2: 100 1 2
3: 100 1 3
4: 101 1 1
5: 101 1 2
6: 101 NA 3
library(dplyr)
Data %>%
  group_by(patientid) %>%
  # sum(outcome, na.rm = TRUE) counts the non-missing rows here only because
  # outcome is always 1 when present; sum(!is.na(outcome)) would be more robust
  mutate(observation = sum(outcome, na.rm = TRUE)) %>%
  filter(observation >= 2) %>%
  ungroup
output:
# A tibble: 6 x 4
patientid outcome time observation
<dbl> <dbl> <dbl> <dbl>
1 100 1 1 3
2 100 1 2 3
3 100 1 3 3
4 101 1 1 2
5 101 1 2 2
6 101 NA 3 2
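If the helper column should not appear in the final data, it can be dropped at the end. A sketch of that variant, swapping in the NA-robust count sum(!is.na(outcome)):
library(dplyr)
Data %>%
  group_by(patientid) %>%
  mutate(observation = sum(!is.na(outcome))) %>%
  filter(observation >= 2) %>%
  select(-observation) %>%
  ungroup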

Determine percentage of rows with missing values in a dataframe in R

I have a data frame with three variables and some missing values in one of the variables that looks like this:
subject <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
part <- c(0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3)
sad <- c(1,7,7,4,NA,NA,2,2,NA,2,3,NA,NA,2,2,1,NA,5,NA,6,6,NA,NA,3,3,NA,NA,5,3,NA,7,2)
df1 <- data.frame(subject,part,sad)
I have created a new data frame with the mean values of 'sad' per subject and part using a loop, like this:
columns<-c("sad.m",
"part",
"subject")
df2<-matrix(data=NA,nrow=1,ncol=length(columns))
df2<-data.frame(df2)
names(df2)<-columns
tn<-unique(df1$subject)
row=1
for (s in tn){
for (i in 0:3){
TN<-df1[df1$subject==s&df1$part==i,]
df2[row,"sad.m"]<-mean(as.numeric(TN$sad), na.rm = TRUE)
df2[row,"part"]<-i
df2[row,"subject"]<-s
row=row+1
}
}
Now I want to include an additional variable 'missing' that indicates the percentage of rows per subject and part with missing values, so that I get df3:
subject <- c(1,1,1,1,2,2,2,2)
part<-c(0,1,2,3,0,1,2,3)
sad.m<-df2$sad.m
missing <- c(0,50,50,25,50,50,50,25)
df3 <- data.frame(subject,part,sad.m,missing)
I'd really appreciate any help on how to go about this!
It's best to avoid loops in R where possible, since they can get messy and tend to be quite slow. For this sort of thing the dplyr library is perfect and well worth learning; it can save you a lot of time.
You can create a data frame with both variables by first grouping by subject and part, and then performing a summary of the grouped data frame:
df2 <- df1 %>%
  dplyr::group_by(subject, part) %>%
  dplyr::summarise(
    sad_mean = mean(na.omit(sad)),
    na_count = sum(is.na(sad)) / n() * 100
  )
df2
df2
# A tibble: 8 x 4
# Groups: subject [2]
subject part sad_mean na_count
<dbl> <dbl> <dbl> <dbl>
1 1 0 4.75 0
2 1 1 2 50
3 1 2 2.5 50
4 1 3 1.67 25
5 2 0 5.5 50
6 2 1 4.5 50
7 2 2 4 50
8 2 3 4 25
For each subject and part you can calculate the mean of sad, and get the percentage of NA values by combining is.na and mean:
library(dplyr)
df1 %>%
  group_by(subject, part) %>%
  summarise(sad.m = mean(sad, na.rm = TRUE),
            perc_missing = mean(is.na(sad)) * 100)
# subject part sad.m perc_missing
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 4.75 0
#2 1 1 2 50
#3 1 2 2.5 50
#4 1 3 1.67 25
#5 2 0 5.5 50
#6 2 1 4.5 50
#7 2 2 4 50
#8 2 3 4 25
Same logic with data.table:
library(data.table)
setDT(df1)[, .(sad.m = mean(sad, na.rm = TRUE),
               perc_missing = mean(is.na(sad)) * 100), .(subject, part)]
Try this dplyr approach to compute df3:
library(dplyr)
# Code
df3 <- df1 %>%
  group_by(subject, part) %>%
  summarise(N = 100 * length(which(is.na(sad))) / length(sad))
Output:
# A tibble: 8 x 3
# Groups: subject [2]
subject part N
<dbl> <dbl> <dbl>
1 1 0 0
2 1 1 50
3 1 2 50
4 1 3 25
5 2 0 50
6 2 1 50
7 2 2 50
8 2 3 25
And to combine with df2 you can use left_join():
# Left join
df3 <- df1 %>%
  group_by(subject, part) %>%
  summarise(N = 100 * length(which(is.na(sad))) / length(sad)) %>%
  left_join(df2)
Output:
# A tibble: 8 x 4
# Groups: subject [2]
subject part N sad.m
<dbl> <dbl> <dbl> <dbl>
1 1 0 0 4.75
2 1 1 50 2
3 1 2 50 2.5
4 1 3 25 1.67
5 2 0 50 5.5
6 2 1 50 4.5
7 2 2 50 4
8 2 3 25 4
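For completeness, a base R sketch of the same computation, assuming aggregate with na.action = na.pass so each group keeps its NA rows (sad.m/missing are names chosen for illustration):
sad.m   <- aggregate(sad ~ subject + part, df1,
                     function(x) mean(x, na.rm = TRUE), na.action = na.pass)
missing <- aggregate(sad ~ subject + part, df1,
                     function(x) mean(is.na(x)) * 100, na.action = na.pass)
df3 <- merge(sad.m, missing, by = c("subject", "part"),
             suffixes = c(".m", ".missing"))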

Manipulating large dataset with dcast

Apologies if this is a repeat question but I could not find the specific answer I am looking for. I have a dataframe with counts of different species caught on a given trip. A simplified example with 5 trips and 4 species is below:
trip = c(1,1,1,2,2,3,3,3,3,4,5,5)
species = c("a","b","c","b","d","a","b","c","d","c","c","d")
count = c(5,7,3,1,8,10,1,4,3,1,2,10)
dat = cbind.data.frame(trip, species, count)
dat
> dat
trip species count
1 1 a 5
2 1 b 7
3 1 c 3
4 2 b 1
5 2 d 8
6 3 a 10
7 3 b 1
8 3 c 4
9 3 d 3
10 4 c 1
11 5 c 2
12 5 d 10
I am only interested in the counts of species b for each trip. So I want to manipulate this data frame so I end up with one that looks like this:
trip2 = c(1,2,3,4,5)
species2 = c("b","b","b","b","b")
count2 = c(7,1,1,0,0)
dat2 = cbind.data.frame(trip2, species2, count2)
dat2
> dat2
trip2 species2 count2
1 1 b 7
2 2 b 1
3 3 b 1
4 4 b 0
5 5 b 0
I want to keep all trips, including trips where species b was not observed. So I can't just subset the data by species b. I know I can cast the data so species are the columns and then just remove the columns for the other species like so:
library(dplyr)
library(reshape2)
test = dcast(dat, trip ~ species, value.var = "count", fun.aggregate = sum)
test
> test
trip a b c d
1 1 5 7 3 0
2 2 0 1 0 8
3 3 10 1 4 3
4 4 0 0 1 0
5 5 0 0 2 10
However, my real dataset has several hundred species caught on thousands of trips, and if I try to cast that many species to columns R chokes. There are way too many columns. Is there a way to specify in dcast that I only want to cast species b? Or is there another way to do this that doesn't require casting the data? Thank you.
Here is a data.table approach which I suspect will be very fast for you:
library(data.table)
setDT(dat)
# for each trip, sum the counts on rows where species == "b";
# an empty subset sums to 0, so trips without b correctly get count = 0
result <- dat[, .(species = "b", count = sum(.SD[species == "b", count])), by = trip]
result
trip species count
1: 1 b 7
2: 2 b 1
3: 3 b 1
4: 4 b 0
5: 5 b 0
We can use the tidyverse:
library(dplyr)
library(tidyr)
dat %>%
  filter(species == 'b') %>%
  group_by(trip, species) %>%
  summarise(count = sum(count)) %>%
  ungroup %>%
  complete(trip = unique(dat$trip), fill = list(species = 'b', count = 0))
# A tibble: 5 x 3
# trip species count
# <dbl> <chr> <dbl>
#1 1 b 7
#2 2 b 1
#3 3 b 1
#4 4 b 0
#5 5 b 0
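A base R sketch of the same idea: aggregate the species-b counts per trip, then merge onto the full trip list so trips without b get 0 (b/out are illustrative names):
b <- aggregate(count ~ trip, subset(dat, species == "b"), sum)
out <- merge(data.frame(trip = unique(dat$trip)), b, all.x = TRUE)
out$species <- "b"
out$count[is.na(out$count)] <- 0
out[, c("trip", "species", "count")]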

Create a count-consecutive variable which resets to 1

I have a dataset like the following, where "group" is a group variable. I want to count the number of consecutive days by group, but if a date is not the next day I want the count to reset to one (as shown in the "want" column). Then, I want to return the max of the "want" column per group (as in want2). Suggestions would be appreciated!
df <- data.frame(group = c(1, 1, 1, 1, 2, 2, 2),
                 date = c("2000-01-01", "2000-01-03", "2000-01-04", "2000-01-05",
                          "2000-01-09", "2000-01-10", "2000-01-12"),
                 want = c(1, 1, 2, 3, 1, 2, 1),
                 want2 = c(3, 3, 3, 3, 2, 2, 2))
Bonus part 2: Thank you for all the feedback, it was extremely helpful. Is there a way to do the same with an added condition? I have a binary variable, and I also want my count to reset when that variable == 0. Like so:
# group date binary want
#1 1 2000-01-01 1 1
#2 1 2000-01-03 1 1
#3 1 2000-01-04 1 2
#4 1 2000-01-05 0 1
#5 2 2000-01-09 1 1
#6 2 2000-01-10 0 1
#7 2 2000-01-12 1 1
#8 3 2000-01-05 1 1
#9 3 2000-01-06 1 2
#10 3 2000-01-07 1 3
#11 3 2000-01-08 1 4
I have tried akrun's suggestion, which worked very well without the binary variable; I tried to modify it by adding the binary variable inside cumsum, but I get errors:
df %>% group_by(group)
%>% mutate(wantn = rowid(cumsum(c(TRUE, diff(as.Date(date)) !=1 & binary==1)))
Thanks!
An option is to group by 'group', then use diff on the Date-class converted 'date' to create a logical vector marking where a new run of consecutive days starts, take cumsum of it to build run IDs, and apply rowid to those IDs to replicate 'want' (as 'wantn'); 'want2n' is then the max of 'wantn':
library(dplyr)
library(data.table)  # for rowid()
df %>%
  group_by(group) %>%
  mutate(wantn = rowid(cumsum(c(TRUE, diff(as.Date(date)) != 1))),
         want2n = max(wantn))
# A tibble: 7 x 6
# Groups: group [2]
# group date want want2 wantn want2n
# <dbl> <fct> <dbl> <dbl> <int> <int>
#1 1 2000-01-01 1 3 1 3
#2 1 2000-01-03 1 3 1 3
#3 1 2000-01-04 2 3 2 3
#4 1 2000-01-05 3 3 3 3
#5 2 2000-01-09 1 2 1 2
#6 2 2000-01-10 2 2 2 2
#7 2 2000-01-12 1 2 1 2
Or, if we want to avoid rowid, create the grouping variable with cumsum and take the row sequence:
df %>%
  group_by(group) %>%
  group_by(group2 = cumsum(c(TRUE, diff(as.Date(date)) != 1)), add = TRUE) %>%
  mutate(wantn = row_number()) %>%
  group_by(group) %>%
  mutate(want2n = max(wantn)) %>%
  select(-group2)
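As for the bonus condition, a sketch along the same lines (assumption: the count resets on the binary == 0 row itself, matching the table above). Rather than AND-ing binary inside the diff comparison, OR the binary == 0 flag into the run-break vector, so a 0 starts a new run just like a date gap. dfb below is simply a reconstruction of the bonus table:
library(dplyr)
library(data.table)  # for rowid()
dfb <- data.frame(
  group  = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
  date   = c("2000-01-01", "2000-01-03", "2000-01-04", "2000-01-05",
             "2000-01-09", "2000-01-10", "2000-01-12",
             "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08"),
  binary = c(1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1))
dfb %>%
  group_by(group) %>%
  # a run breaks at the first row of a group, at a non-consecutive date,
  # or wherever binary == 0
  mutate(want = rowid(cumsum(c(TRUE, diff(as.Date(date)) != 1) | binary == 0))) %>%
  ungroup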
