Related
I have the following code:
Ni <- 133 # number of individuals
MXmeas <- 10 # maximum number of measurements per individual
# Simulate the number of observations for each individual (between 1 and MXmeas).
Nmeas <- round(runif(Ni, 1, MXmeas))
# Simulate observation moments: everybody is observed at moment 1, the
# remaining x - 1 moments are drawn without replacement from 2:MXmeas.
# Indexing via sample.int() avoids sample()'s special case for a length-one
# pool, and lapply() always yields a list, so unlist() returns a plain vector
# even when every individual happens to have the same number of measurements.
later_moments <- 2:MXmeas
obs <- unlist(lapply(Nmeas, function(x) {
  picked <- later_moments[sample.int(length(later_moments), x - 1, replace = FALSE)]
  c(1, sort(picked))
}))
# Set up the data frame (id, observations): one row per observation.
dat <- data.frame(ID = rep(seq_len(Ni), times = Nmeas), observations = obs)
This results in the following output:
ID observations
1 1
1 3
1 4
1 5
1 6
1 8
However, I also want a variable 'times' that indicates the running number of each measurement for each individual. But since every ID has a different number of rows, I am not sure how to implement this. Does anybody know how to include that? I want it to look like this:
ID observations times
1 1 1
1 3 2
1 4 3
1 5 4
1 6 5
1 8 6
Using dplyr you could group by ID and use the row number for times:
library(dplyr)
# Within each ID, number the rows 1..n; that running index is 'times'.
dat |>
  group_by(ID) |>
  mutate(times = seq_len(n())) |>
  ungroup()
With base we could create the sequence based on each of the lengths of the ID variable:
# rle() gives the length of each run of identical consecutive IDs;
# sequence() then counts 1..length within every run.
dat[["times"]] <- sequence(rle(dat[["ID"]])[["lengths"]])
Output:
# A tibble: 734 × 3
ID observations times
<int> <dbl> <int>
1 1 1 1
2 1 3 2
3 1 9 3
4 2 1 1
5 2 5 2
6 2 6 3
7 2 8 4
8 3 1 1
9 3 2 2
10 3 5 3
# … with 724 more rows
Data (using a seed):
set.seed(1) # fixed seed so the simulated data are reproducible
Ni <- 133 # number of individuals
MXmeas <- 10 # maximum number of measurements per individual
# Simulate the number of observations for each individual (between 1 and MXmeas).
Nmeas <- round(runif(Ni, 1, MXmeas))
# Simulate observation moments: everybody is observed at moment 1, the
# remaining x - 1 moments are drawn without replacement from 2:MXmeas.
# Indexing via sample.int() avoids sample()'s special case for a length-one
# pool, and lapply() always yields a list, so unlist() returns a plain vector
# even when every individual happens to have the same number of measurements.
later_moments <- 2:MXmeas
obs <- unlist(lapply(Nmeas, function(x) {
  picked <- later_moments[sample.int(length(later_moments), x - 1, replace = FALSE)]
  c(1, sort(picked))
}))
# Set up the data frame (id, observations): one row per observation.
dat <- data.frame(ID = rep(seq_len(Ni), times = Nmeas), observations = obs)
I have a data frame with three variables and some missing values in one of the variables that looks like this:
# Two subjects, each with four rows in every part 0..3 (32 rows in total).
subject <- rep(c(1, 2), each = 16)
part <- rep(rep(c(0, 1, 2, 3), each = 4), times = 2)
# Ratings in row order; NA marks a missing rating.
sad <- c(
  1, 7, 7, 4, NA, NA, 2, 2, NA, 2, 3, NA, NA, 2, 2, 1,
  NA, 5, NA, 6, 6, NA, NA, 3, 3, NA, NA, 5, 3, NA, 7, 2
)
df1 <- data.frame(subject = subject, part = part, sad = sad)
I have created a new data frame with the mean values of 'sad' per subject and part using a loop, like this:
# Build df2: the mean of 'sad' (ignoring NA) for every subject/part combination.
# The result is computed on a preallocated grid instead of growing a data
# frame one row at a time inside nested loops.
tn <- unique(df1$subject)
parts <- 0:3
# One row per combination; the first factor (part) varies fastest, so rows are
# ordered by subject and, within subject, by part.
combos <- expand.grid(part = parts, subject = tn, KEEP.OUT.ATTRS = FALSE)
df2 <- data.frame(
  sad.m = vapply(
    seq_len(nrow(combos)),
    function(k) {
      in_cell <- df1$subject == combos$subject[k] & df1$part == combos$part[k]
      # NaN is returned for a combination without any non-missing value.
      mean(as.numeric(df1$sad[in_cell]), na.rm = TRUE)
    },
    numeric(1)
  ),
  part = combos$part,
  subject = combos$subject
)
Now I want to include an additional variable 'missing' that indicates the percentage of rows per subject and part with missing values, so that I get df3:
# Desired result: one row per subject/part with the mean and the percentage
# of missing 'sad' values.
subject <- rep(c(1, 2), each = 4)
part <- rep(c(0, 1, 2, 3), times = 2)
sad.m <- df2[["sad.m"]]
missing <- c(0, 50, 50, 25, 50, 50, 50, 25)
df3 <- data.frame(subject = subject, part = part, sad.m = sad.m, missing = missing)
I'd really appreciate any help on how to go about this!
It's best to try and avoid loops in R where possible, since they can get messy and tend to be quite slow. For this sort of thing, dplyr library is perfect and well worth learning. It can save you a lot of time.
You can create a data frame with both variables by first grouping by subject and part, and then performing a summary of the grouped data frame:
# Per subject/part: mean of the non-missing 'sad' values and the percentage
# of rows whose 'sad' value is missing.
df2 <- df1 %>%
  dplyr::group_by(subject, part) %>%
  dplyr::summarise(
    sad_mean = mean(sad, na.rm = TRUE),
    na_count = 100 * sum(is.na(sad) / n())
  )
df2
# A tibble: 8 x 4
# Groups: subject [2]
subject part sad_mean na_count
<dbl> <dbl> <dbl> <dbl>
1 1 0 4.75 0
2 1 1 2 50
3 1 2 2.5 50
4 1 3 1.67 25
5 2 0 5.5 50
6 2 1 4.5 50
7 2 2 4 50
8 2 3 4 25
For each subject and part you can calculate mean of sad and calculate ratio of NA value using is.na and mean.
library(dplyr)
# mean(is.na(x)) is the share of missing values; times 100 gives a percentage.
summarise(
  group_by(df1, subject, part),
  sad.m = mean(sad, na.rm = TRUE),
  perc_missing = mean(is.na(sad)) * 100
)
# subject part sad.m perc_missing
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 4.75 0
#2 1 1 2 50
#3 1 2 2.5 50
#4 1 3 1.67 25
#5 2 0 5.5 50
#6 2 1 4.5 50
#7 2 2 4 50
#8 2 3 4 25
Same logic with data.table :
library(data.table)
# Convert df1 to a data.table by reference, then aggregate per subject/part.
setDT(df1)
df1[
  ,
  .(sad.m = mean(sad, na.rm = TRUE), perc_missing = mean(is.na(sad)) * 100),
  by = .(subject, part)
]
Try this dplyr approach to compute df3:
library(dplyr)
# Percentage of missing 'sad' values per subject/part.
df3 <- df1 %>%
  group_by(subject, part) %>%
  summarise(N = 100 * sum(is.na(sad)) / length(sad))
Output:
# A tibble: 8 x 3
# Groups: subject [2]
subject part N
<dbl> <dbl> <dbl>
1 1 0 0
2 1 1 50
3 1 2 50
4 1 3 25
5 2 0 50
6 2 1 50
7 2 2 50
8 2 3 25
And for full interaction with df2 you can use left_join():
# Left join: attach the columns of df2 to the per-group missing percentages
# (the join keys are the column names the two tables have in common).
df3 <- left_join(
  summarise(
    group_by(df1, subject, part),
    N = 100 * sum(is.na(sad)) / length(sad)
  ),
  df2
)
Output:
# A tibble: 8 x 4
# Groups: subject [2]
subject part N sad.m
<dbl> <dbl> <dbl> <dbl>
1 1 0 0 4.75
2 1 1 50 2
3 1 2 50 2.5
4 1 3 25 1.67
5 2 0 50 5.5
6 2 1 50 4.5
7 2 2 50 4
8 2 3 25 4
I have a dataset that looks like this, and I would like to write code that keeps some rows based on the time. It needs to keep all the rows of an id if a certain value of time is reached (e.g. 5).
# Example data: id 1 has five time points, id 2 only four.
id <- rep(c(1, 2), times = c(5, 4))
time <- c(1, 2, 3, 4, 5, 1, 2, 3, 4)
amount <- c(10, 20, 40, 50, 60, 12, 20, 32, 42)
# Bind the three vectors column-wise into a numeric matrix.
e <- cbind(id = id, time = time, amount = amount)
e
For example, in this case, I would like to keep all the rows of id == 1 because its time reaches 5.
Since time never equals 5 for id == 2, its rows would be deleted.
Thank you very much for your help!
You can try this by creating a flag:
library(dplyr)
id <- c(rep(1, 5), rep(2, 4))
time <- c(1, 2, 3, 4, 5, 1, 2, 3, 4)
amount <- c(10, 20, 40, 50, 60, 12, 20, 32, 42)
e <- data.frame(id, time, amount)
e
# Compute: keep every row of an id as soon as one of its time values equals 5.
# Testing any(time == 5) instead of max(time) == 5 still keeps ids whose time
# continues past 5, and na.rm = TRUE makes missing times count as "not 5".
e %>%
  group_by(id) %>%
  filter(any(time == 5, na.rm = TRUE))
# A tibble: 5 x 3
# Groups: id [1]
id time amount
<dbl> <dbl> <dbl>
1 1 1 10
2 1 2 20
3 1 3 40
4 1 4 50
5 1 5 60
Or in base R:
# Largest time per id, then keep the rows of every id whose maximum is >= 5.
e[e[, "id"] %in% names(which(tapply(e[, "time"], e[, "id"], max) >= 5)), ]
#> id time amount
#> [1,] 1 1 10
#> [2,] 1 2 20
#> [3,] 1 3 40
#> [4,] 1 4 50
#> [5,] 1 5 60
A base solution:
e <- data.frame(id, time, amount)
# ids having at least one time >= 5; keep all rows belonging to those ids.
e[e$id %in% e$id[e$time >= 5], ]
# id time amount
# 1 1 1 10
# 2 1 2 20
# 3 1 3 40
# 4 1 4 50
# 5 1 5 60
The corresponding dplyr version:
library(dplyr)
# Keep all rows of the ids that have at least one time >= 5.
filter(e, id %in% id[time >= 5])
I have a data set for a meta-analysis that contains pre-test data in a set of columns, post-test data in another set of columns, and one column for condition (i.e., treatment [Condition == 1] versus control [Condition == 0]). I need to widen this data set such that I create a new set of columns for control observations' pre-test data and post-test data which is placed alongside that of the original treatment data. These data are grouped by ID. This means that I need to conditionally copy only observations that are "control" into a set of columns alongside the "treatment" observations, but within each ID group.
I know that's an obnoxious way to describe it, so here's an example of the data set I have:
# Three IDs, each with a control row (Condition 0) and two treatment rows.
data_before.df <- data.frame(
  ID = rep(c(1, 2, 3), each = 3),
  Condition = rep(c(0, 1, 2), times = 3),
  Pre_M = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
  Post_M = c(90, 80, 70, 60, 50, 40, 30, 20, 10)
)
data_before.df
And here's what I need to get to:
# Target layout: treatment rows only, with the control values of the same ID
# repeated alongside in Control_* columns.
data_after.df <- data.frame(
  ID = rep(c(1, 2, 3), each = 2),
  Condition = rep(c(1, 2), times = 3),
  Pre_M = c(2, 3, 5, 6, 8, 9),
  Post_M = c(80, 70, 50, 40, 20, 10),
  Control_Pre_M = rep(c(1, 4, 7), each = 2),
  Control_Post_M = rep(c(90, 60, 30), each = 2)
)
data_after.df
Here is one option with dplyr. After grouping by 'ID', create two new columns with 'Control' as part of the column name by looping over the columns that end with 'M' and subsetting the value where 'Condition' is 0; then ungroup and filter out the rows where 'Condition' is 0.
library(dplyr)
library(stringr)
# For every ID, copy the control row's value (Condition == 0) of each column
# ending in "M" into a new Control_<column> column, then drop the control rows.
# across() with .names creates the prefixed names directly, so no separate
# rename step is needed.
data_before.df %>%
  group_by(ID) %>%
  mutate(across(ends_with("M"), ~ .x[Condition == 0], .names = "Control_{.col}")) %>%
  ungroup() %>%
  filter(Condition != 0)
# A tibble: 6 x 6
# ID Condition Pre_M Post_M Control_Pre_M Control_Post_M
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 2 80 1 90
#2 1 2 3 70 1 90
#3 2 1 5 50 4 60
#4 2 2 6 40 4 60
#5 3 1 8 20 7 30
#6 3 2 9 10 7 30
Or an option with merge from base R
# Join the treatment rows with the control rows of the same ID. The control
# columns are renamed up front so the result carries Control_Pre_M and
# Control_Post_M instead of merge()'s default .x/.y suffixes.
merge(
  subset(data_before.df, Condition != 0),
  setNames(
    subset(data_before.df, Condition == 0, select = c("ID", "Pre_M", "Post_M")),
    c("ID", "Control_Pre_M", "Control_Post_M")
  ),
  by = "ID"
)
Or a join with data.table
library(data.table)
# setDT() turns data_before.df into a data.table by reference. The treatment
# rows (Condition != 0) are then joined on ID with the control rows
# (Condition == 0), whose Pre_M/Post_M columns are renamed with a Control_ prefix.
setDT(data_before.df)[Condition != 0][data_before.df[Condition == 0,
.(ID, Control_Pre_M = Pre_M, Control_Post_M = Post_M)], on = .(ID)]
# ID Condition Pre_M Post_M Control_Pre_M Control_Post_M
#1: 1 1 2 80 1 90
#2: 1 2 3 70 1 90
#3: 2 1 5 50 4 60
#4: 2 2 6 40 4 60
#5: 3 1 8 20 7 30
#6: 3 2 9 10 7 30
Consider this simple example:
library(dplyr)
library(broom)
# Example data: six ids in two groups. tibble() replaces the deprecated
# data_frame() constructor and builds the same object.
dataframe <- tibble(
  id = c(1, 2, 3, 4, 5, 6),
  group = c(1, 1, 1, 2, 2, 2),
  value = c(200, 400, 120, 300, 100, 100)
)
# A tibble: 6 x 3
id group value
<dbl> <dbl> <dbl>
1 1 1 200
2 2 1 400
3 3 1 120
4 4 2 300
5 5 2 100
6 6 2 100
Here I want to group by group and create two columns.
One is the number of distinct values in value (I can use dplyr::n_distinct), the other is the constant term from a regression of value on the vector 1. That is, the output of
# Intercept-only regression on the whole data; keep just the estimate column.
lm(value ~ 1, data = dataframe) %>%
  tidy() %>%
  select(estimate)
estimate
1 203.3333
The difficulty here is combining these two simple outputs into a single mutate statement that preserves the grouping.
I tried something like:
# Question's attempt (reported by the asker as not working).
# formula1() fits an intercept-only regression of `myvar` and keeps the
# `estimate` column of the tidied fit; select() leaves that as a one-column
# data frame rather than a bare number.
formula1 <- function(data, myvar){
tidy(lm(data = data, myvar ~ 1)) %>% select(estimate)
}
# Attempted per-group use inside mutate(); `.` is the piped-in grouped data.
dataframe %>% group_by(group) %>%
mutate(distinct = n_distinct(value),
mean = formula1(., value))
but this does not work. What am I missing here?
Thanks!
This approach will work if you use pull in place of select. This extracts the single estimate value from the tidy output.
# Fit an intercept-only regression of `myvar` and return the estimated
# intercept as a bare value (pull() extracts the column as a vector).
formula1 <- function(data, myvar) {
  fit <- lm(data = data, myvar ~ 1)
  pull(tidy(fit), estimate)
}
# Per group: number of distinct values and the intercept-only estimate.
dataframe %>%
  group_by(group) %>%
  mutate(
    distinct = n_distinct(value),
    mean = formula1(., value)
  )
# A tibble: 6 x 5
# Groups: group [2]
id group value distinct mean
<dbl> <dbl> <dbl> <int> <dbl>
1 1 1 200 3 240.0000
2 2 1 400 3 240.0000
3 3 1 120 3 240.0000
4 4 2 300 2 166.6667
5 5 2 100 2 166.6667
6 6 2 100 2 166.6667