How to add a new variable with a condition in longitudinal data in R

In the following data, I would like to add another variable, say z.
mydata
y x sl
1 199.92989 1 1
2 27.73883 2 1
3 144.00000 3 1
4 72.00000 4 1
5 0.00000 5 1
6 392.60636 1 2
7 749.52499 2 2
8 3120.00000 3 2
9 1600.00000 4 2
10 1000.00000 5 2
11 5840.00000 6 2
12 3960.00000 7 2
13 4700.00000 8 2
14 1660.00000 9 2
15 5620.00000 10 2
16 0.00000 1 585
17 0.00000 2 585
18 0.00000 3 585
19 3062.32962 1 587
20 2048.97458 2 587
21 1280.00000 3 587
22 1440.00000 4 587
23 2960.00000 5 587
24 460.00000 6 587
25 530.00000 7 587
26 5190.00000 8 587
27 3200.00000 9 587
28 4620.00000 10 587
29 0.00000 1 651
30 0.00000 2 651
31 0.00000 3 651
32 0.00000 4 651
z = c(5, 7, 8). The value 5 should be repeated 5 times and belongs to sl = 1, 7 should be repeated 10 times and belongs to sl = 2, and 8 should be repeated 10 times and belongs to sl = 587. If all the observations of y are 0 for any sl (here sl = 585 and sl = 651), then z must take the value 0. The z column must therefore be z = c(rep(5, 5), rep(7, 10), rep(0, 3), rep(8, 10), rep(0, 4)), i.e. c(5, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0).
How can I do it with the above conditions?

We can use case_when from dplyr and specify the conditions.
library(dplyr)
df %>%
  mutate(z = case_when(sl == 1 ~ 5,
                       sl == 2 ~ 7,
                       sl == 587 ~ 8,
                       all(y[sl == 585] == 0) ~ 0,
                       all(y[sl == 651] == 0) ~ 0))
which returns:
# y x sl z
#1 199.92989 1 1 5
#2 27.73883 2 1 5
#3 144.00000 3 1 5
#4 72.00000 4 1 5
#5 0.00000 5 1 5
#6 392.60636 1 2 7
#7 749.52499 2 2 7
#8 3120.00000 3 2 7
#9 1600.00000 4 2 7
#10 1000.00000 5 2 7
#11 5840.00000 6 2 7
#12 3960.00000 7 2 7
#13 4700.00000 8 2 7
#14 1660.00000 9 2 7
#15 5620.00000 10 2 7
#16 0.00000 1 585 0
#17 0.00000 2 585 0
#18 0.00000 3 585 0
#19 3062.32962 1 587 8
#20 2048.97458 2 587 8
#21 1280.00000 3 587 8
#22 1440.00000 4 587 8
#23 2960.00000 5 587 8
#24 460.00000 6 587 8
#25 530.00000 7 587 8
#26 5190.00000 8 587 8
#27 3200.00000 9 587 8
#28 4620.00000 10 587 8
#29 0.00000 1 651 0
#30 0.00000 2 651 0
#31 0.00000 3 651 0
#32 0.00000 4 651 0
If we do not know which sl would have all 0, or if there are multiple such sl, we can use:
df %>%
  mutate(z = case_when(sl == 1 ~ 5,
                       sl == 2 ~ 7,
                       sl == 587 ~ 8)) %>%
  group_by(sl) %>%
  mutate(z = replace(z, all(y == 0), 0))
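A rough alternative sketch (not part of the original answer) is to encode the known sl-to-z mapping as a small lookup table, join it on, and then zero out any sl group whose y values are all 0; the lookup name is just illustrative:
library(dplyr)
# hypothetical lookup table for the known sl -> z mapping
lookup <- data.frame(sl = c(1, 2, 587), z = c(5, 7, 8))
df %>%
  left_join(lookup, by = "sl") %>%
  group_by(sl) %>%
  mutate(z = if (all(y == 0)) 0 else z) %>%   # all-zero groups get z = 0
  ungroup()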
data
df <- structure(list(y = c(199.92989, 27.73883, 144, 72, 0, 392.60636,
749.52499, 3120, 1600, 1000, 5840, 3960, 4700, 1660, 5620, 0,
0, 0, 3062.32962, 2048.97458, 1280, 1440, 2960, 460, 530, 5190,
3200, 4620, 0, 0, 0, 0), x = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L), sl = c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 585L, 585L, 585L,
587L, 587L, 587L, 587L, 587L, 587L, 587L, 587L, 587L, 587L, 651L,
651L, 651L, 651L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32"))

Related

combine two datasets with different dimension by ID

I have two datasets:
df1:
ID score
1 1 30
2 1 10
3 1 22
4 2 44
5 2 6
6 3 5
7 3 20
8 4 35
9 4 2
10 4 60
11 5 14
12 5 5
df2:
ID para1 para2
1 1 10 5
2 1 10 5
3 2 20 10
4 2 20 10
5 3 30 15
6 4 40 20
7 4 40 20
8 4 40 20
9 4 40 20
10 5 50 25
11 5 50 25
12 5 50 25
13 6 60 30
14 6 60 30
I would like to combine df1 and df2 by ID and get df3 below. I tried merge and left_join, but they don't work well, as I probably missed something. Is there a simple way to get this?
df3:
ID score para1 para2
1 1 30 10 5
2 1 10 10 5
3 1 22 10 5
4 2 44 20 10
5 2 6 20 10
6 3 5 30 15
7 3 20 30 15
8 4 35 40 20
9 4 2 40 20
10 4 60 40 20
11 5 14 50 25
12 5 5 50 25
One option to achieve your desired result would be to first get rid of the duplicated rows in your df2 using e.g. dplyr::distinct:
library(dplyr)
df1 %>%
  left_join(distinct(df2, ID, para1, para2))
#> Joining, by = "ID"
#> ID score para1 para2
#> 1 1 30 10 5
#> 2 1 10 10 5
#> 3 1 22 10 5
#> 4 2 44 20 10
#> 5 2 6 20 10
#> 6 3 5 30 15
#> 7 3 20 30 15
#> 8 4 35 40 20
#> 9 4 2 40 20
#> 10 4 60 40 20
#> 11 5 14 50 25
#> 12 5 5 50 25
DATA
df1 <- structure(list(ID = c(
1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L,
5L, 5L
), score = c(
30L, 10L, 22L, 44L, 6L, 5L, 20L, 35L, 2L,
60L, 14L, 5L
)), class = "data.frame", row.names = c(
"1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11", "12"
))
df2 <- structure(list(ID = c(
1L, 1L, 2L, 2L, 3L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 6L, 6L
), para1 = c(
10L, 10L, 20L, 20L, 30L, 40L, 40L,
40L, 40L, 50L, 50L, 50L, 60L, 60L
), para2 = c(
5L, 5L, 10L, 10L,
15L, 20L, 20L, 20L, 20L, 25L, 25L, 25L, 30L, 30L
)), class = "data.frame", row.names = c(
"1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14"
))
Another approach using data.table would be to do a join and keep the first matched row of the second data.frame.
In this case, take the subset of df2 whose key column values match df1's key column values, using ID as the key.
You can also include a nomatch argument to fill in a value when there is no match. See ?data.table for more details.
library(data.table)
setDT(df1)
setDT(df2)
df2[df1, mult = "first", on = "ID"]
Output
ID para1 para2 score
1: 1 10 5 30
2: 1 10 5 10
3: 1 10 5 22
4: 2 20 10 44
5: 2 20 10 6
6: 3 30 15 5
7: 3 30 15 20
8: 4 40 20 35
9: 4 40 20 2
10: 4 40 20 60
11: 5 50 25 14
12: 5 50 25 5
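A plain base R sketch of the same idea (my addition, assuming df1 and df2 as in the DATA block above) is to drop df2's duplicated ID rows before merging; note that merge() sorts the result by ID:
# keep only the first row per ID in df2, then merge on ID
merge(df1, df2[!duplicated(df2$ID), ], by = "ID")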

R - How to make a mean/average of n previous values, excluding current observation (rolling average)

Could someone kindly advise how best to approach making a new column in a data frame, where each observation is an average/mean of the previous 12 observations (excluding the current observation)? I have failed so far to find a similar answer on here, so any help would be greatly appreciated!
My data.frame:
LateCounts <-
Date Count
1 Jan-19 7
2 Feb-19 4
3 Mar-19 9
4 Apr-19 8
5 May-19 7
6 Jun-19 4
7 Jul-19 4
8 Aug-19 5
9 Sep-19 2
10 Oct-19 5
11 Nov-19 7
12 Dec-19 4
13 Jan-20 3
14 Feb-20 4
15 Mar-20 5
16 Apr-20 2
17 May-20 3
18 Jun-20 2
19 Jul-20 3
20 Aug-20 4
21 Sep-20 3
22 Oct-20 2
I am currently using the following code:
LateCounts <- LateCounts %>% mutate(RollAvge=rollapplyr(Count, 12, mean, partial = TRUE))
This yields the following, but it is a 12-month rolling average that includes the current observation:
Date Count RollAvge
1 Jan-19 7 7
2 Feb-19 4 5.5
3 Mar-19 9 6.666667
4 Apr-19 8 7
5 May-19 7 7
6 Jun-19 4 6.5
7 Jul-19 4 6.142857
8 Aug-19 5 6
9 Sep-19 2 5.555556
10 Oct-19 5 5.5
11 Nov-19 7 5.636364
12 Dec-19 4 5.5
13 Jan-20 3 5.166667
14 Feb-20 4 5.166667
15 Mar-20 5 4.833333
16 Apr-20 2 4.333333
17 May-20 3 4
18 Jun-20 2 3.833333
19 Jul-20 3 3.75
20 Aug-20 4 3.666667
21 Sep-20 3 3.75
22 Oct-20 2 3.5
What I actually need to achieve is shown below. This is a 12-month trailing or rolling average, where the values in the 'RollAvge' column are averages/means of the previous values in the 'Count' column, not including the current month.
Date Count RollAvge
1 Jan-19 7
2 Feb-19 4 7
3 Mar-19 9 5.5
4 Apr-19 8 6.666667
5 May-19 7 7
6 Jun-19 4 7
7 Jul-19 4 6.5
8 Aug-19 5 6.142857
9 Sep-19 2 6
10 Oct-19 5 5.555556
11 Nov-19 7 5.5
12 Dec-19 4 5.636364
13 Jan-20 3 5.5
14 Feb-20 4 5.166667
15 Mar-20 5 5.166667
16 Apr-20 2 4.833333
17 May-20 3 4.333333
18 Jun-20 2 4
19 Jul-20 3 3.833333
20 Aug-20 4 3.75
21 Sep-20 3 3.666667
22 Oct-20 2 3.755556
Thanks,
We need to take the lag of the output derived from rollapply.
library(dplyr)
library(zoo)
LateCounts %>%
  mutate(RollAvge = lag(rollapplyr(Count, 12, mean, partial = TRUE)))
Output
# Date Count RollAvge
#1 Jan-19 7 NA
#2 Feb-19 4 7.000000
#3 Mar-19 9 5.500000
#4 Apr-19 8 6.666667
#5 May-19 7 7.000000
#6 Jun-19 4 7.000000
#7 Jul-19 4 6.500000
#8 Aug-19 5 6.142857
#9 Sep-19 2 6.000000
#10 Oct-19 5 5.555556
#11 Nov-19 7 5.500000
#12 Dec-19 4 5.636364
#13 Jan-20 3 5.500000
#14 Feb-20 4 5.166667
#15 Mar-20 5 5.166667
#16 Apr-20 2 4.833333
#17 May-20 3 4.333333
#18 Jun-20 2 4.000000
#19 Jul-20 3 3.833333
#20 Aug-20 4 3.750000
#21 Sep-20 3 3.666667
#22 Oct-20 2 3.750000
data
LateCounts <- structure(list(Date = c("Jan-19", "Feb-19", "Mar-19", "Apr-19",
"May-19", "Jun-19", "Jul-19", "Aug-19", "Sep-19", "Oct-19", "Nov-19",
"Dec-19", "Jan-20", "Feb-20", "Mar-20", "Apr-20", "May-20", "Jun-20",
"Jul-20", "Aug-20", "Sep-20", "Oct-20"), Count = c(7L, 4L, 9L,
8L, 7L, 4L, 4L, 5L, 2L, 5L, 7L, 4L, 3L, 4L, 5L, 2L, 3L, 2L, 3L,
4L, 3L, 2L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22"))
Using dplyr and zoo, there is also a way to do this with the data.frame() function, using the same rolling average function but without mutate and pipes.
library(dplyr)
library(zoo)
Using the LateCounts data by @akrun (thank you for that code snippet!):
LateCounts <- structure(list(Date = c("Jan-19", "Feb-19", "Mar-19", "Apr-19",
"May-19", "Jun-19", "Jul-19", "Aug-19", "Sep-19", "Oct-19", "Nov-19",
"Dec-19", "Jan-20", "Feb-20", "Mar-20", "Apr-20", "May-20", "Jun-20",
"Jul-20", "Aug-20", "Sep-20", "Oct-20"), Count = c(7L, 4L, 9L,
8L, 7L, 4L, 4L, 5L, 2L, 5L, 7L, 4L, 3L, 4L, 5L, 2L, 3L, 2L, 3L,
4L, 3L, 2L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22"))
data.frame(LateCounts$Count,
           rollavg = dplyr::lag(rollapplyr(LateCounts$Count, 12, mean, partial = TRUE)))
Output:
LateCounts.Count rollavg
1 7 NA
2 4 7.000000
3 9 5.500000
4 8 6.666667
5 7 7.000000
6 4 7.000000
7 4 6.500000
8 5 6.142857
9 2 6.000000
10 5 5.555556
11 7 5.500000
12 4 5.636364
13 3 5.500000
14 4 5.166667
15 5 5.166667
16 2 4.833333
17 3 4.333333
18 2 4.000000
19 3 3.833333
20 4 3.750000
21 3 3.666667
22 2 3.750000
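For comparison, here is a dependency-free base R sketch of the same trailing mean (my addition, assuming the LateCounts data frame shown above); it loops over rows and averages up to the 12 previous Count values:
LateCounts$RollAvge <- sapply(seq_len(nrow(LateCounts)), function(i) {
  if (i == 1) return(NA_real_)                      # no previous observations yet
  mean(LateCounts$Count[max(1, i - 12):(i - 1)])    # mean of up to 12 prior values
})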

How can I identify the first row with value lower than the first row in different column in groups in R?

I have a data set that looks like this:
unique score value day
1 2 52 33.75 1
2 2 39 36.25 2
3 3 47 41.25 1
4 3 26 41.00 2
5 3 17 32.25 3
6 3 22 28.00 4
7 3 11 19.00 5
8 3 9 14.75 6
9 3 20 15.50 7
10 4 32 18.00 1
11 4 20 20.25 2
12 5 32 26.00 1
13 5 31 28.75 2
14 5 25 27.00 3
15 5 27 28.75 4
16 6 44 31.75 1
17 6 25 30.25 2
18 6 31 31.75 3
19 6 37 34.25 4
20 6 28 30.25 5
I would like to identify the first row in each group (unique) where the score is lower than the value on day 1.
I have tried this:
result <- df %>%
  group_by(unique.id) %>%
  filter(dailyMyoActivity < globaltma[globalflareday == 1])
But it doesn't seem to do exactly what I want it to do.
Is there a way of doing this?
If I understood your rationale correctly, and if your dataset is already ordered by day, this dplyr solution may come in handy
library(dplyr)
df %>%
  group_by(unique) %>%
  filter(score < value[day == 1]) %>%
  slice(1)
Output
# A tibble: 3 x 4
# Groups: unique [3]
# unique score value day
# <int> <int> <dbl> <int>
# 1 3 26 41 2
# 2 5 25 27 3
# 3 6 25 30.2 2
This could help:
library(dplyr)
df %>%
  group_by(unique) %>%
  mutate(Index = ifelse(score < value & day == 1, 1, 0))
# A tibble: 20 x 5
# Groups: unique [5]
unique score value day Index
<int> <int> <dbl> <int> <dbl>
1 2 52 33.8 1 0
2 2 39 36.2 2 0
3 3 47 41.2 1 0
4 3 26 41 2 0
5 3 17 32.2 3 0
6 3 22 28 4 0
7 3 11 19 5 0
8 3 9 14.8 6 0
9 3 20 15.5 7 0
10 4 32 18 1 0
11 4 20 20.2 2 0
12 5 32 26 1 0
13 5 31 28.8 2 0
14 5 25 27 3 0
15 5 27 28.8 4 0
16 6 44 31.8 1 0
17 6 25 30.2 2 0
18 6 31 31.8 3 0
19 6 37 34.2 4 0
20 6 28 30.2 5 0
Then you can filter by Index == 1.
We could also use slice
library(dplyr)
df1 %>%
  group_by(unique) %>%
  slice(which(score < value[day == 1])[1])
# A tibble: 3 x 4
# Groups: unique [3]
# unique score value day
# <int> <int> <dbl> <int>
#1 3 26 41 2
#2 5 25 27 3
#3 6 25 30.2 2
data
df1 <- structure(list(unique = c(2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L), score = c(52L, 39L,
47L, 26L, 17L, 22L, 11L, 9L, 20L, 32L, 20L, 32L, 31L, 25L, 27L,
44L, 25L, 31L, 37L, 28L), value = c(33.75, 36.25, 41.25, 41,
32.25, 28, 19, 14.75, 15.5, 18, 20.25, 26, 28.75, 27, 28.75,
31.75, 30.25, 31.75, 34.25, 30.25), day = c(1L, 2L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"))
Given that you asked to identify the first row that fulfills the criterion score < value, a new column giving the row number has been added:
result <- df %>%
  mutate(row_nr = row_number()) %>%
  group_by(unique) %>%
  filter(score < value) %>%
  slice(1)
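If you would rather avoid dplyr, a rough base R sketch of the same logic (my addition, assuming df1 as in the data block above; first_below is just an illustrative helper name):
first_below <- function(d) {
  hit <- which(d$score < d$value[d$day == 1])   # rows whose score is below the day-1 value
  if (length(hit)) d[hit[1], , drop = FALSE] else NULL
}
do.call(rbind, lapply(split(df1, df1$unique), first_below))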

Generate all possible subsets of a given set and do some calculations

I have a data frame that looks like this
subj trial factor rt
1 1 Early 324
1 2 Early 405
1 3 Early 293
1 4 Early 738
1 5 Late 310
1 6 Late 389
1 7 Late 350
1 8 Late 782
1 9 Late 513
1 10 Late 401
2 1 Early 420
2 2 Early 230
2 3 Early 309
2 4 Late 456
2 5 Late 241
2 6 Late 400
2 7 Late 189
2 8 Late 329
2 9 Late 519
2 10 Late 230
3 1 Early 299
3 2 Early 499
3 3 Late 403
3 4 Late 389
3 5 Late 356
3 6 Late 365
3 7 Late 234
3 8 Late 345
3 9 Late 300
3 10 Late 402
As you can see, there is an unequal number of trials for the two conditions.
What I want to do is, for each participant, calculate the number of trials per condition (for participant 1 it would be Early = 4 and Late = 6, for participant 2 it is Early = 3 and Late = 7, and for participant 3 it is Early = 2 and Late = 8).
The number of trials in the Early condition will determine the size of the subsets I want to generate. So, for participant 2, I want to generate all possible combinations of 3 trials out of the 7 trials in the Late condition and calculate a mean for each combination. I don't know if I'm explaining it correctly.
So it would go something like this. Since participant 2 only has 3 trials in the Early condition, I will calculate a mean rt score for those 3 trials. But for the Late condition, I want to generate all possible combinations of trials, like 4 5 6, 4 5 7, 4 5 8, 4 5 9, 4 5 10, 4 6 7, 4 6 8, 4 6 9, 4 6 10, etc., then calculate the mean rt score for each combination of trials, and then a general mean for the Late condition.
I know the expand.grid() function can help with the combination part, but I don't really know how to make the number of combinations be defined by the number of trials in the Early condition, since this will vary for each participant.
I don't know if I was clear enough, but I hope someone can help shed some light on it.
Thanks guys!
Here is a base R solution. You can define a customized function combavg to calculate the mean of each combination:
combavg <- function(x) {
  r <- data.frame(t(combn(which(x$factor == "Late"),
                          sum(x$factor == "Early"),
                          function(v) c(v, mean(x$rt[v])))))
  names(r)[ncol(r)] <- "rt.avg"
  r
}
and then use the following line to get the result
res <- Map(combavg, split(df, df$subj))
such that
> res
$`1`
X1 X2 X3 X4 rt.avg
1 5 6 7 8 457.75
2 5 6 7 9 390.50
3 5 6 7 10 362.50
4 5 6 8 9 498.50
5 5 6 8 10 470.50
6 5 6 9 10 403.25
7 5 7 8 9 488.75
8 5 7 8 10 460.75
9 5 7 9 10 393.50
10 5 8 9 10 501.50
11 6 7 8 9 508.50
12 6 7 8 10 480.50
13 6 7 9 10 413.25
14 6 8 9 10 521.25
15 7 8 9 10 511.50
$`2`
X1 X2 X3 rt.avg
1 4 5 6 365.6667
2 4 5 7 295.3333
3 4 5 8 342.0000
4 4 5 9 405.3333
5 4 5 10 309.0000
6 4 6 7 348.3333
7 4 6 8 395.0000
8 4 6 9 458.3333
9 4 6 10 362.0000
10 4 7 8 324.6667
11 4 7 9 388.0000
12 4 7 10 291.6667
13 4 8 9 434.6667
14 4 8 10 338.3333
15 4 9 10 401.6667
16 5 6 7 276.6667
17 5 6 8 323.3333
18 5 6 9 386.6667
19 5 6 10 290.3333
20 5 7 8 253.0000
21 5 7 9 316.3333
22 5 7 10 220.0000
23 5 8 9 363.0000
24 5 8 10 266.6667
25 5 9 10 330.0000
26 6 7 8 306.0000
27 6 7 9 369.3333
28 6 7 10 273.0000
29 6 8 9 416.0000
30 6 8 10 319.6667
31 6 9 10 383.0000
32 7 8 9 345.6667
33 7 8 10 249.3333
34 7 9 10 312.6667
35 8 9 10 359.3333
$`3`
X1 X2 rt.avg
1 3 4 396.0
2 3 5 379.5
3 3 6 384.0
4 3 7 318.5
5 3 8 374.0
6 3 9 351.5
7 3 10 402.5
8 4 5 372.5
9 4 6 377.0
10 4 7 311.5
11 4 8 367.0
12 4 9 344.5
13 4 10 395.5
14 5 6 360.5
15 5 7 295.0
16 5 8 350.5
17 5 9 328.0
18 5 10 379.0
19 6 7 299.5
20 6 8 355.0
21 6 9 332.5
22 6 10 383.5
23 7 8 289.5
24 7 9 267.0
25 7 10 318.0
26 8 9 322.5
27 8 10 373.5
28 9 10 351.0
DATA
df <- structure(list(subj = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), trial = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L), factor = c("Early", "Early", "Early",
"Early", "Late", "Late", "Late", "Late", "Late", "Late", "Early",
"Early", "Early", "Late", "Late", "Late", "Late", "Late", "Late",
"Late", "Early", "Early", "Late", "Late", "Late", "Late", "Late",
"Late", "Late", "Late"), rt = c(324L, 405L, 293L, 738L, 310L,
389L, 350L, 782L, 513L, 401L, 420L, 230L, 309L, 456L, 241L, 400L,
189L, 329L, 519L, 230L, 299L, 499L, 403L, 389L, 356L, 365L, 234L,
345L, 300L, 402L)), class = "data.frame", row.names = c(NA, -30L
))
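Since the question also asks for a general mean of the Late condition per participant, one possible follow-up (my addition, using the res object above) is to average the rt.avg column of each list element:
sapply(res, function(d) mean(d$rt.avg))   # one overall Late-condition mean per subject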
The following code splits the data set by subj and applies a function to each subset with lapply. This function, fun, uses combn to determine the combinations of indices where factor == "Late" and computes the mean of the rt values indexed by each combination.
fun <- function(DF) {
  n <- sum(DF[["factor"]] == "Early")
  late <- which(DF[["factor"]] == "Late")
  cmb <- combn(late, n)
  apply(cmb, 2, function(i) mean(DF[i, "rt"]))
}
sp <- split(df1, df1$subj)
lapply(sp, fun)
#$`1`
# [1] 457.75 390.50 362.50 498.50 470.50 403.25 488.75
# [8] 460.75 393.50 501.50 508.50 480.50 413.25 521.25
#[15] 511.50
#
#$`2`
# [1] 365.6667 295.3333 342.0000 405.3333 309.0000 348.3333
# [7] 395.0000 458.3333 362.0000 324.6667 388.0000 291.6667
#[13] 434.6667 338.3333 401.6667 276.6667 323.3333 386.6667
#[19] 290.3333 253.0000 316.3333 220.0000 363.0000 266.6667
#[25] 330.0000 306.0000 369.3333 273.0000 416.0000 319.6667
#[31] 383.0000 345.6667 249.3333 312.6667 359.3333
#
#$`3`
# [1] 396.0 379.5 384.0 318.5 374.0 351.5 402.5 372.5 377.0
#[10] 311.5 367.0 344.5 395.5 360.5 295.0 350.5 328.0 379.0
#[19] 299.5 355.0 332.5 383.5 289.5 267.0 318.0 322.5 373.5
#[28] 351.0
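As with the first answer, a per-subject general mean for the Late condition could then be obtained (my addition) by averaging each element of the result:
sapply(lapply(sp, fun), mean)   # overall mean of the combination means per subject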

How to sample from categorical variables in an R data.frame?

I am trying to sample from an R data frame, but I have some problems with the categorical variables.
I am not taking a random subsample of rows; rather, I am generating rows such that each new variable individually has the same distribution as the original one.
I am having problems with the categorical variables, as shown below.
> head(x0)
Symscore1 Symscore2 exercise3 exerciseduration3 groupchange age3
3 1 0 1 0 Transitional to Menopausal 52
4 0 0 5 2 Transitional to Menopausal 62
6 0 0 2 0 Transitional to Menopausal 54
8 0 0 5 3 Transitional to Menopausal 56
10 0 0 4 3 Transitional to Menopausal 59
13 0 1 4 3 Transitional to Menopausal 55
packyears bmi3 education3
3 2.357143 23.24380 Basic
4 2.000000 16.76574 University
6 1.000000 23.30668 Basic
8 1.428571 22.14533 University
10 1.428571 22.14533 University
13 0.000000 22.03857 University
> xa = as.data.frame(sapply(X = x0, FUN = sample))
> head(xa)
Symscore1 Symscore2 exercise3 exerciseduration3 groupchange age3 packyears
1 1 0 2 3 4 49 53.571430
2 0 0 3 0 3 46 2.142857
3 1 0 3 3 4 49 4.000000
4 0 1 3 3 4 58 0.000000
5 0 0 2 0 1 57 0.000000
6 0 0 3 0 1 47 26.871429
bmi3 education3
1 25.84777 2
2 21.25850 2
3 25.79592 3
4 23.93899 1
5 25.97012 2
6 23.53037 2
> X = rbind(x0,xa)
Warning messages:
1: In `[<-.factor`(`*tmp*`, ri, value = c(4, 3, 4, 4, 1, 1, 2, 4, 4, :
invalid factor level, NA generated
2: In `[<-.factor`(`*tmp*`, ri, value = c(2, 2, 3, 1, 2, 2, 3, 2, 2, :
invalid factor level, NA generated
>
You could try:
x2 <- x0
x2[] <- lapply(x0, FUN = sample)
x2
# Symscore1 Symscore2 exercise3 exerciseduration3 groupchange
#3 0 0 1 0 Transitional to Menopausal
#4 0 0 5 3 Transitional to Menopausal
#6 0 0 4 3 Transitional to Menopausal
#8 0 0 2 0 Transitional to Menopausal
#10 1 1 4 3 Transitional to Menopausal
#13 0 0 5 2 Transitional to Menopausal
age3
#3 54
#4 59
#6 52
#8 56
#10 62
#13 5
rbind(x0,x2)
data
x0 <- structure(list(Symscore1 = c(1L, 0L, 0L, 0L, 0L, 0L), Symscore2 = c(0L,
0L, 0L, 0L, 0L, 1L), exercise3 = c(1L, 5L, 2L, 5L, 4L, 4L), exerciseduration3 = c(0L,
2L, 0L, 3L, 3L, 3L), groupchange = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "Transitional to Menopausal", class = "factor"),
age3 = c(52L, 62L, 54L, 56L, 59L, 5L)), .Names = c("Symscore1",
"Symscore2", "exercise3", "exerciseduration3", "groupchange",
"age3"), class = "data.frame", row.names = c("3", "4", "6", "8",
"10", "13"))
