Apply same function to several data replicates in R - r

Consider the following data simulation mechanism:
set.seed(1)
simulW <- function(G)
{
# Let G be the number of groups
n<-2*G #Assume 2 individuals per group
i<-rep(1:G, rep(2,G)) # Group index
j<-rep (1:n)
Y<-rbinom(n, 1, 0.5) # binary
data.frame(id=1:n, i,Y)
}
r<-5 #5 replicates
dat1 <- replicate(r, simulW(G = 10 ), simplify=FALSE)
#For example the first data replicate will be
> dat1[[1]]
id i Y
1 1 1 0
2 2 1 1
3 3 2 0
4 4 2 0
5 5 3 0
6 6 3 0
7 7 4 0
8 8 4 1
9 9 5 1
10 10 5 0
The code below can perform group wise (i is the group) sum of Y but by default considers only the first replicate i.e dat1[[1]].
Di<-aggregate( Y, by=list ( i ),FUN=sum) #Sum per group for the first dataset
e<-colSums(Di [ 2 ] ) #Total sum of Y for all groups for dataset 1
> e
x
8
di<-Di [ 2 ] # Groupwise sum for replicate 1
> di
x
1 2
2 2
3 2
4 0
5 2
How can I use the same function to perform the group wise sum for the other replicates.
Maybe something like:
for (m in 1:r )
{
Di[m]<-
e[m]<-
di[m]<-
}

You may use aggregate in lapply -
result <- lapply(dat1, function(x) aggregate(Y~i, x, sum))
result
#[[1]]
# i Y
#1 1 1
#2 2 1
#3 3 0
#4 4 0
#5 5 1
#6 6 1
#7 7 0
#8 8 2
#9 9 1
#10 10 1
#[[2]]
# i Y
#1 1 2
#2 2 2
#3 3 2
#4 4 0
#5 5 2
#6 6 1
#7 7 0
#8 8 0
#9 9 1
#10 10 1
#...
#...

We may use tidyverse
library(purrr)
library(dplyr)
map(dat1, ~ .x %>%
group_by(i) %>%
summarise(Y = sum(Y)))
-output
[[1]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 0
2 2 2
3 3 1
4 4 2
5 5 1
6 6 0
7 7 1
8 8 1
9 9 2
10 10 1
[[2]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 0
8 8 2
9 9 1
10 10 1
[[3]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 2
2 2 2
3 3 2
4 4 0
5 5 2
6 6 1
7 7 0
8 8 0
9 9 1
10 10 1
[[4]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 1
2 2 0
3 3 1
4 4 1
5 5 1
6 6 1
7 7 0
8 8 1
9 9 1
10 10 2
[[5]]
# A tibble: 10 × 2
i Y
<int> <int>
1 1 1
2 2 0
3 3 1
4 4 1
5 5 0
6 6 0
7 7 2
8 8 2
9 9 0
10 10 2

Related

Replace row value in a data frame group by the smallest value in that group

I have the following data set:
time <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
value <- c(10,8,6,5,3,2,12,10,6,5,4,2,20,15,16,9,2,2)
group <- c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3)
data <- data.frame(time, value, group)
I want to create a new column called data$diff that is equal to data$value minus the value of data$value when data$time == 0 within each group.
I am beginning with the following code
for(i in 1:nrow(data)){
for(n in 1:max(data$group)){
if(data$group[i] == n) {
data$diff[i] <- ???????
}
}
}
But cannot figure out what to put in place of the question marks. The desired output would be this table: https://i.stack.imgur.com/1bAKj.png
Any thoughts are appreciated.
Since in your example data$time == 0 is always the first element of the group, you can use this data.table approach.
library(data.table)
setDT(data)
data[, diff := value[1] - value, by = group]
In case that data$time == 0 is not the first element in each group you can use this:
data[, diff := value[time==0] - value, by = group]
Output:
> data
time value group diff
1: 0 10 1 0
2: 1 8 1 2
3: 2 6 1 4
4: 3 5 1 5
5: 4 3 1 7
6: 5 2 1 8
7: 0 12 2 0
8: 1 10 2 2
9: 2 6 2 6
10: 3 5 2 7
11: 4 4 2 8
12: 5 2 2 10
13: 0 20 3 0
14: 1 15 3 5
15: 2 16 3 4
16: 3 9 3 11
17: 4 2 3 18
18: 5 2 3 18
Here is a base R approach.
within(data, diff <- ave(
seq_along(value), group,
FUN = \(i) value[i][time[i] == 0] - value[i]
))
Output
time value group diff
1 0 10 1 0
2 1 8 1 2
3 2 6 1 4
4 3 5 1 5
5 4 3 1 7
6 5 2 1 8
7 0 12 2 0
8 1 10 2 2
9 2 6 2 6
10 3 5 2 7
11 4 4 2 8
12 5 2 2 10
13 0 20 3 0
14 1 15 3 5
15 2 16 3 4
16 3 9 3 11
17 4 2 3 18
18 5 2 3 18
Here is a short way to do it with dplyr.
library(dplyr)
data %>%
group_by(group) %>%
mutate(diff = value[which(time == 0)] - value)
Which gives
# Groups: group [3]
time value group diff
<dbl> <dbl> <dbl> <dbl>
1 0 10 1 0
2 1 8 1 2
3 2 6 1 4
4 3 5 1 5
5 4 3 1 7
6 5 2 1 8
7 0 12 2 0
8 1 10 2 2
9 2 6 2 6
10 3 5 2 7
11 4 4 2 8
12 5 2 2 10
13 0 20 3 0
14 1 15 3 5
15 2 16 3 4
16 3 9 3 11
17 4 2 3 18
18 5 2 3 18
library(dplyr)
vals2use <- data %>%
group_by(group) %>%
filter(time==0) %>%
select(c(2,3)) %>%
rename(value4diff=value)
dataNew <- merge(data, vals2use, all=T)
dataNew$diff <- dataNew$value4diff-dataNew$value
dataNew <- dataNew[,c(1,2,3,5)]
dataNew
group time value diff
1 1 0 10 0
2 1 1 8 2
3 1 2 6 4
4 1 3 5 5
5 1 4 3 7
6 1 5 2 8
7 2 0 12 0
8 2 1 10 2
9 2 2 6 6
10 2 3 5 7
11 2 4 4 8
12 2 5 2 10
13 3 0 20 0
14 3 1 15 5
15 3 2 16 4
16 3 3 9 11
17 3 4 2 18
18 3 5 2 18

Retrieve a value by another column criteria in R

i need some help:
i got this df:
df <- data.frame(month = c(1,1,1,1,1,2,2,2,2,2),
day = c(1,2,3,4,5,1,2,3,4,5),
flow = c(2,5,7,8,5,4,6,7,9,2))
month day flow
1 1 1 2
2 1 2 5
3 1 3 7
4 1 4 8
5 1 5 5
6 2 1 4
7 2 2 6
8 2 3 7
9 2 4 9
10 2 5 2
but i want to know the day of min per month:
month day flow dayminflowofthemonth
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
this repetition is not a problem, i will use pivot fuction
tks people!
We can use which.min to return the index of 'min'imum 'flow' per group and use that to get the corresponding 'day' to create the column with mutate
library(dplyr)
df <- df %>%
group_by(month) %>%
mutate(dayminflowofthemonth = day[which.min(flow)]) %>%
ungroup
-output
df
# A tibble: 10 x 4
# month day flow dayminflowofthemonth
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 1
# 2 1 2 5 1
# 3 1 3 7 1
# 4 1 4 8 1
# 5 1 5 5 1
# 6 2 1 4 5
# 7 2 2 6 5
# 8 2 3 7 5
# 9 2 4 9 5
#10 2 5 2 5
Another option using indexing inside dplyr pipeline:
library(dplyr)
#Code
newdf <- df %>% group_by(month) %>% mutate(Val=day[flow==min(flow)][1])
Output:
# A tibble: 10 x 4
# Groups: month [2]
month day flow Val
<dbl> <dbl> <dbl> <dbl>
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
Here is a base R option using ave
transform(
df,
dayminflowofthemonth = ave(day*(ave(flow,month,FUN = min)==flow),month,FUN = max)
)
which gives
month day flow dayminflowofthemonth
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
One more base R approach:
df$dayminflowofthemonth <- by(
df,
df$month,
function(x) x$day[which.min(x$flow)]
)[df$month]

Adding sequence of numbers to the data

Hi I have a data frame like this
df <-data.frame(x=rep(rep(seq(0,3),each=2),2 ),gr=gl(2,8))
x gr
1 0 1
2 0 1
3 1 1
4 1 1
5 2 1
6 2 1
7 3 1
8 3 1
9 0 2
10 0 2
11 1 2
12 1 2
13 2 2
14 2 2
15 3 2
16 3 2
I want to add a new column numbering sequence of numbers when the x value ==0
I tried
library(dplyr)
df%>%
group_by(gr)%>%
mutate(numbering=seq(2,8,2))
Error in mutate_impl(.data, dots) :
Column `numbering` must be length 8 (the group size) or one, not 4
?
Just for side note mutate(numbering=rep(seq(2,8,2),each=2)) would work for this minimal example but for the general case its better to look x value change from 0!
the expected output
x gr numbering
1 0 1 2
2 0 1 2
3 1 1 4
4 1 1 4
5 2 1 6
6 2 1 6
7 3 1 8
8 3 1 8
9 0 2 2
10 0 2 2
11 1 2 4
12 1 2 4
13 2 2 6
14 2 2 6
15 3 2 8
16 3 2 8
Do you mean something like this?
library(tidyverse);
df %>%
group_by(gr) %>%
mutate(numbering = cumsum(c(1, diff(x) != 0)))
## A tibble: 16 x 3
## Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 1.
# 2 0 1 1.
# 3 1 1 2.
# 4 1 1 2.
# 5 2 1 3.
# 6 2 1 3.
# 7 3 1 4.
# 8 3 1 4.
# 9 0 2 1.
#10 0 2 1.
#11 1 2 2.
#12 1 2 2.
#13 2 2 3.
#14 2 2 3.
#15 3 2 4.
#16 3 2 4.
Or if you must have a numbering sequence 2,4,6,... instead of 1,2,3,... you can do
df %>%
group_by(gr) %>%
mutate(numering = 2 * cumsum(c(1, diff(x) != 0)));
## A tibble: 16 x 3
## Groups: gr [2]
# x gr numering
# <int> <fct> <dbl>
# 1 0 1 2.
# 2 0 1 2.
# 3 1 1 4.
# 4 1 1 4.
# 5 2 1 6.
# 6 2 1 6.
# 7 3 1 8.
# 8 3 1 8.
# 9 0 2 2.
#10 0 2 2.
#11 1 2 4.
#12 1 2 4.
#13 2 2 6.
#14 2 2 6.
#15 3 2 8.
#16 3 2 8.
Here is an option using match to get the index and then pass on the seq values to fill
df %>%
group_by(gr) %>%
mutate(numbering = seq(2, length.out = n()/2, by = 2)[match(x, unique(x))])
# A tibble: 16 x 3
# Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 2
# 2 0 1 2
# 3 1 1 4
# 4 1 1 4
# 5 2 1 6
# 6 2 1 6
# 7 3 1 8
# 8 3 1 8
# 9 0 2 2
#10 0 2 2
#11 1 2 4
#12 1 2 4
#13 2 2 6
#14 2 2 6
#15 3 2 8
#16 3 2 8

How to calculate recency in R

I have the following data:
set.seed(20)
round<-rep(1:10,2)
part<-rep(1:2, c(10,10))
game<-rep(rep(1:2,c(5,5)),2)
pay1<-sample(1:10,20,replace=TRUE)
pay2<-sample(1:10,20,replace=TRUE)
pay3<-sample(1:10,20,replace=TRUE)
decs<-sample(1:3,20,replace=TRUE)
previous_max<-c(0,1,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,1,0)
gamematrix<-cbind(part,game,round,pay1,pay2,pay3,decs,previous_max )
gamematrix<-data.frame(gamematrix)
Here is the output:
part game round pay1 pay2 pay3 decs previous_max
1 1 1 1 9 5 6 2 0
2 1 1 2 8 1 1 1 1
3 1 1 3 3 5 5 3 0
4 1 1 4 6 1 5 1 0
5 1 1 5 10 3 8 3 0
6 1 2 6 10 1 5 1 0
7 1 2 7 1 10 7 3 0
8 1 2 8 1 10 8 2 1
9 1 2 9 4 1 5 1 0
10 1 2 10 4 7 7 2 0
11 2 1 1 8 4 1 1 0
12 2 1 2 8 5 5 2 0
13 2 1 3 1 9 3 1 1
14 2 1 4 8 2 10 2 1
15 2 1 5 2 6 2 3 1
16 2 2 6 5 5 6 2 0
17 2 2 7 4 5 1 2 0
18 2 2 8 2 10 5 2 1
19 2 2 9 3 7 3 2 1
20 2 2 10 9 3 1 1 0
How can I calculate a new indicator variable "previous_max",which returns whether in the next round of the same game, the same participant choose the maximal payoff from the previous round.
So I want something like follows:
Participant (part) 1:
In the first round of each game, previous_max is "0" (no previous round), in round 2, previous_max ="1", because in round 1, the maximal pay was max(pay1,pay2,pay3)=max(9,5,6)=9, and in round 2, the participant's decisions (decs) was 1 (which was the maximal value in previous round).
In round 3, previous_max=0, because the maximal value in round 2 was 8 (which is "pay1"), but the participant choose "3" (which is pay3).
Here's a solution using dplyr and purr::map.
I would have preferred to use group_by than split but max.col ignores groups and I don't know of a dplyr equivalent`.
the output is slightly different but I think it's because of your mistakes, please explain if not and I'll update my answer.
library(purrr)
library(dplyr)
gamematrix %>%
split(.$part) %>%
map(~ .x %>% mutate(
prev_max = as.integer(
decs ==
c(0,max.col(.[c("pay1","pay2","pay3")])[-n()]) # the number of the max columns, offset by one
))) %>%
bind_rows
# ` part game round pay1 pay2 pay3 decs prev_max
# 1 1 1 1 9 5 6 2 0
# 2 1 1 2 8 1 1 1 1
# 3 1 1 3 3 5 5 3 0
# 4 1 1 4 6 1 5 1 0
# 5 1 1 5 10 3 8 3 0
# 6 1 2 6 10 1 5 1 1
# 7 1 2 7 1 10 7 3 0
# 8 1 2 8 1 10 8 2 1
# 9 1 2 9 4 1 5 1 0
# 10 1 2 10 4 7 7 2 0
# 11 2 1 1 8 4 1 1 0
# 12 2 1 2 8 5 5 2 0
# 13 2 1 3 1 9 3 1 1
# 14 2 1 4 8 2 10 2 1
# 15 2 1 5 2 6 2 3 1
# 16 2 2 6 5 5 6 2 1
# 17 2 2 7 4 5 1 2 0
# 18 2 2 8 2 10 5 2 1
# 19 2 2 9 3 7 3 2 1
# 20 2 2 10 9 3 1 1 0

count positive negative values in column by group

I want to create two variables giving me the total number of positive and negative values by id, hopefully using dplyr.
Example data:
library(dplyr)
set.seed(42)
df <- data.frame (id=rep(1:10,each=10),
ff=rnorm(100, 0,14 ))
> head(df,20)
id ff
1 1 19.1934183
2 1 -7.9057744
3 1 5.0837978
4 1 8.8600765
5 1 5.6597565
6 1 -1.4857432
7 1 21.1613080
8 1 -1.3252265
9 1 28.2579320
10 1 -0.8779974
11 2 18.2681752
12 2 32.0130355
13 2 -19.4440498
14 2 -3.9030427
15 2 -1.8664987
16 2 8.9033056
17 2 -3.9795409
18 2 -37.1903759
19 2 -34.1665370
20 2 18.4815868
the resulting dataset should look like:
> head(df,20)
id ff pos neg
1 1 19.1934183 6 4
2 1 -7.9057744 6 4
3 1 5.0837978 6 4
4 1 8.8600765 6 4
5 1 5.6597565 6 4
6 1 -1.4857432 6 4
7 1 21.1613080 6 4
8 1 -1.3252265 6 4
9 1 28.2579320 6 4
10 1 -0.8779974 6 4
11 2 18.2681752 4 6
12 2 32.0130355 4 6
13 2 -19.4440498 4 6
14 2 -3.9030427 4 6
15 2 -1.8664987 4 6
16 2 8.9033056 4 6
17 2 -3.9795409 4 6
18 2 -37.1903759 4 6
19 2 -34.1665370 4 6
20 2 18.4815868 4 6
I have thought something similar to this will work:
df<-df%>% group_by(id) %>% mutate(pos= nrow(ff>0)) %>% ungroup()
Any help would be great, thanks.
You need sum():
df %>% group_by(id) %>%
mutate(pos = sum(ff>0),
neg = sum(ff<0))
For a fun (and a fast) solution data.table can also be used:
library(data.table)
setDT(df)
df[, ":="(pos = sum(ff > 0), neg = sum(ff < 0)), by = id]
Here's an answer that add the ifelse part of your question:
df <- df %>% group_by(id) %>%
mutate(pos = sum(ff>0), neg = sum(ff<0)) %>%
group_by(id) %>%
mutate(any_neg=ifelse(any(ff < 0), 1, 0))
Output:
> head(df, 20)
Source: local data frame [20 x 5]
Groups: id [2]
id ff pos neg any_neg
<int> <dbl> <int> <int> <dbl>
1 1 19.1934183 6 4 1
2 1 -7.9057744 6 4 1
3 1 5.0837978 6 4 1
4 1 8.8600765 6 4 1
5 1 5.6597565 6 4 1
6 1 -1.4857432 6 4 1
7 1 21.1613080 6 4 1
8 1 -1.3252265 6 4 1
9 1 28.2579320 6 4 1
10 1 -0.8779974 6 4 1
11 2 18.2681752 4 6 1
12 2 32.0130355 4 6 1
13 2 -19.4440498 4 6 1
14 2 -3.9030427 4 6 1
15 2 -1.8664987 4 6 1
16 2 8.9033056 4 6 1
17 2 -3.9795409 4 6 1
18 2 -37.1903759 4 6 1
19 2 -34.1665370 4 6 1
20 2 18.4815868 4 6 1

Resources