dplyr differences between pairs in nested groups - r

I'd like to use dplyr to calculate differences in value between people nested in pair by session.
dat <- data.frame(person=c(rep(1, 10),
rep(2, 10),
rep(3, 10),
rep(4, 10),
rep(5, 10),
rep(6, 10),
rep(7, 10),
rep(8, 10)),
pair=c(rep(1, 20),
rep(2, 20),
rep(3, 20),
rep(4, 20)),
condition=c(rep("NEW", 10),
rep("OLD", 10),
rep("NEW", 10),
rep("OLD", 10),
rep("NEW", 10),
rep("OLD", 10),
rep("NEW", 10),
rep("OLD", 10)),
session=rep(seq(from=1, to=10, by=1), 8),
value=c(0, 2, 4, 8, 16, 16, 18, 20, 20, 20,
0, 1, 1, 2, 4, 5, 8, 12, 15, 15,
0, 2, 8, 10, 15, 16, 18, 20, 20, 20,
0, 4, 4, 6, 6, 8, 10, 12, 12, 18,
0, 6, 8, 10, 16, 16, 18, 20, 20, 20,
0, 2, 2, 3, 4, 8, 8, 8, 10, 12,
0, 10, 12, 16, 18, 18, 18, 20, 20, 20,
0, 2, 2, 8, 10, 10, 11, 12, 15, 20)
)
For instance, person 1 and 2 make a pair (pair==1):
person==1 & session==2: 2
person==2 & session==2: 1
Difference (NEW-OLD) is 2-1=1.
Here's what I have tried so far. I think I need to group_by() first and then summarise(), but I have not cracked this nut.
dat %>%
mutate(session = factor(session)) %>%
group_by(condition, pair, session) %>%
summarise(pairDiff = value-first(value))
Desired output:

Your output can be obtained by:
dat %>% group_by(pair,session) %>% arrange(condition) %>% summarise(diff = -diff(value))
Source: local data frame [40 x 3]
Groups: pair [?]
# A tibble: 40 x 3
pair session diff
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 1
3 1 3 3
4 1 4 6
5 1 5 12
6 1 6 11
7 1 7 10
8 1 8 8
9 1 9 5
10 1 10 5
# ... with 30 more rows
The arrange ensures that NEW and OLD are in the correct positions, but the solution does depend on there being exactly 2 values for each combination of pair and session.

You can spread condition to headers and then do the subtraction NEW - OLD:
library(dplyr); library(tidyr)
dat %>%
select(-person) %>%
spread(condition, value) %>%
mutate(diff = NEW - OLD) %>%
select(session, pair, diff)
# A tibble: 40 x 3
# session pair diff
# <dbl> <dbl> <dbl>
# 1 1 1 0
# 2 2 1 1
# 3 3 1 3
# 4 4 1 6
# 5 5 1 12
# 6 6 1 11
# 7 7 1 10
# 8 8 1 8
# 9 9 1 5
#10 10 1 5
# ... with 30 more rows

Related

Taking the mean of a column within a function and a for loop

I have the below function :
compute_treatment_effects <- function(dataset, outcome, baseline_outcome,
covariates,
standardize){
baseline_covariates <- c(baseline_outcome, covariates)
dataset <- dataset %>%
mutate(treat =ifelse(treatment_group == "trt", 1,
ifelse(treatment_group == "control", 0, NA))) %>%
filter(!is.na(treat))
if (standardize){
dataset[,outcome] <- (dataset[,outcome] - mean(dataset[dataset$treat==0,outcome], na.rm=TRUE))/
sd(dataset[dataset$treat==0,outcome], na.rm=TRUE)
}
}
Now the issue, is whenever it gets to the standardization procedure, I get an error :
"Error in is.data.frame(x) :
'list' object cannot be coerced to type 'double'
In addition: Warning message:
In mean.default(dataset[dataset$treat == 0, outcome], na.rm = TRUE)"
I am really not sure why this is the case, I dont believe the syntax is wrong anywhere ?
Here is an example of a dataframe to use with the code:
dataframe <- data.frame("var1" = c(1, 2, 5, 1, 642, 5, 1, 2, 5, 9, NA, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 ),
"Var2" = c(1, 3, 5, 1, 642, 5, NA, NA, NA, NA, NA, NA, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 ),
"var3" = c(1, 2, 635, 9, NA, 1, 2, 5, NA, NA, 12, NA, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
"var4" = c(1, 21, 15, 19, NA, 1, 26656, 56,6 , NA, 512, NA, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
"cov1" = c(1, 22,335, 29, NA, NA, NA, 645, NA, NA, 12, NA, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
"cov2" = c(44251, 2322,5, 29, 45, 35, 42, 645, 55, 525, NA, NA, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
"cov3" = c(154, 2552,35, 53529, 5, 3, 53542, 645, 25, 2, 12, 23, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10))
dataframe <- dataframe %>%
mutate(treatment_group = ifelse(var3 == 2, "trt", ifelse(var3 == 10, "control", NA)))
dataset <- dataframe
outcome <- "Var2"
baseline_outcome <- "var1"
covariates = c("cov1", "cov2","cov3")
Thank you so much!!!
It is possible that the OP's original dataset is either tibble or data.table because both of them doesn't subset the columns into a vector when we do , column as it is drop = FALSE in both when compared to data.frame (which is drop = TRUE)
> compute_treatment_effects(as_tibble(dataset), outcome, baseline_outcome, covariates, standardize = TRUE)
Error in is.data.frame(x) :
'list' object cannot be coerced to type 'double'
In addition: Warning message:
In mean.default(dataset[dataset$treat == 0, outcome], na.rm = TRUE) :
argument is not numeric or logical: returning NA
The fix would be to either convert to data.frame with as.data.frame
compute_treatment_effects(as.data.frame(dataset), outcome, baseline_outcome, covariates, standardize = TRUE)
-output
var1 Var2 var3 var4 cov1 cov2 cov3 treatment_group treat
1 2 -Inf 2 21 22 2322 2552 trt 1
2 1 NA 2 26656 NA 42 53542 trt 1
3 10 NaN 10 10 10 10 10 control 0
4 10 NaN 10 10 10 10 10 control 0
5 10 NaN 10 10 10 10 10 control 0
6 10 NaN 10 10 10 10 10 control 0
7 10 NaN 10 10 10 10 10 control 0
8 10 NaN 10 10 10 10 10 control 0
9 10 NaN 10 10 10 10 10 control 0
10 10 NaN 10 10 10 10 10 control 0
11 10 NaN 10 10 10 10 10 control 0
12 10 NaN 10 10 10 10 10 control 0
Or make the changes in the function by using [[ instead of [ for subsetting the column i.e.
compute_treatment_effects <- function(dataset, outcome, baseline_outcome,
covariates,
standardize){
baseline_covariates <- c(baseline_outcome, covariates)
dataset <- dataset %>%
mutate(treat =ifelse(treatment_group == "trt", 1,
ifelse(treatment_group == "control", 0, NA))) %>%
filter(!is.na(treat))
if (standardize){
dataset[[outcome]] <- (dataset[[outcome]] -
mean(dataset[[outcome]][dataset$treat==0], na.rm=TRUE))/
sd(dataset[[outcome]][dataset$treat==0], na.rm=TRUE)
}
dataset
}

applying mean to a list after weighting

I have a list named df which contains three iterations with two years of projection.
What I want is: weighting the variable "district" just for year 2 in each iteration and finally I want to have mean of each weighted district for all three iterations. Note that each year has a variable named "weight" that weighting should be based on this variable.
iteration1 <- list(year1 = data.frame(age = c(10, 11, 12, 13),
district = c(1, 2, 3, 4),
gender = c(1, 2, 2, 1),
weight = c(12.2, 11.3, 11.2, 10.1)),
year2 = data.frame(age = c(10, 11, 12, 13, 10, 10),
district = c(1, 2, 3, 4, 2, 1),
gender = c(1, 2, 2, 1, 1, 1),
weight = c(12.2, 11.3, 11.2, 10.1, 12.2, 13.1)))
iteration2 <- list(year1 = data.frame(age = c(10, 11, 12, 13),
district = c(1, 2, 3, 4),
gender = c(2, 2, 1, 1),
weight = c(12.2, 11.3, 11.2, 10.1)),
year2 = data.frame(age = c(10, 11, 12, 13, 13, 13, 12),
district = c(1, 2, 3, 4, 1, 3, 3),
gender = c(2, 2, 1, 1, 2, 2, 2),
weight = c(12.2, 11.3, 11.2, 10.1, 10.9, 11.9, 15.1)))
iteration3 <- list(year1 = data.frame(age = c(10, 11, 12, 13),
district = c(1, 2, 3, 4),
gender = c(2, 2, 1, 1),
weight = c(12.2, 11.3, 11.2, 10.1)),
year2 = data.frame(age = c(10, 11, 12, 13, 10, 10, 11, 12),
district = c(1, 2, 3, 4, 4, 3, 2, 2),
gender = c(2, 2, 1, 1, 2, 2, 1, 2),
weight = c(12.2, 11.3, 11.2, 10.1, 13.5, 12.8, 13.9, 14.9)))
df <- list(iteration1 = iteration1, iteration2 = iteration2, iteration3 = iteration3)
Expected output:
district mean of each district for all three iterations
1 20.2
2 24.96
3 24.46
4 14.6
for calculating my expected output I have followed two steps. in first step، I have weighted year 2 in each iteration by wtd.table(df$iteration1$year2$district,weights=df$iteration1$year2$weight) . I repeated this code for three times (because I have three iterations). here is my output:
1 2 3 4
25.3 23.5 11.2 10.1
1 2 3 4
23.1 11.3 38.2 10.1
1 2 3 4
12.2 40.1 24.0 23.6
in second step, I calculate mean of each district for three iterations manually: mean(25.3,23.1,12.2)
data.table approach
library(data.table)
library(questionr)
ans <- rbindlist(
lapply(df, function(x)
as.data.table(
questionr::wtd.table(x[["year2"]]$district,
weights = x[["year2"]]$weight))),
use.names = TRUE, fill = TRUE)
# Summarise
ans[, .(weight = mean(N, na.rm = TRUE)), by = .(district = V1)]
# district weight
# 1: 1 20.20000
# 2: 2 24.96667
# 3: 3 24.46667
# 4: 4 14.60000
Version 2
With updated columns based on TS's comment below
ans <- rbindlist(
lapply(df, function(x)
as.data.table(
questionr::wtd.table(x = x[["year2"]]$district,
y = x[["year2"]]$gender,
weights = x[["year2"]]$weight) ) ),
use.names = TRUE, fill = TRUE )
# Summarise
ans[, .(n = .N,
mean = mean(N, na.rm = TRUE),
sd = sd(N, na.rm = TRUE)),
by = .(district = V1, gender = V2)]
# district gender n mean sd
# 1: 1 1 3 8.433333 14.606962
# 2: 2 1 3 8.700000 7.582216
# 3: 3 1 3 7.466667 6.466323
# 4: 4 1 3 10.100000 0.000000
# 5: 1 2 3 11.766667 11.556095
# 6: 2 2 3 16.266667 8.602519
# 7: 3 2 3 17.000000 8.697126
# 8: 4 2 3 4.500000 7.794229
Combine the list of dataframes into one and calculate average weight using questionr::wtd.table for each district and iteration in year2. Finally, get aggregated mean for each district.
Using tidyverse you can do -
library(dplyr)
library(purrr)
map_df(df, ~bind_rows(.x, .id = 'year'), .id = 'iter') %>%
filter(year == 'year2') %>%
group_by(district, iter) %>%
summarise(result = questionr::wtd.table(district,weights=weight)) %>%
summarise(result = mean(result))
# district result
# <dbl> <dbl>
#1 1 20.2
#2 2 25.0
#3 3 24.5
#4 4 14.6

Obtain mean vector for each parameter combination using R

I have a cvs file that has the following structure (minimum example):
ID Variable Vector
1 a [0,0,0]
2 a [1,2,3]
1 a [1,1,2]
2 a [1,2,3]
1 b [0,0,0]
2 b [1,1,1]
1 b [0,0,1]
2 b [3,5,7]
I would like to calculate the mean vector for each combination of parameters (in this case, ID and Variable). That is, I want to obtain a dataframe like the following one:
ID Variable Vector
1 a [0.5,0.5,1]
2 a [1,2,3]
1 b [0,0,0.5]
2 b [2,3,4]
I have generated this csv file with Python, that's why I have that structure with brackets. But I do not know how to start to do this using R. It doesn't seem to be a common data structure.
Update:
Vector variable structure (obtained from dput(head(data, 8))
Vector = c("[3, 16, 14, 5, 6, 13, 17, 7, 13, 6]",
"[7, 12, 6, 10, 6, 5, 16, 9, 19, 10]", "[4, 13, 4, 11, 6, 15, 17, 10, 12, 8]",
"[18, 11, 16, 8, 10, 10, 7, 4, 9, 7]", "[9, 9, 10, 17, 8, 13, 3, 13, 8, 10]",
"[17, 12, 7, 13, 6, 13, 8, 9, 5, 10]", "[9, 6, 14, 10, 8, 4, 8, 14, 15, 12]",
"[7, 13, 8, 10, 16, 8, 13, 13, 8, 4]")), row.names = c(NA, 8L
), class = "data.frame")
Assuming the 'Vector' column is a list, after grouping by 'ID', 'Variable', we reduce the 'Vector' by adding (+) the corresponding elements together and then divide by the total number of elements (n()) in that group
library(dplyr)
library(purrr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list(reduce(Vector, `+`)/n()), .groups = 'drop')
-output
out
# A tibble: 4 x 3
# ID Variable Vector
# <dbl> <chr> <list>
#1 1 a <dbl [3]>
#2 1 b <dbl [3]>
#3 2 a <dbl [3]>
#4 2 b <dbl [3]>
out$Vector
#[[1]]
#[1] 0.5 0.5 1.0
#[[2]]
#[1] 0.0 0.0 0.5
#[[3]]
#[1] 1 2 3
#[[4]]
#[1] 2 3 4
If the column 'Vector' is a character string, an option is to extract the numeric part into a list
library(stringr)
out <- df1 %>%
group_by(ID, Variable) %>%
summarise(Vector = list((str_extract_all(Vector, "\\d+") %>%
map(as.numeric) %>% reduce(`+`))/n()), .groups = 'drop')
data
df1 <- structure(list(ID = c(1, 2, 1, 2, 1, 2, 1, 2), Variable = c("a",
"a", "a", "a", "b", "b", "b", "b"), Vector = structure(list(c(0,
0, 0), c(1, 2, 3), c(1, 1, 2), c(1, 2, 3), c(0, 0, 0), c(1, 1,
1), c(0, 0, 1), c(3, 5, 7)), class = "AsIs")), class = "data.frame",
row.names = c(NA,
-8L))

Can I create many categories of one variable based in two other conditions in r? [duplicate]

This question already has answers here:
How collect additional row data on binned data in R
(1 answer)
Group value in range r
(3 answers)
Closed 3 years ago.
I am doing a statistic analysis in a big data frame (more than 48.000.000 rows) in r. Here is an exemple of the data:
structure(list(herd = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), cows = c(1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16), `date` = c("11/03/2013",
"12/03/2013", "13/03/2013", "14/03/2013", "15/03/2013", "16/03/2013",
"13/05/2012", "14/05/2012", "15/05/2012", "16/05/2012", "17/05/2012",
"18/05/2012", "10/07/2016", "11/07/2016", "12/07/2016", "13/07/2016",
"11/03/2013", "12/03/2013", "13/03/2013", "14/03/2013", "15/03/2013",
"16/03/2013", "13/05/2012", "14/05/2012", "15/05/2012", "16/05/2012",
"17/05/2012", "18/05/2012", "10/07/2016", "11/07/2016", "12/07/2016",
"13/07/2016", "11/03/2013", "12/03/2013", "13/03/2013", "14/03/2013",
"15/03/2013", "16/03/2013", "13/05/2012", "14/05/2012", "15/05/2012",
"16/05/2012", "17/05/2012", "18/05/2012", "10/07/2016", "11/07/2016",
"12/07/2016", "13/07/2016"), glicose = c(240666, 23457789, 45688688,
679, 76564, 6574553, 78654, 546432, 76455643, 6876, 7645432,
876875, 98654, 453437, 98676, 9887554, 76543, 9775643, 986545,
240666, 23457789, 45688688, 679, 76564, 6574553, 78654, 546432,
76455643, 6876, 7645432, 876875, 98654, 453437, 98676, 9887554,
76543, 9775643, 986545, 240666, 23457789, 45688688, 679, 76564,
6574553, 78654, 546432, 76455643, 6876)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -48L))
I need to identify how many cows are in the following category of glicose by herd and by date:
<=100000
100000 and <=150000
150000 and <=200000
200000 and <=250000
250000 and <=400000
>400000
I tried to use the functions filter() and select() but could not categorize the variable like that.
I tried either to make a vector for each category but it did not work:
ht <- df %>% group_by(herd, date) %>%
filter(glicose < 100000)
Actually I do not have a clue of how I could do this. Please help!
I expect to get the number of cows in each category of each herd based on each date in a table like this:
Calling your data df,
df %>%
mutate(glicose_group = cut(glicose, breaks = c(0, seq(1e5, 2.5e5, by = 0.5e5), 4e5, Inf)),
date = as.Date(date, format = "%d/%m/%Y")) %>%
group_by(herd, date, glicose_group) %>%
count
# # A tibble: 48 x 4
# # Groups: herd, date, glicose_group [48]
# herd date glicose_group n
# <dbl> <date> <fct> <int>
# 1 1 2012-05-13 (0,1e+05] 1
# 2 1 2012-05-14 (4e+05,Inf] 1
# 3 1 2012-05-15 (4e+05,Inf] 1
# 4 1 2012-05-16 (0,1e+05] 1
# 5 1 2012-05-17 (4e+05,Inf] 1
# 6 1 2012-05-18 (4e+05,Inf] 1
# 7 1 2013-03-11 (2e+05,2.5e+05] 1
# 8 1 2013-03-12 (4e+05,Inf] 1
# 9 1 2013-03-13 (4e+05,Inf] 1
# 10 1 2013-03-14 (0,1e+05] 1
# # ... with 38 more rows
I also threw in a conversion to Date class, which is probably a good idea.

How to generate sequence with exclusions in R

I need 4 functions that generate some numbers (each)
First function generates sequence from n odd numbers except 5, 15, 25, etc...
example with n=2: 1, 1, 3, 3, 7, 7, 9, 9, 11, 11, 13, 13, 17, 17,...
Second function generates sequence from n even numbers except 10, 20, 30, etc...
example witn n=2: 2, 2, 4, 4, 6, 6, 8, 8, 12, 12, 14, 14, 16, 16,...
Third function generates sequence from n numbers from 5 by 10
example witn n=2: 5, 5, 15, 15, 25, 25,...
Fourth function generates sequence from n numbers from 10 by 10
example witn n=2: 10, 10, 20, 20, 30, 30,...
Each function has to get vector 1: N and n as inputs.
For example,
f1(1:10, 3)
> 1, 1, 1, 3, 3, 3, 7, 7, 7, 9
f2(1:5, 10)
> 2, 2, 2, 2, 2
f3(1:15, 5)
> 5, 5, 5, 5, 5, 15, 15, 15, 15, 15, 25, 25, 25, 25, 25
f4(1:2, 1)
> 10, 20
I have some decision for first two functions but I don`t know how to exclude some numbers:
f1 <- function(x) 2*((x-1) %/% 10) + 1 # goes 1, 3, 5, etc for n = 10
f2 <- function(x) 2*((x-1) %/% 10 + 1) # goes 2, 4, 6, etc for n = 10
why not use seq and rep ?
n = 25
nrep = 2 # number of repetitions
by5 <- sort(rep(seq(5, n, by = 10), nrep )) # numbers from 5 by 10
by5
by10 <- sort(rep(seq(10, n, by = 10), nrep )) # numbers from 10 by 10
by10
odd <- sort(rep(seq(1, n, by = 2), nrep )) # odd number
odd[!odd %in% by5] # remove all the by5 values
even <- sort(rep(seq(2, n, by = 2), nrep )) # Even numbers
even[!even %in% by10] # remove all the by 10 values
output
> [1] 5 5 15 15 25 25
> [1] 10 10 20 20
> [1] 1 1 3 3 7 7 9 9 11 11 13 13 17 17 19 19 21 21 23 23
> [1] 2 2 4 4 6 6 8 8 12 12 14 14 16 16 18 18 22 22 24 24.

Resources