I need to extract a sample that has an equal distribution across experience-level groups. For context, there are 4 groups in total (1, 2, 3, and 4 years of experience) and 8 people in total (A, B, C, D, E, F, G, H) in this example scenario. I was trying to write a function with loops but don't know how. Please help me out! Thank you! :)
library(tidyverse)
data <- tibble(
  id = c("A","A","A","B","B","C","C","D","D","D","D","E","E","E","E","F","F","G","G","G","H","H","H","H"),
  year_exp = c(1,2,3,1,2,1,2,1,2,3,4,1,2,3,4,1,2,1,2,3,1,2,3,4),
  pre_year_exp = year_exp - 1
)
data_0 <- data %>% filter(year_exp == max(year_exp) - 0) %>% sample_n(2)
data_1 <- data %>% filter(year_exp == max(year_exp) - 1) %>% anti_join(data_0, by = 'id') %>% sample_n(2)
data_2 <- data %>% filter(year_exp == max(year_exp) - 2) %>% anti_join(data_0, by = 'id') %>% anti_join(data_1, by = 'id') %>% sample_n(2)
data_3 <- data %>% filter(year_exp == max(year_exp) - 3) %>% anti_join(data_0, by = 'id') %>% anti_join(data_1, by = 'id') %>% anti_join(data_2, by = 'id')
#Result Table
result <- data_0 %>% bind_rows(data_1, data_2, data_3)
result
The code below produces the same output as yours and extends the idea to allow for an arbitrary number of values of year_exp using a for loop.
Please note that because this simply extends your code, it shares the following (possibly undesirable) features with it:
The code moves sequentially through groups, sampling from the members of later groups who were not sampled for earlier groups. Accordingly, there is a risk that the code throws an error because it tries to sample from a group whose members were all already sampled for previous groups.
The probabilities of selection are not uniform across the members of a group. Accordingly, the samples drawn from each group are not representative of that group.
If the data were instead a balanced panel, there would be much more efficient and simpler ways to accomplish this (see the sketch after the output below).
library(tibble)
library(dplyr)
set.seed(123)
# Create original data
data <- tibble(id = c("A","A","A","B","B","C","C","D","D","D","D","E","E","E","E","F","F","G","G","G","H","H","H","H"),
               year_exp = c(1,2,3,1,2,1,2,1,2,3,4,1,2,3,4,1,2,1,2,3,1,2,3,4),
               pre_year_exp = year_exp - 1)
# Assign values to parameters used by/in the loop.
J <- data$id %>% unique %>% length # unique units/persons (8)
K <- data$year_exp %>% unique %>% length # unique groups/years (4)
N <- 2 # sample size per group (2)
# Initialize objects loop will modify
samples_list <- vector(mode = "list", length = K) # stores each sample
used_ids <- rep(NA_character_, J) # stores used ids
index <- 1:N # initial indices for used ids
# For-loop solution
for (k in 1:K) {
  # Identifier for current group
  cur_group <- 1 + K - k
  # Sample from persons in current group who were not previously sampled
  one_sample <- data %>%
    filter(year_exp == cur_group, !(id %in% used_ids)) %>%
    slice_sample(n = N)
  # Save sample and the id values for those sampled
  samples_list[[k]] <- one_sample
  used_ids[index] <- one_sample$id
  index <- index + N
}
# Bind into a single data.frame
bind_rows(samples_list)
#> # A tibble: 8 x 3
#> id year_exp pre_year_exp
#> <chr> <dbl> <dbl>
#> 1 H 4 3
#> 2 D 4 3
#> 3 G 3 2
#> 4 E 3 2
#> 5 C 2 1
#> 6 B 2 1
#> 7 F 1 0
#> 8 A 1 0
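As noted above, if the data were a balanced panel (every id observed for every year_exp), the sequential exclusion would be unnecessary and every member of a group would have the same selection probability. A minimal sketch of that shortcut, assuming balance (which the example data do not satisfy; the object names are illustrative only):
# Balanced-panel shortcut: assign ids to groups in one shuffle
set.seed(123)
ids   <- unique(data$id)                                 # all unit identifiers
years <- sort(unique(data$year_exp), decreasing = TRUE)  # groups, largest first
N     <- 2                                               # sample size per group
# Shuffle the ids once, cut them into blocks of N, and pair each block with a year
assigned <- tibble(id = sample(ids, length(years) * N),
                   year_exp = rep(years, each = N))
# Keep the matching rows from the original data
result_balanced <- data %>% semi_join(assigned, by = c("id", "year_exp"))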
Sample data
dat <- data.frame(Sim.Y1 = rnorm(10), Sim.Y2 = rnorm(10),
                  Sim.Y3 = rnorm(10), obsY = rnorm(10),
                  ID = sample(1:10, 10), ID_s = rep(1:2, each = 5))
For the following vector, I want to calculate the mean across ID_s
simVec <- c('Sim.Y1.cor','Sim.Y2.cor')
for (s in simVec) {
  simRef <- simVec[s]
  simID <- unlist(strsplit(simRef, split = '.cor', fixed = TRUE))[1]
  # this works
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(Sim.Y1))
  # this doesn't work
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(!!(simID)))
}
How do I refer a column in dplyr not by its explicit name?
Note that your particular task can be performed without any non-standard evaluation by using summarize_at(), which works directly with strings:
simIDs <- stringr::str_split(simVec, ".cor") %>% purrr::map_chr(1)
# [1] "Sim.Y1" "Sim.Y2"
dat %>% dplyr::group_by(ID_s) %>% dplyr::summarise_at(simIDs, mean)
# # A tibble: 2 x 3
# ID_s Sim.Y1 Sim.Y2
# <int> <dbl> <dbl>
# 1 1 0.494 -0.0522
# 2 2 -0.104 -0.370
A custom suffix can also be supplied through a named list:
dat %>% dplyr::group_by(ID_s) %>% dplyr::summarise_at(simIDs, list(m=mean))
# # A tibble: 2 x 3
# ID_s Sim.Y1_m Sim.Y2_m <--- Note the _m suffix
# <int> <dbl> <dbl>
# 1 1 0.494 -0.0522
# 2 2 -0.104 -0.370
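If you are on dplyr 1.0 or later, summarise_at() is superseded; the same result can be written with across(), which also works directly with a character vector of column names. A minimal sketch of the equivalent calls:
# Equivalent to summarise_at(simIDs, mean)
dat %>% dplyr::group_by(ID_s) %>%
  dplyr::summarise(dplyr::across(dplyr::all_of(simIDs), mean))
# Equivalent to summarise_at(simIDs, list(m = mean)), keeping the _m suffix
dat %>% dplyr::group_by(ID_s) %>%
  dplyr::summarise(dplyr::across(dplyr::all_of(simIDs), list(m = mean)))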
First, you have to use seq_along() if you want to index your vector with s.
Second, you are missing sym().
This should work:
simVec <- c('Sim.Y1.cor','Sim.Y3.cor')
for (s in seq_along(simVec)) {
  simRef <- simVec[s]
  simID <- unlist(strsplit(simRef, split = '.cor', fixed = TRUE))[1]
  # this works
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(Sim.Y1))
  # this now works too, thanks to sym()
  dat %>% dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(!!sym(simID)))
}
Edit: fixed a typo.
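An alternative to !!sym() that avoids unquoting altogether is the .data pronoun, which accepts a string directly. A minimal sketch that also collects the per-column results in a list (results_list is an illustrative name; the original loop discards its results):
results_list <- list()
for (s in seq_along(simVec)) {
  simID <- unlist(strsplit(simVec[s], split = '.cor', fixed = TRUE))[1]
  results_list[[simID]] <- dat %>%
    dplyr::group_by(ID_s) %>%
    dplyr::summarise(meanMod = mean(.data[[simID]]))
}
results_list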
Try this
library(dplyr)
dat %>% group_by(ID_s) %>%
  summarise(mean_y1 = mean(Sim.Y1),
            mean_y2 = mean(Sim.Y2),
            mean_y3 = mean(Sim.Y3),
            mean_obsY = mean(obsY))
I understand the question to be: how do you get a column without referencing it by its explicit name, i.e. by using its index instead?
Let me know if my understanding is incorrect.
If that is the goal, I believe the easiest way is shown below.
> df1 <- data.frame(ID_s=c('a','b','c'),Val=c('a1','b1','c1'))
> df1
ID_s Val
1 a a1
2 b b1
3 c c1
> df1[,1]
[1] a b c
Levels: a b c
If you want to save that as a data frame, this can be extended as below:
cc <- data.frame(ID_s=df1[,1])
Hope this helps!
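Within a dplyr pipeline, the same positional extraction can be done with pull(), which accepts either a position or a column name; a minimal sketch:
library(dplyr)
df1 %>% pull(1)        # first column, by position
df1 %>% pull(ID_s)     # same column, by name
cc <- data.frame(ID_s = df1 %>% pull(1))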
I have the following dataframe:
species <- c("a","a","a","b","b","b","c","c","c","d","d","d","e","e","e","f","f","f","g","h","h","h","i","i","i")
category <- c("h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","l","h","l","m","h","l","m")
minus <- c(31,14,260,100,70,200,91,152,842,16,25,75,60,97,300,125,80,701,104,70,7,124,24,47,251)
plus <- c(2,0,5,0,1,1,4,4,30,1,0,0,2,0,5,0,0,3,0,0,0,0,0,0,4)
df <- cbind(species, category, minus, plus)
df<-as.data.frame(df)
I want to do a chisq.test for each category-species combination, like this:
Species a, category h and l: p-value
Species a, category h and m: p-value
Species a, category l and m: p-value
Species b, ... and so on
With the following chisq.test (dummy code):
chisq.test(c(minus(cat1, cat2),plus(cat1, cat2)))$p.value
I want to end up with a table that presents each chisq.test p-value for each comparison, like this:
Species Category1 Category2 p-value
a h l 0.05
a h m 0.2
a l m 0.1
b...
Where Category1 and Category2 are the categories compared in the chisq.test.
Is this possible to do using dplyr? I have tried tweaking what was mentioned in here and here, but they don't really apply to this issue, as I am seeing it.
EDIT: I also would like to see how this could be done for the following dataset:
species <- c(1:11)
minus <- c(132,78,254,12,45,76,89,90,100,42,120)
plus <- c(1,2,0,0,0,3,2,5,6,4,0)
I would like to do a chisq. test for each species in the table compared to every single other species in the table (a pairwise comparison between each species for all species). I want to end up with something like this:
species1 species2 p-value
1 2 0.5
1 3 0.7
1 4 0.2
...
11 10 0.02
I tried changing the code above to the following:
species_chisq %>%
do(data_frame(species1 = first(.$species),
species2 = last(.$species),
data = list(matrix(c(.$minus, .$plus), ncol = 2)))) %>%
mutate(chi_test = map(data, chisq.test, correct = FALSE)) %>%
mutate(p.value = map_dbl(chi_test, "p.value")) %>%
ungroup() %>%
select(species1, species2, p.value)
However, this only created a table where each species was compared to itself, not to the other species. I do not quite understand where in the original code given by @ycw the compared pairs are specified.
EDIT 2:
I managed to do this by the code found here.
A solution with dplyr and purrr. Note that I am not very familiar with the chi-square test; I simply follow the call you specified in @Vincent Bonhomme's post: chisq.test(test, correct = FALSE).
In addition, there is no need to use cbind to create the example data frame; data.frame is sufficient. stringsAsFactors = FALSE is important to prevent the character columns from becoming factors.
# Create example data frame
species <- c("a","a","a","b","b","b","c","c","c","d","d","d","e","e","e","f","f","f","g","h","h","h","i","i","i")
category <- c("h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","l","h","l","m","h","l","m")
minus <- c(31,14,260,100,70,200,91,152,842,16,25,75,60,97,300,125,80,701,104,70,7,124,24,47,251)
plus <- c(2,0,5,0,1,1,4,4,30,1,0,0,2,0,5,0,0,3,0,0,0,0,0,0,4)
df <- data.frame(species, category, minus, plus, stringsAsFactors = FALSE)
# Load packages
library(dplyr)
library(purrr)
# Process the data
df2 <- df %>%
  group_by(species) %>%
  # rows 1&2, 1&3, 2&3 give the pairwise category comparisons within a species
  slice(c(1, 2, 1, 3, 2, 3)) %>%
  # label each consecutive pair of rows as one test
  mutate(test = rep(1:(n()/2), each = 2)) %>%
  group_by(species, test) %>%
  # build one row per test: the two categories and a 2x2 count matrix
  do(data_frame(species = first(.$species),
                test = first(.$test[1]),
                category1 = first(.$category),
                category2 = last(.$category),
                data = list(matrix(c(.$minus, .$plus), ncol = 2)))) %>%
  # run the chi-square test on each matrix and pull out the p-value
  mutate(chi_test = map(data, chisq.test, correct = FALSE)) %>%
  mutate(p.value = map_dbl(chi_test, "p.value")) %>%
  ungroup() %>%
  select(species, category1, category2, p.value)
df2
# A tibble: 25 x 4
species category1 category2 p.value
<chr> <chr> <chr> <dbl>
1 a h l 0.3465104
2 a h m 0.1354680
3 a l m 0.6040227
4 b h l 0.2339414
5 b h m 0.4798647
6 b l m 0.4399181
7 c h l 0.4714005
8 c h m 0.6987413
9 c l m 0.5729834
10 d h l 0.2196806
# ... with 15 more rows
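For the pairwise species comparisons in your edit (the second data set), the same map-over-tests idea can be applied after enumerating the species pairs with combn(). A minimal sketch; df_sp and pairwise_res are illustrative names, and the test is run on the 2x2 minus/plus table for each pair:
library(dplyr)
library(purrr)
# Second example data set from the edit
species <- c(1:11)
minus <- c(132,78,254,12,45,76,89,90,100,42,120)
plus <- c(1,2,0,0,0,3,2,5,6,4,0)
df_sp <- data.frame(species, minus, plus)
# All unordered species pairs
pairs <- combn(df_sp$species, 2, simplify = FALSE)
# One chi-square test per pair (may warn about small expected counts)
pairwise_res <- map_dfr(pairs, function(p) {
  sub <- df_sp[df_sp$species %in% p, ]
  tibble(species1 = p[1],
         species2 = p[2],
         p.value  = chisq.test(as.matrix(sub[, c("minus", "plus")]),
                               correct = FALSE)$p.value)
})
head(pairwise_res)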
First, you should create your data.frame with data.frame(); cbind() coerces everything to a character matrix, so the minus and plus columns end up as factors rather than numbers.
species <- c("a","a","a","b","b","b","c","c","c","d","d","d","e","e","e","f","f","f","g","h","h","h","i","i","i")
category <- c("h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","l","h","l","m","h","l","m")
minus <- c(31,14,260,100,70,200,91,152,842,16,25,75,60,97,300,125,80,701,104,70,7,124,24,47,251)
plus <- c(2,0,5,0,1,1,4,4,30,1,0,0,2,0,5,0,0,3,0,0,0,0,0,0,4)
df <- data.frame(species=species, category=category, minus=minus, plus=plus)
Then, I'm not sure there is a pure dplyr way to do it (I would be glad to be shown otherwise), but here is a partly-dplyr way (see also the pipe-only sketch after this answer):
df_combinations <-
  # create a df with all interactions
  expand.grid(df$species, df$category, df$category) %>%
  # rename columns
  `colnames<-`(c("species", "category1", "category2")) %>%
  # the 3 lines below only retain, within a species, rows where
  # category1 and category2 differ
  unique %>%
  group_by(species) %>%
  filter(category1 != category2) %>%
  # cosmetics
  arrange(species, category1, category2) %>%
  ungroup() %>%
  # prepare an empty column
  mutate(p.value = NA)
# now we loop to fill your result data.frame
for (i in 1:nrow(df_combinations)) {
  # filter appropriate lines
  cat1 <- filter(df,
                 species == df_combinations$species[i],
                 category == df_combinations$category1[i])
  cat2 <- filter(df,
                 species == df_combinations$species[i],
                 category == df_combinations$category2[i])
  # calculate the chisq.test and assign its p-value to the right line
  df_combinations$p.value[i] <- chisq.test(c(cat1$minus, cat2$minus,
                                             cat1$plus, cat2$plus))$p.value
}
Let's have a look to the resulting data.frame:
head(df_combinations)
# A tibble: 6 x 4
# Groups: species [1]
species category1 category2 p.value
<fctr> <fctr> <fctr> <dbl>
1 a h l 3.290167e-11
2 a h m 1.225872e-134
3 a l h 3.290167e-11
4 a l m 5.824842e-150
5 a m h 1.225872e-134
6 a m l 5.824842e-150
Checking the first row:
chisq.test(c(31, 14, 2, 0))$p.value
[1] 3.290167e-11
Is this what you wanted?
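As for a more pipe-only variant (the pure-dplyr alternative hinted at above): combn() can be applied within each species via group_modify(). A minimal sketch that runs the same goodness-of-fit call as the loop above; the structure and its guard for single-category species are illustrative:
library(dplyr)
library(purrr)
df %>%
  group_by(species) %>%
  group_modify(function(d, key) {
    # species with a single category (e.g. "g") have no pair to test
    if (nrow(d) < 2) {
      return(tibble(category1 = character(), category2 = character(),
                    p.value = double()))
    }
    combn(seq_len(nrow(d)), 2, simplify = FALSE) %>%
      map_dfr(~ tibble(
        category1 = as.character(d$category[.x[1]]),
        category2 = as.character(d$category[.x[2]]),
        p.value   = chisq.test(c(d$minus[.x], d$plus[.x]))$p.value
      ))
  }) %>%
  ungroup()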
This seems fairly simple, and I have a solution, but it's kinda time consuming since I have a lot of columns. I have looked at other solutions, but it's always been for something slightly different (aggregate one column, mutate all columns etc). In SQL I would do select PAT_ID, max(X), max(Y), max(Z) from table_name group by PAT_ID.
I have a data set that looks like this (but with more columns):
dt <- data.frame(
PAT_ID = c('P','P','P','A','A','A'),
X = c(1,NA,NA, 1,NA,NA),
Y = c(NA,2,NA,NA,1,NA),
Z = c(NA,NA,1,NA,NA,0)
)
So I summarize and then combine the results:
results_X <- dt %>%
  group_by(PAT_ID) %>%
  summarise(X = max(X, na.rm = TRUE))
results_Y <- dt %>%
  group_by(PAT_ID) %>%
  summarise(Y = max(Y, na.rm = TRUE))
results_Z <- dt %>%
  group_by(PAT_ID) %>%
  summarise(Z = max(Z, na.rm = TRUE))
resulted <- left_join(results_X, results_Y)
resulted <- left_join(resulted, results_Z)
My output is the "roll-up" record that is the max value for each column per PAT_ID:
myresult <- data.frame(
PAT_ID = c('P','A'),
X = c(1,1),
Y = c(2,1),
Z = c(1,0)
)
I'm sure there's a better way to do this, but how?
This can be done with summarize_all() in dplyr. Here you go:
library(dplyr)
dt %>% group_by(PAT_ID) %>% summarize_all(max, na.rm=T)
# PAT_ID X Y Z
# <fctr> <dbl> <dbl> <dbl>
# 1 A 1 1 0
# 2 P 1 2 1
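If you are on dplyr 1.0 or later, where summarize_all() is superseded, the same roll-up can be written with across(); a minimal equivalent sketch:
dt %>%
  group_by(PAT_ID) %>%
  summarise(across(everything(), ~ max(.x, na.rm = TRUE)))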
This can also be accomplished with base R using aggregate.
aggregate(dt[c("X","Y","Z")], dt["PAT_ID"], FUN=max, na.rm=TRUE)
PAT_ID X Y Z
1 A 1 1 0
2 P 1 2 1