I have the following dataframe:
species <- c("a","a","a","b","b","b","c","c","c","d","d","d","e","e","e","f","f","f","g","h","h","h","i","i","i")
category <- c("h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","l","h","l","m","h","l","m")
minus <- c(31,14,260,100,70,200,91,152,842,16,25,75,60,97,300,125,80,701,104,70,7,124,24,47,251)
plus <- c(2,0,5,0,1,1,4,4,30,1,0,0,2,0,5,0,0,3,0,0,0,0,0,0,4)
df <- cbind(species, category, minus, plus)
df<-as.data.frame(df)
I want to do a chisq.test for each category-species combination, like this:
Species a, category h and l: p-value
Species a, category h and m: p-value
Species a, category l and m: p-value
Species b, ... and so on
With the following chisq.test (dummy code):
chisq.test(c(minus(cat1, cat2),plus(cat1, cat2)))$p.value
I want to end up with a table that presents each chisq.test p-value for each comparison, like this:
Species Category1 Category2 p-value
a h l 0.05
a h m 0.2
a l m 0.1
b...
Where category and and category 2 are the compared categories in the chisq.test.
Is this possible to do using dplyr? I have tried tweaking what was mentioned in here and here, but they don't really apply to this issue, as I am seeing it.
EDIT: I also would like to see how this could be done for the following dataset:
species <- c(1:11)
minus <- c(132,78,254,12,45,76,89,90,100,42,120)
plus <- c(1,2,0,0,0,3,2,5,6,4,0)
I would like to do a chisq. test for each species in the table compared to every single other species in the table (a pairwise comparison between each species for all species). I want to end up with something like this:
species1 species2 p-value
1 2 0.5
1 3 0.7
1 4 0.2
...
11 10 0.02
I tried changing the code above to the following:
species_chisq %>%
do(data_frame(species1 = first(.$species),
species2 = last(.$species),
data = list(matrix(c(.$minus, .$plus), ncol = 2)))) %>%
mutate(chi_test = map(data, chisq.test, correct = FALSE)) %>%
mutate(p.value = map_dbl(chi_test, "p.value")) %>%
ungroup() %>%
select(species1, species2, p.value) %>%
However, this only created a table where each species was only compared to itself, and not the other species. I do not quite understand where in the original code given by #ycw it specifies which are compared.
EDIT 2:
I managed to do this by the code found here.
A solution from dplyr and purrr. Notice that I am not familiar with chi-square test, but I follow the way you specified in #Vincent Bonhomme's post: chisq.test(test, correct = FALSE).
In addition, to create example data frame, there is no need to use cbind, just data.frame would be sufficient. stringsAsFactors = FALSE is important to prevent columns become factor.
# Create example data frame
species <- c("a","a","a","b","b","b","c","c","c","d","d","d","e","e","e","f","f","f","g","h","h","h","i","i","i")
category <- c("h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","l","h","l","m","h","l","m")
minus <- c(31,14,260,100,70,200,91,152,842,16,25,75,60,97,300,125,80,701,104,70,7,124,24,47,251)
plus <- c(2,0,5,0,1,1,4,4,30,1,0,0,2,0,5,0,0,3,0,0,0,0,0,0,4)
df <- data.frame(species, category, minus, plus, stringsAsFactors = FALSE)
# Load packages
library(dplyr)
library(purrr)
# Process the data
df2 <- df %>%
group_by(species) %>%
slice(c(1, 2, 1, 3, 2, 3)) %>%
mutate(test = rep(1:(n()/2), each = 2)) %>%
group_by(species, test) %>%
do(data_frame(species = first(.$species),
test = first(.$test[1]),
category1 = first(.$category),
category2 = last(.$category),
data = list(matrix(c(.$minus, .$plus), ncol = 2)))) %>%
mutate(chi_test = map(data, chisq.test, correct = FALSE)) %>%
mutate(p.value = map_dbl(chi_test, "p.value")) %>%
ungroup() %>%
select(species, category1, category2, p.value)
df2
# A tibble: 25 x 4
species category1 category2 p.value
<chr> <chr> <chr> <dbl>
1 a h l 0.3465104
2 a h m 0.1354680
3 a l m 0.6040227
4 b h l 0.2339414
5 b h m 0.4798647
6 b l m 0.4399181
7 c h l 0.4714005
8 c h m 0.6987413
9 c l m 0.5729834
10 d h l 0.2196806
# ... with 15 more rows
First, you should create your data.frame with data.frame, otherwise minus and plus columns are turned into factors.
species <- c("a","a","a","b","b","b","c","c","c","d","d","d","e","e","e","f","f","f","g","h","h","h","i","i","i")
category <- c("h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","h","l","m","l","h","l","m","h","l","m")
minus <- c(31,14,260,100,70,200,91,152,842,16,25,75,60,97,300,125,80,701,104,70,7,124,24,47,251)
plus <- c(2,0,5,0,1,1,4,4,30,1,0,0,2,0,5,0,0,3,0,0,0,0,0,0,4)
df <- data.frame(species=species, category=category, minus=minus, plus=plus)
Then, I'm not sure there is a pure dplyr way to do it (would be glad to be shown the contrary), but I think here is a partly-dplyr way to do it:
df_combinations <-
# create a df with all interactions
expand.grid(df$species, df$category, df$category)) %>%
# rename columns
`colnames<-`(c("species", "category1", "category2")) %>%
# 3 lines below:
# manage to only retain within a species, category(1 and 2) columns
# with different values
unique %>%
group_by(species) %>%
filter(category1 != category2) %>%
# cosmetics
arrange(species, category1, category2) %>%
ungroup() %>%
# prepare an empty column
mutate(p.value=NA)
# now we loop to fill your result data.frame
for (i in 1:nrow(df_combinations)){
# filter appropriate lines
cat1 <- filter(df,
species==df_combinations$species[i],
category==df_combinations$category1[i])
cat2 <- filter(df,
species==df_combinations$species[i],
category==df_combinations$category2[i])
# calculate the chisq.test and assign its p-value to the right line
df_combinations$p.value[i] <- chisq.test(c(cat1$minus, cat2$minus,
cat1$plus, cat2$plus))$p.value
}
Let's have a look to the resulting data.frame:
head(df_combinations)
# A tibble: 6 x 4
# A tibble: 6 x 4
# Groups: species [1]
species category1 category2 p.value
<fctr> <fctr> <fctr> <dbl>
1 a h l 3.290167e-11
2 a h m 1.225872e-134
3 a l h 3.290167e-11
4 a l m 5.824842e-150
5 a m h 1.225872e-134
6 a m l 5.824842e-150
Checking the first row:
chisq.test(c(31, 14, 2, 0))$p.value
[1] 3.290167e-11
Is this what you wanted?
Related
I need to extract a sample that has equal distribution in each experience-level group. For your info, there are total 4 groups (1, 2, 3, 4 years of exp), and total 8 people (A, B, C, D, E, F, G, H) in this example scenario. I was trying to come up with a function with loops, but don't know how to. Please help me out! Thank you! :)
library(tidyverse)
data <- tibble(id = c("A","A","A","B","B","C","C","D","D","D","D","E","E","E","E","F","F","G","G","G","H","H","H","H"),year_exp = c(1,2,3,1,2,1,2,1,2,3,4,1,2,3,4,1,2,1,2,3,1,2,3,4), pre_year_exp = year_exp - 1)
data_0 <- data %>% filter(year_exp == max(year_exp) - 0) %>% sample_n(2)
data_1 <- data %>% filter(year_exp == max(year_exp) - 1) %>% anti_join(data_0, by = 'id') %>% sample_n(2)
data_2 <- data %>% filter(year_exp == max(year_exp) - 2) %>% anti_join(data_0, by = 'id') %>% anti_join(data_1, by = 'id') %>% sample_n(2)
data_3 <- data %>% filter(year_exp == max(year_exp) - 3) %>% anti_join(data_0, by = 'id') %>% anti_join(data_1, by = 'id') %>% anti_join(data_2, by = 'id')
#Result Table
result <- data_0 %>% bind_rows(data_1, data_2, data_3)
result
The below produces the same output as your code and extends the idea to allow for an arbitrary number of values of year_exp using a for loop.
Please note that because this simply extends your code, it must share the following (possibly-undesirable) features with your code:
The code moves sequentially through groups, sampling from the members of later groups who were not sampled for early groups. Accordingly, there is a risk that the code throws an error because it tries to sample from groups whose members were already sampled from previous, other groups.
The probabilities of selection are not uniformly distributed across members of a group. Accordingly, the samples drawn from each group are not representative of that group.
In the event that there data were instead a balanced panel, there are much more efficient and simpler ways to accomplish this.
library(tibble)
library(dplyr)
set.seed(123)
# Create original data
data <- tibble(id = c("A","A","A","B","B","C","C","D","D","D","D","E","E","E","E","F","F","G","G","G","H","H","H","H"),
year_exp = c(1,2,3,1,2,1,2,1,2,3,4,1,2,3,4,1,2,1,2,3,1,2,3,4),
pre_year_exp = year_exp - 1)
# Assign values to parameters used by/in the loop.
J <- data$id %>% unique %>% length # unique units/persons (8)
K <- data$year_exp %>% unique %>% length # unique groups/years (4)
N <- 2 # sample size per group (2)
# Initialize objects loop will modify
samples_list <- vector(mode = "list", length = K) # stores each sample
used_ids <- rep(NA_character_, J) # stores used ids
index <- 1:N # initial indices for used ids
# For-loop solution
for (k in 1:K) {
# Identifier for current group
cur_group <- 1 + K - k
# Sample from persons in current group who were not previously sampled
one_sample <- data %>%
filter(year_exp == cur_group, !(id %in% used_ids)) %>%
slice_sample(n = N)
# Save sample and the id values for those sampled
samples_list[[k]] <- one_sample
used_ids[index] <- one_sample$id
index <- index + N
}
# Bind into a single data.frame
bind_rows(samples_list)
#> # A tibble: 8 x 3
#> id year_exp pre_year_exp
#> <chr> <dbl> <dbl>
#> 1 H 4 3
#> 2 D 4 3
#> 3 G 3 2
#> 4 E 3 2
#> 5 C 2 1
#> 6 B 2 1
#> 7 F 1 0
#> 8 A 1 0
So I'm coming back to R after a little time away and finding myself a bit rusty here. I've had a look around but can't seem to find a way to do this.
I have a dataset that look like df below and what I want is to group each column by itself, get the count for each group then get the percentage for each factor in the column and repeat the same process for all the columns in the dataset and end up with one data frame.
The below gives me what I want for the one column (in this case a). However, I cant think of the best way to reproduce this across multiple columns, where each column is grouped by itself, so b by b, c by c, d by d, etc, without doing it all manually. Where the end result would be one_col but with many rows.
#data
a<- rep(1:5, 5)
b <- rep(1:5, 5)
c <- rep(1:5, 5)
d <- rep(1:5, 5)
df <- data.frame(a=a, b=b, c=c, d=d)
head(df)
#example analysis on one column
library(tidyverse)
one_col<-df%>%
group_by(a)%>%
summarise(count=n())%>%
spread(a, count)%>%
mutate(sum=rowSums(.[1:5]), neg=(`1`+`2`)/sum, pos=(`4`+`5`)/sum, neut=`3`/sum)%>%
select(pos, neg, neut)
one_col
I had thought of doing a for loop but struggling with the formatting of it. Any ideas?
for(i in 1:ncol(df)) {
group_by(!!df[i,])%>%
summarise(count=n())
}
Thanks!
Here are two approaches using for loop and purrr::map_dfr().
library(tidyverse)
df1 <- df |>
mutate(across(a:d, ~ case_when(
. %in% c(1, 2) ~ "neg",
. %in% c(4, 5) ~ "pos",
. == 3 ~ "neut"
)))
# FOR LOOP
l <- vector("list", ncol(df1))
for (i in seq_along(df1)) {
l[[i]] <- table(df1[[i]]) |>
prop.table() |>
as.data.frame()
}
l |>
setNames(names(df1)) |>
bind_rows(.id = "var") |>
pivot_wider(id_cols = var, names_from = Var1, values_from = Freq)
# map_dfr()
df1 |>
map_dfr(
~ table(.x) |>
prop.table() |>
as.data.frame(),
.id = "var"
) |>
pivot_wider(id_cols = var, names_from = .x, values_from = Freq)
# var neg neut pos
# <chr> <dbl> <dbl> <dbl>
# 1 a 0.4 0.2 0.4
# 2 b 0.4 0.2 0.4
# 3 c 0.4 0.2 0.4
# 4 d 0.4 0.2 0.4
I have a data frame with N vars, M categorical and 2 numeric. I would like to create M data frames, one for each categorical variable.
Eg.,
data %>%
group_by(var1) %>%
summarise(sumVar5 = sum(var5),
meanVar6 = mean(var6))
data %>%
group_by(varM) %>%
summarise(sumVar5 = sum(var5),
meanVar6 = mean(var6))
etc...
Is there a way to iterate through the categorical variables and generate each of the summary tables? That is, without needing to repeat the above chunks M times.
Alternatively, these summary tables don't have to be individual objects, as long as I can easily reference / pull the summaries for each of the M variables.
Here is a solution (I hope). Creates a list of data frames with the formula you have:
library(tidyverse)
# Create sample data frame
data <- data.frame(var1 = sample(1:2, 5, replace = T),
var2 = sample(1:2, 5, replace = T),
var3 = sample(1:2, 5, replace = T),
varM = sample(1:2, 5, replace = T),
var5 = rnorm(5, 3, 6),
var6 = rnorm(5, 3, 6))
# Vars to be grouped (var1 until varM in this example)
vars_to_be_used <- names(select(data, var1:varM))
# Function to be used
group_fun <- function(x, .df = data) {
.df %>%
group_by_(.x) %>%
summarise(sumVar5 = sum(var5),
meanVar6 = mean(var6))
}
# Loop over vars
results <- map(vars_to_be_used, group_fun)
# Nice list names
names(results) <- vars_to_be_used
print(results)
You didn't supply a sample data.set so I created a small example to show how it works.
data <- data_frame(var1 = rep(letters[1:5], 2),
var2 = rep(LETTERS[11:15], 2),
var3 = 1:10,
var4 = 11:20)
A combination of tidyverse packages can get you where you need to be.
Steps used: First we gather all the columns we want to group by on in a cols column and keep the numeric vars separate. Next we split the data.frame in a list of data.frames so that every column we want to group by on has it's own table with the 2 numeric vars. Now that everything is in a list, we need to use the map functionality from the purrr package. Using map, we spread the data.frame again so the column names are as we expect them to be. Finally using map we use group_by_if to group by on the character column and summarise the rest. All the outcomes are stored in a list where you can access what you need.
Run the code in pieces to see what every step does.
library(dplyr)
library(purrr)
library(tidyr)
outcomes <- data %>%
gather(cols, value, -c(var3, var4)) %>%
split(.$cols) %>%
map(~ spread(.x, cols, value)) %>%
map(~ group_by_if(.x, is.character) %>%
summarise(sumvar3 = sum(var3),
meanvar4 = mean(var4)))
outcomes
$`var1`
# A tibble: 5 x 3
var1 sumvar3 meanvar4
<chr> <int> <dbl>
1 a 7 13.5
2 b 9 14.5
3 c 11 15.5
4 d 13 16.5
5 e 15 17.5
$var2
# A tibble: 5 x 3
var2 sumvar3 meanvar4
<chr> <int> <dbl>
1 K 7 13.5
2 L 9 14.5
3 M 11 15.5
4 N 13 16.5
5 O 15 17.5
I am kind of new to R and programming in general. I am currently strugling with a piece of code for data transformation and hope someone can take a little bit of time to help me.
Below a reproducible exemple :
# Data
a <- c(rnorm(12, 20))
b <- c(rnorm(12, 25))
f1 <- rep(c("X","Y","Z"), each=4) #family
f2 <- rep(x = c(0,1,50,100), 3) #reference and test levels
dt <- data.frame(f1=factor(f1), f2=factor(f2), a,b)
#library loading
library(tidyverse)
Goal : Compute all values (a,b) using a reference value. Calculation should be : a/a_ref with a_ref = a when f2=0 depending on the family (f1 can be X,Y or Z).
I tried to solve this by using this code :
test <- filter(dt, f2!=0) %>% group_by(f1) %>%
mutate("a/a_ref"=a/(filter(dt, f2==0) %>% group_by(f1) %>% distinct(a) %>% pull))
I get :
test results
as you can see a is divided by a_ref. But my script seems to recycle the use of reference values (a_ref) regardless of the family f1.
Do you have any suggestion so A is computed with regard of the family (f1) ?
Thank you for reading !
EDIT
I found a way to do it 'manualy'
filter(dt, f1=="X") %>% mutate("a/a_ref"=a/(filter(dt, f1=="X" & f2==0) %>% distinct(a) %>% pull()))
f1 f2 a b a/a_ref
1 X 0 21.77605 24.53115 1.0000000
2 X 1 20.17327 24.02512 0.9263973
3 X 50 19.81482 25.58103 0.9099366
4 X 100 19.90205 24.66322 0.9139422
the problem is that I'd have to update the code for each variable and family and thus is not a clean way to do it.
# use this to reproduce the same dataset and results
set.seed(5)
# Data
a <- c(rnorm(12, 20))
b <- c(rnorm(12, 25))
f1 <- rep(c("X","Y","Z"), each=4) #family
f2 <- rep(x = c(0,1,50,100), 3) #reference and test levels
dt <- data.frame(f1=factor(f1), f2=factor(f2), a,b)
#library loading
library(tidyverse)
dt %>%
group_by(f1) %>% # for each f1 value
mutate(a_ref = a[f2 == 0], # get the a_ref and add it in each row
"a/a_ref" = a/a_ref) %>% # divide a and a_ref
ungroup() %>% # forget the grouping
filter(f2 != 0) # remove rows where f2 == 0
# # A tibble: 9 x 6
# f1 f2 a b a_ref `a/a_ref`
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 X 1 21.38436 24.84247 19.15914 1.1161437
# 2 X 50 18.74451 23.92824 19.15914 0.9783583
# 3 X 100 20.07014 24.86101 19.15914 1.0475490
# 4 Y 1 19.39709 22.81603 21.71144 0.8934042
# 5 Y 50 19.52783 25.24082 21.71144 0.8994260
# 6 Y 100 19.36463 24.74064 21.71144 0.8919090
# 7 Z 1 20.13811 25.94187 19.71423 1.0215013
# 8 Z 50 21.22763 26.46796 19.71423 1.0767671
# 9 Z 100 19.19822 25.70676 19.71423 0.9738257
You can do this for more than one variable using:
dt %>%
group_by(f1) %>%
mutate_at(vars(a:b), funs(./.[f2 == 0])) %>%
ungroup()
Or generally use vars(a:z) to use all variables between a and z as long as they are one after the other in your dataset.
Another solution could be using mutate_if like:
dt %>%
group_by(f1) %>%
mutate_if(is.numeric, funs(./.[f2 == 0])) %>%
ungroup()
Where the function will be applied to all numeric variables you have. The variables f1 and f2 will be factor variables, so it just excludes those ones.
I have created a function which takes a little while to run (lots of crunching going on) and there are two distinct outputs that I need to return from this function. The inputs into these outputs are the same which is why I have combined them in the same function so that I don't have to crunch them twice, but the outputs are so entirely different in content and based on such entirely different calculations that there is no way to actually combine them into a one parse kinda statement. One object is tens of lines earlier than the other. But I need to return both, so I think it has to be in some type of format which mimics: store the two separate objects in a single list, lapply, then extract and rbind the two objects.
Any help on a solution to this would be appreciated - ideally not using a for loop or data.table. Dplyr solutions are fine.
Some dummy data:
df <- data.frame(ID = c(rep("A",10), rep("B", 10), rep("C", 10)),
subID = c(rep("U", 5),rep("V", 5),rep("W", 5),rep("X", 5),rep("Y", 5),rep("Z", 5)),
Val = c(1,6,3,8,6,5,2,4,7,20,4,2,3,5,7,3,2,5,7,12,5,3,7,1,6,1,34,9,5,3))
The function (again noting the function is much more complex than this, and I am calculating many more complex and unrelated things in each of the separate objects, not just the average!):
func <- function(x, df){
temp <- filter(df, ID == x)
average_id <- temp %>% group_by(ID) %>% summarise(avg = mean(Val))
average_subid <- temp %>% group_by(ID, subID) %>% summarise(avg = mean(Val))
df_list <- list(avgID=average_id, avgSubID=average_subid)
return(df_list)
}
Presently I have computed the results using this command, but am unsure whether this is correct or how to further extract the results after the objects are stored in this list (of lists) (i.e. I get stuck here):
result <- lapply(list("A","B","C"), func, df)
The result should look like:
> average_ID
ID avg
1 A 6.2
2 B 5.0
3 C 7.4
> average_subID
ID subID avg
1 A U 4.8
2 A V 7.6
3 B W 4.2
4 B X 5.8
5 C Y 4.4
6 C Z 10.4
I have previously used a for loop and stored the results in lists (i.e. avgListID[x] <- average_id, then binded together. But I don't think this is ideal.
Thanks in advance!
I realize this is a bit old, but since neither provided answer seems to have done the trick, how about this? Split the function into two, and run each within your lapply, returning a list of lists?
library(dplyr)
df <- data.frame(ID = c(rep("A",10), rep("B", 10), rep("C", 10)),
subID = c(rep("U", 5),rep("V", 5),rep("W", 5),rep("X", 5),rep("Y", 5),rep("Z", 5)),
Val = c(1,6,3,8,6,5,2,4,7,20,4,2,3,5,7,3,2,5,7,12,5,3,7,1,6,1,34,9,5,3))
subfunc1 <- function(temp){
return(temp %>% group_by(ID) %>% summarise(avg = mean(Val)))
}
subfunc2 <- function(temp){
return(temp %>% group_by(ID, subID) %>% summarise(avg = mean(Val)))
}
func <- function(x, df){
temp <- filter(df, ID == x)
df_list <- list(avgID=subfunc1(temp), avgSubID=subfunc2(temp))
return(df_list)
}
result <- lapply(list("A","B","C"), func, df)
To get the structure/order you need, transpose the lists as explained here:
n <- length(result[[1]]) # assuming all lists in result have the same length
result <- lapply(1:n, function(i) lapply(result, "[[", i))
> average_ID <- aggregate(df$Val, by = list(df$ID), FUN = mean)
>
> average_ID
Group.1 x
1 A 6.2
2 B 5.0
3 C 7.4
> average_subID <- aggregate(df$Val, by = list(df$ID,df$subID), FUN = mean)
>
> average_subID
Group.1 Group.2 x
1 A U 4.8
2 A V 7.6
3 B W 4.2
4 B X 5.8
5 C Y 4.4
6 C Z 10.4
What about returning a list where each element represents the averages at a specific grouping level. For example:
library(tidyverse)
fnc = function(groups=NULL, data=df) {
groups=as.list(groups)
data %>%
group_by_(.dots=groups) %>%
summarise(avg=mean(Val))
}
list(Avg_Overall=NULL, Avg_by_ID="ID", Avg_by_SubID=c("ID","subID")) %>%
map(~fnc(.x))
$Avg_Overall
# A tibble: 1 x 1
avg
<dbl>
1 6.2
$Avg_by_ID
# A tibble: 3 x 2
ID avg
<fctr> <dbl>
1 A 6.2
2 B 5.0
3 C 7.4
$Avg_by_SubID
# A tibble: 6 x 3
# Groups: ID [?]
ID subID avg
<fctr> <fctr> <dbl>
1 A U 4.8
2 A V 7.6
3 B W 4.2
4 B X 5.8
5 C Y 4.4
6 C Z 10.4
You could also just calculate the average by subID and then the average by ID can be calculated from that:
# Average by subID
avg = df %>% group_by(ID, subID) %>%
summarise(n = n(),
avg = mean(Val))
# Average by ID
avg %>%
group_by(ID) %>%
summarise(avg = sum(avg*n)/sum(n))
# Overall average
avg %>%
ungroup %>%
summarise(avg = sum(avg*n)/sum(n))