mydat = data.frame(Q1 = c(0, 1, 0, 1), Q2 = c(0, 1, 1, 1),
                   Q3 = c(1, 1, 1, 1), Gender = c("M", "M", "F", "F"))
> mydat
  Q1 Q2 Q3 Gender
1  0  0  1      M
2  1  1  1      M
3  0  1  1      F
4  1  1  1      F
> table(mydat[,1:3], mydat$Gender)
Error in sort.list(y) : 'x' must be atomic for 'sort.list'
Have you called 'sort' on a list?
I have a very simple data set with three binary questions and a gender variable. I'm interested in whether there is any association between the three questions and gender, so I would like to tabulate my data into a gender-by-question count table. I want my table to look something like this:
  Q1 Q2 Q3
M  1  1  2
F  1  2  2
Edit: with missing values in the data, rowsum() propagates the NAs:
mydat = data.frame(Q1 = c(0, 1, NA, 1), Q2 = c(0, 1, 1, 1),
                   Q3 = c(1, NA, 1, 1), Gender = c("M", "M", "F", "F"))
> rowsum(mydat[1:3], mydat$Gender)
  Q1 Q2 Q3
F NA  2  2
M  1  1 NA
We can do a group-by operation and sum the elements of the other columns:
library(dplyr)
mydat %>%
  group_by(Gender) %>%
  summarise_all(funs(sum(., na.rm = TRUE)))
# A tibble: 2 x 4
#  Gender    Q1    Q2    Q3
#  <fctr> <int> <int> <int>
#1      F     1     2     2
#2      M     1     1     2
Or using base R
rowsum(mydat[-4], mydat$Gender, na.rm = TRUE)
#  Q1 Q2 Q3
#F  1  2  2
#M  1  1  2
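Note that funs() has since been superseded in dplyr; a minimal sketch of the same per-gender sums with across() (assuming dplyr >= 1.0.0) would be:

library(dplyr)

mydat %>%
  group_by(Gender) %>%
  summarise(across(Q1:Q3, ~ sum(.x, na.rm = TRUE)))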
Related
I have the input dataset below, and I'm looking to generate the output dataset by recoding 1 as the name of the column and 0 as NA. I managed to do it manually (see "Not optimal solution" below), but I have a dataset with hundreds of columns, so I'm looking for a way to automate this process.
Packages
library(tibble)
library(dplyr)
Input
input <- tibble(a = c(1, 0, 0, 1, 0),
                b = c(0, 0, 0, 1, 1),
                c = c(1, 1, 1, 1, 1),
                d = c(0, 0, 0, 0, 0))
# # A tibble: 5 × 4
#       a     b     c     d
#   <dbl> <dbl> <dbl> <dbl>
# 1     1     0     1     0
# 2     0     0     1     0
# 3     0     0     1     0
# 4     1     1     1     0
# 5     0     1     1     0
Output
output <- tibble(a = c("a", NA, NA, "a", NA),
                 b = c(NA, NA, NA, "b", "b"),
                 c = c("c", "c", "c", "c", "c"),
                 d = c(NA, NA, NA, NA, NA))
# # A tibble: 5 × 4
#   a     b     c     d
#   <chr> <chr> <chr> <lgl>
# 1 a     NA    c     NA
# 2 NA    NA    c     NA
# 3 NA    NA    c     NA
# 4 a     b     c     NA
# 5 NA    b     c     NA
Not optimal solution
input %>%
  mutate(a = case_when(a == 1 ~ "a",
                       T ~ NA_character_),
         b = case_when(b == 1 ~ "b",
                       T ~ NA_character_),
         c = case_when(c == 1 ~ "c",
                       T ~ NA_character_),
         d = case_when(d == 1 ~ "d",
                       T ~ NA_character_))
We could use across with an ifelse statement:
library(dplyr)
input %>%
mutate(across(everything(), ~ifelse(. == 1, cur_column(), NA)))
  a     b     c     d
  <chr> <chr> <chr> <lgl>
1 a     NA    c     NA
2 NA    NA    c     NA
3 NA    NA    c     NA
4 a     b     c     NA
5 NA    b     c     NA
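For a base R route, here is a sketch of the same idea (not from the answer above; note that every column becomes character, so d ends up <chr> rather than <lgl>):

# pair each column with its own name, then recode 1 -> column name, 0 -> NA
out <- input
out[] <- Map(function(x, nm) ifelse(x == 1, nm, NA_character_), input, names(input))
out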
I have a data frame of term frequencies and some other random demographic variables. I want to utilize two grouping variables, drop the ones I do not need, and sum the frequencies based on the grouping variables.
Here is something similar to what I have:
df <- data.frame(user = c(1:9),
                 Group1 = c("a", "a", "a", "b", "b", "b", "c", "c", "c"),
                 Group2 = c("d", "e", "d", "e", "d", "e", "e", "e", "e"),
                 term1 = c(0, 1, 1, 0, 1, 1, 0, 0, 0),
                 term2 = c(1, 0, 1, 1, 0, 1, 0, 1, 1),
                 term3 = c(0, 1, 0, 0, 0, 0, 1, 1, 0))
and here is what I am trying to get.
desired <- data.frame(Group1 = c("a", "a", "b", "b", "c", "c"),
                      Group2 = c("d", "e", "d", "e", "d", "e"),
                      term1 = c(1, 1, 1, 1, 0, 0),
                      term2 = c(2, 0, 0, 2, 0, 2),
                      term3 = c(0, 1, 0, 0, 0, 2))
My real data frame has about 4,000 term columns, so naming each one individually in a dplyr function does not seem feasible.
Thank you!
You can try aggregate + expand.grid + merge
merge(
  with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
  aggregate(. ~ Group1 + Group2, df[-1], sum),
  all = TRUE
)
which gives
  Group1 Group2 term1 term2 term3
1      a      d     1     2     0
2      a      e     1     0     1
3      b      d     1     0     0
4      b      e     1     2     0
5      c      d    NA    NA    NA
6      c      e     0     2     2
If you want to have NAs as 0, you can try
> res <- merge(
    with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
    aggregate(. ~ Group1 + Group2, df[-1], sum),
    all = TRUE
  )
> replace(res, is.na(res), 0)
  Group1 Group2 term1 term2 term3
1      a      d     1     2     0
2      a      e     1     0     1
3      b      d     1     0     0
4      b      e     1     2     0
5      c      d     0     0     0
6      c      e     0     2     2
We can group by 'Group1' and 'Group2', get the sum of the 'term' columns in summarise, and expand the data with complete for the missing combinations.
library(dplyr)
library(tidyr)
df %>%
  group_by(Group1, Group2) %>%
  summarise(across(starts_with('term'), sum), .groups = 'drop') %>%
  complete(Group1, Group2, fill = list(term1 = 0, term2 = 0, term3 = 0))
-output
# A tibble: 6 x 5
  Group1 Group2 term1 term2 term3
  <chr>  <chr>  <dbl> <dbl> <dbl>
1 a      d          1     2     0
2 a      e          1     0     1
3 b      d          1     0     0
4 b      e          1     2     0
5 c      d          0     0     0
6 c      e          0     2     2
If you don't need to complete all the group combinations, setDT(df)[, lapply(.SD[, -1], sum), .(Group1, Group2)] is enough. Otherwise, you can use complete from the tidyr package (as used in the first answer) to fill in the missing combinations.
library(data.table)
library(tidyr)
setDT(df)[, lapply(.SD[, -1], sum), .(Group1, Group2)] %>%
  complete(Group1, Group2, fill = list(term1 = 0, term2 = 0, term3 = 0))
#> # A tibble: 6 x 5
#>   Group1 Group2 term1 term2 term3
#>   <chr>  <chr>  <dbl> <dbl> <dbl>
#> 1 a      d          1     2     0
#> 2 a      e          1     0     1
#> 3 b      d          1     0     0
#> 4 b      e          1     2     0
#> 5 c      d          0     0     0
#> 6 c      e          0     2     2
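If you would rather stay entirely in data.table, one possible sketch (assuming data.table >= 1.12.4 for setnafill()) is a grouped sum followed by a join onto the full grid of combinations:

library(data.table)

dt <- as.data.table(df)
term_cols <- grep("^term", names(dt), value = TRUE)

# sum the term columns within each Group1/Group2 combination
agg <- dt[, lapply(.SD, sum), by = .(Group1, Group2), .SDcols = term_cols]

# join onto all Group1 x Group2 combinations, then replace the NAs with 0
full <- agg[CJ(Group1 = unique(dt$Group1), Group2 = unique(dt$Group2)),
            on = .(Group1, Group2)]
setnafill(full, fill = 0, cols = term_cols)
full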
I want to conditionally summarize several variables by group. The following code does that, but I'm not sure how to do this without specifying each variable and the conditions in the summarize step.
library(tidyverse)
dat <- data.frame(group = c("A", "A", "A", "B", "B", "B"),
                  indicator = c(1, 2, 3, 1, 2, 3),
                  var1 = c(1, 0, 1, 2, 1, 2),
                  var2 = c(1, 0, 1, 1, 2, 1))
# dat
#  group indicator var1 var2
#1     A         1    1    1
#2     A         2    0    0
#3     A         3    1    1
#4     B         1    2    1
#5     B         2    1    2
#6     B         3    2    1
dat %>%
  group_by(group) %>%
  summarise(var1 = sum(var1[indicator == 1 | indicator == 2]),
            var2 = sum(var2[indicator == 1 | indicator == 2]))
# A tibble: 2 x 3
#  group  var1  var2
#* <chr> <dbl> <dbl>
#1 A         1     1
#2 B         3     3
Use across:
library(dplyr)
dat %>%
  group_by(group) %>%
  summarise(across(starts_with('var'), ~sum(.[indicator %in% 1:2])))
#  group  var1  var2
#* <chr> <dbl> <dbl>
#1 A         1     1
#2 B         3     3
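A base R equivalent (a sketch: filter to the relevant indicator values first, then sum per group) gives the same result:

aggregate(cbind(var1, var2) ~ group, data = subset(dat, indicator %in% 1:2), FUN = sum)
#   group var1 var2
# 1     A    1    1
# 2     B    3    3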
I am trying to add a column to my existing data set.
The data set has three columns: Student (which is the column with the participant ID), Week (the number of the week of the year during which the data were collected), and Day (the number of the weekday during which the data were collected).
Now, the new column Obs that I am trying to create should contain a progressive number (from 1 to n) indicating the week of testing for each student.
I have tried to use group_by in combination with rep but it does not seem to produce the result I want:
Week <- c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4)
Day <- c(1, 2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)
Student <- c("A", "A", "A", "B", "B", "B", "B", "B", "C", "C", "C", "C")
fake.db <- data.frame(Student, Week, Day)
library(dplyr)
fake.db %>%
  group_by(Student) %>%
  mutate(Obs = rep(1:length(Student), each = Week))
#    Student  Week   Day   Obs
#    <fct>   <dbl> <dbl> <int>
# 1  A           1     1     1
# 2  A           1     2     2
# 3  A           1     3     3
# 4  B           2     2     1
# 5  B           2     3     2
# 6  B           2     5     3
# 7  B           3     1     4
# 8  B           3     3     5
# 9  C           4     2     1
#10  C           4     3     2
#11  C           4     4     3
#12  C           4     5     4
What I would like to obtain is different. For the first week of data collection, 1 should be reported, and for the students for whom data were collected during a second week, 2 should be reported, etc.:
#   Student Week Day Obs
#1        A    1   1   1
#2        A    1   2   1
#3        A    1   3   1
#4        B    2   2   1
#5        B    2   3   1
#6        B    2   5   1
#7        B    3   1   2
#8        B    3   3   2
#9        C    4   2   1
#10       C    4   3   1
#11       C    4   4   1
#12       C    4   5   1
One dplyr possibility could be:
fake.db %>%
  group_by(Student) %>%
  mutate(Obs = cumsum(!duplicated(Week)))
   Student  Week   Day   Obs
   <fct>   <dbl> <dbl> <int>
 1 A           1     1     1
 2 A           1     2     1
 3 A           1     3     1
 4 B           2     2     1
 5 B           2     3     1
 6 B           2     5     1
 7 B           3     1     2
 8 B           3     3     2
 9 C           4     2     1
10 C           4     3     1
11 C           4     4     1
12 C           4     5     1
It groups by "Student" column and calculates the cumulative sum of non-duplicate "Week" values.
Or:
fake.db %>%
  group_by(Student) %>%
  mutate(Obs = with(rle(Week), rep(seq_along(lengths), lengths)))
It groups by "Student" column and creates a run-length-type group ID based on the "Week" column.
Or:
fake.db %>%
  group_by(Student) %>%
  mutate(Obs = dense_rank(Week))
It groups by "Student" column and ranks the values in "Week" column.
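For comparison, a base R sketch of the same dense-rank idea (not part of the answers above) could use ave() with match():

# per student, rank each Week against the sorted unique weeks for that student
fake.db$Obs <- ave(fake.db$Week, fake.db$Student,
                   FUN = function(w) match(w, sort(unique(w))))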
What I understand the issue to be is that you want to count the weeks since the first test week for each student. I.e. Week 2 is student B's first week of testing, so it gets Obs = 1. That means you can do a grouped mutate:
library(dplyr)
fake.db <- structure(list(Student = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), Week = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4), Day = c(1, 2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)), class = "data.frame", row.names = c(NA, -12L))
fake.db %>%
  group_by(Student) %>%
  mutate(Obs = Week - min(Week) + 1)
#> # A tibble: 12 x 4
#> # Groups: Student [3]
#>    Student  Week   Day   Obs
#>    <fct>   <dbl> <dbl> <dbl>
#>  1 A           1     1     1
#>  2 A           1     2     1
#>  3 A           1     3     1
#>  4 B           2     2     1
#>  5 B           2     3     1
#>  6 B           2     5     1
#>  7 B           3     1     2
#>  8 B           3     3     2
#>  9 C           4     2     1
#> 10 C           4     3     1
#> 11 C           4     4     1
#> 12 C           4     5     1
Created on 2019-05-10 by the reprex package (v0.2.1)
A brief method with by
unlist(by(fake.db, fake.db[, 1], function(x) as.numeric(factor(x[, 2]))))
# A1 A2 A3 B1 B2 B3 B4 B5 C1 C2 C3 C4
#  1  1  1  1  1  1  2  2  1  1  1  1
Data
fake.db <- structure(list(Student = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
Week = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4), Day = c(1,
2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)), class = "data.frame", row.names = c(NA,
-12L))
You can see if there is a non-zero difference
fake.db %>%
  group_by(Student) %>%
  arrange(Week) %>%
  mutate(Obs = cumsum(c(1, diff(Week) != 0)))
or if the values aren't numeric, you can compare to the lagged value
fake.db %>%
  group_by(Student) %>%
  arrange(Week) %>%
  mutate(Obs = cumsum(Week != lag(Week, default = first(Week))) + 1)
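The same run-length idea is also available as rleid() in data.table, if that package is an option (a sketch; this assumes the rows are already ordered by Week within each Student):

library(data.table)
setDT(fake.db)[, Obs := rleid(Week), by = Student][]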
Consider the following example data:
tmp_df_dplyr <- data.frame(groups = rep(c("C", "B", "A"), each = 3),
                           a = c(-2, 0, -1, -1, 0, 1, 0, 1, 2),
                           b = rep(c(-1, 0, 1), each = 3))
I wish to do the following, except using colSums:
tmp_df_dplyr %>%
  group_by(groups) %>%
  summarise(min_group = min(c(sum(a), sum(b))))
# produces:
# A tibble: 3 × 2
  groups min_group
  <fctr>     <dbl>
1      A         3
2      B         0
3      C        -3
Using dot referencing, I get an unexpected result:
tmp_df_dplyr %>%
  group_by(groups) %>%
  summarise(min_group = min(colSums(.[, c('a', 'b')])))
# produces
# A tibble: 3 × 2
  groups min_group
  <fctr>     <dbl>
1      A         0
2      B         0
3      C         0
that is, it looks like the groups are not being applied.
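That is expected: inside the pipe, . refers to the whole piped-in data frame (all nine rows), not to the rows of the current group, so colSums(.[, c('a', 'b')]) is computed once over the full data (both column sums are 0) and the same value is reported for every group. A minimal sketch of one way to keep the computation per group is to build the matrix from the group's own columns:

tmp_df_dplyr %>%
  group_by(groups) %>%
  # cbind(a, b) only sees the current group's rows, so colSums is per group
  summarise(min_group = min(colSums(cbind(a, b))))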