R Summarize With data.table

data=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'))
Say you have data as shown above and you want summary tables like this (counts and proportions for each level of Class and Type).
What is an effective way to do this and write the output to a CSV in an organized layout, such as shown above?
Example data in case weights are involved and you want weighted counts and proportions:
data1=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'),
"Weighting"=c(10,6,13,12,2))

One option is map_dfr from purrr:
library(dplyr)
library(purrr)
map_dfr(names(data)[2:3], ~
  data %>%
    select(.x) %>%
    group_by_at(.x) %>%
    summarise(COUNT = n()) %>%
    mutate(PROP = COUNT/sum(COUNT)))
# A tibble: 5 x 4
# Class COUNT PROP Type
#* <dbl> <int> <dbl> <fct>
#1 1 1 0.2 <NA>
#2 2 2 0.4 <NA>
#3 3 2 0.4 <NA>
#4 NA 2 0.4 A
#5 NA 3 0.6 B
Or with data.table by melting into 'long' format
library(data.table)
melt(setDT(data), id.var = 'StudentID')[, .(COUNT = .N),
.(variable, value)][, PROP := COUNT/sum(COUNT),.(variable)][]
Or with base R using table and prop.table
lapply(data[-1], function(x) {x1 <- table(x); x2 <- prop.table(x1); cbind(COUNT = x1, PROP = x2)})
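For the weighted variant with data1, none of this is shown in the original answers, but a possible sketch follows the same map_dfr pattern, assuming a weighted count is simply the sum of Weighting within each level:
library(dplyr)
library(purrr)
# Weighted counts and proportions per level of Class and Type
# (assumption: weighted count = sum of Weighting within each level)
map_dfr(names(data1)[2:3], ~
  data1 %>%
    group_by_at(.x) %>%
    summarise(W_COUNT = sum(Weighting)) %>%
    mutate(W_PROP = W_COUNT / sum(W_COUNT)))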

Both summaries are simple; here I use dplyr. To combine them in the way you want, they need to be stitched together in a somewhat inelegant way. You can remove the col1 name afterwards if you want.
library(dplyr)
df1 <- data %>% group_by(Class) %>%
  summarise(Count = n(), Prop = n() / nrow(data))
df2 <- data %>% group_by(Type) %>%
  summarise(Count = n(), Prop = n() / nrow(data))
names(df1)[1] <- 'col1'
names(df2)[1] <- 'col1'
rbind(
  c('Class', '', ''),
  df1,
  c('Type', '', ''),
  df2
)
# A tibble: 7 x 3
col1 Count Prop
<chr> <chr> <chr>
1 Class "" ""
2 1 1 0.2
3 2 2 0.4
4 3 2 0.4
5 Type "" ""
6 A 2 0.4
7 B 3 0.6
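To get the combined table into a CSV as the question asks, one possible sketch (the file name is only a placeholder) stores the rbind result and passes it to write.csv:
# Store the combined summary shown above and write it without row names,
# so the CSV keeps the three-column layout (file name is only an example)
out <- rbind(
  c('Class', '', ''),
  df1,
  c('Type', '', ''),
  df2
)
write.csv(out, "summary.csv", row.names = FALSE)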

Related

Aggregate string variable using summarise and across function

df_input is the input data frame, and the desired output is df_output.
df_input <- data.frame(id = c(1,2,3,4,4,5,5,5,6,7,8,9,10),
party = c("A","B","C","D","E","F","G","H","I","J","K","L","M"),
winner= c(1,1,1,1,1,1,1,1,1,1,1,1,1))
df_output <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
party = c("A","B","C","D,E","F_G_H","I","J","K","L","M"),
winner_sum = c(1,1,1,2,3,1,1,1,1,1))
Previously the code worked using the "summarise_at" function as follows:
df_output <- df_input %>%
  dplyr::group_by_at(.vars = vars(id)) %>%
  {left_join(
    dplyr::summarise_at(., vars(party), ~ str_c(., collapse = ",")),
    dplyr::summarise_at(., vars(winner), funs(sum))
  )}
But it no longer works, as it seems both "summarise_at" and "funs" have been deprecated.
I am trying to replicate using across with dplyr (1.0.10), but I am getting an error. Here is my attempt:
df_output <- df_input %>%
  group_by(id) %>%
  summarise(across(winner, sum, na.rm=T)) %>%
  summarise(across(party, str_c(., collapse = ",")))
I have multiple numeric and character variables, not just one as in the example. Thanks a lot.
We don't need across if we are applying different functions to single columns:
library(dplyr)
library(stringr)
df_input %>%
  group_by(id) %>%
  summarise(party = str_c(party, collapse = ","),
            winner_sum = sum(winner))
Output:
# A tibble: 10 × 3
id party winner_sum
<dbl> <chr> <dbl>
1 1 A 1
2 2 B 1
3 3 C 1
4 4 D,E 2
5 5 F,G,H 3
6 6 I 1
7 7 J 1
8 8 K 1
9 9 L 1
10 10 M 1
If there are multiple 'party' and 'winner' columns, loop across them within a single summarise, because after the first summarise only the summarised column and the grouping column remain:
df_input %>%
  group_by(id) %>%
  summarise(across(winner, sum, na.rm=TRUE),
            across(party, ~ str_c(.x, collapse = ",")), .groups = "drop")
Output:
# A tibble: 10 × 3
id winner party
<dbl> <dbl> <chr>
1 1 1 A
2 2 1 B
3 3 1 C
4 4 2 D,E
5 5 3 F,G,H
6 6 1 I
7 7 1 J
8 8 1 K
9 9 1 L
10 10 1 M
NOTE: If the columns share a similar prefix, use starts_with to select all of them, i.e. across(starts_with("party"), ...); if the column names differ, use across(c(party, othercol), ...); or if the functions are applied based on column type, use across(where(is.numeric), sum, na.rm = TRUE).
df_input %>%
  group_by(id) %>%
  summarise(across(where(is.numeric), sum, na.rm = TRUE),
            across(where(is.character), str_c, collapse = ","),
            .groups = 'drop')

Replacing NA values with mode from multiple imputation in R

I ran 5 imputations on a data set with missing values. For my purposes, I want to replace missing values with the mode from the 5 imputations. Let's say I have the following data sets, where df is my original data, ID is a grouping variable to identify each case, and imp is my imputed data:
df <- data.frame(ID = c(1,2,3,4,5),
var1 = c(1,NA,3,6,NA),
var2 = c(NA,1,2,6,6),
var3 = c(NA,2,NA,4,3))
imp <- data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5),
var1 = c(1,2,3,3,2,5,4,5,6,6,7,2,3,2,5,6,5,6,6,6,3,1,2,3,2),
var2 = c(4,3,2,3,2,4,6,5,4,4,7,2,4,2,3,6,5,6,4,5,3,3,4,3,2),
var3 = c(7,6,5,6,6,2,3,2,4,2,5,4,5,3,5,1,2,1,3,2,1,2,1,1,1))
I have a method that works, but it involves a ton of manual coding as I have ~200 variables total (I'm doing this on 3 different data sets with different variables). My code looks like this for one variable:
library(dplyr)
mode <- function(codes){
  which.max(tabulate(codes))
}
var1 <- imp %>% group_by(ID) %>% summarise(var1 = mode(var1))
df3 <- df %>%
  left_join(var1, by = "ID") %>%
  mutate(var1 = coalesce(var1.x, var1.y)) %>%
  select(-var1.x, -var1.y)
Thus, the original value in df is replaced with the mode only if the value was NA.
It is taking forever to keep manually coding this for every variable. I'm hoping there is an easier way to calculate the mode from the imputed data set for each variable by ID and then replace the NAs with that mode in the original data. I thought maybe I could put the variable names in a vector and somehow iterate through them with one piece of code, where i changes to each variable name, but I didn't know where to go with that idea.
x <- colnames(df)
# Attempting to iterate through variables names using i
i = as.factor(x[[2]])
This is where I am stuck. Any help is much appreciated!
Here is one option using the tidyverse. Essentially, we can pivot both data frames long, then join them together and coalesce in one step rather than column by column. The Mode function is taken from here.
library(tidyverse)
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
imp_long <- imp %>%
  group_by(ID) %>%
  summarise(across(everything(), Mode)) %>%
  pivot_longer(-ID)
df %>%
  pivot_longer(-ID) %>%
  left_join(imp_long, by = c("ID", "name")) %>%
  mutate(var1 = coalesce(value.x, value.y)) %>%
  select(-c(value.x, value.y)) %>%
  pivot_wider(names_from = "name", values_from = "var1")
Output
# A tibble: 5 × 4
ID var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 6
2 2 5 1 2
3 3 3 2 5
4 4 6 6 4
5 5 3 6 3
You can use -
library(dplyr)
mode_data <- imp %>%
  group_by(ID) %>%
  summarise(across(starts_with('var'), Mode))
df %>%
  left_join(mode_data, by = 'ID') %>%
  transmute(ID,
            across(matches('\\.x$'),
                   function(x) coalesce(x, .[[sub('x$', 'y', cur_column())]]),
                   .names = '{sub(".x$", "", .col)}'))
# ID var1 var2 var3
#1 1 1 3 6
#2 2 5 1 2
#3 3 3 2 5
#4 4 6 6 4
#5 5 3 6 3
mode_data has the Mode value for each of the var columns.
Join df and mode_data by ID.
Since all the pairs have name.x and name.y in their names, we can take each name.x column and replace x with y to get the corresponding paired column (.[[sub('x$', 'y', cur_column())]]).
Use coalesce to select the non-NA value in each pair.
Change the column names by removing .x ({sub(".x$", "", .col)}), so var1.x becomes just var1.
where Mode function is taken from here
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
library(dplyr, warn.conflicts = FALSE)
imp %>%
  group_by(ID) %>%
  summarise(across(everything(), Mode)) %>%
  bind_rows(df) %>%
  group_by(ID) %>%
  summarise(across(everything(), ~ coalesce(last(.x), first(.x))))
#> # A tibble: 5 × 4
#> ID var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 3 6
#> 2 2 5 1 2
#> 3 3 3 2 5
#> 4 4 6 6 4
#> 5 5 3 6 3
Created on 2022-01-03 by the reprex package (v2.0.1)
where Mode is the same helper as above:
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
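If you prefer data.table, a possible sketch of the same idea (not from the original answers) computes per-ID modes with the Mode helper and then coalesces them into df via an update join; fcoalesce and the i. prefix are standard data.table features, the rest mirrors the dplyr answers:
library(data.table)
# Per-ID modes for every var column (reuses the Mode helper defined above)
mode_dt <- as.data.table(imp)[, lapply(.SD, Mode), by = ID]
# Update join: keep the original value where present, otherwise fall back to the mode
df_dt <- as.data.table(df)
for (v in setdiff(names(df_dt), "ID")) {
  df_dt[mode_dt, (v) := fcoalesce(get(v), get(paste0("i.", v))), on = "ID"]
}
df_dt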

How to group consecutive rows having same event and find average?

Value <- c(2,3,5,2,5,8,17,3,5)
Event <- c(1,1,2,2,2,1,1,2,2)
group <- data.frame(pt=Value, Event=Event)
I have data like the above, and I would like to group the events in a way that results in the following.
Event Value
1 2.5
2 4
1 12.5
2 4
Thank you in advance.
We can create a grouping variable with rleid from data.table, use that to get the mean of 'pt', and return the first value of 'Event':
library(dplyr)
library(data.table)
group %>%
  group_by(grp = rleid(Event)) %>%
  summarise(Event = first(Event), Value = mean(pt)) %>%
  select(-grp)
# A tibble: 4 x 2
# Event Value
# <dbl> <dbl>
#1 1 2.5
#2 2 4
#3 1 12.5
#4 2 4
Or using tapply/rle in base R
with(group, tapply(pt, with(rle(Event),
rep(seq_along(values), lengths)), FUN = mean))
# 1 2 3 4
# 2.5 4.0 12.5 4.0
Another option is using cumsum:
group %>%
  mutate(gr = cumsum(Event != lag(Event, default = 0))) %>%
  group_by(gr, Event) %>%
  summarise(mean_Value = mean(pt)) %>%
  ungroup()
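A pure data.table version of the same rleid idea (a sketch, not part of the original answers):
library(data.table)
# Group by runs of consecutive identical Event values, then drop the run id
as.data.table(group)[, .(Value = mean(pt)), by = .(grp = rleid(Event), Event)][, grp := NULL][]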

R: Break a data.frame according to value of column with dplyr

I have this data.frame
MWE <- data.frame(x = c("a", "a", "a", "b", "b", "b"), y = c(1,2,3,4,5,6))
and what I want to obtain is this data.frame
data.frame(a = c(1,2,3), b = c(4,5,6))
Actually, what I originally want is to sum the 2 vectors a and b (in reality I have many more vectors, but it is easier to explain with only 2), which is why I thought of this transformation. I can then do a rowSums, or something equivalent.
I tried to use pivot_wider from tidyr but I had an error.
Any idea of how to do this with dplyr or tidyr?
Continuing from @Mr.Flick's tidyverse attempt, you could create an id column and, grouping by that id column, calculate the sum like this:
library(dplyr)
MWE %>%
  group_by(x) %>%
  mutate(row = row_number()) %>%
  group_by(row) %>%
  mutate(total_sum = sum(y)) %>%
  tidyr::pivot_wider(names_from = x, values_from = y) %>%
  ungroup() %>%
  select(-row)
# A tibble: 3 x 3
# total_sum a b
# <dbl> <dbl> <dbl>
#1 5 1 4
#2 7 2 5
#3 9 3 6
We can use unstack from base R
unstack(MWE, y ~ x)
# a b
#1 1 4
#2 2 5
#3 3 6
Or using rowid from data.table with pivot_wider from tidyr
library(dplyr)
library(data.table)
library(tidyr)
MWE %>%
  mutate(rn = rowid(x)) %>%
  pivot_wider(names_from = x, values_from = y) %>%
  select(-rn)
# A tibble: 3 x 2
# a b
# <dbl> <dbl>
#1 1 4
#2 2 5
#3 3 6
Using base R:
data.frame(with(MWE, split(y, x)))
a b
1 1 4
2 2 5
3 3 6
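Since the stated end goal was to sum the vectors, any of these reshaped results can be fed to rowSums; for example, with the base R split() version:
# Row-wise sums across the reshaped columns
wide <- data.frame(with(MWE, split(y, x)))
rowSums(wide)
# 1 2 3
# 5 7 9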

How to mutate() a list of columns using map2() in dplyr

I recently had to compile a data frame of student scores (one row per student, an id column, and several integer-valued columns, one per score component). I had to combine a "master" data frame and several "correction" data frames (containing mostly NA values and some updates to the master), so that the result contains the maximum of the master values and all corrections.
I succeeded by copy-pasting a sequence of mutate() calls, which works (see the example below) but is not elegant in my opinion. What I wanted instead of copying and pasting was to use something along the lines of map2 and two lists of columns to compare the columns pair-wise. Something like this (which obviously does not work as such):
list_of_cols1 <- list(col1.x, col2.x, col3.x)
list_of_cols2 <- list(col1.y, col2.y, col3.y)
map2(list_of_cols1, list_of_cols2, ~ column = pmax(.x, .y, na.rm=T))
I can't seem to figure out how to do it. My question is: how do I specify such lists of columns and mutate them in one map2() call in a dplyr pipe, or is it even possible? Have I gotten it all wrong?
Minimum working example
library(tidyverse)
master <- tibble(
id=c(1,2,3),
col1=c(1,1,1),
col2=c(2,2,2),
col3=c(3,3,3)
)
correction1 <- tibble(
id=seq(1,3),
col1=c(NA, NA, 2 ),
col2=c( 1, NA, 3 ),
col3=c(NA, NA, NA)
)
result <- reduce(
  # Ultimately there would be several correction data frames
  list(master, correction1),
  function(x, y) {
    x <- x %>%
      left_join(
        y,
        by = c("id")
      ) %>%
      # Wish I knew how to do this mutate call with map2
      mutate(
        col1 = pmax(col1.x, col1.y, na.rm=T),
        col2 = pmax(col2.x, col2.y, na.rm=T),
        col3 = pmax(col3.x, col3.y, na.rm=T)
      ) %>%
      select(id, col1:col3)
  }
)
The result is
> result
# A tibble: 3 x 4
id col1 col2 col3
<int> <dbl> <dbl> <dbl>
1 1 1 2 3
2 2 1 2 3
3 3 2 3 3
Rather than do a left_join, just bind the rows then summarize. For example
result <- reduce(
  list(master, correction1),
  function(x, y) {
    bind_rows(x, y) %>%
      group_by(id) %>%
      summarize_all(max, na.rm=T)
  }
)
result
# id col1 col2 col3
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 3
# 2 2 1 2 3
# 3 3 2 3 3
Actually, you don't even need reduce as bind_rows can take a list
Adding another table
correction2 <- tibble(id=2,col1=NA,col2=8,col3=NA)
bind_rows(master, correction1, correction2) %>%
  group_by(id) %>%
  summarize_all(max, na.rm=T)
Sorry, this doesn't answer your question about map2, but I find it's easier to aggregate over rows than over columns in tidy R:
library(dplyr)
master <- tibble(
id=c(1,2,3),
col1=c(1,1,1),
col2=c(2,2,2),
col3=c(3,3,3)
)
correction1 <- tibble(
id=seq(1,3),
col1=c(NA, NA, 2 ),
col2=c( 1, NA, 3 ),
col3=c(NA, NA, NA)
)
result <- list(master, correction1) %>%
  bind_rows() %>%
  group_by(id) %>%
  summarise_all(max, na.rm = TRUE)
result
#> # A tibble: 3 x 4
#> id col1 col2 col3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 3
#> 2 2 1 2 3
#> 3 3 2 3 3
If correction tables will always have the same structure as master, you can do something like the following:
library(dplyr)
library(purrr)
update_master = function(...){
  map(list(...), as.matrix) %>%
    reduce(pmax, na.rm = TRUE) %>%
    data.frame()
}
update_master(master, correction1)
To allow id to take character values, make the following modification:
update_master = function(x, ...){
  map(list(x, ...), function(x) as.matrix(x[-1])) %>%
    reduce(pmax, na.rm = TRUE) %>%
    data.frame(id = x[[1]], .)
}
update_master(master, correction1)
Result:
id col1 col2 col3
1 1 1 2 3
2 2 1 2 3
3 3 2 3 3
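None of the answers uses map2 directly; for completeness, a possible sketch that applies pmax over pairs of joined columns with map2_dfc (assuming the value columns are col1..col3 as in the example; map2_dfc is superseded in newer purrr but still works). The id column is kept untouched and only the value columns are maximised pair-wise:
library(dplyr)
library(purrr)
joined <- left_join(master, correction1, by = "id")
cols <- c("col1", "col2", "col3")
bind_cols(
  joined["id"],
  # pair each .x column with its .y counterpart and take the element-wise max
  map2_dfc(joined[paste0(cols, ".x")], joined[paste0(cols, ".y")],
           ~ pmax(.x, .y, na.rm = TRUE)) %>%
    set_names(cols)
)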
