How to combine multiple summary tables at once in R

Consider the following data frame:
set.seed(123)
dat <- data.frame(Region = rep(c("a", "b"), each = 100),
                  State = rep(c("NY", "MA", "FL", "GA"), each = 50),
                  Loc = rep(letters[1:20], each = 5),
                  ID = 1:200,
                  count1 = sample(4, 200, replace = TRUE),
                  count2 = sample(4, 200, replace = TRUE))
Region, State, and Loc are grouping variables for individual measurements, each of which has a unique ID number. For each grouping variable, I want to summarize the number of observations in each level of count1 and count2. Normally I would do one of the following for each pair:
#example for count1 and region:
library(tidyverse)
dat %>%
  dplyr::select(Region, count1) %>%
  group_by(count1, Region) %>%
  count()
##or
with(dat, table(Region, count1))
How can I do this for all combinations and wrap them into a single table (or at least a few tables grouped by equivalent lengths, since the lengths will differ depending on which grouping variable is being used)?

Try something like this:
Region1 <- dat %>% group_by(Region, count1) %>%
  summarise(TotalRegion1 = n())
State1 <- dat %>% group_by(State, count1) %>%
  summarise(TotalState1 = n())
Loc1 <- dat %>% group_by(Loc, count1) %>%
  summarise(TotalLoc1 = n())
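If you then want those three tables stacked into one, a minimal sketch (my own, not part of the answer above; the level, n, and grouping column names are assumptions) is to standardise the columns and bind the rows:
# hypothetical combining step: give each table the same column names,
# tag each with the grouping variable it came from, then stack them
bind_rows(
  Region1 %>% rename(level = Region, n = TotalRegion1) %>% mutate(grouping = "Region"),
  State1 %>% rename(level = State, n = TotalState1) %>% mutate(grouping = "State"),
  Loc1 %>% rename(level = Loc, n = TotalLoc1) %>% mutate(grouping = "Loc")
)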

You can try to get "all at once" (for count1) with
out <- dat %>%
  select(-ID, -count2) %>%
  pivot_longer(Region:Loc, names_to = "k", values_to = "v") %>%
  group_by(k, v, count1) %>%
  tally() %>%
  ungroup()
out %>%
  filter(k == "Region")
# # A tibble: 8 x 4
# k v count1 n
# <chr> <fct> <int> <int>
# 1 Region a 1 26
# 2 Region a 2 27
# 3 Region a 3 20
# 4 Region a 4 27
# 5 Region b 1 20
# 6 Region b 2 30
# 7 Region b 3 30
# 8 Region b 4 20
out
# # A tibble: 101 x 4
# k v count1 n
# <chr> <fct> <int> <int>
# 1 Loc a 2 5
# 2 Loc a 3 1
# 3 Loc a 4 4
# 4 Loc b 1 2
# 5 Loc b 2 2
# 6 Loc b 3 3
# 7 Loc b 4 3
# 8 Loc c 1 2
# 9 Loc c 2 2
# 10 Loc c 3 3
# # ... with 91 more rows
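The same trick extends to count1 and count2 together; a sketch of this (my own extension of the answer above, with out_all, count_var, and count_val as made-up names) pivots the two count columns long as well:
library(tidyverse)
# pivot the count columns long too, so one tally covers every
# grouping-variable / count-variable combination
out_all <- dat %>%
  select(-ID) %>%
  pivot_longer(c(count1, count2), names_to = "count_var", values_to = "count_val") %>%
  pivot_longer(Region:Loc, names_to = "k", values_to = "v") %>%
  count(count_var, k, v, count_val)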

Related

Binding rows based on common id

I have a very simple case where I want to combine several data frames into one, based on the common ids of a particular data frame.
Example:
id <- c(1, 2, 3)
x <- c(10, 12, 14)
data1 <- data.frame(id, x)
id <- c(2, 3)
x <- c(20, 22)
data2 <- data.frame(id, x)
id <- c(1, 3)
x <- c(30, 32)
data3 <- data.frame(id, x)
Which gives us,
$data1
id x
1 1 10
2 2 12
3 3 14
$data2
id x
1 2 20
2 3 22
$data3
id x
1 1 30
2 3 32
Now, I want to combine all three data frames based on the ids of data3. The expected output should look like:
> comb
id x
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
I am trying the following, but not getting the expected output.
library(dplyr)
library(tidyr)
combined <- bind_rows(data1, data2, data3, .id = "id") %>% arrange(id)
Any idea how to get the expected output?
Does this work?
library(dplyr)
library(tidyr)
data1 %>% full_join(data2, by = 'id') %>%
  full_join(data3, by = 'id') %>%
  arrange(id) %>%
  right_join(data3, by = 'id') %>%
  pivot_longer(cols = -id) %>%
  select(-name) %>%
  distinct()
# A tibble: 6 x 2
id value
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
Combine the three data frames into one with bind_rows() and use filter to select only the ids present in the third data frame.
library(dplyr)
library(tidyr)
bind_rows(data1, data2, data3, .id = "new_id") %>%
  filter(id %in% id[new_id == 3]) %>%
  complete(new_id, id)
# new_id id x
# <chr> <dbl> <dbl>
#1 1 1 10
#2 1 3 14
#3 2 1 NA
#4 2 3 22
#5 3 1 30
#6 3 3 32
A pure base R solution also works:
lst <- list(data1, data2, data3)
reshape(
  subset(
    reshape(
      do.call(rbind, Map(cbind, lst, grp = seq_along(lst))),
      idvar = "id",
      timevar = "grp",
      direction = "wide"
    ),
    id %in% lst[[3]]$id
  ),
  idvar = "id",
  varying = -1,
  direction = "long"
)[c("id", "x")]
which gives
id x
1.1 1 10
3.1 3 14
1.2 1 NA
3.2 3 22
1.3 1 30
3.3 3 32
Using base R
do.call(rbind, unname(lapply(mget(ls(pattern = "^data\\d+$")), \(x) {
  x1 <- subset(x, id %in% data3$id)
  v1 <- setdiff(data3$id, x1$id)
  if (length(v1) > 0) rbind(x1, cbind(id = v1, x = NA)) else x1
})))
Output:
id x
1 1 10
3 3 14
2 3 22
11 1 NA
12 1 30
21 3 32
bind_rows(data1, data2, data3, .id = 'grp') %>%
  complete(id, grp) %>%
  select(-grp) %>%
  filter(id %in% data3$id)
# A tibble: 6 x 2
id x
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
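For completeness, a base R sketch of the same "align everything to data3's ids" idea (my own, not one of the answers; the rows come out grouped by source data frame rather than sorted by id):
lst <- list(data1, data2, data3)
# left-join each frame onto data3's ids so missing ids become NA, then stack
do.call(rbind, lapply(lst, function(d) {
  merge(data.frame(id = data3$id), d, by = "id", all.x = TRUE)
}))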

Transpose and sum distinct values in R

Is there a way to transpose and sum distinct values in R? For example:
df
Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB
We have the same values for Order and Quantity here, but we still need to take their sum.
Expected Output (Transpose with respect to Quantity)
Cola Order Quantity LocA_Quantity LocB_Quantity
ABC      2        8             4             4
CSD      4        6             6
CDS      3        2                           2
Create the dataset:
library(tibble)
df = tribble(
  ~Cola, ~Order, ~Quantity, ~Loc,
  'ABC', 1, 4, 'LocA',
  'ABC', 1, 4, 'LocB',
  'CSD', 4, 6, 'LocA',
  'CDS', 3, 2, 'LocB'
)
Create the summaries:
library(dplyr)
df %>%
  group_by(Cola) %>%
  summarise(
    Order = sum(Order),
    LocA_Quantity = sum(Quantity * if_else(Loc == "LocA", 1, 0)),
    LocB_Quantity = sum(Quantity * if_else(Loc == "LocB", 1, 0)),
    Quantity = sum(Quantity)
  )
You can do it for both Quantity and Order and drop the columns you don't want at the end, i.e.
library(tidyverse)
df %>%
  group_by(Cola) %>%
  mutate_at(vars(2:3), list(new = sum)) %>%
  pivot_wider(names_from = Loc, values_from = 2:3)
## A tibble: 3 x 7
## Groups: Cola [3]
# Cola Order_new Quantity_new Order_LocA Order_LocB Quantity_LocA Quantity_LocB
# <fct> <int> <int> <int> <int> <int> <int>
#1 ABC 2 8 1 1 4 4
#2 CSD 4 6 4 NA 6 NA
#3 CDS 3 2 NA 3 NA 2
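With current dplyr (1.0 or later), where mutate_at() is superseded, the same idea can be sketched with across(); this is my adaptation rather than part of the original answer:
df %>%
  group_by(Cola) %>%
  # keep per-group totals alongside the original columns
  mutate(across(c(Order, Quantity), sum, .names = "{.col}_new")) %>%
  pivot_wider(names_from = Loc, values_from = c(Order, Quantity))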
1) dplyr/tidyr Using the data shown reproducibly in the Note at the end, sum the orders and quantity and create a Quantity_ column equal to Quantity by Cola. Then reshape the Quantity_ column to wide form.
library(dplyr)
library(tidyr)
df %>%
  group_by(Cola) %>%
  mutate(Quantity_ = Quantity,
         Order = sum(Order),
         Quantity = sum(Quantity)) %>%
  ungroup %>%
  pivot_wider(names_from = "Loc", values_from = "Quantity_",
              names_prefix = "Quantity_", values_fill = list(Quantity_ = 0))
giving:
# A tibble: 3 x 5
Cola Order Quantity Quantity_LocA Quantity_LocB
<chr> <int> <int> <int> <int>
1 ABC 2 8 4 4
2 CSD 4 6 6 0
3 CDS 3 2 0 2
2) Base R We can do much the same in base R using transform/ave and reshape like this:
df2 <- transform(df,
                 Quantity_ = Quantity,
                 Quantity = ave(Quantity, Cola, FUN = sum),
                 Order = ave(Order, Cola, FUN = sum))
wide <- reshape(df2, dir = "wide", idvar = c("Cola", "Quantity", "Order"),
                timevar = "Loc", sep = "")
wide
wide
## Cola Order Quantity Quantity_LocA Quantity_LocB
## 1 ABC 2 8 4 4
## 3 CSD 4 6 6 NA
## 4 CDS 3 2 NA 2
Note
Lines <- "Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB"
df <- read.table(text = Lines, header = TRUE, as.is = TRUE)
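A data.table sketch of the same wide summary (my own, assuming the df from this Note; dcast() names the new columns plain LocA/LocB rather than Quantity_LocA):
library(data.table)
dt <- as.data.table(df)
# per-Cola totals, keeping the raw Quantity around for the cast
dt[, `:=`(Order = sum(Order), Quantity_ = Quantity, Quantity = sum(Quantity)), by = Cola]
dcast(dt, Cola + Order + Quantity ~ Loc, value.var = "Quantity_", fill = 0)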

Pairwise count data from long format

Example data
I have the following data:
df <- data.frame(
  id = c('X1','X1','X1','X1','X2','X2','X2','X2'),
  pos = c(1,2,3,4,1,2,3,4),
  group = c(100,200,100,300,100,200,100,200)
)
Which thus looks like:
id pos group
1 X1 1 100
2 X1 2 200
3 X1 3 100
4 X1 4 300
5 X2 1 100
6 X2 2 200
7 X2 3 100
8 X2 4 200
What I'm trying to achieve
I want to plot this data using geom_segment(), where pos will be on the x-axis and group on the y-axis. Then, for each of these segments, I want to count how often they are present in the dataset (based on the id column). Doing this for the example dataset, the result would be:
pos1 pos2 group1 group2 id.count
1 2 100 200 2
2 3 200 100 2
3 4 100 300 1
3 4 100 200 1
I have no clue how to start with this; while I'm familiar with group_by from dplyr, I cannot figure out how to build the initial four columns.
If the ordering in your data set is as in your example, you can try this:
library(dplyr)
df %>% group_by(id) %>%
  transmute(pos1 = pos, pos2 = lead(pos),
            group1 = group, group2 = lead(group)) %>%
  na.omit() %>% ungroup() %>%
  count(pos1, pos2, group1, group2, name = "id.count")
# A tibble: 4 x 5
#    pos1  pos2 group1 group2 id.count
#   <dbl> <dbl>  <dbl>  <dbl>    <int>
# 1     1     2    100    200        2
# 2     2     3    200    100        2
# 3     3     4    100    200        1
# 4     3     4    100    300        1
I tried the following, which works, but I wonder if there is a more elegant solution:
# Simple stats
vals <- unique(df$pos)
min.val <- min(vals)
max.val <- max(vals)
# Combination
comb.df <- data.frame(
  pos1 = min.val:(max.val - 1),
  pos2 = (min.val + 1):max.val
)
# Combine
comb.df <- comb.df %>%
  left_join(df %>% select(pos1 = pos, group1 = group, id)) %>%
  left_join(df %>% select(pos2 = pos, group2 = group, id))
# Count
comb.df <- comb.df %>%
  group_by(pos1, pos2, group1, group2) %>%
  summarise(n.ids = n_distinct(id))
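The geom_segment() plot the question is aiming for could then be sketched like this (my own illustration, not from the answers; linewidth needs ggplot2 >= 3.4, use size in older versions):
library(ggplot2)
# each segment runs from (pos1, group1) to (pos2, group2),
# with line width mapped to how many ids share it
ggplot(comb.df, aes(x = pos1, xend = pos2, y = group1, yend = group2,
                    linewidth = n.ids)) +
  geom_segment() +
  labs(x = "pos", y = "group", linewidth = "id count")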

R dplyr::Filter dataframe by group and numeric vector?

I have dataframe df1 containing data and groups, and df2, which stores the same groups and one value per group.
I want to filter the rows of df1 using df2: keep the rows where the lagged difference within a group is higher than the value df2 indicates for that group.
Dummy example:
# identify the first year of disturbance by lag by group
df1 <- data.frame(year = c(1:4, 1:4),
                  mort = c(5, 16, 40, 4, 5, 6, 10, 108),
                  distance = rep(c("a", "b"), each = 4))
df2 <- data.frame(distance = c("a", "b"),
                  my.median = c(12, 1))
Now calculate the lag between values (creating a new column) and filter df1 based on the column values of df2:
# calculate lag between years
df1 %>%
  group_by(distance) %>%
  dplyr::mutate(yearLag = mort - lag(mort, default = 0)) %>%
  filter(yearLag > df2$my.median)
This, however, does not produce the expected results:
# A tibble: 3 x 4
# Groups: distance [2]
year mort distance yearLag
<int> <dbl> <fct> <dbl>
1 2 16 a 11
2 3 40 a 24
3 4 108 b 98
Instead, I expect to get:
# A tibble: 3 x 4
# Groups: distance [2]
year mort distance yearLag
<int> <dbl> <fct> <dbl>
1 3 40 a 24
2 1 5 b 5
3 3 10 b 4
The filter works great when applied to a single value, but how can I adapt it to a vector, and especially to a vector of groups (as the order of elements can potentially change)?
Is this what you're trying to do? (Comparing against df2$my.median directly just recycles that vector across rows; joining df2 first attaches the correct median to each row's group.)
df1 %>%
  group_by(distance) %>%
  dplyr::mutate(yearLag = mort - lag(mort, default = 0)) %>%
  left_join(df2) %>%
  filter(yearLag > my.median)
Result:
# A tibble: 4 x 5
# Groups: distance [2]
year mort distance yearLag my.median
<int> <dbl> <fct> <dbl> <dbl>
1 3 40 a 24 12
2 1 5 b 5 1
3 3 10 b 4 1
4 4 108 b 98 1
Here is a data.table approach:
library(data.table)
# create data.tables
setDT(df1); setDT(df2)
# create yearLag variable
df1[, yearLag := mort - shift(mort, type = "lag", fill = 0), by = .(distance)]
# update join, then filter the wanted rows
df1[df2, median.value := i.my.median, on = .(distance)][yearLag > median.value, ][]
# year mort distance yearLag median.value
# 1: 3 40 a 24 12
# 2: 1 5 b 5 1
# 3: 3 10 b 4 1
# 4: 4 108 b 98 1
Came to the same conclusion. You should left_join the data frames.
df1 %>% left_join(df2, by = "distance") %>%
  group_by(distance) %>%
  dplyr::mutate(yearLag = mort - lag(mort, default = 0)) %>%
  filter(yearLag > my.median)
# A tibble: 4 x 5
# Groups: distance [2]
year mort distance my.median yearLag
<int> <dbl> <fct> <dbl> <dbl>
1 3 40 a 12 24
2 1 5 b 1 5
3 3 10 b 1 4
4 4 108 b 1 98
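For completeness, a base R sketch of the same join-then-filter logic (my own, not one of the answers):
# lagged difference per group, with 0 as the default for each group's first row
df1$yearLag <- with(df1, ave(mort, distance, FUN = function(x) x - c(0, head(x, -1))))
# attach each group's median, then keep the rows above it
m <- merge(df1, df2, by = "distance")
m[m$yearLag > m$my.median, ]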

Sum multiple variables by group and create new column with their sum

I have a data frame with a grouping variable and several numeric columns, and I want to sum them by group. That's easy with dplyr.
library(dplyr)
library(magrittr)
data <- data.frame(group = c("a", "a", "b", "c", "c"), n1 = 1:5, n2 = 2:6)
data %>% group_by(group) %>%
  summarise_all(sum)
# A tibble: 3 x 3
group n1 n2
<fctr> <int> <int>
1 a 3 5
2 b 3 4
3 c 9 11
But now I want a new column ttl with the sum of n1 and n2 by group, like this:
# A tibble: 3 x 4
   group    n1    n2   ttl
  <fctr> <int> <int> <int>
1      a     3     5     8
2      b     3     4     7
3      c     9    11    20
How can I do that with dplyr?
EDIT:
Actually, this is just an example; I have a lot of variables.
I tried these two pieces of code, but the result doesn't have the right dimensions...
data %>% group_by(group) %>%
  summarise_all(sum) %>%
  summarise_if(is.numeric, sum)
data %>% group_by(group) %>%
  summarise_all(sum) %>%
  mutate_if(is.numeric, .funs = sum)
You can use mutate after summarize:
data %>%
  group_by(group) %>%
  summarise_all(sum) %>%
  mutate(tt1 = n1 + n2)
# A tibble: 3 x 4
# group n1 n2 tt1
# <fctr> <int> <int> <int>
#1 a 3 5 8
#2 b 3 4 7
#3 c 9 11 20
If you need to sum all numeric columns, you can use rowSums with select_if (to select the numeric columns):
data %>%
  group_by(group) %>%
  summarise_all(sum) %>%
  mutate(tt1 = rowSums(select_if(., is.numeric)))
# A tibble: 3 x 4
# group n1 n2 tt1
# <fctr> <int> <int> <dbl>
#1 a 3 5 8
#2 b 3 4 7
#3 c 9 11 20
We can use apply together with the dplyr functions.
data <- data.frame(group = c("a", "a", "b", "c", "c"), n1 = 1:5, n2 = 2:6)
data %>% group_by(group) %>%
  summarise_all(sum) %>%
  mutate(ttl = apply(.[, 2:ncol(.)], 1, sum))
# A tibble: 3 × 4
group n1 n2 ttl
<fctr> <int> <int> <int>
1 a 3 5 8
2 b 3 4 7
3 c 9 11 20
Or rowSums with the same strategy. The key is to use . to refer to the data frame and [ with 2:ncol(.) to keep the columns you want.
data %>% group_by(group) %>%
  summarise_all(sum) %>%
  mutate(ttl = rowSums(.[, 2:ncol(.)]))
# A tibble: 3 × 4
group n1 n2 ttl
<fctr> <int> <int> <dbl>
1 a 3 5 8
2 b 3 4 7
3 c 9 11 20
Base R
cbind(aggregate(.~group, data, sum), ttl = sapply(split(data[,-1], data$group), sum))
# group n1 n2 ttl
#a a 3 5 8
#b b 3 4 7
#c c 9 11 20
We can use data.table. Convert the 'data.frame' to a 'data.table' (setDT(data)), group by 'group', get the sum of each column in the subset of the data.table (.SD), and then, with Reduce, get the row sums of the columns of interest.
library(data.table)
setDT(data)[, lapply(.SD, sum), group][, tt1 := Reduce(`+`, .SD),
                                       .SDcols = names(data)[-1]][]
# group n1 n2 tt1
#1: a 3 5 8
#2: b 3 4 7
#3: c 9 11 20
Or with base R
addmargins(as.matrix(rowsum(data[-1], data$group)), 2)
# n1 n2 Sum
#a 3 5 8
#b 3 4 7
#c 9 11 20
Or with dplyr
data %>%
  group_by(group) %>%
  summarise_all(sum) %>%
  mutate(tt = rowSums(.[-1]))
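With current dplyr (1.0 or later), where summarise_all() is superseded, a sketch of the same result using across() (my adaptation, not one of the answers):
data %>%
  group_by(group) %>%
  # sum every non-grouping column, then total the numeric columns row-wise
  summarise(across(everything(), sum)) %>%
  mutate(ttl = rowSums(across(where(is.numeric))))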
