Converting one-hot encoded data to aggregate in dplyr - r

I have age columns like so that are dummy encoded.
How can I aggregate this information to get per-column counts using dplyr?
Input:
age_010 age_11-20 age_2130 age_3140 age_41-50 age_5160
0 1 0 0 0 0
0 0 1 0 0 0
0 0 0 1 0 0
0 1 0 0 0 0
0 0 0 0 0 1
Expected Output:
age n
age_010 0
age_11-20 2
age_2130 1
age_3140 1
age_41-50 0
age_5160 1

We may do the column wise sum
v1 <- colSums(df1)
data.frame(age = names(v1), n = unname(v1))
-output
age n
1 age_010 0
2 age_11.20 2
3 age_2130 1
4 age_3140 1
5 age_41.50 0
6 age_5160 1
If we want the tidyverse, do the sum across all the columns and then reshape to 'long' with pivot_longer
library(dplyr)
library(tidyr)
df1 %>%
summarise(across(everything(), sum)) %>%
pivot_longer(cols = everything(), names_to = 'age', values_to = 'n')
# A tibble: 6 × 2
age n
<chr> <int>
1 age_010 0
2 age_11.20 2
3 age_2130 1
4 age_3140 1
5 age_41.50 0
6 age_5160 1
data
df1 <- structure(list(age_010 = c(0L, 0L, 0L, 0L, 0L), age_11.20 = c(1L,
0L, 0L, 1L, 0L), age_2130 = c(0L, 1L, 0L, 0L, 0L), age_3140 = c(0L,
0L, 1L, 0L, 0L), age_41.50 = c(0L, 0L, 0L, 0L, 0L), age_5160 = c(0L,
0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L
))

Related

Transforming a one-hot encoded variable to one column

I have age columns like so that are dummy encoded.
How can I transform these columns to one column using dplyr?
Input:
age_0-10 age_11-20 age_21-30 age_31-40 age_41-50 age_51-60 gender
1 0 1 0 0 0 0 0
2 0 0 1 0 0 0 1
3 0 0 0 1 0 0 0
4 0 1 0 0 0 0 1
5 0 0 0 0 0 1 1
Expected output:
age gender
1 11-20 0
2 21-30 1
3 31-40 0
4 11-20 1
5 51-60 1
A possible solution, now using names_prefix thanks to @Adam's comment:
library(tidyverse)
df <- data.frame(
check.names = FALSE,
`age_0-10` = c(0L, 0L, 0L, 0L, 0L),
`age_11-20` = c(1L, 0L, 0L, 1L, 0L),
`age_21-30` = c(0L, 1L, 0L, 0L, 0L),
`age_31-40` = c(0L, 0L, 1L, 0L, 0L),
`age_41-50` = c(0L, 0L, 0L, 0L, 0L),
`age_51-60` = c(0L, 0L, 0L, 0L, 1L),
gender = c(0L, 1L, 0L, 1L, 1L)
)
# Collapse the one-hot `age_*` columns into a single `age` column:
# reshape to long form (stripping the "age_" prefix from the names),
# keep only the rows whose indicator is set, then drop the indicator itself.
df %>%
  pivot_longer(
    cols = starts_with("age"), # spelled out: `col =` only worked via partial matching
    names_to = "age",
    names_prefix = "age_"
  ) %>%
  filter(value == 1) %>%
  select(age, gender)
#> # A tibble: 5 × 2
#> age gender
#> <chr> <int>
#> 1 11-20 0
#> 2 21-30 1
#> 3 31-40 0
#> 4 11-20 1
#> 5 51-60 1
Here is a way in dplyr using c_across().
library(dplyr)
library(stringr)
df %>%
rowwise() %>%
mutate(age = str_remove(names(.)[which(c_across(starts_with("age")) == 1)], "^age_")) %>%
ungroup() %>%
select(age, gender)
# # A tibble: 5 x 2
# age gender
# <chr> <int>
# 1 11-20 0
# 2 21-30 1
# 3 31-40 0
# 4 11-20 1
# 5 51-60 1
Try the base R code below using max.col
cbind(
age = gsub("^age_", "", head(names(df), -1)[max.col(df[-ncol(df)])]),
df[ncol(df)]
)
which gives
age gender
1 11-20 0
2 21-30 1
3 31-40 0
4 11-20 1
5 51-60 1
Here is another tidyverse solution:
library(dplyr)
library(purrr)
df %>%
mutate(age = pmap_chr(select(cur_data(), !gender),
~ names(df)[-ncol(df)][as.logical(c(...))])) %>%
select(age, gender)
age gender
1 age_11-20 0
2 age_21-30 1
3 age_31-40 0
4 age_11-20 1
5 age_51-60 1

create new column with mutate() when value in any of several other columns is TRUE (or 1)

I have a dataframe (my_dataframe) with 5 columns. All have 0 or 1 values. I would like to create a new column called cn7_any, which should have values of 1 when any values from columns 2:5 are ==1.
structure(list(cn7_normal = c(1L, 1L, 1L, 1L, 1L, 1L),
cn7_right_paralysis_central = c(0L, 0L, 0L, 0L, 0L, 0L),
cn7_right_paralysis_peripheral = c(0L, 0L, 0L, 0L, 0L, 0L),
cn7_left_paralysis_central = c(0L, 0L, 0L, 0L, 0L, 0L),
cn7_left_paralysis_peripheral = c(0L, 0L, 0L, 0L, 0L, 0L)),
row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
> head(my_dataframe)
# A tibble: 6 x 5
cn7_normal cn7_right_paralysis_cen… cn7_right_paralysis_perip… cn7_left_paralysis_cen… cn7_left_paralysis_peri…
<int> <int> <int> <int> <int>
1 1 0 0 0 0
2 1 0 0 0 0
I could do it successfully with case_when():
my_dataframe<-my_dataframe%>%
mutate(cn7_paralisis_any=case_when(cn7_right_paralysis_central==1 ~ 1,
cn7_right_paralysis_peripheral==1 ~ 1,
cn7_left_paralysis_central==1 ~ 1,
cn7_left_paralysis_peripheral==1 ~ 1,
TRUE ~ 0)
)
Although it worked, I wonder whether there is a simpler, less verbose solution. I feel I should be using any() somehow. Any ideas?
my_dataframe$cn7_any <- apply(my_dataframe[ , 2:5], 1, max)
Your data is all zeroes, so I'll change a couple to prove the point.
rowSums(my_dataframe[,2:5]) > 0
# [1] FALSE TRUE FALSE TRUE FALSE FALSE
+(rowSums(my_dataframe[,2:5]) > 0)
# [1] 0 1 0 1 0 0
my_dataframe$cn7_any <- +(rowSums(my_dataframe[,2:5]) > 0)
Within dplyr,
my_dataframe %>%
mutate(cn7_any = rowSums(across(-cn7_normal, ~ . > 0)) > 0)
# # A tibble: 6 x 6
# cn7_normal cn7_right_paralysis_central cn7_right_paralysis_peripheral cn7_left_paralysis_central cn7_left_paralysis_peripheral cn7_any
# <int> <int> <int> <int> <int> <lgl>
# 1 1 0 0 0 0 FALSE
# 2 1 0 0 0 1 TRUE
# 3 1 0 0 0 0 FALSE
# 4 1 0 0 1 0 TRUE
# 5 1 0 0 0 0 FALSE
# 6 1 0 0 0 0 FALSE
It seems like a logical thing you're doing, not a number thing, but if you want numbers, just use the +(.) trick as above:
my_dataframe %>%
mutate(cn7_any = +(rowSums(across(-cn7_normal, ~ . > 0)) > 0))
Similar to Using any() vs | in dplyr::mutate
I also changed a few digits in your dataset.
V2: Using or |
V3: Using dplyr::rowwise() prior to mutate to effectively group the input by rows, then using the any() function (without rowwise(), any() looks at the entire vectors, which is why you get the unexpected result)
my_dataframe<-structure(list(cn7_normal = c(1L, 1L, 1L, 1L, 1L, 1L),
cn7_right_paralysis_central = c(0L, 0L, 0L, 0L, 0L, 0L),
cn7_right_paralysis_peripheral = c(1L, 0L, 0L, 0L, 0L, 0L),
cn7_left_paralysis_central = c(0L, 1L, 0L, 0L, 0L, 0L),
cn7_left_paralysis_peripheral = c(0L, 0L, 0L, 0L, 0L, 0L)),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame"))
my_dataframe%>%
rowwise() %>% ### rowwise ###
mutate(cn7_paralisis_any=case_when(cn7_right_paralysis_central==1 ~ 1,
cn7_right_paralysis_peripheral==1 ~ 1,
cn7_left_paralysis_central==1 ~ 1,
cn7_left_paralysis_peripheral==1 ~ 1,
TRUE ~ 0),
cn7_v2=(cn7_right_paralysis_central|cn7_right_paralysis_peripheral|cn7_left_paralysis_central|cn7_left_paralysis_peripheral),
cn7_v3=any(cn7_right_paralysis_central ,cn7_right_paralysis_peripheral, cn7_left_paralysis_central, cn7_left_paralysis_peripheral)
) %>%
select(cn7_paralisis_any,cn7_v2,cn7_v3)
# A tibble: 6 x 3
# Rowwise:
# cn7_paralisis_any cn7_v2 cn7_v3
# <dbl> <lgl> <lgl>
#1 1 TRUE TRUE
#2 1 TRUE TRUE
#3 0 FALSE FALSE
#4 0 FALSE FALSE
#5 0 FALSE FALSE
#6 0 FALSE FALSE
I now use dplyr::if_any and dplyr::if_all in such cases. I think it makes the code very clear and readable whenever we must perform such rowwise logical operations in dplyr.
For this particular case, I would now use:
library(dplyr)
# Flag rows where any column other than `cn7_normal` equals 1.
# if_any() takes the column selection and the predicate directly; it must not
# be wrapped around across(). The unary `+` converts the logical result to 0/1.
my_dataframe %>%
  mutate(cn7_paralisis_any = +if_any(-cn7_normal, ~ .x == 1))

R: Convert yearly indicator variables to single year variable

I have a data frame that has a set of columns which are indicator variables for a given year. For instance, the "d80" column is 1 for rows where the year was 1980 and 0 otherwise.
# Print the name and the first five values of each year-indicator column.
# R vectors are 1-indexed, so the original `0:5` silently dropped the 0;
# head() states the intent directly.
for (i in names(df)[31:35]) {
  print(c(i, head(df[[i]], 5)))
}
[1] "d80" "1" "0" "0" "0" "0"
[1] "d81" "0" "1" "0" "0" "0"
[1] "d82" "0" "0" "1" "0" "0"
[1] "d83" "0" "0" "0" "1" "0"
[1] "d84" "0" "0" "0" "0" "1"
Presented another way:
head(data$d80)
[1] 1 0 0 0 0 0
head(data$d81)
[1] 0 1 0 0 0 0
and a third way:
> x = df[1:3, 31:55]
> dput(x)
structure(list(d80 = c(1L, 0L, 0L), d81 = c(0L, 1L, 0L), d82 = c(0L,
0L, 1L), d83 = c(0L, 0L, 0L), d84 = c(0L, 0L, 0L), d85 = c(0L,
0L, 0L), d86 = c(0L, 0L, 0L), d87 = c(0L, 0L, 0L), d88 = c(0L,
0L, 0L), d89 = c(0L, 0L, 0L), d90 = c(0L, 0L, 0L), d91 = c(0L,
0L, 0L), d92 = c(0L, 0L, 0L), d93 = c(0L, 0L, 0L), d94 = c(0L,
0L, 0L), d95 = c(0L, 0L, 0L), d96 = c(0L, 0L, 0L), d97 = c(0L,
0L, 0L), d98 = c(0L, 0L, 0L), d99 = c(0L, 0L, 0L), d00 = c(0L,
0L, 0L), d01 = c(0L, 0L, 0L), d02 = c(0L, 0L, 0L), d03 = c(0L,
0L, 0L), d04 = c(0L, 0L, 0L)), row.names = c("1", "2", "3"), class = "data.frame")
My ultimate goal is to calculate the average of a given column across each year, so I'd like to add a column where the value in each row is equal to the year of the row. In other words, I'd like to collapse the set of year indicator columns into a single column. For instance, the data above would become
80
81
82
83
84
What's the best way to do this? Thank you for your help!
Assuming your dataset is df you can use this approach:
library(tidyverse)
df %>%
  group_by(id = row_number()) %>% # one group per row (row id)
  nest() %>% # nest each row's data into a list-column
  # keep the name of the column whose value is 1, strip the "d", make it numeric
  mutate(year = map(data, ~ as.numeric(gsub("d", "", names(.x)[.x == 1])))) %>%
  unnest(cols = c(data, year)) %>% # `cols` must be given explicitly in tidyr >= 1.0
  ungroup() %>% # drop the grouping, otherwise `id` cannot be removed below
  select(year, everything(), -id) # put `year` first and remove the row id
# # A tibble: 3 x 26
# year d80 d81 d82 d83 d84 d85 d86 d87 d88 d89 d90 d91 d92 d93
# <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 80 1 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 81 0 1 0 0 0 0 0 0 0 0 0 0 0 0
# 3 82 0 0 1 0 0 0 0 0 0 0 0 0 0 0
# # ... with 11 more variables: d94 <int>, d95 <int>, d96 <int>, d97 <int>, d98 <int>, d99 <int>,
# # d00 <int>, d01 <int>, d02 <int>, d03 <int>, d04 <int>
The new column is called year and it's in the beginning of the dataset.
An alternative way is with a bit of reshaping and joining:
library(tidyverse)
# add a row id (used to join the original columns back after reshaping)
df <- df %>% mutate(id = row_number())
# pivot_longer() replaces the superseded gather(); the result is a tibble
df %>%
  pivot_longer(-id, names_to = "year", values_to = "value") %>% # reshape data
  filter(value == 1) %>% # keep 1s
  mutate(year = as.numeric(gsub("d", "", year))) %>% # update year value
  left_join(df, by = "id") %>% # join back original dataset
  select(-id, -value) # remove unnecessary columns
# year d80 d81 d82 d83 d84 d85 d86 d87 d88 d89 d90 d91 d92 d93 d94 d95 d96 d97 d98 d99 d00 d01
# 1 80 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 81 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 3 82 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# d02 d03 d04
# 1 0 0 0
# 2 0 0 0
# 3 0 0 0
A base R solution would be
# Restrict the lookup to the year-indicator columns (d80, d81, ..., d04) so
# that other columns which happen to contain a 1 (e.g. a row id) cannot leak
# into the result.
year_cols <- grep("^d[0-9]{2}$", names(df), value = TRUE)
# Each row must have exactly one indicator set, otherwise the year is ambiguous.
stopifnot(rowSums(df[year_cols] == 1) == 1)
# max.col() returns, per row, the position of the (single) 1.
year_idx <- max.col(df[year_cols], ties.method = "first")
df$year <- as.numeric(sub("^d", "", year_cols[year_idx]))

R: Combine rows in same data.frame [duplicate]

This question already has answers here:
How to sum a variable by group
(18 answers)
Closed 6 years ago.
I have a simple R problem, but I just can't find the answer.
I have a dataframe like this:
A 1 0 0 0 0 0
B 0 1 0 0 0 0
B 0 0 1 0 0 1
B 0 0 0 0 1 0
C 1 0 0 0 0 0
C 0 0 0 1 1 0
And i want it to be just like this:
A 1 0 0 0 0 0
B 0 1 1 0 1 1
C 1 0 0 1 1 0
Thank you very much!
Regards Lisanne
Here's one possibility using rowsum:
# Sum every value column within each level of the first (grouping) column.
# rowsum() groups by rows regardless of how many rows/columns there are and
# labels each result row with its group, so the labels cannot get out of
# step with the sums even when the groups are not sorted in the input.
sums <- rowsum(dat[-1], group = dat[[1]])
data.frame(V1 = rownames(sums), sums, row.names = NULL)
#   V1 V2 V3 V4 V5 V6 V7
# 1  A  1  0  0  0  0  0
# 2  B  0  1  1  0  1  1
# 3  C  1  0  0  1  1  0
where dat is the name of your data frame.
dat <- structure(list(V1 = structure(c(1L, 2L, 2L, 2L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), V2 = c(1L, 0L, 0L, 0L, 1L, 0L),
V3 = c(0L, 1L, 0L, 0L, 0L, 0L), V4 = c(0L, 0L, 1L, 0L, 0L,
0L), V5 = c(0L, 0L, 0L, 0L, 0L, 1L), V6 = c(0L, 0L, 0L, 1L,
0L, 1L), V7 = c(0L, 0L, 1L, 0L, 0L, 0L)), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7"), class = "data.frame", row.names = c(NA,
-6L))
You could...
aggregate(.~ V1 , data =dat, sum)
or
library(plyr)
ddply(dat, .(V1), function(x) colSums(x[,2:7]) )
If you're working with a data.frame where there are duplicates but you only want the presence or absence of a 1 to be noted, then after these functions you might want to do something like res[-1][res[-1] > 1] <- 1, where res is the aggregated result.
A possibility not mentioned is the aggregate function. I think this is quite 'readable'.
aggregate(cbind(data$X1, data$X2, data$X3, data$X4),
by = list(category = data$group), FUN = sum)

R: use a row as a grouping vector for row sums

If I have a data set laid out like:
Cohort Food1 Food2 Food 3 Food 4
--------------------------------
Group 1 1 2 3
A 1 1 0 1
B 0 0 1 0
C 1 1 0 1
D 0 0 0 1
I want to sum each row, where I can define food groups into different categories. So I would like to use the Group row as the defining vector.
Which would mean that food1 and food2 are in group 1, food3 is in group 2, food 4 is in group 3.
Ideal output something like:
Cohort Group1 Group2 Group3
A 2 0 1
B 0 1 0
C 2 0 1
D 0 0 1
I tried using rowsum()-based functions but had no luck; do I need to use ddply() instead?
Example data from comment:
dat <-
structure(list(species = c("group", "princeps", "bougainvillei",
"hombroni", "lindsayi", "concretus", "galatea", "ellioti", "carolinae",
"hydrocharis"), locust = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L), grasshopper = c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L),
snake = c(2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), fish = c(2L,
1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L), frog = c(2L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L), toad = c(2L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L), fruit = c(3L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L), seed = c(3L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L)), .Names = c("species", "locust", "grasshopper", "snake",
"fish", "frog", "toad", "fruit", "seed"), class = "data.frame", row.names = c(NA,
-10L))
There are most likely more direct approaches, but here is one you can try:
First, create a copy of your data minus the second header row.
dat2 <- dat[-1, ]
melt() and dcast() and so on from the "reshape2" package don't work nicely with duplicated column names, so let's make the column names more "reshape2 appropriate".
Seq <- ave(as.vector(unlist(dat[1, -1])),
as.vector(unlist(dat[1, -1])),
FUN = seq_along)
names(dat2)[-1] <- paste("group", dat[1, 2:ncol(dat)],
".", Seq, sep = "")
melt() the dataset
m.dat2 <- melt(dat2, id.vars="species")
Use the colsplit() function to split the columns correctly.
m.dat2 <- cbind(m.dat2[-2],
colsplit(m.dat2$variable, "\\.",
c("group", "time")))
head(m.dat2)
# species value group time
# 1 princeps 0 group1 1
# 2 bougainvillei 0 group1 1
# 3 hombroni 1 group1 1
# 4 lindsayi 0 group1 1
# 5 concretus 0 group1 1
# 6 galatea 0 group1 1
Proceed with dcast() as usual
dcast(m.dat2, species ~ group, sum)
# species group1 group2 group3
# 1 bougainvillei 0 0 0
# 2 carolinae 1 1 0
# 3 concretus 0 2 2
# 4 ellioti 0 1 0
# 5 galatea 1 1 1
# 6 hombroni 2 1 0
# 7 hydrocharis 0 0 0
# 8 lindsayi 0 1 0
# 9 princeps 0 1 0
Note: Edited because original answer was incorrect.
Update: An easier way in base R
This problem is much more easily solved if you start by transposing your data.
dat3 <- t(dat[-1, -1])
dat3 <- as.data.frame(dat3)
names(dat3) <- dat[[1]][-1]
t(do.call(rbind, lapply(split(dat3, as.numeric(dat[1, -1])), colSums)))
# 1 2 3
# princeps 0 1 0
# bougainvillei 0 0 0
# hombroni 2 1 0
# lindsayi 0 1 0
# concretus 0 2 2
# galatea 1 1 1
# ellioti 0 1 0
# carolinae 1 1 0
# hydrocharis 0 0 0
You can do this using base R fairly easily. Here's an example.
First, figure out which animals belong in which group:
# Cross-tabulate the group id (first row of `dat`) against the column names;
# Freq == 1 marks the (group, column) pairs that actually occur.
groupings <- as.data.frame(table(as.numeric(dat[1, 2:9]), names(dat)[2:9]))
# Explicit column references instead of attach()/detach(), which alters the
# search path and can silently pick up unrelated objects.
is_member <- groupings$Freq == 1
grp1 <- groupings[is_member & groupings$Var1 == 1, 2]
grp2 <- groupings[is_member & groupings$Var1 == 2, 2]
grp3 <- groupings[is_member & groupings$Var1 == 3, 2]
Then, use the groups to do a rowSums() on the correct columns.
dat <- cbind(dat,rowSums(dat[as.character(grp1)]))
dat <- cbind(dat,rowSums(dat[as.character(grp2)]))
dat <- cbind(dat,rowSums(dat[as.character(grp3)]))
Delete the initial row and the intermediate columns:
dat <- dat[-1,-c(2:9)]
Then, just rename things correctly:
# Sort alphabetically by species so the result matches the output shown below.
dat <- dat[order(dat$species), ]
# Reset the row names to the default 1..n sequence (the original obtained the
# same NULL indirectly, from the return value of rm()).
row.names(dat) <- NULL
names(dat) <- c("species", "group_1", "group_2", "group_3")
And you ultimately get:
species group_1 group_2 group_3
bougainvillei 0 0 0
carolinae 1 1 0
concretus 0 2 2
ellioti 0 1 0
galatea 1 1 1
hombroni 2 1 0
hydrocharis 0 0 0
lindsayi 0 1 0
princeps 0 1 0
EDITED: Changed sort order to alphabetical, like other answer.

Resources