Transforming a one-hot encoded variable to one column - r

I have age columns that are dummy encoded, as shown below.
How can I transform these columns into a single column using dplyr?
Input:
age_0-10 age_11-20 age_21-30 age_31-40 age_41-50 age_51-60 gender
1 0 1 0 0 0 0 0
2 0 0 1 0 0 0 1
3 0 0 0 1 0 0 0
4 0 1 0 0 0 0 1
5 0 0 0 0 0 1 1
Expected output:
age gender
1 11-20 0
2 21-30 1
3 31-40 0
4 11-20 1
5 51-60 1

A possible solution, thanks to @Adam's comment, using names_prefix:
library(tidyverse)
df <- data.frame(
check.names = FALSE,
`age_0-10` = c(0L, 0L, 0L, 0L, 0L),
`age_11-20` = c(1L, 0L, 0L, 1L, 0L),
`age_21-30` = c(0L, 1L, 0L, 0L, 0L),
`age_31-40` = c(0L, 0L, 1L, 0L, 0L),
`age_41-50` = c(0L, 0L, 0L, 0L, 0L),
`age_51-60` = c(0L, 0L, 0L, 0L, 1L),
gender = c(0L, 1L, 0L, 1L, 1L)
)
df %>%
  pivot_longer(cols = starts_with("age"), names_to = "age", names_prefix = "age_") %>%
  filter(value == 1) %>%
  select(age, gender)
#> # A tibble: 5 × 2
#> age gender
#> <chr> <int>
#> 1 11-20 0
#> 2 21-30 1
#> 3 31-40 0
#> 4 11-20 1
#> 5 51-60 1
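
As a side note, names_pattern can be used instead of names_prefix to the same effect here; a quick sketch of that variant (same df as above):
df %>%
  pivot_longer(cols = starts_with("age"), names_to = "age", names_pattern = "age_(.*)") %>%
  filter(value == 1) %>%
  select(age, gender)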

Here is a way in dplyr using c_across().
library(dplyr)
library(stringr)
df %>%
rowwise() %>%
mutate(age = str_remove(names(.)[which(c_across(starts_with("age")) == 1)], "^age_")) %>%
ungroup() %>%
select(age, gender)
# # A tibble: 5 x 2
# age gender
# <chr> <int>
# 1 11-20 0
# 2 21-30 1
# 3 31-40 0
# 4 11-20 1
# 5 51-60 1

Try the base R code below using max.col
cbind(
age = gsub("^age_", "", head(names(df), -1)[max.col(df[-ncol(df)])]),
df[ncol(df)]
)
which gives
age gender
1 11-20 0
2 21-30 1
3 31-40 0
4 11-20 1
5 51-60 1
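
One caveat: max.col breaks ties at random by default, so if a row could ever contain more than one 1 (or none at all) the chosen column is not deterministic. A sketch with ties.method = "first" makes the pick reproducible:
cbind(
  age = gsub("^age_", "", head(names(df), -1)[max.col(df[-ncol(df)], ties.method = "first")]),
  df[ncol(df)]
)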

Here is another tidyverse solution:
library(dplyr)
library(purrr)
df %>%
mutate(age = pmap_chr(select(cur_data(), !gender),
~ names(df)[-ncol(df)][as.logical(c(...))])) %>%
select(age, gender)
age gender
1 age_11-20 0
2 age_21-30 1
3 age_31-40 0
4 age_11-20 1
5 age_51-60 1
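If the age_ prefix should be stripped so the result matches the expected output in the question, one possible tweak of the pipeline above (a sketch) is:
df %>%
  mutate(age = pmap_chr(select(cur_data(), !gender),
                        ~ names(df)[-ncol(df)][as.logical(c(...))]),
         age = sub("^age_", "", age)) %>%
  select(age, gender)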

Related

Converting one-hot encoded data to aggregate in dplyr

I have age columns that are dummy encoded, as shown below.
How can I aggregate the information to get counts in dplyr?
Input:
age_010 age_11-20 age_2130 age_3140 age_41-50 age_5160
0 1 0 0 0 0
0 0 1 0 0 0
0 0 0 1 0 0
0 1 0 0 0 0
0 0 0 0 0 1
Expected Output:
age n
age_010 0
age_11-20 2
age_2130 1
age_3140 1
age_41-50 0
age_5160 1
We may take the column-wise sums:
v1 <- colSums(df1)
data.frame(age = names(v1), n = unname(v1))
-output
age n
1 age_010 0
2 age_11.20 2
3 age_2130 1
4 age_3140 1
5 age_41.50 0
6 age_5160 1
If we want a tidyverse approach, take the sum across all the columns and then reshape to 'long' format with pivot_longer:
library(dplyr)
library(tidyr)
df1 %>%
summarise(across(everything(), sum)) %>%
pivot_longer(cols = everything(), names_to = 'age', values_to = 'n')
# A tibble: 6 × 2
age n
<chr> <int>
1 age_010 0
2 age_11.20 2
3 age_2130 1
4 age_3140 1
5 age_41.50 0
6 age_5160 1
data
df1 <- structure(list(age_010 = c(0L, 0L, 0L, 0L, 0L), age_11.20 = c(1L,
0L, 0L, 1L, 0L), age_2130 = c(0L, 1L, 0L, 0L, 0L), age_3140 = c(0L,
0L, 1L, 0L, 0L), age_41.50 = c(0L, 0L, 0L, 0L, 0L), age_5160 = c(0L,
0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L
))
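Side note: the question's expected output shows names like age_11-20, while both results above print age_11.20. The dots come from R's default name checking (make.names()) when the data frame was created; building or reading the data with check.names = FALSE, as in the data.frame() call in the first question above, keeps the hyphens. A sketch (the file name is hypothetical):
df1 <- read.csv("ages.csv", check.names = FALSE)  # keeps "age_11-20" instead of "age_11.20"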

How can I count the frequency of AMT by ABCD?

C1:
AMT A B C D
1 13 0 1 0 0
2 17 0 0 1 0
3 19 0 0 0 1
4 1 0 0 1 0
5 9 0 1 0 0
How can I count the frequency of AMT by A, B, C, D? I tried
C2 = t(as.matrix(C1[1])) %*% as.matrix(C1[2:5])
but it gives me the total sum of AMT by region. My desired output is to combine A, B, C, D into one column (since they are binary indicators) and then count the frequency by type, i.e.
AMT GROUP N
1 1 A 1
2 9 B 1
3 13 B 1
4 17 C 1
5 19 D 1
...
AMT is not limited to 1, 9, 13, 17, ...; it ranges from 0 to 30.
My attempt so far (I am not sure what to put in group_by()):
res <- C1 %>% group_by( ) %>% summarise(Freq = n())
library(tidyverse)
C1 %>%
  tidyr::pivot_longer(
    cols = A:D,
    names_to = "Names",
    values_to = "Values"
  ) %>%
  group_by(Names) %>%
  filter(Values == 1) %>%
  summarise(AMT = sum(AMT)) %>%
  select(Names, AMT)
Output:
Names AMT
<chr> <dbl>
1 B 22
2 C 18
3 D 19
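Note that the pipeline above sums AMT per group. If the goal is the AMT / GROUP / N count layout shown in the question, a possible sketch (assuming, as in C1, that each row has exactly one 1 among A:D) is:
C1 %>%
  tidyr::pivot_longer(cols = A:D, names_to = "GROUP") %>%
  filter(value == 1) %>%
  count(AMT, GROUP, name = "N") %>%
  arrange(AMT)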
You can use max.col to get the column name which has value 1 in it.
library(dplyr)
C1 %>%
transmute(AMT,
GROUP = names(.)[-1][max.col(select(., -1))],
N = 1) %>%
arrange(AMT) -> res
res
# AMT GROUP N
#4 1 C 1
#5 9 B 1
#1 13 B 1
#2 17 C 1
#3 19 D 1
data
C1 <- structure(list(AMT = c(13L, 17L, 19L, 1L, 9L), A = c(0L, 0L,
0L, 0L, 0L), B = c(1L, 0L, 0L, 0L, 1L), C = c(0L, 1L, 0L, 1L,
0L), D = c(0L, 0L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA, -5L))

Trying to expand all values in specific rows in a dataframe that correspond to a value in another column into rows

I have a data frame that looks like:
d s X3 X4 X5 X6
1 0 1 1 0 1 1
2 1 1 1 0 1 1
3 2 2 0 0 0 1
4 3 2 1 0 0 1
5 4 3 0 0 0 0
6 5 3 0 1 0 0
I want to combine the values in the X3-X6 columns into rows that correspond to the value in column s such that it looks something like:
s G1 G2 G3 G4 G5 G6 G7 G8
1 1 1 1 0 0 1 1 1 1
2 2 0 1 0 0 0 0 1 1
3 3 0 0 0 1 0 0 0 0
I did:
library(reshape2)
library(dplyr)
library(tidyr)
combined_data <- fake_data[, c(2:6)] %>%
  melt(id = 's') %>%
  group_by(s) %>%
  summarise(paste(value, collapse = ',')) %>%
  separate("paste(value, collapse = \",\")", into = c("G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8"))
It does what I want but I'm not convinced it's the best way to do it.
Any help would be appreciated.
We can pivot to 'long' format, create a sequence column within each group, and reshape back to 'wide':
library(dplyr)
library(tidyr)
library(stringr)
fake_data %>%
# // remove the d column
select(-d) %>%
# // pivot to long format
pivot_longer(cols = starts_with('X')) %>%
# // order the columns to get the same order as melt
arrange(s, name) %>%
group_by(s) %>%
# // update the name column by pasting 'G' with a row sequence after grouping
mutate(name = str_c('G', row_number())) %>%
# // reshape to wide format
pivot_wider(names_from = name, values_from = value)
# A tibble: 3 x 9
# Groups: s [3]
# s G1 G2 G3 G4 G5 G6 G7 G8
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1 1 1 1 0 0 1 1 1 1
#2 2 0 1 0 0 0 0 1 1
#3 3 0 0 0 1 0 0 0 0
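A base R sketch of the same reshaping, under the assumption (true for fake_data) that every value of s has the same number of rows:
out <- do.call(rbind, lapply(split(fake_data[-(1:2)], fake_data$s), unlist))
colnames(out) <- paste0("G", seq_len(ncol(out)))
data.frame(s = as.integer(rownames(out)), out)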
data
fake_data <- structure(list(d = 0:5, s = c(1L, 1L, 2L, 2L, 3L, 3L), X3 = c(1L,
1L, 0L, 1L, 0L, 0L), X4 = c(0L, 0L, 0L, 0L, 0L, 1L), X5 = c(1L,
1L, 0L, 0L, 0L, 0L), X6 = c(1L, 1L, 1L, 1L, 0L, 0L)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))

Concatenate two rows with the same ID into 0,1,2 from presence/absence

I am trying to recode an original table with SNP IDs in rows and sample IDs in columns.
So far, I have only managed to convert the data into presence/absence with 0 and 1.
I tried some simple code to do the further conversion but cannot find anything that does what I want.
The original table looks like this
snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
A_001 0 1 1 1 0 0 1 0
A_001 0 0 1 0 1 0 1 1
A_002 1 1 0 1 1 1 0 0
A_002 0 1 1 0 1 0 1 1
A_003 1 0 0 1 0 1 1 0
A_003 1 1 0 1 1 0 0 1
A_004 0 0 1 0 0 1 0 0
A_004 1 0 0 1 0 1 1 0
I would like to recode the scores as 0/0 = NA, 0/1 = 0, 1/1 = 2, 1/0 = 1, so the result looks something like this.
snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
A_001 NA 1 2 1 0 NA 2 0
A_002 1 2 0 1 2 1 0 0
A_003 2 0 NA 2 0 1 1 0
A_004 0 NA 1 0 NA 2 0 NA
This is just an example; my full data has ~96,000 snpIDs and ~500 sample ID columns.
Any help with writing this code would be really appreciated.
Here are a few dplyr-based examples that each work in a single pipe and give the same output. The first step in each is to group by your ID and collapse all the columns with a /. Then you can use mutate_at to select all columns that start with Cal_; this is useful if you have other columns besides the ID that you don't want to apply the operation to.
The first method uses case_when:
library(dplyr)
dat %>%
group_by(snpID) %>%
summarise_all(paste, collapse = "/") %>%
mutate_at(vars(starts_with("Cal_")), ~case_when(
. == "0/1" ~ 0,
. == "1/1" ~ 2,
. == "1/0" ~ 1,
TRUE ~ NA_real_
))
#> # A tibble: 4 x 9
#> snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A_001 NA 1 2 1 0 NA 2 0
#> 2 A_002 1 2 0 1 2 1 0 0
#> 3 A_003 2 0 NA 2 0 1 1 0
#> 4 A_004 0 NA 1 0 NA 2 0 NA
However, (in my opinion) case_when is a little tricky to read, and this doesn't showcase its real power, which is doing if/else checks on multiple variables. Better suited to checks on one variable at a time is dplyr::recode:
dat %>%
group_by(snpID) %>%
summarise_all(paste, collapse = "/") %>%
mutate_at(vars(starts_with("Cal_")),
~recode(.,
"0/1" = 0,
"1/1" = 2,
"1/0" = 1,
"0/0" = NA_real_))
# same output as above
Or, for more flexibility & readability, create a small lookup object. That way, you can reuse the recode logic and change it easily. recode takes a set of named arguments; using tidyeval, you can pass in a named vector and splice it in with !!! (there's a similar example in the recode docs):
lookup <- c("0/1" = 0, "1/1" = 2, "1/0" = 1, "0/0" = NA_real_)
dat %>%
group_by(snpID) %>%
summarise_all(paste, collapse = "/") %>%
mutate_at(vars(starts_with("Cal_")), recode, !!!lookup)
# same output
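mutate_at() and summarise_all() still work but have been superseded; a sketch of the same lookup idea written with across() (dplyr >= 1.0) would be:
dat %>%
  group_by(snpID) %>%
  summarise(across(starts_with("Cal_"), ~ paste(.x, collapse = "/"))) %>%
  mutate(across(starts_with("Cal_"), ~ recode(.x, !!!lookup)))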
You might use aggregate to concatenate the values for each snpID and then replace the values according to your needs with the help of case_when from dplyr.
(out <- aggregate(.~ snpID, dat, toString))
# snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
#1 A_001 0, 0 1, 0 1, 1 1, 0 0, 1 0, 0 1, 1 0, 1
#2 A_002 1, 0 1, 1 0, 1 1, 0 1, 1 1, 0 0, 1 0, 1
#3 A_003 1, 1 0, 1 0, 0 1, 1 0, 1 1, 0 1, 0 0, 1
#4 A_004 0, 1 0, 0 1, 0 0, 1 0, 0 1, 1 0, 1 0, 0
Now recode the columns
library(dplyr)
out[-1] <- case_when(out[-1] == "0, 0" ~ NA_integer_,
out[-1] == "0, 1" ~ 0L,
out[-1] == "1, 0" ~ 1L,
TRUE ~ 2L)
Result
out
# snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
#1 A_001 NA 1 2 1 0 NA 2 0
#2 A_002 1 2 0 1 2 1 0 0
#3 A_003 2 0 NA 2 0 1 1 0
#4 A_004 0 NA 1 0 NA 2 0 NA
data
dat <- structure(list(snpID = c("A_001", "A_001", "A_002", "A_002",
"A_003", "A_003", "A_004", "A_004"), Cal_X1 = c(0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L), Cal_X2 = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L),
Cal_X3 = c(1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L), Cal_X4 = c(1L,
0L, 1L, 0L, 1L, 1L, 0L, 1L), Cal_X5 = c(0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L), Cal_X6 = c(0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L),
Cal_X7 = c(1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), Cal_X8 = c(0L,
1L, 0L, 1L, 0L, 1L, 0L, 0L)), .Names = c("snpID", "Cal_X1",
"Cal_X2", "Cal_X3", "Cal_X4", "Cal_X5", "Cal_X6", "Cal_X7", "Cal_X8"
), class = "data.frame", row.names = c(NA, -8L))

R: Convert yearly indicator variables to single year variable

I have a data frame that has a set of columns which are indicator variables for a given year. For instance, the "d80" column is 1 for rows where the year was 1980 and 0 otherwise.
for(i in names(df)[31:35]){
print(c(i, df[[i]][0:5]))
}
[1] "d80" "1" "0" "0" "0" "0"
[1] "d81" "0" "1" "0" "0" "0"
[1] "d82" "0" "0" "1" "0" "0"
[1] "d83" "0" "0" "0" "1" "0"
[1] "d84" "0" "0" "0" "0" "1"
Presented another way:
head(data$d80)
[1] 1 0 0 0 0 0
head(data$d81)
[1] 0 1 0 0 0 0
and a third way:
> x = df[1:3, 31:55]
> dput(x)
structure(list(d80 = c(1L, 0L, 0L), d81 = c(0L, 1L, 0L), d82 = c(0L,
0L, 1L), d83 = c(0L, 0L, 0L), d84 = c(0L, 0L, 0L), d85 = c(0L,
0L, 0L), d86 = c(0L, 0L, 0L), d87 = c(0L, 0L, 0L), d88 = c(0L,
0L, 0L), d89 = c(0L, 0L, 0L), d90 = c(0L, 0L, 0L), d91 = c(0L,
0L, 0L), d92 = c(0L, 0L, 0L), d93 = c(0L, 0L, 0L), d94 = c(0L,
0L, 0L), d95 = c(0L, 0L, 0L), d96 = c(0L, 0L, 0L), d97 = c(0L,
0L, 0L), d98 = c(0L, 0L, 0L), d99 = c(0L, 0L, 0L), d00 = c(0L,
0L, 0L), d01 = c(0L, 0L, 0L), d02 = c(0L, 0L, 0L), d03 = c(0L,
0L, 0L), d04 = c(0L, 0L, 0L)), row.names = c("1", "2", "3"), class = "data.frame")
My ultimate goal is to calculate the average of a given column across each year, so I'd like to add a column where the value in each row is equal to the year of the row. In other words, I'd like to collapse the set of year indicator columns into a single column. For instance, the data above would become
80
81
82
83
84
What's the best way to do this? Thank you for your help!
Assuming your dataset is df, you can use this approach:
library(tidyverse)
df %>%
group_by(id = row_number()) %>% # for every row numer (row id)
nest() %>% # nest data
mutate(year = map(data, ~as.numeric(gsub("d", "", names(.)[.==1])))) %>% # keep the column name of value 1, remove "d" and make the value numeric
unnest() %>% # unnest data
select(-id) # remove row id
# # A tibble: 3 x 26
# year d80 d81 d82 d83 d84 d85 d86 d87 d88 d89 d90 d91 d92 d93
# <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 80 1 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 81 0 1 0 0 0 0 0 0 0 0 0 0 0 0
# 3 82 0 0 1 0 0 0 0 0 0 0 0 0 0 0
# # ... with 11 more variables: d94 <int>, d95 <int>, d96 <int>, d97 <int>, d98 <int>, d99 <int>,
# # d00 <int>, d01 <int>, d02 <int>, d03 <int>, d04 <int>
The new column is called year and it is at the beginning of the dataset.
An alternative way is with a bit of reshaping and joining:
library(tidyverse)
# add a row id (useful for reshaping after)
df = df %>% mutate(id = row_number())
df %>%
gather(year, value, -id) %>% # reshape data
filter(value == 1) %>% # keep 1s
mutate(year = as.numeric(gsub("d", "", year))) %>% # update year value
left_join(df, by="id") %>% # join back original dataset
select(-id, -value) # remove unnecessary columns
# year d80 d81 d82 d83 d84 d85 d86 d87 d88 d89 d90 d91 d92 d93 d94 d95 d96 d97 d98 d99 d00 d01
# 1 80 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 81 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 3 82 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# d02 d03 d04
# 1 0 0 0
# 2 0 0 0
# 3 0 0 0
A base R solution would be
df$year = as.numeric(gsub("d", "", apply(df , 1, function(x) names(x)[x==1])))
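Whichever approach you use to build year, the per-year average mentioned in the question can then be a grouped summary. A sketch, where outcome is a hypothetical column standing in for whichever column you want to average:
library(dplyr)
df %>%
  group_by(year) %>%
  summarise(avg_outcome = mean(outcome, na.rm = TRUE))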
