R: Convert yearly indicator variables to single year variable - r

I have a data frame that has a set of columns which are indicator variables for a given year. For instance, the "d80" column is 1 for rows where the year was 1980 and 0 otherwise.
# Print each indicator column's name followed by its first five values.
# R silently drops index 0, so the original `0:5` only ever selected five
# elements; `seq_len(5)` selects the same elements and states that directly.
for (i in names(df)[31:35]) {
  print(c(i, df[[i]][seq_len(5)]))
}
[1] "d80" "1" "0" "0" "0" "0"
[1] "d81" "0" "1" "0" "0" "0"
[1] "d82" "0" "0" "1" "0" "0"
[1] "d83" "0" "0" "0" "1" "0"
[1] "d84" "0" "0" "0" "0" "1"
Presented another way:
head(data$d80)
[1] 1 0 0 0 0 0
head(data$d81)
[1] 0 1 0 0 0 0
and a third way:
> x = df[1:3, 31:55]
> dput(x)
structure(list(d80 = c(1L, 0L, 0L), d81 = c(0L, 1L, 0L), d82 = c(0L,
0L, 1L), d83 = c(0L, 0L, 0L), d84 = c(0L, 0L, 0L), d85 = c(0L,
0L, 0L), d86 = c(0L, 0L, 0L), d87 = c(0L, 0L, 0L), d88 = c(0L,
0L, 0L), d89 = c(0L, 0L, 0L), d90 = c(0L, 0L, 0L), d91 = c(0L,
0L, 0L), d92 = c(0L, 0L, 0L), d93 = c(0L, 0L, 0L), d94 = c(0L,
0L, 0L), d95 = c(0L, 0L, 0L), d96 = c(0L, 0L, 0L), d97 = c(0L,
0L, 0L), d98 = c(0L, 0L, 0L), d99 = c(0L, 0L, 0L), d00 = c(0L,
0L, 0L), d01 = c(0L, 0L, 0L), d02 = c(0L, 0L, 0L), d03 = c(0L,
0L, 0L), d04 = c(0L, 0L, 0L)), row.names = c("1", "2", "3"), class = "data.frame")
My ultimate goal is to calculate the average of a given column across each year, so I'd like to add a column where the value in each row is equal to the year of the row. In other words, I'd like to collapse the set of year indicator columns into a single column. For instance, the data above would become
80
81
82
83
84
What's the best way to do this? Thank you for your help!

Assuming your dataset is df you can use this approach:
library(tidyverse)
# Collapse the one-hot `dNN` columns into a numeric `year` column, one row at a time.
df %>%
  group_by(id = row_number()) %>% # one group per row (row id)
  nest() %>% # nest each row's columns into the list-column `data`
  # keep the name of the column whose value is 1, remove "d" and make the value numeric
  mutate(year = map(data, ~ as.numeric(sub("^d", "", names(.x)[.x == 1])))) %>%
  unnest(cols = c(year, data)) %>% # unnest data; `cols` must be named explicitly in tidyr >= 1.0.0
  ungroup() %>% # the `id` grouping survives nest()/unnest(); drop it so `id` can be removed
  select(year, everything(), -id) # put `year` first and remove the row id
# # A tibble: 3 x 26
# year d80 d81 d82 d83 d84 d85 d86 d87 d88 d89 d90 d91 d92 d93
# <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 80 1 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 81 0 1 0 0 0 0 0 0 0 0 0 0 0 0
# 3 82 0 0 1 0 0 0 0 0 0 0 0 0 0 0
# # ... with 11 more variables: d94 <int>, d95 <int>, d96 <int>, d97 <int>, d98 <int>, d99 <int>,
# # d00 <int>, d01 <int>, d02 <int>, d03 <int>, d04 <int>
The new column is called year and it's in the beginning of the dataset.
An alternative way is with a bit of reshaping and joining:
library(tidyverse)
# add a row id (useful for reshaping after)
df = df %>% mutate(id = row_number())
df %>%
gather(year, value, -id) %>% # reshape data
filter(value == 1) %>% # keep 1s
mutate(year = as.numeric(gsub("d", "", year))) %>% # update year value
left_join(df, by="id") %>% # join back original dataset
select(-id, -value) # remove unnecessary columns
# year d80 d81 d82 d83 d84 d85 d86 d87 d88 d89 d90 d91 d92 d93 d94 d95 d96 d97 d98 d99 d00 d01
# 1 80 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 2 81 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 3 82 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# d02 d03 d04
# 1 0 0 0
# 2 0 0 0
# 3 0 0 0
A base R solution would be
# Collapse the one-hot year columns (d80, d81, ..., d04) into a numeric `year`.
# Only the indicator columns are inspected, so other columns in `df` (ids,
# measurements, character fields) cannot match `== 1` or force a character
# coercion of the whole row.
year_cols <- grep("^d[0-9]{2}$", names(df), value = TRUE)
is_set <- as.matrix(df[year_cols]) == 1
# Position of the first set indicator in each row; NA when no indicator is set.
first_set <- apply(is_set, 1, function(row) match(TRUE, row))
df$year <- as.numeric(sub("^d", "", year_cols[first_set]))

Related

Converting one-hot encoded data to aggregate in dplyr

I have age columns like so that are dummy encoded.
How can I aggregate the information so I can get counts in dplyr?
Input:
age_010 age_11-20 age_2130 age_3140 age_41-50 age_5160
0 1 0 0 0 0
0 0 1 0 0 0
0 0 0 1 0 0
0 1 0 0 0 0
0 0 0 0 0 1
Expected Output:
age n
age_010 0
age_11-20 2
age_2130 1
age_3140 1
age_41-50 0
age_5160 1
We may do the column wise sum
v1 <- colSums(df1)
data.frame(age = names(v1), n = unname(v1))
-output
age n
1 age_010 0
2 age_11.20 2
3 age_2130 1
4 age_3140 1
5 age_41.50 0
6 age_5160 1
If we want the tidyverse, do the sum across all the columns and then reshape to 'long' with pivot_longer
library(dplyr)
library(tidyr)
df1 %>%
summarise(across(everything(), sum)) %>%
pivot_longer(cols = everything(), names_to = 'age', values_to = 'n')
# A tibble: 6 × 2
age n
<chr> <int>
1 age_010 0
2 age_11.20 2
3 age_2130 1
4 age_3140 1
5 age_41.50 0
6 age_5160 1
data
df1 <- structure(list(age_010 = c(0L, 0L, 0L, 0L, 0L), age_11.20 = c(1L,
0L, 0L, 1L, 0L), age_2130 = c(0L, 1L, 0L, 0L, 0L), age_3140 = c(0L,
0L, 1L, 0L, 0L), age_41.50 = c(0L, 0L, 0L, 0L, 0L), age_5160 = c(0L,
0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L
))

Transforming a one-hot encoded variable to one column

I have age columns like so that are dummy encoded.
How can I transform these columns to one column using dplyr?
Input:
age_0-10 age_11-20 age_21-30 age_31-40 age_41-50 age_51-60 gender
1 0 1 0 0 0 0 0
2 0 0 1 0 0 0 1
3 0 0 0 1 0 0 0
4 0 1 0 0 0 0 1
5 0 0 0 0 0 1 1
Expected output:
age gender
1 11-20 0
2 21-30 1
3 31-40 0
4 11-20 1
5 51-60 1
A possible solution, now using names_prefix thanks to @Adam's comment:
library(tidyverse)
df <- data.frame(
check.names = FALSE,
`age_0-10` = c(0L, 0L, 0L, 0L, 0L),
`age_11-20` = c(1L, 0L, 0L, 1L, 0L),
`age_21-30` = c(0L, 1L, 0L, 0L, 0L),
`age_31-40` = c(0L, 0L, 1L, 0L, 0L),
`age_41-50` = c(0L, 0L, 0L, 0L, 0L),
`age_51-60` = c(0L, 0L, 0L, 0L, 1L),
gender = c(0L, 1L, 0L, 1L, 1L)
)
df %>%
pivot_longer(col=starts_with("age"), names_to="age", names_prefix="age_") %>%
filter(value==1) %>%
select(age, gender, -value)
#> # A tibble: 5 × 2
#> age gender
#> <chr> <int>
#> 1 11-20 0
#> 2 21-30 1
#> 3 31-40 0
#> 4 11-20 1
#> 5 51-60 1
Here is a way in dplyr using c_across().
library(dplyr)
library(stringr)
df %>%
rowwise() %>%
mutate(age = str_remove(names(.)[which(c_across(starts_with("age")) == 1)], "^age_")) %>%
ungroup() %>%
select(age, gender)
# # A tibble: 5 x 2
# age gender
# <chr> <int>
# 1 11-20 0
# 2 21-30 1
# 3 31-40 0
# 4 11-20 1
# 5 51-60 1
Try the base R code below using max.col
# Build an `age` column from the one-hot age columns (every column except the
# last, `gender`) and bind it to the last column.
cbind(
  # ties.method = "first" keeps the result reproducible: the default ("random")
  # picks an arbitrary column whenever a row has no single largest value.
  age = gsub(
    "^age_", "",
    head(names(df), -1)[max.col(df[-ncol(df)], ties.method = "first")]
  ),
  df[ncol(df)]
)
which gives
age gender
1 11-20 0
2 21-30 1
3 31-40 0
4 11-20 1
5 51-60 1
Here is another tidyverse solution:
library(dplyr)
library(purrr)
df %>%
mutate(age = pmap_chr(select(cur_data(), !gender),
~ names(df)[-ncol(df)][as.logical(c(...))])) %>%
select(age, gender)
age gender
1 age_11-20 0
2 age_21-30 1
3 age_31-40 0
4 age_11-20 1
5 age_51-60 1

Concatenate two rows with the same ID into 0,1,2 from presence/absence

I am trying to recode an original table with SNP ID in rows and Sample ID in columns.
So far, I only managed to convert the data into presence/absence with 0 and 1.
I tried some simple code to do the further conversion but cannot find anything that does what I want.
The original table looks like this
snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
A_001 0 1 1 1 0 0 1 0
A_001 0 0 1 0 1 0 1 1
A_002 1 1 0 1 1 1 0 0
A_002 0 1 1 0 1 0 1 1
A_003 1 0 0 1 0 1 1 0
A_003 1 1 0 1 1 0 0 1
A_004 0 0 1 0 0 1 0 0
A_004 1 0 0 1 0 1 1 0
I would like to recode the scores to 0/0 = NA, 0/1 = 0, 1/1 = 2, 1/0 = 1 so the product looks something like this.
snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
A_001 NA 1 2 1 0 NA 2 0
A_002 1 2 0 1 2 1 0 0
A_003 2 0 NA 2 0 1 1 0
A_004 0 NA 1 0 NA 2 0 NA
This is just an example. My total snpID is ~96000 and total sample ID column is ~500.
Any help with writing this code would be really appreciated.
Here are a few dplyr-based examples that each work in a single pipe and get the same output. The main first step is to group by your ID, then collapse all the columns with a /. Then you can use mutate_at to select all columns that start with Cal_—this may be useful if you have other columns besides the ID that you don't want to do this operation on.
First method is a case_when:
library(dplyr)
dat %>%
group_by(snpID) %>%
summarise_all(paste, collapse = "/") %>%
mutate_at(vars(starts_with("Cal_")), ~case_when(
. == "0/1" ~ 0,
. == "1/1" ~ 2,
. == "1/0" ~ 1,
TRUE ~ NA_real_
))
#> # A tibble: 4 x 9
#> snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A_001 NA 1 2 1 0 NA 2 0
#> 2 A_002 1 2 0 1 2 1 0 0
#> 3 A_003 2 0 NA 2 0 1 1 0
#> 4 A_004 0 NA 1 0 NA 2 0 NA
However, (in my opinion) case_when is a little tricky to read, and this doesn't showcase its real power, which is doing if/else checks on multiple variables. Better suited to checks on one variable at a time is dplyr::recode:
dat %>%
group_by(snpID) %>%
summarise_all(paste, collapse = "/") %>%
mutate_at(vars(starts_with("Cal_")),
~recode(.,
"0/1" = 0,
"1/1" = 2,
"1/0" = 1,
"0/0" = NA_real_))
# same output as above
Or, for more flexibility & readability, create a small lookup object. That way, you can reuse the recode logic and change it easily. recode takes a set of named arguments; using tidyeval, you can pass in a named vector and unquote-splice it with !!! (there's a similar example in the recode docs):
lookup <- c("0/1" = 0, "1/1" = 2, "1/0" = 1, "0/0" = NA_real_)
dat %>%
group_by(snpID) %>%
summarise_all(paste, collapse = "/") %>%
mutate_at(vars(starts_with("Cal_")), recode, !!!lookup)
# same output
You might use aggregate to concatenate the values for each snpID and then replace the values according to your needs with the help of case_when from dplyr.
(out <- aggregate(.~ snpID, dat, toString))
# snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
#1 A_001 0, 0 1, 0 1, 1 1, 0 0, 1 0, 0 1, 1 0, 1
#2 A_002 1, 0 1, 1 0, 1 1, 0 1, 1 1, 0 0, 1 0, 1
#3 A_003 1, 1 0, 1 0, 0 1, 1 0, 1 1, 0 1, 0 0, 1
#4 A_004 0, 1 0, 0 1, 0 0, 1 0, 0 1, 1 0, 1 0, 0
Now recode the columns
library(dplyr)
# Recode one column at a time: case_when() works on vectors, whereas
# `out[-1] == "0, 0"` is a logical matrix. lapply() returns one recoded
# integer vector per Cal_* column, which is assigned back in place.
out[-1] <- lapply(out[-1], function(pair) {
  case_when(
    pair == "0, 0" ~ NA_integer_,
    pair == "0, 1" ~ 0L,
    pair == "1, 0" ~ 1L,
    TRUE ~ 2L
  )
})
Result
out
# snpID Cal_X1 Cal_X2 Cal_X3 Cal_X4 Cal_X5 Cal_X6 Cal_X7 Cal_X8
#1 A_001 NA 1 2 1 0 NA 2 0
#2 A_002 1 2 0 1 2 1 0 0
#3 A_003 2 0 NA 2 0 1 1 0
#4 A_004 0 NA 1 0 NA 2 0 NA
data
dat <- structure(list(snpID = c("A_001", "A_001", "A_002", "A_002",
"A_003", "A_003", "A_004", "A_004"), Cal_X1 = c(0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L), Cal_X2 = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L),
Cal_X3 = c(1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L), Cal_X4 = c(1L,
0L, 1L, 0L, 1L, 1L, 0L, 1L), Cal_X5 = c(0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L), Cal_X6 = c(0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L),
Cal_X7 = c(1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), Cal_X8 = c(0L,
1L, 0L, 1L, 0L, 1L, 0L, 0L)), .Names = c("snpID", "Cal_X1",
"Cal_X2", "Cal_X3", "Cal_X4", "Cal_X5", "Cal_X6", "Cal_X7", "Cal_X8"
), class = "data.frame", row.names = c(NA, -8L))

R summaryBy or other summary method

I am trying to create a summary table and having a mental hang up. Essentially, what I think I want is a summaryBy statement getting colSums for the subsets for ALL columns except the factor to summarize on.
My data frame looks like this:
Cluster GO:0003677 GO:0003700 GO:0046872 GO:0008270 GO:0043565 GO:0005524
comp103680_c0 10 0 0 0 0 0 1
comp103947_c0 3 0 0 0 0 0 0
comp104660_c0 1 1 1 0 0 0 0
comp105255_c0 10 0 0 0 0 0 0
What I would like to do is get colSums for all columns after Cluster using Cluster as the grouping factor.
I have tried a bunch of things. The last was ddply from the plyr package:
> groupColumns = "Cluster"
> dataColumns = colnames(GO_matrix_MF[,2:ncol(GO_matrix_MF)])
> res = ddply(GO_matrix_MF, groupColumns, function(x) colSums(GO_matrix_MF[dataColumns]))
> head(res)
Cluster GO:0003677 GO:0003700 GO:0046872 GO:0008270 GO:0043565 GO:0005524 GO:0004674 GO:0045735
1 1 121 138 196 94 43 213 97 20
2 2 121 138 196 94 43 213 97 20
I am not sure what the return values represent, but they do not represent the colSums
Try:
> aggregate(.~Cluster, data=ddf, sum)
Cluster GO.0003677 GO.0003700 GO.0046872 GO.0008270 GO.0043565 GO.0005524
1 1 1 1 0 0 0 0
2 3 0 0 0 0 0 0
3 10 0 0 0 0 0 1
I think you are looking for something like this. I modified your data a bit. There are other options too.
# Modified data
foo <- structure(list(Cluster = c(10L, 3L, 1L, 10L), GO.0003677 = c(11L,
0L, 1L, 5L), GO.0003700 = c(0L, 0L, 1L, 0L), GO.0046872 = c(0L,
9L, 0L, 0L), GO.0008270 = c(0L, 0L, 0L, 0L), GO.0043565 = c(0L,
0L, 0L, 0L), GO.0005524 = c(1L, 0L, 0L, 0L)), .Names = c("Cluster",
"GO.0003677", "GO.0003700", "GO.0046872", "GO.0008270", "GO.0043565",
"GO.0005524"), class = "data.frame", row.names = c("comp103680_c0",
"comp103947_c0", "comp104660_c0", "comp105255_c0"))
library(dplyr)
# Sum every non-grouping column within each Cluster.
# across(everything(), sum) replaces the deprecated summarise_each(funs(sum));
# grouping columns are excluded from everything() automatically.
foo %>%
  group_by(Cluster) %>%
  summarise(across(everything(), sum))
# Cluster GO.0003677 GO.0003700 GO.0046872 GO.0008270 GO.0043565 GO.0005524
#1 1 1 1 0 0 0 0
#2 3 0 0 9 0 0 0
#3 10 16 0 0 0 0 1

R: use a row as a grouping vector for row sums

If I have a data set laid out like:
Cohort Food1 Food2 Food 3 Food 4
--------------------------------
Group 1 1 2 3
A 1 1 0 1
B 0 0 1 0
C 1 1 0 1
D 0 0 0 1
I want to sum each row, where I can define food groups into different categories. So I would like to use the Group row as the defining vector.
Which would mean that food1 and food2 are in group 1, food3 is in group 2, food 4 is in group 3.
Ideal output something like:
Cohort Group1 Group2 Group3
A 2 0 1
B 0 1 0
C 2 0 1
D 0 0 1
I tried using rowsum()-based functions but had no luck. Do I need to use ddply() instead?
Example data from comment:
dat <-
structure(list(species = c("group", "princeps", "bougainvillei",
"hombroni", "lindsayi", "concretus", "galatea", "ellioti", "carolinae",
"hydrocharis"), locust = c(1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L), grasshopper = c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L),
snake = c(2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), fish = c(2L,
1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L), frog = c(2L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L), toad = c(2L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L), fruit = c(3L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L), seed = c(3L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L)), .Names = c("species", "locust", "grasshopper", "snake",
"fish", "frog", "toad", "fruit", "seed"), class = "data.frame", row.names = c(NA,
-10L))
There are most likely more direct approaches, but here is one you can try:
First, create a copy of your data minus the second header row.
dat2 <- dat[-1, ]
melt() and dcast() and so on from the "reshape2" package don't work nicely with duplicated column names, so let's make the column names more "reshape2 appropriate".
# Group id of every food column, taken from the first (header) row of `dat`.
group_ids <- unname(unlist(dat[1, -1]))
# Running counter within each group id (1, 2, ... per group), so that the new
# column names "group<id>.<n>" are unique.
Seq <- ave(group_ids, group_ids, FUN = seq_along)
names(dat2)[-1] <- paste0("group", group_ids, ".", Seq)
melt() the dataset
m.dat2 <- melt(dat2, id.vars="species")
Use the colsplit() function to split the columns correctly.
m.dat2 <- cbind(m.dat2[-2],
colsplit(m.dat2$variable, "\\.",
c("group", "time")))
head(m.dat2)
# species value group time
# 1 princeps 0 group1 1
# 2 bougainvillei 0 group1 1
# 3 hombroni 1 group1 1
# 4 lindsayi 0 group1 1
# 5 concretus 0 group1 1
# 6 galatea 0 group1 1
Proceed with dcast() as usual
dcast(m.dat2, species ~ group, sum)
# species group1 group2 group3
# 1 bougainvillei 0 0 0
# 2 carolinae 1 1 0
# 3 concretus 0 2 2
# 4 ellioti 0 1 0
# 5 galatea 1 1 1
# 6 hombroni 2 1 0
# 7 hydrocharis 0 0 0
# 8 lindsayi 0 1 0
# 9 princeps 0 1 0
Note: Edited because original answer was incorrect.
Update: An easier way in base R
This problem is much more easily solved if you start by transposing your data.
dat3 <- t(dat[-1, -1])
dat3 <- as.data.frame(dat3)
names(dat3) <- dat[[1]][-1]
t(do.call(rbind, lapply(split(dat3, as.numeric(dat[1, -1])), colSums)))
# 1 2 3
# princeps 0 1 0
# bougainvillei 0 0 0
# hombroni 2 1 0
# lindsayi 0 1 0
# concretus 0 2 2
# galatea 1 1 1
# ellioti 0 1 0
# carolinae 1 1 0
# hydrocharis 0 0 0
You can do this using base R fairly easily. Here's an example.
First, figure out which animals belong in which group:
# Cross-tabulate group id (row 1 of `dat`) against column name; a Freq of 1
# marks the group each column belongs to.
groupings <- as.data.frame(table(as.numeric(dat[1, 2:9]), names(dat)[2:9]))
# Refer to the columns through `groupings$...` instead of attach()/detach(),
# which would put Freq/Var1 on the search path where other objects can mask them.
in_group <- groupings$Freq == 1
grp1 <- groupings[in_group & groupings$Var1 == 1, 2]
grp2 <- groupings[in_group & groupings$Var1 == 2, 2]
grp3 <- groupings[in_group & groupings$Var1 == 3, 2]
Then, use the groups to do a rowSums() on the correct columns.
dat <- cbind(dat,rowSums(dat[as.character(grp1)]))
dat <- cbind(dat,rowSums(dat[as.character(grp2)]))
dat <- cbind(dat,rowSums(dat[as.character(grp3)]))
Delete the initial row and the intermediate columns:
dat <- dat[-1,-c(2:9)]
Then, just rename things correctly:
# Reset the row names to the default 1..n sequence. Assigning NULL is the
# direct way to do this; `rm()` only worked because it happens to return NULL.
row.names(dat) <- NULL
names(dat) <- c("species", "group_1", "group_2", "group_3")
And you ultimately get:
species group_1 group_2 group_3
bougainvillei 0 0 0
carolinae 1 1 0
concretus 0 2 2
ellioti 0 1 0
galatea 1 1 1
hombroni 2 1 0
hydrocharis 0 0 0
lindsayi 0 1 0
princeps 0 1 0
EDITED: Changed sort order to alphabetical, like other answer.

Resources