how to add columns iteratively for recoding with semi-modified names - r

I would use this dataset as an example
BEZ <- c("A","A","A","A","B","B","B")
var <- c("B","B","B","B","B","B","B")
bar <- c("B","B","B","B","B","B","B")
Bez1 <- c("A","A","A","A","B","B","B")
var1 <- c("B","B","B","B","B","B","B")
bar1 <- c("B","B","B","B","B","B","B")
dat <- data.frame(BEZ, var, bar, Bez1, var1, bar1)
the tricky thing that I would like to do is use a method (loops, map(), apply(), dplyr functions, and so on) to create aside the already existing new column where based on the respective row value is converted into a number.
Excepeted result
BEZ BEZ_num var var_num bar bar_num Bez1 BEZ1_num var1 var1_num bar1 bar1_num
A 0 B 1 B 1 A 0 B 1 B 1
A 0 B 1 B 1 A 0 B 1 B 1
A 0 B 1 C 2 A 0 B 1 A 0
A 0 B 1 B 1 A 0 C 2 B 1
B 1 B 1 B 1 B 1 C 2 C 2
B 1 B 1 B 1 A 0 B 1 B 1
B 1 B 1 B 1 A 0 B 1 B 1
This is more or less the idea I would like to hit. Any suggestions?
Thanks

Using factor
library(dplyr)
dat %>%
mutate(across(everything(), ~ as.integer(factor(.x))-1, .names = "{.col}_num"))
-output
BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
1 A B B A B B 0 0 0 0 0 0
2 A B B A B B 0 0 0 0 0 0
3 A B B A B B 0 0 0 0 0 0
4 A B B A B B 0 0 0 0 0 0
5 B B B B B B 1 0 0 1 0 0
6 B B B B B B 1 0 0 1 0 0
7 B B B B B B 1 0 0 1 0 0

See in the comments. The provided data frame and the expected output do not match. But I think we could use mutate(across..) with the .names argument combined with case_when:
library(dplyr)
dat %>%
mutate(across(everything(), ~case_when(
. == "A" ~ "0",
. == "B" ~ "1",
. == "C" ~ "2"), .names = "{col}_num"))
BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
1 A B B A B B 0 1 1 0 1 1
2 A B B A B B 0 1 1 0 1 1
3 A B B A B B 0 1 1 0 1 1
4 A B B A B B 0 1 1 0 1 1
5 B B B B B B 1 1 1 1 1 1
6 B B B B B B 1 1 1 1 1 1
7 B B B B B B 1 1 1 1 1 1

Using a for loop in base R:
dat2 <- dat[, 1, drop = FALSE]
for (col in names(dat)) {
dat2[[col]] <- dat[[col]]
dat2[[paste0(col, "_num")]] <- match(dat[[col]], LETTERS) - 1
}
dat2
# BEZ BEZ_num var var_num bar bar_num Bez1 Bez1_num var1 var1_num bar1 bar1_num
# 1 A 0 B 1 B 1 A 0 B 1 B 1
# 2 A 0 B 1 B 1 A 0 B 1 B 1
# 3 A 0 B 1 B 1 A 0 B 1 B 1
# 4 A 0 B 1 B 1 A 0 B 1 B 1
# 5 B 1 B 1 B 1 B 1 B 1 B 1
# 6 B 1 B 1 B 1 B 1 B 1 B 1
# 7 B 1 B 1 B 1 B 1 B 1 B 1
Or a (slightly convoluted) approach using dplyr::across():
library(dplyr)
dat %>%
mutate(
across(BEZ:bar1, list(TMP = identity, num = \(x) match(x, LETTERS) - 1)),
.keep = "unused"
) %>%
rename_with(\(x) gsub("_TMP$", "", x))
# same output as above
Or finally, if you don't care about the order of the output columns, you could also use dplyr::across() with the .names argument:
dat %>%
mutate(across(
BEZ:bar1,
\(x) match(x, LETTERS) - 1,
.names = "{.col}_num"
))
# BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
# 1 A B B A B B 0 1 1 0 1 1
# 2 A B B A B B 0 1 1 0 1 1
# 3 A B B A B B 0 1 1 0 1 1
# 4 A B B A B B 0 1 1 0 1 1
# 5 B B B B B B 1 1 1 1 1 1
# 6 B B B B B B 1 1 1 1 1 1
# 7 B B B B B B 1 1 1 1 1 1

To add two further options:
With dplyr v.1.1.0 we can use consecutive_id():
library(dplyr) # v.1.1.0
dat %>%
mutate(across(everything(),
~ consecutive_id(.x)-1,
.names = "{.col}_num"))
#> BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
#> 1 A B B A B B 0 0 0 0 0 0
#> 2 A B B A B B 0 0 0 0 0 0
#> 3 A B B A B B 0 0 0 0 0 0
#> 4 A B B A B B 0 0 0 0 0 0
#> 5 B B B B B B 1 0 0 1 0 0
#> 6 B B B B B B 1 0 0 1 0 0
#> 7 B B B B B B 1 0 0 1 0 0
Similar we could use data.table::rleid():
dat %>%
mutate(across(everything(),
~ data.table::rleid(.x)-1,
.names = "{.col}_num"))
#> BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
#> 1 A B B A B B 0 0 0 0 0 0
#> 2 A B B A B B 0 0 0 0 0 0
#> 3 A B B A B B 0 0 0 0 0 0
#> 4 A B B A B B 0 0 0 0 0 0
#> 5 B B B B B B 1 0 0 1 0 0
#> 6 B B B B B B 1 0 0 1 0 0
#> 7 B B B B B B 1 0 0 1 0 0
Created on 2023-02-03 with reprex v2.0.2

Related

Count occurrences of distinct values across multiple columns and groups

I've got a dataframe like the one below (in the actual dataset the number of rows are a few thousands and I 've got more than 300 variables):
df <- data.frame (Gr = c("A","A","A","B","B","B","B","B","B"),
Var1 = c("a","b","c","e","a","a","c","e","b"),
Var2 = c("a","a","a","d","b","b","c","a","e"),
Var3 = c("e","a","b",NA,"a","b","c","d","a"),
Var4 = c("e",NA,"a","e","a","b","d","c",NA))
which returns:
Gr Var1 Var2 Var3 Var4
1 A a a e e
2 A b a a <NA>
3 A c a b a
4 B e d <NA> e
5 B a b a a
6 B a b b b
7 B c c c d
8 B e a d c
9 B b e a <NA>
and would like to get number of occurrences of each value (a,b,c,d,e, and NA) in each variable and in each group. Hence, the output should look something like the following:
df1 <- data.frame(Vars = c("Var1","Var2","Var3","Var4"),
a = c(1,3,1,1),
b = c(1,0,1,0),
c = c(1,0,0,0),
d = c(0,0,0,0),
e = c(0,0,1,1),
na = c(0,0,0,1))
df2 <- data.frame(Vars = c("Var1","Var2","Var3","Var4"),
a = c(2,1,2,1),
b = c(0,2,1,1),
c = c(1,1,1,1),
d = c(0,1,1,1),
e = c(2,1,0,1),
na = c(0,0,1,1))
output <- list(df1,df2)
names(output) <- c("A","B")
which looks like:
$A
Vars a b c d e na
1 Var1 1 1 1 0 0 0
2 Var2 3 0 0 0 0 0
3 Var3 1 1 0 0 1 0
4 Var4 1 0 0 0 1 1
$B
Vars a b c d e na
1 Var1 2 0 1 0 2 0
2 Var2 1 2 1 1 1 0
3 Var3 2 1 1 1 0 1
4 Var4 1 1 1 1 1 1
I haven't been able to make any considerable progress so far, and a tidyverse solution is preferred.
We may use mtabulate after spliting
library(qdapTools)
lapply(split(df[-1], df$Gr), mtabulate)
If we need the na count as well
lapply(split(replace(df[-1], is.na(df[-1]), "na"), df$Gr), mtabulate)
-output
$A
a b c e na
Var1 1 1 1 0 0
Var2 3 0 0 0 0
Var3 1 1 0 1 0
Var4 1 0 0 1 1
$B
a b c d e na
Var1 2 1 1 0 2 0
Var2 1 2 1 1 1 0
Var3 2 1 1 1 0 1
Var4 1 1 1 1 1 1
Or using tidyverse
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -Gr, names_to = "Vars") %>%
pivot_wider(names_from = value, values_from = value,
values_fn = length, values_fill = 0) %>%
{split(.[-1], .$Gr)}
-output
$A
# A tibble: 4 × 7
Vars a e b `NA` c d
<chr> <int> <int> <int> <int> <int> <int>
1 Var1 1 0 1 0 1 0
2 Var2 3 0 0 0 0 0
3 Var3 1 1 1 0 0 0
4 Var4 1 1 0 1 0 0
$B
# A tibble: 4 × 7
Vars a e b `NA` c d
<chr> <int> <int> <int> <int> <int> <int>
1 Var1 2 2 1 0 1 0
2 Var2 1 1 2 0 1 1
3 Var3 2 0 1 1 1 1
4 Var4 1 1 1 1 1 1
A NA save base R approach using colSums
val <- sort(unique(unlist(df[-1])), na.last=T)
as.list(lapply(split(df[-1], df$Gr), function(dlist)
data.frame(sapply(val, function(x)
colSums(dlist == x | (is.na(dlist) & is.na(x)), na.rm=T)), check.names=F)))
$A
a b c d e NA
Var1 1 1 1 0 0 0
Var2 3 0 0 0 0 0
Var3 1 1 0 0 1 0
Var4 1 0 0 0 1 1
$B
a b c d e NA
Var1 2 1 1 0 2 0
Var2 1 2 1 1 1 0
Var3 2 1 1 1 0 1
Var4 1 1 1 1 1 1
reshape2::recast(df,Gr+variable~value,length,id.var = 'Gr')
Gr variable a b c d e NA
1 A Var1 1 1 1 0 0 0
2 A Var2 3 0 0 0 0 0
3 A Var3 1 1 0 0 1 0
4 A Var4 1 0 0 0 1 1
5 B Var1 2 1 1 0 2 0
6 B Var2 1 2 1 1 1 0
7 B Var3 2 1 1 1 0 1
If you must split them:
split(reshape2::recast(df,Gr+variable~value,length,id.var = 'Gr'), ~Gr)
$A
Gr variable a b c d e NA
1 A Var1 1 1 1 0 0 0
2 A Var2 3 0 0 0 0 0
3 A Var3 1 1 0 0 1 0
4 A Var4 1 0 0 0 1 1
$B
Gr variable a b c d e NA
5 B Var1 2 1 1 0 2 0
6 B Var2 1 2 1 1 1 0
7 B Var3 2 1 1 1 0 1
8 B Var4 1 1 1 1 1 1
in base R:
ftable(cbind(df[1], stack(replace(df, is.na(df),'na'), -1)),col.vars = 2)
values a b c d e na
Gr ind
A Var1 1 1 1 0 0 0
Var2 3 0 0 0 0 0
Var3 1 1 0 0 1 0
Var4 1 0 0 0 1 1
B Var1 2 1 1 0 2 0
Var2 1 2 1 1 1 0
Var3 2 1 1 1 0 1
Var4 1 1 1 1 1 1

Identify the column name of the last occurrence of a value in R data frame

I have a dataset like below with columns of 1s and 0s. I would like to add a final column that identifies the column name of the final occurrence of 0 per row.
have = data.frame(a = c(1,0,1,1,0,0,1,1,1,0),
b = c(1,0,1,1,1,0,1,1,0,0),
c = c(0,0,0,1,0,1,1,1,1,0),
d = c(1,0,1,1,0,0,0,1,0,1),
e = c(1,1,1,1,1,1,1,1,1,1))
> have
a b c d e
1 1 1 0 1 1
2 0 0 0 0 1
3 1 1 0 1 1
4 1 1 1 1 1
5 0 1 0 0 1
6 0 0 1 0 1
7 1 1 1 0 1
8 1 1 1 1 1
9 1 0 1 0 1
10 0 0 0 1 1
I would like the output to look like this where the final column specifies the column name of the last occurring 0 and if one does not exist return NA.
> want
a b c d e last_0
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 <NA>
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 <NA>
9 1 0 1 0 1 d
10 0 0 0 1 1 c
I've tried using max.col but it returns the last column name if a zero does not exist. Any other solutions? A dplyr solution is preferred.
> have$last_0 = names(have)[max.col(have == 0, ties.method = "last")]
> have
a b c d e last_0
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 e
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 e
9 1 0 1 0 1 d
10 0 0 0 1 1 c
Here is an approach with purrr::pmap:
library(dplyr);library(purrr)
have %>%
mutate(want = pmap_chr(cur_data(),
~ tail(c(NA,names(which(c(...)==0))),1)))
a b c d e want
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 <NA>
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 <NA>
9 1 0 1 0 1 d
10 0 0 0 1 1 c
purrr:pmap is a very useful function because it will work row wise on data and it comes in various flavors so you can control what returns. You can refer to the entire row of data with c(...).
If you wanted to apply the procedure to only a subset of columns, you might use dplyr::select:
have %>%
mutate(want = pmap_chr(cur_data() %>% select(a,b,c),
~ tail(c(NA,names(which(c(...)==0))),1)))
a b c d e want
1 1 1 0 1 1 c
2 0 0 0 0 1 c
3 1 1 0 1 1 c
4 1 1 1 1 1 <NA>
5 0 1 0 0 1 c
6 0 0 1 0 1 b
7 1 1 1 0 1 <NA>
8 1 1 1 1 1 <NA>
9 1 0 1 0 1 b
10 0 0 0 1 1 c
We can use max.col and then replace those elements that doesn't have any 0 to NA
have$last_0 <- names(have)[(NA^!rowSums(have == 0)) * max.col(have == 0, 'last')]
-output
have
a b c d e last_0
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 <NA>
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 <NA>
9 1 0 1 0 1 d
10 0 0 0 1 1 c
Here is my approach:
library(dplyr)
have %>%
purrr::pmap_dfr(\(...) tibble(...,
last_0 = which(c(...) == 0) %>%
names %>%
last))
Returns:
# A tibble: 10 x 6
a b c d e last_0
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 NA
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 NA
9 1 0 1 0 1 d
10 0 0 0 1 1 c
We can use pmax
have$last_0 <- names(have)[replace(do.call(pmax, data.frame((have == 0) * col(have))), rowSums(have) == ncol(have), NA)]
Another base R option using max.col
have$last_0 <- replace(names(have)[max.col(1 - have, "last")], rowSums(have) == ncol(have), NA)
such that
> have
a b c d e last_0
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 <NA>
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 <NA>
9 1 0 1 0 1 d
10 0 0 0 1 1 c
You can also use the following solution, however it may sound a bit verbose. As most of the possible "row_wise" operations had already been suggested, I thought I would try something I had never done before:
library(dplyr)
library(tidyr)
have %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = "Last_0", values_to = "val") %>%
group_by(id) %>%
arrange(desc(val), .by_group = TRUE) %>%
slice_tail(n = 1) %>%
mutate(Last_0 = ifelse(val == 1, "NA", Last_0)) %>%
select(Last_0) %>%
bind_cols(have) %>%
relocate(Last_0, .after = last_col())
# A tibble: 10 x 7
# Groups: id [10]
id a b c d e Last_0
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 1 0 1 1 c
2 2 0 0 0 0 1 d
3 3 1 1 0 1 1 c
4 4 1 1 1 1 1 NA
5 5 0 1 0 0 1 d
6 6 0 0 1 0 1 d
7 7 1 1 1 0 1 d
8 8 1 1 1 1 1 NA
9 9 1 0 1 0 1 d
10 10 0 0 0 1 1 c
Though the strategy used by dear #akrun is fantabulous, but you can simply store the value calculated by you inside a temp variable and replace it like this. This can be done in dplyr only using cur_data()
library(dplyr)
have %>% mutate(last_0 = {xx <- names(.)[max.col(cur_data() == 0, ties.method = 'last')];
replace(xx, rowSums(cur_data() == 0) == 0, NA)})
#> a b c d e last_0
#> 1 1 1 0 1 1 c
#> 2 0 0 0 0 1 d
#> 3 1 1 0 1 1 c
#> 4 1 1 1 1 1 <NA>
#> 5 0 1 0 0 1 d
#> 6 0 0 1 0 1 d
#> 7 1 1 1 0 1 d
#> 8 1 1 1 1 1 <NA>
#> 9 1 0 1 0 1 d
#> 10 0 0 0 1 1 c
Created on 2021-06-10 by the reprex package (v2.0.0)
A minimalist base R approach (that actually uses nothing but a grep-search for the condition):
data.frame( have,
last_0=unlist(apply( have, 1, function(x){
sol <- grep(0,x);
if( length(sol > 0 )){
colnames(have)[sol[length(sol)]]
}else{
NA } } )) )
a b c d e last_0
1 1 1 0 1 1 c
2 0 0 0 0 1 d
3 1 1 0 1 1 c
4 1 1 1 1 1 <NA>
5 0 1 0 0 1 d
6 0 0 1 0 1 d
7 1 1 1 0 1 d
8 1 1 1 1 1 <NA>
9 1 0 1 0 1 d
10 0 0 0 1 1 c

R: df header columns are ordinal ranking and spread across columns for each observation

I have a questionnaire data that look like below:
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1
Basically, the header columns (no. of stars rating and satisfactory) are ordinal ranking for each Items. I would like to summarize the no_stars(col 2:4) and satisfactory(col 5:7) into one column so that the output would look like this :
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
$no_stars <- 1 is for no_stars1, 2 for no_stars2, 3 for no_stars3
$satisfactory <- 1 is for bad, 2 for average, 3 for good
I have tried the code below
df$no_stars2[df$no_stars2 == 1] <- 2
df$no_stars3[df$no_stars3 == 1] <- 3
df$average[df$average == 1] <- 2
df$satisfied[df$satisfied == 1] <- 3
no_stars <- df$no_stars1 + df$no_stars2 + df$no_stars3
satisfactory <- df$bad + df$average + df$satisfied
tidy_df <- data.frame(df$Items, no_stars, satisfactory)
tidy_df
Is there any function in R that can do the same thing? or
anyone got better and simpler solution ?
Thanks
Just use max.col and set preferences:
starsOrder<-c("no_stars1","no_stars2","no_stars3")
satOrder<-c("bad","average","satisfied")
data.frame(items=df$items,no_stars=max.col(df[,starsOrder]),
satisfactory=max.col(df[,satOrder]))
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
Another tidyverse solution making use of factor to integer conversions to encode no_stars and satisfactory and spreading from wide to long twice:
library(tidyverse)
df %>%
gather(no_stars, v1, starts_with("no_stars")) %>%
mutate(no_stars = as.integer(factor(no_stars))) %>%
gather(satisfactory, v2, average, satisfied, bad) %>%
filter(v1 > 0 & v2 > 0) %>%
mutate(satisfactory = as.integer(factor(
satisfactory, levels = c("bad", "average", "satisfied")))) %>%
select(-v1, -v2) %>%
arrange(items)
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
While there may be more elegant solutions, using dplyr::case_when() gives you the flexibility to code things however you want:
library(dplyr)
df %>%
dplyr::mutate(
no_stars = dplyr::case_when(
no_stars1 == 1 ~ 1,
no_stars2 == 1 ~ 2,
no_stars3 == 1 ~ 3)
, satisfactory = dplyr::case_when(
average == 1 ~ 2,
satisfied == 1 ~ 3,
bad == 1 ~ 1)
)
# items no_stars1 no_stars2 no_stars3 average satisfied bad no_stars satisfactory
# 1 A 1 0 0 0 0 1 1 1
# 2 B 0 1 0 1 0 0 2 2
# 3 C 0 0 1 0 1 0 3 3
# 4 D 0 1 0 0 1 0 2 3
# 5 E 0 0 1 1 0 0 3 2
# 6 F 0 0 1 0 1 0 3 3
# 7 G 1 0 0 0 0 1 1 1
dat%>%
replace(.==1,NA)%>%
replace_na(setNames(as.list(names(.)),names(.)))%>%
replace(.==0,NA)%>%
mutate(s=coalesce(!!!.[2:4]),
no_stars=as.numeric(factor(s,unique(s))),
t=coalesce(!!!.[5:7]),
satisfactory=as.numeric(factor(t,unique(t))))%>%
select(items,no_stars,satisfactory)
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
using apply and match :
data.frame(
items = df1$items,
no_stars = apply(df1[2:4], 1, match, x=1),
satisfactory = apply(df1[c(7,5:6)], 1, match, x=1))
# items no_stars satisfactory
# 1 A 1 1
# 2 B 2 2
# 3 C 3 3
# 4 D 2 3
# 5 E 3 2
# 6 F 3 3
# 7 G 1 1
data
df1 <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1")

How to replace certain values with their column name

I have a following table in R
df <- data.frame('a' = c(1,0,0,1,0),
'b' = c(1,0,0,1,0),
'c' = c(1,1,0,1,1))
df
a b c
1 1 1 1
2 0 0 1
3 0 0 0
4 1 1 1
4 0 0 1
What I want is to replace the row value with the column name whenever the row is equal to 1. The output would be this one:
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
4 0 0 c
How can I do this in R? Thanks.
I would use Map and replace:
df[] <- Map(function(n, x) replace(x, x == 1, n), names(df), df)
df
# a b c
# 1 a b c
# 2 0 0 c
# 3 0 0 0
# 4 a b c
# 5 0 0 c
We can use
df[] <- names(df)[(NA^!df) * col(df)]
df[is.na(df)] <- 0
df
# a b c
#1 a b c
#2 0 0 c
#3 0 0 0
#4 a b c
#4 0 0 c
You can try stack and unstack
a=stack(df)
a
values ind
1 1 a
2 0 a
3 0 a
4 1 a
5 0 a
6 1 b
7 0 b
8 0 b
9 1 b
10 0 b
11 1 c
12 1 c
13 0 c
14 1 c
15 1 c
a$values[a$values==1]=as.character(a$ind)[a$values==1]
unstack(a)
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
We can try iterating over the names of the data frame, and then handling each column, for a base R option:
df <- data.frame(a=c(1,0,0,1,0), b=c(1,0,0,1,0), c=c(1,1,0,1,1))
df <- data.frame(sapply(names(df), function(x) {
y <- df[[x]]
y[y == 1] <- x
return(y)
}))
df
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
Demo
You can do it with ifelse, but you have to do some intermediate transposing to account for R's column-major order processing.
data.frame(t(ifelse(t(df)==1,names(df),0)))
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c

dataframes in R with multi-level rows and columns

Let's say I have the following data frame.
> df = data.frame(rowsA = sample(c('A','B','C'), 100, replace=TRUE),
rowsB = sample(c('D','E','F'), 100, replace=TRUE),
colsA = sample(c('G','H','I'), 100, replace=TRUE),
colsB = sample(c('J','K','L'), 100, replace=TRUE))
> head(df)
rowsA rowsB colsA colsB
1 B E I L
2 A E G J
3 A E H K
4 A D I J
5 C F G J
6 A F G J
Is it possible to create a multi-level table of counts?
In excel, it is possible with the PivotTable functionality
I think it possible in python in pandas with the df.columns.levels method.
I also figured out how to do multi-level rows only in R with dplyr (but haven't figured out multi-level columns)
df %>%
group_by(rowsA, rowsB, colsA) %>%
summarise(count = n()) %>%
spread(colsA, count)
# A tibble: 9 x 5
# Groups: rowsA, rowsB [9]
rowsA rowsB G H I
* <fctr> <fctr> <int> <int> <int>
1 A D 5 3 1
2 A E 1 2 1
3 A F 5 8 NA
4 B D 5 5 5
5 B E 2 4 6
6 B F 4 6 5
7 C D 2 6 NA
8 C E 6 5 3
9 C F 4 3 3
Paste the columns that goes to the header into one column, then reshape it, in which way you have a contingency table with the same meaning as the multi level count:
library(dplyr); library(tidyr)
df %>%
unite(header, c('colsA', 'colsB')) %>%
count(rowsA, rowsB, header) %>%
spread(header, n, fill = 0)
# A tibble: 9 x 11
# rowsA rowsB G_J G_K G_L H_J H_K H_L I_J I_K I_L
#* <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A D 1 0 0 0 3 1 1 1 0
#2 A E 2 0 0 1 1 0 0 0 1
#3 A F 5 0 0 3 2 1 0 1 1
#4 B D 0 1 1 1 0 3 1 1 1
#5 B E 2 2 1 3 1 1 0 3 1
#6 B F 1 1 2 3 3 0 1 2 1
#7 C D 0 2 3 1 2 0 4 3 2
#8 C E 2 2 2 1 2 0 0 1 1
#9 C F 1 0 1 2 0 1 2 1 2
Or if you are OK with a table/array/matrix as result, you can use xtabs, (borrowed from this answer), which essentially gives a 4-d array but with ftable, it can be displayed as you need:
ftable(xtabs(data = df), row.vars = 1:2, col.vars = 3:4)
# colsA G H I
# colsB J K L J K L J K L
#rowsA rowsB
#A D 1 0 0 0 3 1 1 1 0
# E 2 0 0 1 1 0 0 0 1
# F 5 0 0 3 2 1 0 1 1
#B D 0 1 1 1 0 3 1 1 1
# E 2 2 1 3 1 1 0 3 1
# F 1 1 2 3 3 0 1 2 1
#C D 0 2 3 1 2 0 4 3 2
# E 2 2 2 1 2 0 0 1 1
# F 1 0 1 2 0 1 2 1 2

Resources