How to replace certain values with their column name - r

I have a following table in R
df <- data.frame('a' = c(1,0,0,1,0),
'b' = c(1,0,0,1,0),
'c' = c(1,1,0,1,1))
df
a b c
1 1 1 1
2 0 0 1
3 0 0 0
4 1 1 1
4 0 0 1
What I want is to replace the row value with the column name whenever the row is equal to 1. The output would be this one:
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
4 0 0 c
How can I do this in R? Thanks.

I would use Map and replace:
df[] <- Map(function(n, x) replace(x, x == 1, n), names(df), df)
df
# a b c
# 1 a b c
# 2 0 0 c
# 3 0 0 0
# 4 a b c
# 5 0 0 c

We can use
df[] <- names(df)[(NA^!df) * col(df)]
df[is.na(df)] <- 0
df
# a b c
#1 a b c
#2 0 0 c
#3 0 0 0
#4 a b c
#4 0 0 c

You can try stack and unstack
a=stack(df)
a
values ind
1 1 a
2 0 a
3 0 a
4 1 a
5 0 a
6 1 b
7 0 b
8 0 b
9 1 b
10 0 b
11 1 c
12 1 c
13 0 c
14 1 c
15 1 c
a$values[a$values==1]=as.character(a$ind)[a$values==1]
unstack(a)
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c

We can try iterating over the names of the data frame, and then handling each column, for a base R option:
df <- data.frame(a=c(1,0,0,1,0), b=c(1,0,0,1,0), c=c(1,1,0,1,1))
df <- data.frame(sapply(names(df), function(x) {
y <- df[[x]]
y[y == 1] <- x
return(y)
}))
df
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
Demo

You can do it with ifelse, but you have to do some intermediate transposing to account for R's column-major order processing.
data.frame(t(ifelse(t(df)==1,names(df),0)))
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c

Related

how to add columns iteratively for recoding with semi-modified names

I would use this dataset as an example
BEZ <- c("A","A","A","A","B","B","B")
var <- c("B","B","B","B","B","B","B")
bar <- c("B","B","B","B","B","B","B")
Bez1 <- c("A","A","A","A","B","B","B")
var1 <- c("B","B","B","B","B","B","B")
bar1 <- c("B","B","B","B","B","B","B")
dat <- data.frame(BEZ, var, bar, Bez1, var1, bar1)
the tricky thing that I would like to do is use a method (loops, map(), apply(), dplyr functions, and so on) to create aside the already existing new column where based on the respective row value is converted into a number.
Excepeted result
BEZ BEZ_num var var_num bar bar_num Bez1 BEZ1_num var1 var1_num bar1 bar1_num
A 0 B 1 B 1 A 0 B 1 B 1
A 0 B 1 B 1 A 0 B 1 B 1
A 0 B 1 C 2 A 0 B 1 A 0
A 0 B 1 B 1 A 0 C 2 B 1
B 1 B 1 B 1 B 1 C 2 C 2
B 1 B 1 B 1 A 0 B 1 B 1
B 1 B 1 B 1 A 0 B 1 B 1
This is more or less the idea I would like to hit. Any suggestions?
Thanks
Using factor
library(dplyr)
dat %>%
mutate(across(everything(), ~ as.integer(factor(.x))-1, .names = "{.col}_num"))
-output
BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
1 A B B A B B 0 0 0 0 0 0
2 A B B A B B 0 0 0 0 0 0
3 A B B A B B 0 0 0 0 0 0
4 A B B A B B 0 0 0 0 0 0
5 B B B B B B 1 0 0 1 0 0
6 B B B B B B 1 0 0 1 0 0
7 B B B B B B 1 0 0 1 0 0
See in the comments. The provided data frame and the expected output do not match. But I think we could use mutate(across..) with the .names argument combined with case_when:
library(dplyr)
dat %>%
mutate(across(everything(), ~case_when(
. == "A" ~ "0",
. == "B" ~ "1",
. == "C" ~ "2"), .names = "{col}_num"))
BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
1 A B B A B B 0 1 1 0 1 1
2 A B B A B B 0 1 1 0 1 1
3 A B B A B B 0 1 1 0 1 1
4 A B B A B B 0 1 1 0 1 1
5 B B B B B B 1 1 1 1 1 1
6 B B B B B B 1 1 1 1 1 1
7 B B B B B B 1 1 1 1 1 1
Using a for loop in base R:
dat2 <- dat[, 1, drop = FALSE]
for (col in names(dat)) {
dat2[[col]] <- dat[[col]]
dat2[[paste0(col, "_num")]] <- match(dat[[col]], LETTERS) - 1
}
dat2
# BEZ BEZ_num var var_num bar bar_num Bez1 Bez1_num var1 var1_num bar1 bar1_num
# 1 A 0 B 1 B 1 A 0 B 1 B 1
# 2 A 0 B 1 B 1 A 0 B 1 B 1
# 3 A 0 B 1 B 1 A 0 B 1 B 1
# 4 A 0 B 1 B 1 A 0 B 1 B 1
# 5 B 1 B 1 B 1 B 1 B 1 B 1
# 6 B 1 B 1 B 1 B 1 B 1 B 1
# 7 B 1 B 1 B 1 B 1 B 1 B 1
Or a (slightly convoluted) approach using dplyr::across():
library(dplyr)
dat %>%
mutate(
across(BEZ:bar1, list(TMP = identity, num = \(x) match(x, LETTERS) - 1)),
.keep = "unused"
) %>%
rename_with(\(x) gsub("_TMP$", "", x))
# same output as above
Or finally, if you don't care about the order of the output columns, you could also use dplyr::across() with the .names argument:
dat %>%
mutate(across(
BEZ:bar1,
\(x) match(x, LETTERS) - 1,
.names = "{.col}_num"
))
# BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
# 1 A B B A B B 0 1 1 0 1 1
# 2 A B B A B B 0 1 1 0 1 1
# 3 A B B A B B 0 1 1 0 1 1
# 4 A B B A B B 0 1 1 0 1 1
# 5 B B B B B B 1 1 1 1 1 1
# 6 B B B B B B 1 1 1 1 1 1
# 7 B B B B B B 1 1 1 1 1 1
To add two further options:
With dplyr v.1.1.0 we can use consecutive_id():
library(dplyr) # v.1.1.0
dat %>%
mutate(across(everything(),
~ consecutive_id(.x)-1,
.names = "{.col}_num"))
#> BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
#> 1 A B B A B B 0 0 0 0 0 0
#> 2 A B B A B B 0 0 0 0 0 0
#> 3 A B B A B B 0 0 0 0 0 0
#> 4 A B B A B B 0 0 0 0 0 0
#> 5 B B B B B B 1 0 0 1 0 0
#> 6 B B B B B B 1 0 0 1 0 0
#> 7 B B B B B B 1 0 0 1 0 0
Similar we could use data.table::rleid():
dat %>%
mutate(across(everything(),
~ data.table::rleid(.x)-1,
.names = "{.col}_num"))
#> BEZ var bar Bez1 var1 bar1 BEZ_num var_num bar_num Bez1_num var1_num bar1_num
#> 1 A B B A B B 0 0 0 0 0 0
#> 2 A B B A B B 0 0 0 0 0 0
#> 3 A B B A B B 0 0 0 0 0 0
#> 4 A B B A B B 0 0 0 0 0 0
#> 5 B B B B B B 1 0 0 1 0 0
#> 6 B B B B B B 1 0 0 1 0 0
#> 7 B B B B B B 1 0 0 1 0 0
Created on 2023-02-03 with reprex v2.0.2

add column with total count of rows meeting a condition in dplyr

Trying to get totals by class and condition but not grouping data.
Reproducible example:
df <- data.frame("class" = c("a","b","c","d","b","b","b","b","c","c","a"),"increment" = c(0,0,0,0,0,0,32,12,0,0,0))
R> df
class increment
1 a 0
2 b 0
3 c 0
4 d 0
5 b 0
6 b 0
7 b 32
8 b 12
9 c 0
10 c 0
11 a 0
I want the total cases where increment is different from Zero but for every class.
Desired output:
R> df
class increment increment_count_per_class
1 a 0 0
2 b 0 2
3 c 0 0
4 d 0 0
5 b 0 2
6 b 0 2
7 b 32 2
8 b 12 2
9 c 0 0
10 c 0 0
11 a 0 0
My first approach is here below, but I know there must be a less convoluted way using dplyr:
df <- df %>% mutate(has.increment = ifelse(increment>0,1,0))
R> df
class increment has.increment
1 a 0 0
2 b 0 0
3 c 0 0
4 d 0 0
5 b 0 0
6 b 0 0
7 b 32 1
8 b 12 1
9 c 0 0
10 c 0 0
11 a 0 0
Get totals per class when increment exists
N <- df %>% group_by(class,has.increment) %>% tally() %>% filter(has.increment == 1)
R> N
# A tibble: 1 x 3
# Groups: class [1]
class has.increment n
<chr> <dbl> <int>
1 b 1 2
Then join:
merge(N,df, by = "class", all = TRUE)
R> merge(N,df, by = "class", all = TRUE)
class has.increment.x n increment has.increment.y
1 a NA NA 0 0
2 a NA NA 0 0
3 b 1 2 0 0
4 b 1 2 12 1
5 b 1 2 0 0
6 b 1 2 0 0
7 b 1 2 32 1
8 c NA NA 0 0
9 c NA NA 0 0
10 c NA NA 0 0
11 d NA NA 0 0
Try this:
df %>%
group_by(class) %>%
mutate(increment_count_per_class = sum(increment!=0))

R: df header columns are ordinal ranking and spread across columns for each observation

I have a questionnaire data that look like below:
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1
Basically, the header columns (no. of stars rating and satisfactory) are ordinal ranking for each Items. I would like to summarize the no_stars(col 2:4) and satisfactory(col 5:7) into one column so that the output would look like this :
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
$no_stars <- 1 is for no_stars1, 2 for no_stars2, 3 for no_stars3
$satisfactory <- 1 is for bad, 2 for average, 3 for good
I have tried the code below
df$no_stars2[df$no_stars2 == 1] <- 2
df$no_stars3[df$no_stars3 == 1] <- 3
df$average[df$average == 1] <- 2
df$satisfied[df$satisfied == 1] <- 3
no_stars <- df$no_stars1 + df$no_stars2 + df$no_stars3
satisfactory <- df$bad + df$average + df$satisfied
tidy_df <- data.frame(df$Items, no_stars, satisfactory)
tidy_df
Is there any function in R that can do the same thing? or
anyone got better and simpler solution ?
Thanks
Just use max.col and set preferences:
starsOrder<-c("no_stars1","no_stars2","no_stars3")
satOrder<-c("bad","average","satisfied")
data.frame(items=df$items,no_stars=max.col(df[,starsOrder]),
satisfactory=max.col(df[,satOrder]))
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
Another tidyverse solution making use of factor to integer conversions to encode no_stars and satisfactory and spreading from wide to long twice:
library(tidyverse)
df %>%
gather(no_stars, v1, starts_with("no_stars")) %>%
mutate(no_stars = as.integer(factor(no_stars))) %>%
gather(satisfactory, v2, average, satisfied, bad) %>%
filter(v1 > 0 & v2 > 0) %>%
mutate(satisfactory = as.integer(factor(
satisfactory, levels = c("bad", "average", "satisfied")))) %>%
select(-v1, -v2) %>%
arrange(items)
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
While there may be more elegant solutions, using dplyr::case_when() gives you the flexibility to code things however you want:
library(dplyr)
df %>%
dplyr::mutate(
no_stars = dplyr::case_when(
no_stars1 == 1 ~ 1,
no_stars2 == 1 ~ 2,
no_stars3 == 1 ~ 3)
, satisfactory = dplyr::case_when(
average == 1 ~ 2,
satisfied == 1 ~ 3,
bad == 1 ~ 1)
)
# items no_stars1 no_stars2 no_stars3 average satisfied bad no_stars satisfactory
# 1 A 1 0 0 0 0 1 1 1
# 2 B 0 1 0 1 0 0 2 2
# 3 C 0 0 1 0 1 0 3 3
# 4 D 0 1 0 0 1 0 2 3
# 5 E 0 0 1 1 0 0 3 2
# 6 F 0 0 1 0 1 0 3 3
# 7 G 1 0 0 0 0 1 1 1
dat%>%
replace(.==1,NA)%>%
replace_na(setNames(as.list(names(.)),names(.)))%>%
replace(.==0,NA)%>%
mutate(s=coalesce(!!!.[2:4]),
no_stars=as.numeric(factor(s,unique(s))),
t=coalesce(!!!.[5:7]),
satisfactory=as.numeric(factor(t,unique(t))))%>%
select(items,no_stars,satisfactory)
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
using apply and match :
data.frame(
items = df1$items,
no_stars = apply(df1[2:4], 1, match, x=1),
satisfactory = apply(df1[c(7,5:6)], 1, match, x=1))
# items no_stars satisfactory
# 1 A 1 1
# 2 B 2 2
# 3 C 3 3
# 4 D 2 3
# 5 E 3 2
# 6 F 3 3
# 7 G 1 1
data
df1 <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1")

R: Update adjacency matrix/data frame using pairwise combinations

Question
Let's say I have this dataframe:
# mock data set
df.size = 10
cluster.id<- sample(c(1:5), df.size, replace = TRUE)
letters <- sample(LETTERS[1:5], df.size, replace = TRUE)
test.set <- data.frame(cluster.id, letters)
Will be something like:
cluster.id letters
<int> <fctr>
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A
Now I want to group these per cluster.id and see what kind of letters I can find within a cluster, so for example cluster 3 contains the letters A,E,D,C. Then I want to get all unique pairwise combinations (but not combinations with itself so no A,A e.g.): A,E ; A,D, A,C etc. Then I want to update the pairwise distance for these combination in an adjacency matrix/data frame.
Idea
# group by cluster.id
# per group get all (unique) pairwise combinations for the letters (excluding pairwise combinations with itself, e.g. A,A)
# update adjacency for each pairwise combinations
What I tried
# empty adjacency df
possible <- LETTERS
adj.df <- data.frame(matrix(0, ncol = length(possible), nrow = length(possible)))
colnames(adj.df) <- rownames(adj.df) <- possible
# what I tried
update.adj <- function( data ) {
for (comb in combn(data$letters,2)) {
# stucked
}
}
test.set %>% group_by(cluster.id) %>% update.adj(.)
Probably there is an easy way to do this because I see adjacency matrices all the time, but I'm not able to figure it out.. Please let me know if it's not clear
Answer to comment
Answer to #Manuel Bickel:
For the data I gave as example (the table under "will be something like"):
This matrix will be A-->Z for the full dataset, keep that in mind.
A B C D E
A 0 0 1 1 2
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
I will explain what I did:
cluster.id letters
<int> <fctr>
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A
Only the clusters containing more > 1 unique letter are relevant (because we don't want combinations with itself, e.g cluster 1 containing only letter B, so it would result in combination B,B and is therefore not relevant):
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
Now I look for each cluster what pairwise combinations I can make:
cluster 3:
A,E
A,D
A,C
E,D
E,C
D,C
Update these combination in the adjacency matrix:
A B C D E
A 0 0 1 1 1
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
Then go to the next cluster
cluster 2
A,E
Update the adjacency matrix again:
A B C D E
A 0 0 1 1 2 <-- note the 2 now
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
As reaction to the huge dataset
library(reshape2)
test.set <- read.table(text = "
cluster.id letters
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A", header = T, stringsAsFactors = F)
x1 <- reshape2::dcast(test.set, cluster.id ~ letters)
x1
#cluster.id A B C D E
#1 1 1 0 0 0 0
#2 2 1 0 0 0 1
#3 3 1 0 1 1 1
#4 4 0 2 0 0 0
#5 5 1 0 0 0 0
x2 <- table(test.set)
x2
# letters
#cluster.id A B C D E
# 1 1 0 0 0 0
# 2 1 0 0 0 1
# 3 1 0 1 1 1
# 4 0 2 0 0 0
# 5 1 0 0 0 0
x1.c <- crossprod(x1)
#Error in crossprod(x, y) :
# requires numeric/complex matrix/vector arguments
x2.c <- crossprod(x2)
#works fine
Following above comment, here the code of Tyler Rinker used with your data. I hope this is what you want.
UPDATE: Following below comments, I added a solution using the package reshape2 in order to be able to handle larger amounts of data.
test.set <- read.table(text = "
cluster.id letters
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A", header = T, stringsAsFactors = F)
x <- table(test.set)
x
letters
#cluster.id A B C D E
# 1 1 0 0 0 0
# 2 1 0 0 0 1
# 3 1 0 1 1 1
# 4 0 2 0 0 0
# 5 1 0 0 0 0
#base approach, based on answer by Tyler Rinker
x <- crossprod(x)
diag(x) <- 0 #this is to set matches such as AA, BB, etc. to zero
x
# letters
# letters
# A B C D E
# A 0 0 1 1 2
# B 0 0 0 0 0
# C 1 0 0 1 1
# D 1 0 1 0 1
# E 2 0 1 1 0
#reshape2 approach
x <- acast(test.set, cluster.id ~ letters)
x <- crossprod(x)
diag(x) <- 0
x
# A B C D E
# A 0 0 1 1 2
# B 0 0 0 0 0
# C 1 0 0 1 1
# D 1 0 1 0 1
# E 2 0 1 1 0

Split data frame into chunk and assign names to chunks from vectors

I have a data frame with 5*n columns, where n is the number of categories listed in a vector. I want to break the data frame into chunks of 5 columns (eg. category 1 is columns 1:5, category 2 is columns 6:10) and then assign the category names from the vector to the chunks.
eg.
*original data frame* *vector of category names*
X a b c d e a b c d e a b c d e 1 apples
1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 2 oranges
2 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 3 bananas
Will become
*apples* *oranges* *bananas*
X a b c d e X a b c d e X a b c d e
1 1 0 0 0 1 1 0 1 0 1 0 1 0 0 1 1 0
2 0 1 0 1 0 2 0 0 1 0 1 2 1 0 0 0 1
I can find a whole lot of information about splitting data.frames by rows, which is much more common to do, but I can't find anything about splitting a data frame into n chunks by columns. Thanks!
You could split your original_data_frame by column indices similarely:
df <- read.table(header=T, check.names = F, text="
X a b c d e a b c d e a b c d e
1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0
2 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1")
n <- 5 # fixed chunksize (a-e)
lst <- lapply(split(2:ncol(df), rep(seq(ncol(df[-1])/n), each=n)), function(x) df[, x])
names(lst) <- c("apples", "oranges", "bananas")
# lst
# $apples
# a b c d e
# 1 1 0 0 0 1
# 2 0 1 0 1 0
#
# $oranges
# a b c d e
# 1 0 1 0 1 0
# 2 0 0 1 0 1
#
# $bananas
# a b c d e
# 1 0 0 1 1 0
# 2 1 0 0 0 1
I don't know if this is elegant, but it came to my mind, first.

Resources