Replace column values based on column name - r

I have a data frame with several binary variables: x1, x2, ... x100. I want to replace the entry 1 in each column with the number in the name of the column, i.e.:
data$x2[data$x2 == 1] <- 2
data$x3[data$x3 == 1] <- 3
data$x4[data$x4 == 1] <- 4
data$x5[data$x5 == 1] <- 5
...
How can I achieve this in a loop?

Using col:
# example data
set.seed(1); d <- as.data.frame(matrix(sample(0:1, 12, replace = TRUE), nrow = 3))
names(d) <- paste0("x", seq(ncol(d)))
d
# x1 x2 x3 x4
# 1 0 0 0 1
# 2 1 1 0 0
# 3 0 0 1 0
ix <- d == 1
d[ ix ] <- col(d)[ ix ]
d
# x1 x2 x3 x4
# 1 0 0 0 4
# 2 1 2 0 0
# 3 0 0 3 0

dplyr approach (using zx8754's data):
library(dplyr)
d %>%
mutate(across(starts_with('x'), ~ . * as.numeric(gsub('x', '', cur_column()))))
#> x1 x2 x3 x4
#> 1 0 0 0 4
#> 2 1 2 0 0
#> 3 0 0 3 0
Created on 2021-05-26 by the reprex package (v2.0.0)

Here is a base R solution with a lapply loop.
data[-1] <- lapply(names(data)[-1], function(k){
n <- as.integer(sub("[^[:digit:]]*", "", k))
data[data[[k]] == 1, k] <- n
data[[k]]
})
data
Test data.
set.seed(2021)
data <- replicate(6, rbinom(10, 1, 0.5))
data <- as.data.frame(data)
names(data) <- paste0("x", 1:6)

A solution based on a simple for loop is below (otherwise similar to the accepted answer using lapply):
for (i in 2:100) {
k <- paste0('x', i)
data[data[[k]] == 1, k] <- i
}

Related

Replace selected columns' negative values with 0s or NAs using R

For the example data df, I want to replace the negative values in the first column (x1) with 0 and the third column (x3) with NA by the function replace_negatives as follows:
df <- data.frame(x1 = -3:1,
x2 = -1,
x3 = -2:2)
df
Out:
x1 x2 x3
1 -3 -1 -2
2 -2 -1 -1
3 -1 -1 0
4 0 -1 1
5 1 -1 2
Please note that I do not index by column names because there are many columns in the actual data and the column names are not fixed.
replace_negatives <- function(data){
df <<- data %>%
mutate(.[[1]] = if_else(.[[2]] < 0, 0, .[[1]])) %>%
mutate(.[[3]] = if_else(.[[3]] < 0, NA, .[[3]]))
return(df)
}
lapply(df, replace_negatives)
But it raises an error:
> replace_negatives <- function(data){
+ df <<- data %>%
+ mutate(.[[1]] = if_else(.[[2]] < 0, 0, .[[1]])) %>%
Error: unexpected '=' in:
" df <<- data %>%
mutate(.[[1]] ="
> mutate(.[[3]] = if_else(.[[3]] < 0, NA, .[[3]]))
Error: unexpected '=' in " mutate(.[[3]] ="
> return(df)
Error: no function to return from, jumping to top level
> }
Error: unexpected '}' in "}"
Any helps would be appreciated.
The expected output:
x1 x2 x3
1 0 -1 NA
2 0 -1 NA
3 0 -1 0
4 0 -1 1
5 1 -1 2
To perform the required operation, here's a base R method:
df <- data.frame(x1 = -3:1,
x2 = -1,
x3 = -2:2)
df[[1]] <- ifelse(df[[1]] < 0, 0, df[[1]])
df[[3]] <- ifelse(df[[3]] < 0, NA, df[[3]])
df
#> x1 x2 x3
#> 1 0 -1 NA
#> 2 0 -1 NA
#> 3 0 -1 0
#> 4 0 -1 1
#> 5 1 -1 2
Created on 2022-04-18 by the reprex package (v2.0.1)
You could use across in the function:
library(tidyverse)
replace_negatives <- function(data){
df <- data %>%
mutate(across(1, ~ ifelse(. < 0, 0, .)),
across(3, ~ ifelse(. < 0, NA, .)))
return(df)
}
replace_negatives(df)
Output
x1 x2 x3
1 0 -1 NA
2 0 -1 NA
3 0 -1 0
4 0 -1 1
5 1 -1 2
Here is base R version of your function:
replace_negatives <- function(df){
is.na(df[,1]) <- df[,1] < 0
index <- df[,3] < 0
df[,3][index] <- 0
return(df)
}
replace_negatives(df)
x1 x2 x3
1 NA -1 0
2 NA -1 0
3 NA -1 0
4 0 -1 1
5 1 -1 2

Extract certain columns from data frame R

My data frame looks like this:
x s1 s2 s3 s4
1 x1 1 1954 1 yes
2 x2 2 1955 1 no
3 x3 1 1976 2 yes
4 x4 2 1954 2 yes
5 x5 3 1943 1 no
Sample data:
df <- data.frame(x=c('x1','x2','x3','x4','x5'),
s1=c(1,2,1,2,3),
s2=c(1954,1955,1976,1954,1943),
s3=c(1,1,2,2,1),
s4=c('yes','no','yes','yes','no'))```
Is it possible to extract the data frame's columns containing integers 1 to 3? For example, the new data frame would look like:
newdf
x s1 s3
1 x1 1 1
2 x2 2 1
3 x3 1 2
4 x4 2 2
5 x5 3 1
Is it possible to change the s1 and s3 columns to 0 or 1 depending on whether or not the value in the column is 1? The altered data frame would then look like:
newdf2
x s1 s3
1 x1 1 1
2 x2 0 1
3 x3 1 0
4 x4 0 0
5 x5 0 1
base R
newdf <- df[, unique(c("x", names(which(sapply(df, function(z) is.numeric(z) & any(c(1, 3) %in% z)))))), drop = FALSE]
newdf
# x s1 s3
# 1 x1 1 1
# 2 x2 2 1
# 3 x3 1 2
# 4 x4 2 2
# 5 x5 3 1
newdf[-1] <- lapply(newdf[-1], function(z) +(z == 1))
newdf
# x s1 s3
# 1 x1 1 1
# 2 x2 0 1
# 3 x3 1 0
# 4 x4 0 0
# 5 x5 0 1
Walk-through:
first, we determine which columns are numbers and contain the numbers 1 or 3:
sapply(df, function(z) is.numeric(z) & any(c(1, 3) %in% z))
# x s1 s2 s3 s4
# FALSE TRUE FALSE TRUE FALSE
This will exclude any column that is not numeric, meaning that a character column that contains a literal "1" or "3" will not be retained. This is complete inference on my end; if you want to accept the string versions then remove the is.numeric(z) component.
second, we extract the names of those that are true, and prepend "x"
c("x", names(which(sapply(df, function(z) is.numeric(z) & any(c(1, 3) %in% z)))))
# [1] "x" "s1" "s3"
wrap that in unique(.) if, for some reason, "x" is also numeric and contains 1 or 3 (this step is purely defensive, you may not strictly need it)
select those columns, defensively adding drop=FALSE so that if only one column is matched, it still returns a full data.frame
replace just those columns (excluding the first column which is "x") with 0 or 1; the z == 1 returns logical, and the wrapping +(..) converts logical to 0 (false) or 1 (true).
dplyr
library(dplyr)
df %>%
select(x, where(~ is.numeric(.) & any(c(1, 3) %in% .))) %>%
mutate(across(-x, ~ +(. == 1)))
# x s1 s3
# 1 x1 1 1
# 2 x2 0 1
# 3 x3 1 0
# 4 x4 0 0
# 5 x5 0 1
I think this is what you expect :
my_df <- data.frame(x=c('x1','x2','x3','x4','x5'),
s1=c(1,2,1,2,3),
s2=c(1954,1955,1976,1954,1943),
s3=c(1,1,2,2,1),
s4=c('yes','no','yes','yes','no'))
my_df$end <- apply(my_df, 2, function(x) paste(x, collapse = " "))
my_df <- my_df %>% group_by(x) %>% mutate(end2 = paste(str_extract_all(string = end, pattern = "1|2|3", simplify = TRUE), collapse = " "))
my_var <- which(my_df$end == my_df$end2)
my_df[, my_var] <- t(apply(my_df[, my_var], 1, function(x) ifelse(test = x == 1, yes = 1, no = 0)))
my_df <- my_df[, c(1, my_var)]

Assigning values to the matrix by matching column names of matrix and character vectors

I want to create a matrix and assign 1s based on matching the rownames of the matrix to the character vector.
## Here is the small example matrix
x <- as.character(c("rm78", "mn05", "hg78"))
y <- as.character(c("JU67", "EX56", "abcd", "rm78", "xyh56", "def", "terr6572"))
z <- as.character(c("abcd", "rh990", "mn05", "rm78", "xyh56", "efdg", "bett72"))
common <- Reduce(union, list(x,y,z))
dat.names <- c("x", "y", "z")
mat0 <- matrix(0, nrow = length(common), ncol = length(dat.names))
colnames(mat0) <- dat.names
rownames(mat0) <- common
mat0
If the character vectors x, y, and z matches the rownames of the matrix mat0 then assign 1 to the corresponding value in the matrix.
I am doing this individually for each vector and adding values to the matrix. I have a list of more than 12 such vectors and doing this way would be redundant. I think there may be a much efficient way of doing this.
for(i in rownames(mat0)[rownames(mat0) %in% x])
{
# first column
mat0[i , 1] <- 1
}
for(i in rownames(mat0)[rownames(mat0) %in% y])
{
# second column
mat0[i , 2] <- 1
}
for(i in rownames(mat0)[rownames(mat0) %in% z])
{
# third column
mat0[i , 3] <- 1
}
Yes, you don't need multiple loops. In fact, you don't need any:
mat0[] <- do.call(cbind, lapply(list(x, y, z), function(i) +(rownames(mat0) %in% i)))
mat0
#> x y z
#> rm78 1 1 1
#> mn05 1 0 1
#> hg78 1 0 0
#> JU67 0 1 0
#> EX56 0 1 0
#> abcd 0 1 1
#> xyh56 0 1 1
#> def 0 1 0
#> terr6572 0 1 0
#> rh990 0 0 1
#> efdg 0 0 1
#> bett72 0 0 1

repeating apply with different combination of cols

I can´t nest an apply funtion to repeat it many times with other combination of columns
I need to get a percentage of sp1==1 & s1==1and sp2==1 & s1==1 regarding s1, and in the same way regarding s2, s3... s1000. Here a short example:
x <- data.frame("sp1"=rep(0:1, times=5),
"sp2"=rep(0:1, each=5),
"s1" = rep(0:1, times=10),
"s2" = rep(0:1, each=2),
"s3" = rep(1:0, each=2))
> x
sp1 sp2 s1 s2 s3
1 0 0 0 0 1
2 1 0 1 0 1
3 0 0 0 1 0
4 1 0 1 1 0
5 0 0 0 0 1
6 1 1 1 0 1
7 0 1 0 1 0
8 1 1 1 1 0
9 0 1 0 0 1
10 1 1 1 0 1
11 0 0 0 1 0
12 1 0 1 1 0
13 0 0 0 0 1
14 1 0 1 0 1
15 0 0 0 1 0
16 1 1 1 1 0
17 0 1 0 0 1
18 1 1 1 0 1
19 0 1 0 1 0
20 1 1 1 1 0
Now I typed a function to calculate percentage regarding s1:
r <- as.data.frame(sapply(x[,1:2],
function(i) sum(i ==1 & x$s1 == 1)/sum(i ==1)))
> r
sapply(x[, 1:2], function(i) sum(i == 1 & x$s1 == 1)/sum(i == 1))
sp1 1.0
sp2 0.6
I want to built a df with all percentages of sp1, sp2, sp3, ...sp200 regarding s1, s2, s3, ...s1000...
> r
s1 s2 s3 ... s1000
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5
...
sp200
I've tried to do a function with both groups-variables, one for sp's and another for s's:
intento <- as.data.frame(sapply(i=x[,1:2],
j=x[,3:5],
function(i,j)sum(i ==1 & j == 1)/sum(i ==1)))
But logically that´s not the way:
Error in match.fun(FUN) : argument "FUN" is missing, with no default
We can seperate the columns based on their names and use sapply on them
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
sapply(x[sp_cols], function(i) sapply(x[s_cols],
function(j) sum(i == 1 & j == 1)/sum(i == 1)))
If you have only 1 and 0's as values in the columns this can be reduced to
sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
# s1 s2 s3
#sp1 1.0 0.5 0.5
#sp2 0.6 0.5 0.5
You're looking for outer. Your function just needs to be Vectorized.
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))
outer(1:2, 3:5, FUN)
# [,1] [,2] [,3]
# [1,] 1.0 0.5 0.5
# [2,] 0.6 0.5 0.5
You could refine this using grep to find the columns automatically
outer(grep("sp", names(x)), grep("s\\d+", names(x)), FUN)
A similar approach is to use lapply(x, function(x) which(x == 1) and then use that down the road. The thought process being that we might as well store the information instead of repeatedly checking it.
#as suggested by #Ronak
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols]
, function(x) sapply(x_l_zero[sp_cols]
, function(y) length(intersect(x,y))/length(y)))
s1 s2 s3
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5
#Ronak has the fastest solution and is more-or-less the OP's code that's been addressed.
Unit: microseconds
expr min lq mean median uq max neval
jay.sf_outer_FUN 1190.8 1240.85 1360.103 1284.50 1337.30 2627.0 100
cole_which_apply 268.4 289.00 454.609 306.05 322.00 7610.7 100
ronak_1_unsimple 181.3 193.95 321.863 209.95 233.40 6227.4 100
ronak_2_simple 228.5 241.25 342.354 250.65 276.05 7478.4 100
akrun_dplyr 5218.7 5506.05 6108.997 5721.80 6081.65 25147.3 100
Code for performance:
library(microbenchmark)
library(tidyverse)
##data set
x <- data.frame("sp1"=rep(0:1, times=5),
"sp2"=rep(0:1, each=5),
"s1" = rep(0:1, times=10),
"s2" = rep(0:1, each=2),
"s3" = rep(1:0, each=2))
#for jay.sf
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))
#names of columns
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
sp_cols_nam <- grep("^sp", names(x), value = T)
s_cols_nam <- grep("^s\\d+", names(x), value = T)
#benchmark
microbenchmark(
outer_FUN = {
outer(sp_cols, s_cols, FUN)
}
, apply_heaven = {
x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols], function(x) sapply(x_l_zero[sp_cols] , function(y) length(intersect(x,y))/length(y)))
}
, ronak_1_unsimple = {
sapply(x[sp_cols], function(i) sapply(x[s_cols],
function(j) sum(i == 1 & j == 1)/sum(i == 1)))
}
, ronak_2_simple = {
sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
}
, akrun_dplyr = {
crossing(nm1 = sp_cols_nam,
nm2 = s_cols_nam) %>%
mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
spread(nm2, val)
}
)
Here is an option with tidyverse
library(tidyverse)
crossing(nm1 = names(x)[startsWith(names(x), "sp")],
nm2 = grep("^s\\d+", names(x), value = TRUE)) %>%
mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
spread(nm2, val)
# A tibble: 2 x 4
# nm1 s1 s2 s3
# <chr> <dbl> <dbl> <dbl>
#1 sp1 1 0.5 0.5
#2 sp2 0.6 0.5 0.5

Converting counts to individual observations in r

I have a data set that looks as follows
df <- data.frame( name = c("a", "b", "c"),
judgement1= c(5, 0, NA),
judgement2= c(1, 1, NA),
judgement3= c(2, 1, NA))
I want to reshape the dataframe to look like this
# name judgement1 judgement2 judgement3
# a 1 0 0
# a 1 0 0
# a 1 0 0
# a 1 0 0
# a 1 0 0
# b 1 0 0
# b 0 1 0
# b 0 0 1
And so on. I have seen that untable is recommended on some other threads, but it does not appear to work with the current version of r. Is there a package that can convert summarised counts into individual observations?
You could try something like this:
df <- data.frame( name = c("a", "b", "c"),
judgement1= c(5, 0, NA),
judgement2= c(1, 1, NA),
judgement3= c(2, 1, NA))
rep.vec <- colSums(df[colnames(df) %in% paste0("judgement", (1:nrow(df)), sep="")], na.rm = TRUE)
want <- data.frame(name=df$name, cbind(diag(nrow(df))))
colnames(want)[-1] <- paste0("judgement", (1:nrow(df)), sep="")
(want <- want[rep(1:nrow(want), rep.vec), ])
I wrote a function that works to give you your desired output:
untabl <- function(df, id.col, count.cols) {
df[is.na(df)] <- 0 # replace NAs
out <- lapply(count.cols, function(x) { # for each column with counts
z <- df[rep(1:nrow(df), df[,x]), ] # replicate rows
z[, -c(id.col)] <- 0 # set all other columns to zero
z[, x] <- 1 # replace the count values with 1
z
})
out <- do.call(rbind, out) # combine the list
out <- out[order(out[,c(id.col)]),] # reorder (you can change this)
rownames(out) <- NULL # return to simple row numbers
out
}
untabl(df = df, id.col = 1, count.cols = c(2,3,4))
# name judgement1 judgement2 judgement3
#1 a 1 0 0
#2 a 1 0 0
#3 a 1 0 0
#4 a 1 0 0
#5 a 1 0 0
#6 a 0 1 0
#7 b 0 1 0
#8 a 0 0 1
#9 a 0 0 1
#10 b 0 0 1
And for your reference, reshape::untable consists of the following code:
function (df, num)
{
df[rep(1:nrow(df), num), ]
}

Resources