Combine multiple factor columns into a single numeric column - r

Can someone think of a more interesting way to combine multiple factor columns into a single numeric column?
MWE dataset:
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
str(df$q.82)
Factor w/ 2 levels "","$80 and above": 1 2 2 1 1
df looks like this:
q.82 q.77 q.72
1 $74 to $79
2 $80 and above
3 $80 and above
4 $70 to $74
5 $70 to $74
What I'd like is something like this, where the columns are numeric:
q.82 q.77 q.72 q
1 0 77 0 77
2 82 0 0 82
3 82 0 0 82
4 0 0 72 72
5 0 0 72 72
The following works, but seems klunky—mostly because the actual dataset has many columns.
df$q.82 <- as.numeric(as.factor(df$q.82))
df$q.82[df$q.82 == 2] <- 82
df$q.82[df$q.82 == 1] <- 0
df$q.77 <- as.numeric(as.factor(df$q.77))
df$q.77[df$q.77 == 2] <- 77
df$q.77[df$q.77 == 1] <- 0
df$q.72 <- as.numeric(as.factor(df$q.72))
df$q.72[df$q.72 == 2] <- 72
df$q.72[df$q.72 == 1] <- 0
df <- df %>% mutate(q=q.82+q.77+q.72)

A possible approach with base R using sapply:
For each column, replace non-empty strings by the numeric part of the column name and replace empty strings by zero.
Add an additional column q that contains the summed value of each row.
out_df <- sapply(names(df), function(name) {
ifelse(nchar(as.character(df[[name]])) > 0, as.numeric(sub("^q\\.", "", name)), 0)
})
out_df <- transform(out_df, q = rowSums(out_df))
out_df
#> q.82 q.77 q.72 q
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Similarly, using the tidyverse:
library(tidyverse)
df_out <- imap_dfc(.x = df, .f = ~{
if_else(nchar(as.character(.x)) > 0, as.numeric(str_remove(.y, "^q\\.")), 0)
}) %>%
mutate(q = rowSums(.))
df_out
#> # A tibble: 5 x 4
#> q.82 q.77 q.72 q
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Or with data.table:
library(data.table)
setDT(df)
for(j in names(df))
set(df, j = j, value = ifelse(nchar(as.character(df[[j]])) > 0, as.numeric(sub("^q\\.", "", j)), 0))
df[, q := rowSums(.SD)][]
#> q.82 q.77 q.72 q
#> 1: 0 77 0 77
#> 2: 82 0 0 82
#> 3: 82 0 0 82
#> 4: 0 0 72 72
#> 5: 0 0 72 72
Data
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")

Here is another base R method, where we replace non-blank value in the column with the numeric part in the column name using sub.
df[] <- t(as.integer(sub(".*?(\\d+)", "\\1", names(df))) * t(df != ""))
df
# q.82 q.77 q.72
#1 0 77 0
#2 82 0 0
#3 82 0 0
#4 0 0 72
#5 0 0 72
and then if you want to row-wise sum the values you can use rowSums
df$q <- rowSums(df)

Related

Using for loops with mutate function?

I have a task that's becoming quite difficult for me.
I have to create a variable (pr_test_1) to test whether a variable for a procedure (I10_PR1) is in a list of procedures, and this code is working great:
df <- df %>%
mutate(pr_test_1=ifelse(I10_PR1 %in% abl_pr, 1,0))
However, I have 25 variables for procedures (I10_PR1 to I10_PR25) and I have to create one for each (pr_test_1 to pr_test_25).
I don't seem to find the right syntax to get a for loop to work.
Any help will be greatly appreciated!
dplyr::across allows you to apply a function to multiple columns as specified with a selector (the below uses the starts_with selector).
library(dplyr)
library(purrr)
# sample data
df <- tibble::tibble(
I10_PR1 = sample(100),
I10_PR2 = sample(100),
I10_PR3 = sample(100),
I10_PR4 = sample(100)
)
# a sample list of values to compare against
match_list <- sample(10)
df %>%
mutate(
across(
starts_with("I10_PR"),
~ if_else(.x %in% match_list, 1, 0),
.names = "pr_test_{.col}"
)
)
#> # A tibble: 100 × 8
#> I10_PR1 I10_PR2 I10_PR3 I10_PR4 pr_test_I10_PR1 pr_test_I10…¹ pr_te…² pr_te…³
#> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 93 45 47 46 0 0 0 0
#> 2 91 89 90 76 0 0 0 0
#> 3 16 32 30 24 0 0 0 0
#> 4 66 26 46 41 0 0 0 0
#> 5 53 51 79 9 0 0 0 1
#> 6 36 64 61 32 0 0 0 0
#> 7 45 75 23 25 0 0 0 0
#> 8 86 61 77 52 0 0 0 0
#> 9 17 87 64 53 0 0 0 0
#> 10 6 42 57 33 1 0 0 0
#> # … with 90 more rows, and abbreviated variable names ¹​pr_test_I10_PR2,
#> # ²​pr_test_I10_PR3, ³​pr_test_I10_PR4
Created on 2022-10-26 with reprex v2.0.2
This for() loop works perfectly with your one (slightly modified) line of code and dynamic variable names
for(i in 1:3){
df <- df %>%
mutate(!!paste0("pr_test_",i) := ifelse(!!as.name(paste0("I10_PR",i)) %in% abl_pr, 1,0))
}
Data used:
abl_pr <- sample(LETTERS)[1:10]
I10_PR1 <- sample(LETTERS)
I10_PR2 <- sample(LETTERS)
I10_PR3 <- sample(LETTERS)
df <- data.frame(I10_PR1,I10_PR2,I10_PR3)

remove trailing zeros from column based on condition

I have a following data frame:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
I would like to remove zeros at the end of var1 for each id if the number of repeating zeros
is larger than or equal to the number of subsequently repeating zeros anywhere within the data frame for a particular id.
example:
df[df$id=="a",]
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
13 a 0
14 a 0
15 a 0
16 a 0
17 a 0
18 a 0
19 a 0
20 a 0
For id "a", we have a series of 8 subsequent zeros at the end, which is the same length as a previous series of zeros, therefore, zeros at the end should be removed and new id "a" should look like this:
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
For id "b", we see there are only 2 subsequent zeros at the end, which is less than max number of subsequent zeros and therefore nothing should be done.
df[df$id=="b",]
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
library(data.table)
library(magrittr)
setDT(df)
to_remove <-
# get all run lengths of 0s
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
# only for ids with trailing 0s
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
# only if the last is longer than all previous for that id
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)]
to_remove
#> a h
#> 8 8
df[, head(.SD, .N - fcoalesce(to_remove[id], 0L))
, by = id]
#> id var1
#> 1: a 0
#> 2: a 1
#> 3: a 1
#> 4: a 0
#> 5: a 0
#> ---
#> 180: j 0
#> 181: j 0
#> 182: j 1
#> 183: j 0
#> 184: j 0
Created on 2021-11-24 by the reprex package (v2.0.1)
In a single chain:
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)] %>%
{df[, head(.SD, .N - fcoalesce(.[id], 0L)), id]}
# id var1
# 1: a 0
# 2: a 1
# 3: a 1
# 4: a 0
# 5: a 0
# ---
# 180: j 0
# 181: j 0
# 182: j 1
# 183: j 0
# 184: j 0
We create a function with rle - with two parameters 'x' and the threshold ('thresh'), apply the rle (run-length-encoding) on the input 'x' (rle - returns a list output with lengths and values as two vectors). Check whether the last element of values is 0 and its corresponding lengths is greater than or equal to the threshold passed, then replace the last element of logical TRUE vector ('tmp1') to FALSE, and return the replicated 'tmp1'.
Do a group by 'id' in ave, apply the function and subset the rows
f1 <- function(x, thresh) {
with(rle(x), {
tmp1 <- rep(TRUE, length(values))
tmp2 <- values[length(values)]
tmp1[length(tmp1)][tmp2 == 0 & lengths[length(values)] >= thresh] <- FALSE
rep(tmp1, lengths)
})
}
-testing
out <- subset(df, as.logical(ave(var1, id, FUN = function(x) f1(x, 8))))
-output
> subset(out, id == 'a')
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
> subset(out, id == 'b')
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
Version that is a single pipeline for no particular reason:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
df %>%
group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
data = purrr::map(data, ~rle(.x$var1)),
max = purrr::map_int(data, ~max(.x$lengths[.x$values == 0])),
last = purrr::map_int(data, ~{
.x$lengths[.x$values == 0][length(.x$lengths[.x$values == 0])]
})
) %>%
dplyr::mutate(
data = purrr::map(
data, ~{
if(max > last) {
x <- inverse.rle(.x)
len <- length(x)
x[(len - last):len] <- NA
x
} else {
inverse.rle(.x)
}
}
)
) %>%
dplyr::select(id, data) %>%
tidyr::unnest(c(id, data)) %>%
tidyr::drop_na()

Calculate multiple columns based on threshold

data
data=data.frame("student"=c(1,2,3,4,5),
"score1"=c(77,NA,52,99,89),
"score2"=c(95,89,79,89,73),
"score3"=c(92,52,73,64,90),
"score4"=c(84,57,78,81,66),
"score1x"=c(0,NA,0,1,1),
"score2x"=c(1,1,0,1,0),
"score3x"=c(1,0,0,0,1),
"score4x"=c(1,0,0,1,0))
I have data with student id and score1-score4 and hope to create score1x-score4x in a simple fast way. The rule is if say score1 is less than 80 then score1x is 0 otherwise it is 1.
I can do this by: data$score1x=ifelse(score1<80,0,1) but am wondering is there a way to do this for all of them at the same time to create score1x-score4x more quickly?
Try:
cbind(data, (data[, 1:4] < 80) * 1)
data.table solution:
setDT(data)
cols <- paste0("score", 1:4)
data[, paste0(cols, "x") := lapply(.SD, function(x) as.integer(x > 80)), .SDcols = cols]
data
student score1 score2 score3 score4 score1x score2x score3x score4x
1: 1 77 95 92 84 0 1 1 1
2: 2 NA 89 52 57 NA 1 0 0
3: 3 52 79 73 78 0 0 0 0
4: 4 99 89 64 81 1 1 0 1
5: 5 89 73 90 66 1 0 1 0
You could use this dplyr solution which creates new variables for columns with "score" in the name using mutate_at(), then uses rename_at() to change "_x" to "x" at the end of the column names:
library(dplyr)
data[1:5] %>%
mutate_at(vars(contains("score")), list(x = ~as.integer(. > 80))) %>%
rename_at(vars(contains("_x")), ~gsub("_", "", ., fixed = T))
student score1 score2 score3 score4 score1x score2x score3x score4x
1 1 77 95 92 84 0 1 1 1
2 2 NA 89 52 57 NA 1 0 0
3 3 52 79 73 78 0 0 0 0
4 4 99 89 64 81 1 1 0 1
5 5 89 73 90 66 1 0 1 0

Shape data frame by adding zero in R

I have a data frame with distance in the first colomn and class in the second:
data.tab <- read.table(text = "
644 1
76 1
78 1
350 1
45 1
37 2
366 2
46 2
71 3
28 3
97 3
30 3
55 3
65 3
116 3
30 3
18 4
143 4
99 4")
I want to shape it into a new data frame by adding zero according to the longest class. The result will be:
data.tab <- read.table(text = "
1 644 76 78 350 45 0 0 0
2 37 366 46 0 0 0 0 0
3 71 28 97 30 55 65 116 30
4 18 143 99 0 0 0 0 0")
This essentially boils down to a simple long to wide reshape
library(tidyverse)
data.tab %>%
group_by(V2) %>%
mutate(col = paste0("V", 1:n())) %>%
spread(col, V1, fill = 0) %>%
ungroup()
## A tibble: 4 x 8
# V1 V2 V3 V4 V5 V6 V7 V8
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 644 76 78 350 45 0 0 0
#2 37 366 46 0 0 0 0 0
#3 71 28 97 30 55 65 116 30
#4 18 143 99 0 0 0 0 0
Using df as name instead of data.tab:
MAX <- max(table(df$V2))
t(sapply(split(df$V1, df$V2), function(x) c(x, rep(0, MAX-length(x)))))
(The idea is to split V1 into groups defined by V2, making the vectors equal in length by adding 0's at the end when necessary, and then combining that into a single matrix. sapply does the last bit automatically but columnwise, so t is needed.)
another way using length<-
U <- unstack(df) # a hack learned from G.Grothendieck's answer
U <- with(df, split(V1,V2)) # more readable version of the above
M <- max(lengths(U))
R <- t(sapply(U, "length<-", M)) # setting all lengths equal
replace(R, is.na(R), 0) # replacing NAs by zeroes
And a (rather unreadable) one-liner doing the same thing:
"[<-"(R<-t(sapply(U<-unstack(df),"length<-",max(lengths(U)))),is.na(R),0)
1) xtabs Using only base R create a sequence number column within class and then use xtabs to rearrange it into a table. Finally convert that to data frame. Omit the last line of code if a table is sufficient.
data.tab2 <- transform(data.tab, seq = ave(V2, V2, FUN = seq_along))
xt <- xtabs(V1 ~ V2 + seq, data.tab2)
as.data.frame.matrix(xt)
giving:
1 2 3 4 5 6 7 8
1 644 76 78 350 45 0 0 0
2 37 366 46 0 0 0 0 0
3 71 28 97 30 55 65 116 30
4 18 143 99 0 0 0 0 0
2) ts Another base R solution is to convert the elements of each class to a ts series giving tt a multivariate time series with NAs at the ends of the shorter ones. Convert those NAs to 0 in the second line of code and then convert that to a data frame in the last line.
tt <- do.call("cbind", lapply(unstack(data.tab), ts))
tt[] <- ifelse(is.na(tt), 0, tt)
as.data.frame(t(tt))
3) Using data.tab2 from (1) use tapply to create the matrix mat and then convert that to a data.frame. Omit the last line of code if a matrix is sufficient.
mat <- with(data.tab2, tapply(V1, list(V2, seq), c, default = 0))
as.data.frame(mat)
Note
A comment claimed ifelse would be slower than a suggested alternative but benchmarking it showed no overall difference on the data in the question. Of course performance may not be very important here in the first place.
library(rbenchmark)
benchmark(
ifelse = {
tt <- do.call("cbind", lapply(unstack(data.tab), ts))
tt[] <- ifelse(is.na(tt), 0, tt)
as.data.frame(t(tt))
},
replace = {
tt <- do.call("cbind", lapply(unstack(data.tab), ts))
tt[is.na(tt)] <- 0
as.data.frame(t(tt))
}
)[1:4]
giving:
test replications elapsed relative
1 ifelse 100 0.25 1
2 replace 100 0.25 1
using data.table's transpose
cbind(sort(unique(data.tab$V2)),do.call(rbind,transpose(transpose(split(data.tab$V1, data.tab$V2), 0))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#[1,] 1 644 76 78 350 45 0 0 0
#[2,] 2 37 366 46 0 0 0 0 0
#[3,] 3 71 28 97 30 55 65 116 30
#[4,] 4 18 143 99 0 0 0 0 0

R- Changing the Values of a dataframe depending on a condition

I have numeric values in a dataframe on each columns and I want to change the value of those columns.
For example if a value is less than 100 then it has to be 0 and if it is more than 100 then it has to be 1.
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
then this becomes
A B C D
0 1 0 0
0 1 1 1
You can use this :
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
dat <- ifelse(data>100 , 1, 0)
dat
A B C D
[1,] 0 1 0 0
[2,] 0 1 1 1
data.frame((data > 100)*1)
# A B C D
#1 0 1 0 0
#2 0 1 1 1
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
for(cl in 1:ncol(data)){
data[,cl] <- ifelse(data[,cl]>=100, 1, 0)
}
data
We can use tidyverse
library(tidyverse)
data %>%
mutate_all(funs(+(.>100)))
# A B C D
#1 0 1 0 0
#2 0 1 1 1
Or with base R
data[] <- lapply(data, function(x) +(x > 100))
Other options suggested by #lmo include
data[] <- +sapply(data, `>`, 100)
vapply(data, `>`, 100, FUN.VALUE= numeric(nrow(data)))

Resources