R- Changing the Values of a dataframe depending on a condition - r

I have numeric values in a dataframe on each columns and I want to change the value of those columns.
For example if a value is less than 100 then it has to be 0 and if it is more than 100 then it has to be 1.
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
then this becomes
A B C D
0 1 0 0
0 1 1 1

You can use this :
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
dat <- ifelse(data>100 , 1, 0)
dat
A B C D
[1,] 0 1 0 0
[2,] 0 1 1 1

data.frame((data > 100)*1)
# A B C D
#1 0 1 0 0
#2 0 1 1 1

data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
for(cl in 1:ncol(data)){
data[,cl] <- ifelse(data[,cl]>=100, 1, 0)
}
data

We can use tidyverse
library(tidyverse)
data %>%
mutate_all(funs(+(.>100)))
# A B C D
#1 0 1 0 0
#2 0 1 1 1
Or with base R
data[] <- lapply(data, function(x) +(x > 100))
Other options suggested by #lmo include
data[] <- +sapply(data, `>`, 100)
vapply(data, `>`, 100, FUN.VALUE= numeric(nrow(data)))

Related

remove trailing zeros from column based on condition

I have a following data frame:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
I would like to remove zeros at the end of var1 for each id if the number of repeating zeros
is larger than or equal to the number of subsequently repeating zeros anywhere within the data frame for a particular id.
example:
df[df$id=="a",]
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
13 a 0
14 a 0
15 a 0
16 a 0
17 a 0
18 a 0
19 a 0
20 a 0
For id "a", we have a series of 8 subsequent zeros at the end, which is the same length as a previous series of zeros, therefore, zeros at the end should be removed and new id "a" should look like this:
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
For id "b", we see there are only 2 subsequent zeros at the end, which is less than max number of subsequent zeros and therefore nothing should be done.
df[df$id=="b",]
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
library(data.table)
library(magrittr)
setDT(df)
to_remove <-
# get all run lengths of 0s
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
# only for ids with trailing 0s
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
# only if the last is longer than all previous for that id
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)]
to_remove
#> a h
#> 8 8
df[, head(.SD, .N - fcoalesce(to_remove[id], 0L))
, by = id]
#> id var1
#> 1: a 0
#> 2: a 1
#> 3: a 1
#> 4: a 0
#> 5: a 0
#> ---
#> 180: j 0
#> 181: j 0
#> 182: j 1
#> 183: j 0
#> 184: j 0
Created on 2021-11-24 by the reprex package (v2.0.1)
In a single chain:
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)] %>%
{df[, head(.SD, .N - fcoalesce(.[id], 0L)), id]}
# id var1
# 1: a 0
# 2: a 1
# 3: a 1
# 4: a 0
# 5: a 0
# ---
# 180: j 0
# 181: j 0
# 182: j 1
# 183: j 0
# 184: j 0
We create a function with rle - with two parameters 'x' and the threshold ('thresh'), apply the rle (run-length-encoding) on the input 'x' (rle - returns a list output with lengths and values as two vectors). Check whether the last element of values is 0 and its corresponding lengths is greater than or equal to the threshold passed, then replace the last element of logical TRUE vector ('tmp1') to FALSE, and return the replicated 'tmp1'.
Do a group by 'id' in ave, apply the function and subset the rows
f1 <- function(x, thresh) {
with(rle(x), {
tmp1 <- rep(TRUE, length(values))
tmp2 <- values[length(values)]
tmp1[length(tmp1)][tmp2 == 0 & lengths[length(values)] >= thresh] <- FALSE
rep(tmp1, lengths)
})
}
-testing
out <- subset(df, as.logical(ave(var1, id, FUN = function(x) f1(x, 8))))
-output
> subset(out, id == 'a')
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
> subset(out, id == 'b')
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
Version that is a single pipeline for no particular reason:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
df %>%
group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
data = purrr::map(data, ~rle(.x$var1)),
max = purrr::map_int(data, ~max(.x$lengths[.x$values == 0])),
last = purrr::map_int(data, ~{
.x$lengths[.x$values == 0][length(.x$lengths[.x$values == 0])]
})
) %>%
dplyr::mutate(
data = purrr::map(
data, ~{
if(max > last) {
x <- inverse.rle(.x)
len <- length(x)
x[(len - last):len] <- NA
x
} else {
inverse.rle(.x)
}
}
)
) %>%
dplyr::select(id, data) %>%
tidyr::unnest(c(id, data)) %>%
tidyr::drop_na()

Combine multiple factor columns into a single numeric column

Can someone think of a more interesting way to combine multiple factor columns into a single numeric column?
MWE dataset:
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
str(df$q.82)
Factor w/ 2 levels "","$80 and above": 1 2 2 1 1
df looks like this:
q.82 q.77 q.72
1 $74 to $79
2 $80 and above
3 $80 and above
4 $70 to $74
5 $70 to $74
What I'd like is something like this, where the columns are numeric:
q.82 q.77 q.72 q
1 0 77 0 77
2 82 0 0 82
3 82 0 0 82
4 0 0 72 72
5 0 0 72 72
The following works, but seems klunky—mostly because the actual dataset has many columns.
df$q.82 <- as.numeric(as.factor(df$q.82))
df$q.82[df$q.82 == 2] <- 82
df$q.82[df$q.82 == 1] <- 0
df$q.77 <- as.numeric(as.factor(df$q.77))
df$q.77[df$q.77 == 2] <- 77
df$q.77[df$q.77 == 1] <- 0
df$q.72 <- as.numeric(as.factor(df$q.72))
df$q.72[df$q.72 == 2] <- 72
df$q.72[df$q.72 == 1] <- 0
df <- df %>% mutate(q=q.82+q.77+q.72)
A possible approach with base R using sapply:
For each column, replace non-empty strings by the numeric part of the column name and replace empty strings by zero.
Add an additional column q that contains the summed value of each row.
out_df <- sapply(names(df), function(name) {
ifelse(nchar(as.character(df[[name]])) > 0, as.numeric(sub("^q\\.", "", name)), 0)
})
out_df <- transform(out_df, q = rowSums(out_df))
out_df
#> q.82 q.77 q.72 q
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Similarly, using the tidyverse:
library(tidyverse)
df_out <- imap_dfc(.x = df, .f = ~{
if_else(nchar(as.character(.x)) > 0, as.numeric(str_remove(.y, "^q\\.")), 0)
}) %>%
mutate(q = rowSums(.))
df_out
#> # A tibble: 5 x 4
#> q.82 q.77 q.72 q
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Or with data.table:
library(data.table)
setDT(df)
for(j in names(df))
set(df, j = j, value = ifelse(nchar(as.character(df[[j]])) > 0, as.numeric(sub("^q\\.", "", j)), 0))
df[, q := rowSums(.SD)][]
#> q.82 q.77 q.72 q
#> 1: 0 77 0 77
#> 2: 82 0 0 82
#> 3: 82 0 0 82
#> 4: 0 0 72 72
#> 5: 0 0 72 72
Data
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
Here is another base R method, where we replace non-blank value in the column with the numeric part in the column name using sub.
df[] <- t(as.integer(sub(".*?(\\d+)", "\\1", names(df))) * t(df != ""))
df
# q.82 q.77 q.72
#1 0 77 0
#2 82 0 0
#3 82 0 0
#4 0 0 72
#5 0 0 72
and then if you want to row-wise sum the values you can use rowSums
df$q <- rowSums(df)

How to get a three-way table displaying the mean of one variable depending on all combinations of two other variables?

I want a three-way table displaying the mean of a for all combinations of b and c.
table() and outer() give me separately what I want:
> with(df1, table(c, b))
b
c 0 1
5 1 1
10 1 2
15 3 2
> t(outer(0:1, c(5, 10, 15), Vectorize(function(x, y)
+ with(df1, mean(a[b == x & c == y])))))
[,1] [,2]
[1,] 17.00000 20.0
[2,] 17.00000 16.5
[3,] 16.66667 15.0
How could I combine this, preferably in a base R solution?
I tried ftable(), which gives me this:
> with(df1, ftable(c, a, b))
b 0 1
c a
5 11 0 0
13 0 0
15 0 0
17 1 0
18 0 0
19 0 0
20 0 1
10 11 0 0
13 0 0
15 0 1
17 1 0
18 0 1
19 0 0
20 0 0
15 11 1 0
13 0 1
15 0 0
17 0 1
18 0 0
19 1 0
20 1 0
but what I want is this:
b
c 0 1
5 17 20
1 1
10 17 16.5
1 2
15 16.7 15
3 2
Data:
set.seed(42)
df1 <- data.frame(a=sample(10:20, 10, replace = TRUE),
b=sample(0:1, 10, replace = TRUE),
c=sample(c(5, 10, 15), 10, replace = TRUE))
Notice that objects of class table are not really special; they just have this class and dimnames attribute:
str(table(1:2, 2:3))
# 'table' int [1:2, 1:2] 1 0 0 1
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:2] "1" "2"
# ..$ : chr [1:2] "2" "3"
So, actually it's easy to convert your result into a table:
tmp <- t(outer(0:1, c(5, 10, 15), Vectorize(function(x, y)
with(df1, mean(a[b == x & c == y])))))
class(tmp) <- "table"
dimnames(tmp) <- list(c = c("5", "10", "15"), b = c("0", "1"))
tmp
# b
# c 0 1
# 5 17.00000 20.00000
# 10 17.00000 16.50000
# 15 16.66667 15.00000
However, instead of all this you may also run
xtabs(a ~ b + c, data = aggregate(a ~ b + c, data = df1, mean))
# c
# b 5 10 15
# 0 17.00000 17.00000 16.66667
# 1 20.00000 16.50000 15.00000
Lastly, to add another row of frequencies right below you may run
out <- do.call(rbind, lapply(c(mean, length), function(fun)
xtabs(a ~ b + c, data = aggregate(a ~ b + c, data = df1, fun))))
out[order(rownames(out)), ]
# 5 10 15
# 0 17 17.0 16.66667
# 0 1 1.0 3.00000
# 1 20 16.5 15.00000
# 1 1 2.0 2.00000
Clearly now you may keep adding other functions in addition to mean and length.
If you want c and b to be visible or some row names to be empty then an analogous assignment of dimnames(out) as that above will work.

How to use mutate_if with a function

I want to transform dataframe d into the result below (result) using the function replace_by_sym. What am I doing wrong?
library(tidyverse)
d <- data.frame(dir = c(-1,1,-1,1,1), a = rep(100,5), b = 105:109, c = 108:112)
replace_by_sym <- function(x){
x <- x * (-1) + 200
}
d %>%
mutate_if(dir=-1, vars(a:c),
funs(replace_by_sym(.))) -> result
to obtain
dir a b c
1 -1 100 95 92
2 1 100 106 109
3 -1 100 93 90
4 1 100 108 111
5 1 100 109 112
It can be done with ifelse
d %>%
mutate_at(vars(a:c), funs( ifelse (dir == -1, (dir * .) + 200, .)))
# dir a b c
#1 -1 100 95 92
#2 1 100 106 109
#3 -1 100 93 90
#4 1 100 108 111
#5 1 100 109 112
Or with case_when
d %>%
mutate_at(vars(a:c), funs(case_when(dir == -1 ~ (dir * .) + 200,
TRUE ~ as.numeric(.))))

R add columns by loop in data table

I have a data table like this:
DT <- data.table(ID=rep(c(1:2),each=6), year=rep(c(2003:2006),each=3), month=rep(c(5:8),3), day=rep(c(11:14),3),value=c(101:112))
And I would like to add columns with the conditions:
1, add 5 columns with names: V100, V102, V105, V108, V112
2, in each column, grouped by ID and year, sum up the values less than the value in the column name, eg: for column V112, sum up grouped values less than 112
So the outcome will look like:
DT1 <- data.table(ID=rep(c(1:2),each=2), year=c(2003:2006), "100"=rep(0,4), "102"=c(2,0,0,0),"105"=c(3,2,0,0),"108"=c(3,3,2,0),"112"=rep(3,4))
I tried write codes but couldn't figure out:
degree <- c(100,102,105,108,112)
for (d in degree)
{
f_year <- function(d) {sum(DT$value <= d)}
DT <- DT[,d:=f_year(),by=list(ID,year)]
}
Any help would be appreciated!
Thats what lapply can be used for.
degree <- c(100, 102, 105, 108, 112)
myfun <- function(x,y) sum(y <= x)
DT1 <- DT[, lapply(degree, myfun, value), by = .(ID, year)]
setnames(DT1, c("ID", "year", as.character(degree)))
Result:
> DT1
ID year 100 102 105 108 112
1: 1 2003 0 2 3 3 3
2: 1 2004 0 0 2 3 3
3: 2 2005 0 0 0 2 3
4: 2 2006 0 0 0 0 3
Just another way:
cols = c(100,102,105,108,112)
DT[, lapply(cols, function(x) sum(value <= x)), by=.(ID, year)]
# ID year V1 V2 V3 V4 V5
# 1: 1 2003 0 2 3 3 3
# 2: 1 2004 0 0 2 3 3
# 3: 2 2005 0 0 0 2 3
# 4: 2 2006 0 0 0 0 3
Then you can set the names.
Instead if you'd like to set names directly, then you can create a named list first:
named_cols = setattr(as.list(cols), 'names', cols)
DT[, lapply(named_cols, function(x) sum(value<=x)), by=.(ID, year)]
# ID year 100 102 105 108 112
# 1: 1 2003 0 2 3 3 3
# 2: 1 2004 0 0 2 3 3
# 3: 2 2005 0 0 0 2 3
# 4: 2 2006 0 0 0 0 3

Resources