I have a following data frame:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
I would like to remove zeros at the end of var1 for each id if the number of repeating zeros
is larger than or equal to the number of subsequently repeating zeros anywhere within the data frame for a particular id.
example:
df[df$id=="a",]
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
13 a 0
14 a 0
15 a 0
16 a 0
17 a 0
18 a 0
19 a 0
20 a 0
For id "a", we have a series of 8 subsequent zeros at the end, which is the same length as a previous series of zeros, therefore, zeros at the end should be removed and new id "a" should look like this:
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
For id "b", we see there are only 2 subsequent zeros at the end, which is less than max number of subsequent zeros and therefore nothing should be done.
df[df$id=="b",]
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
library(data.table)
library(magrittr)
setDT(df)
to_remove <-
# get all run lengths of 0s
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
# only for ids with trailing 0s
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
# only if the last is longer than all previous for that id
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)]
to_remove
#> a h
#> 8 8
df[, head(.SD, .N - fcoalesce(to_remove[id], 0L))
, by = id]
#> id var1
#> 1: a 0
#> 2: a 1
#> 3: a 1
#> 4: a 0
#> 5: a 0
#> ---
#> 180: j 0
#> 181: j 0
#> 182: j 1
#> 183: j 0
#> 184: j 0
Created on 2021-11-24 by the reprex package (v2.0.1)
In a single chain:
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)] %>%
{df[, head(.SD, .N - fcoalesce(.[id], 0L)), id]}
# id var1
# 1: a 0
# 2: a 1
# 3: a 1
# 4: a 0
# 5: a 0
# ---
# 180: j 0
# 181: j 0
# 182: j 1
# 183: j 0
# 184: j 0
We create a function with rle - with two parameters 'x' and the threshold ('thresh'), apply the rle (run-length-encoding) on the input 'x' (rle - returns a list output with lengths and values as two vectors). Check whether the last element of values is 0 and its corresponding lengths is greater than or equal to the threshold passed, then replace the last element of logical TRUE vector ('tmp1') to FALSE, and return the replicated 'tmp1'.
Do a group by 'id' in ave, apply the function and subset the rows
f1 <- function(x, thresh) {
with(rle(x), {
tmp1 <- rep(TRUE, length(values))
tmp2 <- values[length(values)]
tmp1[length(tmp1)][tmp2 == 0 & lengths[length(values)] >= thresh] <- FALSE
rep(tmp1, lengths)
})
}
-testing
out <- subset(df, as.logical(ave(var1, id, FUN = function(x) f1(x, 8))))
-output
> subset(out, id == 'a')
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
> subset(out, id == 'b')
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
Version that is a single pipeline for no particular reason:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
df %>%
group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
data = purrr::map(data, ~rle(.x$var1)),
max = purrr::map_int(data, ~max(.x$lengths[.x$values == 0])),
last = purrr::map_int(data, ~{
.x$lengths[.x$values == 0][length(.x$lengths[.x$values == 0])]
})
) %>%
dplyr::mutate(
data = purrr::map(
data, ~{
if(max > last) {
x <- inverse.rle(.x)
len <- length(x)
x[(len - last):len] <- NA
x
} else {
inverse.rle(.x)
}
}
)
) %>%
dplyr::select(id, data) %>%
tidyr::unnest(c(id, data)) %>%
tidyr::drop_na()
Related
I have the following example.
I want to create a new column with the absolute difference in AGE compared to each Treat==1 in the same PairID.
Desired output should be as shown below.
I have tried using dplyr with:
Data complete:
Treat <- c(1,0,0,1,0,0,1,0)
PairID <- c(1,1,1,2,2,2,3,3)
Age <- c(30,60,31,20,20,40,50,52)
D <- data.frame(Treat,PairID,Age)
D
D %>%
group_by(PairID) %>%
abs(Age - Age[Treat == 1])
in Base-R:
D$absD <- unlist(lapply(split(D,D$PairID), function(x) abs(x$Age - x$Age[x$Treat==1])))
> D
Treat PairID Age absD
1 1 1 30 0
2 0 1 60 30
3 0 1 31 1
4 1 2 20 0
5 0 2 20 0
6 0 2 40 20
7 1 3 50 0
8 0 3 52 2
Can someone think of a more interesting way to combine multiple factor columns into a single numeric column?
MWE dataset:
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
str(df$q.82)
Factor w/ 2 levels "","$80 and above": 1 2 2 1 1
df looks like this:
q.82 q.77 q.72
1 $74 to $79
2 $80 and above
3 $80 and above
4 $70 to $74
5 $70 to $74
What I'd like is something like this, where the columns are numeric:
q.82 q.77 q.72 q
1 0 77 0 77
2 82 0 0 82
3 82 0 0 82
4 0 0 72 72
5 0 0 72 72
The following works, but seems klunky—mostly because the actual dataset has many columns.
df$q.82 <- as.numeric(as.factor(df$q.82))
df$q.82[df$q.82 == 2] <- 82
df$q.82[df$q.82 == 1] <- 0
df$q.77 <- as.numeric(as.factor(df$q.77))
df$q.77[df$q.77 == 2] <- 77
df$q.77[df$q.77 == 1] <- 0
df$q.72 <- as.numeric(as.factor(df$q.72))
df$q.72[df$q.72 == 2] <- 72
df$q.72[df$q.72 == 1] <- 0
df <- df %>% mutate(q=q.82+q.77+q.72)
A possible approach with base R using sapply:
For each column, replace non-empty strings by the numeric part of the column name and replace empty strings by zero.
Add an additional column q that contains the summed value of each row.
out_df <- sapply(names(df), function(name) {
ifelse(nchar(as.character(df[[name]])) > 0, as.numeric(sub("^q\\.", "", name)), 0)
})
out_df <- transform(out_df, q = rowSums(out_df))
out_df
#> q.82 q.77 q.72 q
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Similarly, using the tidyverse:
library(tidyverse)
df_out <- imap_dfc(.x = df, .f = ~{
if_else(nchar(as.character(.x)) > 0, as.numeric(str_remove(.y, "^q\\.")), 0)
}) %>%
mutate(q = rowSums(.))
df_out
#> # A tibble: 5 x 4
#> q.82 q.77 q.72 q
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 77 0 77
#> 2 82 0 0 82
#> 3 82 0 0 82
#> 4 0 0 72 72
#> 5 0 0 72 72
Or with data.table:
library(data.table)
setDT(df)
for(j in names(df))
set(df, j = j, value = ifelse(nchar(as.character(df[[j]])) > 0, as.numeric(sub("^q\\.", "", j)), 0))
df[, q := rowSums(.SD)][]
#> q.82 q.77 q.72 q
#> 1: 0 77 0 77
#> 2: 82 0 0 82
#> 3: 82 0 0 82
#> 4: 0 0 72 72
#> 5: 0 0 72 72
Data
df <- data.frame(q.82=factor(c(1,2,2,1,1)), q.77=factor(c(2,1,1,1,1)), q.72=factor(c(1,1,1,2,2)))
levels(df$q.82) <- c("","$80 and above")
levels(df$q.77) <- c("", "$75 to $79")
levels(df$q.72) <- c("", "$70 to $74")
Here is another base R method, where we replace non-blank value in the column with the numeric part in the column name using sub.
df[] <- t(as.integer(sub(".*?(\\d+)", "\\1", names(df))) * t(df != ""))
df
# q.82 q.77 q.72
#1 0 77 0
#2 82 0 0
#3 82 0 0
#4 0 0 72
#5 0 0 72
and then if you want to row-wise sum the values you can use rowSums
df$q <- rowSums(df)
I have numeric values in a dataframe on each columns and I want to change the value of those columns.
For example if a value is less than 100 then it has to be 0 and if it is more than 100 then it has to be 1.
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
then this becomes
A B C D
0 1 0 0
0 1 1 1
You can use this :
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
dat <- ifelse(data>100 , 1, 0)
dat
A B C D
[1,] 0 1 0 0
[2,] 0 1 1 1
data.frame((data > 100)*1)
# A B C D
#1 0 1 0 0
#2 0 1 1 1
data<- read.table(text = "
A B C D
99 101 99 50
90 110 110 151", header = TRUE)
for(cl in 1:ncol(data)){
data[,cl] <- ifelse(data[,cl]>=100, 1, 0)
}
data
We can use tidyverse
library(tidyverse)
data %>%
mutate_all(funs(+(.>100)))
# A B C D
#1 0 1 0 0
#2 0 1 1 1
Or with base R
data[] <- lapply(data, function(x) +(x > 100))
Other options suggested by #lmo include
data[] <- +sapply(data, `>`, 100)
vapply(data, `>`, 100, FUN.VALUE= numeric(nrow(data)))
I am tinkering around with this loop (im new to writing loops but trying to learn).
I am aiming when x == 1, on first 1 match, store the value of z, then on each successive z value subtract that z value from the first value. If x == 0 then it will do nothing (not sure if i have to tell the code to do nothing when x ==0?)
This is my dummy data:
x <- c(0,0,1,1,1,0,1,1,1,1,0,0,0)
z <- c(10,34,56,43,56,98,78,98,23,21,45,65,78)
df <- data.frame(x,z)
for (i in 1:nrow(df)) {
if (df$x[i] == 1)
first_price <- df$z[i]
df$output <- first_price - df$z
}
}
I have my if (df$x == 1)
Then I want to save the first price... so first_price <- df$z[i]
The i in here, that means the first in the series right?
Then for my output... i wanted to subtract the first price from each successive price. If I fix the first price with [i] is this the correct way? And if I leave df$z would that then take the next price each time in the loop and subtract from
first_price <- df$z[i]?
Heres a visual:
******Progress****
> for (i in 1:nrow(df)) {
+ if (df$x[i] == 1) {
+ first_price <- df$z[1]
+ df$output <- first_price - df$z
+ }
+ }
> df$output
[1] 0 -24 -46 -33 -46 -88 -68 -88 -13 -17 -35 -55 -68
If i add [1] which is assigning the first element in df$z this actually fixes the first element and then subtracts each successive, now It needs to be rule based and understand that this is only to be the case when df$x == 1
This should work for you
library(dplyr)
library(data.table)
ans <- df %>%
mutate(originalrow = row_number()) %>% # original row position
group_by(rleid(x)) %>%
mutate(ans = first(z) - z) %>%
filter(x==1)
# # A tibble: 7 x 5
# # Groups: rleid(x) [2]
# x z originalrow `rleid(x)` ans
# <dbl> <dbl> <int> <int> <dbl>
# 1 1 56 3 2 0
# 2 1 43 4 2 13
# 3 1 56 5 2 0
# 4 1 78 7 4 0
# 5 1 98 8 4 -20
# 6 1 23 9 4 55
# 7 1 21 10 4 57
vans <- ans$ans
# [1] 0 13 0 0 -20 55 57
EDIT
To keep all rows, and outputting 0 where x==0
ans <- df %>%
mutate(originalrow = row_number()) %>%
group_by(rleid(x)) %>%
mutate(ans = ifelse(x==0, 0, first(z) - z))
# # A tibble: 13 x 5
# # Groups: rleid(x) [5]
# x z originalrow `rleid(x)` ans
# <dbl> <dbl> <int> <int> <dbl>
# 1 0 10 1 1 0
# 2 0 34 2 1 0
# 3 1 56 3 2 0
# 4 1 43 4 2 13
# 5 1 56 5 2 0
# 6 0 98 6 3 0
# 7 1 78 7 4 0
# 8 1 98 8 4 -20
# 9 1 23 9 4 55
# 10 1 21 10 4 57
# 11 0 45 11 5 0
# 12 0 65 12 5 0
# 13 0 78 13 5 0
I have a data table like this:
DT <- data.table(ID=rep(c(1:2),each=6), year=rep(c(2003:2006),each=3), month=rep(c(5:8),3), day=rep(c(11:14),3),value=c(101:112))
And I would like to add columns with the conditions:
1, add 5 columns with names: V100, V102, V105, V108, V112
2, in each column, grouped by ID and year, sum up the values less than the value in the column name, eg: for column V112, sum up grouped values less than 112
So the outcome will look like:
DT1 <- data.table(ID=rep(c(1:2),each=2), year=c(2003:2006), "100"=rep(0,4), "102"=c(2,0,0,0),"105"=c(3,2,0,0),"108"=c(3,3,2,0),"112"=rep(3,4))
I tried write codes but couldn't figure out:
degree <- c(100,102,105,108,112)
for (d in degree)
{
f_year <- function(d) {sum(DT$value <= d)}
DT <- DT[,d:=f_year(),by=list(ID,year)]
}
Any help would be appreciated!
Thats what lapply can be used for.
degree <- c(100, 102, 105, 108, 112)
myfun <- function(x,y) sum(y <= x)
DT1 <- DT[, lapply(degree, myfun, value), by = .(ID, year)]
setnames(DT1, c("ID", "year", as.character(degree)))
Result:
> DT1
ID year 100 102 105 108 112
1: 1 2003 0 2 3 3 3
2: 1 2004 0 0 2 3 3
3: 2 2005 0 0 0 2 3
4: 2 2006 0 0 0 0 3
Just another way:
cols = c(100,102,105,108,112)
DT[, lapply(cols, function(x) sum(value <= x)), by=.(ID, year)]
# ID year V1 V2 V3 V4 V5
# 1: 1 2003 0 2 3 3 3
# 2: 1 2004 0 0 2 3 3
# 3: 2 2005 0 0 0 2 3
# 4: 2 2006 0 0 0 0 3
Then you can set the names.
Instead if you'd like to set names directly, then you can create a named list first:
named_cols = setattr(as.list(cols), 'names', cols)
DT[, lapply(named_cols, function(x) sum(value<=x)), by=.(ID, year)]
# ID year 100 102 105 108 112
# 1: 1 2003 0 2 3 3 3
# 2: 1 2004 0 0 2 3 3
# 3: 2 2005 0 0 0 2 3
# 4: 2 2006 0 0 0 0 3