Only rows where difference between them is less than 'n' in groups - r

Let's say we have the below dataset where values in V2 are ordered ascending in groups V1:
# Example data: V2 is sorted ascending within each V1 group. The first field
# of every data row is the row name.
Input <- " V1 V2
1 A 3
2 A 4
3 A 5
4 A 6
5 A 12
6 A 13
7 B 4
8 B 5
9 B 6
10 B 12
11 C 13
12 C 14
13 C 18"
# read.table() already returns a data.frame, so no as.data.frame() is needed.
# Passing the string via `text =` avoids leaving an unclosed textConnection.
df <- read.table(text = Input, header = TRUE, row.names = 1)
Now I want to keep rows where the difference between consecutive ones is <= 1, so my desired output:
V1 V2
1 A 3
2 A 4
3 A 5
4 A 6
5 A 12
6 A 13
7 B 4
8 B 5
9 B 6
11 C 13
12 C 14
However when I use:
df %>%
group_by(V1) %>%
filter(c(0,diff(V2)) <= 1)
I have:
V1 V2
1 A 3
2 A 4
3 A 5
4 A 6
5 A 13
6 B 4
7 B 5
8 B 6
9 C 13
10 C 14
The row with V2 value 12 is missing and it should be in dataset. I tried also with lag() but result is same.
df %>%
group_by(V1) %>%
filter(V2 - lag(V2) <= 1 | is.na(V2 - lag(V2)))
Could you point out my mistake?

A row should be kept if it is within 1 of either neighbour, so you need to compare each value with both the previous and the next value in its group. Use lag and lead:
library(dplyr)
# Keep a row when it is within 1 of its previous OR its next value in the
# same V1 group (V2 is assumed to be sorted ascending within each group).
df %>%
  group_by(V1) %>%
  # Backward gap is V2 - lag(V2); forward gap must be lead(V2) - V2.
  # Writing the forward gap as V2 - lead(V2) yields a non-positive number on
  # ascending data, so the test would pass for every row that has a successor.
  # At group edges one side is NA: `NA | TRUE` is TRUE (row kept), while
  # `NA | FALSE` is NA, which filter() drops.
  filter(V2 - lag(V2) <= 1 | lead(V2) - V2 <= 1)
# V1 V2
# <chr> <int>
# 1 A 3
# 2 A 4
# 3 A 5
# 4 A 6
# 5 A 12
# 6 A 13
# 7 B 4
# 8 B 5
# 9 B 6
#10 C 13
#11 C 14

Here is another idea where we create groups with a tolerance of 1, and filter out those groups with only one observation, i.e.
# Split each V1 group into runs of consecutive values whose gap is at most 1,
# then drop runs that contain a single observation.
df %>%
  # A new run starts whenever the gap to the previous row exceeds the
  # tolerance. Using `> 1` (rather than `!= 1`) keeps duplicates and
  # fractional steps inside the same run, matching the "<= 1" requirement.
  # diff() is evaluated on the whole column here; a gap across a V1 boundary
  # is harmless because V1 is part of the grouping key as well.
  group_by(V1, grp = cumsum(c(TRUE, diff(V2) > 1))) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 11 x 2
# V1 V2
# <fct> <int>
# 1 A 3
# 2 A 4
# 3 A 5
# 4 A 6
# 5 A 12
# 6 A 13
# 7 B 4
# 8 B 5
# 9 B 6
#10 C 13
#11 C 14

Related

Replace all subsequent column values, after the first instance of a value greater than x

I have a dataframe (df1) with two columns, one (grp) is a grouping variable, the second (num) has some measurements.
For each group I want to:
replace all numbers greater than 3.5 with 4
replace all numbers after the first instance of 4 with 4
I just want to get to step 2, but step 1 seems like a logical starting point, maybe it isn't required though?
Example data
library(dplyr)
df1 <- data.frame(
grp = rep(c("a", "b"), each = 10),
num = c(0,1,2,5,0,1,7,0,2,1,2,2,2,2,5,0,0,0,0,6))
I can get the first part:
df1 %>%
group_by(grp) %>%
mutate(num = ifelse(num > 3.5, 4, num))
For the second part I tried using dplyr::lag and dplyr::case_when but no luck. Here is the desired output:
grp num
1 a 0
2 a 1
3 a 2
4 a 4
5 a 4
6 a 4
7 a 4
8 a 4
9 a 4
10 a 4
11 b 2
12 b 2
13 b 2
14 b 2
15 b 4
16 b 4
17 b 4
18 b 4
19 b 4
20 b 4
Any advice would be much appreciated.
You could use cumany() to find all cases after the first event, i.e. num > 3.5.
library(dplyr)
df1 %>%
group_by(grp) %>%
mutate(num2 = replace(num, cumany(num > 3.5), 4)) %>%
ungroup()
# A tibble: 20 × 3
grp num num2
<chr> <dbl> <dbl>
1 a 0 0
2 a 1 1
3 a 2 2
4 a 5 4
5 a 0 4
6 a 1 4
7 a 7 4
8 a 0 4
9 a 2 4
10 a 1 4
11 b 2 2
12 b 2 2
13 b 2 2
14 b 2 2
15 b 5 4
16 b 0 4
17 b 0 4
18 b 0 4
19 b 0 4
20 b 6 4
You can also replace cumany(num > 3.5) with cumsum(num > 3.5) > 0.

Mean of a column only for observations meeting a condition

How can I add a column holding, for each group of "y", the mean of z computed only over the rows where x < 10? For a group with no such rows, the new column can simply take the value of z.
df <- data.frame(y = c(LETTERS[1:5], LETTERS[1:5],LETTERS[3:7]), x = 1:15, z = c(4:9,1:4,2:6))
y x z
1 A 1 4
2 B 2 5
3 C 3 6
4 D 4 7
5 E 5 8
6 A 6 9
7 B 7 1
8 C 8 2
9 D 9 3
10 E 10 4
11 C 11 2
12 D 12 3
13 E 13 4
14 F 14 5
15 G 15 6
I am trying something like
df %>% group_by(y) %>%
mutate(gr.mean = mean(z))
But this provides the mean for any case of x.
We can subset the 'z' with a logical condition on 'x':
library(dplyr)
df %>%
group_by(y) %>%
mutate(gr.mean = if(all(x >=10)) z else mean(z[x < 10])) %>%
ungroup
Output
# A tibble: 15 × 4
y x z gr.mean
<chr> <int> <int> <dbl>
1 A 1 4 6.5
2 B 2 5 3
3 C 3 6 4
4 D 4 7 5
5 E 5 8 8
6 A 6 9 6.5
7 B 7 1 3
8 C 8 2 4
9 D 9 3 5
10 E 10 4 8
11 C 11 2 4
12 D 12 3 5
13 E 13 4 8
14 F 14 5 5
15 G 15 6 6
Or without if/else
df %>%
group_by(y) %>%
mutate(gr.mean = coalesce(mean(z[x < 10]), z))

Using dplyr: Within groups, select the first value meeting a condition

I need assistance obtaining a solution that will scan backwards in time and obtain the first value meeting a condition. I have data similar to:
set.seed(42)
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
id time.var x
A 5 2
A 14 8
A 19 7
A 20 1
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10
For the last member of each group defined in time order by time.var, I'd like to obtain the first value from x less than 5 by scanning in descending time order.
I have tried:
test <- df %>%
group_by(id) %>%
arrange(id, time.var) %>%
mutate(less.5 = which.max(x[x < 5]) )
What strategy can I use to obtain this type of output:
id time.var x previous.less.5
A 5 2
A 14 8
A 19 7
A 20 1 2
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3 4
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10 4
Using library(dplyr):
# For every id, find the most recent x below 5 among all rows except the last
# one, and report it on the last row of the group only.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(
    # x[-n()] < 5 tests every row but the last; the appended FALSE excludes
    # the last row itself. tail(., 1) picks the latest qualifying value.
    previous.less.5 = tail(x[c(x[-n()] < 5, FALSE)], 1),
    # Blank out every row except the last one of the group. replace() with NA
    # is used instead of if_else(..., NULL): current dplyr rejects NULL as the
    # `false` value. The data is still grouped by id, so no second group_by()
    # is required.
    previous.less.5 = replace(previous.less.5, row_number() != n(), NA)
  )
or
# Same result via a look-up table: per id, the latest x below 5 among all rows
# except the last one, joined back and shown on the last row of each group.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  # seq_len() instead of 1:(n() - 1): for a single-row group 1:0 is c(1, 0)
  # and would wrongly keep that only (i.e. last) row.
  slice(seq_len(n() - 1)) %>%
  filter(x < 5) %>%
  slice(n()) %>%
  ungroup() %>%
  select(id, x.y = x) %>%
  # left_join keeps ids that have no qualifying row (they get NA); a
  # right_join onto the look-up table would drop those ids entirely.
  left_join(df, ., by = "id") %>%
  group_by(id) %>%
  # replace() with NA rather than if_else(..., NULL), which current dplyr
  # rejects.
  mutate(previous.less.5 = replace(x.y, row_number() != n(), NA)) %>%
  select(-x.y)
giving:
#> # A tibble: 20 x 4
#> # Groups: id [3]
#> id time.var x previous.less.5
#> <fct> <int> <int> <int>
#> 1 A 3 10 NA
#> 2 A 4 8 NA
#> 3 A 4 6 NA
#> 4 A 5 2 NA
#> 5 A 5 8 NA
#> 6 A 5 7 NA
#> 7 A 11 6 NA
#> 8 A 13 3 NA
#> 9 A 15 2 3
#> 10 B 2 1 NA
#> 11 B 4 3 NA
#> 12 B 4 6 NA
#> 13 B 8 5 NA
#> 14 B 8 4 NA
#> 15 B 20 7 4
#> 16 C 1 2 NA
#> 17 C 2 10 NA
#> 18 C 10 6 NA
#> 19 C 13 2 NA
#> 20 C 18 5 2
Update:
If there's a group with no record less than 5 (or only last record less than 5) then following works:
# Variant that also copes with groups having no earlier value below 5
# (including single-row groups): those groups get NA.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(
    # tail(., 1) is empty when nothing qualifies; indexing with [1] turns
    # that into a single NA of the right type. This avoids the
    # max(<empty>) -> -Inf warning and the later is.infinite() clean-up.
    previous.less.5 = tail(x[c(x[-n()] < 5, FALSE)], 1)[1],
    # Show the value on the last row of each group only. replace() is used
    # because current dplyr rejects NULL as the `false` value of if_else().
    previous.less.5 = replace(previous.less.5, row_number() != n(), NA)
  )
Data:
set.seed(42) # I am getting different data than what you've shown with this seed
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
We can reverse the values of x within each id and use which() to get the first number that is less than 5. The final replace() assigns NA to all values in previous.less.5 except the last one.
library(dplyr)
df %>%
#Data is already sorted by `id` and `time.var` but if your still need use
#arrange(id, time.var) %>%
group_by(id) %>%
mutate(rev_x = c(NA, rev(x)[-1]), previous.less.5 = rev_x[which(rev_x < 5)[1]],
previous.less.5 = replace(previous.less.5, row_number() != n(), NA)) %>%
select(-rev_x)
# id time.var x previous.less.5
# <fct> <int> <int> <int>
# 1 A 5 2 NA
# 2 A 14 8 NA
# 3 A 19 7 NA
# 4 A 20 1 2
# 5 B 1 1 NA
# 6 B 2 5 NA
# 7 B 9 10 NA
# 8 B 11 10 NA
# 9 B 13 6 NA
#10 B 15 4 NA
#11 B 19 3 4
#12 C 1 7 NA
#13 C 3 5 NA
#14 C 8 9 NA
#15 C 8 4 NA
#16 C 17 7 NA
#17 C 17 4 NA
#18 C 17 8 NA
#19 C 19 4 NA
#20 C 19 10 4
This should also handle the case and return NA's if there is no value less than 5 in an id.

Expand dataframe by ID to generate a special column

I have the following dataframe
df<-data.frame("ID"=c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
'A_Frequency'=c(1,2,3,4,5,1,2,3,4,5),
'B_Frequency'=c(1,2,NA,4,6,1,2,5,6,7))
The dataframe appears as follows
ID A_Frequency B_Frequency
1 A 1 1
2 A 2 2
3 A 3 NA
4 A 4 4
5 A 5 6
6 B 1 1
7 B 2 2
8 B 3 5
9 B 4 6
10 B 5 7
I Wish to create a new dataframe df2 from df that looks as follows
ID CFreq
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
6 A 6
7 B 1
8 B 2
9 B 3
10 B 4
11 B 5
12 B 6
13 B 7
The new dataframe has a column CFreq that takes unique values from A_Frequency, B_Frequency and groups them by ID. Then it ignores the NA values and generates the CFreq column
I have tried dplyr but am unable to get the required response
df2<-df%>%group_by(ID)%>%select(ID, A_Frequency,B_Frequency)%>%
mutate(Cfreq=unique(A_Frequency, B_Frequency))
This yields the following which is quite different
ID A_Frequency B_Frequency Cfreq
<fct> <dbl> <dbl> <dbl>
1 A 1 1 1
2 A 2 2 2
3 A 3 NA 3
4 A 4 4 4
5 A 5 6 5
6 B 1 1 1
7 B 2 2 2
8 B 3 5 3
9 B 4 6 4
10 B 5 7 5
Request someone to help me here
gather function from tidyr package will be helpful here:
library(tidyverse)
df %>%
gather(x, CFreq, -ID) %>%
select(-x) %>%
na.omit() %>%
unique() %>%
arrange(ID, CFreq)
A different tidyverse possibility could be:
df %>%
nest(A_Frequency, B_Frequency, .key = C_Frequency) %>%
mutate(C_Frequency = map(C_Frequency, function(x) unique(x[!is.na(x)]))) %>%
unnest()
ID C_Frequency
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
9 A 6
10 B 1
11 B 2
12 B 3
13 B 4
14 B 5
18 B 6
19 B 7
A base R approach would be to split the dataframe by ID and, for every element of the resulting list, count the number of unique entries and create a sequence based on that count.
do.call(rbind, lapply(split(df, df$ID), function(x) data.frame(ID = x$ID[1] ,
CFreq = seq_len(length(unique(na.omit(unlist(x[-1]))))))))
# ID CFreq
#A.1 A 1
#A.2 A 2
#A.3 A 3
#A.4 A 4
#A.5 A 5
#A.6 A 6
#B.1 B 1
#B.2 B 2
#B.3 B 3
#B.4 B 4
#B.5 B 5
#B.6 B 6
#B.7 B 7
This will also work when A_Frequency B_Frequency has characters in them or some other random numbers instead of sequential numbers.
In tidyverse we can do
library(tidyverse)
df %>%
group_split(ID) %>%
map_dfr(~ data.frame(ID = .$ID[1],
CFreq= seq_len(length(unique(na.omit(flatten_chr(.[-1])))))))
A data.table option
library(data.table)
cols <- c('A_Frequency', 'B_Frequency')
out <- setDT(df)[, .(CFreq = sort(unique(unlist(.SD)))),
.SDcols = cols,
by = ID]
out
# ID CFreq
# 1: A 1
# 2: A 2
# 3: A 3
# 4: A 4
# 5: A 5
# 6: A 6
# 7: B 1
# 8: B 2
# 9: B 3
#10: B 4
#11: B 5
#12: B 6
#13: B 7

How to mutate multiple variables without repeating codes?

I'm trying to create new variables from existing variables like below:
a1+a2=a3, b1+b2=b3, ..., z1+z2=z3
Here is an example data frame
df <- data.frame(replicate(10,sample(1:10)))
colnames(df) <- c("a1","a2","b1","b2","c1","c2","d1","d2","e1","e2")
Here's my solution with repeating codes
# a solution by base R
df$a3 <- df$a1 + df$a2
df$b3 <- df$b1 + df$b2
df$c3 <- df$c1 + df$c2
df$d3 <- df$d1 + df$d2
df$e3 <- df$e1 + df$e2
Or
# a solution by dplyr
library(dplyr)
# Add one sum column per letter: <letter>3 = <letter>1 + <letter>2.
df <- df %>%
  mutate(
    a3 = a1 + a2,
    b3 = b1 + b2,
    c3 = c1 + c2,
    d3 = d1 + d2,
    # e3 must add e2 (not d2) to agree with the base R and data.table
    # versions of the same computation.
    e3 = e1 + e2
  )
Or
# a solution by data.table
library(data.table)
DT <- data.table(df)
DT[,a3:=a1+a2][,b3:=b1+b2][,c3:=c1+c2][,d3:=d1+d2][,e3:=e1+e2]
Actually I have more than 100 variables, so I want to find a way to do so without repeating code... Although I tried to use mutate_ with standard evaluation and regular expression, I lost my way because I'm a newbie in R. Can you mutate multiple variables without repeating code?
Your data format is making this hard - I would reshape the data like this. In general, you shouldn't encode actual data information in column names, if the difference between a1 and a2 is meaningful, it is better to have a column with letter, a, b, c and a column with number, 1, 2.
df$id = 1:nrow(df)
library(tidyr)
library(dplyr)
tdf = gather(df, key = key, value = value, -id) %>%
separate(key, into = c("letter", "number"), sep = 1) %>%
mutate(number = paste0("V", number)) %>%
spread(key = number, value = value)
## now data is "tidy":
head(tdf)
# id letter V1 V2
# 1 1 a 2 7
# 2 1 b 10 4
# 3 1 c 9 10
# 4 1 d 9 4
# 5 1 e 5 8
# 6 2 a 9 8
## and the operation is simple:
tdf$V3 = tdf$V1 + tdf$V2
head(tdf)
# id letter V1 V2 V3
# 1 1 a 2 7 9
# 2 1 b 10 4 14
# 3 1 c 9 10 19
# 4 1 d 9 4 13
# 5 1 e 5 8 13
# 6 2 a 9 8 17
A possible solution using data.table:
DT <- data.table(df)[, rn := .I]
DTadd3 <- dcast(melt(DT, measure.vars = 1:10)[, `:=` (let = substr(variable,1,1), rn = 1:.N), variable
][, s3 := sum(value), .(let,rn)],
rn ~ paste0(let,3), value.var = 's3', mean)
DT[DTadd3, on = 'rn'][, rn := NULL][]
which gives:
a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
1: 10 5 9 5 10 4 5 3 7 10 15 14 14 8 17
2: 2 6 6 8 3 8 7 1 4 7 8 14 11 8 11
3: 6 4 7 4 4 3 4 6 3 3 10 11 7 10 6
4: 1 2 4 2 9 9 3 7 10 4 3 6 18 10 14
5: 9 10 8 1 8 7 10 5 9 1 19 9 15 15 10
6: 8 8 10 6 2 5 2 4 2 6 16 16 7 6 8
7: 7 9 1 7 5 10 9 2 1 8 16 8 15 11 9
8: 5 1 2 9 7 2 1 8 5 5 6 11 9 9 10
9: 3 7 3 3 1 6 8 10 8 9 10 6 7 18 17
10: 4 3 5 10 6 1 6 9 6 2 7 15 7 15 8
A similar solution using dplyr and tidyr:
df %>%
bind_cols(., df %>%
gather(var, val) %>%
group_by(var) %>%
mutate(let = substr(var,1,1), rn = 1:n()) %>%
group_by(let,rn) %>%
summarise(s3 = sum(val)) %>%
spread(let, s3) %>%
select(-rn)
)
However, as noted by @Gregor, it is much better to transform your data into long format. The data.table equivalent of @Gregor's answer:
DT <- data.table(df)
melt(DT[, rn := .I],
variable.name = 'let',
measure.vars = patterns('1$','2$'),
value.name = paste0('v',1:2)
)[, `:=` (let = letters[let], v3 = v1 + v2)][]
which gives (first 15 rows):
rn let v1 v2 v3
1: 1 a 10 5 15
2: 2 a 2 6 8
3: 3 a 6 4 10
4: 4 a 1 2 3
5: 5 a 9 10 19
6: 6 a 8 8 16
7: 7 a 7 9 16
8: 8 a 5 1 6
9: 9 a 3 7 10
10: 10 a 4 3 7
11: 1 b 9 5 14
12: 2 b 6 8 14
13: 3 b 7 4 11
14: 4 b 4 2 6
15: 5 b 8 1 9
My data.table solution:
sapply(c("a", "b", "c", "d", "e"), function(ll)
df[ , paste0(ll, 3) := get(paste0(ll, 1)) + get(paste0(ll, 2))])
df[]
# a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
# 1: 5 2 2 6 4 1 10 7 3 9 7 8 5 17 12
# 2: 4 8 7 3 3 7 9 6 9 7 12 10 10 15 16
# 3: 10 7 6 10 1 9 4 1 2 4 17 16 10 5 6
# 4: 3 4 1 7 6 4 7 4 7 5 7 8 10 11 12
# 5: 8 3 4 2 2 2 3 3 4 10 11 6 4 6 14
# 6: 6 6 5 1 8 10 1 10 5 3 12 6 18 11 8
# 7: 2 10 8 9 5 6 2 5 10 2 12 17 11 7 12
# 8: 1 1 10 8 9 5 6 9 6 8 2 18 14 15 14
# 9: 9 5 3 5 10 3 5 2 1 6 14 8 13 7 7
# 10: 7 9 9 4 7 8 8 8 8 1 16 13 15 16 9
Or, more extensibly:
sapply(c("a", "b", "c", "d", "e"), function(ll)
df[ , paste0(ll, 3) := Reduce(`+`, mget(paste0(ll, 1:2)))])
If all of the variables fit the pattern of ending with 1 or 2, you might try:
stems = unique(gsub("[0-9]", "", names(df)))
Then sapply(stems, ...)
library(tidyverse)
# Fold over the column stems, adding "<stem>3" = "<stem>1" + "<stem>2" to the
# data frame at each step; the accumulated data frame is returned.
reduce(
  .x = letters[1:5],
  # `.f` has to be passed with `=`. Writing `.f~{ ... }` builds a two-sided
  # formula, which purrr cannot convert to a function.
  .f = function(acc, stem) {
    mutate(
      acc,
      # The glue-style name on the left of := creates the column "<stem>3";
      # .data[[...]] looks the source columns up by their string names.
      "{stem}3" := .data[[str_c(stem, 1)]] + .data[[str_c(stem, 2)]]
    )
  },
  .init = df
)

Resources