My Data Frame looks something like the first three columns of this example:
id obs value newCol
a 1 uncool NA
a 2 cool 1
a 3 uncool NA
a 4 uncool NA
a 5 cool 2
a 6 uncool NA
a 7 cool 1
a 8 uncool NA
b 1 cool 0
What I need is a column (newCol above) that counts the number of "uncool"s between the observations with value "cool" or the first row of the group (grouped by id).
How do I do that (by using dplyr ideally)?
We can define groups by doing a cumsum starting from the bottom, then use ave to build a vector for each group :
transform(dat, newCol = ave(
value, id, rev(cumsum(rev(value=="cool"))),
FUN = function(x) ifelse(x=="cool", length(x)-1, NA)))
# id obs value newCol
# 1 a 1 uncool <NA>
# 2 a 2 cool 1
# 3 a 3 uncool <NA>
# 4 a 4 uncool <NA>
# 5 a 5 cool 2
# 6 a 6 uncool <NA>
# 7 a 7 cool 1
# 8 a 8 uncool <NA>
# 9 b 1 cool 0
With dplyr :
dat %>%
group_by(id,temp = rev(cumsum(rev(value=="cool")))) %>%
mutate(newCol = ifelse(value=="cool", n()-1, NA)) %>%
ungroup() %>%
select(-temp)
# # A tibble: 9 x 4
# id obs value newCol
# <chr> <int> <chr> <dbl>
# 1 a 1 uncool NA
# 2 a 2 cool 1
# 3 a 3 uncool NA
# 4 a 4 uncool NA
# 5 a 5 cool 2
# 6 a 6 uncool NA
# 7 a 7 cool 1
# 8 a 8 uncool NA
# 9 b 1 cool 0
Besides id you need another grouping variable, given by grp = cumsum(dat$value == "cool") - (dat$value == "cool") which is shown below.
Then you can use mutate where we assign sum(value == "uncool") to observations where value == "cool" and NA otherwise within each group.
library(dplyr)
dat %>%
group_by(id, grp = cumsum(dat$value == "cool") - (dat$value == "cool")) %>%
mutate(newCool = if_else(value == "cool", sum(value == "uncool"), NA_integer_))
# A tibble: 9 x 6
# Groups: id, grp [5]
id obs value newCol grp newCool
<chr> <int> <chr> <int> <int> <int>
1 a 1 uncool NA 0 NA
2 a 2 cool 1 0 1
3 a 3 uncool NA 1 NA
4 a 4 uncool NA 1 NA
5 a 5 cool 2 1 2
6 a 6 uncool NA 2 NA
7 a 7 cool 1 2 1
8 a 8 uncool NA 3 NA
9 b 1 cool 0 3 0
data
dat <- structure(list(id = c("a", "a", "a", "a", "a", "a", "a", "a",
"b"), obs = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L), value = c("uncool",
"cool", "uncool", "uncool", "cool", "uncool", "cool", "uncool",
"cool"), newCol = c(NA, 1L, NA, NA, 2L, NA, 1L, NA, 0L)), .Names = c("id",
"obs", "value", "newCol"), class = "data.frame", row.names = c(NA,
-9L))
We can create a helper function that will group value based on cool/uncool, and count the cools, i.e.
library(tidyverse)
f1 <- function(x) {
i1 <- which(x == 'cool')
v1 <- rep(seq_along(i1), c(i1[1], diff(i1)))
if (tail(x, 1) != 'cool') {
return(c(v1, tail(v1, 1) + 1))
} else {
return(v1)
}
}
df %>%
group_by(id) %>%
mutate(new_grp = f1(value)) %>%
group_by(id, new_grp) %>%
mutate(new = length(value[value != 'cool']),
new = replace(new, value != 'cool', NA)) %>%
ungroup() %>%
select(-new_grp)
which gives,
# A tibble: 9 x 5
id obs value newCol new
<fct> <int> <fct> <int> <int>
1 a 1 uncool NA NA
2 a 2 cool 1 1
3 a 3 uncool NA NA
4 a 4 uncool NA NA
5 a 5 cool 2 2
6 a 6 uncool NA NA
7 a 7 cool 1 1
8 a 8 uncool NA NA
9 b 1 cool 0 0
Writing simple function to solve your problem:
# Your data
data <- data.frame(id = c("a", "a", "a", "a", "a", "a" ,"a" ,"a", "b"),
obs = c(1,2,3,4,5,6,7,8,1),
value = c("uncool", "cool", "uncool", "uncool", "cool", "uncool" ,"cool" ,"uncool", "cool"),
stringsAsFactors = FALSE)
# Function for solving problem
cool_counter <- function(vector) {
uncool <- FALSE
count <- 0
results <- list()
for(i in 1:length(vector)) {
if(i == 1) {
uncool <- vector[i] == "uncool"
results[[i]] <- NA
if(uncool) {
count <- 1
}
}
if(i > 1) {
uncool <- vector[i] == "uncool"
if(uncool) {
count <- count + 1
results[[i]] <- NA
}
if(!uncool) {
results[[i]] <- count
count <- 0
}
}
}
return(unlist(results))
}
This gives:
# Running function
library(dplyr)
data <- data %>%
group_by(id) %>%
mutate(newCol = cool_counter(value))
# Results
data
id obs value newCol
<chr> <dbl> <chr> <dbl>
1 a 1 uncool NA
2 a 2 cool 1
3 a 3 uncool NA
4 a 4 uncool NA
5 a 5 cool 2
6 a 6 uncool NA
7 a 7 cool 1
8 a 8 uncool NA
9 b 1 cool NA
Related
How can I expand a group to length of the max group:
df <- structure(list(ID = c(1L, 1L, 2L, 3L, 3L, 3L), col1 = c("A",
"B", "O", "U", "L", "R")), class = "data.frame", row.names = c(NA,
-6L))
ID col1
1 A
1 B
2 O
3 U
3 L
3 R
Desired Output:
1 A
1 B
NA NA
2 O
NA NA
NA NA
3 U
3 L
3 R
You can take advantage of the fact that df[n_bigger_than_nrow,] gives a row of NAs
dplyr
max_n <- max(count(df, ID)$n)
df %>%
group_by(ID) %>%
summarise(cur_data()[seq(max_n),])
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups: ID [3]
#> ID col1
#> <int> <chr>
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
base R
n <- tapply(df$ID, df$ID, length)
max_n <- max(n)
i <- lapply(n, \(x) c(seq(x), rep(Inf, max_n - x)))
i <- Map(`+`, i, c(0, cumsum(head(n, -1))))
df <- df[unlist(i),]
rownames(df) <- NULL
df$ID <- rep(as.numeric(names(i)), each = max_n)
df
#> ID col1
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
Here's a base R solution.
split the df by the ID column, then use lapply to iterate over the split df, and rbind with a data frame of NA if there's fewer row than 3 (max(table(df$ID))).
do.call(rbind,
lapply(split(df, df$ID),
\(x) rbind(x, data.frame(ID = NA, col1 = NA)[rep(1, max(table(df$ID)) - nrow(x)), ]))
)
ID col1
1.1 1 A
1.2 1 B
1.3 NA <NA>
2.3 2 O
2.1 NA <NA>
2.1.1 NA <NA>
3.4 3 U
3.5 3 L
3.6 3 R
Here is a possible tidyverse solution. We can use add_row inside of summarise to add n number of rows to each group. I use max(count(df, ID)$n) to get the max group length, then I subtract that from the number of rows in each group to get the total number of rows that need to be added for each group. I use rep to produce the correct number of values that we need to add for each group. Finally, I replace ID with NA when there is an NA in col1.
library(tidyverse)
df %>%
group_by(ID) %>%
summarise(add_row(cur_data(),
col1 = rep(NA_character_,
unique(max(count(df, ID)$n) - n()))),
.groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Output
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Or another option without using add_row:
library(dplyr)
# Get maximum number of rows for all groups
N = max(count(df,ID)$n)
df %>%
group_by(ID) %>%
summarise(col1 = c(col1, rep(NA, N-length(col1))), .groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Another option could be:
df %>%
group_split(ID) %>%
map_dfr(~ rows_append(.x, tibble(col1 = rep(NA_character_, max(pull(count(df, ID), n)) - group_size(.x)))))
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
A base R using merge + rle
merge(
transform(
data.frame(ID = with(rle(df$ID), rep(values, each = max(lengths)))),
q = ave(ID, ID, FUN = seq_along)
),
transform(
df,
q = ave(ID, ID, FUN = seq_along)
),
all = TRUE
)[-2]
gives
ID col1
1 1 A
2 1 B
3 1 <NA>
4 2 O
5 2 <NA>
6 2 <NA>
7 3 U
8 3 L
9 3 R
A data.table option may also work
> setDT(df)[, .(col1 = `length<-`(col1, max(df[, .N, ID][, N]))), ID]
ID col1
1: 1 A
2: 1 B
3: 1 <NA>
4: 2 O
5: 2 <NA>
6: 2 <NA>
7: 3 U
8: 3 L
9: 3 R
An option to tidyr::complete the ID and row_new, using row_old to replace ID with NA.
library (tidyverse)
df %>%
group_by(ID) %>%
mutate(
row_new = row_number(),
row_old = row_number()) %>%
ungroup() %>%
complete(ID, row_new) %>%
mutate(ID = if_else(is.na(row_old),
NA_integer_,
ID)) %>%
select(-matches("row_"))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA <NA>
4 2 O
5 NA <NA>
6 NA <NA>
7 3 U
8 3 L
9 3 R
n <- max(table(df$ID))
df %>%
group_by(ID) %>%
summarise(col1 =`length<-`(col1, n), .groups = 'drop') %>%
mutate(ID = `is.na<-`(ID, is.na(col1)))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Another base R solution using sequence.
print(
df[
sequence(
abs(rep(i <- rle(df$ID)$lengths, each = 2) - c(0L, max(i))),
rep(cumsum(c(1L, i))[-length(i) - 1L], each = 2) + c(0L, nrow(df)),
),
],
row.names = FALSE
)
#> ID col1
#> 1 A
#> 1 B
#> NA <NA>
#> 2 O
#> NA <NA>
#> NA <NA>
#> 3 U
#> 3 L
#> 3 R
I have a table in R like this:
x Y
1 2 1
2 1 1
3 NA 1
4 2 NA
5 1 2
6 2 2
7 1 1
and what I'm hoping to do is make a new column called xy which bases on if there is a 1 exist in either x or y.
For example, if x is 1 and y is 2 then the xy should be 1 ; if x is NAand y is 1 then the xyshould be 1. If both x and y is 2 then xyshould be 2.
The priority of the categorical variables 1, 2 and NA is 1>2>NA.
In short what my desired output looks like this:
x Y XY
1 2 1 1
2 1 1 1
3 NA 1 1
4 2 NA 2
5 NA NA NA
6 2 2 2
7 1 1 1
I'm new to R and trying to trim my data. Thank you for your help! I'm really appreciated:)
Try this
library(dplyr)
df |> rowwise() |>
mutate(z1 = coalesce(c_across(x) , 0) , z2 = coalesce(c_across(Y) , 0)) |>
mutate(XY = case_when(any(c_across(z1:z2) == 1) ~ 1 , any(c_across(z1:z2) == 2) ~ 2)) |>
select(-z1 , -z2) |> ungroup() -> ans
output
# A tibble: 7 × 3
x Y XY
<int> <int> <dbl>
1 2 1 1
2 1 1 1
3 NA 1 1
4 2 NA 2
5 NA NA NA
6 2 2 2
7 1 1 1
data
df <- structure(list(x = c(2L, 1L, NA, 2L, NA, 2L, 1L), Y = c(1L, 1L,
1L, NA, NA, 2L, 1L)), row.names = c("1", "2", "3", "4", "5",
"6", "7"), class = "data.frame")
You could do it with a case_when (remembering that it evaluates from the bottom and up):
library(dplyr)
df <-
df |>
mutate(XY = case_when(x == 1 | Y == 1 ~ 1,
x == 2 | Y == 2 ~ 2,
TRUE ~ NA_real_))
Or apply the same logic using base functionalities:
df$XY <- NA
df$XY[df$x == 2 | df$Y == 2] <- 2
df$XY[df$x == 1 | df$Y == 1] <- 1
Output:
x Y XY
<dbl> <dbl> <dbl>
1 2 1 1
2 1 1 1
3 NA 1 1
4 2 NA 2
5 NA NA NA
6 2 2 2
7 1 1 1
Data:
library(readr)
df <- read_table("
x Y
2 1
1 1
NA 1
2 NA
NA NA
2 2
1 1")
Here is a base R approach. For each row, check if any value is 1 (removing NA) and if so, set value of XY of 1. Then, check for any value of 2 in a similar fashion. If neither of those found, then set as NA. If you have more columns, you can subset in the function call for those specific columns to be evaluated (in this case, x and Y).
df$XY <- apply(df,
1,
function(x) {
if (any(x == 1, na.rm = T)) return(1)
if (any(x == 2, na.rm = T)) return(2)
return(NA)
})
Output
x Y XY
1 2 1 1
2 1 1 1
3 NA 1 1
4 2 NA 2
5 NA NA NA
6 2 2 2
7 1 1 1
I want to replace value(s) with NA by group.
have <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,4,5,6,7))
want1 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,NA,5,6,NA))
want2 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,NA,NA,5,NA,NA))
want1 corresponds to replacing the last obs of value with NA and want2 corresponds to replacing last obs of value & last 2nd value with NA. I'm currently trying to do with with dplyr package but can't seem to get any traction. Any help would be much appreciated. Thanks!
We can use row_number() to test the current row against n() the total rows in the group.
have |>
group_by(id) |>
mutate(
last1 = ifelse(row_number() == n(), NA, value),
last2 = ifelse(row_number() >= n() - 1, NA, value)
)
# # A tibble: 7 × 4
# # Groups: id [2]
# id value last1 last2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1
# 2 1 2 2 2
# 3 1 3 3 NA
# 4 1 4 NA NA
# 5 2 5 5 5
# 6 2 6 6 NA
# 7 2 7 NA NA
And a general way to provide variants as different data frames.
lapply(
1:2,
function(k) {
have %>%
group_by(id) %>%
mutate(value=ifelse(row_number() <= (n() - k), value, NA))
}
)
[[1]]
# A tibble: 7 × 2
# Groups: id [2]
id value
<dbl> <dbl>
1 1 1
2 1 2
3 1 3
4 1 NA
5 2 5
6 2 6
7 2 NA
[[2]]
# A tibble: 7 × 2
# Groups: id [2]
id value
<dbl> <dbl>
1 1 1
2 1 2
3 1 NA
4 1 NA
5 2 5
6 2 NA
7 2 NA
Here is a base R way.
have <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,4,5,6,7))
want1 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,3,NA,5,6,NA))
want2 <- data.frame(id = c(1,1,1,1,2,2,2),
value = c(1,2,NA,NA,5,NA,NA))
with(have, ave(value, id, FUN = \(x){
x[length(x)] <- NA
x
}))
#> [1] 1 2 3 NA 5 6 NA
with(have, ave(value, id, FUN = \(x){
x[length(x)] <- NA
if(length(x) > 1)
x[length(x) - 1L] <- NA
x
}))
#> [1] 1 2 NA NA 5 NA NA
Created on 2022-06-09 by the reprex package (v2.0.1)
Then reassign these results to column value.
I have the following data
df <- tibble(Type=c(1,2,2,1,1,2),ID=c(6,4,3,2,1,5))
Type ID
1 6
2 4
2 3
1 2
1 1
2 5
For each of the type 2 rows, I want to find the IDs of the type 1 rows just below and above them. For the above dataset, the output will be:
Type ID IDabove IDbelow
1 6 NA NA
2 4 6 2
2 3 6 2
1 2 NA NA
1 1 NA NA
2 5 1 NA
Naively, I can write a for loop to achieve this, but that would be too time consuming for the dataset I am dealing with.
One approach using dplyr lead,lag to get next and previous value respectively and data.table's rleid to create groups of consecutive Type values.
library(dplyr)
library(data.table)
df %>%
mutate(IDabove = ifelse(Type == 2, lag(ID), NA),
IDbelow = ifelse(Type == 2, lead(ID), NA),
grp = rleid(Type)) %>%
group_by(grp) %>%
mutate(IDabove = first(IDabove),
IDbelow = last(IDbelow)) %>%
ungroup() %>%
select(-grp)
# Type ID IDabove IDbelow
# <dbl> <dbl> <dbl> <dbl>
#1 1 6 NA NA
#2 2 4 6 2
#3 2 3 6 2
#4 1 2 NA NA
#5 1 1 NA NA
#6 2 5 1 NA
A dplyr only solution:
You could create your own rleid function then apply the logic provided by Ronak(Many thanks. Upvoted).
library(dplyr)
my_func <- function(x) {
x <- rle(x)$lengths
rep(seq_along(x), times=x)
}
# this part is the same as provided by Ronak.
df %>%
mutate(IDabove = ifelse(Type == 2, lag(ID), NA),
IDbelow = ifelse(Type == 2, lead(ID), NA),
grp = my_func(Type)) %>%
group_by(grp) %>%
mutate(IDabove = first(IDabove),
IDbelow = last(IDbelow)) %>%
ungroup() %>%
select(-grp)
Output:
Type ID IDabove IDbelow
<dbl> <dbl> <dbl> <dbl>
1 1 6 NA NA
2 2 4 6 2
3 2 3 6 2
4 1 2 NA NA
5 1 1 NA NA
6 2 5 1 NA
I have a data frame like this:
name count
a 3
a 5
a 8
b 2
a 9
b 7
so I want to calculate the row differences group by name. so my code is:
data%>%group_by(Name)%>%mutate(last_count = lag(count),diff = count - last_count)
However, I get a result like the below table
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 8 1
b 7 2 5
But what I want should look like this:
name count last_count diff
a 3 NA NA
a 5 3 2
a 8 5 3
b 2 NA NA
a 9 NA NA
b 7 NA NA
Thanks in advance to whoever can help me fix it!
Does this work:
> library(dplyr)
> df %>% mutate(last_count = case_when(name == lag(name) ~ lag(count), TRUE ~ NA_real_),
diff = case_when(name == lag(name) ~ count - lag(count), TRUE ~ NA_real_))
# A tibble: 6 x 4
name count last_count diff
<chr> <dbl> <dbl> <dbl>
1 a 3 NA NA
2 a 5 3 2
3 a 8 5 3
4 b 2 NA NA
5 a 9 NA NA
6 b 7 NA NA
>
We could use rleid to create a grouping column based on the adjacent matching values in the 'name' column and then apply the diff
library(dplyr)
library(data.table)
data %>%
group_by(grp = rleid(name)) %>%
mutate(last_count = lag(count), diff = count - last_count) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 6 x 4
# name count last_count diff
# <chr> <int> <int> <int>
#1 a 3 NA NA
#2 a 5 3 2
#3 a 8 5 3
#4 b 2 NA NA
#5 a 9 NA NA
#6 b 7 NA NA
Or using base R with ave and rle
data$diff <- with(data, ave(count, with(rle(name),
rep(seq_along(values), lengths)), FUN = function(x) c(NA, diff(x)))
data
data <- structure(list(name = c("a", "a", "a", "b", "a", "b"), count = c(3L,
5L, 8L, 2L, 9L, 7L)), class = "data.frame", row.names = c(NA,
-6L))