my_df <- tibble(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(NA, 4, 6, 2, 6, 6, 1, 1, 7),
b3 = c(5, 9, 8, NA, 2, 3, 9, 5, NA),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5))
Hi Guys,
Hope you are all good. I have a df like this (a very big one), and I want to tell R to add 10 to the values in b1 if there is a 2 in either b6, b7, b8 or b9.
Thanks once again in anticipation.
We can create a logical condition in case_when by comparing the subset of columns b6:b9 with 2 and taking the row sums, to find whether a row has at least one 2 in those columns; if so, add 10 to b1, otherwise return the original value.
library(dplyr)
# Flag the rows where any of b6:b9 equals 2 (NA comparisons are dropped by
# na.rm = TRUE), then add 10 to b1 on the flagged rows and keep b1 elsewhere.
my_df <- my_df %>%
  mutate(
    b1 = if_else(
      rowSums(select(cur_data(), b6:b9) == 2, na.rm = TRUE) > 0,
      b1 + 10,
      b1
    )
  )
-output
my_df
# A tibble: 9 x 10
b1 b2 b3 b4 b5 b6 b7 b8 b9 b10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 NA 5 NA 2 9 1 NA 4 14
2 16 4 9 6 12 2 3 8 5 2
3 3 6 8 NA 1 4 7 4 7 4
4 6 2 NA 10 7 6 7 5 9 2
5 4 6 2 12 8 7 4 1 5 1
6 12 6 3 8 5 6 2 4 1 1
7 11 1 9 3 5 6 2 1 1 1
8 19 1 5 6 6 7 9 3 2 1
9 NA 7 NA 2 NA 9 5 6 NA 5
Or may also use if_any
my_df %>%
mutate(b1 = case_when(if_any(b6:b9, `%in%`, 2) ~ b1 + 10, TRUE ~ b1))
-output
# A tibble: 9 x 10
b1 b2 b3 b4 b5 b6 b7 b8 b9 b10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 NA 5 NA 2 9 1 NA 4 14
2 16 4 9 6 12 2 3 8 5 2
3 3 6 8 NA 1 4 7 4 7 4
4 6 2 NA 10 7 6 7 5 9 2
5 4 6 2 12 8 7 4 1 5 1
6 12 6 3 8 5 6 2 4 1 1
7 11 1 9 3 5 6 2 1 1 1
8 19 1 5 6 6 7 9 3 2 1
9 NA 7 NA 2 NA 9 5 6 NA 5
Or the same in base R
# TRUE for rows where at least one of columns 6 to 9 equals 2; na.rm = TRUE
# keeps NA comparisons from turning the row result into NA.
i1 <- rowSums(my_df[6:9] == 2, na.rm = TRUE) > 0
# Add 10 to b1 on the flagged rows only.
my_df$b1[i1] <- my_df$b1[i1] + 10
Or with Reduce/lapply and %in%
i1 <- Reduce(`|`, lapply(my_df[6:9], `%in%`, 2))
my_df$b1[i1] <- my_df$b1[i1] + 10
You can also use the following solution:
library(dplyr)
library(purrr)
my_df %>%
pmap_df(~ {x <- c(...)[6:9];
y <- c(...)[1]
if(any(2 %in% x[!is.na(x)])) {
y + 10
} else {
y
}
}) %>%
bind_cols(my_df[-1])
# A tibble: 9 x 10
b1 b2 b3 b4 b5 b6 b7 b8 b9 b10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 NA 5 NA 2 9 1 NA 4 14
2 16 4 9 6 12 2 3 8 5 2
3 3 6 8 NA 1 4 7 4 7 4
4 6 2 NA 10 7 6 7 5 9 2
5 4 6 2 12 8 7 4 1 5 1
6 12 6 3 8 5 6 2 4 1 1
7 11 1 9 3 5 6 2 1 1 1
8 19 1 5 6 6 7 9 3 2 1
9 NA 7 NA 2 NA 9 5 6 NA 5
Or we can use this thanks to a great suggestion by dear #akrun:
my_df %>%
mutate(b1 = ifelse(pmap_lgl(select(cur_data(), b6:b9), ~ 2 %in% c(...)), b1 + 10, b1))
Like your previous question, you can also use rowwise() here
# Row-by-row check: add 10 to b1 when any of b6:b9 equals 2 in that row.
# na.rm = TRUE makes a row with no match return FALSE even if it contains NA.
# Each row is a scalar under rowwise(), so plain if/else replaces ifelse(), and
# TRUE is spelled out because T can be reassigned.
my_df %>%
  rowwise() %>%
  mutate(b1 = if (any(c_across(b6:b9) == 2, na.rm = TRUE)) b1 + 10 else b1)
# A tibble: 9 x 10
# Rowwise:
b1 b2 b3 b4 b5 b6 b7 b8 b9 b10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 NA 5 NA 2 9 1 NA 4 14
2 16 4 9 6 12 2 3 8 5 2
3 3 6 8 NA 1 4 7 4 7 4
4 6 2 NA 10 7 6 7 5 9 2
5 4 6 2 12 8 7 4 1 5 1
6 12 6 3 8 5 6 2 4 1 1
7 11 1 9 3 5 6 2 1 1 1
8 19 1 5 6 6 7 9 3 2 1
9 NA 7 NA 2 NA 9 5 6 NA 5
Related
I have a set of dataframes in a list and have to create an extra column for each dataframe (which I´ve done) and then create a formula for the first row, and a different one from the second row onwards taking lags from the same column:
Let's say the list name is "CNRRF_list"
Creation of the fourth column (X4)
CNRRF_list<- mapply(cbind, CNRRF_list, "X4"=NA,SIMPLIFY=F)
one of the resulting dataframes
x1 x2 x3 x4
1 1 1 1 NA
2 2 2 2 NA
3 3 3 3 NA
4 4 4 4 NA
5 5 5 5 NA
6 6 6 6 NA
7 7 7 7 NA
8 8 8 8 NA
First formula first row
for (i in seq_along(CNRRF_list)) {
CNRRF_list[[i]]$X4[1]<-(1+CNRRF_list[[i]]$X3[1])
}
Resulting data
x1 x2 x3 x4
1 1 1 1 2 ===> "formula (1+X3)=(1+1)=2"
2 2 2 2 NA
3 3 3 3 NA
4 4 4 4 NA
5 5 5 5 NA
6 6 6 6 NA
7 7 7 7 NA
8 8 8 8 NA
now it gets tricky, from the second row onwards the formula is:
lag(X4)*(1+X3)
so the resulting data should look like this for each dataframe in the list:
x1 x2 x3 x4
1 1 1 1 2
2 2 2 2 6 ===> "formula lag(X4)*(1+x3)=2*(1+2)=6"
3 3 3 3 24 ===> "formula 6*(1+3)"
4 4 4 4 120 ===> "formula 24*(1+4)"
5 5 5 5 720 ===> "formula 120*(1+5)"
6 6 6 6 5040 ===> "formula 720*(1+6)"
7 7 7 7 40320 ===> "formula 5040*(1+7)"
8 8 8 8 362880 ===> "formula 40320*(1+8)"
But I haven´t been able to create a good enough formula.
some of my attempts
for (i in seq_along(CNRRF_list)) {
CNRRF_list[[i]] <- mutate(CNRRF_list[[i]], X4 = (ifelse(is.na(CNRRF_list[[i]]$X4),lag(CNRRF_list[[i]]$X4)*(1+CNRRF_list[[i]]$X3), 1*(1+CNRRF_list[[i]]$X3))))
}
Not working...any help will be appreciate.
Thanks
How about this:
dat <- tibble::tribble(
~x1, ~x2, ~x3, ~x4,
1, 1, 1, NA,
2, 2, 2, NA,
3, 3, 3, NA,
4, 4, 4, NA,
5, 5, 5, NA,
6, 6, 6, NA,
7, 7, 7, NA,
8, 8, 8, NA)
# x4 follows the recurrence x4[1] = 1 + x3[1], x4[i] = x4[i - 1] * (1 + x3[i]),
# i.e. the running product of (1 + x3).
growth <- 1 + dat$x3
# Missing factors are skipped (treated as 1), matching prod(..., na.rm = TRUE).
growth[is.na(growth)] <- 1
# cumprod() replaces the row loop: it works for zero-row data (1:nrow(dat)
# would iterate over c(1, 0)) and does not depend on whether lag() resolves to
# stats::lag() or dplyr::lag(), which shift values differently.
dat$x4 <- cumprod(growth)
dat
#> # A tibble: 8 × 4
#> x1 x2 x3 x4
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 2
#> 2 2 2 2 6
#> 3 3 3 3 24
#> 4 4 4 4 120
#> 5 5 5 5 720
#> 6 6 6 6 5040
#> 7 7 7 7 40320
#> 8 8 8 8 362880
Created on 2022-04-05 by the reprex package (v2.0.1)
Edit: Apply to a list of data frames
Here's how you could apply this to a list of data frames.
dat <- tibble::tribble(
~x1, ~x2, ~x3, ~x4,
1, 1, 1, NA,
2, 2, 2, NA,
3, 3, 3, NA,
4, 4, 4, NA,
5, 5, 5, NA,
6, 6, 6, NA,
7, 7, 7, NA,
8, 8, 8, NA)
dat_list <- list(dat, dat, dat)
# For every data frame in the list, fill x4 with the running product of
# (1 + x3): x4[1] = 1 + x3[1] and x4[i] = x4[i - 1] * (1 + x3[i]).
res <- lapply(dat_list, function(x) {
  growth <- 1 + x$x3
  # Missing factors are skipped (treated as 1), matching prod(..., na.rm = TRUE).
  growth[is.na(growth)] <- 1
  # cumprod() avoids the row loop, works for zero-row frames (1:nrow(x) would
  # iterate over c(1, 0)), and does not depend on whether lag() is stats::lag()
  # or dplyr::lag().
  x$x4 <- cumprod(growth)
  x
})
res
#> [[1]]
#> # A tibble: 8 × 4
#> x1 x2 x3 x4
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 2
#> 2 2 2 2 6
#> 3 3 3 3 24
#> 4 4 4 4 120
#> 5 5 5 5 720
#> 6 6 6 6 5040
#> 7 7 7 7 40320
#> 8 8 8 8 362880
#>
#> [[2]]
#> # A tibble: 8 × 4
#> x1 x2 x3 x4
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 2
#> 2 2 2 2 6
#> 3 3 3 3 24
#> 4 4 4 4 120
#> 5 5 5 5 720
#> 6 6 6 6 5040
#> 7 7 7 7 40320
#> 8 8 8 8 362880
#>
#> [[3]]
#> # A tibble: 8 × 4
#> x1 x2 x3 x4
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 2
#> 2 2 2 2 6
#> 3 3 3 3 24
#> 4 4 4 4 120
#> 5 5 5 5 720
#> 6 6 6 6 5040
#> 7 7 7 7 40320
#> 8 8 8 8 362880
Created on 2022-04-05 by the reprex package (v2.0.1)
Another option is to use the base function Reduce
using data.table
library(data.table)
setDT(dt) # make it a data.table if it is not already
# Running product of (x3 + 1). Seeding Reduce() with init = 1 and dropping the
# seed gives x4[1] = 1 + x3[1] for any x3; a fixed "2 *" multiplier is only
# correct when x3[1] == 1.
dt[, x4 := Reduce(
  f = function(acc, val) acc * (val + 1),
  x = x3,
  init = 1,
  accumulate = TRUE
)[-1]]
using dplyr
# Running product of (x3 + 1). Seeding Reduce() with init = 1 and dropping the
# seed gives x4[1] = 1 + x3[1] for any x3; a fixed "2 *" multiplier is only
# correct when x3[1] == 1.
dt %>%
  mutate(
    x4 = Reduce(
      f = function(acc, val) acc * (val + 1),
      x = x3,
      init = 1,
      accumulate = TRUE
    )[-1]
  )
output
# x1 x2 x3 x4
# 1: 1 1 1 2
# 2: 2 2 2 6
# 3: 3 3 3 24
# 4: 4 4 4 120
# 5: 5 5 5 720
# 6: 6 6 6 5040
# 7: 7 7 7 40320
# 8: 8 8 8 362880
data
dt <- data.frame(x1 = seq(1:8), x2 = seq(1:8), x3 = seq(1:8))
With accumulate:
library(tidyverse)
# Accumulate the running product of (1 + x3) so the result follows the values
# in x3 instead of assuming x3 is 1, 2, ..., nrow(dat).
# Note: x4 is returned as a double column.
dat %>%
  mutate(x4 = accumulate(1 + x3, `*`))
# A tibble: 8 x 4
x1 x2 x3 x4
<dbl> <dbl> <dbl> <int>
1 1 1 1 2
2 2 2 2 6
3 3 3 3 24
4 4 4 4 120
5 5 5 5 720
6 6 6 6 5040
7 7 7 7 40320
8 8 8 8 362880
For multiple dataframes:
# Store the list under the name the pipeline reads; assigning it to `list`
# masked base::list() while the code below pipes `dat_list`.
dat_list <- list(dat, dat, dat)
# For each data frame, x4 is the running product of (1 + x3), taken from the
# data rather than from the row count.
dat_list %>%
  map(~ mutate(.x, x4 = purrr::accumulate(1 + x3, `*`)))
I have a dataset with the following layout:
ABC1a_1 <- c(1, 5, 3, 4, 3, 4, 5, 2, 2, 1)
ABC1b_1 <- c(4, 2, 1, 1, 5, 3, 2, 1, 1, 5)
ABC1a_2 <- c(4, 5, 5, 4, 2, 5, 5, 1, 2, 4)
ABC1b_2 <- c(2, 3, 3, 2, 2, 3, 2, 1, 4, 2)
ABC2a_1 <- c(2, 5, 3, 5, 3, 4, 5, 3, 2, 3)
ABC2b_1 <- c(1, 2, 2, 4, 5, 3, 2, 4, 1, 4)
ABC2a_2 <- c(2, 5, 5, 1, 2, 1, 5, 1, 3, 4)
ABC2b_2 <- c(2, 3, 3, 2, 1, 3, 1, 1, 2, 2)
df <- data.frame(ABC1a_1, ABC1b_1, ABC1a_2, ABC1b_2, ABC2a_1, ABC2b_1, ABC2a_2, ABC2b_2)
I want to collapse all of the ABC[N][x]_[n] variables into a single ABC[N]_[n] variable like this:
ABC1_1 <- c(1, 5, 3, 4, 3, 4, 5, 2, 2, 1, 4, 2, 1, 1, 5, 3, 2, 1, 1, 5)
ABC1_2 <- c(4, 5, 5, 4, 2, 5, 5, 1, 2, 4, 2, 3, 3, 2, 2, 3, 2, 1, 4, 2)
ABC2_1 <- c(2, 5, 3, 5, 3, 4, 5, 3, 2, 3, 1, 2, 2, 4, 5, 3, 2, 4, 1, 4)
ABC2_2 <- c(2, 5, 5, 1, 2, 1, 5, 1, 3, 4, 2, 3, 3, 2, 1, 3, 1, 1, 2, 2)
df2 <- data.frame(ABC1_1, ABC1_2, ABC2_1, ABC2_2)
What's the best way to achieve this, ideally with a tidyverse solution?
You could also use pivot_longer:
df %>%
rename_with(~str_replace(.x, "(.)(_\\d)", "\\2:\\1")) %>%
pivot_longer(everything(), names_sep = ':', names_to = c(".value", "group")) %>%
arrange(group)
# A tibble: 20 x 5
group ABC1_1 ABC1_2 ABC2_1 ABC2_2
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 1 4 2 2
2 a 5 5 5 5
3 a 3 5 3 5
4 a 4 4 5 1
5 a 3 2 3 2
6 a 4 5 4 1
7 a 5 5 5 5
8 a 2 1 3 1
9 a 2 2 2 3
10 a 1 4 3 4
11 b 4 2 1 2
12 b 2 3 2 3
13 b 1 3 2 3
14 b 1 2 4 2
15 b 5 2 5 1
16 b 3 3 3 3
17 b 2 2 2 1
18 b 1 1 4 1
19 b 1 4 1 2
20 b 5 2 4 2
If you desire to go the Base R way, you could do:
# Group the columns whose names differ only in the character before "_"
# (e.g. ABC1a_1 and ABC1b_1) and stack each group into one long column.
# Arguments are named in full rather than relying on partial matching of
# `direction`.
reshape(
  df,
  varying = split(names(df), sub("._", "_", names(df))),
  direction = "long"
)
time ABC1a_1 ABC1a_2 ABC2a_1 ABC2a_2 id
1.1 1 1 4 2 2 1
2.1 1 5 5 5 5 2
3.1 1 3 5 3 5 3
4.1 1 4 4 5 1 4
5.1 1 3 2 3 2 5
6.1 1 4 5 4 1 6
7.1 1 5 5 5 5 7
8.1 1 2 1 3 1 8
9.1 1 2 2 2 3 9
10.1 1 1 4 3 4 10
1.2 2 4 2 1 2 1
2.2 2 2 3 2 3 2
3.2 2 1 3 2 3 3
4.2 2 1 2 4 2 4
5.2 2 5 2 5 1 5
6.2 2 3 3 3 3 6
7.2 2 2 2 2 1 7
8.2 2 1 1 4 1 8
9.2 2 1 4 1 2 9
10.2 2 5 2 4 2 10
Then you can change the names.
If you care about the names from the very beginning:
# Move the letter behind the numeric suffix ("ABC1a_1" -> "ABC1_1.a") so that
# reshape() can split each name at "." into the value column and time level.
df1 <- setNames(df, gsub("(.)(_\\d)", "\\2.\\1", names(df)))
# Arguments are named in full rather than relying on partial matching of
# `direction`.
reshape(df1, varying = names(df1), direction = "long")
time ABC1_1 ABC1_2 ABC2_1 ABC2_2 id
1 a 1 4 2 2 1
2 a 5 5 5 5 2
3 a 3 5 3 5 3
4 a 4 4 5 1 4
5 a 3 2 3 2 5
6 a 4 5 4 1 6
7 a 5 5 5 5 7
8 a 2 1 3 1 8
9 a 2 2 2 3 9
10 a 1 4 3 4 10
11 b 4 2 1 2 1
12 b 2 3 2 3 2
13 b 1 3 2 3 3
14 b 1 2 4 2 4
15 b 5 2 5 1 5
16 b 3 3 3 3 6
17 b 2 2 2 1 7
18 b 1 1 4 1 8
19 b 1 4 1 2 9
20 b 5 2 4 2 10
A base R solution to collapse it:
res <- as.data.frame(lapply(split.default(df, sub('._', '_', names(df))), unlist))
rownames(res) <- NULL
# >res
# ABC1_1 ABC1_2 ABC2_1 ABC2_2
# 1 1 4 2 2
# 2 5 5 5 5
# 3 3 5 3 5
# 4 4 4 5 1
# 5 3 2 3 2
# 6 4 5 4 1
# 7 5 5 5 5
# 8 2 1 3 1
# 9 2 2 2 3
# 10 1 4 3 4
# 11 4 2 1 2
# 12 2 3 2 3
# 13 1 3 2 3
# 14 1 2 4 2
# 15 5 2 5 1
# 16 3 3 3 3
# 17 2 2 2 1
# 18 1 1 4 1
# 19 1 4 1 2
# 20 5 2 4 2
identical(df2, res)
# [1] TRUE
Using rowSums as the function to combine column values would be better I guess:
> as.data.frame(lapply(split.default(df, sub('._', '_', names(df))), rowSums))
ABC1_1 ABC1_2 ABC2_1 ABC2_2
1 5 6 3 4
2 7 8 7 8
3 4 8 5 8
4 5 6 9 3
5 8 4 8 3
6 7 8 7 4
7 7 7 7 6
8 3 2 7 2
9 3 6 3 5
10 6 6 7 6
You could do:
library(tidyverse)
# For each letter, keep the columns whose names contain it (case-sensitive, so
# the "A" in "ABC" is not matched), strip the letter from the names, and
# row-bind the pieces. rename_with() replaces the deprecated funs() helper and
# the superseded rename_all().
map_dfr(
  c("a", "b"),
  function(letter) {
    df %>%
      select(contains(letter, ignore.case = FALSE)) %>%
      rename_with(~ str_remove_all(.x, letter))
  }
)
#ABC1_1 ABC1_2 ABC2_1 ABC2_2
#1 1 4 2 2
#2 5 5 5 5
#3 3 5 3 5
#4 4 4 5 1
# ..
Depending on your actual data, you could replace c("a", "b") with letters[1:2] or unique(str_extract(colnames(df), "[a-z]")).
library(tidyr)
library(dplyr)
df %>%
pivot_longer(everything()) %>%
arrange(name) %>%
mutate(name = gsub("[a-z]_", "_", name)) %>%
pivot_wider(values_fn = list) %>%
unchop(everything())
pivot_longer will put all the column names into a single column name that you can then edit by removing the lowercase letter preceding the underscore.
Then when you pivot back to a wide format the columns will automatically group. The output of pivot_wider are list-columns, unchop will convert these lists into a longer dataframe.
Output
ABC1_1 ABC1_2 ABC2_1 ABC2_2
<dbl> <dbl> <dbl> <dbl>
1 1 4 2 2
2 5 5 5 5
3 3 5 3 5
4 4 4 5 1
5 3 2 3 2
6 4 5 4 1
7 5 5 5 5
8 2 1 3 1
9 2 2 2 3
10 1 4 3 4
11 4 2 1 2
12 2 3 2 3
13 1 3 2 3
14 1 2 4 2
15 5 2 5 1
16 3 3 3 3
17 2 2 2 1
18 1 1 4 1
19 1 4 1 2
20 5 2 4 2
I have the following data set containing duplicate columns and I would like to stack them but in the following way. I can get the desired output with bind_rows but I would like to try it with tidyr functions:
df <- tibble(
runs = c(1, 2, 3, 4),
col1 = c(3, 4, 5, 5),
col2 = c(5, 3, 1, 4),
col3 = c(6, 4, 9, 2),
col1 = c(0, 2, 2, 1),
col2 = c(2, 3, 1, 7),
col3 = c(2, 4, 9, 9),
col1 = c(3, 4, 5, 7),
col2 = c(3, 3, 1, 4),
col3 = c(3, 2, NA, NA), .name_repair = "minimal")
df %>%
select(runs, 2:4) %>%
bind_rows(df %>%
select(runs, 5:7)) %>%
bind_rows(df %>%
select(runs, 8:10))
# A tibble: 12 x 4 # This is my desired output in a way that column runs is a repeated number of 1 to 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 2 4 3 4
3 3 5 1 9
4 4 5 4 2
5 1 0 2 2
6 2 2 3 4
7 3 2 1 9
8 4 1 7 9
9 1 3 3 3
10 2 4 3 2
11 3 5 1 NA
12 4 7 4 NA
However when I use tidyr the runs is arranged differently in the following way.
df %>%
pivot_longer(-runs) %>%
group_by(name) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-id)
# A tibble: 12 x 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 1 0 2 2
3 1 3 3 3
4 2 4 3 4
5 2 2 3 4
6 2 4 3 2
7 3 5 1 9
8 3 2 1 9
9 3 5 1 NA
10 4 5 4 2
11 4 1 7 9
12 4 7 4 NA
I would be grateful if you could let me know how I could rearrange runs so that the numbers are sequential and not like three 1 in a row and ...
Thank you very much in advance.
There may be a more elegant way to do this, but could you not simply group by runs and use the row numbers to arrange.
df %>%
pivot_longer(cols = starts_with("col"),
names_to = c(".value")) %>%
group_by(runs) %>%
mutate(grp_n = row_number()) %>%
ungroup() %>%
arrange(grp_n, runs)
# A tibble: 12 x 5
runs col1 col2 col3 grp_n
<dbl> <dbl> <dbl> <dbl> <int>
1 1 3 5 6 1
2 2 4 3 4 1
3 3 5 1 9 1
4 4 5 4 2 1
5 1 0 2 2 2
6 2 2 3 4 2
7 3 2 1 9 2
8 4 1 7 9 2
9 1 3 3 3 3
10 2 4 3 2 3
11 3 5 1 NA 3
12 4 7 4 NA 3
A base R option using split.default :
data.frame(runs = df$runs,
sapply(split.default(df[-1], names(df)[-1]), unlist),row.names = NULL)
# runs col1 col2 col3
#1 1 3 5 6
#2 2 4 3 4
#3 3 5 1 9
#4 4 5 4 2
#5 1 0 2 2
#6 2 2 3 4
#7 3 2 1 9
#8 4 1 7 9
#9 1 3 3 3
#10 2 4 3 2
#11 3 5 1 NA
#12 4 7 4 NA
I have two columns about the IDs of participants in my study. The column ID contains progressive order of numbers as the subjects were all distinct people. The second column new_ID contains the information about which IDs correspond to the same person. Unfortunately they are not in the progressive order.
ID <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6)
new_ID <- c(8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10)
data.frame(ID, new_ID)
# ID new_ID
#1 1 8
#2 1 8
#3 1 8
#4 1 8
#5 2 10
#6 2 10
#7 2 10
#8 2 10
#9 2 10
#10 2 10
#11 3 8
#12 3 8
#13 3 8
#14 3 8
#15 3 8
#16 4 4
#17 4 4
#18 4 4
#19 4 4
#20 4 4
#21 4 4
#22 5 5
#23 5 5
#24 5 5
#25 5 5
#26 6 10
#27 6 10
#28 6 10
#29 6 10
#30 6 10
#31 6 10
#32 6 10
I reported below what I would like to achieve, i.e. assigning the new ID (final_ID) based on the information in the two first columns. Any helps will be appreciated (best if using dplyr)!
# ID new_ID ID_final
#1 1 8 1
#2 1 8 1
#3 1 8 1
#4 1 8 1
#5 2 10 2
#6 2 10 2
#7 2 10 2
#8 2 10 2
#9 2 10 2
#10 2 10 2
#11 3 8 1
#12 3 8 1
#13 3 8 1
#14 3 8 1
#15 3 8 1
#16 4 4 4
#17 4 4 4
#18 4 4 4
#19 4 4 4
#20 4 4 4
#21 4 4 4
#22 5 5 5
#23 5 5 5
#24 5 5 5
#25 5 5 5
#26 6 10 2
#27 6 10 2
#28 6 10 2
#29 6 10 2
#30 6 10 2
#31 6 10 2
#32 6 10 2
Here's a data.table solution as well.
EDIT: added a dplyr solution too at the request of the OP.
library(data.table)
ID <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6)
new_ID <- c(8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10)
d <- data.table(ID, new_ID)
# Within each new_ID group, the final ID is the smallest original ID.
# Referring to ID directly avoids materialising .SD for every group.
d[, ID_final := min(ID), by = new_ID]
d
#> ID new_ID ID_final
#> 1: 1 8 1
#> 2: 1 8 1
#> 3: 1 8 1
#> 4: 1 8 1
#> 5: 2 10 2
#> 6: 2 10 2
#> 7: 2 10 2
#> 8: 2 10 2
#> 9: 2 10 2
#> 10: 2 10 2
#> 11: 3 8 1
#> 12: 3 8 1
#> 13: 3 8 1
#> 14: 3 8 1
#> 15: 3 8 1
#> 16: 4 4 4
#> 17: 4 4 4
#> 18: 4 4 4
#> 19: 4 4 4
#> 20: 4 4 4
#> 21: 4 4 4
#> 22: 5 5 5
#> 23: 5 5 5
#> 24: 5 5 5
#> 25: 5 5 5
#> 26: 6 10 2
#> 27: 6 10 2
#> 28: 6 10 2
#> 29: 6 10 2
#> 30: 6 10 2
#> 31: 6 10 2
#> 32: 6 10 2
#> ID new_ID ID_final
library(dplyr)
df <- data.frame(ID, new_ID)
df <- df %>% group_by(new_ID) %>%
mutate(ID_final = min(ID))
df
#> # A tibble: 32 x 3
#> # Groups: new_ID [4]
#> ID new_ID ID_final
#> <dbl> <dbl> <dbl>
#> 1 1 8 1
#> 2 1 8 1
#> 3 1 8 1
#> 4 1 8 1
#> 5 2 10 2
#> 6 2 10 2
#> 7 2 10 2
#> 8 2 10 2
#> 9 2 10 2
#> 10 2 10 2
#> # ... with 22 more rows
Created on 2019-09-30 by the reprex package (v0.3.0)
What you want to do is find the correct ID for each new_ID, and then join to that mapping.
final_id_map <- df %>% group_by(new_ID) %>% summarise(ID_final=min(ID))
> final_id_map
# A tibble: 4 x 2
new_ID ID_final
<dbl> <dbl>
1 4 4
2 5 5
3 8 1
4 10 2
Then you can just do a
# Attach ID_final to every row by matching on new_ID. dplyr has no join()
# verb; left_join() keeps every row of df, and the key is given explicitly.
df %>% left_join(final_id_map, by = "new_ID")
to produce the desired output.
I have a data frame df which looks like this
> g <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6)
> m <- c(1, NA, NA, NA, 3, NA, 2, 1, 3, NA, 3, NA, NA, 4, NA, NA, NA, 2, 1, NA, 7, 3, NA, 1)
> df <- data.frame(g, m)
where g is the category (1 to 6) and m are values in that category.
I've managed to find the number of non-NA values per category with:
aggregate(m ~ g, data=df, function(x) {sum(!is.na(x))}, na.action = NULL)
g m
1 1 1
2 2 3
3 3 2
4 4 1
5 5 2
6 6 3
and would now like to eliminate the rows (categories) where the number of non-NA values is 1 and only keep those where the number of non-NA values is 2 or more.
the desired outcome would be
g m
5 2 3
6 2 NA
7 2 2
8 2 1
9 3 3
10 3 NA
11 3 3
12 3 NA
17 5 NA
18 5 2
19 5 1
20 5 NA
21 6 7
22 6 3
23 6 NA
24 6 1
every g=1 and g=4 is eliminated because as shown there is only 1 none-NA in each of those categories
any suggestions :)?
If you want base R, then I suggest you use your aggregation:
df2 <- aggregate(m ~ g, data=df, function(x) {sum(!is.na(x))}, na.action = NULL)
df[ ! df$g %in% df2$g[df2$m < 2], ]
# g m
# 5 2 3
# 6 2 NA
# 7 2 2
# 8 2 1
# 9 3 3
# 10 3 NA
# 11 3 3
# 12 3 NA
# 17 5 NA
# 18 5 2
# 19 5 1
# 20 5 NA
# 21 6 7
# 22 6 3
# 23 6 NA
# 24 6 1
If you want to use dplyr, perhaps
library(dplyr)
group_by(df, g) %>%
filter(sum(!is.na(m)) > 1) %>%
ungroup()
# # A tibble: 16 × 2
# g m
# <dbl> <dbl>
# 1 2 3
# 2 2 NA
# 3 2 2
# 4 2 1
# 5 3 3
# 6 3 NA
# 7 3 3
# 8 3 NA
# 9 5 NA
# 10 5 2
# 11 5 1
# 12 5 NA
# 13 6 7
# 14 6 3
# 15 6 NA
# 16 6 1
One can try a dplyr based solution. group_by on g will help to get the desired count.
library(dplyr)
df %>% group_by(g) %>%
filter(!is.na(m)) %>%
filter(n() >=2) %>%
summarise(count = n())
#Result
# # A tibble: 4 x 2
# g count
# <dbl> <int>
# 1 2.00 3
# 2 3.00 2
# 3 5.00 2
# 4 6.00 3