variable based on other variables in R - r

I have a df like this
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5))
I want to create a new column (NEW) which says BLUE or RED based on columns b2 and b3. so, if column b2 is Greater than or equal to 100 0R b3 is Greater than or equal to 75, then input BLUE otherwise input RED.
So that I will have something like this:
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5),
NEW = c("BLUE", "BLUE", "BLUE", "BLUE", "RED", "RED", "RED", "RED", "BLUE"))
I have been able to work this out using this:
library (tidyverse)
greater_threshold <- 99.9
greater_threshold1 <- 74.9
my_df1 <- my_df %>%
mutate(NEW = case_when(b2 > greater_threshold ~ "BLUE",
b3 > greater_threshold1 ~ "BLUE",
+ T~"RED"))
At the moment, you can see that I am setting my 'greater threshold' to be slightly less than the required value. Although it works well. My question is this. Is there a way I set set my 'greater threshold to be ≥ 100 for b2 and ≥ 75 for b3.

For this example, I'd go whit if_else instead of case_when:
library(dplyr)
greater_threshold <- 100
greater_threshold1 <- 75
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5)
)
my_df1 <- my_df %>%
mutate(
NEW = if_else(
b2 >= greater_threshold | b3 >= greater_threshold1,
"BLUE",
"RED"
)
)
my_df1
# b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 NEW
# 1 2 100 75 NA 2 9 1 NA 4 14 BLUE
# 2 6 4 79 6 12 2 3 8 5 2 BLUE
# 3 3 106 8 NA 1 4 7 4 7 4 BLUE
# 4 6 102 0 10 7 6 7 5 9 2 BLUE
# 5 4 6 2 12 8 7 4 1 5 1 RED
# 6 2 6 3 8 5 6 2 4 1 1 RED
# 7 1 1 9 3 5 6 2 1 1 1 RED
# 8 9 1 5 6 6 7 9 3 2 1 RED
# 9 NA 7 80 2 NA 9 5 6 NA 5 BLUE

Related

Removing a row based on a condition

my_df <- tibble(
b1 = c(2, 1, 1, 2, 2, 2, 1, 1, 2),
b2 = c(NA, 4, 6, 2, 6, 6, 1, 1, 7),
b3 = c(5, 9, 8, NA, 2, 3, 9, 5, NA),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, NA, 9, 5, 1, 1, 2, NA),
b10 = c(14, 3, NA, 2, 2, 2, 3, NA, 5))
I have a df like this, and would like to tell R to remove all '3' or 'NA' in b10 if b1 = 1. I have tried this with this, but it seems to keep the '3' and 'NA' instead of removing them;
new_df <- my_df %>% filter(is.na(b10) | b10 == 3 | b1==1 & b10 ==NA)
This should do the trick:
library(dplyr)
my_df %>%
filter(!(b1 == 1 & (is.na(b10) | b10 == 3)))
Edit: assuming you want to remove the rows where the conditions meet

How to sort each column of a df in descending order regarless of the row order?

I am trying to sort my data in descending or ascending order regardless of the data in the rows. I made a dummy example below:
A <- c(9,9,5,4,6,3,2,NA)
B <- c(9,5,3,4,1,4,NA,NA)
C <- c(1,4,5,6,7,4,2,4)
base <- data.frame(A,B,C)
df <- base
df$A <- sort(df$A,na.last = T)
df$B <- sort(df$B,na.last = T)
df$C <- sort(df$C)
We get this
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(1,
2, 3, 4, 4, 4, 5, 5, 9, 10, NA, NA), C = c(1, 2, 3, 4, 4, 4,
5, 5, 6, 7, 8, 8)), row.names = c(NA, -12L), class = "data.frame")
I want to get something similar to df but my data have hundreds of columns, is there an easier way to do it?
I tried arrange_all() but the result is not what i want.
library(tidyverse)
test <- base%>%
arrange_all()
Obtaining this:
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(NA,
2, 4, 4, 5, 10, 3, 4, 1, 5, 9, NA), C = c(2, 3, 4, 6, 8, 5, 5,
8, 7, 4, 1, 4)), class = "data.frame", row.names = c(NA, -12L
))
You can sort each column individually :
library(dplyr)
base %>% mutate(across(.fns = sort, na.last = TRUE))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R :
base[] <- lapply(base, sort, na.last = TRUE)

Reducing dataframe if first 4 columns match

I have a dataframe that looks like this
> head(printing_id_map_unique_frames)
# A tibble: 6 x 5
# Groups: frame_number [6]
X1 X2 X3 row_in_frame frame_number
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 15 1
2 1 2 3 15 2
3 1 2 3 15 3
4 1 2 3 15 4
5 1 2 3 15 5
6 1 2 3 15 6
As you can see, X1,X2,X3, row_in_frame is identical
However, eventually you get to a
X1 X2 X3 row_in_frame frame_number
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 15 32
2 1 2 3 15 33
3 1 2 3 5 34**
4 1 4 5 15 35
5 1 4 5 15 36
What I would like to do is essentially compute a dataframe that looks like:
X1 X2 X3 row_in_frame num_duplicates
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 15 33
2 1 2 3 5 1
...
Essentially, what I want is to "collapse" over identical first 4 columns and count how many rows of that type there are in the "num_duplicates" column.
Is there a nice way to do this in dplyr without a messy for loop that tracks a count and if there is a change.
Below please find a full data structure via dput:
> dput(printing_id_map_unique_frames)
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), X2 = c(2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
), X3 = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5), row_in_frame = c(15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 5, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 5
), frame_number = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
62, 63, 64, 65, 66, 67, 68)), row.names = c(NA, -68L), class = c("tbl_df",
"tbl", "data.frame"))
Here is one option with count
library(dplyr) # 1.0.0
df1 %>%
count(!!! rlang::syms(names(.)[1:4]))
Or specify the unquoted column names
df1 %>%
count(X1, X2, X3, row_in_frame)
If we don't want to change the order, an option is to convert the first 4 columns to factor with levels specified as the unique values (which is the same as the order of occurrence of values) and then apply the count
df1 %>%
mutate(across(1:4, ~ factor(.x, levels = unique(.x)))) %>%
count(!!! rlang::syms(names(.)[1:4])) %>%
type.convert(as.is = TRUE)
# A tibble: 4 x 5
# X1 X2 X3 row_in_frame n
# <int> <int> <int> <int> <int>
#1 1 2 3 15 33
#2 1 2 3 5 1
#3 1 4 5 15 33
#4 1 4 5 5 1

R Tidyverse expand a dataframe with all combinations of two variables (edgelist)

This code generates a data frame just so:
library(tidyverse)
A = c(7, 4, 3, 12, 6)
B = c(1, 10, 9, 8, 5)
C = c(5, 3, 1, 7, 6)
df <- data_frame(A, B, C) %>% gather(letter1, rank)
nested <- df %>% group_by(letter1) %>% nest(ranks = c(rank))
nested
A grouped_df: 3 × 2
letter1 ranks
<chr> <list>
A 7, 4, 3, 12, 6
B 1, 10, 9, 8, 5
C 5, 3, 1, 7, 6
This is the desired data frame:
A tibble: 9 × 4
letter1 letter2 data1 data2
<chr> <chr> <list> <list>
A A 7, 4, 3, 12, 6 7, 4, 3, 12, 6
B A 1, 10, 9, 8, 5 7, 4, 3, 12, 6
C A 5, 3, 1, 7, 6 7, 4, 3, 12, 6
A B 7, 4, 3, 12, 6 1, 10, 9, 8, 5
B B 1, 10, 9, 8, 5 1, 10, 9, 8, 5
C B 5, 3, 1, 7, 6 1, 10, 9, 8, 5
A C 7, 4, 3, 12, 6 5, 3, 1, 7, 6
B C 1, 10, 9, 8, 5 5, 3, 1, 7, 6
C C 5, 3, 1, 7, 6 5, 3, 1, 7, 6
Once this step is solved, I'll run a mutate using data1 and data2 to get value, and then selecting letter1, letter2 and value will give an edgelist. I'm working with about 700 letters and the ranks lists will all be the same size and contain about 20 elements.
I'd expected to be able to use expand or expand.grid, but to no avail. Any tidyverse suggestions will be greatly appreciated.
crossing can be used
library(tidyr)
library(purrr)
library(dplyr)
crossing(ind1 = seq_len(nrow(nested)),
ind2 = seq_len(nrow(nested))) %>%
pmap_dfr(~ bind_cols(nested[..1,], nested[..2,]) )
We can use crossing after renaming the second dataframe.
tidyr::crossing(nested, setNames(nested, c('letter2', 'rank2')))
# letter1 ranks letter2 rank2
#1 A 7, 4, 3, 12, 6 A 7, 4, 3, 12, 6
#2 A 7, 4, 3, 12, 6 B 1, 10, 9, 8, 5
#3 A 7, 4, 3, 12, 6 C 5, 3, 1, 7, 6
#4 B 1, 10, 9, 8, 5 A 7, 4, 3, 12, 6
#5 B 1, 10, 9, 8, 5 B 1, 10, 9, 8, 5
#6 B 1, 10, 9, 8, 5 C 5, 3, 1, 7, 6
#7 C 5, 3, 1, 7, 6 A 7, 4, 3, 12, 6
#8 C 5, 3, 1, 7, 6 B 1, 10, 9, 8, 5
#9 C 5, 3, 1, 7, 6 C 5, 3, 1, 7, 6
The same is also valid for expand_grid.
tidyr::expand_grid(nested, setNames(nested, c('letter2', 'rank2')))

Reorganize a List with Lists in a new list in R

This is my list:
mylist=list(list(a = c(2, 3, 4, 5), b = c(3, 4, 5, 5), c = c(3, 7, 5,
5), d = c(3, 4, 9, 5), e = c(3, 4, 5, 9), f = c(3, 4, 1, 9),
g = c(3, 1, 5, 9), h = c(3, 3, 5, 9), i = c(3, 17, 3, 9),
j = c(3, 17, 3, 9)), list(a = c(2, 5, 48, 4), b = c(7, 4,
5, 5), c = c(3, 7, 35, 5), d = c(3, 843, 9, 5), e = c(3, 43,
5, 9), f = c(3, 4, 31, 39), g = c(3, 1, 5, 9), h = c(3, 3, 5,
9), i = c(3, 17, 3, 9), j = c(3, 17, 3, 9)), list(a = c(2, 3,
4, 35), b = c(3, 34, 5, 5), c = c(3, 37, 5, 5), d = c(38, 4,
39, 5), e = c(3, 34, 5, 9), f = c(33, 4, 1, 9), g = c(3, 1, 5,
9), h = c(3, 3, 35, 9), i = c(3, 17, 33, 9), j = c(3, 137, 3,
9)), list(a = c(23, 3, 4, 85), b = c(3, 4, 53, 5), c = c(3, 7,
5, 5), d = c(3, 4, 9, 5), e = c(3, 4, 5, 9), f = c(3, 34, 1,
9), g = c(38, 1, 5, 9), h = c(3, 3, 5, 9), i = c(3, 137, 3, 9
), j = c(3, 17, 3, 9)), list(a = c(2, 3, 48, 5), b = c(3, 4,
5, 53), c = c(3, 73, 53, 5), d = c(3, 43, 9, 5), e = c(33, 4,
5, 9), f = c(33, 4, 13, 9), g = c(3, 81, 5, 9), h = c(3, 3, 5,
9), i = c(3, 137, 3, 9), j = c(3, 173, 3, 9)))
As you can see my list has 5 entries. Each entry has 10 others entries filled by 4 elements.
> mylist[[4]][[1]]
[1] 23 3 4 85
I want to create another list with only one entry.
All want to put all entr of tipe mylist[[i]][[1]] in first position of a new list: mynewlist[[1]][[1]] will be filled by the mylist[[1]][[1]],mylist[[2]][[1]],mylist[[3]][[1]],mylist[[4]][[1]],mylist[[5]][[1]] elements.
The secon position of mynewlist (mynewlist[[2]][[1]]) will be: mylist[[1]][[2]],mylist[[2]][[2]],mylist[[3]][[2]],mylist[[4]][[2]],mylist[[5]][[2]] elements.
Until
The fith position of mynewlist (mynewlist[[5]][[1]]) will be: mylist[[1]][[5]],mylist[[2]][[5]],mylist[[3]][[5]],mylist[[4]][[5]],mylist[[5]][[5]] elements.
In other words, I want to put every mylist[[i]][[1]]$a in the mynewlist[[1]][[1]] position; the mylist[[i]][[1]]$b in the mynewlist[[1]][[2]] position and so on until mylist[[i]][[1]]$j in the mynewlist[[1]][[10]]
This should be my output for the first position of mynewlist:
#[[1]]
#[1] 2 3 4 5
2 5 48 4
2 3 4 35
23 3 4 85
2 3 48 5
Any help?
We can use transpose
library(dplyr)
out <- mylist %>%
purrr::transpose(.)
out[[1]]
#[[1]]
#[1] 2 3 4 5
#[[2]]
#[1] 2 5 48 4
#[[3]]
#[1] 2 3 4 35
#[[4]]
#[1] 23 3 4 85
#[[5]]
#[1] 2 3 48 5

Resources