find value furthest to the right in a table r - r

Let's say I've got some data:
data <- tibble(A = c("a", "b", "c", "d"),
B = c("e", "f", "g", NA_character_),
C = c("h", "i", NA_character_, NA_character_))
Which looks like this:
# A tibble: 4 x 3
A B C
<chr> <chr> <chr>
1 a e h
2 b f i
3 c g NA
4 d NA NA
What I'd like to do is get the value that's furthest to the right into a new column:
# A tibble: 4 x 4
A B C D
<chr> <chr> <chr> <chr>
1 a e h h
2 b f i i
3 c g NA g
4 d NA NA d
I know I could do it with case_when and a bunch of logical !is.na(A) ~ A, statements, but say I've got a load of columns and that's not feasible. I feel like there probably is an easy way that I just don't know about and haven't been able to find. Thanks

coalesce would be more easier
library(dplyr)
data %>%
mutate(D = coalesce(C, B, A))
-output
# A tibble: 4 x 4
# A B C D
# <chr> <chr> <chr> <chr>
#1 a e h h
#2 b f i i
#3 c g <NA> g
#4 d <NA> <NA> d
Or if there are many column, rev the column names, convert to symbols and evaluate (!!!)
data %>%
mutate(D = coalesce(!!! rlang::syms(rev(names(.)))))

Related

Is there a R function for conditional values across different columns?

Suppose you have a dataframe that looks something like this:
df <- tibble(PatientID = c(1,2,3,4,5),
Treat1 = c("R", "O", "C", "O", "C"),
Treat2 = c("O", "R", "R", NA, "O"),
Treat3 = c("C", NA, "O", NA, "R"),
Treat4 = c("H", NA, "H", NA, "H"),
Treat5 = c("H", NA, NA, NA, "H"))
Treat 1:Treat5 are different treatments that a patient has had. I'm looking to create a new variable "Chemo" with 1 for yes, 0 for no based on whether a patient has had treatment "C".
I've been using if_else(), but as I have 10 different treatment variables in my actual dataset, and I would like to create such a column per treatment, i wonder if I can do it without writing such long if statements. Is there an easier way to do this?
Use if_any to loop over the columns that starts_with 'Treat', create a logical vector with %in% - if_any returns TRUE/FALSE if any of the columns selected have 'C' for a particular row, the logical is converted to binary with + (or as.integer)
library(dplyr)
df <- df %>%
mutate(Chemo = +(if_any(starts_with("Treat"), ~ .x %in% "C")))
-output
df
# A tibble: 5 × 7
PatientID Treat1 Treat2 Treat3 Treat4 Treat5 Chemo
<dbl> <chr> <chr> <chr> <chr> <chr> <int>
1 1 R O C H H 1
2 2 O R <NA> <NA> <NA> 0
3 3 C R O H <NA> 1
4 4 O <NA> <NA> <NA> <NA> 0
5 5 C O R H H 1
Or using base R with rowSums
df$Chemo <- +(rowSums(df[startsWith(names(df), "Treat")] == "C",
na.rm = TRUE) > 0)
Another option using str_detect and any to determine if C occurs in any of the Treat columns for each row. The + converts the logical to an integer.
library(tidyverse)
df %>%
rowwise() %>%
mutate(Chemo = +any(str_detect(c_across(starts_with("Treat")), "C"), na.rm = TRUE)) %>%
ungroup
Output
PatientID Treat1 Treat2 Treat3 Treat4 Treat5 Chemo
<dbl> <chr> <chr> <chr> <chr> <chr> <int>
1 1 R O C H H 1
2 2 O R NA NA NA 0
3 3 C R O H NA 1
4 4 O NA NA NA NA 0
5 5 C O R H H 1
An alternative dplyr way:
library(dplyr)
df %>%
mutate(across(starts_with("Treat"), ~case_when(.=="C" ~1,
TRUE ~0), .names = 'new_{col}')) %>%
mutate(Chemo = rowSums(select(., starts_with("new")))) %>%
select(-starts_with("new"))
PatientID Treat1 Treat2 Treat3 Treat4 Treat5 Chemo
<dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
1 1 R O C H H 1
2 2 O R NA NA NA 0
3 3 C R O H NA 1
4 4 O NA NA NA NA 0
5 5 C O R H H 1

Adding new column names to use in spread()

Big picture: I'm trying to set up an export that has one route as a row and columns for each value.
This code: I'm trying to select the top three transfers for each route (using slice(1:3) because I need no more than three values. top_n() allows for ties). Then, I'm trying to spread() to create 6 columns: a name and a pct for each.
If I were to spread the data right now, the names would become columns, but I need to keep the names in the rows (see Desired Output). I want to create the column names as a key column to use to spread(). My approach is creating an error. I'm having trouble thinking of another strategy.
Data frame:
# A tibble: 7 x 3
route_shortname transfer_to pct
<chr> <chr> <dbl>
1 A D 0.5
2 A E 0.5
3 B F 0.667
4 B G 0.333
5 C D 0.111
6 C E 0.111
7 C G 0.111
Desired output:
# A tibble: 3 x 7
route_shortname transfer1 transfer1_pct transfer2 transfer2_pct transfer3 transfer3_pct
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 A D 0.5 E 0.5 NA NA
2 B F 0.667 G 0.333 NA NA
3 C D 0.111 E 0.111 G 0.111
Reprex:
library(tidyverse)
sample_data <- tibble::tribble(
~route_shortname, ~transfer_to, ~pct,
"A", "D", 0.5,
"A", "E", 0.5,
"B", "F", 0.666666666666667,
"B", "G", 0.333333333333333,
"C", "D", 0.111111111111111,
"C", "E", 0.111111111111111,
"C", "G", 0.111111111111111
)
transfer_to_table <- sample_data %>%
group_by(route_shortname) %>%
mutate(key = c("transfer1", "transfer2", "transfer3"))
#> Error in mutate_impl(.data, dots): Column `key` must be length 2 (the group size) or one, not 3
df = read.table(text = "
route_shortname transfer_to pct
1 A D 0.5
2 A E 0.5
3 B F 0.667
4 B G 0.333
5 C D 0.111
6 C E 0.111
7 C G 0.111
", header=T)
library(tidyverse)
df %>%
group_by(route_shortname) %>%
mutate(id = paste0("transfer", row_number())) %>%
ungroup() %>%
unite(v, transfer_to, pct) %>%
spread(id, v) %>%
separate(transfer1, c("transfer1","transfer1_pct"), sep = "_", convert = T) %>%
separate(transfer2, c("transfer2","transfer2_pct"), sep = "_", convert = T) %>%
separate(transfer3, c("transfer3","transfer3_pct"), sep = "_", convert = T)
# route_shortname transfer1 transfer1_pct transfer2 transfer2_pct transfer3 transfer3_pct
# <fct> <chr> <dbl> <chr> <dbl> <chr> <dbl>
# 1 A D 0.5 E 0.5 NA NA
# 2 B F 0.667 G 0.333 NA NA
# 3 C D 0.111 E 0.111 G 0.111
Though you tagged this question with tidyverse packages, here is an option using dcast from data.table which let's you do the reshaping in one (admittedly long) line.
library(data.table)
setDT(sample_data)
dcast(sample_data, route_shortname ~ rowid(route_shortname), value.var = c('transfer_to', 'pct'))
# route_shortname transfer_to_1 transfer_to_2 transfer_to_3 pct_1 pct_2 pct_3
#1: A D E <NA> 0.5000000 0.5000000 NA
#2: B F G <NA> 0.6666667 0.3333333 NA
#3: C D E G 0.1111111 0.1111111 0.1111111
You could also use reshape from base R
sample_data <- as.data.frame(sample_data) # does not work with tibbles for some reason
sample_data$idx <- with(sample_data,
ave(route_shortname, route_shortname, FUN = seq_along))
reshape(sample_data, idvar = "route_shortname", timevar = "idx", direction = "wide", sep = "_")
# route_shortname transfer_to_1 pct_1 transfer_to_2 pct_2 transfer_to_3 pct_3
#1 A D 0.5000000 E 0.5000000 <NA> NA
#3 B F 0.6666667 G 0.3333333 <NA> NA
#5 C D 0.1111111 E 0.1111111 G 0.1111111
In both cases you'd need to rename columns but I that shouldn't be too hard.

Writing a Function to Change Certain Columns to Lowercase

I'm trying to write a function that will allow me to change the case of certain fields in my data frame to lowercase. I'm trying to do this by using the function, for, and tolower commands, but I'm not having any luck. I'm still fairly new to R, so I might be missing something obvious. I would appreciate any help anyone can provide.
standardize_lowercase <- function(df, objs) {
for(i in 1:length(objs)) {
df[i] <- tolower(df[i])
}
}
I'm using df to refer to my main data frame, and objs would be a character vector with the names of the fields from the data frame I would like to convert to lowercase.
We can use the dplyr package as follows. Provide the column names as a string and tolower to the mutate_at function.
library(dplyr)
# Create example data frame
dat <- data_frame(A = c("A", "B", "C"),
B = c("A", "B", "C"),
C = c("A", "B", "C"),
D = c("A", "B", "C"),
E = c("A", "B", "C"))
# Assuming that we want to change the column B, C, E to lower case
obj <- c("B", "C", "E")
dat2 <- dat %>%
mutate_at(vars(obj), funs(tolower(.)))
dat2
# # A tibble: 3 x 5
# A B C D E
# <chr> <chr> <chr> <chr> <chr>
# 1 A a a A a
# 2 B b b B b
# 3 C c c C c
Or here is a base R solution using lapply.
dat[obj] <- lapply(dat[obj], tolower)
dat
# # A tibble: 3 x 5
# A B C D E
# <chr> <chr> <chr> <chr> <chr>
# 1 A a a A a
# 2 B b b B b
# 3 C c c C c
Here is an example to convert the second option to a function.
dat_tolower <- function(data, target){
data[target] <- lapply(data[target], tolower)
return(data)
}
dat_tolower(dat, target = obj)
# # A tibble: 3 x 5
# A B C D E
# <chr> <chr> <chr> <chr> <chr>
# 1 A a a A a
# 2 B b b B b
# 3 C c c C c

rstudio dplyr group _by multiple column

In Rstudio, I have a dataframe which contains 4 columns and I need to get the list of every different triplet of the 3 first columns sorted decreasingly by the sum on the 4th column. For example, with:
A B C 2
D E F 5
A B C 4
G H I 5
D E F 3
I need as a result:
D E F 8
A B C 6
G H I 5
I've tried the following different approach but I can't manage to have exactly the result I need:
df_list<-df_raw_data %>%
group_by(param1, param2, param3) %>%
summarise_all(total = sum(param4))
arrange(df_list, desc(total))
and:
df_list<-unique(df_raw_data[, c('param1', 'param2', 'param3')])
cbind(df_list, total)
for(i in 1:nrow(df_raw_data))
{
filter ???????????
}
I would prefer to use the dplyr package since it's a more elegant solution.
EDIT: Okay, thanks for your working answers. I think that I've lost some time figuring out that the plyr package shouldn't be loaded after dplyr...
We can use group_by_at to select the columns to group.
library(dplyr)
dat2 <- dat %>%
group_by_at(vars(-V4)) %>%
summarise(V4 = sum(V4)) %>%
ungroup()
dat2
# # A tibble: 3 x 4
# V1 V2 V3 V4
# <chr> <chr> <chr> <int>
# 1 A B C 6
# 2 D E F 8
# 3 G H I 5
Or use group_by_if to select columns to group based on column types.
dat2 <- dat %>%
group_by_if(is.character) %>%
summarise(V4 = sum(V4)) %>%
ungroup()
dat2
# # A tibble: 3 x 4
# V1 V2 V3 V4
# <chr> <chr> <chr> <int>
# 1 A B C 6
# 2 D E F 8
# 3 G H I 5
DATA
dat <- read.table(text = "A B C 2
D E F 5
A B C 4
G H I 5
D E F 3",
header = FALSE, stringsAsFactors = FALSE)
Would this be what you are looking for?
df <- data_frame(var1 = c("A", "D", "A", "G", "D"),
var2 = c("B", "E", "B", "H", "E"),
var3 = c("C", "F", "C", "I", "F"),
var4 = c(2, 5, 4, 5, 3))
df %>% group_by(var1, var2, var3) %>%
summarise(sum = sum(var4)) %>%
arrange(desc(sum))

Can one separate column into several columns starting from the end of the line?

I wonder if there is some secret argument that would allow to apply separate from the end of the line? Some magic_argument?
The desired output would be as follows:
library(dplyr)
df <- data.frame(x = c(NA, "a.b.b", "a.b.d", "b.c"))
df %>% separate(x, c("A", "B"), magic_argument = TRUE)
#> A B
#> 1 <NA> <NA>
#> 2 a.b b
#> 3 a.b d
#> 4 b c
Try:
df %>% separate(x, c("A", "B"), sep="\\.(?=[^\\.]+$)")
# A B
#1 <NA> <NA>
#2 a.b b
#3 a.b d
#4 b c

Resources