I have a large dataset with multiple columns of the following structure
A B
1. 1. D1
2. 1. D2
3. 2 D2
4. 3. D1
5. 3. D2
I'm trying to create a new data frame based on unique observations in column A, with a dummy variable "Dummy" coded as 1=D1, 2=D2, 3=both, like so:
A. Dummy
1. 1. 3
2. 2. 2
3. 3. 3
Any idea how I can go about this?
You can use aggregate.
aggregate(B ~ A, df, function(x) if(all(x == "D1")) 1 else if(all(x == "D2")) 2 else 3)
# A B
# 1 1 3
# 2 2 2
# 3 3 3
Another possible solution:
# Collapse each group's distinct B values into one sorted key so that
# repeated rows (e.g. "D1", "D1") or a different row order ("D2", "D1")
# still map to the intended code instead of falling through to NA.
df %>%
  group_by(A) %>%
  summarise(B = paste0(sort(unique(B)), collapse = "_")) %>%
  mutate(Dummy = case_when(
    B == "D1" ~ 1,
    B == "D2" ~ 2,
    B == "D1_D2" ~ 3,
    TRUE ~ NA_real_ # any other combination of values is left uncoded
  )) %>%
  select(-B)
Result
# A tibble: 3 x 2
A Dummy
<dbl> <dbl>
1 1 3
2 2 2
3 3 3
Here is an option with dplyr. After grouping by 'A', if the number of distinct elements is greater than 1, return 3; otherwise use a named vector to match the first element of 'B'.
library(dplyr)
df1 %>%
group_by(A) %>%
summarise(Dummy = if(n_distinct(B) > 1) 3L else
setNames(1:2, c("D1", "D2"))[first(B)])
# A tibble: 3 x 2
# A Dummy
#* <dbl> <int>
#1 1 3
#2 2 2
#3 3 3
data
df1 <- structure(list(A = c(1, 1, 2, 3, 3), B = c("D1", "D2", "D2",
"D1", "D2")), class = "data.frame", row.names = c("1.", "2.",
"3.", "4.", "5."))
Related
let's assume I have this simple dataframe:
df <- tibble(a = c(1, 1), b = c(2, 2))
I now want to know how to use a group_by inside a pipeline that depends on a variable. Something like,
flag <- T
resulting <- df %>%
filter(a > 0 & b >0) %>%
group_by(ifelse(flag), yes = c(a), no = c(a, b))
That is, if flag == T, then I want to group only on column a. If flag is false I want to group on both columns.
I think this worked for me
# Use the literal TRUE: `T` is an ordinary variable that can be reassigned.
flag <- TRUE
resulting <- df %>%
  filter(a > 0 & b > 0) %>%
  {
    # Choose the grouping columns from the flag; `.` is the piped data frame.
    if (flag) group_by(., a) else group_by(., a, b)
  }
resulting
# A tibble: 2 × 2
# Groups: a [1] # <======== here grouped by a
a b
<dbl> <dbl>
1 1 2
2 1 2
by changing the flag
# Use the literal FALSE: `F` is an ordinary variable that can be reassigned.
flag <- FALSE
resulting <- df %>%
  filter(a > 0 & b > 0) %>%
  {
    # Choose the grouping columns from the flag; `.` is the piped data frame.
    if (flag) group_by(., a) else group_by(., a, b)
  }
resulting
# A tibble: 2 × 2
# Groups: a, b [1] # <======== here grouped by a ,b
a b
<dbl> <dbl>
1 1 2
2 1 2
I would like to convert data frame df1 into data frame df2.
id <- c(1,2,3)
outcome_1 <- c(1,0,1)
outcome_2 <- c(1,1,0)
df1 <- data.frame(id,outcome_1,outcome_2)
id <- c(1,2,3)
outcome <- c("1,2","2","1")
df2 <- data.frame(id,outcome)
The answers to the following question almost do what I want, but in my case a row can have more than one positive outcome (e.g. first row needs to be "1,2"). Also, I would like the resulting column to be a character column.
R: Converting multiple binary columns into one factor variable whose factors are binary column names
Please kindly help. Thank you.
Subset the substrings of the outcomes with their binary values coerced as.logical.
apply(df1[-1], 1, \(x) toString(substring(names(df1)[-1], 9)[as.logical(x)]))
# [1] "1, 2" "2" "1"
or
apply(df1[-1], 1, \(x) paste(substring(names(df1)[-1], 9)[as.logical(x)], collapse=','))
# [1] "1,2" "2" "1"
Using the first method:
cbind(df1[1], outcome=apply(df1[-1], 1, \(x) toString(substring(names(df1)[-1], 9)[as.logical(x)])))
# id outcome
# 1 1 1, 2
# 2 2 2
# 3 3 1
If you want a nested list you may use list2DF.
l <- list2DF(c(df1[1],
outcome=list(apply(df1[-1], 1, \(x)
as.numeric(substring(names(df1)[-1], 9))[as.logical(x)]))))
l
# id outcome
# 1 1 1, 2
# 2 2 2
# 3 3 1
where
str(l)
# 'data.frame': 3 obs. of 2 variables:
# $ id : num 1 2 3
# $ outcome:List of 3
# ..$ : num 1 2
# ..$ : num 2
# ..$ : num 1
Data:
df1 <- structure(list(id = c(1, 2, 3), outcome_1 = c(1, 0, 1), outcome_2 = c(1,
1, 0)), class = "data.frame", row.names = c(NA, -3L))
Here is one more tidyverse approach:
library(dplyr)
library(tidyr)
df1 %>%
mutate(across(-id, ~case_when(. == 1 ~ cur_column()), .names = 'new_{col}'), .keep="unused") %>%
unite(outcome, starts_with('new'), na.rm = TRUE, sep = ', ') %>%
mutate(outcome = gsub('outcome_', '', outcome))
id outcome
1 1 1, 2
2 2 2
3 3 1
How many outcome_ columns are there? If just 2, this will work fine.
library(dplyr)
df1 %>%
rowwise() %>%
summarise(id = id,
outcome = paste(which(c(outcome_1,outcome_2)==1), collapse =","))
# A tibble: 3 x 2
id outcome
<dbl> <chr>
1 1 1,2
2 2 2
3 3 1
If there are more than 2, try this:
df1 %>%
rowwise() %>%
summarise(id=id,
outcome = paste(which(c_across(-id)== 1), collapse =","))
Another possible solution, based on dplyr and purrr::pmap:
library(tidyverse)
# pmap_chr() returns a character vector, so `outcome` becomes a character
# column; plain pmap() would produce a list column that merely prints alike.
# ..2 and ..3 are the second and third columns of df1 (the two indicators).
df1 %>%
  transmute(id, outcome = pmap_chr(., ~ c(1*..2, 2*..3) %>% .[. != 0] %>% toString))
#> id outcome
#> 1 1 1, 2
#> 2 2 2
#> 3 3 1
Or simply:
library(tidyverse)
pmap_dfr(df1, ~ data.frame(id = ..1, outcome = c(1*..2, 2*..3) %>% .[. != 0]
%>% toString))
#> id outcome
#> 1 1 1, 2
#> 2 2 2
#> 3 3 1
# Logical index of the indicator columns (names containing "outcome").
outcome_col_idx <- grepl("outcome", colnames(df1))
cbind(
  # Keep the remaining columns as a data frame.
  df1[, !outcome_col_idx, drop = FALSE],
  outcome = apply(
    df1[, outcome_col_idx, drop = FALSE],
    1,
    function(x) {
      # Columns that are set (non-missing and non-zero) in this row.
      is_set <- !is.na(x) & x != 0
      # Strip the prefix and join into one character string such as "1, 2".
      # The result is left as character rather than wrapped in a factor.
      toString(gsub("outcome_", "", names(x)[is_set]))
    }
  )
)
I have a table with ID and other columns. I want to group the data by Ids and get the unique values of all columns.
from above table group by ID and get unique(Alt1, Alt2, Alt3)
Result should be in vector form
A -> 1,2,3,5
B ->1,3,4,5,7
We can get data in long format and for each ID make a list of unique values.
library(dplyr)
library(tidyr)
df1 <- df %>%
pivot_longer(cols = -ID) %>%
group_by(ID) %>%
summarise(value = list(unique(value))) %>%
unnest(value)
df1
# ID value
# <fct> <dbl>
# 1 A 1
# 2 A 3
# 3 A 2
# 4 A 5
# 5 B 1
# 6 B 4
# 7 B 5
# 8 B 3
# 9 B 6
#10 B 7
We can store it as a list if needed using split.
split(df1$value, df1$ID)
#$A
#[1] 1 3 2 5
#$B
#[1] 1 4 5 3 6 7
data.table equivalent of the above would be :
# Package names are case-sensitive: the package is `data.table`.
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Reshape to long format, then collect the unique values per ID in a list column.
df2 <- melt(df, id.vars = "ID")[, .(value = list(unique(value))), by = ID]
unique values are present in df2$value as a vector.
data
df <- data.frame(ID = c('A', 'A', 'B', 'B'),
Alt1 = c(1, 2, 1, 3),
Alt2 = c(3, 5, 4, 6),
Alt3 = c(1, 3, 5, 7))
I have a Data Frame with a variable with different values for another variable.
Like this:
DataFrame
So, I need a subset where the value of S contains all the possible values of B. In this example, the subset consists of S = a and S = b:
Subset
Any idea? Thanks!!
An option would be to group by 'S' and filter the rows having all the unique values of the column 'B' %in% 'B'
library(dplyr)
un1 <- unique(df1$B)
df1 %>%
group_by(S) %>%
filter(all(un1 %in% B))
# A tibble: 8 x 2
# Groups: S [2]
# S B
# <fct> <dbl>
#1 a 1
#2 a 2
#3 a 3
#4 a 4
#5 d 1
#6 d 2
#7 d 3
#8 d 4
Or with data.table
library(data.table)
setDT(df1)[, .SD[all(un1 %in% B)], S]
Or using base R
df1[with(df1, ave(B, S, FUN = function(x) all(un1 %in% x)) == 1),]
data
df1 <- data.frame(S = rep(letters[1:4], c(4, 3, 2, 4)),
B = c(1:4, c(1, 3, 4), 1:2, 1:4))
I am newish to R and having trouble with a for loop over unique values.
with the df:
id = c(1,2,2,3,3,4)
rank = c(1,2,1,3,3,4)
df = data.frame(id, rank)
I run:
df$dg <- logical(6)
for(i in unique(df$id)){
ifelse(!unique(df$rank), df$dg ==T, df$dg == F)
}
I am trying to mark the $dg variable as T providing that rank is different for each unique id and F if rank is the same within each id.
I am not getting any errors, but I am only getting F for all values of $dg even though I should be getting a mix.
I have also used the following loop with the same results:
for(i in unique(df$id)){
ifelse(length(unique(df$rank)), df$dg ==T, df$dg == F)
}
I have read other similar posts but the advice has not worked for my case.
From Comments:
I want to mark dg TRUE for all instances of an id if rank changed at all for a given id. I'm looking to say, for a given ID which has anywhere between 1 and 13 instances, mark dg TRUE if rank differs across instances.
Update: How to identify groups (ids) that only have one rank?
After clarification that OP provided this would be a solution for this particular case:
library(dplyr)
df %>%
group_by(id) %>%
mutate(dg = ifelse( length(unique(rank))>1 | n() == 1, T, F))
For another data set that also contains an id with both duplicated and non-duplicated ranks (presented below), this would be the output:
df2 %>%
group_by(id) %>%
mutate(dg = ifelse( length(unique(rank))>1 | n() == 1, T, F))
#:OUTPUT:
# Source: local data frame [9 x 3]
# Groups: id [5]
#
# # A tibble: 9 x 3
# id rank dg
# <dbl> <dbl> <lgl>
# 1 1 1 TRUE
# 2 2 2 TRUE
# 3 2 1 TRUE
# 4 3 3 FALSE
# 5 3 3 FALSE
# 6 4 4 TRUE
# 7 5 1 TRUE
# 8 5 1 TRUE
# 9 5 3 TRUE
Data-no-2:
df2 <- structure(list(id = c(1, 2, 2, 3, 3, 4, 5, 5, 5), rank = c(1, 2, 1, 3, 3, 4, 1, 1, 3
)), .Names = c("id", "rank"), row.names = c(NA, -9L), class = "data.frame")
How to identify duplicated rows within each group (id)?
You can use dplyr package:
library(dplyr)
df %>%
group_by(id, rank) %>%
mutate(dg = ifelse(n() > 1, F,T))
This will give you:
# Source: local data frame [6 x 3]
# Groups: id, rank [5]
#
# # A tibble: 6 x 3
# id rank dg
# <dbl> <dbl> <lgl>
# 1 1 1 TRUE
# 2 2 2 TRUE
# 3 2 1 TRUE
# 4 3 3 FALSE
# 5 3 3 FALSE
# 6 4 4 TRUE
Note: You can simply convert it back to a data.frame().
A data.table solution would be:
library(data.table)
dt <- as.data.table(df)
# Flag rows whose (id, rank) combination occurs exactly once. The comparison
# already yields TRUE/FALSE, so no ifelse() is needed, and `:=` adds the
# column by reference in a single step.
dt[, dg := .N == 1L, by = .(id, rank)]
Data:
df <- structure(list(id = c(1, 2, 2, 3, 3, 4), rank = c(1, 2, 1, 3,
3, 4)), .Names = c("id", "rank"), row.names = c(NA, -6L), class = "data.frame")
# > df
# id rank
# 1 1 1
# 2 2 2
# 3 2 1
# 4 3 3
# 5 3 3
# 6 4 4
N.B. Unless you want an identifier other than TRUE/FALSE, using ifelse() is redundant and computationally costly. #DavidArenburg