I have a dataframe that takes a similar form to the toy dataframe below. I would like to merge the rows if var1, var2, and var3 all have equal values, creating a combination of the data in the merged rows. For rows 4 - 6, where there are different values in the rows, I was wondering if there is a way to put them in the same column with a separator in between.
df <- data.frame(var1 = c("1635", "1635", "1729", "1847", "1847", "1847"),
var2 = c("Aa", "Aa", "Bb", "Cc", "Cc", "Cc"),
var3 = c("28", "28", "85", "27", "27", "27"),
var4 = c("apple", NA, "orange", "pear", NA, NA),
var5 = c(NA, "tree", NA, NA, "ground", "desk")
)
So the output would look something like this:
in base R you would do:
# Collapse every non-key column within each var1/var2/var3 combination into a
# comma-separated string of its distinct non-missing values.
# na.action = identity keeps rows containing NA so they reach FUN.
aggregate(
  . ~ var1 + var2 + var3,
  data = df,
  FUN = function(values) toString(unique(na.omit(values))),
  na.action = identity
)
var1 var2 var3 var4 var5
1 1847 Cc 27 pear ground, desk
2 1635 Aa 28 apple tree
3 1729 Bb 85 orange
in tidyverse:
library(tidyverse)
# For each var1/var2/var3 group, join the distinct non-missing values of var4
# and var5 with ", "; a group with no values yields an empty string.
df %>%
  group_by(var1, var2, var3) %>%
  summarize(
    across(var4:var5, function(values) toString(unique(na.omit(values)))),
    .groups = "drop"
  )
# A tibble: 3 × 5
var1 var2 var3 var4 var5
<chr> <chr> <chr> <chr> <chr>
1 1635 Aa 28 apple "tree"
2 1729 Bb 85 orange ""
3 1847 Cc 27 pear "ground, desk"
With dplyr, you can group_by the three columns, then use summarize to concatenate the strings if they are not NA.
library(dplyr)
# Collapse var4 and var5 within each (var1, var2, var3) group. A group whose
# values are all missing stays NA; otherwise its non-missing values are joined
# with ",".
df %>%
  group_by(var1, var2, var3) %>%
  summarize(
    across(
      var4:var5,
      # Plain if/else because the condition is a scalar; NA_character_ keeps
      # the column character even if every group turns out to be missing.
      ~ if (all(is.na(.x))) {
        NA_character_
      } else {
        paste(na.omit(.x), collapse = ",")
      }
    ),
    .groups = "drop"
  )
# A tibble: 3 × 5
var1 var2 var3 var4 var5
<chr> <chr> <chr> <chr> <chr>
1 1635 Aa 28 apple tree
2 1729 Bb 85 orange NA
3 1847 Cc 27 pear ground,desk
With data.table:
library(data.table)

# setDT() turns df into a data.table by reference so the grouped call below
# can use data.table syntax.
setDT(df)
# For each (var1, var2, var3) group, collapse every remaining column: NA when
# the group has no value at all, otherwise the non-missing values joined by "; ".
df[,
  lapply(
    .SD,
    \(x) if (all(is.na(x))) NA_character_ else paste(na.omit(x), collapse = "; ")
  ),
  by = var1:var3]
# setDF() converts df back to a plain data.frame, again by reference.
setDF(df)
# var1 var2 var3 var4 var5
# <char> <char> <char> <char> <char>
# 1: 1635 Aa 28 apple tree
# 2: 1729 Bb 85 orange <NA>
# 3: 1847 Cc 27 pear ground; desk
Related
I have a data frame of the following form:
a <- data.frame(list(X1=c("stn", "s1", "stn", "s2"),
X2=c("var1", "1", "var4", "2"),
X3=c("var2", "2", "var3", "3"),
X4=c("NA", "NA", "var2", "2")))
X1 X2 X3 X4
1 stn var1 var2 NA
2 s1 1 2 NA
3 stn var4 var3 var2
4 s2 2 3 2
How can I get the result:
b <- data.frame(list(stn=c("s1", "s2"),
var1=c(1, NA),
var2=c(2, 2),
var3=c(NA, 3),
var4=c(NA, 2)))
stn var1 var2 var3 var4
1 s1 1 2 NA NA
2 s2 NA 2 3 2
A (mostly) base R solution could be to split the data.frame, call janitor::row_to_names(1) on each piece, recombine the pieces, and remove the unwanted column using subset:
# Each record starts at a row whose X1 is "stn" (that row holds the record's
# column names), so a running count of those rows labels the records.
# The input is the question's data frame `a`.
do.call(
  dplyr::bind_rows,
  a |>
    split(cumsum(a$X1 == "stn")) |>
    # Promote the first row of every piece to its column names.
    lapply(\(piece) janitor::row_to_names(piece, row_number = 1))
) |>
  # The "NA" string in X4's header row becomes a column literally named NA;
  # remove it.
  subset(select = -`NA`)
stn var1 var2 var4 var3
1 s1 1 2 <NA> <NA>
2 s2 <NA> 2 2 3
A dplyr solution:
library(dplyr)
a %>%
  # A new record begins at every "stn" header row, so grouping on the running
  # count of those rows works for any number of records of any length.
  group_by(grp = cumsum(X1 == "stn")) %>%
  # Within each record, use the first row as column names for the remaining rows.
  group_map(~ setNames(.x[-1, ], unlist(.x[1, ], use.names = FALSE))) %>%
  bind_rows() %>%
  # The "NA" string in X4's header row becomes a column literally named NA.
  select(-`NA`) %>%
  mutate(across(var1:var3, as.numeric))
# # A tibble: 2 × 5
# stn var1 var2 var4 var3
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 1 2 NA NA
# 2 s2 NA 2 2 3
I'm close but don't have the syntax correct. I'm trying to select all columns of a data table based on selection of unique combinations of two variables (columns) based on the maximum value of a third. MWE of progress thus far. Thx. J
library(dplyr)
dt1 <- tibble (var1 = c("num1", "num2", "num3", "num4", "num5"),
var2 = rep("A", 5),
var3 = c(rep("B", 2), rep("C", 3)),
var4 = c(5, 10, 3, 7, 19))
dt1 %>% distinct(var2, var3, max(var4), .keep_all = TRUE)
# A tibble: 2 x 5
var1 var2 var3 var4 `max(var4)`
<chr> <chr> <chr> <dbl> <dbl>
1 num1 A B 5 19
2 num3 A C 3 19
which is close, but I want the row where the value of var4 is the max value, within the unique combination of var2 and var3. I'm attempting to get:
# A tibble: 2 x 5
var1 var2 var3 var4 `max(var4)`
<chr> <chr> <chr> <dbl> <dbl>
1 num2 A B 5 10
2 num5 A C 3 19
Do I need a formula for the third argument of the distinct function?
We can add an arrange statement before the distinct
library(dplyr)
# Sort so the largest var4 comes first within each var2/var3 combination, then
# keep the first row of every combination together with all its columns.
distinct(
  arrange(dt1, var2, var3, desc(var4)),
  var2, var3,
  .keep_all = TRUE
)
-output
# A tibble: 2 x 4
var1 var2 var3 var4
<chr> <chr> <chr> <dbl>
1 num2 A B 10
2 num5 A C 19
Or another option is slice_max
# Per var2/var3 group: remember the group's first var4 in var4new, then keep
# the row(s) with the largest var4.
dt1 %>%
  group_by(var2, var3) %>%
  mutate(var4new = first(var4)) %>%
  slice_max(var4, n = 1) %>%
  ungroup()
-output
# A tibble: 2 x 5
var1 var2 var3 var4 var4new
<chr> <chr> <chr> <dbl> <dbl>
1 num2 A B 10 5
2 num5 A C 19 3
slice() will do what you want, though you would drop the "var4" values 5 and 3 shown in your expected output (I'm not really sure whether that is important).
# Rebuild the example data, then keep the single row holding the largest var4
# in each var2/var3 group.
tibble(
  var1 = paste0("num", 1:5),
  var2 = rep("A", 5),
  var3 = rep(c("B", "C"), times = c(2, 3)),
  var4 = c(5, 10, 3, 7, 19)
) %>%
  group_by(var2, var3) %>%
  slice(which.max(var4)) %>%
  ungroup()
# A tibble: 2 x 4
var1 var2 var3 var4
<chr> <chr> <chr> <dbl>
1 num2 A B 10
2 num5 A C 19
Does this work:
library(dplyr)
# Rank var4 from largest to smallest inside each var2/var3 group and keep the
# top-ranked row(s); the result is still grouped by var2 and var3.
dt1 %>%
  group_by(var2, var3) %>%
  filter(dense_rank(desc(var4)) == 1)
# A tibble: 2 x 4
# Groups: var2, var3 [2]
var1 var2 var3 var4
<chr> <chr> <chr> <dbl>
1 num2 A B 10
2 num5 A C 19
I have a large dataset with duplicated values in the first column, like so:
ID date var1 var2
person1 052016 509 1678
person2 122016 301 NA
person1 072016 NA 45
I want to combine the IDs and to take the most recent value by "date", and if it's NA - to take the last value that is not NA.
The output should be like this:
ID date var1 var2
person2 122016 301 NA
person1 072016 509 45
I have tried this, but it didn't work.
library(dplyr)
data %>% group_by(ID) %>% summarise_all(funs(max(data$date))) %>% funs(first(.[!is.na(.)]))
What should I use to apply a working code to the whole dataset?
A solution using dplyr.
library(dplyr)
# For each ID keep the most recent non-missing value of every column: rows are
# sorted newest-first within ID, then the first non-NA entry of each column is
# taken (NA when a column has no value at all for that ID).
# across() replaces the deprecated funs()/summarise_all() interface.
# NOTE(review): date is sorted as stored; integers such as 52016 (MMYYYY) only
# order chronologically within one year - confirm, or parse to a Date first.
dat2 <- dat %>%
  arrange(ID, desc(date)) %>%
  group_by(ID) %>%
  summarise(across(everything(), ~ first(.x[!is.na(.x)]))) %>%
  ungroup()
dat2
# # A tibble: 2 x 4
# ID date var1 var2
# <chr> <int> <int> <int>
# 1 person1 72016 509 45
# 2 person2 122016 301 NA
DATA
dat <- read.table(text = "ID date var1 var2
person1 '052016' 509 1678
person2 '122016' 301 NA
person1 '072016' NA 45",
header = TRUE, stringsAsFactors = FALSE)
Using tidyverse and fill function.
Load data:
Mar_df <- structure(list(ID = structure(c(1L, 2L, 1L), .Label = c("person1",
"person2"), class = "factor"), date = c(52016L, 122016L, 72016L
), var1 = c(509L, 301L, NA), var2 = c(1678L, NA, 45L)), .Names = c("ID",
"date", "var1", "var2"), class = "data.frame", row.names = c(NA,
-3L))
Then:
# For each ID, carry the last known var1/var2 forward in date order and keep
# the most recent row.
Mar_df_summarised <- Mar_df %>%
  arrange(ID, date) %>%
  # Group before filling so a value is never carried over from one ID into
  # the next one.
  group_by(ID) %>%
  fill(var1, var2, .direction = "down") %>%
  # across() replaces the deprecated funs()/summarise_all() interface.
  summarise(across(everything(), last))
The result is:
# A tibble: 2 x 4
ID date var1 var2
<fctr> <int> <int> <int>
1 person1 72016 509 45
2 person2 122016 301 NA
I know there are many questions on this topic so I apologize if this is a duplicate question. I'm trying to collapse multiple columns in a data set into one column:
Assuming this is the structure of the dataset I am working with,
df <- data.frame(
cbind(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA)
))
variable_1 variable_2 variable_3 variable_4 variable_5 variable_6
Var1 Var2 NA NA NA NA
NA No NA Var4 No NA
NA NA Var3 NA Var5 Var6
Var1 NA NA NA NA NA
What I am expecting is a one column variable_7 like this
variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 variable_7
Var1 Var2 NA NA NA NA Var1, Var2
NA No NA Var4 No NA Var4
NA NA Var3 NA Var5 Var6 Var3, Var5, Var6
Var1 NA NA NA NA NA Var1
# Build variable_7 row by row: keep the entries that are neither NA nor "No"
# and join them with ", ".
df$variable_7 <- apply(df, 1, function(row_values) {
  keep <- !is.na(row_values) & row_values != "No"
  paste(row_values[keep], collapse = ", ")
})
df
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6
#1 Var1 Var2 <NA> <NA> <NA> <NA>
#2 <NA> No <NA> Var4 No <NA>
#3 <NA> <NA> Var3 <NA> Var5 Var6
#4 Var1 <NA> <NA> <NA> <NA> <NA>
# variable_7
#1 Var1, Var2
#2 Var4
#3 Var3, Var5, Var6
#4 Var1
Explanation: Use apply and paste(..., collapse = ", ") to concatenate all row entries (except NAs and "No"s) and store in new column variable_7.
Sample data
df <- data.frame(
cbind(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA)
))
I gather that if there are n rows then the objective is to create an n-vector of comma-separated character strings of those values in each row that contain the characters Var. (If you intended some other criterion for separating the desired and undesired values then change the grep accordingly.)
# For every row, join the entries containing "Var" with ", ".
apply(df, 1, function(row_values) {
  paste(row_values[grepl("Var", row_values)], collapse = ", ")
})
## [1] "Var1, Var2" "Var4" "Var3, Var5, Var6" "Var1"
A solution using dplyr. df4 is the final output. Please see how I created the data frame df. The cbind is not required, and it would be great to add stringsAsFactors = FALSE to prevent the creation of factor columns.
library(dplyr)
library(tidyr)
# Tag each row so the collapsed strings can be joined back onto it.
# row_number() is safe on an empty data frame, unlike 1:n().
df2 <- df %>% mutate(ID = row_number())

# Long format (one row per non-missing cell), drop the "No" entries, then
# join what is left for each original row with ", ".
# pivot_longer() replaces the superseded gather().
df3 <- df2 %>%
  pivot_longer(
    -ID,
    names_to = "Variable",
    values_to = "Value",
    values_drop_na = TRUE
  ) %>%
  filter(!Value %in% "No") %>%
  group_by(ID) %>%
  summarise(variable_7 = toString(Value))

# Attach variable_7 to the original rows; rows with nothing to report get NA.
df4 <- df2 %>%
  left_join(df3, by = "ID") %>%
  select(-ID)
df4
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 variable_7
# 1 Var1 Var2 <NA> <NA> <NA> <NA> Var1, Var2
# 2 <NA> No <NA> Var4 No <NA> Var4
# 3 <NA> <NA> Var3 <NA> Var5 Var6 Var3, Var5, Var6
# 4 Var1 <NA> <NA> <NA> <NA> <NA> Var1
DATA
df <- data.frame(
variable_1 = c('Var1', NA, NA,'Var1'),
variable_2 = c('Var2', 'No', NA, NA),
variable_3 = c(NA, NA, 'Var3', NA),
variable_4 = c(NA, 'Var4', NA, NA),
variable_5 = c(NA, 'No', 'Var5', NA),
variable_6 = c(NA, NA, 'Var6', NA),
stringsAsFactors = FALSE
)
Using a data.table 'reshap'-ing approach rather than a loop/apply
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Add a row id by reference, then join df to a per-id summary built from the
# melted (long) table.
df[, id := .I][
# Long table: keep values matching "Var" and paste them per id with ",".
melt(df, id.vars = "id")[grepl("Var", value), .(variable_7 = paste0(value, collapse = ",")), by = .(id)]
, on = "id"
# nomatch = 0 makes this an inner join: ids without any "Var" value are absent
# from the result.
, nomatch = 0
][order(id)]
# variable_1 variable_2 variable_3 variable_4 variable_5 variable_6 id variable_7
# 1: Var1 Var2 NA NA NA NA 1 Var1,Var2
# 2: NA No NA Var4 No NA 2 Var4
# 3: NA NA Var3 NA Var5 Var6 3 Var3,Var5,Var6
# 4: Var1 NA NA NA NA NA 4 Var1
I need to fill in some missing data from a merge that is the same in all columns. After the merge, all the values are NA, but I would like a quick way to fill them in since their values are the same.
Example:
df <- structure(list(date = structure(c(-25932, -25931, -25930, -25929,
-25928), class = "Date"), year = c(1899, 1899, 1899, 1899, 1899
), month = c(1, 1, 1, 1, 1), day = c(1, 2, 3, 4, 5), test1 = c(NA,
NA, "VAR1", NA, NA), test2 = c(NA, NA, "VAR2", NA, NA), test3 = c(NA,
NA, "VAR3", NA, NA)), .Names = c("date", "year", "month", "day",
"test1", "test2", "test3"), row.names = c(NA, 5L), class = "data.frame")
# Tedious way, but works
df$test1 <- "VAR1"
# Desired output
date year month day test1 test2 test3
1 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
2 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
4 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
5 1899-01-05 1899 1 5 VAR1 VAR2 VAR3
You can try something like the following:
df
# date year month day test1 test2 test3
# 1 1899-01-01 1899 1 1 <NA> <NA> <NA>
# 2 1899-01-02 1899 1 2 <NA> <NA> <NA>
# 3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
# 4 1899-01-04 1899 1 4 <NA> <NA> <NA>
# 5 1899-01-05 1899 1 5 <NA> <NA> <NA>
# Overwrite every column whose name contains "test" with its first non-missing
# value (recycled down the whole column).
df[grep("test", names(df))] <- lapply(
  df[grep("test", names(df))],
  function(column) column[which(!is.na(column))[1]]
)
df
# date year month day test1 test2 test3
# 1 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
# 2 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
# 3 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
# 4 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
# 5 1899-01-05 1899 1 5 VAR1 VAR2 VAR3
We can also use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Based on the index of 'test' columns ('nm1'), we loop with for and set the NA elements by the non-NA elements in each column.
library(data.table)
# Positions of the columns whose names start with "test".
nm1 <- grep("^test", names(df))
setDT(df)
for (j in nm1) {
  col_values <- df[[j]]
  # Fill the NA cells with the column's first non-NA value. Using a single
  # value (rather than every non-NA value) keeps the assignment valid when a
  # column holds more than one non-missing entry, or none at all.
  set(
    df,
    i = which(is.na(col_values)),
    j = j,
    value = col_values[!is.na(col_values)][1L]
  )
}
df
# date year month day test1 test2 test3
#1: 1899-01-01 1899 1 1 VAR1 VAR2 VAR3
#2: 1899-01-02 1899 1 2 VAR1 VAR2 VAR3
#3: 1899-01-03 1899 1 3 VAR1 VAR2 VAR3
#4: 1899-01-04 1899 1 4 VAR1 VAR2 VAR3
#5: 1899-01-05 1899 1 5 VAR1 VAR2 VAR3