I need to return the value from the row that satisfies a condition. In this example the maximum value across days_op and days_tc is 90, and the label belonging to that row is PN. (Note: days_op and xvar_op go together, and likewise days_tc and xvar_tc.)
# Reproducible example: five rows for a single subject, stored as a grouped
# tibble (class "grouped_df") grouped by identificacionSujeto.
test = structure(list(identificacionSujeto = c("dave",
"dave", "dave",
"dave", "dave"
), date= structure(c(18992, 18992, 18992, 18992, 18992), class = c("IDate", "Date")),
categ = c("T", "T", "T", "T", "T"),
days_op = c(24, 50, 3, 11, 70),
xvar_op = c("CO", "ON", "CO", "ON", "CO"),
categ_op = c("T", "T", "T", "T", "T"),
days_tc = c(54, 15, 90, 10, 54),
# NOTE(review): second column named xvar_op (duplicate name). Presumably this
# is the "xvar_tc" column mentioned in the question text -- confirm.
xvar_op = c("PN", "NM", "PN", "PN", "PN"),
categ_tc = c("Y", "V", "Y", "Y", "Y")),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
row.names = c(NA, -5L),
# Grouping metadata: one group ("dave") covering rows 1 to 5.
groups = structure(list(identificacionSujeto = "dave",
.rows = structure(list(1:5), ptype = integer(0),
class = c("vctrs_list_of", "vctrs_vctr", "list"))),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -1L), .drop = TRUE))
identif…¹ date categ days_op xvar_op categ…² days_tc xvar_op expected_output
<chr> <IDate> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 dave 2021-12-31 T 24 CO T 54 PN PN
2 dave 2021-12-31 T 50 ON T 15 NM PN
3 dave 2021-12-31 T 3 CO T 90 PN PN
4 dave 2021-12-31 T 11 ON T 10 PN PN
5 dave 2021-12-31 T 70 CO T 54 PN PN
As it is already grouped, we may get the max index of the columns as well as the max value to subset the xvar_op columns (the columns were named the same so use make.unique to modify duplicate column names before applying the function)
library(dplyr)
# Within each group, find the single largest value across days_op / days_tc
# and write the label stored next to it into a new column `out`.
test %>%
  # Both label columns are called xvar_op; make.unique() renames the second
  # one to xvar_op.1 so that each can be referenced.
  setNames(make.unique(names(.))) %>%
  mutate(
    # Per row: which day column holds the larger value (1 = days_op,
    # 2 = days_tc). max.col() breaks ties at random by default, so the rule is
    # pinned to "first" to keep the result reproducible.
    ind = max.col(cbind(days_op, days_tc), ties.method = "first"),
    # Per row: the larger of the two day values.
    val = pmax(days_op, days_tc),
    # Take the row holding the group maximum, then the label column that the
    # maximum came from; the scalar is recycled over the whole group.
    out = cbind(xvar_op, xvar_op.1)[which.max(val), ][ind[which.max(val)]],
    # Drop the helper columns again.
    ind = NULL, val = NULL
  ) %>%
  ungroup()
-output
# A tibble: 5 × 10
identificacionSujeto date categ days_op xvar_op categ_op days_tc xvar_op.1 categ_tc out
<chr> <IDate> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
1 dave 2021-12-31 T 24 CO T 54 PN Y PN
2 dave 2021-12-31 T 50 ON T 15 NM V PN
3 dave 2021-12-31 T 3 CO T 90 PN Y PN
4 dave 2021-12-31 T 11 ON T 10 PN Y PN
5 dave 2021-12-31 T 70 CO T 54 PN Y PN
Related
This question already has answers here:
combine rows in data frame containing NA to make complete row
(7 answers)
Closed last month.
Let's say I have
# A tibble: 4 × 3
Gene.names Case Control
<chr> <dbl> <dbl>
1 A1BG 52 NA
2 A1BG NA 32
3 A2M 16 NA
4 A2M NA 15
As you can see, Gene.names are duplicates and have corresponding values for Case and Control. I need to combine the values for Case and Control so they are printed on the same row for each Gene.name.
I am looking for a solution in dplyr.
Expected output
Gene.names Case Control
<chr> <dbl> <dbl>
1 A1BG 52 32
2 A2M 16 15
Data
# Example input: every gene appears on two rows, one carrying the Case value
# and one carrying the Control value, with NA in the other column.
df <- structure(list(Gene.names = c("A1BG", "A1BG", "A2M", "A2M"),
Case = c(52, NA, 16, NA), Control = c(NA, 32, NA, 15)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
A combination of pivot_longer and pivot_wider will do this.
library(tidyverse)
# Example data: each gene has one row with Case filled and one with Control.
df <- structure(
  list(
    Gene.names = c("A1BG", "A1BG", "A2M", "A2M"),
    Case = c(52, NA, 16, NA),
    Control = c(NA, 32, NA, 15)
  ),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Stack Case/Control into name/value pairs, discarding the NA placeholders
# during the reshape, then spread back out to a single row per gene.
df |>
  pivot_longer(cols = Case:Control, values_drop_na = TRUE) |>
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 2 × 3
#> Gene.names Case Control
#> <chr> <dbl> <dbl>
#> 1 A1BG 52 32
#> 2 A2M 16 15
Having a dataframe like this:
# First data frame (tibble): id, title_num and title_name; one title_name is NA.
structure(list(id = c("id1", "id1", "id2", "id2", "id3", "id3"
), title_num = c(1, 2, 1, 2, 1, 2), title_name = c("amazon1",
"yahoo2", "google1", NA, "yahoo1", "amazon2")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
and another one like this:
# Lookup table: each row carries a status, a stock label and a ";"-separated
# list of names.
dfcheck <- data.frame(
  status = c("open/close", "close", "open"),
  stock = c("company energy", "goods and books", "other"),
  name = c("amazon1;google1", "google3;yahoo1", "yahoo2;amazon2;google2"),
  stringsAsFactors = FALSE
)
How is it possible to have an output like this:
id title_num title_name stock status
id1 1 amazon1 company energy open/close
id1 2 yahoo2 other open
id2 1 google1 company energy open/close
id2 2 <NA> <NA> <NA>
id3 1 yahoo1 goods and books close
id3 2 amazon2 other open
library(dplyr)
# separate_rows() is exported by tidyr, not dplyr, so tidyr must be attached
# as well or the pipeline fails with "could not find function".
library(tidyr)
# Titles to look up (one title_name is NA).
df <-
  structure(list(id = c("id1", "id1", "id2", "id2", "id3", "id3"
  ), title_num = c(1, 2, 1, 2, 1, 2), title_name = c("amazon1",
  "yahoo2", "google1", NA, "yahoo1", "amazon2")), row.names = c(NA,
  -6L), class = c("tbl_df", "tbl", "data.frame"))
# Lookup table whose `name` column packs several names separated by ";".
dfcheck <-
  structure(list(status = c("open/close", "close", "open"), stock = c("company energy",
  "goods and books", "other"), name = c("amazon1;google1", "google3;yahoo1",
  "yahoo2;amazon2;google2")), class = "data.frame", row.names = c(NA,
  -3L))
# Expand dfcheck to one row per individual name, then attach status/stock to
# every title; titles without a match keep NA in the joined columns.
df %>%
  left_join(
    dfcheck %>%
      separate_rows(name, sep = ";"),
    by = c("title_name" = "name")
  )
# A tibble: 6 x 5
id title_num title_name status stock
<chr> <dbl> <chr> <chr> <chr>
1 id1 1 amazon1 open/close company energy
2 id1 2 yahoo2 open other
3 id2 1 google1 open/close company energy
4 id2 2 NA NA NA
5 id3 1 yahoo1 close goods and books
6 id3 2 amazon2 open other
You can use left_join on a strsplit column of the second data set.
library(dplyr)
library(tidyr)
# Split the ";"-separated names into a list column, unnest it to one row per
# name, and use the result as the right-hand side of the left join onto df1.
dfcheck %>%
  mutate(name = strsplit(name, ";")) %>%
  unnest(name) %>%
  left_join(df1, ., c("title_name" = "name"))
# A tibble: 6 × 5
id title_num title_name status stock
<chr> <dbl> <chr> <chr> <chr>
1 id1 1 amazon1 open/close company energy
2 id1 2 yahoo2 open other
3 id2 1 google1 open/close company energy
4 id2 2 NA NA NA
5 id3 1 yahoo1 close goods and books
6 id3 2 amazon2 open other
Data
# Titles to look up: id, title_num and title_name (one title_name is NA).
df1 <- structure(list(id = c("id1", "id1", "id2", "id2", "id3", "id3"
), title_num = c(1, 2, 1, 2, 1, 2), title_name = c("amazon1",
"yahoo2", "google1", NA, "yahoo1", "amazon2")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
# Lookup table: status and stock for each ";"-separated group of names.
dfcheck <- structure(list(status = c("open/close", "close", "open"), stock = c("company energy",
"goods and books", "other"), name = c("amazon1;google1", "google3;yahoo1",
"yahoo2;amazon2;google2")), class = "data.frame", row.names = c(NA,
-3L))
I just want to know how to find empty values ("") and replace them with NA across a whole data frame.
sample data
# Sample one-row data.table; `gene` and `pi` hold empty strings.
# Compared with the raw dput() paste: the missing comma after `pi = ""` was
# added, and the `.internal.selfref = <pointer: ...>` attribute was removed
# because a printed pointer is not valid R syntax and cannot be restored
# from text.
structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
  project_id = 11L, experiment_id = 85L,
  gene = "", si = -0.381, pi = "",
  on1 = "CC",
  on2 = "GG",
  on3 = "aa",
  created_at = structure(1618862091.85075, class = c("POSIXct",
    "POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
  "data.frame"))
I have a solution that checks a particular column, but I don't know how to apply it to the whole data frame.
# Single-column attempt: swap missing `gene` entries for the literal text "NA".
data$gene <- ifelse(is.na(data$gene), "NA", data$gene)
You could use lapply with gsub to replace each empty cell with NA like this:
# Sample one-row table; `gene` and `pi` hold empty strings.
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
  project_id = 11L, experiment_id = 85L,
  gene = "", si = -0.381, pi = "",
  on1 = "CC",
  on2 = "GG",
  on3 = "aa",
  created_at = structure(1618862091.85075, class = c("POSIXct",
    "POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
  "data.frame"))
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 -0.381 CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
# Replace empty strings with NA. gsub() always returns character, so it is
# applied to character columns only; running it over every column would turn
# the integer, double and date-time columns into text. `df[] <-` keeps the
# data frame container and only swaps the column contents.
df[] <- lapply(df, function(x) if (is.character(x)) gsub("^$", NA, x) else x)
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 <NA> -0.381 <NA> CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
Created on 2022-11-02 with reprex v2.0.2
You can also use dplyr with mutate and across
library(dplyr)
library(tidyr)
# Sample one-row table; `gene` and `pi` hold empty strings.
df <- structure(
  list(
    id = structure(8.44425875736171e-318, class = "integer64"),
    project_id = 11L,
    experiment_id = 85L,
    gene = "",
    si = -0.381,
    pi = "",
    on1 = "CC",
    on2 = "GG",
    on3 = "aa",
    created_at = structure(
      1618862091.85075,
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    )
  ),
  row.names = c(NA, -1L),
  class = c("data.table", "data.frame")
)
# Only character columns are rewritten: an entirely empty string becomes NA.
df %>%
  mutate(across(where(is.character), function(col) gsub("^$", NA, col)))
Note that I also attempted to use replace_na, however this only works on values that are actually NA.
# replace_na() fills genuine NA values only; empty strings are left untouched.
# Uses `df`, the data defined in this answer (`test` was never created here).
df %>%
  mutate(dplyr::across(where(is.character), ~ replace_na(.x, "NA")))
"" is not considered
NA is considered NA
Keep that in mind while you are performing your analysis.
Using na_if
library(data.table)
library(dplyr)
# Walk over every column (.SD): character columns get "" turned into NA via
# na_if(); all other columns are passed through unchanged.
df[, lapply(.SD, function(col) {
  if (is.character(col)) na_if(col, "") else col
})]
-output
id project_id experiment_id gene si pi on1 on2 on3 created_at
<i64> <int> <int> <char> <num> <char> <char> <char> <char> <POSc>
1: 1709137 11 85 <NA> -0.381 <NA> CC GG aa 2021-04-19 19:54:51
I have two matrices like this:
Vehicle1 Year type
Car1 20 A
Car2 21 A
Car8 20 A
Second one
Vehicle2 Year type
Car1 20 M
Car2 21 M
Car7 90 M
I need to combine the two matrices on the first column (Vehicle) and mark vehicles that appear in both as A/M, like this:
Vehicle Year type
Car1 20 A/M
Car2 21 A/M
Car7 90 M
Car8 20 A
I used the merge function for this, but it only prints the common rows.
You can join the two dataframe and combine the type columns :
# A full join keeps vehicles found in either table; unite() then pastes the two
# type columns together with "/", skipping the NA side of unmatched rows.
# The native pipe is used so the snippet runs without attaching dplyr or
# magrittr (both verbs are already namespace-qualified).
dplyr::full_join(df1, df2, by = c('Vehicle1' = 'Vehicle2', 'Year')) |>
  tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)
# Vehicle1 Year type
#1 Car1 20 A/M
#2 Car2 21 A/M
#3 Car8 20 A
#4 Car7 90 M
data
# Type "A" vehicles.
df1 <- data.frame(
  Vehicle1 = c("Car1", "Car2", "Car8"),
  Year = c(20L, 21L, 20L),
  type = c("A", "A", "A"),
  stringsAsFactors = FALSE
)
# Type "M" vehicles.
df2 <- data.frame(
  Vehicle2 = c("Car1", "Car2", "Car7"),
  Year = c(20L, 21L, 90L),
  type = c("M", "M", "M"),
  stringsAsFactors = FALSE
)
Another dplyr solution.
library(dplyr)
# Stack both tables under a common key name, then collapse the type values of
# each (Vehicle1, Year) pair into one "/"-separated string.
df2 %>%
  rename(Vehicle1 = Vehicle2) %>%
  bind_rows(df1, .) %>%
  group_by(Vehicle1, Year) %>%
  # .groups = "drop" returns an ungrouped tibble directly and avoids the
  # regrouping message summarise() prints when there are several group keys.
  summarise(type = paste(type, collapse = "/"), .groups = "drop")
# # A tibble: 4 x 3
# Vehicle1 Year type
# <chr> <int> <chr>
# 1 Car1 20 A/M
# 2 Car2 21 A/M
# 3 Car7 90 M
# 4 Car8 20 A
You can also do this easily in base R.
# Outer-join on the vehicle name; Year and type arrive as .x/.y column pairs.
rr <- merge(m1, m2, all = TRUE, by.x = "Vehicle1", by.y = "Vehicle2")
# Stack the (Year.x, Year.y) and (type.x, type.y) pairs into long format, drop
# the rows that came from the missing side, and restore m1's column names.
rr <- setNames(na.omit(reshape(rr, idvar="Vehicle1", varying=list(c(2, 4), c(3, 5)),
direction="long")), c("Vehicle1", "t", names(m1)[-1]))
# Vehicles that occur more than once were present in both inputs.
dupes <- which(duplicated(rr$Vehicle1))
rr[rr$Vehicle1 %in% rr$Vehicle1[dupes], 4] <- "A/M"
# Keep the first row per vehicle and drop the helper column "t". A logical
# mask is used because rr[-dupes, ] would return zero rows when `dupes` is
# empty (no vehicle in common).
res <- rr[!duplicated(rr$Vehicle1), -2]
res
# Vehicle1 Year type
# Car1.1 Car1 20 A/M
# Car2.1 Car2 21 A/M
# Car8.1 Car8 20 A
# Car7.2 Car7 90 M
Data:
# Type "A" vehicles.
m1 <- data.frame(
  Vehicle1 = c("Car1", "Car2", "Car8"),
  Year = c(20L, 21L, 20L),
  type = c("A", "A", "A"),
  stringsAsFactors = FALSE
)
# Type "M" vehicles.
m2 <- data.frame(
  Vehicle2 = c("Car1", "Car2", "Car7"),
  Year = c(20L, 21L, 90L),
  type = c("M", "M", "M"),
  stringsAsFactors = FALSE
)
Here is a base R option using merge
# Outer-join on vehicle and year; the two type columns become type.x / type.y.
tmp <- merge(df1, df2, by.x = c("Vehicle1", "Year"), by.y = c("Vehicle2", "Year"), all = TRUE)
dfout <- cbind(
  tmp[c("Vehicle1", "Year")],
  # Row by row, join every non-missing type with "/". Dropping the NAs before
  # pasting handles both the matched case ("A/M") and the one-sided case
  # ("A" or "M") without a scalar ifelse().
  type = apply(
    tmp[grep("type", names(tmp))],
    1,
    function(types) paste0(types[!is.na(types)], collapse = "/")
  )
)
such that
> dfout
Vehicle1 Year type
1 Car1 20 A/M
2 Car2 21 A/M
3 Car7 90 M
4 Car8 20 A
Data
> dput(df1)
structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))
How can I add columns dynamically to a data frame when the number of variables in a list column increases? My dataframe:
ID Value
1 list(F="20",B="rt")
2 list(F="20",B="rt",H="ty")
3 list(F="20",B="rt")
4 list(F="20")
Desired output:
ID Value F B H
1 list(F="20",B="rt") 20 rt NA
2 list(F="20",B="rt",H="ty") 20 rt ty
3 list(F="20",B="rt") 20 rt NA
4 list(F="20") 20 NA NA
# Sample data frame with a single list column "Billing": the first entry is
# NULL, the second is a one-row data frame.
structure(
  list(
    Billing = list(
      NULL,
      structure(
        list(`EUcust#` = "3", `Cust#` = "5", Com = "I", `Com#` = "6", Add = "Y"),
        .Names = c("EUcust#", "Cust#", "Com", "Com#", "Add"),
        class = "data.frame",
        row.names = 1L
      )
    )
  ),
  .Names = "Billing",
  row.names = 8:9,
  class = "data.frame"
)
We can use tidyverse
library(tidyverse)
# Convert each entry of the Value list column into a one-row data frame, stack
# those rows with map_df(), and append the resulting columns to df1.
bind_cols(
  df1,
  map_df(df1$Value, function(entry) do.call(cbind.data.frame, entry))
)
# ID Value F B H
#1 1 20, rt 20 rt <NA>
#2 2 20, rt, ty 20 rt ty
#3 3 20, rt 20 rt <NA>
#4 4 20 20 <NA> <NA>
data
# Example data: ID plus a list column `Value` (class "AsIs") whose entries are
# named lists of varying length (F; F and B; or F, B and H).
df1 <- structure(list(ID = 1:4, Value = structure(list(structure(list(
F = "20", B = "rt"), .Names = c("F", "B")), structure(list(
F = "20", B = "rt", H = "ty"), .Names = c("F", "B", "H")),
structure(list(F = "20", B = "rt"), .Names = c("F", "B")),
structure(list(F = "20"), .Names = "F")), class = "AsIs")),
.Names = c("ID",
"Value"), row.names = c(NA, -4L), class = "data.frame")