How to add na based on condition for a whole dataframe - r

I want to know how to find empty values (empty strings) and replace them with NA across a whole data frame.
sample data
# Sample data (one row). The `.internal.selfref` pointer that dput() prints for
# a data.table cannot be parsed back into R, so it is left out here.
structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
  project_id = 11L, experiment_id = 85L,
  gene = "", si = -0.381, pi = "",
  on1 = "CC",
  on2 = "GG",
  on3 = "aa",
  created_at = structure(1618862091.85075, class = c("POSIXct",
    "POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
  "data.frame"))
I have a solution that checks a particular column, but I don't know how to apply it to the whole data frame:
data$gene <- ifelse((is.na(data$gene) == TRUE),'NA',data$gene)

You could use lapply with gsub to replace each empty cell with NA like this:
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
  project_id = 11L, experiment_id = 85L,
  gene = "", si = -0.381, pi = "",
  on1 = "CC",
  on2 = "GG",
  on3 = "aa",
  created_at = structure(1618862091.85075, class = c("POSIXct",
    "POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
  "data.frame"))
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 -0.381 CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
# gsub() always returns character, so only run it on character columns;
# every other column (integer, double, integer64, POSIXct) is returned
# untouched and keeps its type. `df[] <-` keeps the data frame structure.
df[] <- lapply(df, function(x) {
  if (is.character(x)) gsub("^$", NA, x) else x
})
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 <NA> -0.381 <NA> CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
Created on 2022-11-02 with reprex v2.0.2

You can also use dplyr with mutate and across
library(dplyr)
library(tidyr)
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = "",
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
# For every character column, turn empty strings ("") into NA;
# non-character columns are not selected and stay as they are.
mutate(
  df,
  dplyr::across(where(is.character), function(col) gsub("^$", NA, col))
)
Note that I also attempted to use replace_na, however this only works on values that are actually NA.
# replace_na() only fills values that are already NA, so the empty strings
# ("") in `df` are left unchanged by this call.
df %>%
  mutate(dplyr::across(where(is.character), ~ replace_na(.x, "NA")))
"" is not considered
NA is considered NA
Keep that in mind while you are performing your analysis.

Using na_if
library(data.table)
library(dplyr)
# Character columns get "" replaced by NA via na_if(); any other column is
# passed through unchanged. .SD is the set of columns of `df`.
blank_to_na <- function(col) {
  if (is.character(col)) {
    na_if(col, "")
  } else {
    col
  }
}
df[, lapply(.SD, blank_to_na)]
-output
id project_id experiment_id gene si pi on1 on2 on3 created_at
<i64> <int> <int> <char> <num> <char> <char> <char> <char> <POSc>
1: 1709137 11 85 <NA> -0.381 <NA> CC GG aa 2021-04-19 19:54:51

Related

dplyr join with three data frame

I have 3 data frames as like this
df1 <- structure(list(Vehicle = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
df3 <- structure(list(Vehicle = c("Car1", "Car2", "Car9"), Year = c(20L,
21L, 92L), type = c("I", "I", "I")), class = "data.frame", row.names = c(NA, -3L))
And I need to make a new table as follows
Vehicle Year type
Car1 20 A/M/I
Car2 21 A/M/I
Car7 90 M
Car8 20 A
Car9 92 I
for this purpose I used this code using dplyr as like this, but it is not working with 3 data frames:
dplyr::full_join(df1, df2, df3, by = c('Vehicle', 'Year')) %>%
tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)
Try this approach. Instead of merging it looks like you want to combine all dataframes and then aggregate. Here the code using dplyr:
library(dplyr)
# Stack the three data frames, then collapse the `type` values of every
# Vehicle/Year pair into one string. "/" is used as the separator because
# that is the format requested in the question (e.g. A/M/I).
newdf <- bind_rows(df1, df2, df3) %>%
  group_by(Vehicle, Year) %>%
  summarise(type = paste(type, collapse = "/"))
Output:
# A tibble: 5 x 3
# Groups: Vehicle [5]
Vehicle Year type
<chr> <int> <chr>
1 Car1 20 A/M/I
2 Car2 21 A/M/I
3 Car7 90 M
4 Car8 20 A
5 Car9 92 I
Generally, to merge >2 data.frame's/tibble's you'd use either base R's Reduce or purrr::reduce; for example using the latter:
# Full-join the data frames one after another on the shared key columns;
# each join leaves its own type column (type.x, type.y, type).
joined <- purrr::reduce(
  list(df1, df2, df3),
  dplyr::full_join,
  by = c("Vehicle", "Year")
)
# Glue every column whose name starts with "type" into one, skipping NAs.
tidyr::unite(joined, type, dplyr::starts_with("type"), sep = "/", na.rm = TRUE)
# Vehicle Year type
#1 Car1 20 A/M/I
#2 Car2 21 A/M/I
#3 Car8 20 A
#4 Car7 90 M
#5 Car9 92 I
Using base R
# Row-bind the three data frames and paste together the `type` values of each
# Vehicle/Year combination, using "/" as asked for in the question.
aggregate(type ~ Vehicle + Year, rbind(df1, df2, df3),
          FUN = paste, collapse = "/")
-output
# Vehicle Year type
#1 Car1 20 A/M/I
#2 Car8 20 A
#3 Car2 21 A/M/I
#4 Car7 90 M
#5 Car9 92 I

Combine two matrix and mark common

I have two matrix as like this
Vehicle1 Year type
Car1 20 A
Car2 21 A
Car8 20 A
Second one
Vehicle2 Year type
Car1 20 M
Car2 21 M
Car7 90 M
I need to combine the two matrices based on the first column (Vehicle) and mark the vehicles common to both as A/M, like this:
Vehicle Year type
Car1 20 A/M
Car2 21 A/M
Car7 90 M
Car8 20 A
I used the merge function for this, but it only returns the rows that are common to both.
You can join the two dataframe and combine the type columns :
# Keep every vehicle from both tables: match Vehicle1 against Vehicle2 and
# Year against Year. The clashing `type` columns come back as type.x / type.y.
joined <- dplyr::full_join(
  df1, df2,
  by = c('Vehicle1' = 'Vehicle2', 'Year' = 'Year')
)
# Merge the two type columns into one, dropping NAs so that unmatched rows
# keep just their single letter.
tidyr::unite(joined, type, type.x, type.y, sep = '/', na.rm = TRUE)
# Vehicle1 Year type
#1 Car1 20 A/M
#2 Car2 21 A/M
#3 Car8 20 A
#4 Car7 90 M
data
df1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
Another dplyr solution.
library(dplyr)
# Give df2 the same key column name as df1 so the two tables can be stacked.
df2_renamed <- rename(df2, Vehicle1 = Vehicle2)
stacked <- bind_rows(df1, df2_renamed)
# One row per vehicle/year, with all of its types joined by "/".
stacked %>%
  group_by(Vehicle1, Year) %>%
  summarise(type = paste(type, collapse = "/")) %>%
  ungroup()
# # A tibble: 4 x 3
# Vehicle1 Year type
# <chr> <int> <chr>
# 1 Car1 20 A/M
# 2 Car2 21 A/M
# 3 Car7 90 M
# 4 Car8 20 A
You can also do this easily in base R.
# Outer merge on the vehicle column: all = TRUE keeps vehicles that appear in
# only one of the two tables (TRUE is spelled out because `T` can be reassigned).
rr <- merge(m1, m2, all = TRUE, by.x = "Vehicle1", by.y = "Vehicle2")
# Stack columns 2 and 4 (the two Year columns) and columns 3 and 5 (the two
# type columns) into long format, drop the rows that contain NA, and rename
# the result to Vehicle1, t, Year, type.
rr <- setNames(na.omit(reshape(rr, idvar = "Vehicle1", varying = list(c(2, 4), c(3, 5)),
                               direction = "long")), c("Vehicle1", "t", names(m1)[-1]))
# Vehicles that now occur twice were present in both tables: mark them "A/M"
# (column 4 is `type`) and keep only their first row.
dupes <- which(duplicated(rr$Vehicle1))
rr[rr$Vehicle1 %in% rr$Vehicle1[dupes], 4] <- "A/M"
# Drop the duplicate rows and the helper column `t` (column 2).
res <- rr[-dupes, -2]
res
# Vehicle1 Year type
# Car1.1 Car1 20 A/M
# Car2.1 Car2 21 A/M
# Car8.1 Car8 20 A
# Car7.2 Car7 90 M
Data:
m1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
m2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))
Here is a base R option using merge
# Outer merge on vehicle and year; the clashing `type` columns come back as
# type.x / type.y, with NA where a vehicle is missing from one table.
tmp <- merge(df1, df2, by.x = c("Vehicle1", "Year"), by.y = c("Vehicle2", "Year"), all = TRUE)
type_cols <- grep("type", names(tmp))
# For each row, join all non-NA types with "/". Pasting the NA-free values
# directly (instead of a scalar ifelse(), which returns only the first
# element) also works when there are more than two type columns.
dfout <- cbind(
  tmp[c("Vehicle1", "Year")],
  type = apply(
    tmp[type_cols],
    1,
    function(row) paste(na.omit(row), collapse = "/")
  )
)
such that
> dfout
Vehicle1 Year type
1 Car1 20 A/M
2 Car2 21 A/M
3 Car7 90 M
4 Car8 20 A
Data
> dput(df1)
structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))

Reshaping Data with values in variable names

I have a very wide dataset (2000+ variables) that I'm trying to make tidy but I am getting stuck trying to pull out a value from the variable name. If I have a variable that is "E1Time1_Date" I'd like to reshape it to be three variables: E=1, Time=1, and Date=the original date value.
Is this even possible? I've tried to use gather() but am guessing there is a step I need to do first that I am missing. Thank you for your help!
And here is the sample dataset if anyone wanted to make the magic happen:
structure(list(ID = c(123, 225), UnrelatedV1 = c("Unrelated1",
"Unrelated1"), UnrelatedV2 = c("Unrelated2", "Unrelated2"), E1T1_Date = structure(c(1506816000,
1513296000), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
E1T1_v1 = c(10, 20), E1T1_v2 = c(20, 20), E1T1_v3 = c(30,
20), E1T1_v4 = c(40, 20), E1T2_Date = structure(c(1512086400,
NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"), E1T2_v1 = c(10,
NA), E1T2_v2 = c(10, NA), E1T2_v3 = c(10, NA), E1T2_v4 = c(10,
NA), E2T1_Date = structure(c(1522540800, 1525132800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), E2T1_v1 = c(10, 20), E2T1_v2 = c(20,
20), E2T1_v3 = c(10, 20), E2T1_v4 = c(10, 20), E2T2_Date = structure(c(1533859200,
NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"), E2T2_v1 = c(10,
NA), E2T2_v2 = c(30, NA), E2T2_v3 = c(10, NA), E2T2_v4 = c(10,
NA)), .Names = c("ID", "UnrelatedV1", "UnrelatedV2", "E1T1_Date",
"E1T1_v1", "E1T1_v2", "E1T1_v3", "E1T1_v4", "E1T2_Date", "E1T2_v1",
"E1T2_v2", "E1T2_v3", "E1T2_v4", "E2T1_Date", "E2T1_v1", "E2T1_v2",
"E2T1_v3", "E2T1_v4", "E2T2_Date", "E2T2_v1", "E2T2_v2", "E2T2_v3",
"E2T2_v4"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Looks like you've got a mix of numeric and date values, which will make gathering a little tricky. One way to do it is to convert dates to numeric for now, then you can change them back once you get to your final format. This should get you started.
library(tidyverse)
data %>%
  # convert dates to numeric so we can gather them in the same column
  # (change them back to dates once the data is in its final shape)
  mutate(across(where(lubridate::is.POSIXct), as.numeric)) %>%
  gather(-ID, -contains("Unrelated"), key = variable, value = value) %>%
  # split names such as "E1T1_Date" into three distinct columns: the E part,
  # the T part and the suffix. The regex is applied to every name separately,
  # so it does not rely on "T" being at the same position in all names.
  tidyr::extract(variable, into = c("E", "T", "vDate"),
                 regex = "^(E\\d+)(T\\d+)_(.+)$")
# A tibble: 40 x 7
ID UnrelatedV1 UnrelatedV2 E T vDate value
<dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
1 123 Unrelated1 Unrelated2 E1 T1 Date 1506816000
2 225 Unrelated1 Unrelated2 E1 T1 Date 1513296000
3 123 Unrelated1 Unrelated2 E1 T1 v1 10
4 225 Unrelated1 Unrelated2 E1 T1 v1 20
5 123 Unrelated1 Unrelated2 E1 T1 v2 20
6 225 Unrelated1 Unrelated2 E1 T1 v2 20
7 123 Unrelated1 Unrelated2 E1 T1 v3 30
8 225 Unrelated1 Unrelated2 E1 T1 v3 20
9 123 Unrelated1 Unrelated2 E1 T1 v4 40
10 225 Unrelated1 Unrelated2 E1 T1 v4 20

R-How to compare two dataframe and update list column value

I have two data frames, dataframe 1 and dataframe 2. How can I compare them on the columns P.Name, Name and Q.Name, update the values of rows that match, and append the rows that differ? Please see the example below.
Data frame1
P.Name Name Q.Name values
Read Mike salseID list(value = "Y2TS", countofvalues = 1)
Write jhon Purchasedcust list(value = "ANDERSON", countofvalues = 1)
write jhon shippingname list(value = "Mikel", countofvalues = 5)
Read peter ordername list(value = c("july", "mary", "petersonavail"), countofvalues = c(1, 2, 1))
Write jack deliveredadd list(value = c("IICC PS LOL UY", "IICC UYY LOL UY"), countofvalues = c(2,1))
Dataframe 2
P.Name Name Q.Name values
Read Mike salseID list(value = "Y2TS", countofvalues = 1)
Write jhon Purchasedcust list(value = "vjantony", countofvalues = 1)
write jhon CustaAddress list(value = "Mikel", countofvalues = 5)
Read peter ordername list(value = c("july", "mary", "parker"), countofvalues = c(1, 2, 1))
Expected data frame:
P.Name Name Q.Name values
Read Mike salseID list(value = "Y2TS", countofvalues = 2)
Write jhon Purchasedcust list(value = c("ANDERSON","vjantony"), countofvalues = c(1,1))
write jhon shippingname list(value = "Mikel", countofvalues = 5)
write jhon CustaAddress list(value = "Mikel", countofvalues = 5)
Read peter ordername list(value = c("july", "mary", "petersonavail","parker"), countofvalues = c(2, 4, 1,1))
Write jack deliveredadd list(value = c("IICC PS LOL UY", "IICC UYY LOL UY"), countofvalues = c(2,1))
Data frame1 dput data.
structure(list(P.Name = c("Read", "Write", "Write", "Read", "Write"
), Name = c("Mike", "jhon", "jhon", "peter", "jack"), Q.Name = c("salseID",
"Purchasedcust", "shippingname", "ordername", "deliveredadd"),
values = list(structure(list(value = "Y2TS", countofvalues = 1L), .Names = c("value",
"countofvalues"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(value = "ANDERSON",
countofvalues = 1L), .Names = c("value", "countofvalues"
), row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(value = "Mikel", countofvalues = 5L), .Names = c("value",
"countofvalues"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(value = c("july", "mary",
"petersonavail"), countofvalues = c(1L, 2L, 1L)), .Names = c("value",
"countofvalues"), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(value = c("IICC PS LOL UY",
"IICC UYY LOL UY"), countofvalues = c(2L, 1L)), .Names = c("value",
"countofvalues"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")))), .Names = c("P.Name", "Name", "Q.Name",
"values"), row.names = c(NA, -5L), class = "data.frame")
Data frame 2 dput data
structure(list(P.Name = c("Read", "Write", "Write", "Read"),
Name = c("Mike", "jhon", "jhon", "peter"), Q.Name = c("salseID",
"Purchasedcust", "CustaAddress", "ordername"), values = list(
structure(list(value = "Y2TS", countofvalues = 1L), .Names = c("value",
"countofvalues"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(value = "vjantony",
countofvalues = 1L), .Names = c("value", "countofvalues"
), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame")), structure(list(value = "Mikel", countofvalues = 5L), .Names = c("value",
"countofvalues"), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(value = c("july",
"mary", "parker"), countofvalues = c(1L, 2L, 1L)), .Names = c("value",
"countofvalues"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")))), .Names = c("P.Name", "Name",
"Q.Name", "values"), row.names = c(NA, -4L), class = "data.frame")
You can try a tidyverse/dplyr solution
library(tidyverse)
# remove NAs. Otherwise it will not work. Don't know if they are important.
d1$values <- lapply(d1$values, function(x) x[!is.na(x[,1]),])
d2$values <- lapply(d2$values, function(x) x[!is.na(x[,1]),])
# Expand the list column `values` of both data frames into regular rows
# (the column is named explicitly because unnest() without `cols` is
# deprecated), stack them, and add up the counts of identical values per key.
d1 %>%
  unnest(values) %>%
  bind_rows(unnest(d2, values)) %>%
  group_by(P.Name, Name, Q.Name, value) %>%
  summarise(countofvalues = sum(countofvalues))
# A tibble: 11 x 5
# Groups: P.Name, Name, Q.Name [?]
P.Name Name Q.Name value countofvalues
<chr> <chr> <chr> <chr> <int>
1 Read Mike salseID Y2TS 2
2 Read peter ordername july 2
3 Read peter ordername mary 4
4 Read peter ordername parker 1
5 Read peter ordername petersonavail 1
6 Write jack deliveredadd IICC PS LOL UY 2
7 Write jack deliveredadd IICC UYY LOL UY 1
8 Write jhon CustaAddress Mikel 5
9 Write jhon Purchasedcust ANDERSON 1
10 Write jhon Purchasedcust vjantony 1
11 Write jhon shippingname Mikel 5
Then you can nest the last columns using nest()
# Same pipeline as before, with the list column named explicitly in unnest()
# (calling it without `cols` is deprecated). summarise() drops the `value`
# grouping level, so nest() packs value/countofvalues into a list column
# called "values" for each P.Name/Name/Q.Name key.
d1 %>%
  unnest(values) %>%
  bind_rows(unnest(d2, values)) %>%
  group_by(P.Name, Name, Q.Name, value) %>%
  summarise(countofvalues = sum(countofvalues)) %>%
  nest(.key = "values")
# A tibble: 6 x 4
P.Name Name Q.Name values
<chr> <chr> <chr> <list>
1 Read Mike salseID <tibble [1 x 2]>
2 Read peter ordername <tibble [4 x 2]>
3 Write jack deliveredadd <tibble [2 x 2]>
4 Write jhon CustaAddress <tibble [1 x 2]>
5 Write jhon Purchasedcust <tibble [2 x 2]>
6 Write jhon shippingname <tibble [1 x 2]>

How do i dynamically add column in dataframe if list of variable increases

How to add column dynamically in dataframe if list of variable increases. My dataframe:
ID Value
1 list(F="20",B="rt")
2 list(F="20",B="rt",H="ty")
3 list(F="20",B="rt")
4 list(F="20")
Desired output:
ID Value F B H
1 list(F="20",B="rt") 20 rt NA
2 list(F="20",B="rt",H="ty") 20 rt ty
3 list(F="20",B="rt") 20 rt NA
4 list(F="20") 20 NA NA
structure(list(Billing = list(NULL, structure
(list(`EUcust#` = "3",`Cust#` = "5", Com = "I", `Com#` = "6", Add = "Y"), .Names
= c("EUcust#",
"Cust#", "Com", "Com#", "Add"), class
= "data.frame", row.names = 1L))), .Names
= "Billing",
row.names = 8:9, class = "data.frame")
We can use tidyverse
library(tidyverse)
# Turn each element of the list column `Value` into a one-row data frame and
# row-bind them; keys missing from an element end up as NA in its row.
expanded <- map_df(df1$Value, function(fields) do.call(cbind.data.frame, fields))
# Attach the new columns next to the original ID / Value columns.
bind_cols(df1, expanded)
# ID Value F B H
#1 1 20, rt 20 rt <NA>
#2 2 20, rt, ty 20 rt ty
#3 3 20, rt 20 rt <NA>
#4 4 20 20 <NA> <NA>
data
df1 <- structure(list(ID = 1:4, Value = structure(list(structure(list(
F = "20", B = "rt"), .Names = c("F", "B")), structure(list(
F = "20", B = "rt", H = "ty"), .Names = c("F", "B", "H")),
structure(list(F = "20", B = "rt"), .Names = c("F", "B")),
structure(list(F = "20"), .Names = "F")), class = "AsIs")),
.Names = c("ID",
"Value"), row.names = c(NA, -4L), class = "data.frame")

Resources