I want to calculate differences by group. Although I referred to the "R: Function “diff” over various groups" thread on SO, for some unknown reason I am unable to compute the difference. I have tried three methods: a) spread, b) dplyr::mutate with base::diff(), and c) data.table with base::diff(). While a) works, I am unsure how to solve this problem using b) and c).
Description about the data:
I have revenue data for the product by year. I have categorized years >= 2013 as Period 2 (called P2), and years < 2013 as Period 1 (called P1).
Sample data:
dput(Test_File)
structure(list(Ship_Date = c(2010, 2010, 2012, 2012, 2012, 2012,
2017, 2017, 2017, 2016, 2016, 2016, 2011, 2017), Name = c("Apple",
"Apple", "Banana", "Banana", "Banana", "Banana", "Apple", "Apple",
"Apple", "Banana", "Banana", "Banana", "Mango", "Pineapple"),
Revenue = c(5, 10, 13, 14, 15, 16, 25, 25, 25, 1, 2, 4, 5,
7)), .Names = c("Ship_Date", "Name", "Revenue"), row.names = c(NA,
14L), class = "data.frame")
Expected Output
dput(Diff_Table)
structure(list(Name = c("Apple", "Banana", "Mango", "Pineapple"
), P1 = c(15, 58, 5, NA), P2 = c(75, 7, NA, 7), Diff = c(60,
-51, NA, NA)), .Names = c("Name", "P1", "P2", "Diff"), class = "data.frame", row.names = c(NA,
-4L))
Here's my code:
Method 1: Using spread [Works]
data.table::setDT(Test_File)
cutoff<-2013
Test_File[Test_File$Ship_Date>=cutoff,"Ship_Period"]<-"P2"
Test_File[Test_File$Ship_Date<cutoff,"Ship_Period"]<-"P1"
Diff_Table<- Test_File %>%
dplyr::group_by(Ship_Period,Name) %>%
dplyr::mutate(Revenue = sum(Revenue)) %>%
dplyr::select(Ship_Period, Name,Revenue) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
tidyr::spread(key = Ship_Period,value = Revenue) %>%
dplyr::mutate(Diff = `P2` - `P1`)
Method 2: Using dplyr [Doesn't work: NAs are generated in Diff column.]
Diff_Table<- Test_File %>%
dplyr::group_by(Ship_Period,Name) %>%
dplyr::mutate(Revenue = sum(Revenue)) %>%
dplyr::select(Ship_Period, Name,Revenue) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::arrange(Name,Ship_Period, Revenue) %>%
dplyr::group_by(Ship_Period,Name) %>%
dplyr::mutate(Diff = diff(Revenue))
Method 3: Using data.table [Doesn't work: It generates all zeros in Diff column.]
Test_File[,Revenue1 := sum(Revenue),by=c("Ship_Period","Name")]
Diff_Table<-Test_File[,.(Diff = diff(Revenue1)),by=c("Ship_Period","Name")]
Question: Can someone please help me with method 2 and method 3 above? I am fairly new to R so I apologize if my work sounds too basic. I am still learning this language.
We can do this with data.table. Convert the 'data.frame' to 'data.table' (setDT(Test_File)), grouped by the run-length-id of 'Name' and 'Name', get the sum of 'Revenue', reshape it to 'wide' format with dcast, get the difference between 'P2' and 'P1' and assign (:=) it to 'Diff'
library(data.table)
# Sum Revenue within each run of consecutive identical Name values, number the
# runs of each Name with rowid() to build the labels "P1", "P2", ..., spread
# those labels into columns with dcast(), then add Diff by reference.
# NOTE(review): the period labels come from row order, not from Ship_Date, so
# a Name with a single run is always labelled P1 (see Pineapple in the output
# below).
dcast(setDT(Test_File)[, .(Revenue = sum(Revenue)),
.(grp=rleid(Name), Name)], Name~ paste0("P", rowid(Name)),
value.var = "Revenue")[, Diff := P2 - P1][]
# Name P1 P2 Diff
#1: Apple 15 75 60
#2: Banana 58 7 -51
#3: Mango 5 NA NA
#4: Pineapple 7 NA NA
Or for third case, i.e. base R, we create a grouping column based on whether the adjacent elements in 'Name' are the same or not ('grp'), then aggregate the 'Revenue' by 'Name' and 'grp' to find the sum, create a sequence column, reshape it to 'wide' and transform the dataset to create the 'Diff' column
# Start a new group whenever Name differs from the previous row
Test_File$grp <- cumsum(
  c(TRUE, tail(Test_File$Name, -1) != head(Test_File$Name, -1))
)
# Revenue total for each run of a name
d1 <- aggregate(Revenue ~ Name + grp, data = Test_File, FUN = sum)
# Position of each run within its name (1 = first run, 2 = second run, ...)
d1$Seq <- ave(seq_along(d1$Name), d1$Name, FUN = seq_along)
# Drop the run id (column 2) and put one Revenue.<Seq> column per position
wide <- reshape(d1[-2], idvar = "Name", timevar = "Seq", direction = "wide")
transform(wide, Diff = Revenue.2 - Revenue.1)
The tidyverse method can also be done using
library(dplyr)
library(tidyr)
Test_File %>%
  # One row per run of consecutive identical names, with that run's revenue
  group_by(grp = cumsum(c(TRUE, Name[-1] != Name[-length(Name)])), Name) %>%
  summarise(Revenue = sum(Revenue)) %>%
  # Label the runs of each name P1, P2, ... in order of appearance
  group_by(Name) %>%
  mutate(Seq = paste0("P", row_number())) %>%
  select(-grp) %>%
  # pivot_wider() supersedes spread(); a name with a single run gets NA in P2
  pivot_wider(names_from = Seq, values_from = Revenue) %>%
  mutate(Diff = P2 - P1)
#Source: local data frame [4 x 4]
#Groups: Name [4]
# Name P1 P2 Diff
# <chr> <dbl> <dbl> <dbl>
#1 Apple 15 75 60
#2 Banana 58 7 -51
#3 Mango 5 NA NA
#4 Pineapple 7 NA NA
Update
Based on the OP's comments to use only diff function
library(data.table)
# Step 1: total Revenue per run of consecutive identical Name values.
# Step 2: per Name, take the first and second run totals as P1 and P2 and
# their difference via diff(). For a Name with a single run, Revenue[2L] is
# NA and diff() returns a zero-length vector (shown as NA in the output below).
setDT(Test_File)[, .(Revenue = sum(Revenue)), .(Name, grp = rleid(Name))
][, .(P1 = Revenue[1L], P2 = Revenue[2L], Diff = diff(Revenue)) , Name]
# Name P1 P2 Diff
#1: Apple 15 75 60
#2: Banana 58 7 -51
#3: Mango 5 NA NA
#4: Pineapple 7 NA NA
Or with dplyr
# Sum revenue per run of consecutive identical names, then treat the first
# and second run of each name as P1 and P2.
Test_File %>%
  group_by(grp = cumsum(c(TRUE, Name[-1] != Name[-length(Name)])), Name) %>%
  summarise(Revenue = sum(Revenue)) %>%
  group_by(Name) %>%
  # Revenue[2] is NA when a name has only one run; last(Revenue) would repeat
  # the first value there and report a spurious Diff of 0.
  summarise(P1 = Revenue[1], P2 = Revenue[2]) %>%
  mutate(Diff = P2 - P1)
This will do:
library("data.table")
setDT(Test_File)
# Total revenue per (Ship_Date, Name). Each distinct Ship_Date gets its own
# row, so the one-row-per-Name result below relies on every Name having a
# single year on each side of 2013 in this sample.
# NOTE(review): the name `T` masks R's built-in shorthand for TRUE while it
# is defined.
T <- Test_File[, .(P=sum(Revenue)),by=.(Ship_Date, Name)]
# Inner join: the < 2013 rows looked up for every unique Name (CJ), so names
# without an early period are kept with NA. Outer join: the >= 2013 rows
# looked up for that table; its columns that clash come back with an "i."
# prefix, hence P1 = i.P (early period) and P2 = P (late period).
T[Ship_Date>=2013][T[Ship_Date<2013][CJ(Name=T$Name, unique=TRUE), on="Name"], on="Name"][,`:=`(P1=i.P, P2=P, Diff=P-i.P)][]
# Ship_Date Name P i.Ship_Date i.P P1 P2 Diff
# 1: 2017 Apple 75 2010 15 15 75 60
# 2: 2016 Banana 7 2012 58 58 7 -51
# 3: NA Mango NA 2011 5 5 NA NA
# 4: 2017 Pineapple 7 NA NA NA 7 NA
Or with only the wanted columns:
T[Ship_Date>=2013][T[Ship_Date<2013][CJ(Name=T$Name, unique=TRUE), on="Name"], on="Name"][,`:=`(P1=i.P, P2=P, Diff=P-i.P)][,.(Name, P1, P2, Diff)]
# Name P1 P2 Diff
# 1: Apple 15 75 60
# 2: Banana 58 7 -51
# 3: Mango 5 NA NA
# 4: Pineapple NA 7 NA
Here is a variant using setnames():
setnames(T[Ship_Date>=2013][T[Ship_Date<2013][CJ(Name=T$Name, unique=TRUE), on="Name"], on="Name"],
c("P", "i.P"), c("P2", "P1"))[, Diff:=P2-P1][]
Related
I have two data frames with different lengths in rows and columns
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(5, 8, 9),
France = c(4, NA, 7),
Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(1, 9, 7, NA),
France = c(10, 3, 9, 6),
Luxembourg = c(10, NA, NA, 7),
Belgium = c(NA, 8, 1, 9)
) -> df2
I have to create a third df (df3) where,
NA values of df1 are replaced with the values of df2 by matching IDs and Dates, and vice versa (the NAs in df2 are replaced by the values from df1)
The values of df1 are priority (=TRUE)
All those columns that are not in one data frame (like Belgium in this case) should also be included in the df3
df3 should look like this:
Any help would be greatly appreciated
We can do a join on the 'Date' and use fcoalesce to replace the NA with the corresponding non-NA
library(data.table)
# Value columns present in both tables (everything except the Date key)
nm2 <- intersect(names(df2)[-1], names(df1)[-1])
# Work on a copy so df2 itself is not modified by reference
df3 <- copy(df2)
# Update join on Date: inside the join, i.<col> is df1's value and <col> is
# df3's (originally df2's). df1 has priority, so its value goes first and
# df2's is used only where df1's is NA.
setDT(df3)[df1, (nm2) := Map(fcoalesce, mget(paste0('i.', nm2)),
                             mget(nm2)), on = .(Date)]
-output
df3
#         Date Germany France Luxembourg Belgium
#1: 01/01/1990       1     10         10      NA
#2: 01/01/2000       5      4         10       8
#3: 01/01/2010       8      9          6       1
#4: 01/01/2020       9      7          3       9
Or this can be done with tidyverse
library(dplyr)
library(stringr)
# After the join, <col>.x holds df2's value and <col>.y holds df1's.
# df1 has priority, so its value goes first in coalesce() and df2's is the
# fallback. The result is written back into the .x column, which is then
# renamed to the plain column name.
left_join(df2, df1, by = "Date") %>%
  mutate(across(
    ends_with(".x"),
    ~ coalesce(get(str_replace(cur_column(), "\\.x$", ".y")), .)
  )) %>%
  select(-ends_with(".y")) %>%
  rename_with(~ str_remove(., "\\.x$"), ends_with(".x"))
Here is another data.table option
# Value columns shared by both tables, and the names the join gives to
# df2's copies of them
cols <- setdiff(intersect(names(df1), names(df2)), "Date")
i_cols <- paste0("i.", cols)
# Look up df1 for every row of df2 (all df2 dates are kept), fill the gaps
# in df1's values from df2's, then drop df2's copies. The helper columns are
# removed by exact name, so a real column that merely starts with "i" is kept.
setDT(df1)[setDT(df2), on = "Date"][
  ,
  (cols) := Map(
    fcoalesce,
    .SD[, cols, with = FALSE],
    .SD[, i_cols, with = FALSE]
  )
][
  ,
  !i_cols,
  with = FALSE
]
giving
Date Germany France Luxembourg Belgium
1: 01/01/1990 1 10 10 NA
2: 01/01/2000 5 4 10 8
3: 01/01/2010 8 9 6 1
4: 01/01/2020 9 7 3 9
An approach with dplyr only using mutate(across...
I also propose use of full_join instead of left_join or right_join as full_join will take all rows from df1 or df2 as opposed to left or right joins.
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(5, 8, 9),
France = c(4, NA, 7),
Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(1, 9, 7, NA),
France = c(10, 3, 9, 6),
Luxembourg = c(10, NA, NA, 7),
Belgium = c(NA, 8, 1, 9)
) -> df2
library(dplyr)
df1 %>% full_join(df2, by = 'Date', suffix = c('_x', '_y')) %>%
mutate(across(ends_with('_x'), ~coalesce(., get(sub('_x', '_y', cur_column()))),
.names = '{sub("_x", "", {.col})}')) %>%
select(!ends_with('_x') & !ends_with('_y'))
#> Date Belgium Germany France Luxembourg
#> 1 01/01/2000 8 5 4 10
#> 2 01/01/2010 1 8 9 6
#> 3 01/01/2020 9 9 7 3
#> 4 01/01/1990 NA 1 10 10
Created on 2021-05-18 by the reprex package (v2.0.0)
library(tidyverse)
library(lubridate)
df1 <- tibble::tribble(
~Date, ~Germany, ~France, ~Luxembourg,
"01/01/2000", 5, 4, 10,
"01/01/2010", 8, NA, 6,
"01/01/2020", 9, 7, 3
)
df2 <- tibble::tribble(
~Date, ~Germany, ~France, ~Luxembourg, ~Belgium,
"01/01/1990", 1, 10, 10, NA,
"01/01/2000", 9, 3, NA, 8,
"01/01/2010", 7, 9, NA, 1,
"01/01/2020", NA, 6, 7, 9
)
# Stack both tables with a priority tag (df1 = 1, df2 = 2), sort so the
# higher-priority rows come first, and keep for each Date the first
# non-missing value of every column.
bind_rows(
  mutate(df1, priority = 1),
  mutate(df2, priority = 2)
) %>%
  mutate(Date = lubridate::dmy(Date)) %>%
  arrange(priority) %>%
  group_by(Date) %>%
  summarise(across(-priority, ~ first(na.omit(.x))))
#> # A tibble: 4 x 5
#> Date Germany France Luxembourg Belgium
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1990-01-01 1 10 10 NA
#> 2 2000-01-01 5 4 10 8
#> 3 2010-01-01 8 9 6 1
#> 4 2020-01-01 9 7 3 9
Base R solution:
# Column names of each input: cnames => list of character vectors
cnames <- list(names(df1), names(df2))
# Columns wanted in the result, in order of first appearance. Taking the union
# (rather than the longer of the two name vectors) keeps columns that exist in
# only one input, whichever input that is: required_vecs => character vector
required_vecs <- unique(unlist(cnames))
# Full outer join on Date; shared columns come back as <name>.x (from df1)
# and <name>.y (from df2): full_data => data.frame
full_data <- merge(
  df1,
  df2,
  by = "Date",
  all = TRUE
)
# Columns present in both inputs, which therefore need coalescing:
# clsce_required_vecs => character vector
clsce_required_vecs <- setdiff(intersect(names(df1), names(df2)), c("Date"))
# Columns that are copied through unchanged (the key plus columns found in
# only one input): nt_rqrd_vecs => character vector
nt_rqrd_vecs <- setdiff(required_vecs, clsce_required_vecs)
# For each shared column take df1's value and fall back to df2's where df1's
# is NA. The suffixed columns are looked up by exact name, so a column whose
# name merely contains another column's name is never picked up by mistake:
# coalesced_data => data.frame
coalesced_data <- setNames(
  data.frame(
    lapply(
      clsce_required_vecs,
      function(x) {
        from_df1 <- full_data[[paste0(x, ".x")]]
        from_df2 <- full_data[[paste0(x, ".y")]]
        replace(from_df1, is.na(from_df1), from_df2[is.na(from_df1)])
      }
    ),
    row.names = NULL
  ),
  clsce_required_vecs
)
# Column bind and order the columns:
res <- cbind(
  full_data[, nt_rqrd_vecs, drop = FALSE],
  coalesced_data
)[, required_vecs]
I have a data.frame (df1) and I want to include a single, most recent age for each of my samples from another data.frame (df2):
df1$age <- df2$age_9[match(df1$Sample_ID, df2$Sample_ID)]
The problem is that in df2 there are 9 columns for age, as each one indicates the age at a specific check-up date (age_1 is from the first visit, age_9 is the age at the 9th visit), and patients don't make all their visits.
How do I add the most recently obtained age from a non empty check up date?
aka, if age_9 == "." replace "." with age_8 then if age_8 == "." replace "." with age_7 ... etc
From this:
View(df1)
Sample Age
1 50
2 .
3 .
To:
View(df1)
Sample Age
1 50
2 49
3 30
From the data df2
View(df2)
Sample Age_1 Age_2 Age_3
1 40 42 44
2 35 49 .
3 30 . .
This is my attempt:
df1$age[which(df1$age == ".")] <- df2$age_8[match(df1$Sample_ID, df2$Sample_ID)]
With base R, we can use max.col to return the last column index for each row, where the 'Age' columns are not ., cbind with sequence of rows to return a row/column index, extract the elements and change the 'Age' column in 'df1', where the 'Age' is .
# For every row of df2, find the right-most Age_* entry that is not "." and
# use it wherever df1 has "." as its age.
age_cols <- df2[-1]
last_filled <- max.col(age_cols != ".", ties.method = "last")
latest_age <- age_cols[cbind(seq_len(nrow(df2)), last_filled)]
df1$Age <- ifelse(df1$Age == ".", latest_age, df1$Age)
# Re-infer the column types; character columns stay character (as.is = TRUE)
df1 <- type.convert(df1, as.is = TRUE)
-output
df1
# Sample Age
#1 1 50
#2 2 49
#3 3 30
or using tidyverse, by reshaping into 'long' format and then doing a join after slicing the last row grouped by 'Sample'
library(dplyr)
library(tidyr)
# Latest recorded age per Sample from df2, used to fill the gaps in df1.
df2 %>%
# "." cannot be parsed as an integer, so it becomes NA (R warns about the coercion)
mutate(across(starts_with('Age'), as.integer)) %>%
# One row per (Sample, Age_* column); rows whose value is NA are dropped
pivot_longer(cols = starts_with('Age'), values_drop_na = TRUE) %>%
group_by(Sample) %>%
# Rows keep the Age_1, Age_2, ... column order, so the last one is the most recent visit
slice_tail(n = 1) %>%
ungroup %>%
select(-name) %>%
# No `by` is given, so the join uses the columns the two tables share
right_join(df1) %>%
# Prefer df1's own age; fall back to the value taken from df2
transmute(Sample, Age = coalesce(as.integer(Age), value))
-output
# A tibble: 3 x 2
# Sample Age
# <int> <int>
#1 1 50
#2 2 49
#3 3 30
data
df1 <- structure(list(Sample = 1:3, Age = c("50", ".", ".")),
class = "data.frame",
row.names = c(NA,
-3L))
df2 <- structure(list(Sample = 1:3, Age_1 = c(40L, 35L, 30L), Age_2 = c("42",
"49", "."), Age_3 = c("44", ".", ".")), class = "data.frame",
row.names = c(NA,
-3L))
I am processing a large dataset adapted to my research. Suppose that I have 4 observations (records) and 5 columns as follows:
x <- data.frame("ID" = c(1, 2, 3, 4),
"group1" = c("A", NA, "B", NA),
"group2" = c("B", "A", NA, "C"),
"hours1" = c(3, NA, 5, NA),
"hours2" = c(1, 2, NA, 5))
> x
ID group1 group2 hours1 hours2
1 A B 3 1
2 <NA> A NA 2
3 B <NA> 5 NA
4 <NA> C NA 5
The "group1" and "group2" are reference columns containing the character values of A, B, and C, and the last two columns, "hours1" and "hours2," are numeric indicating hours obviously.
The column "group1" is corresponding to the column "hours1"; likewise, "group2" is corresponding to "hours 2."
I want to create multiple columns according to the values, A, B, and C, of the reference columns matching to values of "hours1" and "hours2" as follows:
ID group1 group2 hours1 hours2 A B C
1 A B 3 1 3 1 NA
2 <NA> A NA 2 2 NA NA
3 B <NA> 5 NA NA 5 NA
4 <NA> C NA 5 NA NA 5
For example, ID 1 has A in "group1," corresponding to 3 in "hours1" which is found under the column "A." ID 3 has B in "group1," corresponding to 5 in "hours1" which is found under the columns "B." In "group 2," ID 4 has C, corresponding to 5 in hours2 which is found under column "C."
Is there a way to do it using R?
One way would be to combine all the "hour" column in one column and "group" columns in another column. This can be done using pivot_longer. After that we can get data in wide format and join it with original data.
library(dplyr)
library(tidyr)
# Build one column per group value (A, B, C) holding the matching hours,
# then attach those columns to the original data.
x %>%
# The pattern strips the trailing digits, so group1/group2 collapse into a
# single `group` column and hours1/hours2 into `hours` (via ".value");
# rows where the values are NA are dropped.
pivot_longer(cols = -ID,
names_to = c('.value'),
names_pattern = '(.*?)\\d+',
values_drop_na = TRUE) %>%
# One column per distinct group value, filled with that group's hours
pivot_wider(names_from = group, values_from = hours) %>%
# Bring back the original group*/hours* columns
left_join(x, by = 'ID') %>%
# Column order: ID, group*, hours*, then the new per-group columns
select(ID, starts_with('group'), starts_with('hour'), everything())
# A tibble: 4 x 8
# ID group1 group2 hours1 hours2 A B C
# <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 A B 3 1 3 1 NA
#2 2 NA A NA 2 2 NA NA
#3 3 B NA 5 NA NA 5 NA
#4 4 NA C NA 5 NA NA 5
For OP's dataset we can slightly modify the code to achieve the desired result.
zz %>%
pivot_longer(cols = -id,
names_to = c('.value'),
names_pattern = '(.*)_',
values_drop_na = TRUE) %>%
arrange(fu2a) %>%
pivot_wider(names_from = fu2a, values_from = fu2b) %>%
left_join(zz, by = 'id') %>%
select(id, starts_with('fu2a'), starts_with('fu2b'), everything())
Another approach using dplyr could be done separating group and hours variables to compute the desired variables and then merge with the original x:
library(tidyverse)
#Data
x <- data.frame(
  ID = c(1, 2, 3, 4),
  group1 = c("A", NA, "B", NA),
  group2 = c("B", "A", NA, "C"),
  hours1 = c(3, NA, 5, NA),
  hours2 = c(1, 2, NA, 5),
  stringsAsFactors = FALSE
)
#Reshape
# Group columns in long form; `id` numbers the slots (1, 2) within each ID
groups_long <- x %>%
  select(1:3) %>%
  pivot_longer(cols = -ID) %>%
  group_by(ID) %>%
  mutate(id = row_number())

# Hours columns in long form, numbered the same way so slot k of the groups
# lines up with slot k of the hours
hours_long <- x %>%
  select(c(1, 4:5)) %>%
  pivot_longer(cols = -ID) %>%
  rename(name2 = name, value2 = value) %>%
  group_by(ID) %>%
  mutate(id = row_number())

# Pair every group with its hours, drop the empty slots, and spread the
# group values (A, B, C) into columns holding the matching hours
hours_by_group <- groups_long %>%
  left_join(hours_long, by = c("ID", "id")) %>%
  filter(!is.na(value)) %>%
  select(ID, value, value2) %>%
  pivot_wider(names_from = value, values_from = value2)

# Attach the per-group columns to the original data
x %>%
  left_join(hours_by_group, by = "ID")
Output:
ID group1 group2 hours1 hours2 A B C
1 1 A B 3 1 3 1 NA
2 2 <NA> A NA 2 2 NA NA
3 3 B <NA> 5 NA NA 5 NA
4 4 <NA> C NA 5 NA NA 5
I have two data frames with the same structure and would like to merge them into a single data frame with the same columns, while also combining one of the columns.
The two dataframes look like this:
year variable
1 1968 2
2 1969 5
3 1970 <NA>
4 1971 <NA>
5 1972 <NA>
year variable
1 1968 <NA>
2 1969 <NA>
3 1970 5
4 1971 7
5 1972 <NA>
and I would like to end up with a data frame that looks like this:
year variable
1 1968 2
2 1969 5
3 1970 5
4 1971 7
5 1972 <NA>
Thanks in advance!
Use coalesce from dplyr:
df1 <- bind_cols(year = seq(1968, 1972, 1),
variable = c(2, 5, NA, NA, NA))
df2 <- bind_cols(year = seq(1968, 1972, 1),
variable = c(NA, NA, 5, 7, NA))
left_join(df1, df2, by='year') %>%
mutate(
variable = coalesce(variable.x, variable.y)
) %>%
select(year, variable)
you can use dplyr package, and remove the NAs.
First you need to transform into proper NA
df1 <- data.frame(year = c(1968,1969,1970,1971,1972),
variable = c(2,5,'<NA>', '<NA>', '<NA>'))
df2 <- data.frame(year= c(1968,1969,1970,1971,1972),
variable = c('<NA>', '<NA>', 5, 7, '<NA>'))
library(dplyr)
# Turn the literal "<NA>" strings into real missing values in both tables.
df1 <- df1 %>%
mutate(variable = ifelse(variable == "<NA>", NA, variable))
df2 <- df2 %>%
mutate(variable = ifelse(variable == "<NA>", NA, variable))
# No `by` is given, so full_join() matches on every shared column
# (year and variable).
# NOTE(review): na.omit() then drops every row that still contains an NA, so
# a year that is missing in both inputs (1972 here) would not appear in df_m;
# confirm that is acceptable.
df_m <- full_join(df1,df2) %>%
na.omit(variable)
You can use merge, but I prefer the tidyverse.
library(tidyverse)
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#> # A tibble: 4 x 5
#> Date col1 thisCol thatCol col999
#> <date> <int> <dbl> <int> <dbl>
#> 1 2020-01-01 1 NA 25 99
#> 2 2020-01-01 2 8 26 99
#> 3 2020-01-01 3 NA 27 99
#> 4 NA 4 3 28 99
My actual R data frame has hundreds of columns that aren't neatly named, but can be approximated by the df data frame above.
I want to replace all values of NA with 0, with the exception of several columns (in my example I want to leave out the Date column and the thatCol column). I'd want to do it in this sort of fashion:
df %>% replace(is.na(.), 0)
#> Error: Assigned data `values` must be compatible with existing data.
#> i Error occurred for column `Date`.
#> x Can't convert <double> to <date>.
#> Run `rlang::last_error()` to see where the error occurred.
And my unsuccessful ideas for accomplishing the "everything except" replace NA are shown below.
df %>% replace(is.na(c(., -c(Date, thatCol)), 0))
df %>% replace_na(list([, c(2:3, 5)] = 0))
df %>% replace_na(list(everything(-c(Date, thatCol)) = 0))
Is there a way to select "everything but" in the way I need to? There are hundreds of columns, named inconsistently, so typing them one by one is not a practical option.
You can use mutate_at :
library(dplyr)
Remove them by Name
df %>% mutate_at(vars(-c(Date, thatCol)), ~replace(., is.na(.), 0))
Remove them by position
df %>% mutate_at(-c(1,4), ~replace(., is.na(.), 0))
Select them by name
df %>% mutate_at(vars(col1, thisCol, col999), ~replace(., is.na(.), 0))
Select them by position
df %>% mutate_at(c(2, 3, 5), ~replace(., is.na(.), 0))
If you want to use replace_na
df %>% mutate_at(vars(-c(Date, thatCol)), tidyr::replace_na, 0)
Note that mutate_at is soon going to be replaced by across in dplyr 1.0.0.
You have several options here based on data.table.
One of the coolest options: setnafill (version >= 1.12.4):
library(data.table)
setDT(df)
# Fill NA with 0 in every column except Date and thatCol. The column names
# to skip are given as strings and removed with setdiff().
data.table::setnafill(df, fill = 0, cols = setdiff(names(df), c("Date", "thatCol")))
Note that your dataframe is updated by reference.
Another base solution:
# Positions of the columns whose names start with "this" or "col"
to_change <- grep("^(this|col)", names(df))
# lapply() keeps each column as its own vector; sapply() would first simplify
# the result to a matrix and force every column to one common type.
df[to_change] <- lapply(df[to_change], function(x) replace(x, is.na(x), 0))
df
# A tibble: 4 x 5
Date col1 thisCol thatCol col999
<date> <dbl> <dbl> <int> <dbl>
1 2020-01-01 1 0 25 99
2 2020-01-01 2 8 26 99
3 2020-01-01 3 0 27 99
4 NA 0 3 28 99
Data(I changed one value):
df <- structure(list(Date = structure(c(18262, 18262, 18262, NA), class = "Date"),
col1 = c(1L, 2L, 3L, NA), thisCol = c(NA, 8, NA, 3), thatCol = 25:28,
col999 = c(99, 99, 99, 99)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
replace works on a data.frame, so we can just do the replacement by index and update the original dataset
df[-c(1, 4)] <- replace(df[-c(1, 4)], is.na(df[-c(1, 4)]), 0)
Or using replace_na with across (from the new dplyr)
library(dplyr)
library(tidyr)
df %>%
mutate(across(-c(Date, thatCol), ~ replace_na(., 0)))
If you know the ones that you don't want to change, you could do it like this:
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#dplyr
df_nonreplace <- select(df, c("Date", "thatCol"))
df_replace <- df[ ,!names(df) %in% names(df_nonreplace)]
df_replace[is.na(df_replace)] <- 0
df <- cbind(df_nonreplace, df_replace)
> head(df)
Date thatCol col1 thisCol col999
1 2020-01-01 25 1 0 99
2 2020-01-01 26 2 8 99
3 2020-01-01 27 3 0 99
4 <NA> 28 4 3 99