Conditionally Create New Column Based on Row Values - r

thanks in advance for any assistance.
I have a dataframe:
df <- structure(list(ID = c("0001", "0002", "0003", "0004"), May_1 = c(1,
2, 1, 3), May_5 = c(NA, 1, 2, 1), May_10 = c(NA, 3, 3, NA), May_16 = c(2,
NA, NA, NA), May_20 = c(3, NA, NA, 2)), row.names = c(NA, -4L
), class = c("tbl_df", "tbl", "data.frame"))
I would like to create new columns named "First Preference", "Second Preference" and "Third Preference" based on the row values for each response.
If a row value == 1, I would like to append a column called "First Preference" that contains the column name where the row value == 1.
My actual data contains about 40 dates that will be changing week over week, so a generalizable solution is most appreciated.
Here's the ideal df:
df_ideal <- structure(list(ID = c("0001", "0002", "0003", "0004"), May_1 = c(1,
2, 1, 3), May_5 = c(NA, 1, 2, 1), May_10 = c(NA, 3, 3, NA), May_16 = c(2,
NA, NA, NA), May_20 = c(3, NA, NA, 2), First_Preference = c("May_1",
"May_5", "May_1", "May_5"), Second_Preference = c("May_16", "May_1",
"May_5", "May_20"), Third_Preference = c("May_20", "May_10",
"May_10", "May_1")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
A tidyverse solution would be preferred, but I'm certainly open to anything.
Thanks!

In base R, we can use apply to work row-wise: order the values in each row (dropping the NA values) and pick up the corresponding column names.
# Names of the preference columns to append, in rank order.
cols <- paste(c("First", "Second", "Third"), "Preference", sep = "_")
# Candidate columns: everything except the leading ID column.
date_cols <- names(df)[-1]
# For every row, order the date columns by their rank (NAs dropped) and keep
# exactly length(cols) of them. Indexing with seq_along(cols) pads with NA when
# a row has fewer ranked dates and truncates when it has more, so apply()
# always returns a rectangular matrix that t() can turn into one row per ID.
df[cols] <- t(apply(df[-1], 1, function(x) {
  date_cols[order(x, na.last = NA)[seq_along(cols)]]
}))
df
# A tibble: 4 x 9
# ID May_1 May_5 May_10 May_16 May_20 First_Preference Second_Preference Third_Preference
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#1 0001 1 NA NA 2 3 May_1 May_16 May_20
#2 0002 2 1 3 NA NA May_5 May_1 May_10
#3 0003 1 2 3 NA NA May_1 May_5 May_10
#4 0004 3 1 NA NA 2 May_5 May_20 May_1

We can reshape it to 'long' format, while dropping the NA elements with values_drop_na, then use the 'value' column as index to change the labels and then convert back to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df %>%
  # Long format: one row per (ID, date column) that carries a rank.
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  # Use the rank (1, 2, 3) as an index into the label vector. The lookup is
  # element-wise, so no grouping is needed.
  mutate(value = c("First_Preference", "Second_Preference",
                   "Third_Preference")[value]) %>%
  # Wide again: one column per preference label holding the date column name.
  pivot_wider(names_from = value, values_from = name) %>%
  # Attach the preference columns to the original data; the key is explicit so
  # the join neither guesses it nor prints a "Joining with" message.
  left_join(df, ., by = "ID")
# A tibble: 4 x 9
# ID May_1 May_5 May_10 May_16 May_20 First_Preference Second_Preference Third_Preference
#* <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#1 0001 1 NA NA 2 3 May_1 May_16 May_20
#2 0002 2 1 3 NA NA May_5 May_1 May_10
#3 0003 1 2 3 NA NA May_1 May_5 May_10
#4 0004 3 1 NA NA 2 May_5 May_20 May_1
To get the column names automatically, we can use ordinal from english
library(english)
library(stringr)
df %>%
  # Long format: one row per (ID, date column) that carries a rank.
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  # Turn the rank into a column label ("first_preference", ...). The
  # conversion is element-wise, so no grouping is needed.
  mutate(value = str_c(ordinal(value), "_preference")) %>%
  # Wide again: one column per label holding the date column name.
  pivot_wider(names_from = value, values_from = name) %>%
  # Attach to the original data with an explicit join key.
  left_join(df, ., by = "ID")
Or using data.table
library(data.table)
# Convert df to a data.table by reference, then join it with a wide table that
# has one "<ordinal>_preference" column per rank, holding the date column name.
setDT(df)
df[
  dcast(
    melt(df, id.vars = "ID", na.rm = TRUE),
    ID ~ paste0(ordinal(value), "_preference"),
    value.var = "variable"
  ),
  on = "ID"
]
# ID May_1 May_5 May_10 May_16 May_20 first_preference second_preference third_preference
#1: 0001 1 NA NA 2 3 May_1 May_16 May_20
#2: 0002 2 1 3 NA NA May_5 May_1 May_10
#3: 0003 1 2 3 NA NA May_1 May_5 May_10
#4: 0004 3 1 NA NA 2 May_5 May_20 May_1

Related

Merge column if duplicates in rows between columns

I have a dataframe such as :
COL1 COL2 COL3 COL4 COL5 COL6 COL7
1 Sp1-2 Sp1-2 Sp3_2-54 Sp3-2 Sp3-2 Sp3-2 SP9-43
2 Sp5-1 Sp5-2 Sp2-4 Sp9-2 Sp10-3 SP9-90 NA
3 Sp_7-3 Sp_7-3 NA SP6-56 Sp2-7 SP3-3 NA
And I would simply like to merge columns when at least two elements are duplicated.
for example, in COL1 and COL2, Sp1-2 & Sp_7-3 are duplicated in both columns, then I merge it that way by adding a pipe "|" between non-duplicated elements:
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
Here is the dput format :
structure(list(COL1 = c("Sp1-2", "Sp5-1", "Sp_7-3"), COL2 = c("Sp1-2",
"Sp5-2", "Sp_7-3"), COL3 = c("Sp3_2-54", "Sp2-4", NA), COL4 = c("Sp3-2",
"Sp9-2", "SP6-56"), COL5 = c("Sp3-2", "Sp10-3", "Sp2-7"), COL6 = c("Sp3-2",
"SP9-90", "SP3-3"), COL7 = c("SP9-43", NA, NA)), class = "data.frame", row.names = c(NA,
-3L))
Another example :
G136 G348 G465
1 NA NA NA
2 NA NA NA
3 SP4-140 SP4-140 NA
4 SP2-8 NA NA
5 SP3-59 NA NA
6 SP1_contig.682-8 NA SP1_contig.682-8
expected output:
G136|G348|G465
1 NA
2 NA
3 SP4-140
4 SP2-8
5 SP3-59
6 SP1_contig.682-8
The dput format:
dat<- structure(list(G136 = c(NA, NA, "SP4-140", "SP2-8", "SP3-59", "SP1_contig.682-8", NA, NA, NA), G348 = c(NA, NA, "SP4-140", NA, NA, NA, NA, NA, NA), G465 = c(NA, NA, NA, NA, NA, "SP1_contig.682-8", NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
This is probably best handled by reshaping your data first, then it's straight forward to use various groupings to achieve your desired result:
library(tidyr)
library(dplyr)
# Merge columns that share a value in the same row; merged columns are named
# by joining the original names with "|".
dat %>%
# Add a `rowid` column so every original row can be identified after reshaping.
rowid_to_column() %>%
# Long format: one row per (rowid, original column name, cell value).
pivot_longer(-rowid) %>%
filter(!is.na(value)) %>%
# Within a row, columns holding the same value are linked: collect their names.
group_by(rowid, value) %>%
mutate(new_name = paste(name, collapse = "|")) %>%
# Split the linked names back out, one per row, so they can be pooled per column.
separate_rows(new_name, sep = "\\|") %>%
# For each original column, pool every column name it was ever linked with.
group_by(name) %>%
mutate(new_name = paste(unique(new_name), collapse = "|")) %>%
# For each value, keep only the rows carrying the longest merged name.
# NOTE(review): presumably this discards partial merges in favour of the
# widest one; it compares string length, not number of columns -- confirm.
group_by(value) %>%
filter(nchar(new_name) == max(nchar(new_name))) %>%
ungroup() %>%
select(-name) %>%
# Wide again: one column per merged name; distinct values that land in the
# same cell are joined with "|".
pivot_wider(names_from = new_name, values_from = value, values_fn = ~ paste(unique(.x), collapse = "|")) %>%
# Re-insert rowids that vanished when their NA-only rows were filtered out.
complete(rowid = full_seq(c(1, rowid), 1))
# A tibble: 3 × 5
rowid `COL1|COL2` COL3 `COL4|COL5|COL6` COL7
<dbl> <chr> <chr> <chr> <chr>
1 1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
And using the data in your second example gives:
# A tibble: 6 × 2
rowid `G136|G348|G465`
<dbl> <chr>
1 1 NA
2 2 NA
3 3 SP4-140
4 4 SP2-8
5 5 SP3-59
6 6 SP1_contig.682-8
It's really messy...but you may try
library(igraph)
library(stringdist)
library(data.table)
# Step 1: find every pair of columns that hold an identical value in at least
# one row. A Levenshtein distance of 0 means the two strings are equal; rows
# where either value is NA give NA and are dropped by na.omit().
# df[[i]] is used so the code also works when df is a tibble.
d <- c()
for (i in seq_len(ncol(df) - 1)) {
  for (j in (i + 1):ncol(df)) {
    if (any(na.omit(stringdist(df[[i]], df[[j]], method = "lv") == 0))) {
      d <- rbind(d, c(i, j))
    }
  }
}

# Step 2: treat the pairs as edges of an undirected graph; every connected
# component is one set of column indices that must be merged together.
dd <- data.table(d)
net <- graph_from_data_frame(d = dd, directed = FALSE)
key <- split(names(V(net)), components(net)$membership)

# Step 3: build one merged column per component. Within a row, the distinct
# non-missing values are joined with "|"; a row with no value at all stays NA
# instead of becoming the string "NA".
names_dummy <- c()
df_dummy <- c()
for (i in key) {
  i <- as.numeric(i)
  names_dummy <- c(names_dummy, paste0(colnames(df)[i], collapse = "|"))
  df_dummy <- cbind(df_dummy, apply(df[, i], 1, function(x) {
    vals <- unique(x[!is.na(x)])
    if (length(vals) == 0) NA_character_ else paste0(vals, collapse = "|")
  }))
}
colnames(df_dummy) <- names_dummy
df_dummy

# Step 4: put the merged columns next to the untouched ones and sort the
# columns by name. drop = FALSE keeps a data frame even when only one column
# is left on either side.
res <- cbind(df_dummy, df[, -as.numeric(unlist(key)), drop = FALSE])
res <- res[, sort(colnames(res)), drop = FALSE]
res
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 <NA>
3 Sp_7-3 <NA> SP6-56|Sp2-7|SP3-3 <NA>

Reshaping a dataframe in R by sorting just some fields in a row alphabetically

I have a few large dataframes in RStudio, that have this structure:
Original data structure
structure(list(CHROM = c("scaffold1000|size223437", "scaffold1000|size223437",
"scaffold1000|size223437", "scaffold1000|size223437"), POS = c(666,
1332, 3445, 4336), REF = c("A", "TA", "CTTGA", "GCTA"), RO = c(20,
14, 9, 25), ALT_1 = c("GAT", "TGC", "AGC", "T"), ALT_2 = c("CAG",
"TGA", "CGC", NA), ALT_3 = c("G", NA, "TGA", NA), ALT_4 = c("AGT",
NA, NA, NA), AO_1 = c(13, 4, 67, 120), AO_2 = c(12, 5, 34, NA
), AO_3 = c(6, NA, 18, NA), AO_4 = c(101, NA, NA, NA), AOF_1 = c(8.55263157894737,
17.3913043478261, 52.34375, 82.7586206896552), AOF_2 = c(7.89473684210526,
21.7391304347826, 26.5625, NA), AOF_3 = c(3.94736842105263, NA,
14.0625, NA), AOF_4 = c(66.4473684210526, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-4L))
But for an analysis I need it to look like this:
Desired output
structure(list(CHROM = c("scaffold1000|size223437", "scaffold1000|size223437",
"scaffold1000|size223437", "scaffold1000|size223437"), POS = c(666,
1332, 3445, 4336), REF = c("A", "TA", "CTTGA", "GCTA"), RO = c(20,
14, 9, 25), ALT_1 = c("AGT", "TGA", "AGC", "T"), ALT_2 = c("CAG",
"TGC", "CGC", NA), ALT_3 = c("G", NA, "TGA", NA), ALT_4 = c("GAT",
NA, NA, NA), AO_1 = c(101, 5, 67, 120), AO_2 = c(12, 4, 34, NA
), AO_3 = c(6, NA, 18, NA), AO_4 = c(13, NA, NA, NA), AOF_1 = c(66.4473684210526,
21.7391304347826, 52.34375, 82.7586206896552), AOF_2 = c(7.89473684210526,
17.3913043478261, 26.5625, NA), AOF_3 = c(3.94736842105263, NA,
14.0625, NA), AOF_4 = c(8.55263157894737, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-4L))
So what I would like to do is to rearrange the content of a row in a way, that the columns ALT_1, ALT_2, ALT_3, ALT_4 are alphabetically sorted, but at the same time I also need to rearrange the corresponding columns of AO and AOF, so that the values still match.
(The value of AO_1 should still match with the sequence that was in ALT_1.
So if ALT_1 becomes ALT_2 in the sorted dataframe, AO_1 should also become AO_2)
What I tried so far, but didn't work:
Pasting the values of ALT_1, AO_1, AOF_1 all in one field, so I have them together with
if (is.na(X[i,6]) == FALSE) {
X[i,6] <- paste(X[i,6],X[i,10],X[i,14],sep=" ")
}
}
And then I wanted to extract every row as a vector to sort the values and put it back in the dataframe, but I didn't manage to do this.
So the question would be how I can order the dataframe to get the desired output?
(I need to apply this to 32 dataframes with each having >100.000 values)
Here is a dplyr solution. It took me some time, and I needed some help with one issue along the way (pivot_wider dissolves arrange):
library(dplyr)
library(tidyr)
# Sort the ALT_* values of every row while keeping each ALT value paired with
# its AO and AOF values.
df1 %>%
# Row identifier used to sort within each original row.
mutate(id = row_number()) %>%
# Glue each ALT/AO/AOF triple into one "_"-separated string so the three
# values move together.
unite("conc1", c(ALT_1, AO_1, AOF_1), sep = "_") %>%
unite("conc2", c(ALT_2, AO_2, AOF_2), sep = "_") %>%
unite("conc3", c(ALT_3, AO_3, AOF_3), sep = "_") %>%
unite("conc4", c(ALT_4, AO_4, AOF_4), sep = "_") %>%
# Long format: one row per (id, conc slot).
pivot_longer(
starts_with("conc")
) %>%
# A triple that was entirely NA was glued into "NA_NA_NA"; turn it back into a real NA.
mutate(value = ifelse(value=="NA_NA_NA", NA_character_, value)) %>%
# Sort the glued strings within each original row, NAs last.
# NOTE(review): this sorts the whole glued string, which starts with the ALT
# value; presumably that matches sorting by ALT alone -- confirm for ALT
# values that are prefixes of one another.
group_by(id) %>%
mutate(value = sort(value, na.last = TRUE)) %>%
ungroup() %>%
# Back to wide: the sorted strings fill conc1..conc4 in order.
pivot_wider(
names_from = name,
values_from = value,
values_fill = "0"
) %>%
# Split every glued string back into its ALT/AO/AOF columns.
separate(conc1, c("ALT_1", "AO_1", "AOF_1"), sep = "_") %>%
separate(conc2, c("ALT_2", "AO_2", "AOF_2"), sep = "_") %>%
separate(conc3, c("ALT_3", "AO_3", "AOF_3"), sep = "_") %>%
separate(conc4, c("ALT_4", "AO_4", "AOF_4"), sep = "_") %>%
# Restore the original column order (this also drops the helper id column).
select(CHROM, POS, REF, RO, starts_with("ALT"), starts_with("AO_"), starts_with("AOF_")) %>%
# The split pieces are character; convert the numeric ones back.
type.convert(as.is=TRUE)
CHROM POS REF RO ALT_1 ALT_2 ALT_3 ALT_4 AO_1 AO_2 AO_3 AO_4 AOF_1 AOF_2 AOF_3 AOF_4
<chr> <int> <chr> <int> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 scaffold1000|size223437 666 A 20 AGT CAG G GAT 101 12 6 13 66.4 7.89 3.95 8.55
2 scaffold1000|size223437 1332 TA 14 TGA TGC NA NA 5 4 NA NA 21.7 17.4 NA NA
3 scaffold1000|size223437 3445 CTTGA 9 AGC CGC TGA NA 67 34 18 NA 52.3 26.6 14.1 NA
4 scaffold1000|size223437 4336 GCTA 25 T NA NA NA 120 NA NA NA 82.8 NA NA NA
here is a data.table approach
library(data.table)
# Set to data.table format (by reference)
setDT(mydata)
# Melt to long format: one row per (variant, allele slot), with the ALT, AO
# and AOF values of that slot kept side by side
DT.melt <- melt(mydata, measure.vars = patterns(ALT = "^ALT_", AO = "^AO_", AOF = "^AOF_"))
# Order by groups, NAs at the end
setorderv(DT.melt, cols = c("CHROM", "POS", "ALT"), na.last = TRUE)
# Cast to wide again, using rowid() for numbering. The counter must restart
# for every variant, i.e. for every combination of the id columns on the
# left-hand side; counting within REF alone keeps counting across all variants
# that share the same REF string and produces ALT_5, ALT_6, ... columns.
dcast(
  DT.melt,
  CHROM + POS + REF + RO ~ rowid(CHROM, POS, REF, RO),
  value.var = list("ALT", "AO", "AOF")
)
# CHROM POS REF RO ALT_1 ALT_2 ALT_3 ALT_4 AO_1 AO_2 AO_3 AO_4 AOF_1 AOF_2 AOF_3 AOF_4
# 1: scaffold1000|size223437 666 A 20 AGT CAG G GAT 101 12 6 13 66.44737 7.894737 3.947368 8.552632
# 2: scaffold1000|size223437 1332 TA 14 TGA TGC <NA> <NA> 5 4 NA NA 21.73913 17.391304 NA NA
# 3: scaffold1000|size223437 3445 CTTGA 9 AGC CGC TGA <NA> 67 34 18 NA 52.34375 26.562500 14.062500 NA
# 4: scaffold1000|size223437 4336 GCTA 25 T <NA> <NA> <NA> 120 NA NA NA 82.75862 NA NA NA

Keep only one of the duplicated values in the same column and keeping NAs as such as well

I got column like this with some duplicated values
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1), date = c(NA, NA,
NA, "2011/01/01", "2011/02/01", "2012/01/01", "2012/01/01", "2012/05/01"
)), class = "data.frame", row.names = c(NA, -8L))
I want to keep only one of the duplicated values, like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1),
date2 = c(NA, NA, NA, "2011/01/01", "2011/02/01", "2012/01/01", "2012/05/01")),
class = "data.frame", row.names = c(NA, -7L))
Depending on what you want exactly there are multiple alternatives:
dat %>%
filter(!duplicated(date))
gives
id date
1 1 <NA>
2 1 2011/01/01
3 1 2011/02/01
4 1 2012/01/01
As someone else also suggested, it gives the same result as
# Keep the first row for every distinct date, retaining all other columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dat %>% distinct(date, .keep_all = TRUE)
In contrast to that person, I added a column to the distinct function, as I assumed you only want to remove the duplicated dates, not necessarily duplicates in other columns (and .keep_all is then necessary to keep those other columns).
However, it is unclear to me whether you want to keep all NAs or not, because in that case you need to add back the rows that contain just the NAs.
if you want all NAs you could for example do:
# Keep the first occurrence of every non-missing date, then append all rows
# whose date is NA unchanged (they end up at the bottom of the result).
bind_rows(
  filter(dat, !is.na(date), !duplicated(date)),
  filter(dat, is.na(date))
)
which gives
id date
1 1 2011/01/01
2 1 2011/02/01
3 1 2012/01/01
4 1 <NA>
5 1 <NA>
6 1 <NA>
Although there probably is a nicer way to do this.
Edit:
If you want to keep the entries but only want to make the duplicated values NA you can use the duplicated function this way:
dat %>%
mutate(
date1 = case_when(
duplicated(date) ~ NA_character_,
TRUE ~ date
)
)
I generally prefer case_when over if_else due to its readability. But in this case it would be the same.
It results in
id date date1
1 1 <NA> <NA>
2 1 <NA> <NA>
3 1 <NA> <NA>
4 1 2011/01/01 2011/01/01
5 1 2011/02/01 2011/02/01
6 1 2012/01/01 2012/01/01
7 1 2012/01/01 <NA>
8 1 2012/05/01 2012/05/01
I created an extra column for this example. But you could simply overwrite the date column in your actual analysis.
You can use dplyr::distinct:
library(tidyverse)
df <- structure(list(id = c(1, 1, 1, 1, 1, 1), date = c(NA, NA, NA,
"2011/01/01", "2011/02/01", "2012/01/01")), row.names = c(NA, 6L), class = "data.frame")
df
#> id date
#> 1 1 <NA>
#> 2 1 <NA>
#> 3 1 <NA>
#> 4 1 2011/01/01
#> 5 1 2011/02/01
#> 6 1 2012/01/01
df %>%
distinct()
#> id date
#> 1 1 <NA>
#> 2 1 2011/01/01
#> 3 1 2011/02/01
#> 4 1 2012/01/01

R Mutating multiple columns with matching

I am processing a large dataset adapted to my research. Suppose that I have 4 observations (records) and 5 columns as follows:
x <- data.frame("ID" = c(1, 2, 3, 4),
"group1" = c("A", NA, "B", NA),
"group2" = c("B", "A", NA, "C"),
"hours1" = c(3, NA, 5, NA),
"hours2" = c(1, 2, NA, 5))
> x
ID group1 group2 hours1 hours2
1 A B 3 1
2 <NA> A NA 2
3 B <NA> 5 NA
4 <NA> C NA 5
The "group1" and "group2" are reference columns containing the character values of A, B, and C, and the last two columns, "hours1" and "hours2," are numeric indicating hours obviously.
The column "group1" is corresponding to the column "hours1"; likewise, "group2" is corresponding to "hours 2."
I want to create multiple columns according to the values, A, B, and C, of the reference columns matching to values of "hours1" and "hours2" as follows:
ID group1 group2 hours1 hours2 A B C
1 A B 3 1 3 1 NA
2 <NA> A NA 2 2 NA NA
3 B <NA> 5 NA NA 5 NA
4 <NA> C NA 5 NA NA 5
For example, ID 1 has A in "group1," corresponding to 3 in "hours1" which is found under the column "A." ID 3 has B in "group1," corresponding to 5 in "hours1" which is found under the columns "B." In "group 2," ID 4 has C, corresponding to 5 in hours2 which is found under column "C."
Is there a way to do it using R?
One way would be to combine all the "hour" column in one column and "group" columns in another column. This can be done using pivot_longer. After that we can get data in wide format and join it with original data.
library(dplyr)
library(tidyr)
# Stack group1/group2 and hours1/hours2 into paired `group` / `hours` columns
# (".value" takes the output column name from the part of the name matched by
# the pattern), spread the hours into one column per group label, and attach
# the result to the original data.
x %>%
  pivot_longer(
    cols = -ID,
    names_to = ".value",
    names_pattern = "(.*?)\\d+",
    values_drop_na = TRUE
  ) %>%
  pivot_wider(names_from = group, values_from = hours) %>%
  left_join(x, by = "ID") %>%
  select(
    ID,
    starts_with("group"),
    starts_with("hour"),
    everything()
  )
# A tibble: 4 x 8
# ID group1 group2 hours1 hours2 A B C
# <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 A B 3 1 3 1 NA
#2 2 NA A NA 2 2 NA NA
#3 3 B NA 5 NA NA 5 NA
#4 4 NA C NA 5 NA NA 5
For OP's dataset we can slightly modify the code to achieve the desired result.
# Same idea for the fu2a_* / fu2b_* columns: stack them into paired `fu2a` /
# `fu2b` columns, sort by fu2a so the new columns appear in that order, spread
# fu2b into one column per fu2a value, and attach the result to the original.
zz %>%
  pivot_longer(
    cols = -id,
    names_to = ".value",
    names_pattern = "(.*)_",
    values_drop_na = TRUE
  ) %>%
  arrange(fu2a) %>%
  pivot_wider(names_from = fu2a, values_from = fu2b) %>%
  left_join(zz, by = "id") %>%
  select(
    id,
    starts_with("fu2a"),
    starts_with("fu2b"),
    everything()
  )
Another approach using dplyr could be done separating group and hours variables to compute the desired variables and then merge with the original x:
library(tidyverse)
#Data
x <- data.frame(
  "ID" = c(1, 2, 3, 4),
  "group1" = c("A", NA, "B", NA),
  "group2" = c("B", "A", NA, "C"),
  "hours1" = c(3, NA, 5, NA),
  "hours2" = c(1, 2, NA, 5),
  stringsAsFactors = FALSE
)
# Reshape
# The group columns and the hours columns are stacked separately; within each
# ID the k-th group row is paired with the k-th hours row through the helper
# column `id`. The matched hours are then spread into one column per group
# label and joined back onto the original data.
x %>%
  left_join(
    x %>%
      select(ID, starts_with("group")) %>%
      pivot_longer(cols = -ID) %>%
      group_by(ID) %>%
      mutate(id = row_number()) %>%
      left_join(
        x %>%
          select(ID, starts_with("hours")) %>%
          pivot_longer(cols = -ID) %>%
          rename(name2 = name, value2 = value) %>%
          group_by(ID) %>%
          mutate(id = row_number()),
        by = c("ID", "id")
      ) %>%
      # Grouping was only needed for the per-ID counter.
      ungroup() %>%
      filter(!is.na(value)) %>%
      select(ID, value, value2) %>%
      pivot_wider(names_from = value, values_from = value2),
    by = "ID"
  )
Output:
ID group1 group2 hours1 hours2 A B C
1 1 A B 3 1 3 1 NA
2 2 <NA> A NA 2 2 NA NA
3 3 B <NA> 5 NA NA 5 NA
4 4 <NA> C NA 5 NA NA 5

R Replace NA for all Columns Except *

library(tidyverse)
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#> # A tibble: 4 x 5
#> Date col1 thisCol thatCol col999
#> <date> <int> <dbl> <int> <dbl>
#> 1 2020-01-01 1 NA 25 99
#> 2 2020-01-01 2 8 26 99
#> 3 2020-01-01 3 NA 27 99
#> 4 NA 4 3 28 99
My actual R data frame has hundreds of columns that aren't neatly named, but can be approximated by the df data frame above.
I want to replace all values of NA with 0, with the exception of several columns (in my example I want to leave out the Date column and the thatCol column). I'd want to do it in this sort of fashion:
df %>% replace(is.na(.), 0)
#> Error: Assigned data `values` must be compatible with existing data.
#> i Error occurred for column `Date`.
#> x Can't convert <double> to <date>.
#> Run `rlang::last_error()` to see where the error occurred.
And my unsuccessful ideas for accomplishing the "everything except" replace NA are shown below.
df %>% replace(is.na(c(., -c(Date, thatCol)), 0))
df %>% replace_na(list([, c(2:3, 5)] = 0))
df %>% replace_na(list(everything(-c(Date, thatCol)) = 0))
Is there a way to select everything BUT in the way I need to? There's hundred of columns, named inconsistently, so typing them one by one is not a practical option.
You can use mutate_at :
library(dplyr)
Remove them by Name
df %>% mutate_at(vars(-c(Date, thatCol)), ~replace(., is.na(.), 0))
Remove them by position
df %>% mutate_at(-c(1,4), ~replace(., is.na(.), 0))
Select them by name
df %>% mutate_at(vars(col1, thisCol, col999), ~replace(., is.na(.), 0))
Select them by position
df %>% mutate_at(c(2, 3, 5), ~replace(., is.na(.), 0))
If you want to use replace_na
df %>% mutate_at(vars(-c(Date, thatCol)), tidyr::replace_na, 0)
Note that mutate_at is soon going to be replaced by across in dplyr 1.0.0.
You have several options here based on data.table.
One of the coolest options: setnafill (version >= 1.12.4):
library(data.table)
# Convert to data.table by reference, then fill NAs with 0 in every column
# except Date and thatCol. The excluded names are given as strings and removed
# from the full set of column names with setdiff().
setDT(df)
data.table::setnafill(
  df,
  fill = 0,
  cols = setdiff(colnames(df), c("Date", "thatCol"))
)
Note that your dataframe is updated by reference.
Another base solution:
# Positions of the columns whose names start with "this" or "col".
to_change <- grep("^(this|col)", names(df))
# Replace NA with 0 column by column. lapply() keeps each column as its own
# vector, whereas sapply() would first simplify the result to a matrix and
# coerce every selected column to one common type.
df[to_change] <- lapply(df[to_change], function(x) replace(x, is.na(x), 0))
df
# A tibble: 4 x 5
Date col1 thisCol thatCol col999
<date> <dbl> <dbl> <int> <dbl>
1 2020-01-01 1 0 25 99
2 2020-01-01 2 8 26 99
3 2020-01-01 3 0 27 99
4 NA 0 3 28 99
Data(I changed one value):
df <- structure(list(Date = structure(c(18262, 18262, 18262, NA), class = "Date"),
col1 = c(1L, 2L, 3L, NA), thisCol = c(NA, 8, NA, 3), thatCol = 25:28,
col999 = c(99, 99, 99, 99)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
replace works on a data.frame, so we can just do the replacement by index and update the original dataset
# Work on every column except the 1st and 4th: set their NA cells to 0 and
# write the result back into the same columns of df.
df[-c(1, 4)] <- local({
  target <- df[-c(1, 4)]
  target[is.na(target)] <- 0
  target
})
Or using replace_na with across (from the new dplyr)
library(dplyr)
library(tidyr)
df %>%
mutate(across(-c(Date, thatCol), ~ replace_na(., 0)))
If you know the ones that you don't want to change, you could do it like this:
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#dplyr
df_nonreplace <- select(df, c("Date", "thatCol"))
df_replace <- df[ ,!names(df) %in% names(df_nonreplace)]
df_replace[is.na(df_replace)] <- 0
df <- cbind(df_nonreplace, df_replace)
> head(df)
Date thatCol col1 thisCol col999
1 2020-01-01 25 1 0 99
2 2020-01-01 26 2 8 99
3 2020-01-01 27 3 0 99
4 <NA> 28 4 3 99

Resources