Remove column only if it exists - r

I want to remove a column from a data frame only if it's there.
Example:
a <- 1:5
x <- tibble(a, b = a * 2, c = 1)
x %>% select(-'a')
x %>% select(-'d') # Throws an error
I want a way to remove columns a and d only if they exist, so a is removed and the attempt to remove d never happens. I tried modifying this solution to my problem, but I could not get it to work.

data.table
library(data.table)
a <- 1:5
x <- data.frame(a, b = a * 2, c = 1)
cols <- c("a", "d")
my_cols <- intersect(cols, names(x))
setDT(x)[, ..my_cols]
#> a
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
Created on 2021-07-09 by the reprex package (v2.0.0)

Related

How to divide only a certain factor in a column in R data frame?

This is the sample dataset:
library(data.table)
df = data.table(x = c(1000,2000,10,2), y = c('A','A','B','B'))
I only want to divide df$y == "A" by 1000. The final dataset should appear as:
df = data.table(x = c(1,2,10,2), y = c('A','A','B','B'))
You need to create a conditional statement.
In base R:
df$x <- ifelse(df$y == "A", df$x/1000, df$x)
In dplyr:
library(dplyr)
df <- df |>
mutate(x = if_else(y == "A", x/1000, x))
data.table option using fifelse like this:
library(data.table)
df = data.table(x = c(1000,2000,10,2), y = c('A','A','B','B'))
df[,x:=fifelse(y == "A", x/1000, x),]
df
#> x y
#> 1: 1 A
#> 2: 2 A
#> 3: 10 B
#> 4: 2 B
Created on 2023-02-18 with reprex v2.0.2
We could use data.table methods as the input is a data.table
library(data.table)
df[y == 'A', x := x/1000]
-output
> df
x y
1: 1 A
2: 2 A
3: 10 B
4: 2 B
Base R: Subsetting with [:
df$x[df$y == "A"] <- df$x[df$y == "A"]/1000
x y
1: 1 A
2: 2 A
3: 10 B
4: 2 B

Extract first Non NA value over multiple columns

I'm still learning R and was wondering if I there was an elegant way of manipulating the below df to achieve df2.
I'm not sure if it's a loop that is supposed to be used for this, but basically I want to extract the first Non NA "X_No" Value if the "X_No" value is NA in the first row. This would perhaps be best described through an example from df to the desired df2.
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
I'm hoping for an elegant solution to this as there are over a 1000 columns similar to the example provided.
I've looked all over the web for a similar example however to no avail that would reproduce the expected result.
Your help is very much appreciated.
Thankyou
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
output <- df %>%
pivot_longer(everything(),
names_sep = "_",
names_to = c("Col", ".value")) %>%
drop_na() %>%
group_by(Col) %>%
slice_head(n = 1) %>%
ungroup() %>%
select(-Col)
df2
#> ID No
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
output
#> # A tibble: 5 × 2
#> ID No
#> <chr> <dbl>
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
all_equal(df2, output)
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
ind <- max.col(!is.na(t(df[c(FALSE, TRUE)])), "first")
m1 <- cbind(seq_along(ind), ind)
data.frame(ID = t(df[c(TRUE, FALSE)])[m1], No = t(df[c(FALSE, TRUE)])[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
functionally
split the data.frame to a list of chunks of columns, based on their
names. So all columns startting with A_ go to
the first element, all colums startting with B_ to the second
Then, put these list elements on top of each other, using
data.table::rbindlist. Ignure the column-namaes (this only works if
A_ has the same number of columns as B_ has the same number of cols
as n_)
Now get the first non-NA value of each value in the first column
code
library(data.table)
# split based on what comes after the underscore
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# bind together again
DT <- rbindlist(L, use.names = FALSE)
# extract the first value of the non-NA
DT[!is.na(A_No), .(No = A_No[1]), keyby = .(ID = A_ID)]
# ID No
# 1: A 11
# 2: D 12
# 3: F 13
# 4: G 14
# 5: I 15
# 6: M 40
# 7: P 20
# 8: W 50
# 9: X 32
#10: Y 48
#11: Z 40

How can I use a pre-assigned variable in dplyr::filter?

When I assign expt as below, I get a tibble of dim 175, 81:
expt <- 5
mydata <- loaded_data %>% filter(expt == expt) %>%
select(leaf.TN.pc | starts_with("air_")) %>%
drop_na(leaf.TN.pc)
This is not what I want. Instead if I assign expt inline, as:
mydata <- loaded_data %>% filter(expt == expt) %>%
I get what I want, which is only obs from experiment 5, dim = 79, 81.
What you did was comparing expt column with itself which was always true.
Below the solution that will work for either one chosen value (expt_chosen <- 1) or multiple values (expt_chosen <- c(1, 3)). Just use %in% operator.
library(tidyverse)
expt_chosen <- c(1, 3)
dframe <- data.frame(name = letters[1:5], expt = 1:5)
dframe
#> name expt
#> 1 a 1
#> 2 b 2
#> 3 c 3
#> 4 d 4
#> 5 e 5
dframe %>% filter(expt %in% expt_chosen)
#> name expt
#> 1 a 1
#> 2 c 3
Created on 2020-05-14 by the reprex package (v0.3.0)

Replace values in tibble in R 4.0

I just upgraded to R 4.0.0 from R 3.6.2 and some functionality that I use to replace values in a tibble no longer works. I can't find what I need to do now. Does anyone know the "new" way?
library(tidyverse)
v <- c(1, 2, 3)
w <- c(4, 4)
i <- 1
# Does not work anymore
df <- tibble(a = v, b = v, c = v)
df[i, 2:3] <- w
# This used to work with tibbles
df.old <- data.frame(a = v, b = v, c = v)
df.old[i, 2:3] <- w
This is the error that I get with the tibble:
Error: Assigned data `w` must be compatible with row subscript `i`.
x 1 row must be assigned.
x Assigned data has 2 rows.
i Only vectors of size 1 are recycled.
Thanks,
In my R-devel version, the error message includes
ℹ Row updates require a list value. Do you need `list()` or `as.list()`?
So the canonical way in this version is probably df[i, 2:3] <- as.list(w), which works:
library(tidyverse)
v <- c(1, 2, 3)
w <- c(4, 4)
i <- 1
df <- tibble(a = v, b = v, c = v)
df[i, 2:3] <- as.list(w)
df
#> # A tibble: 3 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 4 4
#> 2 2 2 2
#> 3 3 3 3
Created on 2020-04-26 by the reprex package (v0.3.0)

How to replace NA values in a table for selected columns

There are a lot of posts about replacing NA values. I am aware that one could replace NAs in the following table/frame with the following:
x[is.na(x)]<-0
But, what if I want to restrict it to only certain columns? Let's me show you an example.
First, let's start with a dataset.
set.seed(1234)
x <- data.frame(a=sample(c(1,2,NA), 10, replace=T),
b=sample(c(1,2,NA), 10, replace=T),
c=sample(c(1:5,NA), 10, replace=T))
Which gives:
a b c
1 1 NA 2
2 2 2 2
3 2 1 1
4 2 NA 1
5 NA 1 2
6 2 NA 5
7 1 1 4
8 1 1 NA
9 2 1 5
10 2 1 1
Ok, so I only want to restrict the replacement to columns 'a' and 'b'. My attempt was:
x[is.na(x), 1:2]<-0
and:
x[is.na(x[1:2])]<-0
Which does not work.
My data.table attempt, where y<-data.table(x), was obviously never going to work:
y[is.na(y[,list(a,b)]), ]
I want to pass columns inside the is.na argument but that obviously wouldn't work.
I would like to do this in a data.frame and a data.table. My end goal is to recode the 1:2 to 0:1 in 'a' and 'b' while keeping 'c' the way it is, since it is not a logical variable. I have a bunch of columns so I don't want to do it one by one. And, I'd just like to know how to do this.
Do you have any suggestions?
You can do:
x[, 1:2][is.na(x[, 1:2])] <- 0
or better (IMHO), use the variable names:
x[c("a", "b")][is.na(x[c("a", "b")])] <- 0
In both cases, 1:2 or c("a", "b") can be replaced by a pre-defined vector.
Building on #Robert McDonald's tidyr::replace_na() answer, here are some dplyr options for controlling which columns the NAs are replaced:
library(tidyverse)
# by column type:
x %>%
mutate_if(is.numeric, ~replace_na(., 0))
# select columns defined in vars(col1, col2, ...):
x %>%
mutate_at(vars(a, b, c), ~replace_na(., 0))
# all columns:
x %>%
mutate_all(~replace_na(., 0))
Edit 2020-06-15
Since data.table 1.12.4 (Oct 2019), data.table gains two functions to facilitate this: nafill and setnafill.
nafill operates on columns:
cols = c('a', 'b')
y[ , (cols) := lapply(.SD, nafill, fill=0), .SDcols = cols]
setnafill operates on tables (the replacements happen by-reference/in-place)
setnafill(y, cols=cols, fill=0)
# print y to show the effect
y[]
This will also be more efficient than the other options; see ?nafill for more, the last-observation-carried-forward (LOCF) and next-observation-carried-backward (NOCB) versions of NA imputation for time series.
This will work for your data.table version:
for (col in c("a", "b")) y[is.na(get(col)), (col) := 0]
Alternatively, as David Arenburg points out below, you can use set (side benefit - you can use it either on data.frame or data.table):
for (col in 1:2) set(x, which(is.na(x[[col]])), col, 0)
This is now trivial in tidyr with replace_na(). The function appears to work for data.tables as well as data.frames:
tidyr::replace_na(x, list(a=0, b=0))
Not sure if this is more concise, but this function will also find and allow replacement of NAs (or any value you like) in selected columns of a data.table:
update.mat <- function(dt, cols, criteria) {
require(data.table)
x <- as.data.frame(which(criteria==TRUE, arr.ind = TRUE))
y <- as.matrix(subset(x, x$col %in% which((names(dt) %in% cols), arr.ind = TRUE)))
y
}
To apply it:
y[update.mat(y, c("a", "b"), is.na(y))] <- 0
The function creates a matrix of the selected columns and rows (cell coordinates) that meet the input criteria (in this case is.na == TRUE).
We can solve it in data.table way with tidyr::repalce_na function and lapply
library(data.table)
library(tidyr)
setDT(df)
df[,c("a","b","c"):=lapply(.SD,function(x) replace_na(x,0)),.SDcols=c("a","b","c")]
In this way, we can also solve paste columns with NA string. First, we replace_na(x,""),then we can use stringr::str_c to combine columns!
Starting from the data.table y, you can just write:
y[, (cols):=lapply(.SD, function(i){i[is.na(i)] <- 0; i}), .SDcols = cols]
Don't forget to library(data.table) before creating y and running this command.
This needed a bit extra for dealing with NA's in factors.
Found a useful function here, which you can then use with mutate_at or mutate_if:
replace_factor_na <- function(x){
x <- as.character(x)
x <- if_else(is.na(x), 'NONE', x)
x <- as.factor(x)
}
df <- df %>%
mutate_at(
vars(vector_of_column_names),
replace_factor_na
)
Or apply to all factor columns:
df <- df %>%
mutate_if(is.factor, replace_factor_na)
For a specific column, there is an alternative with sapply
DF <- data.frame(A = letters[1:5],
B = letters[6:10],
C = c(2, 5, NA, 8, NA))
DF_NEW <- sapply(seq(1, nrow(DF)),
function(i) ifelse(is.na(DF[i,3]) ==
TRUE,
0,
DF[i,3]))
DF[,3] <- DF_NEW
DF
For completeness, built upon #sbha's answer, here is the tidyverse version with the across() function that's available in dplyr since version 1.0 (which supersedes the *_at() variants, and others):
# random data
set.seed(1234)
x <- data.frame(a = sample(c(1, 2, NA), 10, replace = T),
b = sample(c(1, 2, NA), 10, replace = T),
c = sample(c(1:5, NA), 10, replace = T))
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
# with the magrittr pipe
x %>% mutate(across(1:2, ~ replace_na(.x, 0)))
#> a b c
#> 1 2 2 5
#> 2 2 2 2
#> 3 1 0 5
#> 4 0 2 2
#> 5 1 2 NA
#> 6 1 2 3
#> 7 2 2 4
#> 8 2 1 4
#> 9 0 0 3
#> 10 2 0 1
# with the native pipe (since R 4.1)
x |> mutate(across(1:2, ~ replace_na(.x, 0)))
#> a b c
#> 1 2 2 5
#> 2 2 2 2
#> 3 1 0 5
#> 4 0 2 2
#> 5 1 2 NA
#> 6 1 2 3
#> 7 2 2 4
#> 8 2 1 4
#> 9 0 0 3
#> 10 2 0 1
Created on 2021-12-08 by the reprex package (v2.0.1)
it's quite handy with data.table and stringr
library(data.table)
library(stringr)
x[, lapply(.SD, function(xx) {str_replace_na(xx, 0)})]
FYI
this works fine for me
DataTable DT = new DataTable();
DT = DT.AsEnumerable().Select(R =>
{
R["Campo1"] = valor;
return (R);
}).ToArray().CopyToDataTable();

Resources