Remove rows from list of dataframes based on condition - r

I have a list of dataframes. It looks something like this:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 3, 7, 3, 6, 6),
Var2 = c(NA, NA, NA, NA, 7, 5, 8, 0, 2),
Var3 = c(NA, NA, NA, NA, 3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 2, 7, 3, 1, 6),
Var2 = c(NA, NA, NA, NA, 7, 4, 8, 1, 9),
Var3 = c(NA, NA, NA, NA, 8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
I would like to remove the first 4 rows of df3.wxyz and df5.wxyz from the list of dataframes as those contain information that I do not need. What I've tried is the following code, but instead of only removing the first 4 rows in df3.wxyz and df5.wxyz, it is removing the first 4 rows from every dataframe in my list. I'm not sure what the issue is.
# NOTE(review): this is the attempt that strips rows from every data frame.
# grepl() is run on names(df.list) -- all five names -- rather than on the name
# of the element currently being processed, so the test vector is the same on
# every call and always contains TRUE. ifelse() therefore evaluates its `yes`
# argument on every call, and the assignment inside it drops rows 1-4 of `i`
# regardless of which data frame `i` is.
df.list <- lapply(df.list, function(i){
ifelse(grepl("wxyz", names(df.list)), i <- i[-c(1:4), ], df.list)
i
})
This is what I would like to achieve:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c(3, 7, 3, 6, 6),
Var2 = c(7, 5, 8, 0, 2),
Var3 = c(3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c(2, 7, 3, 1, 6),
Var2 = c(7, 4, 8, 1, 9),
Var3 = c(8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")

You can try,
# Only the list elements whose names contain "wxyz" are touched: each of them
# is passed through na.omit(), which drops every row containing an NA. The
# remaining data frames in df.list are left exactly as they were.
df.list[grepl('wxyz', names(df.list))] <- lapply(
  df.list[grepl('wxyz', names(df.list))],
  na.omit
)

You can try na.omit like below
> Map(na.omit,df.list)
$df1
Var1 Var2 Var3
1 1 7 3
2 7 2 6
3 9 4 2
4 4 4 0
5 2 3 8
$df2
Var1 Var2 Var3
1 5 8 9
2 6 6 0
3 2 6 1
4 2 7 3
5 1 4 4
$df3.wxyz
Var1 Var2 Var3
5 3 7 3
6 7 5 3
7 3 8 4
8 6 0 1
9 6 2 9
$df4
Var1 Var2 Var3
1 2 8 9
2 7 3 1
3 2 1 1
4 4 7 6
5 8 3 5
$df5.wxyz
Var1 Var2 Var3
5 2 7 8
6 7 4 0
7 3 8 4
8 1 1 1
9 6 9 2

Related

variable based on other variables in R

I have a df like this
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5))
I want to create a new column (NEW) which says BLUE or RED based on columns b2 and b3: if b2 is greater than or equal to 100 OR b3 is greater than or equal to 75, then input BLUE; otherwise input RED.
So that I will have something like this:
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5),
NEW = c("BLUE", "BLUE", "BLUE", "BLUE", "RED", "RED", "RED", "RED", "BLUE"))
I have been able to work this out using this:
library(tidyverse)
# The thresholds sit just below the real cut-offs (100 for b2, 75 for b3) so
# that the strict `>` comparisons below also accept whole-number values equal
# to the cut-offs.
greater_threshold <- 99.9
greater_threshold1 <- 74.9
my_df1 <- my_df %>%
  mutate(NEW = case_when(b2 > greater_threshold ~ "BLUE",
                         b3 > greater_threshold1 ~ "BLUE",
                         # Fallback for rows matching neither condition. The
                         # left-hand side must be a plain logical, so it is
                         # written as TRUE with no leading `+`.
                         TRUE ~ "RED"))
At the moment, you can see that I am setting my 'greater threshold' to be slightly less than the required value. Although it works well, my question is this: is there a way to set my 'greater threshold' to be ≥ 100 for b2 and ≥ 75 for b3?
For this example, I'd go with if_else instead of case_when:
library(dplyr)
greater_threshold <- 100
greater_threshold1 <- 75
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5)
)
my_df1 <- my_df %>%
mutate(
NEW = if_else(
b2 >= greater_threshold | b3 >= greater_threshold1,
"BLUE",
"RED"
)
)
my_df1
# b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 NEW
# 1 2 100 75 NA 2 9 1 NA 4 14 BLUE
# 2 6 4 79 6 12 2 3 8 5 2 BLUE
# 3 3 106 8 NA 1 4 7 4 7 4 BLUE
# 4 6 102 0 10 7 6 7 5 9 2 BLUE
# 5 4 6 2 12 8 7 4 1 5 1 RED
# 6 2 6 3 8 5 6 2 4 1 1 RED
# 7 1 1 9 3 5 6 2 1 1 1 RED
# 8 9 1 5 6 6 7 9 3 2 1 RED
# 9 NA 7 80 2 NA 9 5 6 NA 5 BLUE

Replace values in one vector with values from other vector(s)

I have a dataframe something like this:
id <- c(1, 2, 3, 4, 5, 6, 7)
var1 <- c(1, NA, 2, NA, 1, 1, 2)
var2 <- c(1, 1, 2, 2, NA, 2, 2)
How do I create a new vector that takes the value (1 or 2) from var1 wherever var1 has one, and otherwise fills the NAs in var1 with the corresponding values from var2?
I'm thinking something like:
id <- c(1, 2, 3, 4, 5, 6, 7)
var1 <- c(1, NA, 2, NA, 1, 1, 2)
var2 <- c(1, 1, 2, 2, NA, 2, 2)
newvar <- c(1, 1, 2, 2, 1, 1, 2)
The same goes for another dataframe, in which there are more vectors:
id <- c(1, 2, 3, 4, 5, 6, 7)
var1 <- c(1, NA,2, NA,NA,1, 2)
var2 <- c(1, 1, 2, 2, NA,2, 2)
var3 <- c(2, 1, 2, 1, 1, 1, 2)
var4 <- c(1, 1, 2, NA,2, 1, 2)
In this case, I want to create another vector "newvar" that keeps the value from var1 wherever it is present and replaces each NA in var1 with the dominant value among var2, var3 and var4.
So the starting point will always be what is in var1. However, for id 4 and id 5, for example, there is no dominant value in the other variables; in that case I want to replace the NA with the value from the first of the other variables that has a value - in these two cases, the values from var2 and var3 respectively.
id <- c(1, 2, 3, 4, 5, 6, 7)
var1 <- c(1, NA,2, NA,NA,1, 2)
var2 <- c(1, 1, 2, 2, NA,2, 2)
var3 <- c(2, 1, 2, 1, 1, 1, 2)
var4 <- c(1, 1, 2, NA,2, 1, 2)
newvar <- c(1, 1, 2, 2, 1, 1, 2)
How can this be done in an easy way?
Thank you!
It's possible to use [<- in Reduce to overwrite NA with values of the next vector(s).
# Two-vector case: each NA in var1 is filled from the same position in var2.
var1 <- c(1, NA, 2, NA, 1, 1, 2)
var2 <- c(1, 1, 2, 2, NA, 2, 2)
#`[<-`(var1, is.na(var1), var2[is.na(var1)]) #In case of only two vectors
Reduce(
  function(filled, candidate) {
    # Positions still missing after the vectors processed so far
    gaps <- is.na(filled)
    `[<-`(filled, gaps, candidate[gaps])
  },
  list(var1, var2)
)
#[1] 1 1 2 2 1 1 2
# Several vectors: Reduce() walks the list left to right, so a gap is filled by
# the first later vector that has a value at that position.
var1 <- c(1, NA,2, NA,NA,1, 2)
var2 <- c(1, 1, 2, 2, NA,2, 2)
var3 <- c(2, 1, 2, 1, 1, 1, 2)
var4 <- c(1, 1, 2, NA,2, 1, 2)
Reduce(
  function(filled, candidate) {
    gaps <- is.na(filled)
    `[<-`(filled, gaps, candidate[gaps])
  },
  list(var1, var2, var3, var4)
)
#[1] 1 1 2 2 1 1 2
Which is roughly the same as doing:
var1 <- c(1, NA, 2, NA, 1, 1, 2)
var2 <- c(1, 1, 2, 2, NA, 2, 2)
newvar <- var1
i <- is.na(newvar)
newvar[i] <- var2[i]
newvar
#[1] 1 1 2 2 1 1 2
Try this.
library(dplyr)
# coalesce() returns, element by element, the first non-NA value among its
# arguments, so var1 is kept where present and the later columns only fill
# its gaps. The result column is named newvar, as asked for in the question.
df %>%
  mutate(newvar = coalesce(var1, var2, var3, var4))
You can use coalesce from dplyr.
library(dplyr)
df$newvar <- do.call(coalesce, select(df, starts_with('var')))
df
# id var1 var2 var3 var4 newvar
#1 1 1 1 2 1 1
#2 2 NA 1 1 1 1
#3 3 2 2 2 2 2
#4 4 NA 2 1 NA 2
#5 5 NA NA 1 2 1
#6 6 1 2 1 1 1
#7 7 2 2 2 2 2
data
id <- c(1, 2, 3, 4, 5, 6, 7)
var1 <- c(1, NA,2, NA,NA,1, 2)
var2 <- c(1, 1, 2, 2, NA,2, 2)
var3 <- c(2, 1, 2, 1, 1, 1, 2)
var4 <- c(1, 1, 2, NA,2, 1, 2)
df <- data.frame(id, var1, var2, var3, var4)
With tidyverse, we can use invoke with coalesce
library(dplyr)
library(purrr)
# Collects every column whose name starts with "var" and passes them to
# coalesce() as separate arguments, so newvar holds the first non-NA value
# per row.
# NOTE(review): purrr::invoke() is deprecated as of purrr 1.0.0 and
# dplyr::cur_data() as of dplyr 1.1.0; on current releases
# `do.call(coalesce, pick(starts_with('var')))` looks like the equivalent --
# confirm against the installed package versions.
df %>%
mutate(newvar = invoke(coalesce, select(cur_data(), starts_with('var'))))
id var1 var2 var3 var4 newvar
1 1 1 1 2 1 1
2 2 NA 1 1 1 1
3 3 2 2 2 2 2
4 4 NA 2 1 NA 2
5 5 NA NA 1 2 1
6 6 1 2 1 1 1
7 7 2 2 2 2 2
data
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7), var1 = c(1, NA, 2,
NA, NA, 1, 2), var2 = c(1, 1, 2, 2, NA, 2, 2), var3 = c(2, 1,
2, 1, 1, 1, 2), var4 = c(1, 1, 2, NA, 2, 1, 2)),
class = "data.frame", row.names = c(NA,
-7L))
A base R option using pmin + col
# For every row, pick the value held in the first non-NA column of df[-1]
# (df without its first column). The lookup uses a two-column (row, column)
# index matrix.
df$newvar <- df[-1][
  cbind(
    # seq_len() instead of 1:nrow(df): a zero-row df yields no row indices
    # rather than c(1, 0).
    seq_len(nrow(df)),
    do.call(
      pmin,
      data.frame(
        replace(
          # u holds the column number of each non-NA cell and 0 for NA cells;
          # the zeros become Inf so pmin() returns the smallest column number
          # that actually has a value.
          u <- (!is.na(df[-1])) * col(df[-1]),
          u == 0, Inf
        )
      )
    )
  )
]
gives
> df
id var1 var2 var3 var4 newvar
1 1 1 1 2 1 1
2 2 NA 1 1 1 1
3 3 2 2 2 2 2
4 4 NA 2 1 NA 2
5 5 NA NA 1 2 1
6 6 1 2 1 1 1
7 7 2 2 2 2 2

How to sort each column of a df in descending order regardless of the row order?

I am trying to sort my data in descending or ascending order regardless of the data in the rows. I made a dummy example below:
A <- c(9,9,5,4,6,3,2,NA)
B <- c(9,5,3,4,1,4,NA,NA)
C <- c(1,4,5,6,7,4,2,4)
base <- data.frame(A,B,C)
df <- base
df$A <- sort(df$A,na.last = T)
df$B <- sort(df$B,na.last = T)
df$C <- sort(df$C)
We get this
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(1,
2, 3, 4, 4, 4, 5, 5, 9, 10, NA, NA), C = c(1, 2, 3, 4, 4, 4,
5, 5, 6, 7, 8, 8)), row.names = c(NA, -12L), class = "data.frame")
I want to get something similar to df but my data have hundreds of columns, is there an easier way to do it?
I tried arrange_all() but the result is not what i want.
library(tidyverse)
test <- base%>%
arrange_all()
Obtaining this:
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(NA,
2, 4, 4, 5, 10, 3, 4, 1, 5, 9, NA), C = c(2, 3, 4, 6, 8, 5, 5,
8, 7, 4, 1, 4)), class = "data.frame", row.names = c(NA, -12L
))
You can sort each column individually :
library(dplyr)
# Sort every column independently of the others. na.last = TRUE keeps the NAs
# (placed at the end), so each column keeps its original length.
# The columns are selected explicitly with everything() and na.last is passed
# inside the function, because calling across() without .cols and forwarding
# extra arguments through `...` are both deprecated in dplyr >= 1.1.0.
base %>%
  mutate(across(everything(), function(x) sort(x, na.last = TRUE)))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R :
base[] <- lapply(base, sort, na.last = TRUE)

How to shift values in rows in R (dplyr or data.table)?

Here is data set 'before' and 'after' shifting.
# Data set 'before'
df_before <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(0, 6, 7, 8, 9),
z = c(0, 0, 11, 12, 13)))
# Shift operation
# ...
# Data set 'after'
df_after <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(6, 7, 8, 9, NA),
z = c(11, 12, 13, NA, NA)))
How to make this kind of shifting on +1 cell only for all rows?
Thanks!
Something like this? Just start the rows always shifted by one and reset their length. The latter adds NAs.
t(sapply(1:nrow(DF), function(x) `length<-`(DF[x, x:ncol(DF)], ncol(DF))))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 2 3 4 5
# [2,] 6 7 8 9 NA
# [3,] 11 12 13 NA NA
Data
DF <- structure(c(1, 0, 0, 2, 6, 0, 3, 7, 11, 4, 8, 12, 5, 9, 13), .Dim = c(3L,
5L), .Dimnames = list(c("x", "y", "z"), NULL))
Taking a guess at the logic:
t(apply(df_before, 1, function(x) `length<-`(x[x != 0], ncol(df_before))))
[,1] [,2] [,3] [,4] [,5]
x 1 2 3 4 5
y 6 7 8 9 NA
z 11 12 13 NA NA
You can un-transpose the df_before data.frame then use the lead function from dplyr
to shift the columns
library(data.table)
library(dplyr)
df_before <- data.table(
x = c(1, 2, 3, 4, 5),
y = c(0, 6, 7, 8, 9),
z = c(0, 0, 11, 12, 13))
df_after <- t(data.table(
x = c(1, 2, 3, 4, 5),
y = c(6, 7, 8, 9, NA),
z = c(11, 12, 13, NA, NA)))
df_before[] <-lapply(1:ncol(df_before), function(x){
dplyr::lead(df_before[[x]],n= x-1)
})
If you need to transpose the data after this step:
df_after2 <- t(df_before)
all.equal(df_after,df_after2) # TRUE

using dplyr case_when to alter NA values based on value from another column

structure(list(a = c(NA, 3, 4, NA, 3, "Council" , "Council", 1), b = c("Council A", 3, 4,
"Council B", 6, 7, 2, 6), c = c(6, 3, 6, 5, 3, 6, 5, 3), d = c(6, 2, 4,
5, 3, 7, 2, 6), e = c(1, 2, 4, 5, 6, 7, 6, 3), f = c(2, 3, 4,
2, 2, 7, 5, 2)), .Names = c("a", "b", "c", "d", "e", "f"), row.names = c(NA,
8L), class = "data.frame")
I am trying to convert values in a using dplyr mutate and case_when based on text in b. I want to convert values in a to Council if b contains Council in the string.
The code i've used is DF %>% select(a, b) %>% mutate(a =case_when(grepl("Council", b) ~"Council"))
However all values become NA in a if they do not contain the string Council. I've reviewed other posts and attempted various methods including ifelse. I want to maintain the same dataframe just make any NA values in a be converted to Council but only in the cases where it is NA values.
From ?case_when
If no cases match, NA is returned.
So for the cases when there is no "Council" word in b it returns NA.
You need to define the TRUE argument in case_when and assign it to a to keep the values unchanged when the condition is not met.
library(dplyr)
df %>%
mutate(a = case_when(grepl("Council", b) ~"Council",
TRUE ~ a))
# a b c d e f
#1 Council Council A 6 6 1 2
#2 3 3 3 2 2 3
#3 4 4 6 4 4 4
#4 Council Council B 5 5 5 2
#5 3 6 3 3 6 2
#6 Council 7 6 7 7 7
#7 Council 2 5 2 6 5
#8 1 6 3 6 3 2
In this case you could also achieve your result using base R
df$a[grepl("Council", df$b)] <- "Council"
You can also use str_detect from the package stringr to achieve your objective.
library(dplyr)
library(stringr)
df <- structure(list(a = c(NA, 3, 4, NA, 3, "Council" , "Council", 1), b = c("Council A", 3, 4,
"Council B", 6, 7, 2, 6), c = c(6, 3, 6, 5, 3, 6, 5, 3), d = c(6, 2, 4,
5, 3, 7, 2, 6), e = c(1, 2, 4, 5, 6, 7, 6, 3), f = c(2, 3, 4,
2, 2, 7, 5, 2)), .Names = c("a", "b", "c", "d", "e", "f"), row.names = c(NA,
8L), class = "data.frame")
# Set a to "Council" only where a is NA and b contains the literal text
# "council" (matched case-insensitively); every other value of a is kept.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df %>%
  mutate(
    a = ifelse(
      str_detect(b, fixed("council", ignore_case = TRUE)) & is.na(a),
      "Council",
      a
    )
  )

Resources