Expanding a dataframe with condition - r

I would like to expand a dataframe with duplicates of its own elements, but with specific conditions.
Here is my example data:
x1 <- c(1, 2, 3, 4, 5)
x2 <- c(2, 2, 2, 2, 2)
y1 <- c(9, 9, 8, 9, 9)
y2 <- c(0, 0, 0, 1, 1)
df <- data.frame(x1, x2, y1, y2)
df
x1 x2 y1 y2
1 1 2 9 0
2 2 2 9 0
3 3 2 8 0
4 4 2 9 1
5 5 2 9 1
The condition: only duplicate if y1 = 9 and y2 = 0. Therefore the output should look like this:
x1 x2 y1 y2
1 1 2 9 0
2 2 2 9 0
3 3 2 8 0
4 4 2 9 1
5 5 2 9 1
6 1 2 9 0
7 2 2 9 0
Case 1 and 2 were duplicated and accordingly the dataframe was expanded (new rows 6 and 7). Case 3, 4 and 5 were ignored, the condition was not met.
I am grateful for any help.

We can get the row index of the rows which satisfies our condition using which and just rbind those rows together to the original data frame.
# Positions of the rows where both conditions hold; which() discards any
# comparison that evaluates to NA.
inds <- with(df, which(y1 == 9 & y2 == 0))
# Stack a copy of the matching rows underneath the original data frame.
rbind(df, df[inds, ])
# x1 x2 y1 y2
#1 1 2 9 0
#2 2 2 9 0
#3 3 2 8 0
#4 4 2 9 1
#5 5 2 9 1
#6 1 2 9 0
#7 2 2 9 0
Or using dplyr bind_rows
library(dplyr)
# Keep only the rows that meet both conditions, then append them to df.
df %>%
  filter(y1 == 9, y2 == 0) %>%
  bind_rows(df, .)
If we want to change the values for the duplicated rows for y1 to 10, we can do
# Copy the matching rows, overwrite y1 in the copies only, then append them.
df %>%
  filter(y1 == 9, y2 == 0) %>%
  mutate(y1 = 10) %>%
  bind_rows(df, .)
# x1 x2 y1 y2
#1 1 2 9 0
#2 2 2 9 0
#3 3 2 8 0
#4 4 2 9 1
#5 5 2 9 1
#6 1 2 10 0
#7 2 2 10 0

Related

How to count the number of occurrences of a given value for each row?

I'm sure this is a really easy fix but I can't seem to find the answer... I am trying to create a column at the end of my dataframe that is a sum of the number of times a specific value (say "1") appears across that row. So for example, if I started with the following dataframe:
# `id` has to exist before data.frame(id, ...) is evaluated; without this line
# the call fails because the object is not found.
id <- c(1:6)
X1 <- c(5, 1, 7, 8, 1, 5)
X2 <- c(5, 0, 0, 2, 3, 7)
# NOTE: the table printed below lists X3 as 6, 1, 3, 4, 2, 7, i.e. rows 2 and 5
# are swapped relative to this vector.
X3 <- c(6, 2, 3, 4, 1, 7)
X4 <- c(1, 1, 5, 2, 1, 7)
df <- data.frame(id, X1, X2, X3, X4)
id X1 X2 X3 X4
1 1 5 5 6 1
2 2 1 0 1 1
3 3 7 0 3 5
4 4 8 2 4 2
5 5 1 3 2 1
6 6 5 7 7 7
and I was trying to identify how many times the value "1" appears across that row, I would want the output to look like this:
id X1 X2 X3 X4 one_appears
1 1 5 5 6 1 2
2 2 1 0 1 1 3
3 3 7 0 3 5 0
4 4 8 2 4 2 0
5 5 1 3 2 1 2
6 6 5 7 7 7 0
Thanks very much in advance!
library(tidyverse)
# Compare every column with 1 and count the matches per row. everything()
# also selects id, so a 1 in the id column is counted as well.
df %>%
  mutate(one = rowSums(across(everything(), \(col) col == 1)))
# A tibble: 6 × 6
id X1 X2 X3 X4 one
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5 5 6 1 2
2 2 1 0 2 1 2
3 3 7 0 3 5 0
4 4 8 2 4 2 0
5 5 1 3 1 1 3
6 6 5 7 7 7 0
EDIT:
# Restrict the count to the X columns, selected by name prefix ...
df %>%
  mutate(one = rowSums(across(starts_with("X"), \(col) col == 1)))
# ... or by spelling out the column range.
df %>%
  mutate(one = rowSums(across(X1:X4, \(col) col == 1)))
We can use rowSums on a logical matrix
# df == 1 is a logical matrix over all columns (id included); rowSums() counts
# the TRUE cells per row, and na.rm = TRUE ignores missing comparisons.
df[["one_appears"]] <- rowSums(df == 1, na.rm = TRUE)
-output
> df
id X1 X2 X3 X4 one_appears
1 1 5 5 6 1 2
2 2 1 0 1 1 3
3 3 7 0 3 5 0
4 4 8 2 4 2 0
5 5 1 3 2 1 2
6 6 5 7 7 7 0
Another option using apply with sum:
# Rebuild the example data: an id column plus four value columns.
id <- 1:6
X1 <- c(5, 1, 7, 8, 1, 5)
X2 <- c(5, 0, 0, 2, 3, 7)
X3 <- c(6, 2, 3, 4, 1, 7)
X4 <- c(1, 1, 5, 2, 1, 7)
df <- data.frame(id, X1, X2, X3, X4)
# Walk over the rows and count the cells equal to 1 in each of them. Every
# column takes part in the comparison, id included.
df$one_appear <- apply(df, 1, function(row_vals) sum(row_vals == 1))
df
#> id X1 X2 X3 X4 one_appear
#> 1 1 5 5 6 1 2
#> 2 2 1 0 2 1 2
#> 3 3 7 0 3 5 0
#> 4 4 8 2 4 2 0
#> 5 5 1 3 1 1 3
#> 6 6 5 7 7 7 0
Created on 2023-01-18 with reprex v2.0.2
This may not be the best approach, but it is an alternative that I tried, so I thought I would share it.
code
library(dplyr)
X1 <- c(5, 1, 7, 8, 1, 5)
X2 <- c(5, 0, 0, 2, 3, 7)
X3 <- c(6, 2, 3, 4, 1, 7)
X4 <- c(1, 1, 5, 2, 1, 7)
df <- data.frame(X1, X2, X3, X4) %>%
  rowwise() %>%
  mutate(
    # One helper column per X column: 1 where the value equals 1, NA otherwise.
    across(
      starts_with("X"),
      function(x) ifelse(x == 1, 1, NA),
      .names = "Y_{col}"
    ),
    # Row total of the helper columns; na.rm = TRUE skips the NA cells.
    # TRUE is spelled out because T is an ordinary, reassignable variable.
    one_appears = sum(across(starts_with("Y")), na.rm = TRUE)
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()

How can I create rank variables for each other variables in R?

Hello dear community members.
I'm trying to create ranking variables for certain variables in R. For example I want to transform this data frame
> df
X1 X2 X3 X4 X5
1 1 4 7 3 2
2 2 5 8 4 3
3 3 6 3 5 4
4 4 1 2 6 5
5 5 2 1 7 6
into
> df
X1 X2 X3 X4 X5 x1_rank x2_rank x3_rank
1 1 4 7 3 2 3 2 1
2 2 5 8 4 3 3 2 1
3 3 6 3 5 4 3 1 3
4 4 1 2 6 5 1 3 2
5 5 2 1 7 6 1 2 3
like this (select X1~X3, and make ranking variables between them).
I tried this code
for (i in 1:nrow(df)) {
df_rank <- df[i, ] %>%
dplyr::select(X1, X2, X3, X4) %>%
base::rank()
}
I imagine I can solve this problem with a for loop, but I'm a beginner in R, so I don't understand why this doesn't work.
One way to achieve it is to use the ties argument on negative values.
df <- tibble::tribble(
  ~x1, ~x2, ~x3, ~x4, ~x5,
  1, 4, 7, 3, 2,
  2, 5, 8, 4, 3,
  3, 6, 3, 5, 4,
  4, 1, 2, 6, 5,
  5, 2, 1, 7, 6
)
library(magrittr)
# Rank x1..x3 within each row. Negating the values makes the largest value
# rank 1. ties.method = "min" gives tied values the smallest rank they share
# (row 3 -> 2 1 2); "max" would give them the largest one instead (3 1 3).
# The argument is spelled out in full rather than relying on partial matching
# of `ties`.
df %>%
  cbind(
    t(apply(-df[, 1:3], 1, rank, ties.method = "min")) %>%
      {
        colnames(.) <- paste0(colnames(.), "_rank")
        .
      }
  )
x1 x2 x3 x4 x5 x1_rank x2_rank x3_rank
1 1 4 7 3 2 3 2 1
2 2 5 8 4 3 3 2 1
3 3 6 3 5 4 2 1 2
4 4 1 2 6 5 1 3 2
5 5 2 1 7 6 1 2 3
As to why your code does not work - the for loop does not return anything, instead, it assigns a variable df_rank every iteration. To fix it, you could declare an object outside of the loop, and add content to it each iteration, and finally bind that to the original data.
# Result container sized from the data instead of a hard-coded row count.
m <- matrix(ncol = 3, nrow = nrow(df))
# seq_len() yields an empty sequence for a zero-row df, whereas 1:nrow(df)
# would iterate over c(1, 0).
for (i in seq_len(nrow(df))) {
  # Unary minus binds tighter than %>%, so the negated row is what gets piped.
  m[i, ] <- -df[i, ] %>%
    dplyr::select(x1, x2, x3) %>%
    base::rank(ties.method = "min")
}
colnames(m) <- paste0(names(df)[1:3], "_rank")
# bind_cols() is called with its namespace because dplyr is not attached here.
df %>% dplyr::bind_cols(m)

Ifelse condition across multiple variables using paste0 function to call up variables

I want to use an ifelse condition across multiple variables using paste0("Var",c(1,3,5)) to call up variables.
Here is some data.
set.seed(123)
df <- data.frame(Var1 = sample(1:5,10,replace = T),
Var2 = sample(1:5,10,replace = T),
Var3 = sample(1:5,10,replace = T),
Var4 = sample(1:5,10,replace = T),
Var5 = sample(1:5,10,replace = T))
df
Var1 Var2 Var3 Var4 Var5
1 3 5 2 1 5
2 3 3 1 1 5
3 2 3 3 2 4
4 2 1 4 3 5
5 3 4 1 4 2
6 5 1 3 5 1
7 4 1 5 5 1
8 1 5 4 3 3
9 2 3 2 1 1
10 3 2 5 2 5
As an example, I'm interested in Var1, Var3 and Var5. Using ifelse, if the value is equal to 4 or 5, the new variable is given a value of 1, else 0.
I'm using code to get the variables I'm interested in.
paste0( "Var", c(1,3,5) )
[1] "Var1" "Var3" "Var5"
I tried both of these, and know this doesn't work but is it possible to write a code that is similar to this
newvar <- ifelse( paste0("Var", c(1,3,5)) %in% c(4,5) , 1, 0)
newvar <- ifelse( df[ , paste0( "Var", c(1:3) ) ] %in% c(4,5) , 1, 0)
Any help greatly appreciated. Thanks
EDITED: Apologies if this wasn't clear. I want the ifelse to work across multiple variables to create a single variable. "newvar" is a single variable; I didn't want to create a separate ifelse result for each variable (newvar1, newvar3, newvar5). If any of those variables has a value of 4 or 5, newvar is 1; otherwise it is 0. Thanks
Use sapply:
v <- paste0("Var", c(1, 3, 5)) # or v <- c(1, 3, 5)
# One indicator column per selected variable: 1 where the value is 4 or 5,
# 0 otherwise. cbind() prefixes the resulting column names with "new.".
cbind(df, new = sapply(df[v], function(col) as.integer(col %in% 4:5)))
giving:
Var1 Var2 Var3 Var4 Var5 new.Var1 new.Var3 new.Var5
1 3 5 2 1 5 0 0 1
2 3 3 1 1 5 0 0 1
3 2 3 3 2 4 0 0 1
4 2 1 4 3 5 0 1 1
5 3 4 1 4 2 0 0 0
6 5 1 3 5 1 1 0 0
7 4 1 5 5 1 1 1 0
8 1 5 4 3 3 0 1 0
9 2 3 2 1 1 0 0 0
10 3 2 5 2 5 0 1 1
Added
Regarding the comment below this answer try any of these. They all use v defined in the next line except for the last solution.
# Names of the columns to test.
v <- paste0("Var", c(1, 3, 5)) # or v <- c(1, 3, 5)
# 1) Element-wise maximum across the per-column "is 4 or 5" vectors.
transform(df, newvar = do.call("pmax", lapply(df[v], `%in%`, 4:5)))
# 2) Row-wise maximum over a logical matrix marking cells equal to 4 or 5.
transform(df, newvar = apply(df[v] == 4 | df[v] == 5, 1, max))
# 3) Row-wise any(); the unary + turns the logical result into a number.
transform(df, newvar = apply(df[v], 1, function(x) +any(x %in% 4:5)))
# 4) Same idea as (1), with the three columns written out instead of using v.
transform(df, newvar = pmax(Var1 %in% 4:5, Var3 %in% 4:5, Var5 %in% 4:5))
any of which give:
Var1 Var2 Var3 Var4 Var5 newvar
1 3 5 2 1 5 1
2 3 3 1 1 5 1
3 2 3 3 2 4 1
4 2 1 4 3 5 1
5 3 4 1 4 2 0
6 5 1 3 5 1 1
7 4 1 5 5 1 1
8 1 5 4 3 3 1
9 2 3 2 1 1 0
10 3 2 5 2 5 1
Here's a solution based on the tidyverse.
library(tidyverse)
# Add a 0/1 indicator for each selected column: 1 when its value is 4 or 5.
# .names keeps the originals and stores the results as newVar1, newVar3, ...
df %>%
  mutate(
    across(
      c(Var1, Var3, Var5),
      \(col) ifelse(col %in% c(4, 5), 1, 0),
      .names = "new{.col}"
    )
  )
Var1 Var2 Var3 Var4 Var5 newVar1 newVar3 newVar5
1 3 5 2 1 5 0 0 1
2 3 3 1 1 5 0 0 1
3 2 3 3 2 4 0 0 1
4 2 1 4 3 5 0 1 1
5 3 4 1 4 2 0 0 0
6 5 1 3 5 1 1 0 0
7 4 1 5 5 1 1 1 0
8 1 5 4 3 3 0 1 0
9 2 3 2 1 1 0 0 0
10 3 2 5 2 5 0 1 1
The across function runs the function defined in its second argument on each of the columns defined by its first argument. The .names argument is optional and provides names for the new columns (as opposed to overwriting the originals). {.col} is a placeholder for the name of the current column.
Here's a base R solution close to what OP tried:
df$newVar <- sapply(df[,c(1,3,5)], function(x) ifelse(x == 4|x == 5, 1, 0))
or, even closer:
df$newVar <- sapply(df[,c(1,3,5)], function(x) ifelse(x %in% c(4,5), 1, 0))
If you want to use both paste0() and ifelse(), here is an example:
# Uses `df`, the data frame built in the question; the previously referenced
# `df1` is not defined anywhere in this thread.
x <- paste0("Var", c(1, 3, 5))
# One indicator column per selected variable, filled in by the loop below.
newvar <- matrix(NA, nrow(df), length(x))
for (j in seq_along(x)) {
  newvar[, j] <- ifelse(df[, x[j]] %in% c(4, 5), 1, 0)
}
# Collapse to a single flag: 1 if any selected variable is 4 or 5, else 0.
final.df <- data.frame(df, newvar = ifelse(rowSums(newvar) > 0, 1, 0))
final.df
Var1 Var2 Var3 Var4 Var5 newvar
1 3 5 2 1 5 1
2 3 3 1 1 5 1
3 2 3 3 2 4 1
4 2 1 4 3 5 1
5 3 4 1 4 2 0
6 5 1 3 5 1 1
7 4 1 5 5 1 1
8 1 5 4 3 3 1
9 2 3 2 1 1 0
10 3 2 5 2 5 1

Replacing leading NAs by group with 0s, but Keep other NAs

I have a COVID data frame grouped by state with 60 columns. Because COVID started at different times across states, there are NAs before the first values for different states. Different indicators (columns) also have data starting at different times. Below is a sample df I made for demonstration.
state <- c(rep("A", 6), rep("B", 6))
time <- c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6)
x1 <- c(NA, NA, NA, 4, 5, 6, NA, NA, 3, 4, 5, NA)
x2 <- c(NA, 2, 3, NA, 5, 6, NA, NA, NA, 4, 5, 6)
x3 <- c(NA, NA, 3, 4, 5, NA, NA, 2, NA, 4, 5, 6)
df <- data.frame(state, time, x1, x2, x3)
df
state time x1 x2 x3
1 A 1 NA NA NA
2 A 2 NA 2 NA
3 A 3 NA 3 3
4 A 4 4 NA 4
5 A 5 5 5 5
6 A 6 6 6 NA
7 B 1 NA NA NA
8 B 2 NA NA 2
9 B 3 3 NA NA
10 B 4 4 4 4
11 B 5 5 5 5
12 B 6 NA 6 6
I'm trying to replace all the leading NAs with 0 for each state, but keep other NAs. The results should look like below:
state time x1 x2 x3
1 A 1 0 0 0
2 A 2 0 2 0
3 A 3 0 3 3
4 A 4 4 NA 4
5 A 5 5 5 5
6 A 6 6 6 NA
7 B 1 0 0 0
8 B 2 0 0 2
9 B 3 3 0 NA
10 B 4 4 4 4
11 B 5 5 5 5
12 B 6 NA 6 6
One solution I came up with is to replace NAs by the condition of the cumulative sums, as below:
df1 <- df %>%
group_by(state) %>%
mutate(
check.sum1 = cumsum(replace_na(x1, 0)),
x1 = if_else(check.sum1 != 0, x1, 0),
check.sum2 = cumsum(replace_na(x2, 0)),
x2 = if_else(check.sum2 != 0, x2, 0),
check.sum3 = cumsum(replace_na(x3, 0)),
x3 = if_else(check.sum3 != 0, x3, 0)
)
df1
This method worked fine. But since there are 60 columns, I want to wrap it up with a function and/or use apply(). But it gives out error messages:
df2 <- df %>%
group_by(state) %>%
apply(
df[3:5], MARGIN = 2, FUN = function(x) mutate(
check.sum = cumsum(replace_na(x, 0)),
x = if_else(check.sum != 0, x, 0)
)
)
Error in FUN(newX[, i], ...) : unused argument (df[3:5])
#or
func <- function(x) {
mutate(
check.sum = cumsum(replace_na(x, 0)),
x = if_else(check.sum != 0, x, 0)
)
}
df3 <- df %>%
group_by(state) %>%
apply(
df[3:5], MARGIN = 2, func
)
Error in match.fun(FUN) :
'df[3:5]' is not a function, character or symbol
So there are three specific questions:
How to create the user-defined functions by using columns as arguments.
How to use apply() function. and
Are there any other ways of using existing functions, such as na.locf() or na.trim(), to do the job?
Thank you!
Using by and, within each state, replacing the NAs that occur before the first non-missing value of a column, i.e. where the running count of non-NA values is still zero.
do.call(rbind, by(df, df$state, \(x) {
  x[] <- lapply(x, \(z) {
    # cumsum(!is.na(z)) stays at 0 only until the first observed value, so
    # later NA runs (e.g. 1, NA, NA, 2) and trailing NAs are left untouched.
    z[cumsum(!is.na(z)) == 0] <- 0
    z
  })
  x
}))
# state time x1 x2 x3
# A.1 A 1 0 0 0
# A.2 A 2 0 2 0
# A.3 A 3 0 3 3
# A.4 A 4 4 NA 4
# A.5 A 5 5 5 5
# A.6 A 6 6 6 NA
# B.7 B 1 0 0 0
# B.8 B 2 0 0 2
# B.9 B 3 3 0 NA
# B.10 B 4 4 4 4
# B.11 B 5 5 5 5
# B.12 B 6 NA 6 6
Note: The \(x) function shorthand requires R >= 4.1; on older versions, write function(x) instead.
Using dplyr, we can do
library(dplyr)
# Within each state, an entry is "leading" while no non-missing value has been
# seen yet, i.e. while the running count of non-NA entries is still zero.
df %>%
  group_by(state) %>%
  mutate(
    across(starts_with("x"), \(col) replace(col, cumsum(!is.na(col)) == 0, 0))
  ) %>%
  ungroup()
# A tibble: 12 × 5
state time x1 x2 x3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1 0 0 0
2 A 2 0 2 0
3 A 3 0 3 3
4 A 4 4 NA 4
5 A 5 5 5 5
6 A 6 6 6 NA
7 B 1 0 0 0
8 B 2 0 0 2
9 B 3 3 0 NA
10 B 4 4 4 4
11 B 5 5 5 5
12 B 6 NA 6 6

How to add row and column to a dataframe of different length?

I have two dataframes of different length:
Headers <- data.frame(x = paste0("x", 1:4), y = 1:4)
Dataset <- data.frame(H = c(20, 10, 11, 8, 10), W = c(30, 20, 30, 10, 6))
Headers
x y
1 x1 1
2 x2 2
3 x3 3
4 x4 4
Dataset
H W
1 20 30
2 10 20
3 11 30
4 8 10
5 10 6
I need to convert column 'x' from 'Headers' to header, and column 'y' to corresponding values, and then bind to 'Dataset':
H W x1 x2 x3 x4
20 30 1 2 3 4
10 20 1 2 3 4
11 30 1 2 3 4
8 10 1 2 3 4
10 6 1 2 3 4
Here is the code which I tried:
H <- t(Headers)
Dataset <- cbind(H, Dataset)
names(H) <- NULL
Dataset <- qpcR:::cbind.na(H, Dataset)
Any help will be appreciated. Thanks
Transpose 'y' and repeat to the desired number of rows. Set column names to 'x'.
# Repeat the single row of 'y' values once per row of Dataset. drop = FALSE
# keeps the result a matrix even when Dataset has only one row, so that
# `colnames<-` still has columns to name.
cbind(
  Dataset,
  `colnames<-`(t(Headers$y)[rep(1, nrow(Dataset)), , drop = FALSE], Headers$x)
)
H W x1 x2 x3 x4
1 20 30 1 2 3 4
2 10 20 1 2 3 4
3 11 30 1 2 3 4
4 8 10 1 2 3 4
5 10 6 1 2 3 4
A data.table approach:
library(data.table)
# data.table's dcast() expects a data.table, so Headers is converted first.
# The formula `. ~ x` spreads the 'x' labels into columns holding 'y'.
wide <- dcast(as.data.table(Headers), . ~ x, value.var = "y")
# Back to a data frame, without the placeholder "." column, so that cbind()
# returns a plain data frame with the single row repeated for every row.
cbind(Dataset, as.data.frame(wide)[, -1])
Output:
H W x1 x2 x3 x4
1 20 30 1 2 3 4
2 10 20 1 2 3 4
3 11 30 1 2 3 4
4 8 10 1 2 3 4
5 10 6 1 2 3 4
A tidyverse approach:
library(tidyverse)
# Spread the x labels into columns holding the matching y values (one row),
# then let cbind() repeat that row alongside every row of Dataset.
# pivot_wider() replaces the superseded spread() and the deprecated
# summarise_all(funs(...)) step, so no helper rowname column is needed.
Headers %>%
  pivot_wider(names_from = x, values_from = y) %>%
  cbind(Dataset, .)
Output:
H W x1 x2 x3 x4
1 20 30 1 2 3 4
2 10 20 1 2 3 4
3 11 30 1 2 3 4
4 8 10 1 2 3 4
5 10 6 1 2 3 4
You could also go with basic R
# Each y value is repeated nrow(Dataset) times and laid out column by column.
# dimnames puts the labels from Headers$x on the new columns; without it they
# would be auto-named X1, X2, ...
cbind(
  Dataset,
  matrix(
    rep(Headers$y, each = nrow(Dataset)),
    nrow = nrow(Dataset),
    dimnames = list(NULL, as.character(Headers$x))
  )
)

Resources