Repeating code to add blank rows in dataframe - r

I am trying to find a way to replicate the following code 4 times:
df3_1<- df3_1 %>% add_row(.before
= 2)
I tried the 'rep' function but it didn't work out. Is there any way to repeat this code 4 times so I can add multiple blank rows exactly in the manner described above (i.e. with respect to a specific row number).
Thanks!

In base R, you can do:
n <- 4
nr <- nrow(df3_1)
df3_1[append(seq(nr), values = rep(nr + 1, n), after = 1), ]
Or a tidyverse approach:
library(tibble)
n <- 4
df3_1 %>%
add_row(!!names(.)[1] := rep(NA, n), .before = 2)

I would think base::Reduce() or purrr::reduce() are best designed for this kind of job
library(dplyr)
library(purrr)
df3_1 <- data.frame(a = 1:5,
b = letters[1:5])
# purrr reduce
reduce(1:4,
function(df, x){
add_row(df, .before = 2)
},
.init = df3_1)
#> a b
#> 1 1 a
#> 2 NA <NA>
#> 3 NA <NA>
#> 4 NA <NA>
#> 5 NA <NA>
#> 6 2 b
#> 7 3 c
#> 8 4 d
#> 9 5 e
# base R Reduce
Reduce(function(df, x){
add_row(df, .before = 2)
},
1:4,
init = df3_1)
#> a b
#> 1 1 a
#> 2 NA <NA>
#> 3 NA <NA>
#> 4 NA <NA>
#> 5 NA <NA>
#> 6 2 b
#> 7 3 c
#> 8 4 d
#> 9 5 e
You dont actually need the x parameter here inside of your function but to determine how often the function should be called, that is, how many blank rows should be inserted.

Try this (updated simplified version based on #27ϕ9 generalised approach):
df3_1 <- data.frame(a = 1:5,
b = letters[1:5])
library(dplyr)
df3_1 %>%
add_row(a = rep(NA, 4), .before = 2)
#> a b
#> 1 1 a
#> 2 NA <NA>
#> 3 NA <NA>
#> 4 NA <NA>
#> 5 NA <NA>
#> 6 2 b
#> 7 3 c
#> 8 4 d
#> 9 5 e
Created on 2020-07-01 by the reprex package (v0.3.0)

Related

Expand each group to the max n of rows

How can I expand a group to length of the max group:
df <- structure(list(ID = c(1L, 1L, 2L, 3L, 3L, 3L), col1 = c("A",
"B", "O", "U", "L", "R")), class = "data.frame", row.names = c(NA,
-6L))
ID col1
1 A
1 B
2 O
3 U
3 L
3 R
Desired Output:
1 A
1 B
NA NA
2 O
NA NA
NA NA
3 U
3 L
3 R
You can take advantage of the fact that df[n_bigger_than_nrow,] gives a row of NAs
dplyr
max_n <- max(count(df, ID)$n)
df %>%
group_by(ID) %>%
summarise(cur_data()[seq(max_n),])
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups: ID [3]
#> ID col1
#> <int> <chr>
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
base R
n <- tapply(df$ID, df$ID, length)
max_n <- max(n)
i <- lapply(n, \(x) c(seq(x), rep(Inf, max_n - x)))
i <- Map(`+`, i, c(0, cumsum(head(n, -1))))
df <- df[unlist(i),]
rownames(df) <- NULL
df$ID <- rep(as.numeric(names(i)), each = max_n)
df
#> ID col1
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
Here's a base R solution.
split the df by the ID column, then use lapply to iterate over the split df, and rbind with a data frame of NA if there's fewer row than 3 (max(table(df$ID))).
do.call(rbind,
lapply(split(df, df$ID),
\(x) rbind(x, data.frame(ID = NA, col1 = NA)[rep(1, max(table(df$ID)) - nrow(x)), ]))
)
ID col1
1.1 1 A
1.2 1 B
1.3 NA <NA>
2.3 2 O
2.1 NA <NA>
2.1.1 NA <NA>
3.4 3 U
3.5 3 L
3.6 3 R
Here is a possible tidyverse solution. We can use add_row inside of summarise to add n number of rows to each group. I use max(count(df, ID)$n) to get the max group length, then I subtract that from the number of rows in each group to get the total number of rows that need to be added for each group. I use rep to produce the correct number of values that we need to add for each group. Finally, I replace ID with NA when there is an NA in col1.
library(tidyverse)
df %>%
group_by(ID) %>%
summarise(add_row(cur_data(),
col1 = rep(NA_character_,
unique(max(count(df, ID)$n) - n()))),
.groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Output
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Or another option without using add_row:
library(dplyr)
# Get maximum number of rows for all groups
N = max(count(df,ID)$n)
df %>%
group_by(ID) %>%
summarise(col1 = c(col1, rep(NA, N-length(col1))), .groups = "drop") %>%
mutate(ID = replace(ID, is.na(col1), NA))
Another option could be:
df %>%
group_split(ID) %>%
map_dfr(~ rows_append(.x, tibble(col1 = rep(NA_character_, max(pull(count(df, ID), n)) - group_size(.x)))))
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
A base R using merge + rle
merge(
transform(
data.frame(ID = with(rle(df$ID), rep(values, each = max(lengths)))),
q = ave(ID, ID, FUN = seq_along)
),
transform(
df,
q = ave(ID, ID, FUN = seq_along)
),
all = TRUE
)[-2]
gives
ID col1
1 1 A
2 1 B
3 1 <NA>
4 2 O
5 2 <NA>
6 2 <NA>
7 3 U
8 3 L
9 3 R
A data.table option may also work
> setDT(df)[, .(col1 = `length<-`(col1, max(df[, .N, ID][, N]))), ID]
ID col1
1: 1 A
2: 1 B
3: 1 <NA>
4: 2 O
5: 2 <NA>
6: 2 <NA>
7: 3 U
8: 3 L
9: 3 R
An option to tidyr::complete the ID and row_new, using row_old to replace ID with NA.
library (tidyverse)
df %>%
group_by(ID) %>%
mutate(
row_new = row_number(),
row_old = row_number()) %>%
ungroup() %>%
complete(ID, row_new) %>%
mutate(ID = if_else(is.na(row_old),
NA_integer_,
ID)) %>%
select(-matches("row_"))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA <NA>
4 2 O
5 NA <NA>
6 NA <NA>
7 3 U
8 3 L
9 3 R
n <- max(table(df$ID))
df %>%
group_by(ID) %>%
summarise(col1 =`length<-`(col1, n), .groups = 'drop') %>%
mutate(ID = `is.na<-`(ID, is.na(col1)))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Another base R solution using sequence.
print(
df[
sequence(
abs(rep(i <- rle(df$ID)$lengths, each = 2) - c(0L, max(i))),
rep(cumsum(c(1L, i))[-length(i) - 1L], each = 2) + c(0L, nrow(df)),
),
],
row.names = FALSE
)
#> ID col1
#> 1 A
#> 1 B
#> NA <NA>
#> 2 O
#> NA <NA>
#> NA <NA>
#> 3 U
#> 3 L
#> 3 R

Reverse the order of non-NA values in a variable

I am interested in reversing the values for a column that has NA values in a tidy way.
The rev call won't do the trick here:
library(tidyverse)
tibble(
Which = LETTERS[1:11],
x = c( c(3,1,4,2,16), NA, NA, 4, rep(NA, 2), 10)) %>%
mutate(y = rev(x))
As it completely reverses the values (NAs included).
I essentially want a tidy mutate command (no splitting / joining) that reverses the values for the Which column so that E has value 1 (max becomes min) B has value 16 (min becomes max), etc - and NA values remain NA (F, G, I & J).
Edit:
Several answers do not achieve intended outcome. The question is aimed at effectively having a reverse (rev) work while keeping NAs in position.
#Moody_Mudskipper has a solution to the case where there's no repeats, but it fails when there are repeats, e.g.:
rev_na <- function(x) setNames(sort(x), sort(x, TRUE))[as.character(x)]
Works here:
tibble(
Which = LETTERS[1:11],
x = c( c(3,1,4,2,16), NA, NA, 4, rep(NA, 2), 10)) %>%
mutate(y = rev_na(x))
Fails here:
tibble(
Which = LETTERS[1:7],
x = c(3,1,9,9,9, 9, 10)
) %>% mutate(y = rev_na(x), z = rev(x))
If you can tolerate a little hack :
tibble(
Which = LETTERS[1:11],
x = c( c(3,1,4,2,16), NA, NA, 4, rep(NA, 2), 10)) %>%
mutate(y = setNames(sort(x), sort(x, TRUE))[as.character(x)])
#> # A tibble: 11 x 3
#> Which x y
#> <chr> <dbl> <dbl>
#> 1 A 3 4
#> 2 B 1 16
#> 3 C 4 3
#> 4 D 2 10
#> 5 E 16 1
#> 6 F NA NA
#> 7 G NA NA
#> 8 H 4 3
#> 9 I NA NA
#> 10 J NA NA
#> 11 K 10 2
Created on 2021-05-11 by the reprex package (v0.3.0)
This will do
data.frame(
Which = LETTERS[1:11],
x = c( c(3,1,4,2,16), NA, NA, 4, rep(NA, 2), 10)) -> df
df %>% group_by(d = is.na(x)) %>%
arrange(x) %>%
mutate(y = ifelse(!d, rev(x), x)) %>%
ungroup %>% select(-d)
# A tibble: 11 x 3
Which x y
<chr> <dbl> <dbl>
1 B 1 16
2 D 2 10
3 A 3 4
4 C 4 4
5 H 4 3
6 K 10 2
7 E 16 1
8 F NA NA
9 G NA NA
10 I NA NA
11 J NA NA
Needless to say you may arrange back the results if your Which was arranged already or creating a row_number() at the start of the syntax.
df %>%
group_by(d = is.na(x)) %>%
arrange(x) %>%
mutate(y = ifelse(!d, rev(x), x)) %>%
ungroup %>% select(-d) %>%
arrange(Which)
# A tibble: 11 x 3
Which x y
<chr> <dbl> <dbl>
1 A 3 4
2 B 1 16
3 C 4 4
4 D 2 10
5 E 16 1
6 F NA NA
7 G NA NA
8 H 4 3
9 I NA NA
10 J NA NA
11 K 10 2

How to lag a specific column of a data frame in R

Input
(Say d is the data frame below.)
a b c
1 5 7
2 6 8
3 7 9
I want to shift the contents of column b one position down and put an arbitrary number in the first position in b. How do I do this? I would appreciate any help in this regard. Thank you.
I tried c(6,tail(d["b"],-1)) but it does not produce (6,5,6).
Output
a b c
1 6 7
2 5 8
3 6 9
Use head instead
df$b <- c(6, head(df$b, -1))
# a b c
#1 1 6 7
#2 2 5 8
#3 3 6 9
You could also use lag in dplyr
library(dplyr)
df %>% mutate(b = lag(b, default = 6))
Or shift in data.table
library(data.table)
setDT(df)[, b:= shift(b, fill = 6)]
A dplyr solution uses lag with an explicit default argument, if you prefer:
library(dplyr)
d <- tibble(a = 1:3, b = 5:7, c = 7:9)
d %>% mutate(b = lag(b, default = 6))
#> # A tibble: 3 x 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 6 7
#> 2 2 5 8
#> 3 3 6 9
Created on 2019-12-05 by the reprex package (v0.3.0)
Here is a solution similar to the head approach by #Ronak Shah
df <- within(df,b <- c(runif(1),b[-1]))
where a uniformly random variable is added to the first place of b column:
> df
a b c
1 1 0.6644704 7
2 2 6.0000000 8
3 3 7.0000000 9
Best solution below will help in any lag or lead position
d <- data.frame(a=c(1,2,3),b=c(5,6,7),c=c(7,8,9))
d1 <- d %>% arrange(b) %>% group_by(b) %>%
mutate(b1= dplyr::lag(b, n = 1, default = NA))

Is there a way to repeat a function a fixed number of times and save every result as a data frame?

let's say I have a data frame which looks something like this
A <- c(1:100)
B <- c(0.5:100)
df <- data.frame(A,B)
And I want to get 25 random rows out of this data frame with
df[sample(nrow(df), size = 25, replace = FALSE),]
But now I want to repeat this sample function 100 times and save every result individually.
I've tried to use the repeat function but I can't find a way to save every result.
Thank you.
As mentioned in the comments, the replicate implementation can reach your goal, i.e.,
res <- replicate(100,df[sample(nrow(df), size = 25, replace = FALSE),],simplify = F)
An alternative is to use sapply (or lapply), i.e.,
res <- sapply(1:100, function(k) df[sample(nrow(df), size = 25, replace = FALSE),],simplify = F)
or
res <- lapply(1:100, function(k) df[sample(nrow(df), size = 25, replace = FALSE),])
replicate() is a great option for this problem.
If you would like your final results in a single table with a column for the ID variable, you can use bind_rows() from the dplyr package. Here is a smaller example (3 samples from a data set of 10 rows) that may allow easier understanding of replicate()'s behavior:
library(dplyr, warn.conflicts = FALSE)
# make a smaller data set of 10 rows
d <- data.frame(
A = 1:10,
B = LETTERS[1:10]
) %>% print
#> A B
#> 1 1 A
#> 2 2 B
#> 3 3 C
#> 4 4 D
#> 5 5 E
#> 6 6 F
#> 7 7 G
#> 8 8 H
#> 9 9 I
#> 10 10 J
# create 3 samples, with each sample containing 4 rows
reps <- replicate(3, d[sample(nrow(d), 4, FALSE), ], simplify = FALSE) %>% print
#> [[1]]
#> A B
#> 2 2 B
#> 5 5 E
#> 6 6 F
#> 1 1 A
#>
#> [[2]]
#> A B
#> 3 3 C
#> 2 2 B
#> 5 5 E
#> 8 8 H
#>
#> [[3]]
#> A B
#> 4 4 D
#> 9 9 I
#> 3 3 C
#> 8 8 H
# bind the list elements into a single tibble, with an ID column for the sample
bind_rows(reps, .id = "sample_id")
#> sample_id A B
#> 1 1 2 B
#> 2 1 5 E
#> 3 1 6 F
#> 4 1 1 A
#> 5 2 3 C
#> 6 2 2 B
#> 7 2 5 E
#> 8 2 8 H
#> 9 3 4 D
#> 10 3 9 I
#> 11 3 3 C
#> 12 3 8 H
Created on 2019-12-02 by the reprex package (v0.3.0)

R ifelse statement with nested conditions

x=data.frame("l"=c('a','b','d','a','c','c','d'),
"t"=c(1:7))
x$l1 = ifelse(x$l=="a",1,
ifelse(x$l=="b",2,
ifelse(x$l=="c",3,
ifelse(x$l=="d" ,4,
ifelse(x$l=="d" & x$t==7,5,NA)))))
As you can see on the final statement, if l == "d" & t == 7 then l1 should equals to 5 but it gets set equals to 4.
ifelse execution stops wherever it first hits TRUE.
Simply reordering your conditions -
x$l1 = ifelse(x$l=="a",1,
ifelse(x$l=="b",2,
ifelse(x$l=="c",3,
ifelse(x$l=="d" & x$t==7,5,
ifelse(x$l=="d" ,4,NA)))))
x
l t l1
1 a 1 1
2 b 2 2
3 d 3 4
4 a 4 1
5 c 5 3
6 c 6 3
7 d 7 5
Also consider using case_when() from dplyr package; it would be much more readable.
It is better not to use any nested ifelse or case_when. This can be done much easily and efficiently with a key/value dataset merge
keydat <- data.frame(l = letters[1:4],l1 = 1:4)
left_join(x, keydat) %>%
mutate(l1 = replace(l1, l== 'd' & t == 7, 5))
# l t l1
#1 a 1 1
#2 b 2 2
#3 d 3 4
#4 a 4 1
#5 c 5 3
#6 c 6 3
#7 d 7 5
Based on #Shree's answer, here it comes the tidyverse version:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
x <- data.frame("l"=c('a','b','d','a','c','c','d'),
"t"=c(1:7))
x %>%
as_tibble() %>%
mutate(l1 = case_when(
l == "a" ~ 1,
l == "b" ~ 2,
l == "c" ~ 3,
l == "d" & t == 7 ~ 5,
l == "d" ~ 4
))
#> # A tibble: 7 x 3
#> l t l1
#> <fct> <int> <dbl>
#> 1 a 1 1
#> 2 b 2 2
#> 3 d 3 4
#> 4 a 4 1
#> 5 c 5 3
#> 6 c 6 3
#> 7 d 7 5
Created on 2019-06-17 by the reprex package (v0.3.0)

Resources