Add rows to dataframe in R based on values in column - r

I have a dataframe with 2 columns: time and day. there are 3 days and for each day, time runs from 1 to 12. I want to add new rows for each day with times: -2, 1 and 0. How do I do this?
I have tried using add_row and specifying the row number to add to, but this changes each time a new row is added making the process tedious. Thanks in advance
picture of the dataframe

We could use add_row
then slice the desired sequence
and bind all to a dataframe:
library(tibble)
library(dplyr)
df1 <- df %>%
add_row(time = -2:0, Day = c(1,1,1), .before = 1) %>%
slice(1:15)
df2 <- bind_rows(df1, df1, df1) %>%
mutate(Day = rep(row_number(), each=15, length.out = n()))
Output:
# A tibble: 45 x 2
time Day
<dbl> <int>
1 -2 1
2 -1 1
3 0 1
4 1 1
5 2 1
6 3 1
7 4 1
8 5 1
9 6 1
10 7 1
11 8 1
12 9 1
13 10 1
14 11 1
15 12 1
16 -2 2
17 -1 2
18 0 2
19 1 2
20 2 2
21 3 2
22 4 2
23 5 2
24 6 2
25 7 2
26 8 2
27 9 2
28 10 2
29 11 2
30 12 2
31 -2 3
32 -1 3
33 0 3
34 1 3
35 2 3
36 3 3
37 4 3
38 5 3
39 6 3
40 7 3
41 8 3
42 9 3
43 10 3
44 11 3
45 12 3

Here's a fast way to create the desired dataframe from scratch using expand.grid(), rather than adding individual rows:
df <- expand.grid(-2:12,1:3)
colnames(df) <- c("time","day")
Results:
df
time day
1 -2 1
2 -1 1
3 0 1
4 1 1
5 2 1
6 3 1
7 4 1
8 5 1
9 6 1
10 7 1
11 8 1
12 9 1
13 10 1
14 11 1
15 12 1
16 -2 2
17 -1 2
18 0 2
19 1 2
20 2 2
21 3 2
22 4 2
23 5 2
24 6 2
25 7 2
26 8 2
27 9 2
28 10 2
29 11 2
30 12 2
31 -2 3
32 -1 3
33 0 3
34 1 3
35 2 3
36 3 3
37 4 3
38 5 3
39 6 3
40 7 3
41 8 3
42 9 3
43 10 3
44 11 3
45 12 3

You can use tidyr::crossing
library(dplyr)
library(tidyr)
add_values <- c(-2, 1, 0)
crossing(time = add_values, Day = unique(day$Day)) %>%
bind_rows(day) %>%
arrange(Day, time)
# A tibble: 45 x 2
# time Day
# <dbl> <int>
# 1 -2 1
# 2 0 1
# 3 1 1
# 4 1 1
# 5 2 1
# 6 3 1
# 7 4 1
# 8 5 1
# 9 6 1
#10 7 1
# … with 35 more rows
If you meant -2, -1 and 0 you can also use complete.
tidyr::complete(day, Day, time = -2:0)

Related

Exclude rows where value used in another row

Imagine you have the following data set:
df = data.frame(ID = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20), gender= c(1,2,1,2,2,2,2,1,1,2,1,2,1,2,2,2,2,1,1,2),
PID = c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10))
how can I write a code that removes the rows in the df whose gender and PID are the same (see picture). Please imagine that the code is over 1000 rows long (so it should be a solution that automatically searches for the right values to exclude).
base R
df[ave(rep(TRUE, nrow(df)), df[,c("gender","paar")], FUN = function(z) !any(duplicated(z))),]
# ID gender paar
# 1 1 1 1
# 2 2 2 1
# 3 3 1 2
# 4 4 2 2
# 7 7 2 4
# 8 8 1 4
# 9 9 1 5
# 10 10 2 5
# 11 11 1 6
# 12 12 2 6
# 13 13 1 7
# 14 14 2 7
# 17 17 2 9
# 18 18 1 9
# 19 19 1 10
# 20 20 2 10
dplyr
library(dplyr)
df %>%
group_by(gender, paar) %>%
filter(!any(duplicated(cbind(gender, paar)))) %>%
ungroup()
In base R, we may use subset after removing the observations where the group count for 'gender' and 'paar' are not 1
subset(df, ave(seq_along(gender), gender, paar, FUN = length) == 1)
Or with duplicated
df[!(duplicated(df[-1])|duplicated(df[-1], fromLast = TRUE)),]
-output
ID gender paar
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
7 7 2 4
8 8 1 4
9 9 1 5
10 10 2 5
11 11 1 6
12 12 2 6
13 13 1 7
14 14 2 7
17 17 2 9
18 18 1 9
19 19 1 10
20 20 2 10
Here is one more: :-)
library(dplyr)
df %>%
group_by(gender, PID) %>%
filter(is.na(ifelse(n()>1, 1, NA)))
ID gender PID
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
5 7 2 4
6 8 1 4
7 9 1 5
8 10 2 5
9 11 1 6
10 12 2 6
11 13 1 7
12 14 2 7
13 17 2 9
14 18 1 9
15 19 1 10
16 20 2 10
Another dplyr option could be:
df %>%
filter(with(rle(paste0(gender, PID)), rep(lengths == 1, lengths)))
ID gender PID
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
5 7 2 4
6 8 1 4
7 9 1 5
8 10 2 5
9 11 1 6
10 12 2 6
11 13 1 7
12 14 2 7
13 17 2 9
14 18 1 9
15 19 1 10
16 20 2 10
If the duplicated values can occur also between non-consecutive rows:
df %>%
arrange(gender, PID) %>%
filter(with(rle(paste0(gender, PID)), rep(lengths == 1, lengths)))
Using aggregate
na.omit(aggregate(. ~ gender + PID, df, function(x)
ifelse(length(x) == 1, x, NA)))
gender PID ID
1 1 1 1
2 2 1 2
3 1 2 3
4 2 2 4
6 1 4 8
7 2 4 7
8 1 5 9
9 2 5 10
10 1 6 11
11 2 6 12
12 1 7 13
13 2 7 14
15 1 9 18
16 2 9 17
17 1 10 19
18 2 10 20
With dplyr
library(dplyr)
df %>%
group_by(gender, PID) %>%
filter(n() == 1) %>%
ungroup()
# A tibble: 16 × 3
ID gender PID
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
5 7 2 4
6 8 1 4
7 9 1 5
8 10 2 5
9 11 1 6
10 12 2 6
11 13 1 7
12 14 2 7
13 17 2 9
14 18 1 9
15 19 1 10
16 20 2 10

Finding cumulative second max per group in R

I have a dataset where I would like to create a new variable that is the cumulative second largest value of another variable, and I would like to perform this function per group.
Let's say I create the following example data frame:
(df1 <- data.frame(patient = rep(1:5, each=8), visit = rep(1:2,each=4,5), trial = rep(1:4,10), var1 = sample(1:50,20,replace=TRUE)))
This is pretend data that represents 5 patients who each had 2 study visits, and each visit had 4 trials with a measurement taken (var1).
> head(df1,n=20)
patient visit trial var1
1 1 1 1 25
2 1 1 2 23
3 1 1 3 48
4 1 1 4 37
5 1 2 1 41
6 1 2 2 45
7 1 2 3 8
8 1 2 4 9
9 2 1 1 26
10 2 1 2 14
11 2 1 3 41
12 2 1 4 35
13 2 2 1 37
14 2 2 2 30
15 2 2 3 14
16 2 2 4 28
17 3 1 1 34
18 3 1 2 19
19 3 1 3 28
20 3 1 4 10
I would like to create a new variable, cum2ndmax, that is the cumulative 2nd largest value of var1 and I would like to group this variable by patient # and visit #.
I figured out how to calculate the cumulative 2nd max number like so:
df1$cum2ndmax <- sapply(seq_along(df1$var1),function(x){sort(df1$var1[seq(x)],decreasing=TRUE)[2]})
df1
However, this calculates the cumulative 2nd max across the whole dataset, not for each group. I have attempted to calculate this variable using grouped data like so after installing and loading package dplyr:
library(dplyr)
df2 <- df1 %>%
group_by(patient,visit) %>%
mutate(cum2ndmax = sapply(seq_along(df1$var1),function(x){sort(df1$var1[seq(x)],decreasing=TRUE)[2]}))
But I get an error: Error: Problem with mutate() input cum2ndmax. x Input cum2ndmax can't be recycled to size 4.
Ideally, my result would look something like this:
patient visit trial var1 cum2ndmax
1 1 1 25 NA
1 1 2 23 23
1 1 3 48 25
1 1 4 37 37
1 2 1 41 NA
1 2 2 45 41
1 2 3 8 41
1 2 4 9 41
2 1 1 26 NA
2 1 2 14 14
2 1 3 41 26
2 1 4 35 35
… … … … …
Any help in getting this to work in R would be much appreciated! Thank you!
One dplyr and purrr option could be:
df1 %>%
group_by(patient, visit) %>%
mutate(cum_second_max = map_dbl(.x = seq_along(var1),
~ ifelse(.x == 1, NA, var1[dense_rank(-var1[1:.x]) == 2])))
patient visit trial var1 cum_second_max
<int> <int> <int> <int> <dbl>
1 1 1 1 25 NA
2 1 1 2 23 23
3 1 1 3 48 25
4 1 1 4 37 37
5 1 2 1 41 NA
6 1 2 2 45 41
7 1 2 3 8 41
8 1 2 4 9 41
9 2 1 1 26 NA
10 2 1 2 14 14
11 2 1 3 41 26
12 2 1 4 35 35
13 2 2 1 37 NA
14 2 2 2 30 30
15 2 2 3 14 30
16 2 2 4 28 30
17 3 1 1 34 NA
18 3 1 2 19 19
19 3 1 3 28 28
20 3 1 4 10 28
Here is an Rcpp solution.
cum_second_max is a modification of cummax which keeps track of the second maximum.
library(tidyverse)
Rcpp::cppFunction("
NumericVector cum_second_max(NumericVector x) {
double max_value = R_NegInf, max_value2 = NA_REAL;
NumericVector result(x.length());
for (int i = 0 ; i < x.length() ; ++i) {
if (x[i] > max_value) {
max_value2 = max_value;
max_value = x[i];
}
else if (x[i] < max_value && x[i] > max_value2) {
max_value2 = x[i];
}
result[i] = isinf(max_value2) ? NA_REAL : max_value2;
}
return result;
}
")
df1 %>%
group_by(patient, visit) %>%
mutate(
c2max = cum_second_max(var1)
)
#> # A tibble: 20 x 5
#> # Groups: patient, visit [5]
#> patient visit trial var1 c2max
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 25 NA
#> 2 1 1 2 23 23
#> 3 1 1 3 48 25
#> 4 1 1 4 37 37
#> 5 1 2 1 41 NA
#> 6 1 2 2 45 41
#> 7 1 2 3 8 41
#> 8 1 2 4 9 41
#> 9 2 1 1 26 NA
#> 10 2 1 2 14 14
#> 11 2 1 3 41 26
#> 12 2 1 4 35 35
#> 13 2 2 1 37 NA
#> 14 2 2 2 30 30
#> 15 2 2 3 14 30
#> 16 2 2 4 28 30
#> 17 3 1 1 34 NA
#> 18 3 1 2 19 19
#> 19 3 1 3 28 28
#> 20 3 1 4 10 28
Thanks so much everyone! I really appreciate it and could not have solved this without your help! In the end, I ended up using a similar approach suggested by tmfmnk since I was already using dplyr. I found an interesting result with the code suggested by tmkmnk where for some reason it gave me a column of values that just repeated the first row's number. With a small tweak to change dense_rank to order, I got exactly what I wanted like this:
df1 %>%
group_by(patient, visit) %>%
mutate(cum_second_max = map_dbl(.x = seq_along(var1),
~ ifelse(.x == 1, NA, var1[order(-var1[1:.x])[2])))

count row number first and then insert new row by condition [duplicate]

This question already has answers here:
How to create missing value for repeated measurement data?
(2 answers)
Closed 4 years ago.
I need to count the number of rows first after a group_by function and add up new row(s) to 6 row if the row number < 6.
My df has three variables (v1,v2,v3): v1 = group name, v2 = row number (i.e., 1,2,3,4,5,6). In the new row(s), I want to repeat the v1 value, v2 continue the couting of row number, v3 = NA
sample df
v1 v2 v3
1 1 79
1 2 32
1 3 53
1 4 33
1 5 76
1 6 11
2 1 32
2 2 42
2 3 44
2 4 12
3 1 22
3 2 12
3 3 12
3 4 67
3 5 32
expected output
v1 v2 v3
1 1 79
1 2 32
1 3 53
1 4 33
1 5 76
1 6 11
2 1 32
2 2 42
2 3 44
2 4 12
2 5 NA #insert
2 6 NA #insert
3 1 22
3 2 12
3 3 12
3 4 67
3 5 32
3 6 NA #insert
I tried to count the row number first by dplyr, but I don't know if I can or how can I add this if else condition by using the pip. Or is there other easier function?
My code
df %>%
group_by(v1) %>%
dplyr::summarise(N=n()) %>%
if (N < 6) {
# sth like that?
}
Thanks!
We can use complete
library(tidyverse)
complete(df1, v1, v2)
# A tibble: 18 x 3
# v1 v2 v3
# <int> <int> <int>
# 1 1 1 79
# 2 1 2 32
# 3 1 3 53
# 4 1 4 33
# 5 1 5 76
# 6 1 6 11
# 7 2 1 32
# 8 2 2 42
# 9 2 3 44
#10 2 4 12
#11 2 5 NA
#12 2 6 NA
#13 3 1 22
#14 3 2 12
#15 3 3 12
#16 3 4 67
#17 3 5 32
#18 3 6 NA
Here is a way to do it using merge.
df <- read.table(text =
"v1 v2 v3
1 1 79
1 2 32
1 3 53
1 4 33
1 5 76
1 6 11
2 1 32
2 2 42
2 3 44
2 4 12
3 1 22
3 2 12
3 3 12
3 4 67
3 5 32", header = T)
toMerge <- data.frame(v1 = rep(1:3, each = 6), v2 = rep(1:6, times = 3))
m <- merge(toMerge, df, by = c("v1", "v2"), all.x = T)
m
v1 v2 v3
1 1 1 79
2 1 2 32
3 1 3 53
4 1 4 33
5 1 5 76
6 1 6 11
7 2 1 32
8 2 2 42
9 2 3 44
10 2 4 12
11 2 5 NA
12 2 6 NA
13 3 1 22
14 3 2 12
15 3 3 12
16 3 4 67
17 3 5 32
18 3 6 NA

Data Frame Filter Values

Suppose I have the next data frame.
table<-data.frame(group=c(0,5,10,15,20,25,30,35,40,0,5,10,15,20,25,30,35,40,0,5,10,15,20,25,30,35,40),plan=c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3),price=c(1,4,5,6,8,9,12,12,12,3,5,6,7,10,12,20,20,20,5,6,8,12,15,20,22,28,28))
group plan price
1 0 1 1
2 5 1 4
3 10 1 5
4 15 1 6
5 20 1 8
6 25 1 9
7 30 1 12
8 35 1 12
9 40 1 12
10 0 2 3
11 5 2 5
12 10 2 6
13 15 2 7
14 20 2 10
15 25 2 12
16 30 2 20
17 35 2 20
18 40 2 20
How can I get the values from the table up to the maximum price, without duplicates.
So the result would be:
group plan price
1 0 1 1
2 5 1 4
3 10 1 5
4 15 1 6
5 20 1 8
6 25 1 9
7 30 1 12
10 0 2 3
11 5 2 5
12 10 2 6
13 15 2 7
14 20 2 10
15 25 2 12
16 30 2 20
You can use slice in dplyr:
library(dplyr)
table %>%
group_by(plan) %>%
slice(1:which.max(price == max(price)))
which.max gives the index of the first occurrence of price == max(price). Using that, I can slice the data.frame to only keep rows for each plan up to the maximum price.
Result:
# A tibble: 22 x 3
# Groups: plan [3]
group plan price
<dbl> <dbl> <dbl>
1 0 1 1
2 5 1 4
3 10 1 5
4 15 1 6
5 20 1 8
6 25 1 9
7 30 1 12
8 0 2 3
9 5 2 5
10 10 2 6
# ... with 12 more rows

Group rows and add sum column of unique values

Here an example of my data.frame:
df = read.table(text='colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0', header = TRUE)
I need to group the rows by colA and colC and add a new column which states the sum of unique values based on colB.
In steps here what I need to do for this specific data.frame:
group rows with colA = 10 and 9, colA = 2 and 1, colA = 22 and colA = 11;
find the unique values of colB per each group;
add the unique values in a new col (newcolD).
Note that colC states the total number of observations for colA = 10 and 9, colA = 2 and 1, colA = 22 and colA = 11.
The data.frame needs to remain ordered decreasingly by colC.
My expected output is:
colA colB colC newcolD
10 11 7 5
10 34 7 5
10 89 7 5
10 21 7 5
9 8 0 5
9 11 0 5
9 21 0 5
2 23 5 4
2 21 5 4
2 56 5 4
1 45 0 4
1 23 0 4
22 14 3 3
22 19 3 3
22 90 3 3
11 19 2 2
11 45 2 2
To note that in df the colB duplicated values are: 11 and 21 for group 10 and 9, and 23 for group 2 and 1.
You can do that with dplyr. The trick is to create a new grouping column which groups consecutive values in colA. This is done with cumsum(c(1, diff(colA) < -1) in the example below.
df1 = read.table(text='colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0', header = TRUE,stringsAsFactors=FALSE)
library(dplyr)
df1 %>%
arrange(desc(colA)) %>%
group_by(group_sequential = cumsum(c(1, diff(colA) < -1))) %>%
mutate(newcolD=n_distinct(colB))
colA colB colC group_sequential newcolD
<int> <int> <int> <dbl> <int>
1 22 14 3 1 3
2 22 19 3 1 3
3 22 90 3 1 3
4 10 11 7 2 5
5 10 34 7 2 5
6 10 89 7 2 5
7 10 21 7 2 5
8 9 8 0 2 5
9 9 11 0 2 5
10 9 21 0 2 5
11 2 23 5 3 4
12 2 21 5 3 4
13 2 56 5 3 4
14 1 45 0 3 4
15 1 23 0 3 4
EDIT FOR NEW DATA
With the data you added, we need to create a custom grouping. I use case_when in the example below. This matches the order you show in the desired output column. In the text, you wrote that you wanted the table to be sorted by colC. To do so, change the last line to arrange(desc(colC))
df1 = read.table(text='colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0', header = TRUE,stringsAsFactors=FALSE)
library(dplyr)
df1 %>%
group_by(group_sequential = case_when(.$colA==10|.$colA==9~1,
.$colA==2|.$colA==1~2,
.$colA==22~3,
.$colA==11~4)) %>%
mutate(newcolD=n_distinct(colB)) %>%
arrange(desc(newcolD))
colA colB colC group_sequential newcolD
<int> <int> <int> <dbl> <int>
1 10 11 7 1 5
2 10 34 7 1 5
3 10 89 7 1 5
4 10 21 7 1 5
5 9 8 0 1 5
6 9 11 0 1 5
7 9 21 0 1 5
8 2 23 5 2 4
9 2 21 5 2 4
10 2 56 5 2 4
11 1 45 0 2 4
12 1 23 0 2 4
13 22 14 3 3 3
14 22 19 3 3 3
15 22 90 3 3 3
16 11 19 2 4 2
17 11 45 2 4 2
You're really not making it easy for us, reposting slight variations of the same question instead of updating the old one and presenting conditions that are vague and inconsistent with what the desired output implies. Anyhow, here is my attempt. This is more an answer to the second question you posted, as that was a bit more general in form.
It's a bit messy, it's pretty much a direct translation of your conditions into a for loop with some if statements. I chose to focus on your written conditions rather than the expected output as that was the easier one to understand. If you want a better answer, please consider cleaning up you question(s) considerably.
df1 <- read.table(text="
colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0", header=TRUE)
df2 <- read.table(text="
colA colB colC
10 11 7
10 34 7
10 89 7
10 21 7
2 23 5
2 21 5
2 56 5
33 24 3
33 78 3
22 14 3
22 19 3
22 90 3
11 19 2
11 45 2
1 45 0
1 23 0
9 8 0
9 11 0
9 21 0
32 11 0", header=TRUE)
df <- df1
for (i in 1:nrow(df)) {
df$colD[i] <- ifelse(df$colC[i] == 0,
0,
length(unique(df$colA[1:i])))
if (any(df$colA[i]-1 == df$colA[1:i]) & df$colC[i] != 0) {
df$colD[i] <- df$colD[which(df$colA[i]-1 == df$colA[1:i])][1]
}
}
# colA colB colC colD
# 10 11 7 1
# 10 34 7 1
# 10 89 7 1
# 10 21 7 1
# 2 23 5 2
# 2 21 5 2
# 2 56 5 2
# 22 14 3 3
# 22 19 3 3
# 22 90 3 3
# 11 19 2 1
# 11 45 2 1
# 1 45 0 0
# 1 23 0 0
# 9 8 0 0
# 9 11 0 0
# 9 21 0 0
df <- df2
for (i in 1:nrow(df)) {
df$colD[i] <- ifelse(df$colC[i] == 0,
0,
length(unique(df$colA[1:i])))
if (any(df$colA[i]-1 == df$colA[1:i]) & df$colC[i] != 0) {
df$colD[i] <- df$colD[which(df$colA[i]-1 == df$colA[1:i])][1]
}
}
df
# colA colB colC colD
# 10 11 7 1
# 10 34 7 1
# 10 89 7 1
# 10 21 7 1
# 2 23 5 2
# 2 21 5 2
# 2 56 5 2
# 33 24 3 3
# 33 78 3 3
# 22 14 3 4
# 22 19 3 4
# 22 90 3 4
# 11 19 2 1
# 11 45 2 1
# 1 45 0 0
# 1 23 0 0
# 9 8 0 0
# 9 11 0 0
# 9 21 0 0
# 32 11 0 0
To also group the rows where colC is zero, it's sufficient to adjust the conditionals like this:
for (i in 1:nrow(df)) {
df$colD[i] <- length(unique(df$colA[1:i]))
if (any(df$colA[i]-1 == df$colA[1:i])) {
df$colD[i] <- df$colD[which(df$colA[i]-1 == df$colA[1:i])][1]
}
}

Resources