replace negative values with na using na_if{dplyr} - r

Let's say I have the following dataframe:
dat <- tribble(
~V1, ~V2,
2, -3,
3, 2,
1, 3,
3, -4,
5, 1,
3, 2,
1, -4,
3, 4,
4, 1,
3, -5,
4, 2,
3, 4
)
How can I replace negative values with NA using na_if()? I know how to do this using ifelse(), but I can't come up with a correct condition for na_if():
> dat %>%
+ mutate(V2 = ifelse(V2 < 0, NA, V2))
# A tibble: 12 x 2
V1 V2
<dbl> <dbl>
1 2 NA
2 3 2
3 1 3
4 3 NA
5 5 1
6 3 2
7 1 NA
8 3 4
9 4 1
10 3 NA
11 4 2
12 3 4

Related

R incrementing a variable in dplyr

I have the following grouped data frame:
library(dplyr)
# Create a sample dataframe
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
I want to update the age of each student so that it is one plus the age in the previous year, with their age in the first year they appear in the dataset remaining unchanged. For example, student A's age should be NA, 6, 7, student B's age should be 7,8,9, and student C's age should be NA, NA, 9.
How about this:
library(dplyr)

# Three students (A, B, C), each observed in grades 1 to 3; some ages missing.
df <- data.frame(
  student = rep(c("A", "B", "C"), each = 3),
  grade = rep(c(1, 2, 3), times = 3),
  age = c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)

# Per student, cumsum(!is.na(age)) counts the non-missing ages seen so far
# (1 at the first known age), so adding it minus one leaves the first known
# age unchanged and bumps each later one by one more. NA ages stay NA.
df %>%
  group_by(student) %>%
  mutate(age = age + cumsum(!is.na(age)) - 1)
#> # A tibble: 9 × 3
#> # Groups: student [3]
#> student grade age
#> <chr> <dbl> <dbl>
#> 1 A 1 NA
#> 2 A 2 6
#> 3 A 3 7
#> 4 B 1 7
#> 5 B 2 8
#> 6 B 3 9
#> 7 C 1 NA
#> 8 C 2 NA
#> 9 C 3 9
Created on 2022-12-30 by the reprex package (v2.0.1)
In data.table, assuming the rows are already in the correct order:
library(data.table)
# Convert df to a data.table by reference, then add new_age by reference.
# Within each student, rowid(age) numbers the repeats of each age value
# (1, 2, ...), so a repeated age is raised by one per further occurrence.
setDT(df)
df[, new_age := age + rowid(age) - 1, by = student]
# student grade age new_age
# 1: A 1 NA NA
# 2: A 2 6 6
# 3: A 3 6 7
# 4: B 1 7 7
# 5: B 2 7 8
# 6: B 3 7 9
# 7: C 1 NA NA
# 8: C 2 NA NA
# 9: C 3 9 9

dplyr solution: absolute difference of two values in one column matched by other column

I have a dataframe that looks like this, but there will be many more IDs:
# Groups: ID [1]
ID ARS stim
<int> <int> <chr>
1 3 0 1
2 3 4 2
3 3 2 3
4 3 3 4
5 3 1 5
6 3 0 6
7 3 2 10
8 3 4 11
9 3 0 12
10 3 3 13
11 3 2 14
12 3 2 15
I would like to calculate the sum of the absolute difference abs() between the values in ARS, e.g. for stim=1 and stim=10 plus for stim=2 and stim=11 and so on.
Any good solutions are appreciated!
The desired output calculation is:
abs(0-2) + abs(4-4) + abs(2-0) + abs(3-3) + abs(1-2) + abs(0-2)
Hence, 2+0+2+0+1+2
Output for ID==3: 7
A possible solution:
library(dplyr)

df <- structure(
  list(
    ID = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
    ARS = c(0, 4, 2, 3, 1, 0, 2, 4, 0, 3, 2, 2),
    stim = c(1, 2, 3, 4, 5, 6, 10, 11, 12, 13, 14, 15)
  ),
  row.names = c(NA, -12L),
  class = "data.frame"
)

# For each ID, pair stim 1..6 with stim 10..15 and sum the absolute ARS
# differences. match() looks every stim value up explicitly, so the result
# does not depend on row order (an elementwise `stim == 1:6` only works via
# recycling when the rows happen to be sorted). Summing inside summarise()
# gives one value per ID instead of one grand total across all IDs.
# If an ID lacks one of the stim values, its result is NA.
first_stims <- 1:6
ars_diff <- df %>%
  group_by(ID) %>%
  summarise(
    value = sum(abs(
      ARS[match(first_stims, stim)] - ARS[match(first_stims + 9, stim)]
    )),
    .groups = "drop"
  )
ars_diff
#> # A tibble: 1 × 2
#> ID value
#> <dbl> <dbl>
#> 1 3 7

dplyr Find records that have specifc set of values

I have a dataset that has some IDs and associated timepoints. I want to keep only the IDs that have a specific combination of timepoints. If I filter using %in% or |, I also get IDs that have only part of that combination. How do I do this in R?
ID  Timepoint
1   1
1   6
1   12
2   1
3   1
3   6
3   12
3   18
4   1
4   6
4   12
I want to filter IDs that have timepoints 1,6 and 12 and exclude other IDs.
Result would be IDs 1,3 and 4
library(dplyr)
library(tidyr) # complete() comes from tidyr, not dplyr

df <- data.frame(
  ID = c(1, 1, 1, 2, 3, 3, 3, 3, 4, 4, 4),
  Timepoint = c(1, 6, 12, 1, 1, 6, 12, 18, 1, 6, 12)
)

# 1. Keep only rows at the time points of interest and flag them.
# 2. Per ID, complete() adds a row (indicator = NA) for each missing time point.
# 3. Drop every ID that received such a filler row, i.e. lacks a time point.
# ID is selected explicitly rather than relying on select() re-adding the
# grouping variable with a message.
df %>%
  filter(Timepoint %in% c(1, 6, 12)) %>%
  mutate(indicator = 1) %>%
  group_by(ID) %>%
  complete(Timepoint = c(1, 6, 12)) %>%
  filter(!ID %in% pull(filter(., is.na(indicator)), ID)) %>%
  select(ID, indicator)
Output:
# A tibble: 9 × 2
# Groups: ID [3]
ID indicator
<dbl> <dbl>
1 1 1
2 1 1
3 1 1
4 3 1
5 3 1
6 3 1
7 4 1
8 4 1
9 4 1
We can use
library(dplyr)
# Keep all rows of every ID whose Timepoint values include 1, 6 and 12.
required_timepoints <- c(1, 6, 12)
df %>%
  group_by(ID) %>%
  filter(all(required_timepoints %in% Timepoint)) %>%
  ungroup()
-output
# A tibble: 10 x 2
ID Timepoint
<dbl> <dbl>
1 1 1
2 1 6
3 1 12
4 3 1
5 3 6
6 3 12
7 3 18
8 4 1
9 4 6
10 4 12
In your data, ID 2 has time point 1. So if you filter by time points 1, 6, 12 (keeping any ID that has at least one of them), the result will be IDs 1, 2, 3, 4 instead of 1, 3, 4.
# IDs and their time points, one observation per position.
ids <- c(1, 1, 1, 2, 3, 3, 3, 3, 4, 4, 4)
time_points <- c(1, 6, 12, 1, 1, 6, 12, 18, 1, 6, 12)
dat <- data.frame(ids = ids, time_points = time_points)
# Distinct IDs with at least one observation at time point 1, 6 or 12.
has_target_time <- is.element(dat$time_points, c(1, 6, 12))
unique(dat[has_target_time, "ids"])

build a network edge table from a sparse table

I don't know exactly how to explain it but...
I have a sparse table where each group represents a level. The columns are ordered, it means, the downstream (left) column represents a child node and upstream (right) node represents a parent node.
I'd like a two-column table where the 1st column is the parent node and the 2nd is the child node. If possible, also a 3rd column with the length (the number of final nodes) of each parent.
Follow the example:
>tt <- tibble(
ID = letters[1:8],
`1` = c( 1, 1, 1, 1, 2, 2, 2, 2),
`2` = c( 3, 3, 4, 4, 5, 5, 5, 6),
`3` = c( 7, 7, 8, 9,10,10,11,12)
)
> tt
# A tibble: 8 x 4
ID `1` `2` `3`
<chr> <dbl> <dbl> <dbl>
1 a 1 3 7
2 b 1 3 7
3 c 1 4 8
4 d 1 4 9
5 e 2 5 10
6 f 2 5 10
7 g 2 5 11
8 h 2 6 12
>dput(tt)
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
`1` = c(1, 1, 1, 1, 2, 2, 2, 2), `2` = c(3, 3, 4, 4, 5, 5,
5, 6), `3` = c(7, 7, 8, 9, 10, 10, 11, 12)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
the result should be:
>ttt <- tibble(
parent = c(1,1,2,2,3,4,4, 5, 5, 6, 7,7,8,9,10,10,11,12),
child = c(3,4,5,6,7,8,9,10,11,12, letters[1:8] ),
length = c(4,4,4,4,2,2,2, 3, 3, 1, 2,2,1,1, 2, 2, 1, 1)
)
>ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
> dput(ttt)
structure(list(parent = c(1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7,
8, 9, 10, 10, 11, 12), child = c("3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "a", "b", "c", "d", "e", "f", "g", "h"
), length = c(4, 4, 4, 4, 2, 2, 2, 3, 3, 1, 2, 2, 1, 1, 2, 2,
1, 1)), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"
))
Any help is appreciated.
Thanks in advance.
This gets you 90% of the way there:
# Reorder so that every column is the parent of the column to its right,
# with ID (originally the first column) as the final child level.
tt_correct <- tt[, c(2, 3, 4, 1)]

# Build one edge table per pair of adjacent columns.
edge_tables <- lapply(
  seq_len(ncol(tt_correct) - 1),
  function(left) {
    edges <- setNames(tt_correct[, c(left, left + 1)], c("parent", "child"))
    # Rows under each parent, counted before duplicates are removed.
    edges$length <- ave(edges$parent, edges$parent, FUN = length)
    unique(edges)
  }
)

# Stack the per-level tables into a single edge list.
ttt <- do.call(rbind, edge_tables)
ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
The first part is correcting the order. Your expected output indicates that the 1st column is a child of the 4th column. The lapply() statement largely walks along the data.frame and stacks the data.
This is 90% of the way because the answer doesn't agree with your expected output for lengths. I think this is correct but I could be wrong.
Finally, and I'm not that good with igraph, you could likely find additional information doing:
# Build a graph from the first two columns of ttt (parent, child) and plot it.
library(igraph)
plot(graph_from_data_frame(ttt[, 1:2]))

gather multiple columns with nested, repeated measures

I have a dataset of people (pid) of different types (type2=c("dad", "mom", "kid"; and for ease, type=c("a", "b", "c")) nested in households (hid) with repeated measurements (time).
Some variables like v1_ are asked to everyone, but the values are spread across three columns. For instance, v1_a contains the values for all of the dads (type==a).
Variables like v2_ are only asked of dads and moms (a's and b's), and the values are spread across two columns.
Variables like v3 are also only asked to dads and moms, but the values are contained in one column.
Variables like v4 are asked to everyone, and the values are contained in one column.
Have:
hid pid type type2 time v1_a v1_b v1_c v2_a v2_b v3 v4
1 1 1 a dad 1 6 NA NA 2 NA 4 3
2 1 2 b mom 1 NA 2 NA NA 5 6 6
3 1 3 c kid 1 NA NA 1 NA NA NA 5
4 2 4 a dad 1 3 NA NA 6 NA 2 6
5 2 5 b mom 1 NA 5 NA NA 2 4 3
6 2 6 c kid 1 NA NA 3 NA NA NA 5
7 1 1 a dad 2 3 NA NA 2 NA 4 3
8 1 2 b mom 2 NA 3 NA NA 5 6 6
9 1 3 c kid 2 NA NA 2 NA NA NA 5
10 2 4 a dad 2 2 NA NA 6 NA 2 6
11 2 5 b mom 2 NA 3 NA NA 2 4 3
12 2 6 c kid 2 NA NA 2 NA NA NA 5
Here is the end result I want:
hid pid type type2 time v1 v2 v3 v4
1 1 1 a dad 1 6 2 4 3
2 1 2 b mom 1 2 5 6 6
3 1 3 c kid 1 1 NA NA 5
4 2 4 a dad 1 3 6 2 6
5 2 5 b mom 1 5 2 4 3
6 2 6 c kid 1 3 NA NA 5
7 1 1 a dad 2 3 2 4 3
8 1 2 b mom 2 3 5 6 6
9 1 3 c kid 2 2 NA NA 5
10 2 4 a dad 2 2 6 2 6
11 2 5 b mom 2 3 2 4 3
12 2 6 c kid 2 2 NA NA 5
I'm looking for a tidyverse approach that will handle a larger actual use case of mixed variables as shown here. The variable naming is consistent. Where do I go after gather()?
library(tidyverse)
df_have <- data.frame(hid=c(1, 1, 1, 2, 2, 2,
1, 1, 1, 2, 2, 2),
pid=c(1, 2, 3, 4, 5, 6,
1, 2, 3, 4, 5, 6),
type=c("a", "b", "c", "a", "b", "c",
"a", "b", "c", "a", "b", "c"),
type2=c("dad", "mom", "kid", "dad", "mom", "kid",
"dad", "mom", "kid", "dad", "mom", "kid"),
time=c(1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2),
v1_a=c(6, NA, NA, 3, NA, NA,
3, NA, NA, 2, NA, NA),
v1_b=c(NA, 2, NA, NA, 5, NA,
NA, 3, NA, NA, 3, NA),
v1_c=c(NA, NA, 1, NA, NA, 3,
NA, NA, 2, NA, NA, 2),
v2_a=c(2, NA, NA, 6, NA, NA,
2, NA, NA, 6, NA, NA),
v2_b=c(NA, 5, NA, NA, 2, NA,
NA, 5, NA, NA, 2, NA),
v3=c(4, 6, NA, 2, 4, NA,
4, 6, NA, 2, 4, NA),
v4=c(3, 6, 5, 6, 3, 5,
3, 6, 5, 6, 3, 5)
)
df_want <- data.frame(hid=c(1, 1, 1, 2, 2, 2,
1, 1, 1, 2, 2, 2),
pid=c(1, 2, 3, 4, 5, 6,
1, 2, 3, 4, 5, 6),
type=c("a", "b", "c", "a", "b", "c",
"a", "b", "c", "a", "b", "c"),
type2=c("dad", "mom", "kid", "dad", "mom", "kid",
"dad", "mom", "kid", "dad", "mom", "kid"),
time=c(1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2),
v1=c(6, 2, 1, 3, 5, 3,
3, 3, 2, 2, 3, 2),
v2=c(2, 5, NA, 6, 2, NA,
2, 5, NA, 6, 2, NA),
v3=c(4, 6, NA, 2, 4, NA,
4, 6, NA, 2, 4, NA),
v4=c(3, 6, 5, 6, 3, 5,
3, 6, 5, 6, 3, 5)
)
df_have %>%
gather(key, value, -hid, -pid, -type, -type2, -time)
Here is another idea using coalesce from dplyr and map from purrr.
library(tidyverse)
# Set target column names
cols <- paste0("v", 1:4)
# For each target, coalesce every source column named exactly `<target>` or
# `<target>_<suffix>`. The anchored pattern keeps "v1" from also matching
# "v10", "v11", ... when the real data has more variables.
nums <- map(
  cols,
  ~ coalesce(!!!as.list(select(df_have, matches(paste0("^", .x, "(_|$)")))))
)
# Create a data frame (as_tibble() replaces the deprecated as_data_frame())
nums_df <- nums %>%
  setNames(cols) %>%
  as_tibble()
# Create the final output by bind_cols: drop all original v* columns (none of
# the identifier columns in df_have starts with "v") and attach the coalesced
# ones.
df_test <- df_have %>%
  select(-starts_with("v")) %>%
  bind_cols(nums_df)
df_test
# hid pid type type2 time v1 v2 v3 v4
# 1 1 1 a dad 1 6 2 4 3
# 2 1 2 b mom 1 2 5 6 6
# 3 1 3 c kid 1 1 NA NA 5
# 4 2 4 a dad 1 3 6 2 6
# 5 2 5 b mom 1 5 2 4 3
# 6 2 6 c kid 1 3 NA NA 5
# 7 1 1 a dad 2 3 2 4 3
# 8 1 2 b mom 2 3 5 6 6
# 9 1 3 c kid 2 2 NA NA 5
# 10 2 4 a dad 2 2 6 2 6
# 11 2 5 b mom 2 3 2 4 3
# 12 2 6 c kid 2 2 NA NA 5
This gets me there, but the filter(!is.na(value)) step seems like a hack. Better ideas?
# Go long over the v* columns, strip the "_a"/"_b"/"_c" suffix so the split
# columns share one key, drop the empty cells (otherwise spread() would see
# several rows per person/key), then go wide again and sort.
df_test <- df_have %>%
  gather(key = "key", value = "value", -c(hid, pid, type, time, type2)) %>%
  mutate(key = str_remove(key, "_.*")) %>%
  drop_na(value) %>%
  spread(key = key, value = value) %>%
  arrange(time, hid, type, pid)
Update from @www:
# Same reshape, but gather() drops the NA cells itself via na.rm = TRUE,
# so no separate filtering step is needed before spread().
df_test <- df_have %>%
  gather(key = "key", value = "value", starts_with("v"), na.rm = TRUE) %>%
  mutate(key = str_remove(key, "_.*")) %>%
  spread(key = key, value = value) %>%
  arrange(time, hid, type, pid)

Resources