I have a tibble with information about diagnoses:
data <- tibble(
id = c(1:10),
diagnosis_1 = c("F32", "F431", "R58", "S32", "F11", NA, NA, "Y67", "F32", "Z032"),
diagnosis_2 = c(NA, NA, NA, NA, NA, NA, "G35", NA, NA, NA),
diagnosis_3 = c("F40", NA, "R67", "F431", NA, "F60", "S58", "R68", "F11", NA),
diagnosis_4 = c(NA, NA, "F65", NA, "F19", NA, NA, "F32", NA, NA)
)
As a part of the cleaning process, I have removed all diagnoses not fulfilling certain criteria (i.e. not starting with the letter F, G, or Z). With the following code:
data$diagnosis_1[str_sub(data$diagnosis_1, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_2[str_sub(data$diagnosis_2, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_3[str_sub(data$diagnosis_3, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_4[str_sub(data$diagnosis_4, 1,1) %in% c("R", "S", "Y")] <- NA
Ending up with this tibble:
I now need to move the data to the left to fill the columns from left to right (i.e diagnosis_1 not being empty if diagnosis_2, diagnosis_3 or diagnosis_4 has data). I have tried using ifelse() as it is vectorized but I can`t seem to get it to work with several nested ifelse().
ifelse(is.na(data$diagnosis_1), data$diagnosis_2, data$diagnosis_1))
All suggestions are much appreciated.
Edit: adding expected output:
Using dplyr and tidyr. Reshape from wide to long, exclude "^RSY" and NA diagnosis, reshape long to wide.
library(dplyr)
library(tidyr)
gather(data, key = "k", value = "v", -id) %>%
filter(!(grepl("^[R|S|Y]", v) | is.na(v))) %>%
group_by(id) %>%
mutate(diagN = paste0("diagnosis_", row_number())) %>%
select(-k) %>%
spread(key = "diagN", value = "v") %>%
ungroup()
# # A tibble: 10 x 3
# id diagnosis_1 diagnosis_2
# <int> <chr> <chr>
# 1 1 F32 F40
# 2 2 F431 NA
# 3 3 F65 NA
# 4 4 F431 NA
# 5 5 F11 F19
# 6 6 F60 NA
# 7 7 G35 NA
# 8 8 F32 NA
# 9 9 F32 F11
# 10 10 Z032 NA
We first replace values which start with either "R", "S" or "Y" to NA and then left shift the non-NA values.
data[-1] <- lapply(data[-1], function(x) replace(x, grepl("^[R|S|Y]", x), NA))
data[] <- t(apply(data, 1, function(x) `length<-`(na.omit(x), length(x))))
data
# A tibble: 10 x 5
# id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
# <chr> <chr> <chr> <chr> <chr>
# 1 " 1" F32 F40 NA NA
# 2 " 2" F431 NA NA NA
# 3 " 3" F65 NA NA NA
# 4 " 4" F431 NA NA NA
# 5 " 5" F11 F19 NA NA
# 6 " 6" F60 NA NA NA
# 7 " 7" G35 NA NA NA
# 8 " 8" F32 NA NA NA
# 9 " 9" F32 F11 NA NA
#10 10 Z032 NA NA NA
Shifting the non-NA value to left has been taken from David's answer from here. You can try any other approach to shift values from the same question as well.
You can try a tidyverse
library(tidyverse)
data %>%
mutate_at(vars(starts_with("diagnosis")), funs(ifelse(str_sub(., 1, 1) %in% c("R", "S", "Y"), NA, .))) %>%
gather(k,v, -id) %>%
group_by(id) %>%
arrange(id) %>%
mutate(v=ifelse(k == "diagnosis_1", v[!is.na(v)][1], v)) %>%
spread(k, v)
# A tibble: 10 x 5
# Groups: id [10]
id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
<int> <chr> <chr> <chr> <chr>
1 1 F32 NA F40 NA
2 2 F431 NA NA NA
3 3 F65 NA NA F65
4 4 F431 NA F431 NA
5 5 F11 NA NA F19
6 6 F60 NA F60 NA
7 7 G35 G35 NA NA
8 8 F32 NA NA F32
9 9 F32 NA F11 NA
10 10 Z032 NA NA NA
As its unclear what OP wants (see discussion below) you can also try
data %>%
mutate_at(vars(starts_with("diagnosis")), funs(ifelse(str_sub(., 1, 1) %in% c("R", "S", "Y"), NA, .))) %>%
gather(k,v, -id) %>%
group_by(id) %>%
arrange(id) %>%
mutate(v=c(v[!is.na(v)], rep(NA, length(v) - length(v[!is.na(v)])))) %>%
spread(k, v)
# A tibble: 10 x 5
# Groups: id [10]
id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
<int> <chr> <chr> <chr> <chr>
1 1 F32 F40 NA NA
2 2 F431 NA NA NA
3 3 F65 NA NA NA
4 4 F431 NA NA NA
5 5 F11 F19 NA NA
6 6 F60 NA NA NA
7 7 G35 NA NA NA
8 8 F32 NA NA NA
9 9 F32 F11 NA NA
10 10 Z032 NA NA NA
You can use Reduce along with coalesce from dplyr, i.e.
df$diagnosis_1 <- Reduce(dplyr::coalesce, df[-1])
#id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
# <int> <chr> <chr> <chr> <chr>
# 1 1 F32 <NA> F40 <NA>
# 2 2 F431 <NA> <NA> <NA>
# 3 3 F65 <NA> <NA> F65
# 4 4 F431 <NA> F431 <NA>
# 5 5 F11 <NA> <NA> F19
# 6 6 F60 <NA> F60 <NA>
# 7 7 G35 G35 <NA> <NA>
# 8 8 F32 <NA> <NA> F32
# 9 9 F32 <NA> F11 <NA>
#10 10 Z032 <NA> <NA> <NA>
Below solution using function na_move from package dedupewider.
library(dedupewider)
na_move(data) # 'right' direction is by default
#> # A tibble: 10 x 5
#> id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
#> * <chr> <chr> <chr> <lgl> <lgl>
#> 1 1 F32 F40 NA NA
#> 2 2 F431 <NA> NA NA
#> 3 3 F65 <NA> NA NA
#> 4 4 F431 <NA> NA NA
#> 5 5 F11 F19 NA NA
#> 6 6 F60 <NA> NA NA
#> 7 7 G35 <NA> NA NA
#> 8 8 F32 <NA> NA NA
#> 9 9 F32 F11 NA NA
#> 10 10 Z032 <NA> NA NA
A tidyr update, using pivot_longer and unnest_wider.
dplyr 1.0.10 CRAN release: 2022-09-01
tidyr 1.2.1 CRAN release: 2022-09-08
Step 1: clean up data
library(dplyr)
library(tidyr)
data <- data %>%
mutate(across(starts_with("diag"), ~
replace(.x, grepl(paste0("^", c("R", "S", "Y"), collapse="|"), .x), NA)))
Step 2: left-compact data
data %>%
pivot_longer(starts_with("diag")) %>%
group_by(id) %>%
mutate(value = value[order(is.na(value))]) %>%
summarize(col = list(value)) %>%
unnest_wider(col, names_sep="_") %>%
setNames(colnames({{data}}))
# A tibble: 10 × 5
id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
<int> <chr> <chr> <chr> <chr>
1 1 F32 F40 NA NA
2 2 F431 NA NA NA
3 3 F65 NA NA NA
4 4 F431 NA NA NA
5 5 F11 F19 NA NA
6 6 F60 NA NA NA
7 7 G35 NA NA NA
8 8 F32 NA NA NA
9 9 F32 F11 NA NA
10 10 Z032 NA NA NA
data
data <- structure(list(id = 1:10, diagnosis_1 = c("F32", "F431", "R58",
"S32", "F11", NA, NA, "Y67", "F32", "Z032"), diagnosis_2 = c(NA,
NA, NA, NA, NA, NA, "G35", NA, NA, NA), diagnosis_3 = c("F40",
NA, "R67", "F431", NA, "F60", "S58", "R68", "F11", NA), diagnosis_4 = c(NA,
NA, "F65", NA, "F19", NA, NA, "F32", NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
Related
I have a huge dataset and wish to replace values in certain columns (VAR1, VAR2) with NA if they do not start with AA or DD.
Data:
DF<-tibble::tribble(
~ID, ~VAR1, ~VAR1DATE, ~VAR2, ~VAR2DATE,
1L, "AABB", "2001-01-01", "BBAA", "2001-01-01",
2L, "AACC", "2001-01-02", "AACC", "2001-01-02",
3L, "CCDD", "2001-01-03", "DDCC", "2001-01-03",
4L, "DDAA", "2001-01-04", "CCBB", "2001-01-04",
5L, "CCBB", "2001-01-05", "CCBB", "2001-01-05"
)
Desired output:
A tibble: 5 × 5
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 NA NA
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 NA NA DDCC 2001-01-03
4 4 DDAA 2001-01-04 NA NA
5 5 NA NA NA NA
Is there an elegant and smart way to this? mutate_all?
We may do this in two steps - loop across the columns that have 'VAR' followed by digits (\\d+) in column names, replace the values where the first two characters are not AA or DD to NA, then replace the corresponding DATE column to NA based on the NA in the 'VAR1', 'VAR2' columns
library(dplyr)
library(stringr)
DF %>%
mutate(across(matches("^VAR\\d+$"),
~ replace(., !substr(., 1, 2) %in% c("AA", "DD"), NA)),
across(ends_with("DATE"),
~ replace(., is.na(get(str_remove(cur_column(), "DATE"))), NA)))
-output
# A tibble: 5 × 5
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 <NA> <NA>
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 <NA> <NA> DDCC 2001-01-03
4 4 DDAA 2001-01-04 <NA> <NA>
5 5 <NA> <NA> <NA> <NA>
Use mutate/across with the assignment function is.na<-.
DF %>%
mutate(across(starts_with("VAR"), \(x) `is.na<-`(x, !grepl("^AA|^DD", x))))
## A tibble: 5 x 5
# ID VAR1 VAR1DATE VAR2 VAR2DATE
# <int> <chr> <chr> <chr> <chr>
#1 1 AABB NA NA NA
#2 2 AACC NA AACC NA
#3 3 NA NA DDCC NA
#4 4 DDAA NA NA NA
#5 5 NA NA NA NA
Or simpler:
DF %>%
mutate(across(starts_with("VAR"), ~`is.na<-`(., !grepl("^AA|^DD", .))))
Here is a tidyverse solution. Using across with str_replace_all and appending two ifelse statements.
library(dplyr)
library(stringr)
DF %>%
mutate(across(c(VAR1, VAR2), ~str_replace_all(., "^[^AA|DD]", NA_character_))) %>%
mutate(VAR1DATE = ifelse(is.na(VAR1), NA_character_, VAR1DATE),
VAR2DATE = ifelse(is.na(VAR2), NA_character_, VAR2DATE))
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 NA NA
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 NA NA DDCC 2001-01-03
4 4 DDAA 2001-01-04 NA NA
5 5 NA NA NA NA
Another possibility, using tidyr::unite and tiydr::separate:
library(tidyverse)
DF<-tibble::tribble(
~ID, ~VAR1, ~VAR1DATE, ~VAR2, ~VAR2DATE,
1L, "AABB", "2001-01-01", "BBAA", "2001-01-01",
2L, "AACC", "2001-01-02", "AACC", "2001-01-02",
3L, "CCDD", "2001-01-03", "DDCC", "2001-01-03",
4L, "DDAA", "2001-01-04", "CCBB", "2001-01-04",
5L, "CCBB", "2001-01-05", "CCBB", "2001-01-05"
)
DF %>%
unite(VAR1,VAR1,VAR1DATE) %>% unite(VAR2,VAR2,VAR2DATE) %>%
mutate(across(starts_with("VAR"),~if_else(str_detect(.x, "^AA|^DD"), .x, NA_character_))) %>%
separate(VAR1,into = c("VAR1", "VAR1DATE"), sep = "_") %>%
separate(VAR2,into = c("VAR2", "VAR2DATE"), sep = "_")
#> # A tibble: 5 × 5
#> ID VAR1 VAR1DATE VAR2 VAR2DATE
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 AABB 2001-01-01 <NA> <NA>
#> 2 2 AACC 2001-01-02 AACC 2001-01-02
#> 3 3 <NA> <NA> DDCC 2001-01-03
#> 4 4 DDAA 2001-01-04 <NA> <NA>
#> 5 5 <NA> <NA> <NA> <NA>
Here is another tidyverse solution with using str_detect to determine where to convert to NA for the date columns. Then, we can use the same type of function on VAR1 and VAR2.
library(tidyverse)
DF %>%
rowwise %>%
mutate(
VAR1DATE = ifelse(str_detect(VAR1, '^BB|^CC') == TRUE, NA, VAR1DATE),
VAR2DATE = ifelse(str_detect(VAR2, '^BB|^CC') == TRUE, NA, VAR2DATE)
) %>%
mutate(across(c(VAR1, VAR2), function(x)
ifelse(str_detect(x, '^BB|^CC') == TRUE, NA, x)))
Output
# A tibble: 5 × 5
# Rowwise:
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 NA NA
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 NA NA DDCC 2001-01-03
4 4 DDAA 2001-01-04 NA NA
5 5 NA NA NA NA
This must be easy but my brain is blocked!
I have this dataframe:
col1
<chr>
1 A
2 B
3 NA
4 C
5 D
6 NA
7 NA
8 E
9 NA
10 F
df <- structure(list(col1 = c("A", "B", NA, "C", "D", NA, NA, "E",
NA, "F")), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I want to add a column with uniqueID only for values that are not NA with tidyverse.
Expected output:
col1 uniqueID
<chr> <dbl>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I have tried: n(), row_number(), cur_group_id ....
We could do this easily in data.table. Specify the condition in i i.e. non-NA elements in 'col1', create the column 'uniqueID' with the sequence of elements by assignment (:=)
library(data.table)
setDT(df)[!is.na(col1), uniqueID := seq_len(.N)]
-output
df
col1 uniqueID
1: A 1
2: B 2
3: <NA> NA
4: C 3
5: D 4
6: <NA> NA
7: <NA> NA
8: E 5
9: <NA> NA
10: F 6
In dplyr, we can use replace
library(dplyr)
df %>%
mutate(uniqueID = replace(col1, !is.na(col1),
seq_len(sum(!is.na(col1)))))
-output
# A tibble: 10 x 2
col1 uniqueID
<chr> <chr>
1 A 1
2 B 2
3 <NA> <NA>
4 C 3
5 D 4
6 <NA> <NA>
7 <NA> <NA>
8 E 5
9 <NA> <NA>
10 F 6
Another approach:
library(dplyr)
df %>%
mutate(UniqueID = cumsum(!is.na(col1)),
UniqueID = if_else(is.na(col1), NA_integer_, UniqueID))
# A tibble: 10 x 2
col1 UniqueID
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
A base R option using match + na.omit + unique
transform(
df,
uniqueID = match(col1, na.omit(unique(col1)))
)
gives
col1 uniqueID
1 A 1
2 B 2
3 <NA> NA
4 C 3
5 D 4
6 <NA> NA
7 <NA> NA
8 E 5
9 <NA> NA
10 F 6
A weird tidyverse solution:
library(dplyr)
df %>%
mutate(id = ifelse(is.na(col1), 0, 1),
id = cumsum(id == 1),
id = ifelse(is.na(col1), NA, id))
# A tibble: 10 x 2
col1 id
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I would like to do some calculations using frollaply() or rollapplyr() with a conditional factor.
I have the following data
df <- tibble(w = c(NA, NA, "c1", NA, NA, "c2", NA, NA, "c3", NA, NA, "c4"),
x = 1:12, y = x * 2) %>%
as.data.table()
Using data.table I generate the following result.
df[, sumx := frollapply(x, 3, FUN = sum)]
w
x
y
sumx
1
2
NA
2
4
NA
c1
3
6
6
4
8
9
5
10
12
c2
6
12
15
7
14
18
8
16
21
c3
9
18
24
10
20
27
11
22
30
c4
12
24
33
I like this result. Although I would to do something more complicated.
First: I would like let this output more clean, like this:
w
x
y
sumx
1
2
NA
2
4
NA
c1
3
6
6
4
8
NA
5
10
NA
c2
6
12
15
7
14
NA
8
16
NA
c3
9
18
24
10
20
NA
11
22
NA
c4
12
24
33
Second: I would like create an another variable, for example "sumx2", where the values of the line "c1" is the sum (OBS: not just sum, could be mean or count of a specific value) of all 4 or 5 or n values of variable "x" above (OBS: If not have 4 or 5 or n values above, this absent values has to be understand as NA). The correspondent lines "c2" and "c3" following the same idea. In this way the output expected would be:
w
x
y
sumx
sumx2
1
2
NA
NA
2
4
NA
NA
c1
3
6
6
6
4
8
NA
NA
5
10
NA
NA
c2
6
12
15
18
7
14
NA
NA
8
16
NA
NA
c3
9
18
24
30
10
20
NA
NA
11
22
NA
NA
c4
12
24
33
42
Your help is appreciated!
if I understood everything correctly
library(tibble)
df <- tibble(w = c(NA, NA, "c1", NA, NA, "c2", NA, NA, "c3", NA, NA, "c4"),
x = 1:12, y = x * 2)
library(data.table)
setDT(df)
nm_cols <- c("sumX", "sumx2")
df[, (nm_cols) := list(
ifelse(is.na(w), NA, zoo::rollapplyr(x, width = 3, FUN = function(x) sum(x), partial = T)),
ifelse(is.na(w), NA, zoo::rollapplyr(x, width = 4, FUN = function(x) sum(x), partial = T))
)]
df
#> w x y sumX sumx2
#> 1: <NA> 1 2 NA NA
#> 2: <NA> 2 4 NA NA
#> 3: c1 3 6 6 6
#> 4: <NA> 4 8 NA NA
#> 5: <NA> 5 10 NA NA
#> 6: c2 6 12 15 18
#> 7: <NA> 7 14 NA NA
#> 8: <NA> 8 16 NA NA
#> 9: c3 9 18 24 30
#> 10: <NA> 10 20 NA NA
#> 11: <NA> 11 22 NA NA
#> 12: c4 12 24 33 42
Created on 2021-03-21 by the reprex package (v1.0.0)
Check this
library(data.table)
dt <- data.table(w = c(NA, NA, "c1", NA, NA, "c2", NA, NA, "c3", NA, NA, "c4"),
x = 1:12)
dt[,id:=rleidv(x)]
#dt[,sumx := ifelse(is.na(w),NA,frollapply(x,3,sum))]
dt[,sumx := fcase(!is.na(w),frollapply(x,3,sum))]
dt[,sumx2 := fcase(!is.na(w) & id == 3, frollapply(x, n = 3, sum),
!is.na(w) & id >= 4, frollapply(x, n = 4, sum))
]
dt[,id:=NULL]
Result:
dt
w x sumx sumx2
1: <NA> 1 NA NA
2: <NA> 2 NA NA
3: c1 3 6 6
4: <NA> 4 NA NA
5: <NA> 5 NA NA
6: c2 6 15 18
7: <NA> 7 NA NA
8: <NA> 8 NA NA
9: c3 9 24 30
10: <NA> 10 NA NA
11: <NA> 11 NA NA
12: c4 12 33 42
I have a dataframe which looks like this:
`Row Labels` Female Male
<chr> <chr> <chr>
1 London <NA> <NA>
2 42 <NA> 1
3 Paris <NA> <NA>
4 36 1 <NA>
5 Belgium <NA> <NA>
6 18 1
7 21 <NA> 1
8 Madrid <NA> <NA>
9 20 1 <NA>
10 Berlin <NA> <NA>
11 37 <NA> 1
12 23 1
13 25 1
14 44 1
The code I used to produce this dataframe looks like this:
structure(list(`Row Labels` = c("London", "42", "Paris","36", "Belgium","18" ,"21", "Madrid", "20", "Berlin", "37","23","25","44"),
Female = c(NA, NA, NA, "1", NA, NA,NA, NA, "1", NA, NA,"1","1","1"), Male = c(NA,"1", NA, NA, NA, "1", NA, NA, NA, "1",NA,NA,NA,NA)),
.Names = c("Row Labels","Female", "Male"), row.names = c(NA, -14L), class = c("tbl_df", "tbl", "data.frame"))
I would like to know how I can change multiple rows in this dataframe to become columns.
My ideal output looks like this:
'Row Labels' Female Male 42 36 21 20 37 18 23 25 44
London 1 1
Paris 1 1
Belgium 1 1 1 1
Madrid 1 1
Berlin 3 1 1 1 1 1
Seems very mechanical. Calling your data d:
d1 = d[seq(1, nrow(d), by = 2), ]
d2 = d[seq(2, nrow(d), by = 2), ]
d1[, c("Male", "Female")] = d2[, c("Male", "Female")]
d3 = matrix(nrow = nrow(d2), ncol = nrow(d2))
diag(d3) = 1
colnames(d3) = d2$`Row Labels`
cbind(d2, d3)
# Row Labels Female Male 42 36 21 20 37
# 1 42 <NA> 1 1 NA NA NA NA
# 2 36 1 <NA> NA 1 NA NA NA
# 3 21 <NA> 1 NA NA 1 NA NA
# 4 20 1 <NA> NA NA NA 1 NA
# 5 37 <NA> 1 NA NA NA NA 1
Using tidyverse.
library(dplyr)
library(tidyr)
#cumsum based on country names
df %>% group_by(gr=cumsum(grepl('\\D+',`Row Labels`))) %>%
#Sum Female and Male
mutate_at(vars('Female','Male'), list(~sum(as.numeric(.), na.rm = T))) %>%
#Create RL from country name and number where we are at numbers
mutate(RL=ifelse(row_number()>1, paste0(first(`Row Labels`),',',`Row Labels`), NA)) %>%
filter(!is.na(RL)) %>%
select(RL, gr, Male, Female) %>%
separate(RL, into = c('RL','Age')) %>% mutate(flag=1) %>% spread(Age, flag) %>%
ungroup() %>% select(-gr)
# A tibble: 5 x 12
RL Male Female `18` `20` `21` `23` `25` `36` `37` `42` `44`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Belgium 1 0 1 NA 1 NA NA NA NA NA NA
2 Berlin 1 3 NA NA NA 1 1 NA 1 NA 1
3 London 1 0 NA NA NA NA NA NA NA 1 NA
4 Madrid 0 1 NA 1 NA NA NA NA NA NA NA
5 Paris 0 1 NA NA NA NA NA 1 NA NA NA
I want to get rolling means for the past 1 to 10 events grouped by a column for multiple columns. I also want it very fast such as in dplyr or data.table because I want to run this on a 1,000,000 x 1,000 dataframe.
starting df
data.table(a = c("bill", "bob", "bill", "bob", "bill", "bob"),
b = c(1,2,1,1,3,2),
c = c(2,3,9,1,4,1),
d = c(4,5,1,7,3,4))
1: bill 1 2 4
2: bob 2 3 5
3: bill 1 9 1
4: bob 1 1 7
5: bill 3 4 3
6: bob 2 1 4
desired df
I want the rolling mean of only b and c grouped by column a with a window of 1 to 10 for each column lagged 1 row.
a b c d b_roll1 c_roll1 b_roll2 c_roll2 b_roll3 c_roll3
1: bill 1 2 4 NA NA NA NA NA NA
2: bob 2 3 5 NA NA NA NA NA NA
3: bill 1 9 1 1 2 1 2 1 2
4: bob 1 1 7 2 3 2 3 2 3
5: bill 3 4 3 1 9 1 5.5 1 5.5
6: bob 2 1 4 1 1 1 2 1 2
Your example outcome doesn't make too much sense to me, but here is an example on how you can generate many mutate calls programmatically.
An extendable solution using lazyeval and RcppRoll:
library(tidyverse)
vars <- c('b', 'c')
ns <- 1:10
com <- expand.grid(vars, ns, stringsAsFactors = FALSE)
dots <- map2(com[[1]], com[[2]],
~lazyeval::interp(~RcppRoll::roll_meanr(x, y, fill = NA), x = as.name(.x), y = .y))
names(dots) <- apply(com, 1, paste0, collapse = '_')
D %>%
group_by(a) %>%
mutate_(.dots = dots)
Gives:
Source: local data frame [6 x 24]
Groups: a [2]
a b c d `b_ 1` `c_ 1` `b_ 2` `c_ 2` `b_ 3` `c_ 3` `b_ 4` `c_ 4` `b_ 5` `c_ 5` `b_ 6` `c_ 6` `b_ 7` `c_ 7` `b_ 8` `c_ 8` `b_ 9`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 bill 1 2 4 1 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 bob 2 3 5 2 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
3 bill 1 2 1 1 2 1.0 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
4 bob 1 1 7 1 1 1.5 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
5 bill 3 4 3 3 4 2.0 3 1.666667 2.666667 NA NA NA NA NA NA NA NA NA NA NA
6 bob 2 1 4 2 1 1.5 1 1.666667 1.666667 NA NA NA NA NA NA NA NA NA NA NA
# ... with 3 more variables: `c_ 9` <dbl>, b_10 <dbl>, c_10 <dbl>
I am still not completely following you. It seems that you apply a combination of a lag and a rolled mean. For just the rolled mean this is a solution using dplyr and RcppRoll.
roll_mean_na <- function(x, lag){
c(rep(NA, lag - 1), RcppRoll::roll_mean(x, lag, align = "left"))
}
library(dplyr)
df %>% group_by(a) %>%
mutate(b_2 = roll_mean_na(b, 2), c_2 = roll_mean_na(c, 2),
b_3 = roll_mean_na(b, 3), c_3 = roll_mean_na(c, 3),
b_4 = roll_mean_na(b, 4), c_4 = roll_mean_na(c, 4))