Extrapolating sequence of numbers in R - r

I would like to continue incrementing from the maximum observed number, i.e. the NA values should be replaced with 35, 36, 37, 38, 39
How can I go about it?
library(tidyverse)
# Example data: five observed ids followed by five trailing NAs to be filled
trialdata <- tibble(
id = c(13, 8, 20, 34, 4, NA, NA, NA, NA, NA)
)

If your goal is to fill the NA id rows with a sequence that starts after the maximum non-NA value, then here's one way you could do it:
trialdata %>%
  mutate(
    # Each NA row advances the counter by one; anchor the sequence just past
    # the largest observed id. (TRUE, not the reassignable shorthand T.)
    id_filled = cumsum(is.na(id)) + max(id, na.rm = TRUE),
    # Keep observed ids as-is; only NA rows take the generated sequence.
    id_filled = coalesce(id, id_filled)
  )
id id_filled
<dbl> <dbl>
1 13 13
2 8 8
3 20 20
4 34 34
5 4 4
6 NA 35
7 NA 36
8 NA 37
9 NA 38
10 NA 39

Here is one option that could work on NAs that are between as well
library(dplyr)
library(tidyr)
library(data.table)
trialdata %>%
mutate(id1 = id) %>%  # copy id so the original NA positions stay available
fill(id1, .direction = "downup") %>%  # fill the copy from the nearest non-NA (down first, then up)
group_by(grp = rleid(id1)) %>%  # data.table::rleid: one group per run of identical filled values
mutate(id = id[!is.na(id)] + row_number() - row_number()[!is.na(id)]) %>%  # offset each row from the run's single non-NA anchor
ungroup %>%
select(-id1, -grp)
-output
# A tibble: 10 × 1
id
<dbl>
1 4
2 8
3 13
4 20
5 34
6 35
7 36
8 37
9 38
10 39
On another data
trialdata1 <- structure(list(id = c(NA, 4, 10, NA, NA, 20, NA)),
class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -7L))
the output is
# A tibble: 7 × 1
id
<dbl>
1 3
2 4
3 10
4 11
5 12
6 20
7 21

Here is an alternative:
library(dplyr)  # fix: was misspelled "libaray" in the original, so nothing loaded
library(tidyr)
trialdata %>%
  mutate(id1 = row_number()) %>%  # remember the original row order
  arrange(id) %>%                 # sort so the NA rows fall after the maximum id
  fill(id) %>%                    # carry the last (largest) id into the NA rows
  group_by(id) %>%
  # Within a run of duplicated filled ids, offset each extra row by its position
  mutate(id = ifelse(row_number() > 1, id + row_number() - 1, id)) %>%
  arrange(id1) %>%                # restore the original row order
  ungroup() %>%
  select(-id1)
id
<dbl>
1 13
2 8
3 20
4 34
5 4
6 35
7 36
8 37
9 38
10 39

For trailing NAs, you could do something like this.
# Replace trailing NAs with a sequence continuing from the last non-NA value.
# Indexing the last non-NA directly is safer than the original
# x[sum(x > 0, na.rm=TRUE)] trick, which assumed every observed value is
# positive; the result for the data below is identical.
x[is.na(x)] <- x[max(which(!is.na(x)))] + seq(sum(is.na(x)))
x
# [1] 4 8 13 20 34 35 36 37 38 39
Data:
x <- c(sort(c(13, 8, 20, 34, 4)), NA, NA, NA, NA, NA)

Related

Is there an elegant way to replace NAs with values from a corresponding column, for multiple columns, in R?

I'm working with a dataframe of trial participant blood test results, with some sporadic missing values (analyte failed). Fortunately we have two time points quite close together, so for missing values at timepoint 1, I'm hoping to impute the corresponding value from timepoint 2.
I am just wondering, if there is an elegant way to code this in R/tidyverse for multiple test results?
Here is some sample data:
# Fix: the snippet was missing the enclosing `test <- tibble(` call, so it was
# not runnable as posted (the complete version appears further down).
# NOTE(review): later code selects an ID column that this sample data does not
# define — confirm the real data includes one.
test <- tibble(
  timepoint = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
  fst_test = c(NA, sample(1:40, 9, replace = FALSE)),
  scd_test = c(sample(1:20, 8, replace = FALSE), NA, NA))
So far I have been pivoting wider, then manually coalescing the corresponding test results, like so:
test %>%
pivot_wider(names_from = timepoint,
values_from = fst_test:scd_test) %>%
mutate(fst_test_imputed = coalesce(fst_test_1, fst_test_2),
scd_test_imputed = coalesce(scd_test_1, scd_test_2)) %>%
select(ID, fst_test_imputed, scd_test_imputed)
However for 15 tests this is cumbersome...
I thought there might be an elegant R / dplyr solution for this situation?
Many thanks in advance for your help!!
We could use fill after creating a grouping column with rowid on the 'timepoint' (as the OP mentioned to replace with corresponding data point in 'timepoint' column). Then, we just need fill and specify the .direction as "updown" to fill NA in the preceding value with the succeeding non-NA first (if it should be only to take care of 'NA' in 'timepoint' 1, then change the .direction = "up")
library(dplyr)
library(tidyr)
library(data.table)
# data.table::rowid(timepoint) numbers rows within each timepoint (1..5), so
# grp pairs the i-th row of timepoint 1 with the i-th row of timepoint 2.
test %>%
group_by(grp = rowid(timepoint)) %>%
fill(fst_test, scd_test, .direction = "updown") %>%  # fill NA from the paired row: succeeding value first, then preceding
ungroup %>%
select(-grp)
data
test <- structure(list(timepoint = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
fst_test = c(NA,
16L, 30L, 29L, 14L, 32L, 21L, 20L, 3L, 23L), scd_test = c(18L,
17L, 8L, 20L, 1L, 10L, 14L, 19L, NA, NA)),
class = "data.frame", row.names = c(NA,
-10L))
You could pivot your data so that "timepoint" defines the columns, with all your tests on the rows. In order to perform this pivot without creating list-cols, we'll have to group by "timepoint" and create an index for each row within the group:
# Fix: the original had one closing parenthesis too many after the scd_test
# line, which made the expression a syntax error.
test <- tibble(
  timepoint = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
  fst_test = c(NA, sample(1:40, 9, replace = FALSE)),   # one NA at timepoint 1
  scd_test = c(sample(1:20, 8, replace = FALSE), NA, NA))  # two NAs at timepoint 2
test_pivoted <- test %>%
group_by(timepoint) %>%
mutate(idx = row_number()) %>%
pivot_longer(-c(timepoint, idx)) %>%
pivot_wider(names_from = timepoint, values_from = value, names_prefix = 'timepoint')
idx name timepoint1 timepoint2
<int> <chr> <int> <int>
1 1 fst_test NA 39
2 1 scd_test 5 10
3 2 fst_test 37 7
4 2 scd_test 20 3
5 3 fst_test 5 26
6 3 scd_test 19 11
7 4 fst_test 17 28
8 4 scd_test 9 NA
9 5 fst_test 14 32
10 5 scd_test 8 NA
Now we can coalesce once across the two timepoints for all tests:
test_pivoted %>%
mutate(
imputed = coalesce(timepoint1, timepoint2)
)
idx name timepoint1 timepoint2 imputed
<int> <chr> <int> <int> <int>
1 1 fst_test NA 39 39
2 1 scd_test 5 10 5
3 2 fst_test 37 7 37
4 2 scd_test 20 3 20
5 3 fst_test 5 26 5
6 3 scd_test 19 11 19
7 4 fst_test 17 28 17
8 4 scd_test 9 NA 9
9 5 fst_test 14 32 14
10 5 scd_test 8 NA 8
And if you wanted to clean up the result a little more:
test_pivoted %>%
mutate(
imputed = coalesce(timepoint1, timepoint2)
) %>%
select(name, idx, imputed) %>%
pivot_wider(names_from = name, values_from = imputed)
idx fst_test scd_test
<int> <int> <int>
1 1 39 5
2 2 37 20
3 3 5 19
4 4 17 9
5 5 14 8

Rolling rowsum over existing data frame with NAs in r

Given the data frame:
df1 <- data.frame(Company = c('A','B','C','D','E'),
`X1980` = c(NA, 5, 3, 8, 13),
`X1981` = c(NA, 12, NA, 11, 29),
`X1982` = c(33, NA, NA, 41, 42),
`X1983` = c(45, 47, 53, NA, 55))
I would like to create a new data frame where each value is replaced by the sum of the current value and the previous value of the row. NAs should be kept as they are.
This should result in the following data frame:
Company 1980 1981 1982 1983
A NA NA 33 78
B 5 17 NA 47
C 3 NA NA 53
D 8 19 60 NA
E 13 42 84 139
Here is a tidyverse approach
library(dplyr)
library(tidyr)
library(purrr)
df1 %>%
pivot_longer(matches("\\d{4}$")) %>%  # long form: one row per Company/year column
group_by(Company) %>%
# Running sum along each Company's row: when adding the previous total yields
# NA (current or previous value is NA), restart the accumulation at the
# current value .y — which also leaves NA in the output where the input is NA.
mutate(value = accumulate(value, ~if (is.na(out <- .x + .y)) .y else out)) %>%
pivot_wider()
Output
# A tibble: 5 x 5
# Groups: Company [5]
Company X1980 X1981 X1982 X1983
<chr> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 33 78
2 B 5 17 NA 47
3 C 3 NA NA 53
4 D 8 19 60 NA
5 E 13 42 84 139

Counting the number of values that are more than 60 for each row

I have a data frame that looks like this:
location td1_2019 td2_2019 td3_2019 td4_2019 td1_2020 td2_2020 td3_2020 td4_2020
1 a 50 55 60 58 63 55 60 58
2 b 45 65 57 50 61 66 62 59
3 c 61 66 62 59 45 65 57 50
here, td1_2019 = temperature day1 in 2019 ... and so on
I want to count the number of days the temperature was 60 or above, for both 2019 and 2020, for each location. I want the table to look like the following:
location 2019 2020
1 a 1 2
2 b 1 3
3 c 3 1
I am using R, so I would prefer a solution in R. Any help would be appreciated! Thank you!
A dplyr solution
library(dplyr)
library(tidyr)  # fix: pivot_longer()/pivot_wider() live in tidyr, which was not loaded
df1 %>%
  pivot_longer(
    -location,
    # "td1_2019" -> day "1", year "2019"
    names_to = c("day", "year"),
    names_pattern = "td(\\d)_(\\d{4})",
    values_to = "temperature"
  ) %>%
  group_by(year, location) %>%
  # 60 itself counts, matching the expected output shown in the question
  summarise(n = sum(temperature >= 60)) %>%
  pivot_wider(names_from = "year", values_from = "n")
A Base R solution
nms <- names(df1)
cond <- df1 >= 60  # element-wise: TRUE where a value is at least 60
# For each year string, append a column of row-wise TRUE counts over the
# columns whose names contain that year, starting from just the location column.
Reduce(
function(out, y) `[[<-`(out, y, value = rowSums(cond[, which(grepl(y, nms))])),
c("2019", "2020"),
init = df1[, "location", drop = FALSE]
)
Output
location `2019` `2020`
<chr> <int> <int>
1 a 1 2
2 b 1 3
3 c 3 1
Assume that df1 looks like this
> df1
# A tibble: 3 x 9
location td1_2019 td2_2019 td3_2019 td4_2019 td1_2020 td2_2020 td3_2020 td4_2020
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 50 55 60 58 63 55 60 58
2 b 45 65 57 50 61 66 62 59
3 c 61 66 62 59 45 65 57 50
Does this work: I think you want something more year wise.
> library(dplyr)
> temp %>% pivot_longer(-location, names_to = c('td', 'year'), names_pattern = '(.*)_(.*)', values_to = 'temp') %>%
+ filter(temp >= 60) %>% count(location, year, name = 'Count') %>%
+ pivot_wider(location, names_from = year, values_from = Count, values_fill = list(Count = 0))
# A tibble: 3 x 3
location `2019` `2020`
<chr> <int> <int>
1 a 1 2
2 b 1 3
3 c 3 1
>
You can use the following tidy solution. Just as in the other solutions posted (which are very nice), a key move is to get the data in a long format using pivot_longer().
library(dplyr)
library(tidyr)
library(stringr)
data %>%
pivot_longer(-location) %>%
mutate(year = str_sub(name, -2)) %>%
group_by(location, year) %>%
mutate(above60 = sum(value >= 60)) %>%
ungroup() %>%
distinct(location, year, above60) %>%
pivot_wider(names_from = year, values_from = above60)
# location `19` `20`
# <chr> <int> <int>
# 1 a 1 2
# 2 b 1 3
# 3 c 3 1
data
structure(list(location = c("a", "b", "c"), td1_2019 = c(50,
45, 61), td2_2019 = c(55, 65, 66), td3_2019 = c(60, 57, 62),
td4_2019 = c(58, 50, 59), td1_2020 = c(63, 61, 45), td2_2020 = c(55,
66, 65), td3_2020 = c(60, 62, 57), td4_2020 = c(58, 59, 50
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
A base R option
# NOTE(review): this answer refers to `df`; the question's data is named df1 —
# presumably the same table.
cbind(
df[1],  # keep the location column
list2DF(
lapply(
split.default(
as.data.frame(df[-1] >= 60),  # logical columns: TRUE where at least 60
gsub(".*?(\\d+)$", "\\1", names(df)[-1],  # trailing year digits of each name
perl = TRUE
)
),
rowSums  # count TRUEs per row within each year's group of columns
)
)
)
which gives
location 2019 2020
1 a 1 2
2 b 1 3
3 c 3 1

R Replace NA for all Columns Except *

library(tidyverse)
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#> # A tibble: 4 x 5
#> Date col1 thisCol thatCol col999
#> <date> <int> <dbl> <int> <dbl>
#> 1 2020-01-01 1 NA 25 99
#> 2 2020-01-01 2 8 26 99
#> 3 2020-01-01 3 NA 27 99
#> 4 NA 4 3 28 99
My actual R data frame has hundreds of columns that aren't neatly named, but can be approximated by the df data frame above.
I want to replace all values of NA with 0, with the exception of several columns (in my example I want to leave out the Date column and the thatCol column. I'd want to do it in this sort of fashion:
df %>% replace(is.na(.), 0)
#> Error: Assigned data `values` must be compatible with existing data.
#> i Error occurred for column `Date`.
#> x Can't convert <double> to <date>.
#> Run `rlang::last_error()` to see where the error occurred.
And my unsuccessful ideas for accomplishing the "everything except" replace NA are shown below.
df %>% replace(is.na(c(., -c(Date, thatCol)), 0))
df %>% replace_na(list([, c(2:3, 5)] = 0))
df %>% replace_na(list(everything(-c(Date, thatCol)) = 0))
Is there a way to select everything BUT in the way I need to? There's hundred of columns, named inconsistently, so typing them one by one is not a practical option.
You can use mutate_at :
library(dplyr)
Remove them by Name
df %>% mutate_at(vars(-c(Date, thatCol)), ~replace(., is.na(.), 0))
Remove them by position
df %>% mutate_at(-c(1,4), ~replace(., is.na(.), 0))
Select them by name
df %>% mutate_at(vars(col1, thisCol, col999), ~replace(., is.na(.), 0))
Select them by position
df %>% mutate_at(c(2, 3, 5), ~replace(., is.na(.), 0))
If you want to use replace_na
df %>% mutate_at(vars(-c(Date, thatCol)), tidyr::replace_na, 0)
Note that mutate_at is soon going to be replaced by across in dplyr 1.0.0.
You have several options here based on data.table.
One of the coolest options: setnafill (version >= 1.12.4):
library(data.table)
setDT(df)
# Fix: the original call had an unbalanced parenthesis and referenced the
# column name thatCol without quotes; fill NAs with 0 in every column except
# Date and thatCol. The data frame is updated by reference.
data.table::setnafill(
  df, fill = 0,
  cols = colnames(df)[!(colnames(df) %in% c("Date", "thatCol"))]
)
Note that your dataframe is updated by reference.
Another base solution:
to_change<-grep("^(this|col)",names(df))
df[to_change]<- sapply(df[to_change],function(x) replace(x,is.na(x),0))
df
# A tibble: 4 x 5
Date col1 thisCol thatCol col999
<date> <dbl> <dbl> <int> <dbl>
1 2020-01-01 1 0 25 99
2 2020-01-01 2 8 26 99
3 2020-01-01 3 0 27 99
4 NA 0 3 28 99
Data(I changed one value):
df <- structure(list(Date = structure(c(18262, 18262, 18262, NA), class = "Date"),
col1 = c(1L, 2L, 3L, NA), thisCol = c(NA, 8, NA, 3), thatCol = 25:28,
col999 = c(99, 99, 99, 99)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
replace works on a data.frame, so we can just do the replacement by index and update the original dataset
df[-c(1, 4)] <- replace(df[-c(1, 4)], is.na(df[-c(1, 4)]), 0)
Or using replace_na with across (from the new dplyr)
library(dplyr)
library(tidyr)
df %>%
mutate(across(-c(Date, thatCol), ~ replace_na(., 0)))
If you know the ones that you don't want to change, you could do it like this:
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#dplyr
df_nonreplace <- select(df, c("Date", "thatCol"))
df_replace <- df[ ,!names(df) %in% names(df_nonreplace)]
df_replace[is.na(df_replace)] <- 0
df <- cbind(df_nonreplace, df_replace)
> head(df)
Date thatCol col1 thisCol col999
1 2020-01-01 25 1 0 99
2 2020-01-01 26 2 8 99
3 2020-01-01 27 3 0 99
4 <NA> 28 4 3 99

1 Day lag, 3 Day Average Lags and 7 Average Day lags for all columns in R

I have 158 columns in a dataset. I want to create 3 new columns (1-day, 3-day and 7-day lag) for each column.
# Fix: the original DATE vector was missing its closing parenthesis and held
# 10 dates while each Attr column holds 9 values; data.frame() would error on
# the length mismatch. Nine dates match the dput() data shown below.
data <- data.frame(DATE = c("1/1/2016", "1/2/2016", "1/3/2016", "1/4/2016",
                            "1/5/2016", "1/6/2016", "1/7/2016", "1/8/2016",
                            "1/9/2016"),
                   Attr1 = c(5, 8, 7, 6, 2, 1, 4, 1, 2),
                   Attr2 = c(10, 23, 32, 12, 3, 2, 5, 3, 21),
                   Attr3 = c(12, 23, 43, 3, 2, 4, 1, 23, 33))
The result wanted is as follows :
Attr1_3D = Average of last 3 days of Attr1
Attr1_7D = Average of last 7 days of Attr1
Attr2_3D = Average of last 3 days of Attr2
Attr2_7D = Average of last 7 days of Attr2
Attr3_3D = Average of last 3 days of Attr3
Attr3_7D = Average of last 7 days of Attr3
One approach using tidyverse and zoo is below. You can use rollapply from zoo package to get rolling means (by 1, 3, or 7 days).
Edit: Also added offset by 1 day (as rolling mean values are included on the day after the X-day window). Also joining back to original data frame to include original Attr columns.
library(tidyverse)
library(zoo)
data %>%
pivot_longer(starts_with("Attr"), names_to = "Attr", values_to = "Value") %>%
group_by(Attr) %>%
mutate(Attr_1D = rollapply(Value, 1, mean, align = 'right', fill = NA),
Attr_3D = rollapply(Value, 3, mean, align = 'right', fill = NA),
Attr_7D = rollapply(Value, 7, mean, align = 'right', fill = NA),
DATE = lead(DATE)) %>%
pivot_wider(id_cols = DATE, names_from = "Attr", values_from = c("Attr_1D", "Attr_3D", "Attr_7D")) %>%
right_join(data)
Output
# A tibble: 9 x 13
DATE Attr_1D_Attr1 Attr_1D_Attr2 Attr_1D_Attr3 Attr_3D_Attr1 Attr_3D_Attr2 Attr_3D_Attr3 Attr_7D_Attr1 Attr_7D_Attr2 Attr_7D_Attr3 Attr1 Attr2 Attr3
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1/1/2016 NA NA NA NA NA NA NA NA NA 5 10 12
2 1/2/2016 5 10 12 NA NA NA NA NA NA 8 23 23
3 1/3/2016 8 23 23 NA NA NA NA NA NA 7 32 43
4 1/4/2016 7 32 43 6.67 21.7 26 NA NA NA 6 12 3
5 1/5/2016 6 12 3 7 22.3 23 NA NA NA 2 3 2
6 1/6/2016 2 3 2 5 15.7 16 NA NA NA 1 2 4
7 1/7/2016 1 2 4 3 5.67 3 NA NA NA 4 5 1
8 1/8/2016 4 5 1 2.33 3.33 2.33 4.71 12.4 12.6 1 3 23
9 1/9/2016 1 3 23 2 3.33 9.33 4.14 11.4 14.1 2 21 33
Data
data <- structure(list(DATE = structure(1:9, .Label = c("1/1/2016", "1/2/2016",
"1/3/2016", "1/4/2016", "1/5/2016", "1/6/2016", "1/7/2016", "1/8/2016",
"1/9/2016"), class = "factor"), Attr1 = c(5, 8, 7, 6, 2, 1, 4,
1, 2), Attr2 = c(10, 23, 32, 12, 3, 2, 5, 3, 21), Attr3 = c(12,
23, 43, 3, 2, 4, 1, 23, 33)), class = "data.frame", row.names = c(NA,
-9L))

Resources