Using Lubridate producing NA values - r

I have this data frame;(df)
date Name Name_id x1 x2 x3 x4 x5 x6
01/01/2000 00:00 A U_12 1 1 1 1 1 1
01/01/2000 01:00 A U_12
01/01/2000 02:00
01/01/2000 03:00
....
I am trying to calculate the monthly aggregated mean etc. for some columns using lubridate.
what I did so far;
df$date <- dmy_hm(Sites_tot$date)
df$month <- floor_date(df$date,"month")
monthly_avgerage <- df %>%
group_by(Name, Name_id, month) %>%
summarize_at(vars(x1:x4), .funs = c("mean", "min", "max"), na.rm = TRUE)
I can see the values seem okay although some of the months are turned into NAs.

We can modify the summarise_at to
library(dplyr)
df %>%
group_by(Name, Name_id, month) %>%
summarise(across(x1:x4, list(mean = ~ mean(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE))))
A reproducible example
iris %>%
group_by(Species) %>%
summarise(across(everything(), list(mean = ~ mean(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE))))

If I am not wrong, the challenge is to get date column into datetime format:
Somehow date = dmy_hm(date) does not work:
library(dplyr)
library(lubridate)
df %>%
mutate(date = dmy_hms(paste0(date, ":00")),
month = month(date)) %>%
group_by(Name, Name_id, month) %>%
summarise(across(x1:x4, list(mean = ~ mean(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE))), .groups = "drop")
Name Name_id month x1_mean x1_min x1_max x2_mean x2_min x2_max x3_mean x3_min x3_max
<chr> <chr> <dbl> <dbl> <int> <int> <dbl> <int> <int> <dbl> <int> <int>
1 A U_12 1 1.5 1 2 1.5 1 2 1.5 1 2
2 B U_13 1 3.5 3 4 3.5 3 4 3.5 3 4
# … with 3 more variables: x4_mean <dbl>, x4_min <int>, x4_max <int>
# ℹ Use `colnames()` to see all variable names
fake data:
df <- structure(list(date = c("01/01/2000 00:00", "01/01/2000 01:00",
"01/01/2000 02:00", "01/01/2000 03:00"), Name = c("A", "A", "B",
"B"), Name_id = c("U_12", "U_12", "U_13", "U_13"), x1 = 1:4,
x2 = 1:4, x3 = 1:4, x4 = 1:4, x5 = 1:4, x6 = 1:4), class = "data.frame", row.names = c(NA,
-4L))

Related

R: Cut dataframe but ensure all steps

let's say I have this data:
test_data <- dplyr::tibble(
ID = c(1, 1, 1, 1, 1, 1, 1),
values = c(40, 41, 38, 36, 35, 36, 30),
times = c(as.POSIXct("2020-01-01 00:00:00"),
as.POSIXct("2020-01-01 15:00:00"),
as.POSIXct("2020-01-01 18:00:00"),
as.POSIXct("2020-01-02 14:00:00"),
as.POSIXct("2020-01-03 20:00:00"),
as.POSIXct("2020-01-05 10:00:00"),
as.POSIXct("2020-01-05 14:00:00")))
I now want to extract the last value of each day, beginning with the first timestep. For that i do:
test_data %>%
dplyr::mutate(diff = as.double.difftime(times - min(times), units = "days")) %>%
dplyr::mutate(day = cut(diff, breaks = 0:6, include.lowest = TRUE, right = TRUE, ordered_result = TRUE)) %>%
group_by(ID, day) %>%
filter(row_number()==n()) %>%
select(ID, day, values) %>%
tidyr::pivot_wider(names_from = day, values_from = values)
which gives:
ID `[0,1]` `(1,2]` `(2,3]` `(4,5]`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 38 36 35 30
However, as you see there is a step missing as we have no data from day 3 to 4. Is there a way to ensure that alle intervals are included in the result and that for missing data NA is placed instead?
My only idea would be to add a "dummy user" to the dataframe that has data for all intervals so that it is ensured that all intervals are included.
So what i want is:
ID `[0,1]` `(1,2]` `(2,3]` `(3,4]` `(4,5]`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 38 36 35 NA 30
You can fill missing rows in your dataset looking for missing dates like that:
seq_dates <- tibble(times = seq(min(unique(as.Date(test_data$times))), max(unique(as.Date(test_data$times))), by="days"))
missing_dates <- seq_dates %>% filter(!times %in% unique(as.Date(test_data$times)))
missing_dates$times <- as.POSIXct(missing_dates$times)
missing_dates$ID <- 1
missing_dates$values <- NA
missing_dates <- missing_dates %>% select(ID, values, times)
test_data <- test_data %>% bind_rows(missing_dates) %>% arrange(times)
And then execute your code:
test_data %>%
dplyr::mutate(diff = as.double.difftime(times - min(times), units = "days")) %>%
dplyr::mutate(day = cut(diff, breaks = 0:6, include.lowest = TRUE, right = TRUE, ordered_result = TRUE)) %>%
group_by(ID, day) %>%
filter(row_number()==n()) %>%
select(ID, day, values) %>%
tidyr::pivot_wider(names_from = day, values_from = values)
And get the desired result:
ID `[0,1]` `(1,2]` `(2,3]` `(3,4]` `(4,5]`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 38 36 35 NA 30

Correlation by group and unique pairs

I want to do correlations for each unique combination and grouped by another variable. My solutions works for a very small dataset buy imagine more columns it's getting very tedious.
set.seed((13))
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 1))
df %>%
group_by(group) %>%
summarise(var1_var2 = cor(var1, var2),
var1_var3 = cor(var1, var3),
var2_var3 = cor(var2, var3))
I also tried this one, but it doens't work.
df %>%
group_by(group) %>%
summarise(cor = cor(df[,2:ncol(df)]))
Here is an option. Map out all the combos then run a cor test for each group and each var and then pivot wider at the end:
library(tidyverse)
map_dfr(unique(df$group), \(x){
data.frame(t(combn(c("var1", "var2", "var3"), 2))) |>
mutate(group = x)
}) |>
mutate(cor = pmap_dbl(list(X1, X2, group),
~cor(df[df$group == ..3, ..1],
df[df$group == ..3, ..2]))) |>
unite(test, X1, X2) |>
pivot_wider(names_from = test, values_from = cor)
#> # A tibble: 3 x 4
#> group var1_var2 var1_var3 var2_var3
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.318 -0.476 -0.985
#> 2 B -0.373 -0.487 -0.628
#> 3 C 0.535 0.991 0.645
Another solution. This works for any number of variables.
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
set.seed((13))
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 2))
df %>%
select(-group) %>%
split(df$group) %>%
imap_dfr(
~ {
expand.grid(
first = names(.x),
second = names(.x),
stringsAsFactors = FALSE
) %>%
filter(first < second) %>%
rowwise() %>%
transmute(
group = .y,
pair = paste(first, second, sep = "_"),
cor = cor(.x[[first]], .x[[second]])
)
}
) %>%
pivot_wider(
names_from = "pair",
values_from = "cor"
)
# # A tibble: 3 × 4
# group var1_var2 var1_var3 var2_var3
# <chr> <dbl> <dbl> <dbl>
# 1 A 0.318 -0.476 -0.985
# 2 B -0.373 -0.487 -0.628
# 3 C 0.535 0.991 0.645

How to combine across, summarize, and n() in R to get number of non-NA values by column?

I have a list of questions, and I want to know how many rows have non-NA values using summarize. I want to use summarize because I'm already using that to calculate the average, which works in the below code. Why does the below code not work and how can I fix it?
library(dplyr)
test <- tibble(student = c("j", "c", "s"),
q1 = c(1, 2, 3),
q2 = c(NA_real_, NA_real_, 4),
q3 = c(43, NA_real_, 232))
test %>%
dplyr::summarise(n = across(starts_with("q"), ~n(.x)),
avg = across(contains("q"), ~ round(mean(.x, na.rm = T), 2)))
expected_outcome <- tibble(n_q1 = 3,
n_q2 = 1,
n_q3 = 2,
avg_q1 = 2,
avg_q2 = 4,
avg_q3 = 138)
library(dplyr)
test %>%
summarize(across(starts_with("q"), list(n = ~sum(!is.na(.)),
avg = ~mean(., na.rm = T)),
.names = "{.fn}_{.col}"))
From the ?across documentation, you can pass a list to the .fns argument:
A list of functions/lambdas, e.g. list(mean = mean, n_miss = ~ sum(is.na(.x))
This will apply every function in that list to the columns you have specified. You can then use the .names argument of across to set the column names how you desire.
Output
n_q1 avg_q1 n_q2 avg_q2 n_q3 avg_q3
<int> <dbl> <int> <dbl> <int> <dbl>
1 3 2 1 4 2 138.
Update: Upps I missed the whole question. sorry: But here is an alternative just for fun: The preferred answer is already given by #LMc:
library(dplyr)
test %>%
summarise(across(starts_with("q"), list(avg = ~mean(., na.rm = T)),
.names = "{.fn}_{.col}")) %>%
bind_cols(test %>% purrr::map_df(~sum(!is.na(.))))
avg_q1 avg_q2 avg_q3 student q1 q2 q3
<dbl> <dbl> <dbl> <int> <int> <int> <int>
1 2 4 138. 3 3 1 2
test %>%
summarise(across(starts_with("q"), list(avg = ~mean(., na.rm = T)),
.names = "{.fn}_{.col}")) %>%
bind_cols(test %>% purrr::map_df(~sum(!is.na(.))))
First not full answer:
To get the non-nas of the whole dataset, we could do this:
library(dplyr)
test %>%
purrr::map_df(~sum(!is.na(.)))
student q1 q2 q3
<int> <int> <int> <int>
1 3 3 1 2

Stack columns in a data frame

Can someone help me stack the following data frame so that the as are on top of each other and also the 1s and 2s, preferably using a pipe and form a 3x4 dataframe
df <- rbind(data.frame(X1 = 'a', X2 = 1, X3 = 2, X4 = 'a', X5 = 1, X6 = 2), data.frame(X1 = 'a', X2 = 1, X3 = 2, X4 = 'a', X5 = 1, X6 = 2))
Thank you
Here is a data.table solution...
library(data.table)
cols <- 3
# Split df to chuncks of 3 (=ncol) columns
L <- split.default(df, f = cols:(ncol(df) + 2) %/% cols)
# Rowbind, ignore columns names
data.table::rbindlist(L, use.names = FALSE)
# X1 X2 X3
# 1: a 1 2
# 2: a 1 2
# 3: a 1 2
# 4: a 1 2
Using tidyverse -
library(dplyr)
library(tidyr)
df %>%
mutate(across(.fns = as.character)) %>%
pivot_longer(cols = everything()) %>%
mutate(id = paste0('col', rep(1:3, length.out = n()))) %>%
group_by(id) %>%
mutate(name = row_number()) %>%
pivot_wider(names_from = id, values_from = value) %>%
select(-name)
# col1 col2 col3
# <chr> <chr> <chr>
#1 a 1 2
#2 a 1 2
#3 a 1 2
#4 a 1 2

Proportions by group with survey weights

data = data.frame(ID = 1:1000,
GROUP = factor(sample(1:5, rep = T)),
CAT = factor(sample(1:5, rep = T)),
DOG = factor(sample(1:5, rep = T)),
FOX = factor(sample(1:5, rep = T)),
MOUSE = factor(sample(1:5, rep = T)),
WEIGHT = round(runif(1000)*100,0)
)
data_WANT = data.frame(VARS = c("CAT", "DOG", "FOX", "MOUSE", "WEIGHT"),
GROUP1_N = NA,
GROUP1_PROP = NA,
GROUP2_N = NA,
GROUP2_PROP = NA,
GROUP3_N = NA,
GROUP3_PROP = NA,
GROUP4_N = NA,
GROUP4_PROP = NA,
GROUP5_N = NA,
GROUP5_PROP = NA)
I have a dataframe called 'data' and I wish to create a dataframe or datatable that presents the COUNT(_N) of each variable by GROUP and also the weighted proportion (_PROP) for each variable for each group using the variable WEIGHT in the dataframe called 'data'. This is a probability weight that is given to me to get representative estimates.
We can use data.table methods
library(data.table)
dcast(melt(setDT(type.convert(data, as.is = TRUE))[,
c(list(N = .N), lapply(.SD, weighted.mean, WEIGHT)),
GROUP, .SDcols = CAT:MOUSE], id.var = c('GROUP', 'N'),
variable.name = 'Animal'), Animal ~
paste0('GROUP_', GROUP), value.var = c('value', 'N'))
Perhaps, you are trying to do :
library(dplyr)
library(tidyr)
data %>%
type.convert(as.is = TRUE) %>%
group_by(GROUP) %>%
summarise(across(CAT:MOUSE, list(N = ~n(),
PROP = ~weighted.mean(., WEIGHT)))) %>%
pivot_longer(-GROUP,
names_to = c('Animal', 'prop'),
names_sep = '_') %>%
pivot_wider(names_from = c(GROUP, prop), values_from = value,
names_prefix = 'GROUP_')
# A tibble: 4 x 11
# Animal GROUP_1_N GROUP_1_PROP GROUP_2_N GROUP_2_PROP GROUP_3_N
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 CAT 200 5 200 1 200
#2 DOG 200 5 200 2 200
#3 FOX 200 1 200 3 200
#4 MOUSE 200 2 200 1 200
# … with 5 more variables: GROUP_3_PROP <dbl>, GROUP_4_N <dbl>,
# GROUP_4_PROP <dbl>, GROUP_5_N <dbl>, GROUP_5_PROP <dbl>
The pivot_longer and pivot_wider step is to get data in the same format as shown in data_WANT and they are not necessary to perform the calculation.

Resources