Identifying start-end in a time use data - r

I am working with time-use data and want to calculate the duration of a started measurement at each step per id. I was considering using the colsum function, but I am not sure how to handle if an id an activity is fragmented and has multiple starting times.
Example of output for id 1 and 10
id start-end duration
1 04:00-06:20 15
10 04:00-4:10 2
10 04:50-06:20 10
Sample data:
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14), `04:00` = c(11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11), `04:10` = c(11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11), `04:20` = c(11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11), `04:30` = c(11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:40` = c(11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:00` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:10` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:20` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:30` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:40` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `06:00` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:10` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:20` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11)), row.names = c(NA,
-14L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), `04:00` = structure(list(), class = c("collector_double",
"collector")), `04:10` = structure(list(), class = c("collector_double",
"collector")), `04:20` = structure(list(), class = c("collector_double",
"collector")), `04:30` = structure(list(), class = c("collector_double",
"collector")), `04:40` = structure(list(), class = c("collector_double",
"collector")), `04:50` = structure(list(), class = c("collector_double",
"collector")), `05:00` = structure(list(), class = c("collector_double",
"collector")), `05:10` = structure(list(), class = c("collector_double",
"collector")), `05:20` = structure(list(), class = c("collector_double",
"collector")), `05:30` = structure(list(), class = c("collector_double",
"collector")), `05:40` = structure(list(), class = c("collector_double",
"collector")), `05:50` = structure(list(), class = c("collector_double",
"collector")), `06:00` = structure(list(), class = c("collector_double",
"collector")), `06:10` = structure(list(), class = c("collector_double",
"collector")), `06:20` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

Assume df is the provided dataset:
library(tidyverse)
df %>%
pivot_longer(cols = -id) %>%
mutate(id_interval = value == 0,
id_interval = cumsum(id_interval) + id) %>%
filter(value != 0) %>%
group_by(id_interval) %>%
summarise(
id = unique(id),
`start-end` = str_c(head(name, 1), tail(name, 1), sep = "-"),
duration = n()) %>%
select(-id_interval)
This yields
# A tibble: 14 × 3
id `start-end` duration
<dbl> <chr> <int>
1 1 04:00-06:20 15
2 2 04:00-05:50 12
3 3 04:00-06:20 15
4 4 04:00-06:20 15
5 5 04:00-06:20 15
6 6 04:00-06:20 15
7 7 04:00-06:20 15
8 8 04:00-05:50 12
9 9 04:00-05:50 12
10 10 04:00-06:20 15
11 11 04:00-06:20 15
12 12 04:00-06:20 15
13 13 04:00-06:20 15
14 14 04:00-06:20 15
Note that the provided dput input does not correspond to the image above.
It looks more like this:

solution using data.table
library(data.table)
setDT(dt)
# your sample data does not illustrate it well like on your screenshot
# add some "breaks" on second row, col 4-6
dt[2,4:6] <- 0
dl <- melt(dt, id.vars = "id")
setorder(dl, id, variable)
dl[, .(`start-end` = paste0(first(variable), "-", last(variable)), duration = .N), by = .(id, rleid(value), value > 0)][value == T, .(id, `start-end`, duration)]
output
# id start-end duration
# 1: 1 04:00-06:20 15
# 2: 2 04:00-04:10 2
# 3: 2 04:50-05:50 7
# 4: 3 04:00-06:20 15
# 5: 4 04:00-06:20 15
# 6: 5 04:00-06:20 15
# 7: 6 04:00-06:20 15
# 8: 7 04:00-06:20 15
# 9: 8 04:00-05:50 12
# 10: 9 04:00-05:50 12
# 11: 10 04:00-06:20 15
# 12: 11 04:00-06:20 15
# 13: 12 04:00-06:20 15
# 14: 13 04:00-06:20 15
# 15: 14 04:00-06:20 15
data
dt <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14), `04:00` = c(11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11), `04:10` = c(11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11), `04:20` = c(11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11), `04:30` = c(11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:40` = c(11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:00` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:10` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:20` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:30` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:40` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `06:00` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:10` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:20` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11)), row.names = c(NA,
-14L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), `04:00` = structure(list(), class = c("collector_double",
"collector")), `04:10` = structure(list(), class = c("collector_double",
"collector")), `04:20` = structure(list(), class = c("collector_double",
"collector")), `04:30` = structure(list(), class = c("collector_double",
"collector")), `04:40` = structure(list(), class = c("collector_double",
"collector")), `04:50` = structure(list(), class = c("collector_double",
"collector")), `05:00` = structure(list(), class = c("collector_double",
"collector")), `05:10` = structure(list(), class = c("collector_double",
"collector")), `05:20` = structure(list(), class = c("collector_double",
"collector")), `05:30` = structure(list(), class = c("collector_double",
"collector")), `05:40` = structure(list(), class = c("collector_double",
"collector")), `05:50` = structure(list(), class = c("collector_double",
"collector")), `06:00` = structure(list(), class = c("collector_double",
"collector")), `06:10` = structure(list(), class = c("collector_double",
"collector")), `06:20` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

Related

Identifying sequences per column

I am working with time-use data and want to calculate the duration of a started measurement at each time step (per column) and select the longest duration for each measurement. The measurement are numbered from 1 to 27. The length is weighted with 1 (e.g increment is set to 1). I am not sure how to handle if a measurement is fragmented and has multiple durations times.
Data format:
Desired output (example for the measurement number 1):
Time Measurement Duration
04:00 1 1
04:10 1 1
04:20 1 2
04:20 1 2
04:20 1 2
Longest duration
Time Measurement Duration
04:20 1 2
Sample data:
df<-structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14), `04:00` = c(1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11), `04:10` = c(1, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11), `04:20` = c(1, 11, 1, 1, 11, 11, 11, 11, 11,
1, 1, 11, 11, 11), `04:30` = c(1, 11, 1, 1, 3, 11, 11, 11, 11,
1, 1, 13, 11, 11), `04:40` = c(1, 11, 1, 1, 3, 12, 11, 11, 4,
1, 1, 13, 4, 11), `04:50` = c(4, 11, 11, 11, 3, 12, 11, 11, 4,
11, 11, 13, 4, 11), `05:00` = c(4, 11, 11, 11, 3, 12, 11, 11,
4, 13, 11, 13, 4, 11), `05:10` = c(4, 11, 11, 11, 3, 12, 11,
11, 4, 13, 11, 13, 4, 11), `05:20` = c(4, 11, 11, 11, 11, 13,
4, 11, 4, 13, 11, 13, 4, 11), `05:30` = c(4, 11, 11, 11, 11,
13, 4, 13, 4, 13, 11, 1, 4, 13), `05:40` = c(4, 11, 3, 11, 11,
13, 4, 13, 11, 13, 11, 1, 1, 13), `05:50` = c(11, 11, 3, 11,
11, 13, 4, 13, 11, 13, 11, 1, 11, 13), `06:00` = c(11, 1, 3,
11, 11, 13, 4, 13, 1, 11, 11, 11, 11, 13), `06:10` = c(11, 1,
3, 11, 11, 13, 4, 13, 1, 11, 11, 11, 11, 13), `06:20` = c(11,
1, 3, 11, 11, 11, 11, 13, 1, 11, 11, 11, 11, 13)), row.names = c(NA,
-14L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), `04:00` = structure(list(), class = c("collector_double",
"collector")), `04:10` = structure(list(), class = c("collector_double",
"collector")), `04:20` = structure(list(), class = c("collector_double",
"collector")), `04:30` = structure(list(), class = c("collector_double",
"collector")), `04:40` = structure(list(), class = c("collector_double",
"collector")), `04:50` = structure(list(), class = c("collector_double",
"collector")), `05:00` = structure(list(), class = c("collector_double",
"collector")), `05:10` = structure(list(), class = c("collector_double",
"collector")), `05:20` = structure(list(), class = c("collector_double",
"collector")), `05:30` = structure(list(), class = c("collector_double",
"collector")), `05:40` = structure(list(), class = c("collector_double",
"collector")), `05:50` = structure(list(), class = c("collector_double",
"collector")), `06:00` = structure(list(), class = c("collector_double",
"collector")), `06:10` = structure(list(), class = c("collector_double",
"collector")), `06:20` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Here's a function, mainly using rle, that will get you the desired output for a specific measurement:
f <- function(n){
l <- lapply(df[-1], \(x) with(rle(x), lengths[values == n]))
enframe(l, name = "Time", value = "Duration") %>%
unnest("Duration") %>%
mutate(Measurement = n, .before = "Duration")
}
output
> f(1)
# A tibble: 20 × 3
Time Measurement Duration
<chr> <dbl> <int>
1 04:00 1 1
2 04:10 1 1
3 04:20 1 1
4 04:20 1 2
5 04:20 1 2
6 04:30 1 1
7 04:30 1 2
8 04:30 1 2
9 04:40 1 1
10 04:40 1 2
11 04:40 1 2
12 05:30 1 1
13 05:40 1 2
14 05:50 1 1
15 06:00 1 1
16 06:00 1 1
17 06:10 1 1
18 06:10 1 1
19 06:20 1 1
20 06:20 1 1
Get the maximum with slice_max:
f(1) %>%
slice_max(Duration, n = 1, with_ties = F)
# A tibble: 1 × 3
Time Measurement Duration
<chr> <dbl> <int>
1 04:20 1 2
library(tidyverse)
library(lubridate)
df %>%
pivot_longer(-id, names_to = "timepoint", values_to = "Measurement") %>%
arrange(id, Measurement) %>%
type_convert() %>%
group_by(id) %>%
# Duration to first time point for each id
mutate(Duration = timepoint - min(timepoint)) %>%
# get the longest duration
filter(Duration == max(Duration))
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> timepoint = col_time(format = "")
#> )
#> # A tibble: 14 × 4
#> # Groups: id [14]
#> id timepoint Measurement Duration
#> <dbl> <time> <dbl> <drtn>
#> 1 1 06:20 11 8400 secs
#> 2 2 06:20 1 8400 secs
#> 3 3 06:20 3 8400 secs
#> 4 4 06:20 11 8400 secs
#> 5 5 06:20 11 8400 secs
#> 6 6 06:20 11 8400 secs
#> 7 7 06:20 11 8400 secs
#> 8 8 06:20 13 8400 secs
#> 9 9 06:20 1 8400 secs
#> 10 10 06:20 11 8400 secs
#> 11 11 06:20 11 8400 secs
#> 12 12 06:20 11 8400 secs
#> 13 13 06:20 11 8400 secs
#> 14 14 06:20 13 8400 secs
Created on 2022-05-16 by the reprex package (v2.0.0)

Find the total number of times each of a possible range of values occurs across three separate variables in R

I have data which looks like this:
library(stringr)
library(dplyr)
library(magrittr)
Codes = c(1, 2, 3, 4, 5, 6, 9)
Codes2 = c(Codes, rep(9, 100))
data <- data.frame(
MASTER_HCU_DI = do.call(paste0, Map(stri_rand_strings, n=100, length=c(4, 3),
pattern = c('[A-Z]', '[0-9]'))),
CODE_1 = sample(Codes, 100, replace = T))
data %<>%
mutate(CODE_2 = if_else(CODE_1 == 9, 9, sample(Codes2, 100, replace = T)),
CODE_3 = if_else(CODE_2 == 9, 9, sample(Codes2, 100, replace = T)))
What I want to do is find the total number of people with each of the possible values of CODE_1, CODE_2, and CODE_3; across all three Codes.
Where all of someone's CODE start with a 9, they are counted as missing. Otherwise, I'd like to ignore the CODE values which start with a 9.
This code does what I want, but seems cumbersome:
data %<>%
mutate(Sum_grp1 = if_else(CODE_1 == 1 | CODE_2 == 1 | CODE_3 == 1, 1, 0),
Sum_grp2 = if_else(CODE_1 == 2 | CODE_2 == 2 | CODE_3 == 2, 1, 0),
Sum_grp3 = if_else(CODE_1 == 3 | CODE_2 == 3 | CODE_3 == 3, 1, 0),
Sum_grp4 = if_else(CODE_1 == 4 | CODE_2 == 4 | CODE_3 == 4, 1, 0),
Sum_grp5 = if_else(CODE_1 == 5 | CODE == 5 | CODE_3 == 5, 1, 0),
Sum_grp6 = if_else(CODE_1 == 6 | CODE_2 == 6 | CODE_3 == 6, 1, 0),
Missing = if_else(CODE_1 == 9 & CODE_2 == 9 & CODE_3 == 9, 1, 0))
Group_counts <- data.frame(
Group = c("Group_1", "Group_2", "Group_3", "Group_4", "Group_5", "Group_6", "Missing"),
Sum = c(sum(data$Sum_grp1 == 1),
sum(data$Sum_grp2 == 1),
sum(data$Sum_grp3 == 1),
sum(data$Sum_grp4 == 1),
sum(data$Sum_grp5 == 1),
sum(data$Sum_grp6 == 1),
sum(data$Missing == 1)))
Expected output looks like this:
Is there an easier way to do this?
Thanks.
You can get the data in long format and use count -
library(dplyr)
library(tidyr)
data %>% pivot_longer(cols = -MASTER_HCU_DI) %>% count(name, value)
Is this what you expect?
data %>% pivot_longer(cols = -MASTER_HCU_DI) %>% group_by(name) %>%
summarise(Sum = sum(value), .groups = 'drop')
# A tibble: 3 x 2
name Sum
<chr> <dbl>
1 GROUP_1 409
2 GROUP_2 897
3 GROUP_3 900
As I understand it, the following functionality outlined in the question is not addressed by the existing answers:
Where all of someone's CODE start with a 9, they are counted as missing. Otherwise, I'd like to ignore the CODE values which start with a 9.
Here is my approach to include this functionality:
library(purrr)
library(dplyr)
data %>%
pmap_dfr(~ table(c(...)[-1])) %>%
set_names(~ paste0("Group_", .x)) %>%
mutate(Missing = ifelse(`Group_9` == 3, 1, NA)) %>%
select(-`Group_9`) %>%
colSums(na.rm = T) %>%
tibble::tibble(Group = names(.), Sum = .) %>%
arrange(Group)
Returns:
# A tibble: 7 x 2
Group Sum
<chr> <dbl>
1 Group_1 23
2 Group_2 13
3 Group_3 16
4 Group_4 13
5 Group_5 13
6 Group_6 11
7 Missing 15
Data used:
data <- structure(list(MASTER_HCU_DI = c("VBHT228", "CAAO199", "NDDI124", "AVZV996", "KMOP513", "AALT248", "IGZC617", "ZDHO229", "GXYV745", "PDTW465", "SEPM505", "ZJWQ323", "VRRU692", "NHOY962", "BBFR276", "NVML939", "VHPV534", "YTXG467", "BOCT360", "ONEO498", "CICL849", "SAIK461", "NZGL739", "NIFD497", "XMVE276", "JHZM922", "LCLV707", "BPKN209", "YTZU211", "LUNI891", "CQTC089", "FBDZ269", "VKCI112", "BLJH968", "LLML439", "TDRV973", "RTFR863", "GZAN917", "WSUI006", "JILN883", "CAHM719", "JCMI028", "BGFZ774", "BGVZ374", "WBUJ792", "DLVT690", "AVKE534", "TDPU030", "SKFI697", "UCLY688", "OODZ687", "IIPR924", "TSES431", "CQSN693", "ZQGJ398", "FMGH661", "ZORF207", "MDWD343", "OBDM142", "SATV193", "MUKZ136", "INAE029", "MWDB125", "JUXN395", "LQGW143", "ALKP557", "WQAR962", "UYZI622", "WKYM520", "WUMH621", "GLRV451", "ISHG990", "OCNW161", "WQMS244", "UQEF227", "IAEZ636", "TEZJ280", "GCCJ844", "EVTF869", "JGJH568", "MDPH890", "EHKR422", "NBIM361", "XEWM477", "PBJP921", "FGEG840", "UJOO120", "XZTB081", "GXCQ610", "ANAR117", "TNIP023", "GLFN787", "SYYV532", "GOTY296", "TXME798", "SUZK405", "VWHY631", "HAXW159", "CCJN761", "GGUN719"), GROUP_1 = c(6, 1, 4, 9, 1, 3, 3, 2, 3, 2, 1, 9, 4, 3, 1, 1, 4, 6, 5, 3, 3, 3, 9, 9, 2, 3, 4, 6, 1, 1, 1, 9, 6, 1, 5, 9, 5, 9, 5, 5, 1, 2, 9, 1, 3, 9, 9, 3, 5, 6, 1, 4, 1, 6, 4, 5, 2, 6, 4, 1, 5, 9, 1, 4, 3, 1, 2, 1, 2, 9, 1, 4, 3, 1, 2, 3, 6, 1, 6, 2, 2, 4, 1, 2, 6, 9, 3, 4, 2, 9, 6, 1, 3, 3, 1, 5, 4, 2, 4, 9), GROUP_2 = c(9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9), GROUP_3 = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9)), class = "data.frame", row.names = c(NA, -100L))
We can use gather
library(dplyr)
library(tidyr)
data %>%
gather('name', 'value', -MASTER_HCU_DI) %>%
count(name, value)

Create new rows to column bind

I want to combine columns of a list into a single dataframe, however, some lists are of different lengths. The maximum length is 17, and I've thought of a way around this and that's by creating a new row to match the maximum length number for column concatenation.
If row layers do not match in length, then fill the missing value between 1 and 17, and replace the values column enc_ with the number 0.
Here's a sample of the dataset:
[[1]]
layer pland_01_evergreen_needleleaf
1 1 0.016832782
2 2 0.024552628
3 3 0.024377985
4 4 0.009584417
5 5 0.013569500
6 6 0.021745836
7 7 0.024301743
8 8 0.028323187
9 9 0.029710995
10 10 0.020706332
11 11 0.025760934
12 12 0.025148797
13 13 0.028520806
14 14 0.021327549
15 15 0.024794668
16 16 0.027986949
17 17 0.022970945
[[2]]
layer pland_02_evergreen_broadleaf
1 7 0.02329869
2 11 0.02910651
3 12 0.04234851
4 13 0.02788104
5 14 0.01899742
6 15 0.02639924
7 16 0.02601143
8 17 0.03166427
My expected output:
[[1]]
layer pland_01_evergreen_needleleaf pland_02_evergreen_broadleaf
1 1 0.016832782 0
2 2 0.024552628 0
3 3 0.024377985 0
4 4 0.009584417 0
5 5 0.013569500 0
6 6 0.021745836 0
7 7 0.024301743 0.02329869
8 8 0.028323187 0
9 9 0.029710995 0
10 10 0.020706332 0
11 11 0.025760934 0.02910651
12 12 0.025148797 0.04234851
13 13 0.028520806 0.02788104
14 14 0.021327549 0.01899742
15 15 0.024794668 0.02639924
16 16 0.027986949 0.02601143
17 17 0.022970945 0.03166427
I have tried:
do.call(plyr::rbind.fill, test.enc)
Though, it does not replace the rows and just fills values in columns with NA's.
Reproducible code:
test.enc <- list(structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17), pland_01_evergreen_needleleaf = c(0.0168327818172984,
0.0245526278078456, 0.0243779845525292, 0.00958441728108318,
0.0135694997972973, 0.0217458355, 0.0243017425347303, 0.0283231869863014,
0.0297109945836134, 0.0207063315181945, 0.0257609335769293, 0.0251487967356828,
0.0285208063526021, 0.0213275492944468, 0.0247946677520666, 0.0279869491599538,
0.0229709450323356)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(7, 11, 12, 13, 14, 15, 16, 17),
pland_02_evergreen_broadleaf = c(0.0232986892474108,
0.029106514197793, 0.0423485148880614, 0.0278810399372792,
0.0189974225113402, 0.0263992402670516, 0.0260114284210526,
0.0316642657775499)), row.names = c(NA, -8L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17), pland_03_deciduous_needleleaf = c(0.0224730632077946,
0.0272254714759945, 0.0179234332099727, 0.0233360434693878,
0.0289772211061947, 0.0279319832599034, 0.0240684032409326,
0.0193554670384615, 0.0279649463078261, 0.0269396070886525,
0.0185719102763596, 0.018542528637931, 0.012709947072028,
0.04239139)), row.names = c(NA, -14L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_04_deciduous_broadleaf = c(0.0237555990295715,
0.0250673634976813, 0.0215182227341075, 0.00714736670909091,
0.0290969429050279, 0.0267860332636672, 0.0270534621613419,
0.026721714630264, 0.0238709596184027, 0.0249074332489268,
0.0304618992970835, 0.0260209517100003, 0.015865886959611,
0.0243338004003074, 0.0201179804026253, 0.0332228978795843
)), row.names = c(NA, -16L), class = "data.frame"), structure(list(
layer = c(1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17), pland_05_mixed_forest = c(0.0205357761652226,
0.0241299700965417, 0.0225027270827694, 0.00985684546268657,
0.0311072087096774, 0.0252826755994332, 0.0271736973582555,
0.0283303792425047, 0.0229465085587453, 0.0262387189000513,
0.0349808141373789, 0.0269785067137574, 0.0178032039611502,
0.0251414066142756, 0.0237955553523809, 0.0349799640745083
)), row.names = c(NA, -16L), class = "data.frame"), structure(list(
layer = c(3, 5, 6, 7, 10, 11, 13, 14, 15, 16, 17), pland_06_closed_shrubland = c(0.005861055,
0.0247702364814815, 0.0217156349945235, 0.0266147094731707,
0.0273557187764706, 0.02247895109375, 0.0314803993053339,
0.0199688156521739, 0.0250040668072976, 0.024064520016,
0.0289086554672578)), row.names = c(NA, -11L), class = "data.frame"),
structure(list(layer = c(1, 2, 5, 6, 7, 10, 13, 15, 16, 17
), pland_07_open_shrubland = c(0.0239835098420742, 0.0196024526993901,
0.0275470745648515, 0.0205289891038188, 0.0252871031854839,
0.0225145242857143, 0.0277447744846797, 0.0273150363541667,
0.0372795540909091, 0.0258269711946903)), row.names = c(NA,
-10L), class = "data.frame"), structure(list(layer = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
pland_08_woody_savanna = c(0.0234895073226773, 0.0254242177795502,
0.0222844341348828, 0.010322404308595, 0.0115202866290984,
0.022858064298995, 0.0261324981159272, 0.0269339113300467,
0.0272905667936239, 0.0243445938197004, 0.0263085547098274,
0.031577225982848, 0.027366790080755, 0.0170917603078201,
0.0245166202483043, 0.0230437328068511, 0.0302480713824274
)), row.names = c(NA, -17L), class = "data.frame"), structure(list(
layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17), pland_09_savanna = c(0.024511496338631,
0.0263438531740197, 0.0230784856467449, 0.0103841481938194,
0.0112631119225057, 0.0218656878147517, 0.0263293450194207,
0.0272377655722272, 0.0277590005710358, 0.0248185191981168,
0.0264710300465011, 0.0311785029047626, 0.027764701873438,
0.018296641767007, 0.0243240673465086, 0.0269793925823536,
0.0261431798468939)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_10_grassland = c(0.0241048000322165,
0.0257675668336232, 0.0223383845545, 0.0189068612261722,
0.0261390898788855, 0.0261454176785369, 0.0262590636755884,
0.0273476886308152, 0.0282016510452861, 0.0249749584240885,
0.0269017127896855, 0.0309276372122874, 0.0280081024050942,
0.0171571967814629, 0.024706397187938, 0.0229732030207295,
0.0271717635000233)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_11_wetland = c(0.0261045398315745,
0.0270077896857178, 0.0228967718773374, 0.0199122837701645,
0.0227976864969644, 0.0275306004374101, 0.0271334525693991,
0.0285065610334257, 0.0281986960454696, 0.0235630515843985,
0.0235566291662858, 0.0272662707441063, 0.0242547847851237,
0.020220947639907, 0.0229653844016148, 0.0189523223219292,
0.016330738598504)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_12_cropland = c(0.0247481645364914,
0.0269929124824351, 0.0233212451104437, 0.0209935752243073,
0.027662987546265, 0.0267526016850953, 0.0264659030703554,
0.0276911097027454, 0.027704723980107, 0.0258298011360007,
0.0293761963259958, 0.0304401704151498, 0.0297272977127787,
0.0191320152910558, 0.022300483848187, 0.0310418860633282,
0.0194552407910497)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_13_urban = c(0.0250541999489398,
0.0249789151674128, 0.0219325183761915, 0.0174050192638298,
0.0198481538465096, 0.0273040101927991, 0.0261348274108392,
0.0274315478205557, 0.0284040130969821, 0.0255357946798584,
0.0276680704963855, 0.0283009734389356, 0.0273947664869961,
0.0191846595896345, 0.0225736950645381, 0.0185572109335283,
0.0266912368721673)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_14_mosiac = c(0.0244553100335083,
0.0265608905797148, 0.0230754220937747, 0.0126737591788462,
0.0208868797777778, 0.0264543431506849, 0.0271490616452074,
0.0275509256793189, 0.0274870231454383, 0.0260302106124036,
0.0294514198552019, 0.0317358807321971, 0.0303629153539886,
0.0191054718841496, 0.0221332367959672, 0.0332987653767865,
0.0153846531471452)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_15_barren = c(0.0254695416164035,
0.0260217783555025, 0.0278294141356033, 0.022098210265976,
0.0232223153248193, 0.0277460892260692, 0.0280945051729643,
0.0308188510180505, 0.0283990843854084, 0.0282966180792079,
0.0292701060708535, 0.02484902225, 0.0202313840629426, 0.02730348265625,
0.0252544010927835, 0.012387523087037, 0.0243783162068618
)), row.names = c(NA, -17L), class = "data.frame"))
You can use mergeand as you have many columns to be added you can call it using Reduce.
Reduce(function(x,y) merge(x,y, all=TRUE), test.enc)
# layer pland_01_evergreen_needleleaf pland_02_evergreen_broadleaf pland_03_deciduous_needleleaf pland_04_deciduous_broadleaf pland_05_mixed_forest pland_06_closed_shrubland pland_07_open_shrubland pland_08_woody_savanna pland_09_savanna pland_10_grassland pland_11_wetland pland_12_cropland pland_13_urban pland_14_mosiac pland_15_barren
#1 1 0.016832782 NA 0.02247306 0.023755599 0.020535776 NA 0.02398351 0.02348951 0.02451150 0.02410480 0.02610454 0.02474816 0.02505420 0.02445531 0.02546954
#2 2 0.024552628 NA 0.02722547 0.025067363 0.024129970 NA 0.01960245 0.02542422 0.02634385 0.02576757 0.02700779 0.02699291 0.02497892 0.02656089 0.02602178
#3 3 0.024377985 NA 0.01792343 0.021518223 0.022502727 0.005861055 NA 0.02228443 0.02307849 0.02233838 0.02289677 0.02332125 0.02193252 0.02307542 0.02782941
#4 4 0.009584417 NA NA 0.007147367 0.009856845 NA NA 0.01032240 0.01038415 0.01890686 0.01991228 0.02099358 0.01740502 0.01267376 0.02209821
#5 5 0.013569500 NA NA NA NA 0.024770236 0.02754707 0.01152029 0.01126311 0.02613909 0.02279769 0.02766299 0.01984815 0.02088688 0.02322232
#6 6 0.021745836 NA NA 0.029096943 0.031107209 0.021715635 0.02052899 0.02285806 0.02186569 0.02614542 0.02753060 0.02675260 0.02730401 0.02645434 0.02774609
#7 7 0.024301743 0.02329869 0.02333604 0.026786033 0.025282676 0.026614709 0.02528710 0.02613250 0.02632935 0.02625906 0.02713345 0.02646590 0.02613483 0.02714906 0.02809451
#8 8 0.028323187 NA 0.02897722 0.027053462 0.027173697 NA NA 0.02693391 0.02723777 0.02734769 0.02850656 0.02769111 0.02743155 0.02755093 0.03081885
#9 9 0.029710995 NA 0.02793198 0.026721715 0.028330379 NA NA 0.02729057 0.02775900 0.02820165 0.02819870 0.02770472 0.02840401 0.02748702 0.02839908
#10 10 0.020706332 NA 0.02406840 0.023870960 0.022946509 0.027355719 0.02251452 0.02434459 0.02481852 0.02497496 0.02356305 0.02582980 0.02553579 0.02603021 0.02829662
#11 11 0.025760934 0.02910651 0.01935547 0.024907433 0.026238719 0.022478951 NA 0.02630855 0.02647103 0.02690171 0.02355663 0.02937620 0.02766807 0.02945142 0.02927011
#12 12 0.025148797 0.04234851 0.02796495 0.030461899 0.034980814 NA NA 0.03157723 0.03117850 0.03092764 0.02726627 0.03044017 0.02830097 0.03173588 0.02484902
#13 13 0.028520806 0.02788104 0.02693961 0.026020952 0.026978507 0.031480399 0.02774477 0.02736679 0.02776470 0.02800810 0.02425478 0.02972730 0.02739477 0.03036292 0.02023138
#14 14 0.021327549 0.01899742 0.01857191 0.015865887 0.017803204 0.019968816 NA 0.01709176 0.01829664 0.01715720 0.02022095 0.01913202 0.01918466 0.01910547 0.02730348
#15 15 0.024794668 0.02639924 0.01854253 0.024333800 0.025141407 0.025004067 0.02731504 0.02451662 0.02432407 0.02470640 0.02296538 0.02230048 0.02257370 0.02213324 0.02525440
#16 16 0.027986949 0.02601143 0.01270995 0.020117980 0.023795555 0.024064520 0.03727955 0.02304373 0.02697939 0.02297320 0.01895232 0.03104189 0.01855721 0.03329877 0.01238752
#17 17 0.022970945 0.03166427 0.04239139 0.033222898 0.034979964 0.028908655 0.02582697 0.03024807 0.02614318 0.02717176 0.01633074 0.01945524 0.02669124 0.01538465 0.02437832

Tried code in R with mutate_at and max() functions with own data. Warning messages come up: no non-missing arguments to max

I'm curretly learning R with a book and was trying a mutate_at function from dplyr. In this example I want to standardize the survey items on a scale from 0 to 1. To do this, we can divide each value by the (theoretical) maximum value of the scale.
The book example stats_test from the package "pradadata" works perfectly fine:
data(stats_test, package = "pradadata")
stats_test %>%
drop_na() %>%
mutate_at(.vars = vars(study_time, self_eval, interest),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
study_time_prop self_eval_prop interest_prop
<dbl> <dbl> <dbl>
1 0.6 0.7 0.667
2 0.8 0.8 0.833
3 0.6 0.4 0.167
4 0.8 0.7 0.833
5 0.4 0.6 0.5
6 0.4 0.6 0.667
7 0.8 0.6 0.5
8 0.2 0.7 0.667
9 0.6 0.8 0.833
10 0.6 0.7 0.833
# ... with 1,617 more rows
Tried the same code with my own data but it doesn't work and I can't figure out why. The variable RG04 from my data has a range from 1-5. I tried to transform the variable from numeric to integer, because the variables from the the data stats_test are integer too:
df_literacy_2 <- transform(df_literacy, RG04 = as.integer(RG04))
df_literacy_2 <- tibble(df_literacy_2)
df_literacy_2 %>%
drop_na() %>%
mutate_at(.vars = vars(RG04),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
# A tibble: 0 x 0
Warning messages:
1: Problem with `mutate()` input `prop`.
i no non-missing arguments to max; returning -Inf
i Input `prop` is `RG04/max(RG04)`.
2: In base::max(x, ..., na.rm = na.rm) :
no non-missing arguments to max; returning -Inf
str(df_literacy_2$RG04)
int [1:630] 2 4 2 1 2 2 1 3 1 3 ...
Why doesn't it work on my data?
Thank you for your help.
Edit with sample of df_literacy:
> dput(head(df_literacy,20))
structure(list(CASE = c(40, 41, 44, 45, 48, 49, 54, 55, 56, 57,
58, 61, 62, 63, 64, 65, 66, 67, 68, 69), SERIAL = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), REF = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), QUESTNNR = c("base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base"), MODE = c("interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview"), STARTED = structure(c(1607290462,
1607290608, 1607291086, 1607291118, 1607291265, 1607291793, 1607294071,
1607294336, 1607294337, 1607294419, 1607294814, 1607296474, 1607301809,
1607329348, 1607333933, 1607335996, 1607336207, 1607336378, 1607343194,
1607343414), tzone = "UTC", class = c("POSIXct", "POSIXt")),
EI01 = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("Ja",
"Nein", "Nicht beantwortet"), class = "factor"), EI02 = c(2,
2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 3),
RF01 = c(4, 2, 4, 3, 4, 4, 1, 3, 2, 3, 4, 3, 2, 3, 2, 2,
4, 2, 5, 3), RF02 = c(1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1,
1, 1, 2, 2, 2, 2, 2, 2), RF03 = c(1, 2, 2, 2, 1, 2, 1, 1,
1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2), RG01 = c(2, 2, 2, 2,
2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2), RG02 = c(3,
3, 3, 3, 4, 3, 4, 2, 4, 2, 3, 4, 4, 2, 4, 3, 4, 3, 4, 4),
RG03 = c(3, 2, 2, 3, 3, 3, 1, 3, 1, 2, 3, 1, 2, 2, 1, 3,
2, 3, 2, 2), RG04 = c(2, 4, 2, 1, 2, 2, 1, 3, 1, 3, 2, 4,
1, 1, 1, 1, 1, 2, 4, 1), RG05 = c(1, 1, 1, 1, 1, 1, 1, 2,
1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1), SD01 = structure(c(2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L), .Label = c("weiblich", "männlich", "divers",
"nicht beantwortet"), class = "factor"), SD03 = c(4, 3, 2,
2, 1, 2, 4, 4, 1, 4, 3, 1, 2, 3, 2, 4, 2, 3, 1, 3), SD05_01 = c(23,
22, 22, 21, 18, 22, 21, 27, 17, 22, 17, 21, 21, 22, 50, 25,
23, 20, 23, 23), TIME001 = c(2, 3, 23, 73, 29, 2, 3, 3, 29, 7,
50, 55, 3, 2, 10, 2, 1, 5, 7, 35), TIME002 = c(2, 2, 16,
34, 12, 14, 2, 2, 21, 2, 30, 24, 21, 3, 3, 2, 3, 2, 3, 22
), TIME003 = c(34, 8, 12, 15, 13, 12, 12, 7, 13, 11, 16,
10, 11, 16, 8, 8, 7, 8, 11, 14), TIME004 = c(60, 33, 25,
31, 45, 25, 14, 13, 38, 35, 50, 50, 37, 32, 32, 25, 72, 55,
28, 29), TIME005 = c(84, 21, 29, 41, 54, 33, 30, 22, 32,
42, 44, 23, 65, 30, 28, 32, 51, 31, 27, 44), TIME006 = c(14,
9, 27, 11, 24, 8, 8, 9, 18, 12, 35, 33, 27, 46, 11, 15, 8,
14, 12, 14), TIME007 = c(3, 18, 3, 5, 6, 2, 9, 2, 3, 3, 6,
7, 3, 13, 4, 4, 378, 3, 4, 10), TIME_SUM = c(199, 94, 135,
142, 183, 96, 78, 58, 154, 112, 186, 152, 167, 142, 96, 88,
146, 118, 92, 168), MAILSENT = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
LASTDATA = structure(c(1607290661, 1607290702, 1607291221,
1607291328, 1607291448, 1607291889, 1607294149, 1607294394,
1607294491, 1607294531, 1607295045, 1607296676, 1607301976,
1607329490, 1607334030, 1607336084, 1607336727, 1607336496,
1607343286, 1607343582), tzone = "UTC", class = c("POSIXct",
"POSIXt")), FINISHED = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1), Q_VIEWER = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), LASTPAGE = c(7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
MAXPAGE = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7), MISSING = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 0, 7, 7, 7), MISSREL = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1), TIME_RSI = c("46023",
"14246", "0.75", "0.63", "0.54", "12055", "17533", "30682",
"0.7", "44197", "0.45", "0.58", "0.83", "44378", "44501",
"18629", "46753", "46388", "44197", "0.57"), DEG_TIME = c(27,
27, 3, 1, 0, 23, 30, 42, 2, 17, 0, 2, 7, 18, 10, 27, 43,
18, 8, 0)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Edit with TRUE and FALSE NAs:
> sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02 RG03 RG04 RG05 SD01 SD03 SD05_01 TE03_01 TIME001 TIME002 TIME003
FALSE 630 0 0 630 630 630 630 630 630 630 630 630 630 630 630 630 629 629 615 99 630 630 630
TRUE 0 630 630 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 15 531 0 0 0
TIME004 TIME005 TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
FALSE 630 630 629 625 630 0 630 630 630 630 630 630 630 630 630
TRUE 0 0 1 5 0 630 0 0 0 0 0 0 0 0 0
There are a few things to correct here.
drop_na() is removing all of your data.
drop_na(df_literacy)
# # A tibble: 0 x 37
# # ... with 37 variables: CASE <dbl>, SERIAL <lgl>, REF <lgl>, QUESTNNR <chr>,
# # MODE <chr>, STARTED <dttm>, EI01 <fct>, EI02 <dbl>, RF01 <dbl>, RF02 <dbl>,
# # RF03 <dbl>, RG01 <dbl>, RG02 <dbl>, RG03 <dbl>, RG04 <dbl>, RG05 <dbl>,
# # SD01 <fct>, SD03 <dbl>, SD05_01 <dbl>, TIME001 <dbl>, TIME002 <dbl>,
# # TIME003 <dbl>, TIME004 <dbl>, TIME005 <dbl>, TIME006 <dbl>, TIME007 <dbl>,
# # TIME_SUM <dbl>, MAILSENT <lgl>, LASTDATA <dttm>, FINISHED <dbl>,
# # Q_VIEWER <dbl>, LASTPAGE <dbl>, MAXPAGE <dbl>, MISSING <dbl>,
# # MISSREL <dbl>, TIME_RSI <chr>, DEG_TIME <dbl>
The problem is that you have several columns that are completely NA, namely SERIAL, REF, and MAILSENT.
sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
# CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02
# FALSE 20 0 0 20 20 20 20 20 20 20 20 20 20
# TRUE 0 20 20 0 0 0 0 0 0 0 0 0 0
# RG03 RG04 RG05 SD01 SD03 SD05_01 TIME001 TIME002 TIME003 TIME004 TIME005
# FALSE 20 20 20 20 20 20 20 20 20 20 20
# TRUE 0 0 0 0 0 0 0 0 0 0 0
# TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE
# FALSE 20 20 20 0 20 20 20 20
# TRUE 0 0 0 20 0 0 0 0
# MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
# FALSE 20 20 20 20 20
# TRUE 0 0 0 0 0
Drop the drop_na(), or at least drop_na(-SERIAL, -REF, -MAILSENT).
Your code is using funs, which has been deprecated since dplyr-0.8.0.
# Warning: `funs()` is deprecated as of dplyr 0.8.0.
# Please use a list of either functions or lambdas:
# # Simple named list:
# list(mean = mean, median = median)
# # Auto named with `tibble::lst()`:
# tibble::lst(mean, median)
# # Using lambdas
# list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
While this isn't causing an error, it is causing a warning (and will likely stop working at some point. Change your mutate_at to be:
mutate_at(.vars = vars(RG04, RF02),
.funs = list(prop = ~ . / max(.)))
You are using a single variable within .vars and a single function within .funs, so the column names are preserved as-is (and you will not see a _prop column). From ?mutate_at:
The names of the new columns are derived from the names of the
input variables and the names of the functions.
• if there is only one unnamed function (i.e. if '.funs' is an
unnamed list of length one), the names of the input variables
are used to name the new columns;
• for _at functions, if there is only one unnamed variable
(i.e., if '.vars' is of the form 'vars(a_single_column)') and
'.funs' has length greater than one, the names of the
functions are used to name the new columns;
• otherwise, the new names are created by concatenating the
names of the input variables and the names of the functions,
separated with an underscore '"_"'.
If you aren't going to add more variables and functions, then you need to self-name it in the call, as in mutate_at(.vars = vars(RG04 = RG04), ...). Oddly enough, this causes it to produce RG04_prop.
If we fix all of those, then it works.
df_literacy %>%
drop_na(-SERIAL, -REF, -MAILSENT) %>%
mutate_at(.vars = vars(RG04 = RG04),
.funs = list(prop = ~ ./max(.))) %>%
select(contains("_prop")) %>%
head(3)
# A tibble: 3 x 1
# RG04_prop
# <dbl>
# 1 0.5
# 2 1
# 3 0.5

Working across two dataframes: Apply or for-loop?

I have two dataframes and one function. The function is supposed to take the variables start_month & end_month, select for each row the values in the second dataframe in the month-column, calculate the rate_of_change between each start_month and end_month variable in a given year. Finally calculate the mean(rate_of_change) and place it into the first dataframe as a new variable in the vector average_ratio.
So far I've created a code that calculates the average ratio, but I can't manage to put it into a for loop or an apply function so that the loop runs through the whole first data frame. I have two ideas, but they don't work so far.
structure(Total) # Df containing total combinations of all existing month starting in September
.
i | start_month | end_month | average_ratio (expected output)
1 | 9 | 10 | -23
2 | 9 | 11 | 13
3 | 9 | 12 | -4
4 | 9 | 1 |
5 | 9 | 2 | # ... with 61 more rows
and
structure(Cologne)
# A tibble: 3,000 x 4
year month price town (rate of change)
<dbl> <dbl> <dbl> <chr>
1 1531 7 7575 Cologne
2 1531 8 588 Cologne
3 1531 9 615 Cologne
4 1531 10 69 Cologne -88%
5 1531 11 712 Cologne
6 1531 12 590 Cologne
7 1532 1 72 Cologne
8 1532 2 675 Cologne
9 1532 3 6933 Cologne
10 1532 4 54 Cologne
11 1532 5 425 Cologne
12 1532 6 12 Cologne
13 1532 7 323 Cologne
14 1532 8 32 Cologne
15 1532 9 58 Cologne
16 1532 10 84 Cologne 42%
# ... with 2,990 more rows
# rate of change function
rateofchange <- function(x,y) {
((x-y)/y)*100
}
# avg_ratio function
avg_ratio <- function(x,y,z) {
dt.frame <- filter(x, month==y | month==z)
pre_p <- lag(dt.frame$price, 1)
dt.frame <- cbind(dt.frame, pre_p)
for (i in 1:nrow(dt.frame)) {
dt.frame$roc <- rateofchange(dt.frame$price,dt.frame$pre_p)
}
result <- mean(dt.frame$roc,na.rm=TRUE)
return(result)
}
May_Aug <- avg_ratio(Cologne, 5,7)
################ works until here ################
# Now, Idea 1
Total <- Total %>%
mutate(Total, ratio = avg_ratio(Cologne,Total$start_mth,Total$end_mth)
)
Warning messages:
1: In month == y :
longer object length is not a multiple of shorter object length
2: In month == z :
longer object length is not a multiple of shorter object length
# and Idea 2
ratio <- c()
Total_new <- for(i in 1:nrow(Total)) {
ratio [i] <- c(ratio, avg_ratio(Cologne,Total$start_mth[i],Total$end_mth[i]))
return(cbind(Total,ratio))
}
> dput(Cologne[1:20,])
structure(list(year = c(1531, 1531, 1531, 1531, 1531, 1531, 1532,
1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532,
1533, 1533), month = c(7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 1, 2), price = c(7575, 588, 615, 69, 712,
72, 72, 675, 6933, 70, 656, 66, 62, 48, 48, 462, 45, 45, 456,
46), town = c("Cologne", "Cologne", "Cologne", "Cologne", "Cologne",
"Cologne", "Cologne", "Cologne", "Cologne", "Cologne", "Cologne",
"Cologne", "Cologne", "Cologne", "Cologne", "Cologne", "Cologne",
"Cologne", "Cologne", "Cologne")), spec = structure(list(cols = list(
Jahr = structure(list(), class = c("collector_double", "collector"
)), Monat = structure(list(), class = c("collector_double",
"collector")), cologne_wheat_monthly = structure(list(), class = c("collector_number",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"), row.names = c(NA,
20L), class = c("tbl_df", "tbl", "data.frame"))
> dput(Total) structure(list(start_mth = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7), end_mth = c(10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 4, 5, 6, 7, 8, 5, 6, 7, 8, 6, 7, 8, 7, 8, 8)), class = "data.frame", row.names = c(NA, -66L))
You can do:
Total$average_ratio <- mapply(avg_ratio, y = Total$start_mth, z = Total$end_mth, MoreArgs = list(x = cologne))
Your function is not vectorized, that's why this doesn't work:
Total <- Total %>%
mutate(ratio = avg_ratio(cologne, start_mth, end_mth))
The mapply() function iterates (or vectorizes) through the arguments provided, you don't want to iterate over cologne however, that's why you pass it inside MoreArgs = , so it gets taken as it is.

Resources