Identifying sequences per column - r

I am working with time-use data and want to calculate the duration of a started measurement at each time step (per column) and select the longest duration for each measurement. The measurement are numbered from 1 to 27. The length is weighted with 1 (e.g increment is set to 1). I am not sure how to handle if a measurement is fragmented and has multiple durations times.
Data format:
Desired output (example for the measurement number 1):
Time Measurement Duration
04:00 1 1
04:10 1 1
04:20 1 2
04:20 1 2
04:20 1 2
Longest duration
Time Measurement Duration
04:20 1 2
Sample data:
df<-structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14), `04:00` = c(1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11), `04:10` = c(1, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11), `04:20` = c(1, 11, 1, 1, 11, 11, 11, 11, 11,
1, 1, 11, 11, 11), `04:30` = c(1, 11, 1, 1, 3, 11, 11, 11, 11,
1, 1, 13, 11, 11), `04:40` = c(1, 11, 1, 1, 3, 12, 11, 11, 4,
1, 1, 13, 4, 11), `04:50` = c(4, 11, 11, 11, 3, 12, 11, 11, 4,
11, 11, 13, 4, 11), `05:00` = c(4, 11, 11, 11, 3, 12, 11, 11,
4, 13, 11, 13, 4, 11), `05:10` = c(4, 11, 11, 11, 3, 12, 11,
11, 4, 13, 11, 13, 4, 11), `05:20` = c(4, 11, 11, 11, 11, 13,
4, 11, 4, 13, 11, 13, 4, 11), `05:30` = c(4, 11, 11, 11, 11,
13, 4, 13, 4, 13, 11, 1, 4, 13), `05:40` = c(4, 11, 3, 11, 11,
13, 4, 13, 11, 13, 11, 1, 1, 13), `05:50` = c(11, 11, 3, 11,
11, 13, 4, 13, 11, 13, 11, 1, 11, 13), `06:00` = c(11, 1, 3,
11, 11, 13, 4, 13, 1, 11, 11, 11, 11, 13), `06:10` = c(11, 1,
3, 11, 11, 13, 4, 13, 1, 11, 11, 11, 11, 13), `06:20` = c(11,
1, 3, 11, 11, 11, 11, 13, 1, 11, 11, 11, 11, 13)), row.names = c(NA,
-14L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), `04:00` = structure(list(), class = c("collector_double",
"collector")), `04:10` = structure(list(), class = c("collector_double",
"collector")), `04:20` = structure(list(), class = c("collector_double",
"collector")), `04:30` = structure(list(), class = c("collector_double",
"collector")), `04:40` = structure(list(), class = c("collector_double",
"collector")), `04:50` = structure(list(), class = c("collector_double",
"collector")), `05:00` = structure(list(), class = c("collector_double",
"collector")), `05:10` = structure(list(), class = c("collector_double",
"collector")), `05:20` = structure(list(), class = c("collector_double",
"collector")), `05:30` = structure(list(), class = c("collector_double",
"collector")), `05:40` = structure(list(), class = c("collector_double",
"collector")), `05:50` = structure(list(), class = c("collector_double",
"collector")), `06:00` = structure(list(), class = c("collector_double",
"collector")), `06:10` = structure(list(), class = c("collector_double",
"collector")), `06:20` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

Here's a function, mainly using rle, that will get you the desired output for a specific measurement:
f <- function(n){
l <- lapply(df[-1], \(x) with(rle(x), lengths[values == n]))
enframe(l, name = "Time", value = "Duration") %>%
unnest("Duration") %>%
mutate(Measurement = n, .before = "Duration")
}
output
> f(1)
# A tibble: 20 × 3
Time Measurement Duration
<chr> <dbl> <int>
1 04:00 1 1
2 04:10 1 1
3 04:20 1 1
4 04:20 1 2
5 04:20 1 2
6 04:30 1 1
7 04:30 1 2
8 04:30 1 2
9 04:40 1 1
10 04:40 1 2
11 04:40 1 2
12 05:30 1 1
13 05:40 1 2
14 05:50 1 1
15 06:00 1 1
16 06:00 1 1
17 06:10 1 1
18 06:10 1 1
19 06:20 1 1
20 06:20 1 1
Get the maximum with slice_max:
f(1) %>%
slice_max(Duration, n = 1, with_ties = F)
# A tibble: 1 × 3
Time Measurement Duration
<chr> <dbl> <int>
1 04:20 1 2

library(tidyverse)
library(lubridate)
df %>%
pivot_longer(-id, names_to = "timepoint", values_to = "Measurement") %>%
arrange(id, Measurement) %>%
type_convert() %>%
group_by(id) %>%
# Duration to first time point for each id
mutate(Duration = timepoint - min(timepoint)) %>%
# get the longest duration
filter(Duration == max(Duration))
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> timepoint = col_time(format = "")
#> )
#> # A tibble: 14 × 4
#> # Groups: id [14]
#> id timepoint Measurement Duration
#> <dbl> <time> <dbl> <drtn>
#> 1 1 06:20 11 8400 secs
#> 2 2 06:20 1 8400 secs
#> 3 3 06:20 3 8400 secs
#> 4 4 06:20 11 8400 secs
#> 5 5 06:20 11 8400 secs
#> 6 6 06:20 11 8400 secs
#> 7 7 06:20 11 8400 secs
#> 8 8 06:20 13 8400 secs
#> 9 9 06:20 1 8400 secs
#> 10 10 06:20 11 8400 secs
#> 11 11 06:20 11 8400 secs
#> 12 12 06:20 11 8400 secs
#> 13 13 06:20 11 8400 secs
#> 14 14 06:20 13 8400 secs
Created on 2022-05-16 by the reprex package (v2.0.0)

Related

Identifying start-end in a time use data

I am working with time-use data and want to calculate the duration of a started measurement at each step per id. I was considering using the colsum function, but I am not sure how to handle if an id an activity is fragmented and has multiple starting times.
Example of output for id 1 and 10
id start-end duration
1 04:00-06:20 15
10 04:00-4:10 2
10 04:50-06:20 10
Sample data:
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14), `04:00` = c(11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11), `04:10` = c(11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11), `04:20` = c(11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11), `04:30` = c(11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:40` = c(11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:00` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:10` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:20` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:30` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:40` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `06:00` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:10` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:20` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11)), row.names = c(NA,
-14L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), `04:00` = structure(list(), class = c("collector_double",
"collector")), `04:10` = structure(list(), class = c("collector_double",
"collector")), `04:20` = structure(list(), class = c("collector_double",
"collector")), `04:30` = structure(list(), class = c("collector_double",
"collector")), `04:40` = structure(list(), class = c("collector_double",
"collector")), `04:50` = structure(list(), class = c("collector_double",
"collector")), `05:00` = structure(list(), class = c("collector_double",
"collector")), `05:10` = structure(list(), class = c("collector_double",
"collector")), `05:20` = structure(list(), class = c("collector_double",
"collector")), `05:30` = structure(list(), class = c("collector_double",
"collector")), `05:40` = structure(list(), class = c("collector_double",
"collector")), `05:50` = structure(list(), class = c("collector_double",
"collector")), `06:00` = structure(list(), class = c("collector_double",
"collector")), `06:10` = structure(list(), class = c("collector_double",
"collector")), `06:20` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Assume df is the provided dataset:
library(tidyverse)
df %>%
pivot_longer(cols = -id) %>%
mutate(id_interval = value == 0,
id_interval = cumsum(id_interval) + id) %>%
filter(value != 0) %>%
group_by(id_interval) %>%
summarise(
id = unique(id),
`start-end` = str_c(head(name, 1), tail(name, 1), sep = "-"),
duration = n()) %>%
select(-id_interval)
This yields
# A tibble: 14 × 3
id `start-end` duration
<dbl> <chr> <int>
1 1 04:00-06:20 15
2 2 04:00-05:50 12
3 3 04:00-06:20 15
4 4 04:00-06:20 15
5 5 04:00-06:20 15
6 6 04:00-06:20 15
7 7 04:00-06:20 15
8 8 04:00-05:50 12
9 9 04:00-05:50 12
10 10 04:00-06:20 15
11 11 04:00-06:20 15
12 12 04:00-06:20 15
13 13 04:00-06:20 15
14 14 04:00-06:20 15
Note that the provided dput input does not correspond to the image above.
It looks more like this:
solution using data.table
library(data.table)
setDT(dt)
# your sample data does not illustrate it well like on your screenshot
# add some "breaks" on second row, col 4-6
dt[2,4:6] <- 0
dl <- melt(dt, id.vars = "id")
setorder(dl, id, variable)
dl[, .(`start-end` = paste0(first(variable), "-", last(variable)), duration = .N), by = .(id, rleid(value), value > 0)][value == T, .(id, `start-end`, duration)]
output
# id start-end duration
# 1: 1 04:00-06:20 15
# 2: 2 04:00-04:10 2
# 3: 2 04:50-05:50 7
# 4: 3 04:00-06:20 15
# 5: 4 04:00-06:20 15
# 6: 5 04:00-06:20 15
# 7: 6 04:00-06:20 15
# 8: 7 04:00-06:20 15
# 9: 8 04:00-05:50 12
# 10: 9 04:00-05:50 12
# 11: 10 04:00-06:20 15
# 12: 11 04:00-06:20 15
# 13: 12 04:00-06:20 15
# 14: 13 04:00-06:20 15
# 15: 14 04:00-06:20 15
data
dt <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14), `04:00` = c(11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11), `04:10` = c(11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11), `04:20` = c(11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11), `04:30` = c(11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:40` = c(11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `04:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:00` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:10` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:20` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:30` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:40` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `05:50` = c(11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11), `06:00` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:10` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11), `06:20` = c(11,
0, 11, 11, 11, 11, 11, 0, 0, 11, 11, 11, 11, 11)), row.names = c(NA,
-14L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), `04:00` = structure(list(), class = c("collector_double",
"collector")), `04:10` = structure(list(), class = c("collector_double",
"collector")), `04:20` = structure(list(), class = c("collector_double",
"collector")), `04:30` = structure(list(), class = c("collector_double",
"collector")), `04:40` = structure(list(), class = c("collector_double",
"collector")), `04:50` = structure(list(), class = c("collector_double",
"collector")), `05:00` = structure(list(), class = c("collector_double",
"collector")), `05:10` = structure(list(), class = c("collector_double",
"collector")), `05:20` = structure(list(), class = c("collector_double",
"collector")), `05:30` = structure(list(), class = c("collector_double",
"collector")), `05:40` = structure(list(), class = c("collector_double",
"collector")), `05:50` = structure(list(), class = c("collector_double",
"collector")), `06:00` = structure(list(), class = c("collector_double",
"collector")), `06:10` = structure(list(), class = c("collector_double",
"collector")), `06:20` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

Create new rows to column bind

I want to combine columns of a list into a single dataframe, however, some lists are of different lengths. The maximum length is 17, and I've thought of a way around this and that's by creating a new row to match the maximum length number for column concatenation.
If row layers do not match in length, then fill the missing value between 1 and 17, and replace the values column enc_ with the number 0.
Here's a sample of the dataset:
[[1]]
layer pland_01_evergreen_needleleaf
1 1 0.016832782
2 2 0.024552628
3 3 0.024377985
4 4 0.009584417
5 5 0.013569500
6 6 0.021745836
7 7 0.024301743
8 8 0.028323187
9 9 0.029710995
10 10 0.020706332
11 11 0.025760934
12 12 0.025148797
13 13 0.028520806
14 14 0.021327549
15 15 0.024794668
16 16 0.027986949
17 17 0.022970945
[[2]]
layer pland_02_evergreen_broadleaf
1 7 0.02329869
2 11 0.02910651
3 12 0.04234851
4 13 0.02788104
5 14 0.01899742
6 15 0.02639924
7 16 0.02601143
8 17 0.03166427
My expected output:
[[1]]
layer pland_01_evergreen_needleleaf pland_02_evergreen_broadleaf
1 1 0.016832782 0
2 2 0.024552628 0
3 3 0.024377985 0
4 4 0.009584417 0
5 5 0.013569500 0
6 6 0.021745836 0
7 7 0.024301743 0.02329869
8 8 0.028323187 0
9 9 0.029710995 0
10 10 0.020706332 0
11 11 0.025760934 0.02910651
12 12 0.025148797 0.04234851
13 13 0.028520806 0.02788104
14 14 0.021327549 0.01899742
15 15 0.024794668 0.02639924
16 16 0.027986949 0.02601143
17 17 0.022970945 0.03166427
I have tried:
do.call(plyr::rbind.fill, test.enc)
Though, it does not replace the rows and just fills values in columns with NA's.
Reproducible code:
test.enc <- list(structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17), pland_01_evergreen_needleleaf = c(0.0168327818172984,
0.0245526278078456, 0.0243779845525292, 0.00958441728108318,
0.0135694997972973, 0.0217458355, 0.0243017425347303, 0.0283231869863014,
0.0297109945836134, 0.0207063315181945, 0.0257609335769293, 0.0251487967356828,
0.0285208063526021, 0.0213275492944468, 0.0247946677520666, 0.0279869491599538,
0.0229709450323356)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(7, 11, 12, 13, 14, 15, 16, 17),
pland_02_evergreen_broadleaf = c(0.0232986892474108,
0.029106514197793, 0.0423485148880614, 0.0278810399372792,
0.0189974225113402, 0.0263992402670516, 0.0260114284210526,
0.0316642657775499)), row.names = c(NA, -8L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17), pland_03_deciduous_needleleaf = c(0.0224730632077946,
0.0272254714759945, 0.0179234332099727, 0.0233360434693878,
0.0289772211061947, 0.0279319832599034, 0.0240684032409326,
0.0193554670384615, 0.0279649463078261, 0.0269396070886525,
0.0185719102763596, 0.018542528637931, 0.012709947072028,
0.04239139)), row.names = c(NA, -14L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_04_deciduous_broadleaf = c(0.0237555990295715,
0.0250673634976813, 0.0215182227341075, 0.00714736670909091,
0.0290969429050279, 0.0267860332636672, 0.0270534621613419,
0.026721714630264, 0.0238709596184027, 0.0249074332489268,
0.0304618992970835, 0.0260209517100003, 0.015865886959611,
0.0243338004003074, 0.0201179804026253, 0.0332228978795843
)), row.names = c(NA, -16L), class = "data.frame"), structure(list(
layer = c(1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17), pland_05_mixed_forest = c(0.0205357761652226,
0.0241299700965417, 0.0225027270827694, 0.00985684546268657,
0.0311072087096774, 0.0252826755994332, 0.0271736973582555,
0.0283303792425047, 0.0229465085587453, 0.0262387189000513,
0.0349808141373789, 0.0269785067137574, 0.0178032039611502,
0.0251414066142756, 0.0237955553523809, 0.0349799640745083
)), row.names = c(NA, -16L), class = "data.frame"), structure(list(
layer = c(3, 5, 6, 7, 10, 11, 13, 14, 15, 16, 17), pland_06_closed_shrubland = c(0.005861055,
0.0247702364814815, 0.0217156349945235, 0.0266147094731707,
0.0273557187764706, 0.02247895109375, 0.0314803993053339,
0.0199688156521739, 0.0250040668072976, 0.024064520016,
0.0289086554672578)), row.names = c(NA, -11L), class = "data.frame"),
structure(list(layer = c(1, 2, 5, 6, 7, 10, 13, 15, 16, 17
), pland_07_open_shrubland = c(0.0239835098420742, 0.0196024526993901,
0.0275470745648515, 0.0205289891038188, 0.0252871031854839,
0.0225145242857143, 0.0277447744846797, 0.0273150363541667,
0.0372795540909091, 0.0258269711946903)), row.names = c(NA,
-10L), class = "data.frame"), structure(list(layer = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
pland_08_woody_savanna = c(0.0234895073226773, 0.0254242177795502,
0.0222844341348828, 0.010322404308595, 0.0115202866290984,
0.022858064298995, 0.0261324981159272, 0.0269339113300467,
0.0272905667936239, 0.0243445938197004, 0.0263085547098274,
0.031577225982848, 0.027366790080755, 0.0170917603078201,
0.0245166202483043, 0.0230437328068511, 0.0302480713824274
)), row.names = c(NA, -17L), class = "data.frame"), structure(list(
layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17), pland_09_savanna = c(0.024511496338631,
0.0263438531740197, 0.0230784856467449, 0.0103841481938194,
0.0112631119225057, 0.0218656878147517, 0.0263293450194207,
0.0272377655722272, 0.0277590005710358, 0.0248185191981168,
0.0264710300465011, 0.0311785029047626, 0.027764701873438,
0.018296641767007, 0.0243240673465086, 0.0269793925823536,
0.0261431798468939)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_10_grassland = c(0.0241048000322165,
0.0257675668336232, 0.0223383845545, 0.0189068612261722,
0.0261390898788855, 0.0261454176785369, 0.0262590636755884,
0.0273476886308152, 0.0282016510452861, 0.0249749584240885,
0.0269017127896855, 0.0309276372122874, 0.0280081024050942,
0.0171571967814629, 0.024706397187938, 0.0229732030207295,
0.0271717635000233)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_11_wetland = c(0.0261045398315745,
0.0270077896857178, 0.0228967718773374, 0.0199122837701645,
0.0227976864969644, 0.0275306004374101, 0.0271334525693991,
0.0285065610334257, 0.0281986960454696, 0.0235630515843985,
0.0235566291662858, 0.0272662707441063, 0.0242547847851237,
0.020220947639907, 0.0229653844016148, 0.0189523223219292,
0.016330738598504)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_12_cropland = c(0.0247481645364914,
0.0269929124824351, 0.0233212451104437, 0.0209935752243073,
0.027662987546265, 0.0267526016850953, 0.0264659030703554,
0.0276911097027454, 0.027704723980107, 0.0258298011360007,
0.0293761963259958, 0.0304401704151498, 0.0297272977127787,
0.0191320152910558, 0.022300483848187, 0.0310418860633282,
0.0194552407910497)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_13_urban = c(0.0250541999489398,
0.0249789151674128, 0.0219325183761915, 0.0174050192638298,
0.0198481538465096, 0.0273040101927991, 0.0261348274108392,
0.0274315478205557, 0.0284040130969821, 0.0255357946798584,
0.0276680704963855, 0.0283009734389356, 0.0273947664869961,
0.0191846595896345, 0.0225736950645381, 0.0185572109335283,
0.0266912368721673)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_14_mosiac = c(0.0244553100335083,
0.0265608905797148, 0.0230754220937747, 0.0126737591788462,
0.0208868797777778, 0.0264543431506849, 0.0271490616452074,
0.0275509256793189, 0.0274870231454383, 0.0260302106124036,
0.0294514198552019, 0.0317358807321971, 0.0303629153539886,
0.0191054718841496, 0.0221332367959672, 0.0332987653767865,
0.0153846531471452)), row.names = c(NA, -17L), class = "data.frame"),
structure(list(layer = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17), pland_15_barren = c(0.0254695416164035,
0.0260217783555025, 0.0278294141356033, 0.022098210265976,
0.0232223153248193, 0.0277460892260692, 0.0280945051729643,
0.0308188510180505, 0.0283990843854084, 0.0282966180792079,
0.0292701060708535, 0.02484902225, 0.0202313840629426, 0.02730348265625,
0.0252544010927835, 0.012387523087037, 0.0243783162068618
)), row.names = c(NA, -17L), class = "data.frame"))
You can use mergeand as you have many columns to be added you can call it using Reduce.
Reduce(function(x,y) merge(x,y, all=TRUE), test.enc)
# layer pland_01_evergreen_needleleaf pland_02_evergreen_broadleaf pland_03_deciduous_needleleaf pland_04_deciduous_broadleaf pland_05_mixed_forest pland_06_closed_shrubland pland_07_open_shrubland pland_08_woody_savanna pland_09_savanna pland_10_grassland pland_11_wetland pland_12_cropland pland_13_urban pland_14_mosiac pland_15_barren
#1 1 0.016832782 NA 0.02247306 0.023755599 0.020535776 NA 0.02398351 0.02348951 0.02451150 0.02410480 0.02610454 0.02474816 0.02505420 0.02445531 0.02546954
#2 2 0.024552628 NA 0.02722547 0.025067363 0.024129970 NA 0.01960245 0.02542422 0.02634385 0.02576757 0.02700779 0.02699291 0.02497892 0.02656089 0.02602178
#3 3 0.024377985 NA 0.01792343 0.021518223 0.022502727 0.005861055 NA 0.02228443 0.02307849 0.02233838 0.02289677 0.02332125 0.02193252 0.02307542 0.02782941
#4 4 0.009584417 NA NA 0.007147367 0.009856845 NA NA 0.01032240 0.01038415 0.01890686 0.01991228 0.02099358 0.01740502 0.01267376 0.02209821
#5 5 0.013569500 NA NA NA NA 0.024770236 0.02754707 0.01152029 0.01126311 0.02613909 0.02279769 0.02766299 0.01984815 0.02088688 0.02322232
#6 6 0.021745836 NA NA 0.029096943 0.031107209 0.021715635 0.02052899 0.02285806 0.02186569 0.02614542 0.02753060 0.02675260 0.02730401 0.02645434 0.02774609
#7 7 0.024301743 0.02329869 0.02333604 0.026786033 0.025282676 0.026614709 0.02528710 0.02613250 0.02632935 0.02625906 0.02713345 0.02646590 0.02613483 0.02714906 0.02809451
#8 8 0.028323187 NA 0.02897722 0.027053462 0.027173697 NA NA 0.02693391 0.02723777 0.02734769 0.02850656 0.02769111 0.02743155 0.02755093 0.03081885
#9 9 0.029710995 NA 0.02793198 0.026721715 0.028330379 NA NA 0.02729057 0.02775900 0.02820165 0.02819870 0.02770472 0.02840401 0.02748702 0.02839908
#10 10 0.020706332 NA 0.02406840 0.023870960 0.022946509 0.027355719 0.02251452 0.02434459 0.02481852 0.02497496 0.02356305 0.02582980 0.02553579 0.02603021 0.02829662
#11 11 0.025760934 0.02910651 0.01935547 0.024907433 0.026238719 0.022478951 NA 0.02630855 0.02647103 0.02690171 0.02355663 0.02937620 0.02766807 0.02945142 0.02927011
#12 12 0.025148797 0.04234851 0.02796495 0.030461899 0.034980814 NA NA 0.03157723 0.03117850 0.03092764 0.02726627 0.03044017 0.02830097 0.03173588 0.02484902
#13 13 0.028520806 0.02788104 0.02693961 0.026020952 0.026978507 0.031480399 0.02774477 0.02736679 0.02776470 0.02800810 0.02425478 0.02972730 0.02739477 0.03036292 0.02023138
#14 14 0.021327549 0.01899742 0.01857191 0.015865887 0.017803204 0.019968816 NA 0.01709176 0.01829664 0.01715720 0.02022095 0.01913202 0.01918466 0.01910547 0.02730348
#15 15 0.024794668 0.02639924 0.01854253 0.024333800 0.025141407 0.025004067 0.02731504 0.02451662 0.02432407 0.02470640 0.02296538 0.02230048 0.02257370 0.02213324 0.02525440
#16 16 0.027986949 0.02601143 0.01270995 0.020117980 0.023795555 0.024064520 0.03727955 0.02304373 0.02697939 0.02297320 0.01895232 0.03104189 0.01855721 0.03329877 0.01238752
#17 17 0.022970945 0.03166427 0.04239139 0.033222898 0.034979964 0.028908655 0.02582697 0.03024807 0.02614318 0.02717176 0.01633074 0.01945524 0.02669124 0.01538465 0.02437832

Working across two dataframes: Apply or for-loop?

I have two dataframes and one function. The function is supposed to take the variables start_month & end_month, select for each row the values in the second dataframe in the month-column, calculate the rate_of_change between each start_month and end_month variable in a given year. Finally calculate the mean(rate_of_change) and place it into the first dataframe as a new variable in the vector average_ratio.
So far I've created a code that calculates the average ratio, but I can't manage to put it into a for loop or an apply function so that the loop runs through the whole first data frame. I have two ideas, but they don't work so far.
structure(Total) # Df containing total combinations of all existing month starting in September
.
i | start_month | end_month | average_ratio (expected output)
1 | 9 | 10 | -23
2 | 9 | 11 | 13
3 | 9 | 12 | -4
4 | 9 | 1 |
5 | 9 | 2 | # ... with 61 more rows
and
structure(Cologne)
# A tibble: 3,000 x 4
year month price town (rate of change)
<dbl> <dbl> <dbl> <chr>
1 1531 7 7575 Cologne
2 1531 8 588 Cologne
3 1531 9 615 Cologne
4 1531 10 69 Cologne -88%
5 1531 11 712 Cologne
6 1531 12 590 Cologne
7 1532 1 72 Cologne
8 1532 2 675 Cologne
9 1532 3 6933 Cologne
10 1532 4 54 Cologne
11 1532 5 425 Cologne
12 1532 6 12 Cologne
13 1532 7 323 Cologne
14 1532 8 32 Cologne
15 1532 9 58 Cologne
16 1532 10 84 Cologne 42%
# ... with 2,990 more rows
# rate of change function
rateofchange <- function(x,y) {
((x-y)/y)*100
}
# avg_ratio function
avg_ratio <- function(x,y,z) {
dt.frame <- filter(x, month==y | month==z)
pre_p <- lag(dt.frame$price, 1)
dt.frame <- cbind(dt.frame, pre_p)
for (i in 1:nrow(dt.frame)) {
dt.frame$roc <- rateofchange(dt.frame$price,dt.frame$pre_p)
}
result <- mean(dt.frame$roc,na.rm=TRUE)
return(result)
}
May_Aug <- avg_ratio(Cologne, 5,7)
################ works until here ################
# Now, Idea 1
Total <- Total %>%
mutate(Total, ratio = avg_ratio(Cologne,Total$start_mth,Total$end_mth)
)
Warning messages:
1: In month == y :
longer object length is not a multiple of shorter object length
2: In month == z :
longer object length is not a multiple of shorter object length
# and Idea 2
ratio <- c()
Total_new <- for(i in 1:nrow(Total)) {
ratio [i] <- c(ratio, avg_ratio(Cologne,Total$start_mth[i],Total$end_mth[i]))
return(cbind(Total,ratio))
}
> dput(Cologne[1:20,])
structure(list(year = c(1531, 1531, 1531, 1531, 1531, 1531, 1532,
1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532, 1532,
1533, 1533), month = c(7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 1, 2), price = c(7575, 588, 615, 69, 712,
72, 72, 675, 6933, 70, 656, 66, 62, 48, 48, 462, 45, 45, 456,
46), town = c("Cologne", "Cologne", "Cologne", "Cologne", "Cologne",
"Cologne", "Cologne", "Cologne", "Cologne", "Cologne", "Cologne",
"Cologne", "Cologne", "Cologne", "Cologne", "Cologne", "Cologne",
"Cologne", "Cologne", "Cologne")), spec = structure(list(cols = list(
Jahr = structure(list(), class = c("collector_double", "collector"
)), Monat = structure(list(), class = c("collector_double",
"collector")), cologne_wheat_monthly = structure(list(), class = c("collector_number",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"), row.names = c(NA,
20L), class = c("tbl_df", "tbl", "data.frame"))
> dput(Total) structure(list(start_mth = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7), end_mth = c(10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 4, 5, 6, 7, 8, 5, 6, 7, 8, 6, 7, 8, 7, 8, 8)), class = "data.frame", row.names = c(NA, -66L))
You can do:
Total$average_ratio <- mapply(avg_ratio, y = Total$start_mth, z = Total$end_mth, MoreArgs = list(x = cologne))
Your function is not vectorized, that's why this doesn't work:
Total <- Total %>%
mutate(ratio = avg_ratio(cologne, start_mth, end_mth))
The mapply() function iterates (or vectorizes) through the arguments provided, you don't want to iterate over cologne however, that's why you pass it inside MoreArgs = , so it gets taken as it is.

Can I create many categories of one variable based in two other conditions in r? [duplicate]

This question already has answers here:
How collect additional row data on binned data in R
(1 answer)
Group value in range r
(3 answers)
Closed 3 years ago.
I am doing a statistic analysis in a big data frame (more than 48.000.000 rows) in r. Here is an exemple of the data:
structure(list(herd = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), cows = c(1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16), `date` = c("11/03/2013",
"12/03/2013", "13/03/2013", "14/03/2013", "15/03/2013", "16/03/2013",
"13/05/2012", "14/05/2012", "15/05/2012", "16/05/2012", "17/05/2012",
"18/05/2012", "10/07/2016", "11/07/2016", "12/07/2016", "13/07/2016",
"11/03/2013", "12/03/2013", "13/03/2013", "14/03/2013", "15/03/2013",
"16/03/2013", "13/05/2012", "14/05/2012", "15/05/2012", "16/05/2012",
"17/05/2012", "18/05/2012", "10/07/2016", "11/07/2016", "12/07/2016",
"13/07/2016", "11/03/2013", "12/03/2013", "13/03/2013", "14/03/2013",
"15/03/2013", "16/03/2013", "13/05/2012", "14/05/2012", "15/05/2012",
"16/05/2012", "17/05/2012", "18/05/2012", "10/07/2016", "11/07/2016",
"12/07/2016", "13/07/2016"), glicose = c(240666, 23457789, 45688688,
679, 76564, 6574553, 78654, 546432, 76455643, 6876, 7645432,
876875, 98654, 453437, 98676, 9887554, 76543, 9775643, 986545,
240666, 23457789, 45688688, 679, 76564, 6574553, 78654, 546432,
76455643, 6876, 7645432, 876875, 98654, 453437, 98676, 9887554,
76543, 9775643, 986545, 240666, 23457789, 45688688, 679, 76564,
6574553, 78654, 546432, 76455643, 6876)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -48L))
I need to identify how many cows are in the following category of glicose by herd and by date:
<=100000
100000 and <=150000
150000 and <=200000
200000 and <=250000
250000 and <=400000
>400000
I tried to use the functions filter() and select() but could not categorize the variable like that.
I tried either to make a vector for each category but it did not work:
ht <- df %>% group_by(herd, date) %>%
filter(glicose < 100000)
Actually I do not have a clue of how I could do this. Please help!
I expect to get the number of cows in each category of each herd based on each date in a table like this:
Calling your data df,
df %>%
mutate(glicose_group = cut(glicose, breaks = c(0, seq(1e5, 2.5e5, by = 0.5e5), 4e5, Inf)),
date = as.Date(date, format = "%d/%m/%Y")) %>%
group_by(herd, date, glicose_group) %>%
count
# # A tibble: 48 x 4
# # Groups: herd, date, glicose_group [48]
# herd date glicose_group n
# <dbl> <date> <fct> <int>
# 1 1 2012-05-13 (0,1e+05] 1
# 2 1 2012-05-14 (4e+05,Inf] 1
# 3 1 2012-05-15 (4e+05,Inf] 1
# 4 1 2012-05-16 (0,1e+05] 1
# 5 1 2012-05-17 (4e+05,Inf] 1
# 6 1 2012-05-18 (4e+05,Inf] 1
# 7 1 2013-03-11 (2e+05,2.5e+05] 1
# 8 1 2013-03-12 (4e+05,Inf] 1
# 9 1 2013-03-13 (4e+05,Inf] 1
# 10 1 2013-03-14 (0,1e+05] 1
# # ... with 38 more rows
I also threw in a conversion to Date class, which is probably a good idea.

Summary with label names with dplyr

I have imported a .sav file with Haven but where I am stuck is that I cant seem to work out how to print the label names in place or, with the label codings. Labels: 1 = unemployed, 2 = looking etc.
Employment <- select(well_being_df, EmploymentStatus, Gender) %>% <group_by(EmploymentStatus) %>% summarise_all(funs(mean, n = n(), sd,min(.,is.na = TRUE), max(.,is.na = TRUE)))
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 1.67 12 0.492 1 2
2 2 1.17 6 0.408 1 2
3 3 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
Ideally:
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 Unemployed 1.67 12 0.492 1 2
2 2 Looking 1.17 6 0.408 1 2
3 3 Etc 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
dput(head(well_being_df, 10))
structure(list(Age = c(22, 20, 23, 20, 25, 18, 24, 21, 21, 30.7344197070233
), Gender = structure(c(2, 2, 1, 2, 1, 2, 2, 2, 2, 1), labels = c(Male = 1,
Female = 2, Transgender = 3), class = "labelled"), EmploymentStatus = structure(c(3,
1, 4, 3, 3, 3, 3, 4, 3, 4), labels = c(`Unemployed but not looking` = 1,
`Unemployed and looking` = 2, `Part-time` = 3, `Full-time` = 4,
Retired = 5), class = "labelled"), Cognition1 = structure(c(6,
3, 6, 5, 9, 6, 4, 4, 7, 5), labels = c(`Provides nothing that you want` = 0,
`Provides half of what you want` = 5, `Provides all that you want` = 10
), class = "labelled"), Cognition2 = structure(c(7, 3, 8,
5, 8, 5, 5, 7, 7, 3), labels = c(`Far below average` = 0,
`About Average` = 5, `Far above average` = 10), class = "labelled"),
Cognition3 = structure(c(6, 5, 4, 5, 6, 5, 5, 5, 5, 5), labels = c(`Far less than you deserve` = 0,
`About what you deserve` = 5, `Far more than you deserve` = 10
), class = "labelled"), Cognition4 = structure(c(7, 3, 6,
2, 8, 3, 3, 5, 6, 2), labels = c(`Far less than you need` = 0,
`About what you need` = 5, `Far more than you need` = 10), class = "labelled"),
Cognition5 = structure(c(10, 9, 6, 3, 7, 2, 2, 0, 4, 0), labels = c(`Far less than expected` = 0,
`About as expected` = 5, `Far more than expected` = 10), class = "labelled"),
Cognition6 = structure(c(8, 6, 0, 3, 3, 8, 9, 10, 5, 10), labels = c(`Far more than it will in the future` = 0,
`About what you expect in the future` = 5, `Far less than what the future will offer` = 10
), class = "labelled"), Cognition7 = structure(c(9, 7, 10,
5, 6, 2, 3, 0, 8, 3), labels = c(`Far below previous best` = 0,
`Equals previous best` = 5, `Far above previous best` = 10
), class = "labelled")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Employment <- select(well_being_df, EmploymentStatus, Gender) %>%
mutate(EmploymentStatus = labelled::to_factor(EmploymentStatus)) %>% # use labelled package
group_by(EmploymentStatus) %>%
summarise_all(funs(mean, n = n(), sd,min(.,is.na = TRUE), max(.,is.na = TRUE)))

Resources