Periodic Patterns Identification in R - r

I want to identify temporal patterns in a time series.
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h",
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
"v", "w", "x"), `2016/01` = c(1, NA, NA, 1, NA, NA, 1, NA, NA,
1, NA, 1, 1, 1, NA, 1, NA, NA, 1, NA, NA, 1, NA, NA), `2016/02` = c(NA,
1, NA, NA, 1, NA, NA, 1, NA, NA, 1, 1, 1, NA, 1, NA, 1, NA, NA,
1, NA, NA, 1, NA), `2016/03` = c(NA, NA, 1, NA, NA, 1, NA, NA,
1, 1, NA, 1, 1, 1, NA, NA, NA, 1, NA, NA, 1, NA, NA, 1), `2016/04` = c(NA,
NA, NA, 1, NA, NA, NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, NA, NA,
1, NA, NA, NA, NA, NA), `2016/05` = c(NA, NA, NA, NA, 1, NA,
NA, NA, NA, 1, NA, 1, 1, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA,
NA), `2016/06` = c(NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 1,
1, 1, NA, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA), `2016/07` = c(NA,
NA, NA, 1, NA, NA, 1, NA, NA, 1, NA, 1, 1, 1, NA, 1, NA, NA,
1, NA, NA, NA, NA, NA), `2016/08` = c(NA, NA, NA, NA, 1, NA,
NA, 1, NA, NA, 1, 1, 1, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA,
NA), `2016/09` = c(NA, NA, NA, NA, NA, 1, NA, NA, 1, 1, NA, 1,
1, 1, NA, NA, NA, 1, NA, NA, 1, NA, NA, NA), `2016/10` = c(NA,
NA, NA, 1, NA, NA, NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, NA, NA,
1, NA, NA, NA, NA, NA), `2016/11` = c(NA, NA, NA, NA, 1, NA,
NA, NA, NA, 1, NA, 1, 1, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA,
NA), `2016/12` = c(NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 1,
1, 1, NA, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA), `2017/01` = c(1,
NA, NA, 1, NA, NA, 1, NA, NA, 1, NA, 1, 1, 1, NA, 1, NA, NA,
1, NA, NA, 1, NA, NA), `2017/02` = c(NA, 1, NA, NA, 1, NA, NA,
1, NA, NA, 1, 1, 1, NA, 1, NA, 1, NA, NA, 1, NA, NA, 1, NA),
`2017/03` = c(NA, NA, 1, NA, NA, 1, NA, NA, 1, 1, NA, 1,
1, 1, NA, NA, NA, 1, NA, NA, 1, NA, NA, 1), `2017/04` = c(NA,
NA, NA, 1, NA, NA, NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, NA,
NA, 1, NA, NA, NA, NA, NA), `2017/05` = c(NA, NA, NA, NA,
1, NA, NA, NA, NA, 1, NA, 1, 1, 1, NA, NA, NA, NA, NA, 1,
NA, NA, NA, NA), `2017/06` = c(NA, NA, NA, NA, NA, 1, NA,
NA, NA, NA, 1, 1, 1, NA, 1, NA, NA, NA, NA, NA, 1, NA, NA,
NA), `2017/07` = c(NA, NA, NA, 1, NA, NA, 1, NA, NA, 1, NA,
1, 1, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA), `2017/08` = c(NA,
NA, NA, NA, 1, NA, NA, 1, NA, NA, 1, 1, 1, NA, 1, NA, 1,
NA, NA, 1, NA, NA, NA, NA), `2017/09` = c(NA, NA, NA, NA,
NA, 1, NA, NA, NA, 1, NA, 1, 1, 1, NA, NA, NA, NA, NA, NA,
1, NA, NA, NA), `2017/10` = c(NA, NA, NA, 1, NA, NA, NA,
NA, NA, NA, 1, 1, 1, NA, 1, NA, NA, NA, 1, NA, NA, NA, NA,
NA), `2017/11` = c(NA, NA, NA, NA, 1, NA, NA, NA, NA, 1,
NA, 1, 1, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA), `2017/12` = c(1,
NA, NA, NA, NA, 1, NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, NA,
NA, NA, NA, 1, 1, NA, NA), `2018/01` = c(NA, 1, NA, 1, NA,
NA, 1, NA, NA, 1, NA, 1, 1, 1, NA, 1, NA, NA, 1, NA, NA,
NA, 1, NA), `2018/02` = c(NA, NA, 1, NA, 1, NA, NA, 1, NA,
NA, 1, 1, 1, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, 1), `2018/03` = c(NA,
NA, NA, NA, NA, 1, NA, NA, 1, 1, NA, 1, 1, 1, NA, NA, NA,
1, NA, NA, 1, NA, NA, NA), `2018/04` = c(NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, NA, NA, 1, NA, NA,
NA, NA, NA), `2018/05` = c(NA, NA, NA, NA, 1, NA, NA, NA,
NA, 1, NA, 1, 1, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA
), `2018/06` = c(NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 1,
1, 1, NA, 1, NA, NA, NA, NA, NA, 1, NA, NA, NA), `2018/07` = c(NA,
NA, NA, 1, NA, NA, 1, NA, NA, 1, NA, 1, 1, 1, NA, 1, NA,
NA, 1, NA, NA, NA, NA, NA), `2018/08` = c(NA, NA, NA, NA,
1, NA, NA, 1, NA, NA, 1, 1, 1, NA, 1, NA, 1, NA, NA, 1, NA,
NA, NA, NA), `2018/09` = c(NA, NA, NA, NA, NA, 1, NA, NA,
1, 1, NA, 1, 1, 1, NA, NA, NA, 1, NA, NA, 1, NA, NA, NA),
`2018/10` = c(NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, 1, 1,
1, NA, 1, NA, NA, NA, 1, NA, NA, NA, NA, NA), `2018/11` = c(NA,
NA, NA, NA, 1, NA, NA, NA, NA, 1, NA, 1, 1, 1, NA, NA, NA,
NA, NA, 1, NA, NA, NA, NA), `2018/12` = c(NA, NA, NA, NA,
NA, 1, NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, NA, NA, NA, NA,
1, NA, NA, NA)), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame"))
In the upper data frame individual:
List item
a has the same pattern as v
b has the same pattern as w
c has the same pattern as x
In the upper data frame individuals a, b, c, v, w and x have the same frequency - yearly.
The are some other cases as bimensal, quarterly and semestral.
My objective is to identify all this cases and classify all individuals with a time pattern.
I suppose that the package arulesSequences can be useful.
Can you help me please?

I think a good start would be a full hierarchical clustering:
library(gplots)
library(dendsort)
# data preparation
dm <- matrix( as.numeric(!is.na(dat[,-1])), nrow=nrow(dat[,-1]) )
rownames(dm) <- dat$ID
colnames(dm) <- colnames(dat[,-1])
heatmap.2( dm, trace="none", hclustfun=function(x){
dendsort(hclust(x, method="single"), type="average")
}, col=c("grey90","darkblue") )
Clearly visible are all time dependent connections through the columns.
I included dendsort to bring similar clusters together to make ID related patterns more obvious.
Also, only plotting the row-cluster lets you visualize the temporal patterns better.
heatmap.2( dm, trace="none", Colv=NA, dendrogram="row",
hclustfun=function(x){ dendsort(hclust(x, method="single"),
type="average") }, col=c("grey90","darkblue") )
Adding a summary and k-means for comparison:
hierarchical cluster
dis <- dist(dm, method="euclidean")
hc <- hclust(dis, method="single")
# choose the height where to cut
# lower means more fine grained cluster, less member per cluster
cutree(hc, h=4)
a b c d e f g h i j k l m n o p q r s t u v w x
1 2 1 3 2 4 1 2 1 5 6 7 7 5 6 1 2 1 3 2 4 1 2 1
# higher h means larger clusters, i.e. more member per cluster
cutree(hc, h=5)
a b c d e f g h i j k l m n o p q r s t u v w x
1 2 1 1 2 1 1 2 1 1 2 3 3 1 2 1 2 1 1 2 1 1 2 1
k-means
# pre-defining k=6, has to be rerun to change k
km <- kmeans(dm, 6, algorithm="Hartigan-Wong")
km$cluster
a b c d e f g h i j k l m n o p q r s t u v w x
2 5 2 6 5 4 2 5 4 3 1 1 1 3 1 2 5 4 6 5 4 2 5 2

Related

Creating new column with values from multiple other columns

I hope someone can help me with this one!
I have the following dataset and want to create a new column where the values of aver1, aver2 and aver3 are represented.
I tried it with rowSums but this did not work for me because when i put na.rm = TRUE also those rows who have only empty columns have 0 as their sum and I can not differentiate these from the ones that actually do have 0 as their value.
What I have:
count
aver1.
aver2.
aver3.
X
NA
1
NA
Y
1
NA
NA
X
NA
NA
0
What I want:
count
aver1.
aver2.
aver3.
aver_all
X
NA
1
NA
1
Y
1
NA
NA
1
X
NA
NA
0
0
the dput output:
structure(list(count = c(0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
1), start = c(NA, NA, NA, 5, NA, NA, NA, NA, 1, NA, NA, NA, NA,
1, 1, 1, NA, NA, 9, NA, NA, NA, 3, 4, NA, 11, 1, NA, NA, 1, NA,
NA, NA, 6, NA, NA, 5, NA, 5, NA, NA, NA, NA, NA, 1, NA, 3, NA,
NA, 3, 1, NA, 13, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, 12, 1,
NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, 1, NA, NA, NA, NA,
2, NA, 2, NA, NA, NA, 2, NA, NA, 1, NA, 3, NA, 3, NA, NA, NA,
NA, 10, NA, 1, NA, 0, 0, 1, 1, NA, NA, NA, NA, NA, 1, NA, 2,
7, NA, 1, NA, NA, 3, NA, 2, 6, NA, 3, NA, 1, 8, 1, NA, 1, NA,
NA, 0, NA, 0, 1, NA, NA, NA, NA, 3, NA, 0, NA, NA, NA, 1, NA,
NA, 0, NA, NA, NA, NA, NA, 2, NA, NA, 0, NA, NA, NA, NA, NA,
NA, 1, NA, 4), aver1 = c(NA, NA, NA, 0.5, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.166666666666667, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.133333333333333, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0.266666666666667, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.566666666666667, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0.266666666666667, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), aver2 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 0.333333333333333, 0.416666666666667, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.25, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.916666666666667,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.472222222222222,
NA, NA, NA, NA, NA, NA, 0.388888888888889, NA, NA, NA, 0.0833333333333333,
NA, NA, NA, NA, 0.0555555555555556, NA, 0.111111111111111, NA,
NA, NA, NA, NA, NA, NA, NA, 0.305555555555556, NA, 0.861111111111111,
NA, NA, NA, NA, NA, NA, NA, NA, 0.194444444444444, NA, NA, NA,
NA, NA, 0.611111111111111, NA, NA, NA, NA, 0, NA, 1, NA, 0.694444444444444,
NA, NA, NA, NA, 0.5, NA, 1, NA, NA, NA, NA, NA, 0.0277777777777778,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.138888888888889,
NA, NA, 0.583333333333333, NA, NA, NA, NA, NA, NA, 0.194444444444444,
NA, NA), aver3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, 0.514285714285714,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 0.0285714285714286, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1, 0.214285714285714, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.0142857142857143, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.614285714285714, NA, NA, NA, NA, 0.371428571428571,
NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA,
NA, NA, NA, 0.9, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.0571428571428571,
NA, NA, 0.128571428571429, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.1)), row.names = c(NA, -170L
), class = c("tbl_df", "tbl", "data.frame"))
This is an example that allows you to sum your selected variables from your data-frame (let's call this data-frame: 'df').
df$aver_all <- apply(df[, c("aver1", "aver2", "aver3")], 1, function(x) sum(x, na.rm=TRUE))
It will add 0s to rows where there are only NAs for aver1-2-3.
The next code will replace by NAs, the rows with full NAs.
df$aver_all <- apply(df[, c("aver1", "aver2", "aver3")], 1, function(x) ifelse(FALSE %in% is.na(x), sum(x, na.rm=TRUE), NA))
Given that you have said that you also have rows where all column values are NAs, I will create an additional row in your dataset that fulfills this condition:
dataset <- tibble(count = c("X", "Y", "X", "Z"), aver1. = c(NA, 1, NA, NA),
aver2. = c(1, NA, NA, NA), aver3. = c(NA, NA, 0, NA))
You can use the conditional case_when (https://dplyr.tidyverse.org/reference/case_when.html), which will allow you to set values depending on the conditions you choose for each row. In this case, you could use:
dataset$aver_all <- case_when(is.na(aver1.) & is.na(aver2.) & is.na(aver3.) ~ NA_real_,
aver1. | aver2. | aver3. ~ 1,
TRUE ~ 0)
Here the first condition sets rows where all values are NA to NA, the second sets a 1 if at least one of the three values of a row is a 1; and finally if none of these conditions is satisfied, a 0 is set.

Count NA's per id

I would like to know how many NAs there are per ID. For example in the case of ID = 1 from Monday till Friday there are a total of 4 NA.
Is there any solution in R?
My df:
My output:
My sample data:
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
start.end = c("Mo_start", "Mo_end", "Tue_start", "Tue_end",
"Wed_start", "Wed_end", "Thur_start", "Thur_end", "Fri_start",
"Fri_end", "Mo_start", "Mo_end", "Tue_start", "Tue_end",
"Wed_start", "Wed_end", "Thur_start", "Thur_end", "Fri_start",
"Fri_end", "Mo_start", "Mo_end", "Tue_start", "Tue_end",
"Wed_start", "Wed_end", "Thur_start", "Thur_end", "Fri_start",
"Fri_end", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), time = structure(c(NA, NA,
25200, 53100, 25200, 53100, 25200, 53100, NA, NA, NA, NA,
NA, NA, 32400, 56700, NA, NA, NA, NA, 35100, 53100, 23400,
53100, 23400, 31500, 23400, 31500, 23400, 53100, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), class = c("hms", "difftime"), units = "secs"),
X4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -80L), spec = structure(list(
cols = list(id = structure(list(), class = c("collector_double",
"collector")), start.end = structure(list(), class = c("collector_character",
"collector")), time = structure(list(format = ""), class = c("collector_time",
"collector")), X4 = structure(list(), class = c("collector_logical",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df %>%
group_by(id) %>%
summarise(Missing=sum(is.na(time)), .groups="drop")
Giving
# A tibble: 4 x 2
id Missing
<dbl> <int>
1 1 4
2 2 8
3 3 0
4 NA 50
Your sample data appears not to match your expected output.
library(data.table)
dataset <- data.table(dataset)
dataset[is.na(time),.N,by=id]
Update:
Using setNames for adequate column names:
setNames(aggregate(time ~ id, data=df, function(x) {sum(is.na(x))}, na.action = NULL), c("id", "NAs/day"))
Output:
id NAs/day
1 1 4
2 2 8
3 3 0
First answer:
We could use aggregate:
aggregate(time ~ id, data=df, function(x) {sum(is.na(x))}, na.action = NULL)
Output
id time
1 1 4
2 2 8
3 3 0

How to plot an igraph object on a vector map in R

I use the igraph and sf packages.
I have an igraph object whose vertices have spatial coordinates geo_dist_graph.
The vertices names and coordinates look like this:
grid_grid <-
structure(list(coords.x1 = c(15.504078, 15.704078, 15.904078,
15.104078, 15.304078, 15.504078, 15.704078, 15.104078, 15.304078,
15.704078, 14.904078, 14.304078, 13.904078, 14.704078, 13.704078,
14.104078, 14.704078, 14.904078, 13.704078, 13.904078, 14.704078,
13.704078, 13.904078, 14.304078),
coords.x2 = c(43.835623, 43.835623,
43.835623, 44.035623, 44.035623, 44.035623, 44.035623, 44.235623,
44.235623, 44.235623, 44.435623, 44.635623, 44.835623, 44.835623,
45.035623, 45.035623, 45.035623, 45.035623, 45.235623, 45.235623,
45.235623, 45.435623, 45.435623, 45.435623),
g9.nodes = c(27,
28, 29, 40, 41, 42, 43, 55, 56, 58, 69, 81, 94, 98, 108, 110,
113, 114, 123, 124, 128, 138, 139, 141)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"
))
The graph is from a simple squared adjacency matrix:
geo_dist_graph <-
structure(c(NA, 1, 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 1, NA,
1, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1, 1, 1, NA, NA, 1, NA, NA, NA, 1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA,
NA, 1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, 1, 1, NA, 1, NA, 1, 1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, NA,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, 1, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, 1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, 1, 1,
NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1, 1, NA, NA, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, 1, NA,
NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1, 1, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, 1,
NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1, NA, NA, 1, NA, NA, NA, 1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA,
NA, 1, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1, NA, 1, 1, NA, NA, 1, NA, NA, 1, 1, 1, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, 1, 1, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1,
1, NA, 1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA, 1, NA),
.Dim = c(24L,
24L))
colnames(geo_dist_graph) <- grid_grid$g9.nodes
row.names(geo_dist_graph) <- grid_grid$g9.nodes
geo_dist_graph <- graph_from_adjacency_matrix(geo_dist_graph, mode = "upper", diag = F)
The spatial coordinates where attched this way:
V(geo_dist_graph)$x <-
grid_grid$coords.x1[match(V(geo_dist_graph)$name, grid_grid$g9.nodes)]
V(geo_dist_graph)$y <-
grid_grid$coords.x2[match(V(geo_dist_graph)$name, grid_grid$g9.nodes)]
The graph is correclty plotted in space when using the plot function. But when I try to add a basemap like this plot(map_crop_sp, add = T), the map doesn't show behind the graph, but there is no error message.
The map is vector map, don't know if it's important. Here is the code used to create it.
map <- st_read("ne_10m_coastline/ne_10m_coastline.shp")
map_crop <- st_crop(map, xmin = 13.304078, ymin = 43.635623, xmax = 16.503846, ymax = 45.60185)
map_crop_sp <- as(map_crop, Class = "Spatial")
Answer
Since the igraph should be on top of the map, I plot it second. I also added rescale = F:
plot(map_crop_sp)
plot(geo_dist_graph, add = T, rescale = F)
Rationale
I typed ?plot.igraph. From there, I found ?igraph.plotting. It seems that plotting an igraph object rescales it (plot(..., rescale = TRUE):
Logical constant, whether to rescale the coordinates to the [-1,1]x-1,1 interval. This parameter is not implemented for tkplot.
Defaults to TRUE, the layout will be rescaled.

Assign value labels as string values in R

Hello I imported a dataset from SPSS in R, the dataset has labels and I want to use value labels as string values. Is there a way to do it?
head(dataset$A7B1)
<Labelled double>: A7b1. Cantón de San José en que reside
[1] NA NA NA 2 8 NA 4 NA 5
Labels:
value label
1 SAN JOSÉ
2 ESCAZÚ
3 DESAMPARADOS
4 PURISCAL
5 TARRAZÚ
6 ASERRÍ
7 MORA
8 GOICOECHEA
9 SANTA ANA
10 ALAJUELITA
11 CORONADO
12 ACOSTA
13 TIBAS
14 MORAVIA
15 MONTES DE OCA
16 TURRUBARES
17 DOTA
18 CURRIDABAT
19 PÉREZ ZELEDÓN
20 LEÓN CORTÉS
I need that every double labelled value become a string value according to the value label.
glimpse(dataset)
Rows: 283
Columns: 9
$ A7A <dbl+lbl> 2, 8, 3, 3, 1, 2, 4, 4, 4, 2, 2, 4, 3, 4, 2, 3, 1, 2, 2, 6, 1, 1, 2, 2, 1, 2, 3, 1, 2, 1, 1, 4, 3, 1, 2, 2, 1, 1, 4, ...
$ A7B1 <dbl+lbl> NA, NA, NA, NA, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, 1, 11, NA, NA, 8, NA, NA, 3, NA, 14, 1,...
$ A7B2 <dbl+lbl> 1, NA, NA, NA, NA, 1, NA, NA, NA, 1, 1, NA, NA, NA, 1, NA, NA, 6, 2, NA, NA, NA, 1, 10, NA, 1, NA, NA, 1, NA, NA, NA,...
$ A7B3 <dbl+lbl> NA, NA, 1, 7, NA, NA, NA, NA, NA, NA, NA, NA, 3, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA...
$ A7B4 <dbl+lbl> NA, NA, NA, NA, NA, NA, 2, 1, 1, NA, NA, 9, NA, 7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ A7B5 <dbl+lbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ A7B6 <dbl+lbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ A7B7 <dbl+lbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ A7B8 <dbl+lbl> NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA..
dput(head(dataset$A7A))
structure(c(2, 8, 3, 3, 1, 2), label = "A7a. Provincia de residencia", labels = c(`San Jose` = 1, Alajuela = 2, Cartago = 3, Heredia = 4, Guanacaste = 5, Puntarenas = 6,
Limon = 7, Extrenjero = 8), class = "haven_labelled")
I typically use haven when reading in SPSS data and have a helper function for this. Hope this helps--if it doesn't please provide more info in your question :)
library(haven)
swap_labels <- function(x, keep_original = TRUE) {
labels <- attr(x, "labels")
new_vec <- names(labels)[match(x, labels)]
if(keep_original) {
haven::labelled_spss(new_vec, setNames(names(labels), labels))
} else {
new_vec
}
}
# Reproducible example
test_vec <- labelled_spss(1:3, labels = setNames(1:3, letters[1:3]))
> test_vec
<labelled_spss<integer>[3]>
[1] 1 2 3
Labels:
value label
1 a
2 b
3 c
> swap_labels(test_vec)
<labelled_spss<character>[3]>
[1] a b c
Labels:
value label
a 1
b 2
c 3

extracting information from excel into lists in R

hello all i have this datasset :
> dput(test1)
structure(list(startdate = c("2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-01", "2019-11-05", "2019-11-15",
"2019-11-16", "2019-11-17", "2019-11-18", "2019-11-19", "2019-11-20",
"2019-11-21", NA), id = c("POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", "POL69", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", NA), m0_9 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98,
33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), m10_19 = c(NA,
NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65,
3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), m20_29 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA), m30_39 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), m40_49 = c(32, 34, NA, NA,
NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), m50_59 = c(NA,
NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA,
7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), m60_69 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9,
1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), m70 = c(NA, NA, NA, NA, NA, NA, 32,
34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), f0_9 = c(32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), f10_19 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55,
3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f20_29 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA), f30_39 = c(NA, NA, NA, 32, 34, NA, NA, NA,
NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), f40_49 = c(NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA), f50_59 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f60_69 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), f70 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
I would like to create a list called ageCat. This list should contain a number of lists. The number of lists is the amount of age categories. Then for each age category i would like to extract the following info startAge, endAge, maleCount,femaleCount, totalCount.
Additionaly, i want only to sum up only individuals that have the same id and start date. For now i have written this:
create list of age
createLists <- function(startdate, id){
testFiltered = test1[policyid == id & start == startdate]
ageGroup <- vector("list", length == 8)
names(ageGroup) <- as.character(seq_along(ageGroup))
for(ageCat in seq_along(ageGroup)){
ageGroup[[ageCat]] <- getAgeInfo(testFiltered, ageCat)
}
getAgeInfo <- function(testFiltered, ageCat){
start =
end =
nomales =
nofemales =
}
ageGroup <- list(startAge = start,
endAge = end ,
maleCount = nomales ,
femaleCount = nofemales)
}
I have hard coded the length of the vecor ageGroup. How can i do this without hard coding it, aka. to look up how many columns with age categories I have for each gender?
Secondly, how can i extract the information startAge, endAge, maleCount,femaleCount, totalCount
Instead of working with lists I suggest to convert your data.frame to long format, getting rid of missing values and extracting sex and age. A `tidyverse´ approach might look like this:
library(dplyr)
library(tidyr)
library(tibble)
df <- tibble(
startdate = c(
"2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-01", "2019-11-05", "2019-11-15",
"2019-11-16", "2019-11-17", "2019-11-18", "2019-11-19", "2019-11-20",
"2019-11-21", NA
),
id = c(
"POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", "POL69", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", NA
),
m0_9 = c(
NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98,
33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m10_19 = c(
NA,
NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65,
3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m20_29 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA
),
m30_39 = c(
NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m40_49 = c(
32, 34, NA, NA,
NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m50_59 = c(
NA,
NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA,
7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), m60_69 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9,
1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA
), m70 = c(
NA, NA, NA, NA, NA, NA, 32,
34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f0_9 = c(
32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f10_19 = c(
NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55,
3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f20_29 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA
), f30_39 = c(
NA, NA, NA, 32, 34, NA, NA, NA,
NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA
), f40_49 = c(
NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA
), f50_59 = c(
NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f60_69 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
), f70 = c(
NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA
)
)
# Convert to tidy data frame
df_age <- df %>%
gather(age_sex, count, -startdate, -id) %>%
filter(!is.na(count)) %>%
extract(age_sex, into = c("sex", "start_age", "end_age"), regex = "(m|f)(\\d+)_?(\\d+)?", remove = FALSE) %>%
mutate(ageg = paste0(start_age, "_", end_age))
df_age
#> # A tibble: 187 x 8
#> startdate id age_sex sex start_age end_age count ageg
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
#> 1 2019-11-27 POL55 m0_9 m 0 9 32 0_9
#> 2 2019-11-27 POL56 m0_9 m 0 9 34 0_9
#> 3 2019-11-27 POL61 m0_9 m 0 9 55 0_9
#> 4 2019-11-27 POL55 m0_9 m 0 9 3 0_9
#> 5 2019-11-27 POL59 m0_9 m 0 9 7 0_9
#> 6 2019-11-27 POL60 m0_9 m 0 9 9 0_9
#> 7 2019-11-27 POL61 m0_9 m 0 9 1 0_9
#> 8 2019-11-27 POL55 m0_9 m 0 9 65 0_9
#> 9 2019-11-27 POL56 m0_9 m 0 9 3 0_9
#> 10 2019-11-27 POL57 m0_9 m 0 9 98 0_9
#> # ... with 177 more rows
# df back to nested list by startdate and ageg
df_list <- df_age %>%
# Count by startdate, ageg, start_age, end_age, sex
count(startdate, ageg, start_age, end_age, sex, wt = count) %>%
# male and female counts back in columns
spread(sex, n, fill = 0) %>%
# split by startdate
split(.$startdate) %>%
# ... and split each startdate list by ageg
lapply(function(x) split(x, x$ageg))
Created on 2020-03-10 by the reprex package (v0.3.0)

Resources