Reshape long to wide with dcast - r

I need to reshape a dataframe csf_antibio (sample below) from long to wide format, so that all of a patient's values end up in a single row.
study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12
I tried the following without success:
csfmelt <- melt(csf_antibio, id.vars=1:1)
csf <- dcast(csfmelt, study_id ~ variable, value.var = "value", fun.aggregate = sum)
I want the final dataframe to have one row per study id, with these variables:
study_id,proanbx_dt1,proanbx_tm1,name1,othername1,route1,dosage1,units1,doses1,freq1,proanbx_dt2,proanbx_tm2,name2,othername2,route2,dosage2,units2,doses2,freq2,proanbx_dt3,proanbx_tm3,name3,othername3,route3,dosage3,units3,doses3,freq3,proanbx_dt4,proanbx_tm4,name4,othername4,route4,dosage4,units4,doses4,freq4
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12
Thanks in advance!

Your desired output has a "number" component that is not naturally inferred by dcast. We can add it relatively easily with ave (base R; this can be done just as easily with data.table or dplyr grouping).
reshape2 and base R
csfmelt$num <- ave(seq(nrow(csfmelt)), csfmelt[c("study_id","variable")], FUN = seq_along)
head(csfmelt)
# study_id variable value num
# 1 CHLA_0001 proanbx_dt 2021-07-22 1
# 2 CHLA_0001 proanbx_dt 2021-07-22 2
# 3 CHLA_0001 proanbx_dt 2021-07-22 3
# 4 CHLA_0001 proanbx_dt 2021-08-09 4
# 5 CHLA_0002 proanbx_dt 2020-12-18 1
# 6 CHLA_0002 proanbx_dt 2020-12-18 2
csfwide <- reshape2::dcast(csfmelt, study_id ~ variable + num, value.var = "value")
csfwide
# study_id proanbx_dt_1 proanbx_dt_2 proanbx_dt_3 proanbx_dt_4 proanbx_tm_1 proanbx_tm_2 proanbx_tm_3 proanbx_tm_4 name_1 name_2 name_3 name_4 othername_1 othername_2 othername_3 othername_4 route_1 route_2 route_3 route_4 dosage_1 dosage_2 dosage_3 dosage_4 units_1 units_2 units_3 units_4 doses_1 doses_2 doses_3 doses_4 freq_1 freq_2 freq_3 freq_4
# 1 CHLA_0001 2021-07-22 2021-07-22 2021-07-22 2021-08-09 20:01:00 20:19:00 23:17:00 19:34:00 ceftriaxone metronidazole vancomycin cefazolin IV IV IV IV 1250 250 350 738 mg mg mg mg 4 5 3 1 13 9 6 8
# 2 CHLA_0002 2020-12-18 2020-12-18 2020-12-19 2020-12-20 0:30:00 1:03:00 18:15:00 4:18:00 cefepime vancomycin cefepime vancomycin IV IV IV IV 75 23 60 24 mg mg mg mg 5 4 6 4 8 13 8 12
# 3 CHLA_0003 2021-04-20 2021-04-21 2021-04-22 <NA> 15:17:00 2:20:00 14:16:00 <NA> meropenem meropenem Other <NA> sulfamethoxazole-trimethoprim <NA> IV IV IV <NA> 200 400 50 <NA> mg mg mg <NA> 2 17 9 <NA> 1 8 12 <NA>
The column order is not what you requested, but it can be rearranged to match with the following:
variables <- as.character(unique(csfmelt$variable))
sub(".*_", "", names(csfwide)[-(1:2)])
# [1] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
sub("_[^_]*$", "", names(csfwide)[-(1:2)])
# [1] "proanbx_dt" "proanbx_dt" "proanbx_dt" "proanbx_tm" "proanbx_tm" "proanbx_tm" "proanbx_tm" "name" "name" "name" "name" "othername" "othername" "othername" "othername" "route" "route" "route" "route" "dosage"
# [21] "dosage" "dosage" "dosage" "units" "units" "units" "units" "doses" "doses" "doses" "doses" "freq" "freq" "freq" "freq"
nms <- names(csfwide)[-(1:2)]
newnms <- nms[order(sub(".*_", "", nms), match(sub("_[^_]*$", "", nms), variables))]
csfwide2 <- subset(csfwide, select = c(names(csfwide)[1:2], newnms))
csfwide2
# study_id proanbx_dt_1 proanbx_tm_1 name_1 othername_1 route_1 dosage_1 units_1 doses_1 freq_1 proanbx_dt_2 proanbx_tm_2 name_2 othername_2 route_2 dosage_2 units_2 doses_2 freq_2 proanbx_dt_3 proanbx_tm_3 name_3 othername_3 route_3 dosage_3 units_3 doses_3 freq_3 proanbx_dt_4 proanbx_tm_4 name_4 othername_4 route_4 dosage_4 units_4 doses_4 freq_4
# 1 CHLA_0001 2021-07-22 20:01:00 ceftriaxone IV 1250 mg 4 13 2021-07-22 20:19:00 metronidazole IV 250 mg 5 9 2021-07-22 23:17:00 vancomycin IV 350 mg 3 6 2021-08-09 19:34:00 cefazolin IV 738 mg 1 8
# 2 CHLA_0002 2020-12-18 0:30:00 cefepime IV 75 mg 5 8 2020-12-18 1:03:00 vancomycin IV 23 mg 4 13 2020-12-19 18:15:00 cefepime IV 60 mg 6 8 2020-12-20 4:18:00 vancomycin IV 24 mg 4 12
# 3 CHLA_0003 2021-04-20 15:17:00 meropenem IV 200 mg 2 1 2021-04-21 2:20:00 meropenem IV 400 mg 17 8 2021-04-22 14:16:00 Other sulfamethoxazole-trimethoprim IV 50 mg 9 12 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
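For reference, newer tidyr can also do this reshape in one step and produce the requested column order directly. A minimal sketch, assuming tidyr >= 1.2.0 (for the names_vary argument) and that the rows are already ordered the way you want within each patient:
library(dplyr)
library(tidyr)
csf_wide <- csf_antibio %>%
  group_by(study_id) %>%
  mutate(num = row_number()) %>% # per-patient entry counter, like `num` above
  ungroup() %>%
  pivot_wider(id_cols = study_id,
              names_from = num,
              values_from = proanbx_dt:freq,
              names_sep = "",          # yields proanbx_dt1, proanbx_tm1, ...
              names_vary = "slowest")  # keeps all of entry 1's columns before entry 2's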

@r2evans gave you a great answer, but I was thinking about your comments regarding dates and times. You didn't say how you collected this data, so I can't tell you how to import it in this form. That said, adding dates together isn't meaningful; I was thinking the number of days and the amount of time elapsed might be more along the lines of what you were looking for with those particular variables. Unfortunately, I wasn't able to figure out how to make it work with reshape2. This uses dplyr, tidyselect, and hms, although you only have to attach dplyr, because I've namespaced the calls to the other packages' functions. (You need the packages installed, though.)
I didn't keep name and othername because they vary across a patient's entries and can't be collapsed into a single summary value.
library(dplyr)
csf_antibio = read.table(header = T, sep = ",", text = "study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12")
Because the time is only meaningful together with the date, I wrote a function to compute the time difference.
timer <- function(df1){
  maxtm = max(df1$proanbx_tm[df1$proanbx_dt == max(df1$proanbx_dt)]) %>% hms::as_hms()
  mintm = min(df1$proanbx_tm[df1$proanbx_dt == min(df1$proanbx_dt)]) %>% hms::as_hms()
  if(maxtm > mintm){
    tmr = (maxtm - mintm) %>% hms::as_hms() # captures multiple entries on the same day
  } else if(mintm > maxtm) {
    tmr = ((maxtm - mintm) + hms::as_hms('24:00:00')) %>% hms::as_hms() # add a full day
  } else { # only one entry, or identical times at the min/max dates
    tmr = hms::hms(0) # zero duration; hms() takes seconds
  }
  return(tmr)
}
I collected the column names so I could restore the original column order later.
ordNames = names(csf_antibio) # collect names to restore column order
# [1] "study_id" "proanbx_dt" "proanbx_tm" "name" "othername" "route"
# [7] "dosage" "units" "doses" "freq"
# names kept: ordNames[c(1:3,6:10)]
Find the sums, and the differences in dates and times:
csf2 <- csf_antibio %>%
  mutate(proanbx_dt = as.Date(proanbx_dt),          # convert to date
         proanbx_tm = hms::as_hms(proanbx_tm)) %>%  # convert to time
  group_by(study_id) %>%                            # group by study
  summarise(proanbx_tm = timer(.data),              # difference in time
            proanbx_dt = max(proanbx_dt) - min(proanbx_dt), # difference in days
            across(where(is.integer), sum),         # where() is exported by current tidyselect/dplyr
            units = "mg",
            route = "IV") %>%
  select(ordNames[c(1:3, 6:10)])
head(csf2)
# # A tibble: 3 × 8
# study_id proanbx_dt proanbx_tm route dosage units doses freq
# <chr> <drtn> <time> <chr> <int> <chr> <int> <int>
# 1 CHLA_0001 18 days 23:33 IV 2588 mg 13 36
# 2 CHLA_0002 2 days 03:48 IV 182 mg 19 41
# 3 CHLA_0003 2 days 22:59 IV 650 mg 28 21

Related

Why does R throw an error on iterative calculation

I'm looking at COVID-19 data to calculate estimates for the reproductive number R0.
library(ggplot2)
library(dplyr)
library(tidyr)
library(stringr)
library(TTR)
# Get COVID cases, available from:
url <- "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"
DoubleCOV <- read.csv(url, stringsAsFactors = FALSE)
names(DoubleCOV)[1] <- "countyFIPS"
DoubleCovid <- pivot_longer(DoubleCOV, cols = starts_with("X"),
                            values_to = "cases",
                            names_to = c("X", "date_infected"),
                            names_sep = "X") %>%
  mutate(infected = as.Date(date_infected, format = "%m.%d.%y"),
         countyFIPS = str_pad(as.character(countyFIPS), 5, pad = "0"))
#data is by county, summarise for the state of interest
stateData <- DoubleCovid %>% filter(State == "AL") %>% filter(cases != 0) %>%
group_by(infected) %>% summarise(sum(cases)) %>%
mutate(DaysSince = infected - min(infected))
names(stateData)[2] <- "cumCases"
#3 day moving average to smooth a little
stateData <- stateData %>% mutate(MA = runMean(cumCases,3))
#calculate doubling rate (DR) and then R0 infectious period/doubling rate
for(j in 4:nrow(stateData)){
  stateData$DR[j] <- log(2)/log(stateData$MA[j]/stateData$MA[j-1])
  stateData$R0[j] <- 14/stateData$DR[j]
}
CDplot <- stateData %>%
ggplot(mapping = aes(x = as.numeric(DaysSince), y = R0)) +
geom_line(color = "firebrick")
print(CDplot)
So in the above the state of interest is Alabama, hence filter(State == "AL") and this works.
But if I change the state to "NY" I get
Error in `$<-.data.frame`(`*tmp*`, "DR", value = c(NA, NA, NA, 0.733907206043719 :
replacement has 4 rows, data has 39
head(stateData) yields
infected cumCases DaysSince MA
<date> <int> <drtn> <dbl>
1 2020-03-02 1 0 days NA
2 2020-03-03 2 1 days NA
3 2020-03-04 11 2 days 4.67
4 2020-03-05 23 3 days 12
5 2020-03-06 25 4 days 19.7
6 2020-03-07 77 5 days 41.7
The moving average values in rows 3 and 4 (12 and 4.67) would yield a doubling rate of 0.734 which aligns with the value in the error message value = c(NA, NA, NA, 0.733907206043719 but why does it throw an error after that?
Bonus question: I know loops are frowned upon in R...is there a way to get the moving average and R0 calculation without one?
You have to initialise the new columns before you can assign to them by the j index. Due to recycling, Alabama, which has 28 rows (divisible by 4), does not return an error, only warnings about the uninitialised columns. New York, however, has 39 rows, which is not divisible by 4, so recycling fails and R returns an error. You shouldn't ignore warnings; sometimes you can get away with it, but it's not a good idea.
Try this to see what R (that is, you) is actually asking for:
stateData[4]
You should get all rows of the 4th column, not the 4th row.
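To see the recycling rule on a small scale, here is a minimal illustration with made-up data (not your stateData):
d4 <- data.frame(x = 1:4)
d4$y[2] <- 10 # y is created with length 2; 2 divides 4, so it recycles to NA, 10, NA, 10
d5 <- data.frame(x = 1:5)
d5$y[2] <- 10 # 2 does not divide 5: "replacement has 2 rows, data has 5"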
Solution: initialise your DR and R0 columns first.
stateData$DR <- NA
stateData$R0 <- NA
for(j in 4:nrow(stateData)){
  stateData$DR[j] <- log(2)/log(stateData$MA[j]/stateData$MA[j-1])
  stateData$R0[j] <- 14/stateData$DR[j]
}
For the bonus question, you can use lag in the same mutate with MA:
stateData <- stateData %>% mutate(MA = runMean(cumCases, 3),
                                  DR = log(2)/log(MA/lag(MA)),
                                  R0 = 14 / DR)
stateData
# A tibble: 28 x 6
infected cumCases DaysSince MA DR R0
<date> <int> <drtn> <dbl> <dbl> <dbl>
1 2020-03-13 5 0 days NA NA NA
2 2020-03-14 11 1 days NA NA NA
3 2020-03-15 22 2 days 12.7 NA NA
4 2020-03-16 29 3 days 20.7 1.42 9.89
5 2020-03-17 39 4 days 30 1.86 7.53
6 2020-03-18 51 5 days 39.7 2.48 5.64
7 2020-03-19 78 6 days 56 2.01 6.96
8 2020-03-20 106 7 days 78.3 2.07 6.78
9 2020-03-21 131 8 days 105 2.37 5.92
10 2020-03-22 167 9 days 135. 2.79 5.03
# ... with 18 more rows
I'm using Alabama's data.
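As an aside, if you would rather drop the TTR dependency, the trailing 3-day moving average can be computed in the same mutate with base R's stats::filter. A sketch (sides = 1 averages the current and two previous values, matching runMean(cumCases, 3)):
stateData <- stateData %>%
  mutate(MA = as.numeric(stats::filter(cumCases, rep(1/3, 3), sides = 1)),
         DR = log(2)/log(MA/lag(MA)),
         R0 = 14 / DR)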

How to calculate a overall score based on matrix positions?

I have a matrix consisting of 12 columns, each containing a top 5 of different participants. It looks like this:
> top_5
4 5 8 9 11 12 15 16 19 20 22 23
[1,] "Nia" "Hung" "Hanaaa" "Ramziyya" "Marissa" "Jaelyn" "Shyanne" "Jaabir" "Dionicio" "Nia" "Shyanne" "Roger"
[2,] "Razeena" "Husni" "Bradly" "Marissa" "Bradly" "Muhsin" "Razeena" "Dionicio" "Magnus" "Kelsey" "Nia" "Schyler"
[3,] "Shyanne" "Schyler" "Necko" "Johannah" "Tatiana" "Glenn" "Nia" "Jaelyn" "Shyanne" "Hanaaa" "Mildred" "German"
[4,] "Schyler" "German" "Hung" "Lubaaba" "Johannah" "Magnus" "Dionicio" "German" "German" "Razeena" "Dionicio" "Jaabir"
[5,] "Husni" "Necko" "Razeena" "Afeefa" "Schyler" "Dionicio" "Jaabir" "Roger" "Johannah" "Remy" "Jaabir" "Jaelyn"
(And can be recreated using this):
structure(c("Nia", "Razeena", "Shyanne", "Schyler", "Husni",
"Hung", "Husni", "Schyler", "German", "Necko", "Hanaaa", "Bradly",
"Necko", "Hung", "Razeena", "Ramziyya", "Marissa", "Johannah",
"Lubaaba", "Afeefa", "Marissa", "Bradly", "Tatiana", "Johannah",
"Schyler", "Jaelyn", "Muhsin", "Glenn", "Magnus", "Dionicio",
"Shyanne", "Razeena", "Nia", "Dionicio", "Jaabir", "Jaabir",
"Dionicio", "Jaelyn", "German", "Roger", "Dionicio", "Magnus",
"Shyanne", "German", "Johannah", "Nia", "Kelsey", "Hanaaa", "Razeena",
"Remy", "Shyanne", "Nia", "Mildred", "Dionicio", "Jaabir", "Roger",
"Schyler", "German", "Jaabir", "Jaelyn"), .Dim = c(5L, 12L), .Dimnames = list(
NULL, c("4", "5", "8", "9", "11", "12", "15", "16", "19",
"20", "22", "23")))
Now if a participant is in the top row, it means that they are in first place in that column (so for the 1st column "Nia" is first, "Razeena" is second, etc.). A first place in the ranking is worth 5 points, second place 4 points, and so on. Now I want to calculate each participant's total points.
My goal is to make an overall top 5. How would I go about this?
Here is a "convert to long then summarise by group" method similar to M--'s answer, but with data.table
library(data.table)
df <- as.data.table(top_5)[, points := .N:1]
total_points <- melt(df, 'points')[, .(points = sum(points)), value]
setorder(total_points, -points)
head(total_points, 5)
# value points
# 1: Nia 17
# 2: Shyanne 16
# 3: Dionicio 14
# 4: Razeena 11
# 5: Schyler 10
Or an idea very similar to akrun's, just using tapply in place of sapply + split
out <- sort(tapply(c(6 - row(top_5)), c(top_5), sum), decreasing = TRUE)
head(out, 5)
# Nia Shyanne Dionicio Razeena Schyler
# 17 16 14 11 10
An option is to split the reversed row index (so first place scores highest) by the matrix values into a list, then get the sum of each list element by looping over the list (sapply)
out <- sapply(split(row(top_5)[nrow(top_5):1, ], top_5), sum)
out
#Afeefa Bradly Dionicio German Glenn Hanaaa Hung Husni Jaabir Jaelyn Johannah Kelsey Lubaaba Magnus Marissa Mildred Muhsin
# 1 8 14 9 3 8 7 5 9 9 6 4 2 6 9 3 4
# Necko Nia Ramziyya Razeena Remy Roger Schyler Shyanne Tatiana
# 4 17 5 11 1 6 10 16 3
head(out[order(-out)], 5)
# Nia Shyanne Dionicio Razeena Schyler
# 17 16 14 11 10
Or another option is rowsum
rowsum(c(row(top_5)[nrow(top_5):1, ]), group = c(top_5))
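rowsum() returns a one-column matrix with the participants as row names, so you still need to order it yourself; a small follow-up sketch:
rs <- rowsum(c(row(top_5)[nrow(top_5):1, ]), group = c(top_5))
head(rs[order(-rs[, 1]), , drop = FALSE], 5)
#          [,1]
# Nia        17
# Shyanne    16
# Dionicio   14
# Razeena    11
# Schyler    10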
Using tidyverse functions:
library(tidyr)
library(dplyr)
top_5 %>%
as.data.frame %>%
head(.,5) %>%
mutate(rank = nrow(.):1) %>%
pivot_longer(., -c(rank), values_to = "name", names_to = "col") %>%
group_by(name) %>%
summarise_at(vars(rank), list(points = sum))
#> # A tibble: 26 x 2
#> name points
#> <fct> <int>
#> 1 Husni 5
#> 2 Nia 17
#> 3 Razeena 11
#> 4 Schyler 10
#> 5 Shyanne 16
#> 6 German 9
#> 7 Hung 7
#> 8 Necko 4
#> 9 Bradly 8
#> 10 Hanaaa 8
#> # ... with 16 more rows

How to replace NAs by group mean?

I have two data frames, one containing the raw data (and lots of NAs) and one containing the means of the raw data at different intervals.
I would like to replace the NAs with the means at those intervals, while retaining the non-NAs.
I have googled a lot for "impute missing data r" and "replace missing data r", but have yet to find a solution that fits: they all either replace the data with zeros or use a method far too complex for this assignment, like the MICE package.
Code example:
This is the head of the first DF, with the raw data. As you can see, they are all NA for the first day.
steps date interval
1 NA 2012-10-01 0
2 NA 2012-10-01 5
3 NA 2012-10-01 10
4 NA 2012-10-01 15
5 NA 2012-10-01 20
6 NA 2012-10-01 25
...
The second data frame head contains the mean steps per interval, like this:
steps interval
1 1.72 0
2 0.340 5
3 0.132 10
4 0.151 15
5 0.0755 20
6 2.09 25
...
Now, what I am looking for is to be able to fill the NAs with the mean steps for the relevant interval, so it looks like this:
steps date interval
1 1.72 2012-10-01 0
2 0.340 2012-10-01 5
3 0.132 2012-10-01 10
4 0.151 2012-10-01 15
5 0.0755 2012-10-01 20
6 2.09 2012-10-01 25
...
Any tips, or links to resources I have missed? As this is a course assignment and I mostly want to learn, any help that stops short of doing the assignment for me would be much appreciated! =)
edit: Also, since this is my first question on Stack Overflow, any comments on how to improve my questions are appreciated!
There are many ways to do that in R. For example
# generate a dataframe with some interval values
df1 <- data.frame(interval = rep(seq(0, 25, 5), 5))
# add a steps column
df1$steps <- 1:nrow(df1)
# copy the dataframe
df2 <- df1
# replace some steps values with missings in df1
df1$steps[c(1, 2, 5, 14)] <- NA
# sapply goes thru every unique interval...
sapply(unique(df1$interval), function(interval_i){
  # replace missing steps of interval_i in df1 with the steps mean of interval_i from df2
  df1$steps[is.na(df1$steps) & df1$interval == interval_i] <<- mean(df2$steps[df2$interval == interval_i], na.rm = TRUE)
  # you must use <<- not <- to assign to df1 outside of sapply
})
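For contrast, a fully vectorised base-R sketch that works directly with the two data frames from the original question (df1 holding the raw data with NAs, df2 holding one mean per interval), rather than the simulated df1/df2 above; match() looks up each NA row's interval in the means table:
idx <- is.na(df1$steps) # rows that need filling
df1$steps[idx] <- df2$steps[match(df1$interval[idx], df2$interval)]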
I'll add a solution using dplyr.
It's good practice to give reproducible data in your question that can be put straight into R, rather than just pasting in the head of your data. I've created some dummy data instead:
# create random sample data
library(dplyr)
set.seed(100)
df1 <- tibble(
  steps = runif(1e3),
  date = lubridate::today() + runif(1e3) * 24,
  interval = as.numeric(sample(seq(0, 25, by = 5), 1e3, replace = TRUE))
)
# add 100 NAs at random
df1$steps[sample(1:1e3, 100)] <- NA
df1
# steps date interval
# <dbl> <date> <dbl>
# 1 0.308 2019-07-18 15
# 2 NA 2019-07-19 10
# 3 NA 2019-07-31 0
# 4 0.0564 2019-08-02 20
# 5 0.469 2019-07-25 0
# 6 0.484 2019-07-21 25
# 7 NA 2019-07-17 5
# 8 0.370 2019-07-28 0
# 9 0.547 2019-07-31 5
# 10 0.170 2019-08-08 15
# # … with 990 more rows
Using dplyr the imputation task is then pretty simple with group_by
df1 %>%
group_by(interval) %>%
mutate(steps = if_else(is.na(steps), mean(steps, na.rm = T), steps))
# # A tibble: 1,000 x 3
# # Groups: interval [6]
# steps date interval
# <dbl> <date> <dbl>
# 1 0.308 2019-07-18 15
# 2 0.573 2019-07-19 10
# 3 0.523 2019-07-31 0
# 4 0.0564 2019-08-02 20
# 5 0.469 2019-07-25 0
# 6 0.484 2019-07-21 25
# 7 0.527 2019-07-17 5
# 8 0.370 2019-07-28 0
# 9 0.547 2019-07-31 5
# 10 0.170 2019-08-08 15
# # … with 990 more rows
We can confirm that the imputed means are correct for each group by calculating the mean for each group and comparing it to the imputed values:
df1 %>%
group_by(interval) %>%
summarise(mean_int = mean(steps, na.rm=T))
# # A tibble: 6 x 2
# interval mean_int
# <dbl> <dbl>
# 1 0 0.523
# 2 5 0.527
# 3 10 0.573
# 4 15 0.511
# 5 20 0.475
# 6 25 0.485
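For completeness, the same group-wise imputation in data.table (a sketch, using the simulated df1 from above; fifelse() needs a reasonably recent data.table):
library(data.table)
dt <- as.data.table(df1)
dt[, steps := fifelse(is.na(steps), mean(steps, na.rm = TRUE), steps), by = interval]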

Iteration for time series data, using purrr

I have a bunch of time series data stacked on top of one another in a data frame; one series for each region in a country. I'd like to apply the seas() function (from the seasonal package) to each series, iteratively, to make the series seasonally adjusted. To do this, I first have to convert the series to a ts class. I'm struggling to do all this using purrr.
Here's a minimum worked example:
library(seasonal)
library(tidyverse)
set.seed(1234)
df <- data.frame(region = rep(1:10, each = 20),
                 quarter = rep(1:20, 10),
                 var = sample(5:200, 200, replace = TRUE))
For each region (indexed by a number) I'd like to perform the following operations. Here's the first region as an example:
tem1 <- df %>% filter(region==1)
tem2 <- ts(data = tem1$var, frequency = 4, start=c(1990,1))
tem3 <- seas(tem2)
tem4 <- as.data.frame(tem3$data)
I'd then like to stack the output (ie. the multiple tem4 data frames, one for each region), along with the region and quarter identifiers.
So, the start of the output for region 1 would be this:
final seasonaladj trend irregular region quarter
1 27 27 96.95 -67.97279 1 1
2 126 126 96.95 27.87381 1 2
3 124 124 96.95 27.10823 1 3
4 127 127 96.95 30.55075 1 4
5 173 173 96.95 75.01355 1 5
6 130 130 96.95 32.10672 1 6
The data for region 2 would be below this etc.
I started with the following but without luck so far. Basically, I'm struggling to get the time series into the tibble:
seas.adjusted <- df %>%
  group_by(region) %>%
  mutate(data.ts = map(.x = data$var,
                       .f = as.ts,
                       start = 1990,
                       freq = 4))
I don't know much about the seasonal adjustment part, so there may be things I missed, but I can help with moving your calculations into a map-friendly function.
After grouping by region, you can nest the data so there's a nested data frame for each region. Then you can run essentially the same code as you had, but inside a function in map. Unnesting the resulting column gives you a long-shaped data frame of adjustments.
Like I said, I don't have the expertise to know whether those last two columns having NAs is expected or not.
Edit: Based on @wibeasley's question about retaining the quarter column, I'm adding a mutate that adds a column of the quarters listed in the nested data frame.
library(seasonal)
library(tidyverse)
set.seed(1234)
df <- data.frame(region = rep(1:10, each = 20),
                 quarter = rep(1:20, 10),
                 var = sample(5:200, 200, replace = TRUE))
df %>%
  group_by(region) %>%
  nest() %>%
  mutate(data.ts = map(data, function(x) {
    tem2 <- ts(x$var, frequency = 4, start = c(1990, 1))
    tem3 <- seas(tem2)
    as.data.frame(tem3$data) %>%
      mutate(quarter = x$quarter)
  })) %>%
  unnest(data.ts)
#> # A tibble: 200 x 8
#> region final seasonaladj trend irregular quarter seasonal adjustfac
#> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 1 27 27 97.0 -68.0 1 NA NA
#> 2 1 126 126 97.0 27.9 2 NA NA
#> 3 1 124 124 97.0 27.1 3 NA NA
#> 4 1 127 127 97.0 30.6 4 NA NA
#> 5 1 173 173 97.0 75.0 5 NA NA
#> 6 1 130 130 97.0 32.1 6 NA NA
#> 7 1 6 6 97.0 -89.0 7 NA NA
#> 8 1 50 50 97.0 -46.5 8 NA NA
#> 9 1 135 135 97.0 36.7 9 NA NA
#> 10 1 105 105 97.0 8.81 10 NA NA
#> # ... with 190 more rows
I also gave a bit more thought to doing this without nesting, and instead tried doing it with a split. Passing that list of data frames into imap_dfr let me take each split piece of the data frame and its name (in this case, the value of region), then return everything rbinded back together into one data frame. I sometimes shy away from nested data just because I have trouble seeing what's going on, so this is an alternative that is maybe more transparent.
df %>%
  split(.$region) %>%
  imap_dfr(function(x, reg) {
    tem2 <- ts(x$var, frequency = 4, start = c(1990, 1))
    tem3 <- seas(tem2)
    as.data.frame(tem3$data) %>%
      mutate(region = reg, quarter = x$quarter)
  }) %>%
  select(region, quarter, everything()) %>%
  head()
#> region quarter final seasonaladj trend irregular seasonal adjustfac
#> 1 1 1 27 27 96.95 -67.97274 NA NA
#> 2 1 2 126 126 96.95 27.87378 NA NA
#> 3 1 3 124 124 96.95 27.10823 NA NA
#> 4 1 4 127 127 96.95 30.55077 NA NA
#> 5 1 5 173 173 96.95 75.01353 NA NA
#> 6 1 6 130 130 96.95 32.10669 NA NA
Created on 2018-08-12 by the reprex package (v0.2.0).
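A related option, if you would rather stay in a grouped pipeline without nest() or split(): newer versions of dplyr provide group_modify(), which passes each region's rows to a function and binds the results back together with the group key. A sketch:
df %>%
  group_by(region) %>%
  group_modify(function(x, key) {
    fit <- seas(ts(x$var, frequency = 4, start = c(1990, 1)))
    as.data.frame(fit$data) %>% mutate(quarter = x$quarter)
  }) %>%
  ungroup()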
I put all the action inside of f(), and then called it with purrr::map_df(). The re-inclusion of quarter is a hack.
f <- function( .region ) {
  d <- df %>%
    dplyr::filter(region == .region)
  y <- d %>%
    dplyr::pull(var) %>%
    ts(frequency = 4, start = c(1990, 1)) %>%
    seas()
  y$data %>%
    as.data.frame() %>%
    # dplyr::select(-seasonal, -adjustfac) %>%
    dplyr::mutate(
      quarter = d$quarter
    )
}
purrr::map_df(1:10, f, .id = "region")
results:
region final seasonaladj trend irregular quarter seasonal adjustfac
1 1 27.00000 27.00000 96.95000 -6.797279e+01 1 NA NA
2 1 126.00000 126.00000 96.95000 2.787381e+01 2 NA NA
3 1 124.00000 124.00000 96.95000 2.710823e+01 3 NA NA
4 1 127.00000 127.00000 96.95000 3.055075e+01 4 NA NA
5 1 173.00000 173.00000 96.95000 7.501355e+01 5 NA NA
6 1 130.00000 130.00000 96.95000 3.210672e+01 6 NA NA
7 1 6.00000 6.00000 96.95000 -8.899356e+01 7 NA NA
8 1 50.00000 50.00000 96.95000 -4.647254e+01 8 NA NA
9 1 135.00000 135.00000 96.95000 3.671077e+01 9 NA NA
10 1 105.00000 105.00000 96.95000 8.806955e+00 10 NA NA
...
96 5 55.01724 55.01724 60.25848 9.130207e-01 16 1.9084928 1.9084928
97 5 60.21549 60.21549 59.43828 1.013076e+00 17 1.0462424 1.0462424
98 5 58.30626 58.30626 58.87065 9.904130e-01 18 0.1715082 0.1715082
99 5 61.68175 61.68175 58.07827 1.062045e+00 19 1.0537962 1.0537962
100 5 59.30138 59.30138 56.70798 1.045733e+00 20 2.5294523 2.5294523
...

How to group by in base R

I would like to express the following SQL query using base R (without any particular package):
select month, day, count(*) as count, avg(dep_delay) as avg_delay
from flights
group by month, day
having count > 1000
It selects the mean departure delay and the number of flights per day on busy days (days with more than 1000 flights). The dataset is nycflights13 containing information of flights departed from NYC in 2013.
Notice I can easily write this in dplyr as:
flights %>%
group_by(month, day) %>%
summarise(count = n(), avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
filter(count > 1000)
Since I was reminded earlier about the elegance of by (tip of the hat to @Parfait), here is a solution using by:
res <- by(flights, list(flights$month, flights$day), function(x)
  if (nrow(x) > 1000) {
    c(month = unique(x$month),
      day = unique(x$day),
      count = nrow(x),
      avg_delay = mean(x$dep_delay, na.rm = TRUE))
  })
# Store in a matrix and order by month, day
df <- do.call(rbind, res)
df <- df[order(df[, 1], df[, 2]), ]
# month day count avg_delay
#[1,] 7 8 1004 37.296646
#[2,] 7 9 1001 30.711499
#[3,] 7 10 1004 52.860702
#[4,] 7 11 1006 23.609392
#[5,] 7 12 1002 25.096154
#[6,] 7 17 1001 13.670707
#[7,] 7 18 1003 20.626789
#[8,] 7 25 1003 19.674134
#[9,] 7 31 1001 6.280843
#[10,] 8 7 1001 8.680402
#[11,] 8 8 1001 43.349947
#[12,] 8 12 1001 8.308157
#[13,] 11 27 1014 16.697651
#[14,] 12 2 1004 9.021978
As commented, you can use a combination of subset and aggregate. I changed the order of day & month to receive the same order as your dplyr approach, and used na.action = NULL to count rows including NAs.
library(nycflights13)
#> Warning: package 'nycflights13' was built under R version 3.4.4
subset(aggregate(dep_delay ~ day + month, flights,
                 function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)),
                 na.action = NULL),
       dep_delay[, 1] > 1000)
#> day month dep_delay.1 dep_delay.2
#> 189 8 7 1004.000000 37.296646
#> 190 9 7 1001.000000 30.711499
#> 191 10 7 1004.000000 52.860702
#> 192 11 7 1006.000000 23.609392
#> 193 12 7 1002.000000 25.096154
#> 198 17 7 1001.000000 13.670707
#> 199 18 7 1003.000000 20.626789
#> 206 25 7 1003.000000 19.674134
#> 212 31 7 1001.000000 6.280843
#> 219 7 8 1001.000000 8.680402
#> 220 8 8 1001.000000 43.349947
#> 224 12 8 1001.000000 8.308157
#> 331 27 11 1014.000000 16.697651
#> 336 2 12 1004.000000 9.021978
Created on 2018-04-05 by the reprex package (v0.2.0).
Not a particularly elegant solution, but this will do what you want using base R:
flights_split <- split(flights, f = list(flights$month, flights$day))
result <- lapply(flights_split, function(x) {
  if(nrow(x) > 1000) {
    data.frame(month = unique(x$month), day = unique(x$day),
               avg_delay = mean(x$dep_delay, na.rm = TRUE), count = nrow(x))
  } else {
    NULL
  }
})
do.call(rbind, result)
# month day avg_delay count
# 12.2 12 2 9.021978 1004
# 8.7 8 7 8.680402 1001
# 7.8 7 8 37.296646 1004
# 8.8 8 8 43.349947 1001
# 7.9 7 9 30.711499 1001
# 7.10 7 10 52.860702 1004
# 7.11 7 11 23.609392 1006
# 7.12 7 12 25.096154 1002
# 8.12 8 12 8.308157 1001
# 7.17 7 17 13.670707 1001
# 7.18 7 18 20.626789 1003
# 7.25 7 25 19.674134 1003
# 11.27 11 27 16.697651 1014
# 7.31 7 31 6.280843 1001
Here is my solution:
grp <- expand.grid(mth = unique(flights$month), d = unique(flights$day))
out <- mapply(function(mth, d){
  sub_data <- subset(flights, month == mth & day == d)
  df <- data.frame(
    month = mth,
    day = d,
    count = nrow(sub_data),
    avg_delay = mean(sub_data$dep_delay, na.rm = TRUE)
  )
  df[df$count > 1000, ]              # keep the row only on busy days
}, grp$mth, grp$d, SIMPLIFY = FALSE) # keep the results as a list for rbind
res <- do.call(rbind, out)
This is a lot slower than the dplyr solution.
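For completeness, one more base-R route (a sketch) that stays entirely within aggregate() and merge(); the count and the mean are computed in separate aggregate() calls because they need different NA handling:
cnt <- aggregate(dep_delay ~ month + day, flights, length, na.action = NULL)
names(cnt)[3] <- "count"
avg <- aggregate(dep_delay ~ month + day, flights, mean) # NA rows dropped by default
names(avg)[3] <- "avg_delay"
res <- subset(merge(cnt, avg), count > 1000)
res[order(res$month, res$day), ]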
