time difference between rows producing odd results - r

Plenty of material on stackoverflow regarding calculating time differences between rows/entries/observations. However, I'm stumped why I'm getting NA's in unusual positions.
I have 3 columns, DATETIME which is posixlt, GRP800 which is the group (factor), and TIME800 which is supposed to represent the time elapsed between each observation for each group. My particular code was derived from Calculate differences between rows faster than a for loop?.
df$TIME800<-unlist(by(df$DATETIME,df$GRP800,function(x)c(NA,diff(x))))
It does appear to function properly for the first group but then I am getting NA's in the middle of the 2nd group. I've tried several approaches using diff and it's producing the identical output. I'm quite puzzled. Any advice would be greatly appreciated.
DATETIME GRP800 TIME800
1 2013-07-16 16:01:30 1 NA
2 2013-07-16 20:00:54 1 3.990000
3 2013-07-17 00:01:30 1 4.010000
4 2013-07-17 04:01:00 1 3.991667
5 2013-07-17 08:00:50 1 3.997222
6 2013-07-17 12:01:46 1 4.015556
7 2013-07-17 16:00:50 1 3.984444
8 2013-07-17 20:01:00 1 4.002778
9 2013-07-18 00:01:18 1 4.005000
10 2013-07-18 04:01:02 1 3.995556
11 2013-07-18 08:00:50 1 3.996667
12 2013-07-18 12:01:18 2 NA
13 2013-07-18 16:01:02 2 3.970833
14 2013-07-18 20:00:59 2 4.007500
15 2013-07-19 00:01:31 2 3.997222
16 2013-07-19 04:01:18 2 4.011111
17 2013-07-19 08:01:02 2 NA
18 2013-07-19 12:01:57 2 2.007500
19 2013-07-19 20:01:00 2 NA
20 2013-07-20 00:01:00 2 2.003333
> dput(df[1:20,])
structure(list(DATETIME = structure(list(sec = c(30, 54, 30,
0, 50, 46, 50, 0, 18, 2, 50, 18, 2, 59, 31, 18, 2, 57, 0, 0),
min = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L), hour = c(16L, 20L, 0L, 4L, 8L,
12L, 16L, 20L, 0L, 4L, 8L, 12L, 16L, 20L, 0L, 4L, 8L, 12L,
20L, 0L), mday = c(16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L,
18L, 18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 20L
), mon = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), year = c(113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L, 113L,
113L, 113L, 113L, 113L, 113L, 113L, 113L), wday = c(2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L), yday = c(196L, 196L, 197L, 197L, 197L, 197L,
197L, 197L, 198L, 198L, 198L, 198L, 198L, 198L, 199L, 199L,
199L, 199L, 199L, 200L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
zone = c("MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT"), gmtoff = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), GRP800 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), TIME800 = c(NA, 3.99, 4.01, 3.991666667, 3.997222222,
4.015555556, 3.984444444, 4.002777778, 4.005, 3.995555556, 3.996666667,
NA, 3.970833333, 4.0075, 3.997222222, 4.011111111, NA, 2.0075,
NA, 2.003333333)), .Names = c("DATETIME", "GRP800", "TIME800"
), row.names = c(NA, 20L), class = "data.frame")

Related

summarise_each() with across() for dplyr package

I have this script, I want to know how I can replace summarise_each() with the across() function?
common_bw_elements = df %>%
group_by(range_of_commons = cut(common_IDs,
breaks= c(-Inf,0, 5, 10, 20, 30, 60, 100, 200, 300, 600, 1200, 1800, Inf))) %>%
summarise_each(funs(sum), sum_of_instances = frequent)
I am asking this, as I get the following message:
Warning message: summarise_each() is deprecated as of dplyr 0.7.0. Please use across() instead.
My code is very similar to the following post: summarize groups into intervals using dplyr
Any leads on this would be greatly appreciated.
For reference, you can use the following dput()
dput(df)
structure(list(common_IDs = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 17L, 18L, 25L, 26L, 27L, 37L, 51L, 55L, 56L, 63L, 68L, 69L, 70L, 71L, 74L, 76L, 81L, 84L, 86L, 87L, 89L, 90L, 91L, 92L, 101L,
103L, 108L, 109L, 110L, 113L, 114L, 115L, 116L, 129L, 130L, 131L, 133L, 135L, 136L, 137L, 138L, 139L, 141L, 152L, 153L, 154L, 177L, 178L, 190L, 191L, 196L, 199L, 202L, 203L, 208L, 209L, 210L, 211L, 213L, 214L, 215L, 216L, 218L, 219L, 222L, 223L, 229L, 230L, 231L,
232L, 239L, 251L, 252L, 254L, 257L, 264L, 265L, 271L, 272L, 273L, 275L, 276L, 277L, 280L, 293L, 294L, 297L, 298L, 299L, 300L, 301L, 304L, 317L, 320L, 337L, 346L, 347L, 364L, 371L, 373L, 386L, 387L, 389L, 412L, 417L, 419L, 420L, 432L, 440L, 441L, 442L, 443L, 451L,
452L, 453L, 455L, 456L, 457L, 458L, 462L, 463L, 464L, 469L, 470L, 474L, 476L, 477L, 478L, 487L, 488L, 492L, 1484L, 1534L, 1546L, 1561L, 1629L, 1642L, 1670L, 1672L, 1681L, 1698L, 1723L, 1725L,
1736L, 1738L, 1745L, 1753L, 1759L, 1764L, 1766L, 1767L, 1770L, 1772L, 1775L, 1776L, 1781L, 1784L, 1787L, 1791L, 1802L, 1807L, 1813L, 1815L, 1817L, 1821L, 1823L, 1825L, 1846L, 1850L, 1852L,
1853L, 1854L, 1857L, 1858L, 1859L, 1868L, 1899L, 1904L, 1911L, 1913L, 1977L, 1997L, 1999L, 2023L, 2079L),
frequent = c(81L, 75L, 10L, 17L, 4L, 4L, 33L, 13L, 31L, 3L, 19L, 22L, 6L, 1L, 11L, 2L,
1L, 1L, 3L, 14L, 1L, 2L, 1L, 14L, 1L, 9L, 6L, 9L, 2L, 5L, 13L, 4L, 4L, 1L, 4L, 1L, 3L, 1L, 6L, 2L, 1L, 3L, 2L, 5L, 2L, 1L, 17L, 5L, 4L, 4L, 1L, 4L, 7L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 6L,
16L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 5L, 13L, 6L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 4L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 5L, 1L, 3L, 1L, 3L, 4L, 1L, 1L, 2L, 3L, 4L, 3L, 3L, 1L, 3L, 2L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -193L))
You can use summarise since you are only summing one variable by group.
library(tidyverse)
common_bw_elements = df %>%
group_by(range_of_commons = cut(common_IDs,
breaks= c(-Inf,0, 5, 10, 20, 30, 60, 100, 200, 300, 600, 1200, 1800, Inf))) %>%
summarise(sum_of_instances = sum(frequent))
Output
range_of_commons sum_of_instances
<fct> <int>
1 (-Inf,0] 81
2 (0,5] 110
3 (5,10] 46
4 (10,20] 34
5 (20,30] 47
6 (30,60] 15
7 (60,100] 85
8 (100,200] 87
9 (200,300] 92
10 (300,600] 75
11 (1.2e+03,1.8e+03] 29
12 (1.8e+03, Inf] 28
If you had multiple columns to sum, then we would use across (or if you only had a few columns, then instead of everything(), you can provide a vector of column names (e.g., c(common_IDs, frequent)):
df %>%
group_by(range_of_commons = cut(common_IDs,
breaks= c(-Inf,0, 5, 10, 20, 30, 60, 100, 200, 300, 600, 1200, 1800, Inf))) %>%
summarise(across(everything(), ~ sum(.x))) %>%
rename(sum_of_instances = frequent)
Output
range_of_commons common_IDs sum_of_instances
<fct> <int> <int>
1 (-Inf,0] 0 81
2 (0,5] 15 110
3 (5,10] 13 46
4 (10,20] 35 34
5 (20,30] 78 47
6 (30,60] 199 15
7 (60,100] 1191 85
8 (100,200] 3928 87
9 (200,300] 9392 92
10 (300,600] 17290 75
11 (1.2e+03,1.8e+03] 47829 29
12 (1.8e+03, Inf] 48922 28

Aggregate some columns while keeping other columns unchanged

I have data frame like this dummy sample, my real dataset had 56 variables.
I would like to drop the date and aggregate by id and sum last 4 total variables while keep the other unchanged.
df <- data.frame(stringsAsFactors=FALSE,
date = c("2019-02-10", "2019-02-10", "2019-02-11", "2019-02-11",
"2019-02-12", "2019-02-12", "2019-02-13", "2019-02-13",
"2019-02-14", "2019-02-14"),
id = c("18100410-aa", "18101080-ae", "18100410-aa", "18101080-ae",
"18100410-aa", "18101080-ae", "18100410-aa", "18101080-ae",
"18100410-aa", "18101080-ae"),
f_type = c(4L, 2L, 4L, 2L, 4L, 2L, 4L, 2L, 4L, 2L),
reg = c(6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L),
hh_p10 = c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L),
internet = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L),
youngest = c(5L, 7L, 5L, 7L, 5L, 7L, 5L, 7L, 5L, 7L),
a_group = c(3L, 6L, 3L, 6L, 3L, 6L, 3L, 6L, 3L, 6L),
total_prd = c(130L, 337L, 374L, 261L, 106L, 230L, 150L, 36L, 15L, 123L),
B_totalprod = c(20L, 0L, 256L, 0L, 32L, 0L, 0L, 36L, 0L, 45L),
p_totalprod = c(0L, 81L, 11L, 260L, 26L, 230L, 0L, 0L, 15L, 0L),
n_totalprod = c(110L, 256L, 107L, 1L, 48L, 0L, 150L, 0L, 0L, 78L)
)
I found this solution from plyr package here it is working but I need to specify all my 52 unaffected variables. I am just wondering is there any other way to do this task?
library(plyr)
ddply(df,.(id,f_type, reg, internet,hh_p10 ,youngest, a_group ),summarise,total_prd = sum(total_prd) ,
B_totalprod = sum(B_totalprod) , p_totalprod = sum(p_totalprod) ,
n_totalprod = sum(n_totalprod))
If your real dataset also has columns that contain "total" this should work:
library(tidyverse)
df %>%
select(-date) %>%
group_by(.dots = str_subset(names(.), "total", negate = TRUE)) %>%
summarise_all(list(sum = sum))
# A tibble: 2 x 11
# Groups: id, f_type, reg, hh_p10, internet, youngest [2]
id f_type reg hh_p10 internet youngest a_group total_prd_sum B_totalprod_sum p_totalprod_sum n_totalprod_sum
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 18100410-aa 4 6 2 1 5 3 775 308 52 415
2 18101080-ae 2 7 1 2 7 6 987 81 571 335
The line group_by(.dots = str_subset(names(.), "total", negate = TRUE)) means we are going to group by all the column names in our this dataset that do not contain the word "total".

Display The sequence of statistical criteria for the time series by groups in data.frame using R

data sample
timeseries=structure(list(Data = structure(c(10L, 14L, 18L, 22L, 26L, 29L,
32L, 35L, 38L, 1L, 4L, 7L, 11L, 15L, 19L, 23L, 27L, 30L, 33L,
36L, 39L, 2L, 5L, 8L, 12L, 16L, 20L, 24L, 28L, 31L, 34L, 37L,
40L, 3L, 6L, 9L, 13L, 17L, 21L, 25L), .Label = c("01.01.2018",
"01.01.2019", "01.01.2020", "01.02.2018", "01.02.2019", "01.02.2020",
"01.03.2018", "01.03.2019", "01.03.2020", "01.04.2017", "01.04.2018",
"01.04.2019", "01.04.2020", "01.05.2017", "01.05.2018", "01.05.2019",
"01.05.2020", "01.06.2017", "01.06.2018", "01.06.2019", "01.06.2020",
"01.07.2017", "01.07.2018", "01.07.2019", "01.07.2020", "01.08.2017",
"01.08.2018", "01.08.2019", "01.09.2017", "01.09.2018", "01.09.2019",
"01.10.2017", "01.10.2018", "01.10.2019", "01.11.2017", "01.11.2018",
"01.11.2019", "01.12.2017", "01.12.2018", "01.12.2019"), class = "factor"),
client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Horns", "Kornev"), class = "factor"), stuff = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("chickens",
"hooves", "Oysters"), class = "factor"), Sales = c(374L,
12L, 120L, 242L, 227L, 268L, 280L, 419L, 12L, 172L, 336L,
117L, 108L, 150L, 90L, 117L, 116L, 146L, 120L, 211L, 213L,
67L, 146L, 118L, 152L, 122L, 201L, 497L, 522L, 65L, 268L,
441L, 247L, 348L, 445L, 477L, 62L, 226L, 476L, 306L)), .Names = c("Data",
"client", "stuff", "Sales"), class = "data.frame", row.names = c(NA,
-40L))
I need check timeseries on trend and seasonal using not acf function,
but some criterions for it. For serias of each group Cleint+Stuff.
#test adf
library("tseries")
adf.test(timeseries$Sales)
then
Seasonal Mann-Kendall Trend Test
library("trend")
res <- smk.test(timeseries$Sales)
and
#Cox and Stuart Trend Test
cs.test(timeseries$Sales)
The result of these tests should be in data.frame format for each group
How it can be done?
Edit
w=structure(list(Sales = c(18175L, 20015L, 48049L, 62826L, 34804L,
33105L, 38384L, 42316L, 44577L, 24939L, 15908L, 24859L, 13879L,
18739L, 13202L, 29653L, 30371L, 29638L, 5495L, 56932L, 1091L,
5906L, 8229L, 239L, 102L, 8L, 263L, 26L), group = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Sales", "group"
), class = "data.frame", row.names = c(NA, -28L))
transform to ts object
w=ts(mydat$new,frequency = 12,start=c(2015,1))
library(dplyr);w %>% group_by(group) %>% summarise(stat = cs.test(Sales)$statistic, pval = cs.test(Sales)$p.value)

Speeding up a loop (extracting specific values from a data frame)

My task is to extract all values in a column "2" after sorting by factor level in another column "3" (for the interested, i am sorting fasta sequences by organism). I am using this very simple code to get what i need.
df <- read.table("outfile.txt", fill=T) # the original output file includes many empty cells
# df is availabe at the bottom of this post
# splitting by factors
list1 <- split(df, df$V3)
# extract all values in column 2
list2 <- lapply(list1, function(x) as.data.frame(x$V2))
# writing results to file
for (x in names(list2))
write.table(list2[[x]], file=paste(x,".txt"), quote=F, row.names = F, col.names=F)
The works well on a small df. However, the output file contains several gigabytes of data. I tried a subset (500,000 rows on my local machine with 8GB RAM), but the second command is extremely slow (or R just hangs).
So i wondered and am asking the community, if there is a better way to solve this. Thank you.
Here is df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))
using data.table package combined with write.table.
order by V3 and then write the V2 columns separately for each group in V3.
library('data.table')
setDT(df)[ order(V3), write.table(V2, file = paste0( V3, ".txt")), by = V3]
This worked for me but I cannot speak for how fast it would be on your machine.
lapply(unique(df$V3), function(x) write.table(df[which(df$V3 == x),]$V2, file = paste(x, ".txt", sep = ""), quote = FALSE, row.names = FALSE, col.names = FALSE))

ggplot + boxplot with time series value by group

A variation of this question -- I can't quite get the dimensions right in the data structure to make a boxplot with the right values.
what I'm looking to do: hours would be on the x-axis, region would be on the y-axis, and for every region there will be a boxplot showing the distribution of income by hour.
The closest I can get is the following, but it's not right. How do I create the boxplot with two factors (one a time series) as axes, populated by the value distribution?
data:
regions <- structure(list(location = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("east",
"north", "west"), class = "factor"), hour = structure(list(sec = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 0L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), mday = c(13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L), mon = c(7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), year = c(115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 115L,
115L, 115L), wday = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), yday = c(224L, 224L, 224L, 224L,
224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L,
224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L,
224L, 224L, 224L, 224L, 224L, 224L, 224L), isdst = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
zone = c("CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT",
"CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT",
"CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT",
"CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT"),
gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst",
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), hour_income = c(67L,
98L, 89L, 75L, 75L, 89L, 70L, 97L, 52L, 94L, 80L, 84L, 52L, 82L,
81L, 93L, 85L, 94L, 64L, 90L, 54L, 60L, 97L, 100L, 57L, 63L,
90L, 58L, 86L, 68L, 52L, 78L, 61L)), .Names = c("location", "hour",
"hour_income"), row.names = c(NA, -33L), class = "data.frame")
And the boxplot
ggplot(regions) +
geom_boxplot(aes(x=hour, y=hour_income, group=location))
First we convert datetimes to character format, then create the boxplots.
regions$hour <- strftime(regions$hour, format="%H:%M:%S")
ggplot(data = regions, aes(x = hour, y = hour_income)) + geom_boxplot()
But because you only have an observation for each region and hour when you try to create a boxplot to visualise the regions you can only obtain lines instead of boxplots, which are not very meaningful:
ggplot(data = regions, aes(x = hour, y = hour_income)) + geom_boxplot(aes(fill= location))

Resources