Merge data frames within a list - r

I have a list which looks like this:
lapply(sample_list, head, 3)
$`2016-04-24 00:00:00.tcp`
ports freq
8 443 296
12 80 170
5 23 92
$`2016-04-24 00:00:00.udp`
ports freq
4 161 138
7 53 45
1 123 28
$`2016-04-24 01:00:00.tcp`
ports freq
13 443 342
20 80 215
10 25 60
$`2016-04-24 01:00:00.udp`
ports freq
4 161 85
8 53 42
12 902 27
I want to merge the data frames that come from the same protocol (i.e. all the tcp frames together and all the udp frames together).
So the final result would be a new list with two data frames, one for tcp and one for udp, such that
lapply(final_list, head, 3)
$tcp
ports freq.00:00:00 freq.01:00:00
1 443 296 342
2 80 170 215
3 23 92 51
$udp
ports freq.00:00:00 freq.01:00:00
1 161 138 85
2 53 45 42
3 123 28 19
DATA
dput(sample_list)
structure(list(`2016-04-24 00:00:00.tcp` = structure(list(ports = c("443",
"80", "23", "21", "22", "25", "445", "110", "389", "135", "465",
"514", "91", "995", "84", "902"), freq = structure(c(296L, 170L,
92L, 18L, 16L, 15L, 14L, 4L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Dim = 16L)), .Names = c("ports",
"freq"), row.names = c(8L, 12L, 5L, 3L, 4L, 6L, 9L, 1L, 7L, 2L,
10L, 11L, 15L, 16L, 13L, 14L), class = "data.frame"), `2016-04-24 00:00:00.udp` = structure(list(
ports = c("161", "53", "123", "902", "137", "514", "138",
"623", "69", "88", "500"), freq = structure(c(138L, 45L,
28L, 26L, 24L, 24L, 6L, 6L, 5L, 4L, 1L), .Dim = 11L)), .Names = c("ports",
"freq"), row.names = c(4L, 7L, 1L, 11L, 2L, 6L, 3L, 8L, 9L, 10L,
5L), class = "data.frame"), `2016-04-24 01:00:00.tcp` = structure(list(
ports = c("443", "80", "25", "23", "88", "21", "161", "22",
"445", "135", "389", "993", "548", "110", "143", "502", "514",
"81", "995", "102", "111", "311", "444", "789", "902", "91"
), freq = structure(c(342L, 215L, 60L, 51L, 42L, 32L, 31L,
18L, 18L, 6L, 5L, 4L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Dim = 26L)), .Names = c("ports", "freq"
), row.names = c(13L, 20L, 10L, 9L, 22L, 7L, 6L, 8L, 15L, 4L,
12L, 25L, 18L, 2L, 5L, 16L, 17L, 21L, 26L, 1L, 3L, 11L, 14L,
19L, 23L, 24L), class = "data.frame"), `2016-04-24 01:00:00.udp` = structure(list(
ports = c("161", "53", "902", "514", "123", "137", "69",
"138", "389", "443", "88", "623"), freq = structure(c(85L,
42L, 27L, 24L, 19L, 15L, 15L, 4L, 2L, 2L, 2L, 1L), .Dim = 12L)), .Names = c("ports",
"freq"), row.names = c(4L, 8L, 12L, 7L, 1L, 2L, 10L, 3L, 5L,
6L, 11L, 9L), class = "data.frame")), .Names = c("2016-04-24 00:00:00.tcp",
"2016-04-24 00:00:00.udp", "2016-04-24 01:00:00.tcp", "2016-04-24 01:00:00.udp"
))
Bonus question: What is the structure of freq? I have never seen int [1:16(1d)] before.
str(sample_list$`2016-04-24 00:00:00.tcp`)
'data.frame': 16 obs. of 2 variables:
$ ports: chr "443" "80" "23" "21" ...
$ freq : int [1:16(1d)] 296 170 92 18 16 15 14 4 3 2 ...
The code I used to create the list (in this case called try1):
protocol_list <- lapply(per_hour1, function(i) split(i, i$protocol))
Analytic_Protocol_List <- lapply(protocol_list, function(i) lapply(i, dest.ports))
try1 <- lapply(unlist(Analytic_Protocol_List, recursive = FALSE), `[[`, 1)
Note that solutions from similar questions do not work for this case. Maybe because of the structure?

Another alternative:
library(dplyr)
library(tidyr)
data.table::melt(sample_list) %>%
  separate(L1, into = c("time", "protocol"), sep = "\\.") %>%
  unite(f, variable, time) %>%
  spread(f, value) %>%
  split(.$protocol)
Which, using your data, gives:
$tcp
ports protocol freq_2016-04-24 00:00:00 freq_2016-04-24 01:00:00
1 102 tcp NA 1
2 110 tcp 4 2
3 111 tcp NA 1
5 135 tcp 2 6
8 143 tcp NA 2
9 161 tcp NA 31
11 21 tcp 18 32
12 22 tcp 16 18
13 23 tcp 92 51
14 25 tcp 15 60
15 311 tcp NA 1
16 389 tcp 3 5
18 443 tcp 296 342
20 444 tcp NA 1
21 445 tcp 14 18
22 465 tcp 2 NA
24 502 tcp NA 2
25 514 tcp 2 2
28 548 tcp NA 3
31 789 tcp NA 1
32 80 tcp 170 215
33 81 tcp NA 2
34 84 tcp 1 NA
35 88 tcp NA 42
37 902 tcp 1 1
39 91 tcp 2 1
40 993 tcp NA 4
41 995 tcp 2 2
$udp
ports protocol freq_2016-04-24 00:00:00 freq_2016-04-24 01:00:00
4 123 udp 28 19
6 137 udp 24 15
7 138 udp 6 4
10 161 udp 138 85
17 389 udp NA 2
19 443 udp NA 2
23 500 udp 1 NA
26 514 udp 24 24
27 53 udp 45 42
29 623 udp 6 1
30 69 udp 5 15
36 88 udp 4 2
38 902 udp 26 27
Update:
If you want to sort by freq, you could do:
data.table::melt(sample_list) %>%
  separate(L1, into = c("time", "protocol"), sep = "\\.") %>%
  unite(f, variable, time) %>%
  spread(f, value) %>%
  arrange(protocol, desc(`freq_2016-04-24 00:00:00`)) %>%
  split(.$protocol)

For the rbinding you can try the following:
do.call(rbind, sample_list[grep("tcp", names(sample_list))])
and:
do.call(rbind, sample_list[grep("udp", names(sample_list))])
and as refined by Marat below:
library(reshape2)  # for dcast (data.table's dcast works too)
d <- do.call(rbind, sample_list)
d2 <- data.frame(d, do.call(rbind, strsplit(rownames(d), "[.]")))
lapply(split(d2, d2$X2), dcast, ports ~ X1, value.var = "freq")
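A base-R alternative that produces the desired two-element list directly, sketched here under the assumption that only the two hourly frames per protocol shown above are involved, is to outer-merge on ports within each protocol (merge suffixes the freq columns as freq.x/freq.y; rename them as needed):
prots <- c("tcp", "udp")
final_list <- setNames(lapply(prots, function(p) {
  frames <- sample_list[grep(p, names(sample_list), fixed = TRUE)]
  Reduce(function(a, b) merge(a, b, by = "ports", all = TRUE), frames)  # outer join on ports
}), prots)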

You can also merge by an ID: create an ID for each row of every data frame in the list, then merge them all on that ID. Because the merge key is the row ID, no overlapping occurs, and each list element is treated as a data frame in its own right. Roughly:
sample_list <- lapply(sample_list, function(x) cbind(ID = seq_len(nrow(x)), x))
newx <- Reduce(function(a, b) merge(a, b, by = "ID"), sample_list)
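As for the bonus question: int [1:16(1d)] is how str() displays a one-dimensional array, i.e. an integer vector carrying a dim attribute of length 1 (the .Dim = 16L visible in the dput() output). table() returns exactly this kind of object, so freq was presumably built with table() inside dest.ports (a guess, since that function isn't shown). A small demonstration:
x <- array(1:3)    # an integer vector with a length-1 dim attribute
str(x)             # int [1:3(1d)] 1 2 3
str(as.vector(x))  # int [1:3] 1 2 3 (as.vector() drops the dim)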

Related

R script to average values every X days using different starting points

I have a long data set measuring height of trees once a week for 8 months. Also recorded are pot number ('pot'), the date of measuring ('date'), weeks since the start of the experiment ('no.week'), germination date ('germination'), weeks since germination ('after.germ').
I want to average tree height over 3-week periods, starting at the week of germination.
For example, the experiment started on 3/25. Pot 3 germinated on 4/15 (no.week = 2) and pot 4 germinated on 4/29 (no.week = 4). I want to average the height of pot 3 starting on 4/15 and of pot 4 starting on 4/29, and continue averaging every 3 weeks for the duration of the experiment.
The key is starting the average at different points for each pot.
Any advice and tips would be great!
Subset:
pot table germination week no.week after.germ date height stem
61 3 2 4/15/2022 w1 1 NA 3/25/2022 NA NA
62 3 2 4/15/2022 w2 2 NA 4/15/2022 NA NA
63 3 2 4/15/2022 w3 3 1 4/22/2022 4.6 NA
64 3 2 4/15/2022 w4 4 2 4/29/2022 18.5 NA
65 3 2 4/15/2022 w5 5 3 5/6/2022 18.1 1
66 3 2 4/15/2022 w6 6 4 5/13/2022 18.1 1
67 3 2 4/15/2022 w7 7 5 5/20/2022 17.8 1
68 3 2 4/15/2022 w8 8 6 5/26/2022 19.4 1
69 3 2 4/15/2022 w9 9 7 6/3/2022 18.8 1
70 3 2 4/15/2022 w10 10 8 6/10/2022 19.3 1
71 3 2 4/15/2022 w11 11 9 6/17/2022 18.3 1
72 3 2 4/15/2022 w12 12 10 6/24/2022 18.6 1
73 3 2 4/15/2022 w13 13 11 7/1/2022 19.2 1
74 3 2 4/15/2022 w14 14 12 7/8/2022 19.2 1
75 3 2 4/15/2022 w15 15 13 7/15/2022 18.9 1
76 3 2 4/15/2022 w16 16 14 7/22/2022 15.3 1
77 3 2 4/15/2022 w17 17 15 7/29/2022 19.1 1
78 3 2 4/15/2022 w18 18 16 8/5/2022 19.0 1
79 3 2 4/15/2022 w19 19 17 8/12/2022 19.0 1
80 3 2 4/15/2022 w20 20 18 8/19/2022 19.8 1
81 3 2 4/15/2022 w21 21 19 8/26/2022 18.2 1
82 3 2 4/15/2022 w22 22 20 9/2/2022 19.2 1
83 3 2 4/15/2022 w24 24 21 9/16/2022 18.1 1
84 3 2 4/15/2022 w23 23 22 9/22/2022 19.2 1
85 3 2 4/15/2022 w25 25 23 9/30/2022 15.4 1
86 3 2 4/15/2022 w26 26 24 10/7/2022 18.4 1
87 3 2 4/15/2022 w27 27 25 10/14/2022 19.2 1
88 3 2 4/15/2022 w28 28 26 10/21/2022 19.0 1
89 3 2 4/15/2022 w29 29 27 10/29/2022 18.7 1
90 3 2 4/15/2022 w30 30 28 11/4/2022 19.3 1
91 6 4 4/29/2022 w1 1 NA 3/25/2022 NA NA
92 6 4 4/29/2022 w2 2 NA 4/15/2022 NA NA
93 6 4 4/29/2022 w3 3 NA 4/22/2022 NA NA
94 6 4 4/29/2022 w4 4 1 4/29/2022 16.7 NA
95 6 4 4/29/2022 w5 5 2 5/6/2022 17.5 1
96 6 4 4/29/2022 w6 6 3 5/13/2022 18.8 NA
97 6 4 4/29/2022 w7 7 4 5/20/2022 18.0 NA
98 6 4 4/29/2022 w8 8 5 5/26/2022 17.2 NA
99 6 4 4/29/2022 w9 9 6 6/3/2022 17.7 NA
100 6 4 4/29/2022 w10 10 7 6/10/2022 17.9 NA
101 6 4 4/29/2022 w11 11 8 6/17/2022 18.7 NA
102 6 4 4/29/2022 w12 12 9 6/24/2022 18.1 NA
103 6 4 4/29/2022 w13 13 10 7/1/2022 17.3 NA
104 6 4 4/29/2022 w14 14 11 7/8/2022 13.8 NA
105 6 4 4/29/2022 w15 15 12 7/15/2022 18.4 1
106 6 4 4/29/2022 w16 16 13 7/22/2022 19.0 1
107 6 4 4/29/2022 w17 17 14 7/29/2022 18.8 1
108 6 4 4/29/2022 w18 18 15 8/5/2022 NA 1
109 6 4 4/29/2022 w19 19 16 8/12/2022 19.0 1
110 6 4 4/29/2022 w20 20 17 8/19/2022 19.3 1
111 6 4 4/29/2022 w21 21 18 8/26/2022 18.6 1
112 6 4 4/29/2022 w22 22 19 9/2/2022 18.2 1
113 6 4 4/29/2022 w24 24 20 9/16/2022 18.0 1
114 6 4 4/29/2022 w23 23 21 9/22/2022 18.8 1
115 6 4 4/29/2022 w25 25 22 9/30/2022 19.7 1
116 6 4 4/29/2022 w26 26 23 10/7/2022 17.4 1
117 6 4 4/29/2022 w27 27 24 10/14/2022 18.8 1
118 6 4 4/29/2022 w28 28 25 10/21/2022 19.9 1
119 6 4 4/29/2022 w29 29 26 10/29/2022 17.9 1
120 6 4 4/29/2022 w30 30 27 11/4/2022 19.5 1
211 10 2 4/29/2022 w1 1 NA 3/25/2022 NA NA
212 10 2 4/29/2022 w2 2 NA 4/15/2022 NA NA
213 10 2 4/29/2022 w3 3 NA 4/22/2022 NA NA
214 10 2 4/29/2022 w4 4 NA 4/29/2022 NA NA
215 10 2 4/29/2022 w5 5 1 5/6/2022 9.5 1
216 10 2 4/29/2022 w6 6 2 5/13/2022 15.4 NA
217 10 2 4/29/2022 w7 7 3 5/20/2022 14.3 NA
218 10 2 4/29/2022 w8 8 4 5/26/2022 15.8 NA
219 10 2 4/29/2022 w9 9 5 6/3/2022 16.1 NA
220 10 2 4/29/2022 w10 10 6 6/10/2022 16.1 NA
221 10 2 4/29/2022 w11 11 7 6/17/2022 15.9 NA
222 10 2 4/29/2022 w12 12 8 6/24/2022 16.3 NA
223 10 2 4/29/2022 w13 13 9 7/1/2022 16.2 NA
224 10 2 4/29/2022 w14 14 10 7/8/2022 16.4 NA
225 10 2 4/29/2022 w15 15 11 7/15/2022 15.7 1
226 10 2 4/29/2022 w16 16 12 7/22/2022 15.5 1
227 10 2 4/29/2022 w17 17 13 7/29/2022 15.7 1
228 10 2 4/29/2022 w18 18 14 8/5/2022 15.5 1
229 10 2 4/29/2022 w19 19 15 8/12/2022 16.0 1
230 10 2 4/29/2022 w20 20 16 8/19/2022 15.9 1
231 10 2 4/29/2022 w21 21 17 8/26/2022 15.7 1
232 10 2 4/29/2022 w22 22 18 9/2/2022 15.5 1
233 10 2 4/29/2022 w24 24 19 9/16/2022 15.1 1
234 10 2 4/29/2022 w23 23 20 9/22/2022 15.8 1
235 10 2 4/29/2022 w25 25 21 9/30/2022 15.8 1
236 10 2 4/29/2022 w26 26 22 10/7/2022 15.1 1
237 10 2 4/29/2022 w27 27 23 10/14/2022 15.9 1
238 10 2 4/29/2022 w28 28 24 10/21/2022 16.5 1
239 10 2 4/29/2022 w29 29 25 10/29/2022 15.7 1
240 10 2 4/29/2022 w30 30 26 11/4/2022 16.2 1
271 14 2 4/15/2022 w1 1 NA 3/25/2022 NA NA
272 14 2 4/15/2022 w2 2 NA 4/15/2022 NA NA
273 14 2 4/15/2022 w3 3 1 4/22/2022 5.8 NA
274 14 2 4/15/2022 w4 4 2 4/29/2022 19.7 NA
275 14 2 4/15/2022 w5 5 3 5/6/2022 20.1 1
276 14 2 4/15/2022 w6 6 4 5/13/2022 19.4 1
277 14 2 4/15/2022 w7 7 5 5/20/2022 20.0 1
278 14 2 4/15/2022 w8 8 6 5/26/2022 19.6 1
279 14 2 4/15/2022 w9 9 7 6/3/2022 19.6 1
280 14 2 4/15/2022 w10 10 8 6/10/2022 20.2 1
281 14 2 4/15/2022 w11 11 9 6/17/2022 21.1 1
282 14 2 4/15/2022 w12 12 10 6/24/2022 21.3 1
283 14 2 4/15/2022 w13 13 11 7/1/2022 19.4 NA
284 14 2 4/15/2022 w14 14 12 7/8/2022 20.3 NA
285 14 2 4/15/2022 w15 15 13 7/15/2022 19.5 1
286 14 2 4/15/2022 w16 16 14 7/22/2022 19.3 1
287 14 2 4/15/2022 w17 17 15 7/29/2022 22.4 1
288 14 2 4/15/2022 w18 18 16 8/5/2022 20.0 1
289 14 2 4/15/2022 w19 19 17 8/12/2022 20.0 1
290 14 2 4/15/2022 w20 20 18 8/19/2022 20.4 1
291 14 2 4/15/2022 w21 21 19 8/26/2022 19.6 1
I calculated the rolling average, but a rolling average isn't quite what I am looking for since I need an average over a distinct period and starting at different points.
library(zoo)
library(dplyr)
cg22_avg <- cg22_long %>%
  group_by(pot) %>%
  mutate(`3wkavg` = rollmean(height, 3, align = "right", na.pad = TRUE))
Perhaps this use of the slider package will help:
library(dplyr)
quux %>%
  mutate(across(c(germination, date), ~ as.Date(., format = "%m/%d/%Y"))) %>%
  dplyr::filter(date >= germination) %>%
  group_by(pot) %>%
  mutate(avg3 = slider::slide_period_dbl(.x = height, .i = date, .period = "week", .f = mean, .before = 3)) %>%
  ungroup()
# # A tibble: 103 × 10
# pot table germination week no.week after.germ date height stem avg3
# <int> <int> <date> <chr> <int> <int> <date> <dbl> <int> <dbl>
# 1 3 2 2022-04-15 w2 2 NA 2022-04-15 NA NA NA
# 2 3 2 2022-04-15 w3 3 1 2022-04-22 4.6 NA NA
# 3 3 2 2022-04-15 w4 4 2 2022-04-29 18.5 NA NA
# 4 3 2 2022-04-15 w5 5 3 2022-05-06 18.1 1 NA
# 5 3 2 2022-04-15 w6 6 4 2022-05-13 18.1 1 14.8
# 6 3 2 2022-04-15 w7 7 5 2022-05-20 17.8 1 18.1
# 7 3 2 2022-04-15 w8 8 6 2022-05-26 19.4 1 18.4
# 8 3 2 2022-04-15 w9 9 7 2022-06-03 18.8 1 18.5
# 9 3 2 2022-04-15 w10 10 8 2022-06-10 19.3 1 18.8
# 10 3 2 2022-04-15 w11 11 9 2022-06-17 18.3 1 19.0
# # … with 93 more rows
# # ℹ Use `print(n = ...)` to see more rows
I chose .period="week", .before=3, which should include all observations within exactly 3 weeks. An alternative would be .period="day", .before=20 if you want 3 weeks but not including 3 weeks ago from today. There should be plenty of room to play here.
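If a fixed, non-overlapping 3-week average anchored at germination is what is really wanted rather than a sliding window, a minimal sketch using the existing after.germ column (assuming it counts weeks since germination, as the subset suggests) could be:
library(dplyr)
quux %>%
  filter(!is.na(after.germ)) %>%
  mutate(block = (after.germ - 1) %/% 3) %>%  # weeks 1-3 -> block 0, weeks 4-6 -> block 1, ...
  group_by(pot, block) %>%
  summarise(avg3 = mean(height, na.rm = TRUE), .groups = "drop")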
Data
quux <- structure(list(pot = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L), table = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), germination = c("4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/29/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022", "4/15/2022"), week = c("w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w24", "w23", "w25", "w26", "w27", "w28", "w29", "w30", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w24", "w23", "w25", "w26", "w27", "w28", "w29", "w30", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w24", "w23", "w25", "w26", "w27", "w28", "w29", "w30", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21"), no.week = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L, 23L, 25L, 26L, 27L, 28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L, 23L, 25L, 26L, 27L, 28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 
16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L, 23L, 25L, 26L, 27L, 28L, 29L, 30L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L), after.germ = c(NA, NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, NA, NA, NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, NA, NA, NA, NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, NA, NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), date = c("3/25/2022", "4/15/2022", "4/22/2022", "4/29/2022", "5/6/2022", "5/13/2022", "5/20/2022", "5/26/2022", "6/3/2022", "6/10/2022", "6/17/2022", "6/24/2022", "7/1/2022", "7/8/2022", "7/15/2022", "7/22/2022", "7/29/2022", "8/5/2022", "8/12/2022", "8/19/2022", "8/26/2022", "9/2/2022", "9/16/2022", "9/22/2022", "9/30/2022", "10/7/2022", "10/14/2022", "10/21/2022", "10/29/2022", "11/4/2022", "3/25/2022", "4/15/2022", "4/22/2022", "4/29/2022", "5/6/2022", "5/13/2022", "5/20/2022", "5/26/2022", "6/3/2022", "6/10/2022", "6/17/2022", "6/24/2022", "7/1/2022", "7/8/2022", "7/15/2022", "7/22/2022", "7/29/2022", "8/5/2022", "8/12/2022", "8/19/2022", "8/26/2022", "9/2/2022", "9/16/2022", "9/22/2022", "9/30/2022", "10/7/2022", "10/14/2022", "10/21/2022", "10/29/2022", "11/4/2022", "3/25/2022", "4/15/2022", "4/22/2022", "4/29/2022", "5/6/2022", "5/13/2022", "5/20/2022", "5/26/2022", "6/3/2022", "6/10/2022", "6/17/2022", "6/24/2022", "7/1/2022", "7/8/2022", "7/15/2022", "7/22/2022", "7/29/2022", "8/5/2022", "8/12/2022", "8/19/2022", "8/26/2022", "9/2/2022", "9/16/2022", "9/22/2022", "9/30/2022", "10/7/2022", "10/14/2022", "10/21/2022", "10/29/2022", "11/4/2022", "3/25/2022", "4/15/2022", "4/22/2022", "4/29/2022", "5/6/2022", "5/13/2022", "5/20/2022", "5/26/2022", "6/3/2022", "6/10/2022", "6/17/2022", "6/24/2022", "7/1/2022", "7/8/2022", "7/15/2022", "7/22/2022", "7/29/2022", "8/5/2022", "8/12/2022", "8/19/2022", "8/26/2022"), height = c(NA, NA, 4.6, 18.5, 18.1, 18.1, 17.8, 19.4, 18.8, 19.3, 18.3, 18.6, 19.2, 19.2, 18.9, 15.3, 19.1, 19, 19, 19.8, 18.2, 19.2, 18.1, 19.2, 15.4, 18.4, 19.2, 19, 18.7, 19.3, NA, NA, NA, 16.7, 17.5, 18.8, 18, 17.2, 17.7, 17.9, 18.7, 18.1, 17.3, 13.8, 18.4, 19, 18.8, NA, 19, 19.3, 18.6, 18.2, 18, 18.8, 19.7, 17.4, 18.8, 19.9, 17.9, 19.5, NA, NA, NA, NA, 9.5, 15.4, 14.3, 15.8, 16.1, 16.1, 15.9, 16.3, 16.2, 16.4, 15.7, 15.5, 15.7, 15.5, 16, 15.9, 15.7, 15.5, 15.1, 15.8, 15.8, 15.1, 15.9, 16.5, 15.7, 16.2, NA, NA, 5.8, 19.7, 20.1, 19.4, 20, 19.6, 19.6, 20.2, 21.1, 21.3, 19.4, 20.3, 19.5, 19.3, 22.4, 20, 20, 20.4, 19.6), stem = c(NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c("61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "100", "101", "102", "103", "104", "105", "106", "107", "108", 
"109", "110", "111", "112", "113", "114", "115", "116", "117", "118", "119", "120", "211", "212", "213", "214", "215", "216", "217", "218", "219", "220", "221", "222", "223", "224", "225", "226", "227", "228", "229", "230", "231", "232", "233", "234", "235", "236", "237", "238", "239", "240", "271", "272", "273", "274", "275", "276", "277", "278", "279", "280", "281", "282", "283", "284", "285", "286", "287", "288", "289", "290", "291"))

using tally to filter data in R

I have generated a contingency table using tally(); however, there is some invalid data:
structure(c(0L, 5L, 0L, 1L, 2L, 4L, 3L, 2L, 0L, 3L, 11L, 7L,
3L, 6L, 3L, 26L, 27L, 16L, 5L, 4L, 69L, 78L, 36L, 19L, 12L, 88L,
133L, 70L, 29L, 25L, 84L, 132L, 58L, 24L, 23L, 44L, 52L, 25L,
16L, 3L), .Dim = c(5L, 8L), .Dimnames = list(D5 = c("1", "2",
"3", "4", NA), X4.1 = c("-9", "1", "2", "3", "4", "5", "6", "7"
)), class = "table")
I need to remove the rows and columns containing the invalid data: the first column and the last row. I have tried using the %>% operator, but I keep getting errors.
Any tips?
It is a table object and not a data.frame.
> str(df1)
'table' int [1:5, 1:8] 0 5 0 1 2 4 3 2 0 3 ...
- attr(*, "dimnames")=List of 2
..$ D5 : chr [1:5] "1" "2" "3" "4" ...
..$ X4.1: chr [1:8] "-9" "1" "2" "3" ...
So we can use indexing to do the removal: nrow(df1) returns the number of rows, so -nrow(df1) as the row index removes the last row, and -1 as the column index removes the first column.
df2 <- df1[-nrow(df1), -1]
df2
X4.1
D5 1 2 3 4 5 6 7
1 4 11 26 69 88 84 44
2 3 7 27 78 133 132 52
3 2 3 16 36 70 58 25
4 0 6 5 19 29 24 16
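If a data frame is needed afterwards (the %>% errors suggest dplyr verbs were being applied to the table directly), one option, as a sketch, is to convert the trimmed table first:
df3 <- as.data.frame.matrix(df2)  # wide: keeps the rectangular layout, D5 levels as row names
df4 <- as.data.frame(df2)         # long: one row per cell (D5, X4.1, Freq)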

delete observations by days in R

My dataset has the following structure:
df=structure(list(Data = structure(c(12L, 13L, 14L, 15L, 16L, 17L,
18L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), .Label = c("01.01.2018",
"02.01.2018", "03.01.2018", "04.01.2018", "05.01.2018", "06.01.2018",
"07.01.2018", "12.02.2018", "13.02.2018", "14.02.2018", "15.02.2018",
"25.12.2017", "26.12.2017", "27.12.2017", "28.12.2017", "29.12.2017",
"30.12.2017", "31.12.2017"), class = "factor"), sku = 1:18, metric = c(100L,
210L, 320L, 430L, 540L, 650L, 760L, 870L, 980L, 1090L, 1200L,
1310L, 1420L, 1530L, 1640L, 1750L, 1860L, 1970L), action = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), .Names = c("Data", "sku", "metric", "action"), class = "data.frame", row.names = c(NA,
-18L))
I need to delete observations that have certain dates.
But the dataset also has an action variable, which takes only the values 0 and 1.
Observations on these dates should be deleted only where action is 0.
The dates are provided in a separate dataset:
datedata=structure(list(Data = structure(c(18L, 19L, 20L, 21L, 22L, 5L,
7L, 9L, 11L, 13L, 15L, 17L, 23L, 1L, 2L, 3L, 4L, 6L, 8L, 10L,
12L, 14L, 16L), .Label = c("01.05.2018", "02.05.2018", "03.05.2018",
"04.05.2018", "05.03.2018", "05.05.2018", "06.03.2018", "06.05.2018",
"07.03.2018", "07.05.2018", "08.03.2018", "08.05.2018", "09.03.2018",
"09.05.2018", "10.03.2018", "10.05.2018", "11.03.2018", "21.02.2018",
"22.02.2018", "23.02.2018", "24.02.2018", "25.02.2018", "30.04.2018"
), class = "factor")), .Names = "Data", class = "data.frame", row.names = c(NA,
-23L))
How can I do it?
A solution is to use dplyr::filter as:
library(dplyr)
library(lubridate)
df %>%
  mutate(Data = dmy(Data)) %>%
  filter(action == 1 | (action == 0 & !(Data %in% dmy(datedata$Data))))
# Data sku metric action
# 1 2017-12-25 1 100 0
# 2 2017-12-26 2 210 0
# 3 2017-12-27 3 320 0
# 4 2017-12-28 4 430 0
# 5 2017-12-29 5 540 0
# 6 2017-12-30 6 650 0
# 7 2017-12-31 7 760 0
# 8 2018-01-01 8 870 0
# 9 2018-01-02 9 980 1
# 10 2018-01-03 10 1090 1
# 11 2018-01-04 11 1200 1
# 12 2018-01-05 12 1310 1
# 13 2018-01-06 13 1420 1
# 14 2018-01-07 14 1530 1
# 15 2018-02-12 15 1640 1
# 16 2018-02-13 16 1750 1
# 17 2018-02-14 17 1860 1
# 18 2018-02-15 18 1970 1
I guess this will work. First use match to see whether each date in df has a match in datedata, then filter out the matched rows that have action == 0 (keeping rows with no match or with action == 1):
library(dplyr)
df <- df %>%
  mutate(Data.flag = match(Data, datedata$Data)) %>%
  filter(is.na(Data.flag) | action == 1)
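For reference, the same filter can be written in base R without parsing any dates, since df$Data and datedata$Data share the dd.mm.yyyy format:
# keep a row unless action is 0 and its date appears in datedata
df[!(df$action == 0 & df$Data %in% datedata$Data), ]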

Sum correlated variables

I have a list of 200 variables and I want to sum those that are highly correlated.
Assuming this is my data
mydata <- structure(list(APPLE= c(1L, 2L, 5L, 4L, 366L, 65L, 43L, 456L, 876L, 78L, 687L, 378L, 378L, 34L, 53L, 43L),
PEAR= c(2L, 2L, 5L, 4L, 366L, 65L, 43L, 456L, 876L, 78L, 687L, 378L, 378L, 34L, 53L, 41L),
PLUM = c(10L, 20L, 10L, 20L, 10L, 20L, 1L, 0L, 1L, 2010L,20L, 10L, 10L, 10L, 10L, 10L),
BANANA= c(2L, 10L, 31L, 2L, 2L, 5L, 2L, 5L, 1L, 52L, 1L, 2L, 52L, 6L, 2L, 1L),
LEMON = c(4L, 10L, 31L, 2L, 2L, 5L, 2L, 5L, 1L, 52L, 1L, 2L, 52L, 6L, 2L, 3L)),
.Names = c("APPLE", "PEAR", "PLUM", "BANANA", "LEMON"),
class = "data.frame", row.names = c(NA,-16L))
I have found this code, which I am not sure how to tweak in order to leverage it for my purpose:
https://stackoverflow.com/a/39484353/4797853
var.corelation <- cor(as.matrix(mydata), method = "pearson")
library(igraph)
# prevent duplicated pairs
var.corelation <- var.corelation * lower.tri(var.corelation)
check.corelation <- which(var.corelation > 0.62, arr.ind = TRUE)
graph.cor <- graph.data.frame(check.corelation, directed = FALSE)
groups.cor <- split(unique(as.vector(check.corelation)), clusters(graph.cor)$membership)
lapply(groups.cor, FUN = function(list.cor) rownames(var.corelation)[list.cor])
The output that I am looking for is 2 data frames, as follows:
DF1
GROUP1 GROUP2
3 16
4 40
ETC..
The values are the row-wise sums of the variables within each group.
DF2
ORIGINAL_VAR GROUP
APPLE 1
PEAR 1
PLUM 2
BANANA 2
LEMON 2
Try this (assuming that you have only clustered into 2 groups):
DF1 <- cbind.data.frame(GROUP1 = rowSums(mydata[, groups.cor[[1]]]),
                        GROUP2 = rowSums(mydata[, groups.cor[[2]]]))
DF1
GROUP1 GROUP2
1 3 16
2 4 40
3 10 72
4 8 24
5 732 14
6 130 30
7 86 5
8 912 10
9 1752 3
10 156 2114
11 1374 22
12 756 14
13 756 114
14 68 22
15 106 14
16 84 14
DF2 <- NULL
for (i in 1:2) {
  DF2 <- rbind(DF2,
               cbind.data.frame(ORIGINAL_VAR = rownames(var.corelation)[groups.cor[[i]]],
                                GROUP = i))
}
DF2
ORIGINAL_VAR GROUP
1 PEAR 1
2 APPLE 1
3 BANANA 2
4 LEMON 2
5 PLUM 2
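With 200 variables the number of correlation groups won't be known in advance, so here is a sketch generalizing the same idea to any number of groups (still relying on groups.cor from above):
# one summed column per correlation group
DF1 <- as.data.frame(sapply(groups.cor, function(g) rowSums(mydata[, g, drop = FALSE])))
names(DF1) <- paste0("GROUP", seq_along(groups.cor))
# variable-to-group lookup table
DF2 <- data.frame(ORIGINAL_VAR = unlist(lapply(groups.cor, function(g) rownames(var.corelation)[g])),
                  GROUP = rep(seq_along(groups.cor), lengths(groups.cor)))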

Find max values of a column in a time series dataset [duplicate]

This question already has answers here:
Extract the maximum value within each group in a dataframe [duplicate]
(3 answers)
Closed 6 years ago.
I have a time series dataset DF where the first column is the timestep and the second column is cellNo.. How can I drop all rows except the row with max(DF$cellNo.) for each timestep?
> head(DF, n=100)
timestep cellNo.
1 1 1
2 1 2
3 1 3
4 1 4
5 1 5
6 1 6
7 1 7
8 1 8
9 1 9
10 1 10
11 1 11
12 1 12
13 1 13
14 1 14
15 1 15
16 1 16
17 1 17
18 1 18
19 1 19
20 1 20
21 1 21
22 1 22
23 1 23
24 1 24
25 1 25
26 1 26
27 1 27
28 1 28
29 1 29
30 1 30
31 1 31
32 1 32
33 2 1
34 2 2
35 2 3
36 2 4
37 2 5
38 2 6
39 2 7
40 2 8
41 2 9
42 2 10
43 2 11
44 2 12
45 2 13
46 2 14
47 2 15
48 2 16
49 2 17
50 2 18
51 2 19
52 2 20
53 2 21
54 2 22
55 2 23
56 2 24
57 2 25
58 2 26
59 2 27
60 2 28
61 2 29
62 2 30
63 2 31
64 2 32
65 3 1
66 3 2
67 3 3
68 3 4
69 3 5
70 3 6
71 3 7
72 3 8
73 3 9
74 3 10
75 3 11
76 3 12
77 3 13
78 3 14
79 3 15
80 3 16
81 3 17
82 3 18
83 3 19
84 3 20
85 3 21
86 3 22
87 3 23
88 3 24
89 3 25
90 3 26
91 3 27
92 3 28
93 3 29
94 3 30
95 3 31
96 3 32
97 4 1
98 4 2
99 4 3
100 4 4
If you want only max(cellNo.) per timestep, you could do:
aggregate(cellNo.~timestep, DF, max)
# timestep cellNo.
# 1 1 32
# 2 2 32
# 3 3 32
# 4 4 4
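If the goal is literally to drop rows, i.e. keep the whole winning row (which matters once DF has more columns), a base-R sketch is:
# TRUE where cellNo. equals the per-timestep maximum
keep <- as.logical(ave(DF$cellNo., DF$timestep, FUN = function(x) x == max(x)))
DF[keep, ]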
Try this
# dput your data
df <- structure(list(timestep = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), cellNo. = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 1L, 2L, 3L, 4L)), .Names = c("timestep", "cellNo."), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100"))
library(dplyr)
df %>% group_by(timestep) %>% summarise(max = max(cellNo.))
#Source: local data frame [4 x 2]
#timestep max
# (int) (int)
#1 1 32
#2 2 32
#3 3 32
#4 4 4
With data.table (using the df defined above):
library(data.table)
setDT(df)[, .(Max = max(cellNo.)), timestep]
