This question already has answers here:
Extract the maximum value within each group in a dataframe [duplicate]
(3 answers)
Closed 6 years ago.
I have a time series dataset DF where the first column is the timestep and the second column is cellNo. (the cell number). How can I drop all rows except the one with max(DF$cellNo.) for each timestep?
> head(DF, n=100)
timestep cellNo.
1 1 1
2 1 2
3 1 3
4 1 4
5 1 5
6 1 6
7 1 7
8 1 8
9 1 9
10 1 10
11 1 11
12 1 12
13 1 13
14 1 14
15 1 15
16 1 16
17 1 17
18 1 18
19 1 19
20 1 20
21 1 21
22 1 22
23 1 23
24 1 24
25 1 25
26 1 26
27 1 27
28 1 28
29 1 29
30 1 30
31 1 31
32 1 32
33 2 1
34 2 2
35 2 3
36 2 4
37 2 5
38 2 6
39 2 7
40 2 8
41 2 9
42 2 10
43 2 11
44 2 12
45 2 13
46 2 14
47 2 15
48 2 16
49 2 17
50 2 18
51 2 19
52 2 20
53 2 21
54 2 22
55 2 23
56 2 24
57 2 25
58 2 26
59 2 27
60 2 28
61 2 29
62 2 30
63 2 31
64 2 32
65 3 1
66 3 2
67 3 3
68 3 4
69 3 5
70 3 6
71 3 7
72 3 8
73 3 9
74 3 10
75 3 11
76 3 12
77 3 13
78 3 14
79 3 15
80 3 16
81 3 17
82 3 18
83 3 19
84 3 20
85 3 21
86 3 22
87 3 23
88 3 24
89 3 25
90 3 26
91 3 27
92 3 28
93 3 29
94 3 30
95 3 31
96 3 32
97 4 1
98 4 2
99 4 3
100 4 4
If you want only max(cellNo.) per timestep, you could do:
aggregate(cellNo.~timestep, DF, max)
# timestep cellNo.
# 1 1 32
# 2 2 32
# 3 3 32
# 4 4 4
Try this
# dput your data
df <- structure(list(timestep = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), cellNo. = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 1L, 2L, 3L, 4L)), .Names = c("timestep", "cellNo."), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100"))
library(dplyr)
df %>% group_by(timestep) %>% summarise(max = max(cellNo.))
#Source: local data frame [4 x 2]
#timestep max
# (int) (int)
#1 1 32
#2 2 32
#3 3 32
#4 4 4
With data.table
library(data.table)
# `df` is the data frame built from the dput() output earlier in this thread;
# no object called `df1` exists here. setDT() converts `df` to a data.table by
# reference, then the maximum cellNo. is computed for each timestep.
setDT(df)[, .(Max = max(cellNo.)), by = timestep]
Related
I've used the following code to create a frequency count.
df %>% group_by(INCOME, HAPPY) %>% summarise(count=n())
Output:
INCOME HAPPY count
<int> <int> <int>
1 1 1 6
2 1 2 17
3 1 3 13
4 1 8 1
5 2 1 5
6 2 2 11
7 2 3 12
8 2 8 0
9 3 1 4
10 3 2 10
11 3 3 5
12 3 8 0
Yet, I would like to have the following frequency format.
1 2 3
1 6 5 4
2 17 11 10
3 13 12 5
8 1 0 0
Using xtabs from base R
xtabs(count ~ HAPPY + INCOME, df1)
INCOME
HAPPY 1 2 3
1 6 5 4
2 17 11 10
3 13 12 5
8 1 0 0
data
df1 <- structure(list(INCOME = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L), HAPPY = c(1L, 2L, 3L, 8L, 1L, 2L, 3L, 8L, 1L, 2L,
3L, 8L), count = c(6L, 17L, 13L, 1L, 5L, 11L, 12L, 0L, 4L, 10L,
5L, 0L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12"))
After your code: df %>% group_by(INCOME, HAPPY) %>% summarise(count=n())
You could use this code to achieve your task:
library(dplyr)
library(tidyr)
library(tibble)
# Reshape the long (INCOME, HAPPY, count) table into a HAPPY x INCOME matrix-like
# data frame with HAPPY as row names.
df %>%
  # Drop any grouping left over from group_by()/summarise() so the reshape
  # operates on the whole table at once.
  ungroup() %>%
  pivot_wider(
    # Name id_cols explicitly: passing it by position is deprecated in tidyr.
    id_cols = HAPPY,
    # Use the INCOME values themselves as the new column names rather than a
    # group index derived from row positions (which assumed exactly four
    # rows per INCOME level, in sorted order).
    names_from = INCOME,
    values_from = count
  ) %>%
  column_to_rownames("HAPPY")
1 2 3
1 6 5 4
2 17 11 10
3 13 12 5
8 1 0 0
data:
structure(list(INCOME = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L), HAPPY = c(1L, 2L, 3L, 8L, 1L, 2L, 3L, 8L, 1L, 2L,
3L, 8L), count = c(6L, 17L, 13L, 1L, 5L, 11L, 12L, 0L, 4L, 10L,
5L, 0L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12"))
I think this can be simplified to -
library(dplyr)
library(tidyr)
df %>%
count(INCOME, HAPPY) %>%
pivot_wider(names_from = INCOME, values_from = n)
I have a dataset that looks like this:
year month quarter measurement
1 2005 1 1 33
2 2005 2 1 5
3 2005 4 2 3
4 2005 5 2 7
5 2005 6 2 10
6 2005 7 3 4
7 2005 10 4 9
8 2005 10 4 6
9 2005 11 4 8
10 2005 11 4 9
11 2005 11 4 3
12 2005 11 4 9
13 2005 11 4 2
14 2006 11 4 1
15 2006 12 4 1
16 2006 12 4 6
17 2006 1 1 9
18 2006 1 1 1
19 2006 1 1 10
20 2006 7 3 2
21 2006 1 1 0
22 2007 4 2 3
23 2007 1 1 4
24 2007 2 1 5
25 2007 2 1 1
26 2007 8 3 5
27 2007 9 3 6
28 2007 2 4 6
I'd like to take the mean of all the values that share the same year value (column 1) and quarter value (column 3). The mean calculations would be performed on the measurement values (column 4). The month column (column 2) can be ignored-- I included it to show why I may have multiple Quarter 1 values for a single year.
I don't think I can use a rolling average formula, as there's a different amount of Quarter values for different years.
I believe I'd like to format my data like this:
Thank you
We can use dcast from data.table
library(data.table)
dcast(setDT(df1), quarter ~ year, value.var = 'measurement', mean)
Or with pivot_wider from tidyr
library(tidyr)
library(dplyr)
df1 %>%
select(-month) %>%
pivot_wider(names_from = year, values_from = measurement,
values_fn = list(measurement = mean))
data
df1 <- structure(list(year = c(2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L), month = c(1L, 2L, 4L, 5L, 6L, 7L,
10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 1L, 1L, 1L,
7L, 1L, 4L, 1L, 2L, 2L, 8L, 9L, 2L), quarter = c(1L, 1L, 2L,
2L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L,
3L, 1L, 2L, 1L, 1L, 1L, 3L, 3L, 4L), measurement = c(33L, 5L,
3L, 7L, 10L, 4L, 9L, 6L, 8L, 9L, 3L, 9L, 2L, 1L, 1L, 6L, 9L,
1L, 10L, 2L, 0L, 3L, 4L, 5L, 1L, 5L, 6L, 6L)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28"))
This question already has answers here:
How to group by two columns in R
(4 answers)
Closed 4 years ago.
I have a dataframe with 5 columns. I know how to calculate the mean for one column grouped by another column. However, I need to group it by two columns. For example, I want to calculate the mean for column 5 grouped by column 1 and column 2.
df <- structure(list(Country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("AT", "CH", "DE"), class = "factor"),
Occupation = c(1L, 3L, 5L, 3L, 1L, 2L, 5L, 3L, 5L, 3L, 1L,
2L, 1L, 5L, 3L, 3L, 1L, 3L, 2L, 5L, 5L, 1L, 2L, 1L, 3L),
Age = c(20L, 46L, 30L, 12L, 73L, 53L, 19L, 43L, 65L, 53L,
19L, 34L, 76L, 25L, 45L, 39L, 18L, 59L, 37L, 24L, 19L, 60L,
51L, 32L, 29L), Gender = structure(c(1L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L), .Label = c("female", "male"), class = "factor"),
Income = c(100L, 80L, 78L, 29L, 156L, 56L, 95L, 104L, 87L,
56L, 203L, 45L, 112L, 78L, 56L, 140L, 99L, 67L, 89L, 109L,
43L, 145L, 30L, 101L, 77L)), class = "data.frame", row.names = c(NA,
-25L))
head(df)
Country Occupation Age Gender Income
1 AT 1 20 female 100
2 AT 3 46 female 80
3 AT 5 30 male 78
4 AT 3 12 male 29
5 AT 1 73 male 156
6 AT 2 53 female 56
So what I want to do is calculate the mean for column ‘income’, grouped by country and occupation. E.g., I want to calculate the mean of ‘income’ for all those people living in country ‘AT’ with occupation ‘3’, the mean of ‘income’ for all those living in country ‘CH’ with occupation ‘1’ and so on.
(1) base method (aggregate)
mean.df <- aggregate(Income ~ Country + Occupation, df, mean)
names(mean.df)[3] <- "Income_Mean"
merge(df, mean.df)
(2) base method (tapply)
mean.df1 <- tapply(df$Income, list(df$Country, df$Occupation), mean)
mean.df2 <- as.data.frame(as.table(mean.df1))
names(mean.df2) <- c("Country", "Occupation", "Income_Mean")
merge(df, mean.df2)
(3) stats method (ave)
df2 <- df
df2$Income_Mean <- ave(df$Income, df$Country, df$Occupation)
(4) dplyr method
df %>% group_by(Country, Occupation) %>%
mutate(Income_Mean = mean(Income))
Output :
Country Occupation Age Gender Income Income_Mean
<fct> <int> <int> <fct> <int> <dbl>
1 AT 1 20 female 100 128
2 AT 3 46 female 80 71
3 AT 5 30 male 78 86.5
4 AT 3 12 male 29 71
5 AT 1 73 male 156 128
6 AT 2 53 female 56 56
7 AT 5 19 male 95 86.5
8 AT 3 43 male 104 71
9 CH 5 65 male 87 82.5
10 CH 3 53 female 56 84
# ... with 15 more rows
Using sqldf:
sqldf("select Country,Occupation,Age,Gender,avg(Income) from df group by Country,Occupation")
OR
Using data.table:
library(data.table)
# Rebind `df` as a data.table so the `[i, j, by]` syntax is available.
df <- data.table(df)
# Mean Income for every Country/Occupation pair (result column is named V1).
df[, mean(Income), by = .(Country, Occupation)]
Output:
Country Occupation Age Gender avg(Income)
1 AT 1 73 male 128.0
2 AT 2 53 female 56.0
3 AT 3 43 male 71.0
4 AT 5 19 male 86.5
5 CH 1 18 female 138.0
6 CH 2 34 male 45.0
7 CH 3 39 male 84.0
8 CH 5 25 female 82.5
9 DE 1 32 female 123.0
10 DE 2 51 female 59.5
11 DE 3 29 male 72.0
12 DE 5 19 male 76.0
I am writing the following function:
for (i in 1:272){
gh[[i]]<- c(df[i, 1],df[i, 2])}
Error: object 'gh' not found
Why is this? Why do I get this error? And how can I correct it?
with the following data structure:
'data.frame': 272 obs. of 2 variables:
$ V1: Factor w/ 17 levels "exp_var_nocorr_1",..: 1 1 1 1 1 1 1 1 1 1 ...
$ V2: Factor w/ 17 levels "exp_var_nocorr_1",..: 2 3 4 5 6 7 8 9 10 11 ...
or this, just to have it more visually, here I have the first 33 rows:
V1 V2
1 exp_var_nocorr_1 exp_var_nocorr_10
2 exp_var_nocorr_1 exp_var_nocorr_11
3 exp_var_nocorr_1 exp_var_nocorr_12
4 exp_var_nocorr_1 exp_var_nocorr_13
5 exp_var_nocorr_1 exp_var_nocorr_14
6 exp_var_nocorr_1 exp_var_nocorr_15
7 exp_var_nocorr_1 exp_var_nocorr_16
8 exp_var_nocorr_1 exp_var_nocorr_17
9 exp_var_nocorr_1 exp_var_nocorr_2
10 exp_var_nocorr_1 exp_var_nocorr_3
11 exp_var_nocorr_1 exp_var_nocorr_4
12 exp_var_nocorr_1 exp_var_nocorr_5
13 exp_var_nocorr_1 exp_var_nocorr_6
14 exp_var_nocorr_1 exp_var_nocorr_7
15 exp_var_nocorr_1 exp_var_nocorr_8
16 exp_var_nocorr_1 exp_var_nocorr_9
17 exp_var_nocorr_10 exp_var_nocorr_1
18 exp_var_nocorr_10 exp_var_nocorr_11
19 exp_var_nocorr_10 exp_var_nocorr_12
20 exp_var_nocorr_10 exp_var_nocorr_13
21 exp_var_nocorr_10 exp_var_nocorr_14
22 exp_var_nocorr_10 exp_var_nocorr_15
23 exp_var_nocorr_10 exp_var_nocorr_16
24 exp_var_nocorr_10 exp_var_nocorr_17
25 exp_var_nocorr_10 exp_var_nocorr_2
26 exp_var_nocorr_10 exp_var_nocorr_3
27 exp_var_nocorr_10 exp_var_nocorr_4
28 exp_var_nocorr_10 exp_var_nocorr_5
29 exp_var_nocorr_10 exp_var_nocorr_6
30 exp_var_nocorr_10 exp_var_nocorr_7
31 exp_var_nocorr_10 exp_var_nocorr_8
32 exp_var_nocorr_10 exp_var_nocorr_9
33 exp_var_nocorr_11 exp_var_nocorr_1
dput result:
structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("exp_var_nocorr_1",
"exp_var_nocorr_10", "exp_var_nocorr_11"), class = "factor"),
V2 = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 1L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L), .Label = c("exp_var_nocorr_1",
"exp_var_nocorr_10", "exp_var_nocorr_11", "exp_var_nocorr_12",
"exp_var_nocorr_13", "exp_var_nocorr_14", "exp_var_nocorr_15",
"exp_var_nocorr_16", "exp_var_nocorr_17", "exp_var_nocorr_2",
"exp_var_nocorr_3", "exp_var_nocorr_4", "exp_var_nocorr_5",
"exp_var_nocorr_6", "exp_var_nocorr_7", "exp_var_nocorr_8",
"exp_var_nocorr_9"), class = "factor")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33"))
The problem with the code is that you did not allocate the gh list before accessing its elements. You need to initialize a list before you can assign to its elements; please see the code below:
# Reproducible example: build a 272-row data frame of factor pairs and collect
# each row as a character vector of length 2 in the list `gh`.
set.seed(123)
smpl <- structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("exp_var_nocorr_1",
"exp_var_nocorr_10", "exp_var_nocorr_11"), class = "factor"),
V2 = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 1L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L), .Label = c("exp_var_nocorr_1",
"exp_var_nocorr_10", "exp_var_nocorr_11", "exp_var_nocorr_12",
"exp_var_nocorr_13", "exp_var_nocorr_14", "exp_var_nocorr_15",
"exp_var_nocorr_16", "exp_var_nocorr_17", "exp_var_nocorr_2",
"exp_var_nocorr_3", "exp_var_nocorr_4", "exp_var_nocorr_5",
"exp_var_nocorr_6", "exp_var_nocorr_7", "exp_var_nocorr_8",
"exp_var_nocorr_9"), class = "factor")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33"))
# data simulation: resample the example rows (with replacement) up to 272 rows
df <- smpl[sample(nrow(smpl), 272, replace = TRUE), ]
n_rows <- nrow(df)
# list memory allocation: `gh` must exist before `gh[[i]] <-` can be used
gh <- vector("list", n_rows)
# fill-in gh; both columns are factors, so convert to character explicitly,
# otherwise c() yields a factor (or integer codes on older R) instead of the
# level names
for (i in seq_len(n_rows)) {
  gh[[i]] <- c(as.character(df[i, 1]), as.character(df[i, 2]))
}
gh[1:5]
Output:
[[1]]
[1] "exp_var_nocorr_1" "exp_var_nocorr_3"
[[2]]
[1] "exp_var_nocorr_10" "exp_var_nocorr_4"
[[3]]
[1] "exp_var_nocorr_1" "exp_var_nocorr_7"
[[4]]
[1] "exp_var_nocorr_10" "exp_var_nocorr_7"
[[5]]
[1] "exp_var_nocorr_10" "exp_var_nocorr_9"
I have a list which looks like,
lapply(sample_list, head, 3)
$`2016-04-24 00:00:00.tcp`
ports freq
8 443 296
12 80 170
5 23 92
$`2016-04-24 00:00:00.udp`
ports freq
4 161 138
7 53 45
1 123 28
$`2016-04-24 01:00:00.tcp`
ports freq
13 443 342
20 80 215
10 25 60
$`2016-04-24 01:00:00.udp`
ports freq
4 161 85
8 53 42
12 902 27
I want to merge the data frames that come from the same protocol (i.e. the tcp together and udp together)
So the final result would be a new list with 2 data frames; One for tcp and one for udp such that,
lapply(final_list, head, 3)
$tcp
ports freq.00:00:00 freq.01:00:00
1 443 296 342
2 80 170 215
3 23 92 51
$udp
ports freq.00:00:00 freq.01:00:00
1 161 138 85
2 53 45 42
3 123 28 19
DATA
dput(sample_list)
structure(list(`2016-04-24 00:00:00.tcp` = structure(list(ports = c("443",
"80", "23", "21", "22", "25", "445", "110", "389", "135", "465",
"514", "91", "995", "84", "902"), freq = structure(c(296L, 170L,
92L, 18L, 16L, 15L, 14L, 4L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Dim = 16L)), .Names = c("ports",
"freq"), row.names = c(8L, 12L, 5L, 3L, 4L, 6L, 9L, 1L, 7L, 2L,
10L, 11L, 15L, 16L, 13L, 14L), class = "data.frame"), `2016-04-24 00:00:00.udp` = structure(list(
ports = c("161", "53", "123", "902", "137", "514", "138",
"623", "69", "88", "500"), freq = structure(c(138L, 45L,
28L, 26L, 24L, 24L, 6L, 6L, 5L, 4L, 1L), .Dim = 11L)), .Names = c("ports",
"freq"), row.names = c(4L, 7L, 1L, 11L, 2L, 6L, 3L, 8L, 9L, 10L,
5L), class = "data.frame"), `2016-04-24 01:00:00.tcp` = structure(list(
ports = c("443", "80", "25", "23", "88", "21", "161", "22",
"445", "135", "389", "993", "548", "110", "143", "502", "514",
"81", "995", "102", "111", "311", "444", "789", "902", "91"
), freq = structure(c(342L, 215L, 60L, 51L, 42L, 32L, 31L,
18L, 18L, 6L, 5L, 4L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Dim = 26L)), .Names = c("ports", "freq"
), row.names = c(13L, 20L, 10L, 9L, 22L, 7L, 6L, 8L, 15L, 4L,
12L, 25L, 18L, 2L, 5L, 16L, 17L, 21L, 26L, 1L, 3L, 11L, 14L,
19L, 23L, 24L), class = "data.frame"), `2016-04-24 01:00:00.udp` = structure(list(
ports = c("161", "53", "902", "514", "123", "137", "69",
"138", "389", "443", "88", "623"), freq = structure(c(85L,
42L, 27L, 24L, 19L, 15L, 15L, 4L, 2L, 2L, 2L, 1L), .Dim = 12L)), .Names = c("ports",
"freq"), row.names = c(4L, 8L, 12L, 7L, 1L, 2L, 10L, 3L, 5L,
6L, 11L, 9L), class = "data.frame")), .Names = c("2016-04-24 00:00:00.tcp",
"2016-04-24 00:00:00.udp", "2016-04-24 01:00:00.tcp", "2016-04-24 01:00:00.udp"
))
Bonus question: What is the structure of freq? I never saw int [1:16(1d)] before.
str(sample_list$`2016-04-24 00:00:00.tcp`)
'data.frame': 16 obs. of 2 variables:
$ ports: chr "443" "80" "23" "21" ...
$ freq : int [1:16(1d)] 296 170 92 18 16 15 14 4 3 2 ...
The code I used to create the list (In this case called try1)
protocol_list <- lapply(per_hour1, function(i) split(i, i$protocol))
Analytic_Protocol_List <- lapply(protocol_list, function(i) lapply(i, dest.ports))
try1 <- lapply(unlist(Analytic_Protocol_List, recursive = FALSE), `[[`, 1)
Note that solutions from similar questions do not work for this case. Maybe because of the structure?
Another alternative:
library(dplyr)
library(tidyr)
data.table::melt(sample_list) %>%
separate(L1, into = c("time", "protocol"), sep = "\\.") %>%
unite(f, variable, time) %>%
spread(f, value) %>%
split(.$protocol)
Which, using your data, gives:
$tcp
ports protocol freq_2016-04-24 00:00:00 freq_2016-04-24 01:00:00
1 102 tcp NA 1
2 110 tcp 4 2
3 111 tcp NA 1
5 135 tcp 2 6
8 143 tcp NA 2
9 161 tcp NA 31
11 21 tcp 18 32
12 22 tcp 16 18
13 23 tcp 92 51
14 25 tcp 15 60
15 311 tcp NA 1
16 389 tcp 3 5
18 443 tcp 296 342
20 444 tcp NA 1
21 445 tcp 14 18
22 465 tcp 2 NA
24 502 tcp NA 2
25 514 tcp 2 2
28 548 tcp NA 3
31 789 tcp NA 1
32 80 tcp 170 215
33 81 tcp NA 2
34 84 tcp 1 NA
35 88 tcp NA 42
37 902 tcp 1 1
39 91 tcp 2 1
40 993 tcp NA 4
41 995 tcp 2 2
$udp
ports protocol freq_2016-04-24 00:00:00 freq_2016-04-24 01:00:00
4 123 udp 28 19
6 137 udp 24 15
7 138 udp 6 4
10 161 udp 138 85
17 389 udp NA 2
19 443 udp NA 2
23 500 udp 1 NA
26 514 udp 24 24
27 53 udp 45 42
29 623 udp 6 1
30 69 udp 5 15
36 88 udp 4 2
38 902 udp 26 27
Update:
If you want to sort by freq, you could do:
data.table::melt(sample_list) %>%
separate(L1, into = c("time", "protocol"), sep = "\\.") %>%
unite(f, variable, time) %>%
spread(f, value) %>%
arrange(protocol, desc(`freq_2016-04-24 00:00:00`)) %>%
split(.$protocol)
For the rbinding you can try the following:
do.call(rbind, sample_list[grep("tcp", names(sample_list))])
and:
do.call(rbind, sample_list[grep("udp", names(sample_list))])
and as refined by Marat below:
d <- do.call(rbind, sample_list)
d2 <- data.frame(d,do.call(rbind,strsplit(rownames((d)),'[.]')))
lapply(split(d2,d2$X2),dcast,ports~X1,value.var='freq')
you can just merge by ID
create a ID for each row of the data frame
let lapply(X) = x
x$1 <- cbind(ID=1:nrow(x$1))
same for x1,x2,x3....,xN
newx <- merge(x$1,x$2,...,x$N, by=ID)
since ID merging is used, overlapping won't occur; just treat each list$(X) as a data frame itself