remove duplicate coordinates from X and Y column - r

Based on the data below, how can I remove the rows with duplicate X and Y coordinates? In the example below, you will notice that one of the X coordinates, -1.52, is repeated twice, but it's not a duplicate since its corresponding Y coordinates are different.
I don't know if it matters, but please note that the original dataset has more than 2 decimal places for the X and Y values.
Sample data:
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), X = c(-1.01,
-1.11, -1.11, -2.13, -2.13, -1.52, -1.52, -1.98, -3.69, -4.79),
Y = c(2.11, 3.33, 3.33, 6.66, 6.66, 7.77, 8.88, 9.99, 1.11,
6.68)), class = "data.frame", row.names = c(NA, -10L))
Desired data:
id X Y
1 -1.01 2.11
2 -1.11 3.33
4 -2.13 6.66
6 -1.52 7.77
7 -1.52 8.88
8 -1.98 9.99
9 -3.69 1.11
10 -4.79 6.68

Use duplicated
# Keep only the rows whose (X, Y) pair has not been seen before;
# df1[-1] drops the id column so duplicated() compares X and Y only.
df1[!duplicated(df1[-1]), ]
-output
id X Y
1 1 -1.01 2.11
2 2 -1.11 3.33
4 4 -2.13 6.66
6 6 -1.52 7.77
7 7 -1.52 8.88
8 8 -1.98 9.99
9 9 -3.69 1.11
10 10 -4.79 6.68
Or with distinct
library(dplyr)
# distinct() keeps the first row for each unique (X, Y) combination;
# .keep_all = TRUE retains the remaining columns (id) as well.
distinct(df1, X, Y, .keep_all = TRUE)

Related

format table to have mean (sd) instead of separate columns R

I have a data frame of several water quality measures. For each measure I have a calculated mean and SD. I have a value for 6 sites and 4 seasons. Currently my dataframe has the means in a column, for example 'Temp_1', and then a column for the standard deviation as 'Temp_2'. I want to export the file with one column for each water quality measure with the format mean (SD).
current output
This is an example for the first water measure, but I'd like to code it so it is also done to remaining factors as well.
desired output
Head of dataframe
structure(list(season = structure(c(1L, 1L, 1L, 1L, 1L, 1L), levels = c("Winter",
"Spring", "Summer", "Autumn"), class = "factor"), Site = structure(1:6, levels = c("1",
"2", "3", "4", "5", "6"), class = "factor"), Temp_1 = c(7.2,
7.05, 6.3, 6.25, 6.2, 5.4), Temp_2 = c(1.55563491861041, 1.90918830920368,
1.69705627484771, 2.33345237791561, 2.40416305603426, 2.40416305603426
), pH_1 = c(7.435, 7.38, 7.52, 7.525, 7.38, 7.565), pH_2 = c(0.289913780286484,
0.282842712474619, 0.0989949493661164, 0.120208152801713, 0.0565685424949239,
0.261629509039023), DO_1 = c(9, 9.1, 8.25, 8.85, 9.25, 9), DO_2 = c(0,
0.424264068711928, 0.0707106781186558, 0.494974746830583, 0.636396103067892,
0.42426406871193), EC_1 = c(337.5, 333, 321.5, 322, 309, 300.5
), EC_2 = c(55.8614357137373, 41.0121933088198, 51.618795026618,
32.5269119345812, 25.4558441227157, 30.4055915910215), SS_1 = c(5.945,
3.65, 5.025, 2.535, 10.22, 4.595), SS_2 = c(0.728319984622144,
1.06066017177982, 2.93449314192417, 0.473761543394987, 8.23072293301141,
0.67175144212722), TP_1 = c(73.5, 75, 61.5, 66.5, 83, 87), TP_2 = c(3.53553390593274,
12.7279220613579, 9.19238815542512, 6.36396103067893, 26.8700576850888,
24.0416305603426), SRP_1 = c(19, 19, 10, 14, 13.5, 23.5), SRP_2 = c(2.82842712474619,
1.4142135623731, 2.82842712474619, 0, 0.707106781186548, 3.53553390593274
), PP_1 = c(54.5, 56, 51.5, 52.5, 69.5, 63.5), PP_2 = c(6.36396103067893,
11.3137084989848, 6.36396103067893, 6.36396103067893, 26.1629509039023,
20.5060966544099), DA_1 = c(0.083, 0.0775, 0.0775, 0.044, 0.059,
0.051), DA_2 = c(0.00282842712474619, 0.0120208152801713, 0.00919238815542513,
0.0014142135623731, 0.0127279220613579, 0.00848528137423857),
DNI_1 = c(0.048739437, 0.041015562, 0.0617723365, 0.0337441755,
0.041480944, 0.0143461675), DNI_2 = c(0.0345079125942686,
0.0223312453226695, 0.0187360224120165, 0.0162032493604065,
0.0258169069873252, 0.0202885446465761), DNA_1 = c(20.43507986,
20.438919615, 14.98692746, 19.953408625, 17.03060377, 8.5767502525
), DNA_2 = c(1.80288106961836, 1.2687128010491, 2.28839365291436,
1.03116172040732, 0.396528484042397, 1.72350828181138), DF_1 = c(0.0992379715,
0.0947268395, 0.094323125, 0.098064875, 0.0980304675, 0.085783911
), DF_2 = c(0.00372072305060515, 0.00724914346231915, 0.0142932471712976,
0.0116895470668939, 0.00255671780854136, 0.00830519117656529
), DC_1 = c(12.18685357, 12.73924378, 13.09550326, 13.417557825,
15.140975265, 21.429763715), DC_2 = c(0.57615880774946, 0.0430071960969884,
0.702539578486863, 0.134642528587041, 0.66786605299916, 0.17012889453292
), DS_1 = c(15.834380095, 15.69623116, 14.37636388, 15.444235935,
14.647596185, 11.9877372), DS_2 = c(1.67153135346354, 1.69978765863781,
2.47560570280853, 1.03831263471691, 1.24488755930594, 0.975483163720397
), DOC_1 = c(19.74, 20.08, 21.24, 20.34, 21.88, 24.92), DOC_2 = c(2.7435743110038,
1.69705627484772, 2.60215295476649, 1.04651803615609, 0.226274169979695,
0.452548339959388)), row.names = c(NA, 6L), class = "data.frame")
Using mutate across with some tricks to organize paired data we can do it this way. Further adaptation is possible (for example just to keep the mean_sd columns (just use transmute instead of mutate):
Update:
library(dplyr)
library(stringr)

df %>%
  # Round all measurement columns (everything except the two id columns) to 2 dp
  mutate(across(-c(season, Site), ~ round(., 2))) %>%
  # For every mean column ("*_1"), look up its paired SD column ("*_2") via
  # cur_column() and paste them together as "mean(sd)"; .names writes the
  # result into new columns so the originals are kept.
  mutate(across(ends_with("_1"),
                ~ paste0(., "(", get(str_replace(cur_column(), "_1$", "_2")), ")"),
                .names = "mean_sd_{.col}")) %>%
  # Drop the trailing "_1" from the new columns (mean_sd_Temp_1 -> mean_sd_Temp).
  # rename_with() replaces the superseded rename_at(); the anchored "_1$"
  # pattern avoids clipping a literal "_1" occurring elsewhere in a name.
  rename_with(~ str_remove(., "_1$"), starts_with("mean_sd"))
season Site Temp_1 Temp_2 pH_1 pH_2 DO_1 DO_2 EC_1 EC_2 SS_1 SS_2 TP_1 TP_2 SRP_1 SRP_2 PP_1 PP_2 DA_1 DA_2 DNI_1 DNI_2 DNA_1 DNA_2 DF_1
1 Winter 1 7.20 1.56 7.43 0.29 9.00 0.00 337.5 55.86 5.94 0.73 73.5 3.54 19.0 2.83 54.5 6.36 0.08 0.00 0.05 0.03 20.44 1.80 0.10
2 Winter 2 7.05 1.91 7.38 0.28 9.10 0.42 333.0 41.01 3.65 1.06 75.0 12.73 19.0 1.41 56.0 11.31 0.08 0.01 0.04 0.02 20.44 1.27 0.09
3 Winter 3 6.30 1.70 7.52 0.10 8.25 0.07 321.5 51.62 5.03 2.93 61.5 9.19 10.0 2.83 51.5 6.36 0.08 0.01 0.06 0.02 14.99 2.29 0.09
4 Winter 4 6.25 2.33 7.53 0.12 8.85 0.49 322.0 32.53 2.54 0.47 66.5 6.36 14.0 0.00 52.5 6.36 0.04 0.00 0.03 0.02 19.95 1.03 0.10
5 Winter 5 6.20 2.40 7.38 0.06 9.25 0.64 309.0 25.46 10.22 8.23 83.0 26.87 13.5 0.71 69.5 26.16 0.06 0.01 0.04 0.03 17.03 0.40 0.10
6 Winter 6 5.40 2.40 7.57 0.26 9.00 0.42 300.5 30.41 4.60 0.67 87.0 24.04 23.5 3.54 63.5 20.51 0.05 0.01 0.01 0.02 8.58 1.72 0.09
DF_2 DC_1 DC_2 DS_1 DS_2 DOC_1 DOC_2 mean_sd_Temp mean_sd_pH mean_sd_DO mean_sd_EC mean_sd_SS mean_sd_TP mean_sd_SRP mean_sd_PP mean_sd_DA
1 0.00 12.19 0.58 15.83 1.67 19.74 2.74 7.2(1.56) 7.43(0.29) 9(0) 337.5(55.86) 5.94(0.73) 73.5(3.54) 19(2.83) 54.5(6.36) 0.08(0)
2 0.01 12.74 0.04 15.70 1.70 20.08 1.70 7.05(1.91) 7.38(0.28) 9.1(0.42) 333(41.01) 3.65(1.06) 75(12.73) 19(1.41) 56(11.31) 0.08(0.01)
3 0.01 13.10 0.70 14.38 2.48 21.24 2.60 6.3(1.7) 7.52(0.1) 8.25(0.07) 321.5(51.62) 5.03(2.93) 61.5(9.19) 10(2.83) 51.5(6.36) 0.08(0.01)
4 0.01 13.42 0.13 15.44 1.04 20.34 1.05 6.25(2.33) 7.53(0.12) 8.85(0.49) 322(32.53) 2.54(0.47) 66.5(6.36) 14(0) 52.5(6.36) 0.04(0)
5 0.00 15.14 0.67 14.65 1.24 21.88 0.23 6.2(2.4) 7.38(0.06) 9.25(0.64) 309(25.46) 10.22(8.23) 83(26.87) 13.5(0.71) 69.5(26.16) 0.06(0.01)
6 0.01 21.43 0.17 11.99 0.98 24.92 0.45 5.4(2.4) 7.57(0.26) 9(0.42) 300.5(30.41) 4.6(0.67) 87(24.04) 23.5(3.54) 63.5(20.51) 0.05(0.01)
mean_sd_DNI mean_sd_DNA mean_sd_DF mean_sd_DC mean_sd_DS mean_sd_DOC
1 0.05(0.03) 20.44(1.8) 0.1(0) 12.19(0.58) 15.83(1.67) 19.74(2.74)
2 0.04(0.02) 20.44(1.27) 0.09(0.01) 12.74(0.04) 15.7(1.7) 20.08(1.7)
3 0.06(0.02) 14.99(2.29) 0.09(0.01) 13.1(0.7) 14.38(2.48) 21.24(2.6)
4 0.03(0.02) 19.95(1.03) 0.1(0.01) 13.42(0.13) 15.44(1.04) 20.34(1.05)
5 0.04(0.03) 17.03(0.4) 0.1(0) 15.14(0.67) 14.65(1.24) 21.88(0.23)
6 0.01(0.02) 8.58(1.72) 0.09(0.01) 21.43(0.17) 11.99(0.98) 24.92(0.45)
First answer:
We could do this like so:
library(dplyr)
df %>% mutate(mean_sd = paste0(Temp_1, " (", round(Temp_2,2), ")"), .before=5)
season Site Temp_1 Temp_2 mean_sd pH_1 pH_2 DO_1 DO_2 EC_1 EC_2 SS_1 SS_2 TP_1 TP_2 SRP_1 SRP_2 PP_1
1 Winter 1 7.20 1.555635 7.2 (1.56) 7.435 0.28991378 9.00 0.00000000 337.5 55.86144 5.945 0.7283200 73.5 3.535534 19.0 2.8284271 54.5
2 Winter 2 7.05 1.909188 7.05 (1.91) 7.380 0.28284271 9.10 0.42426407 333.0 41.01219 3.650 1.0606602 75.0 12.727922 19.0 1.4142136 56.0
3 Winter 3 6.30 1.697056 6.3 (1.7) 7.520 0.09899495 8.25 0.07071068 321.5 51.61880 5.025 2.9344931 61.5 9.192388 10.0 2.8284271 51.5
4 Winter 4 6.25 2.333452 6.25 (2.33) 7.525 0.12020815 8.85 0.49497475 322.0 32.52691 2.535 0.4737615 66.5 6.363961 14.0 0.0000000 52.5
5 Winter 5 6.20 2.404163 6.2 (2.4) 7.380 0.05656854 9.25 0.63639610 309.0 25.45584 10.220 8.2307229 83.0 26.870058 13.5 0.7071068 69.5
6 Winter 6 5.40 2.404163 5.4 (2.4) 7.565 0.26162951 9.00 0.42426407 300.5 30.40559 4.595 0.6717514 87.0 24.041631 23.5 3.5355339 63.5
PP_2 DA_1 DA_2 DNI_1 DNI_2 DNA_1 DNA_2 DF_1 DF_2 DC_1 DC_2 DS_1 DS_2 DOC_1
1 6.363961 0.0830 0.002828427 0.04873944 0.03450791 20.43508 1.8028811 0.09923797 0.003720723 12.18685 0.5761588 15.83438 1.6715314 19.74
2 11.313708 0.0775 0.012020815 0.04101556 0.02233125 20.43892 1.2687128 0.09472684 0.007249143 12.73924 0.0430072 15.69623 1.6997877 20.08
3 6.363961 0.0775 0.009192388 0.06177234 0.01873602 14.98693 2.2883937 0.09432312 0.014293247 13.09550 0.7025396 14.37636 2.4756057 21.24
4 6.363961 0.0440 0.001414214 0.03374418 0.01620325 19.95341 1.0311617 0.09806487 0.011689547 13.41756 0.1346425 15.44424 1.0383126 20.34
5 26.162951 0.0590 0.012727922 0.04148094 0.02581691 17.03060 0.3965285 0.09803047 0.002556718 15.14098 0.6678661 14.64760 1.2448876 21.88
6 20.506097 0.0510 0.008485281 0.01434617 0.02028854 8.57675 1.7235083 0.08578391 0.008305191 21.42976 0.1701289 11.98774 0.9754832 24.92
DOC_2
1 2.7435743
2 1.6970563
3 2.6021530
4 1.0465180
5 0.2262742
6 0.4525483
You can create a new column like this
df$Temp <- paste0(df$Temp_1, ' (', df$Temp_2, ')')
And select only the desired output columns
df[, c('season', 'Site', 'Temp')]
library(tidyverse)
# Reshape to long, strip the "_1"/"_2" suffixes so each mean and its sd share
# one name, then collapse every mean/sd pair into a single "mean, sd" string
# per cell before widening again.
# NOTE(review): relies on pivot_longer() emitting the _1 (mean) value before
# the _2 (sd) value within each group — TODO confirm column order is stable.
df %>%
pivot_longer(-c(season, Site)) %>%
mutate(name = name %>% str_remove_all("[^a-zA-Z]")) %>%
group_by(season, Site, name) %>%
summarise(value = str_c(round(value, 2), collapse = ", ")) %>%
pivot_wider(names_from = name,
values_from = value)
# A tibble: 6 x 17
# Groups: season, Site [6]
season Site DA DC DF DNA DNI DO DOC DS EC pH PP SRP SS Temp TP
<fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Winter 1 0.08, 0 12.19, 0.58 0.1, 0 20.44, 1.8 0.05, 0.03 9, 0 19.7~ 15.8~ 337.~ 7.43~ 54.5~ 19, ~ 5.94~ 7.2,~ 73.5~
2 Winter 2 0.08, 0.01 12.74, 0.04 0.09, 0.01 20.44, 1.27 0.04, 0.02 9.1, 0.~ 20.0~ 15.7~ 333,~ 7.38~ 56, ~ 19, ~ 3.65~ 7.05~ 75, ~
3 Winter 3 0.08, 0.01 13.1, 0.7 0.09, 0.01 14.99, 2.29 0.06, 0.02 8.25, 0~ 21.2~ 14.3~ 321.~ 7.52~ 51.5~ 10, ~ 5.03~ 6.3,~ 61.5~
4 Winter 4 0.04, 0 13.42, 0.13 0.1, 0.01 19.95, 1.03 0.03, 0.02 8.85, 0~ 20.3~ 15.4~ 322,~ 7.53~ 52.5~ 14, 0 2.54~ 6.25~ 66.5~
5 Winter 5 0.06, 0.01 15.14, 0.67 0.1, 0 17.03, 0.4 0.04, 0.03 9.25, 0~ 21.8~ 14.6~ 309,~ 7.38~ 69.5~ 13.5~ 10.2~ 6.2,~ 83, ~
6 Winter 6 0.05, 0.01 21.43, 0.17 0.09, 0.01 8.58, 1.72 0.01, 0.02 9, 0.42 24.9~ 11.9~ 300.~ 7.57~ 63.5~ 23.5~ 4.6,~ 5.4,~ 87, ~

Select first occurrence of a decimal value in R

Ok, I have been trying to get an answer for this but I can't find it anywhere, and it seems like an easy task (which is bugging me even more!)
I have a dataframe with a series of numbers in a column which I want to filter to get the first occurrence of a number. For example, if I have 1.01, 1.08, 1.15, I want to filter the rows to get the row with the value 1.01 in that column.
An examples is:
x<- c(2.04, 2.25, 3.99, 3.20, 2.60, 1.85, 3.57, 3.37, 2.59, 1.60, 3.93, 1.33, 1.08, 4.64, 2.09, 4.53, 3.04, 3.85, 3.15, 3.97)
y<- c(2.62, 2.48, 1.40, 2.27, 3.71, 1.86, 3.56, 2.08, 2.36, 3.23, 1.65, 3.43, 1.57, 4.49, 2.29, 3.32, 2.12, 4.45, 1.57, 4.70)
z <- data.frame(x, y)
z <- z[order(z$x, decreasing = FALSE), ]
And the filtered results should be:
x y
1.08 1.57
2.04 2.62
3.04 2.12
4.53 3.32
Any help would be appreciated
# Sort by x, bucket the rows by their integer part, and keep the first
# (i.e. smallest-x) row of each bucket.
z %>%
  arrange(x) %>%
  group_by(int = floor(x)) %>%
  slice_head(n = 1) %>%
  ungroup()
# A tibble: 4 × 3
x y int
<dbl> <dbl> <dbl>
1 1.08 1.57 1
2 2.04 2.62 2
3 3.04 2.12 3
4 4.53 3.32 4
or
# Same idea without grouping: after sorting, keep each row whose integer part
# differs from the previous row's. default = 0 covers the first row (works
# here because every floor(x) is >= 1).
z %>%
arrange(x) %>%
filter(floor(x) != lag(floor(x), default = 0))
x y
1 1.08 1.57
2 2.04 2.62
3 3.04 2.12
4 4.53 3.32
You can also try this:
# Fixed: refer to the column as x rather than z$x. Using z$x inside dplyr
# verbs bypasses data masking and silently breaks as soon as an earlier step
# filters or reorders rows. The named group_by() argument keeps the original
# output column name `floor(z$x)` so the printed result is unchanged.
z1 <- z %>%
  group_by(`floor(z$x)` = floor(x)) %>%
  arrange(x) %>%
  filter(row_number() == 1)
z1
# A tibble: 4 × 3
# Groups: floor(z$x) [4]
x y `floor(z$x)`
<dbl> <dbl> <dbl>
1 1.08 1.57 1
2 2.04 2.62 2
3 3.04 2.12 3
4 4.53 3.32 4

R split by group and create new columns

I'm trying to split a data frame from long to wide format by converting selected rows to columns. Here is the current general long-format structure:
data_long <- data.frame(
id = c("kelp","kelp","fish","fish","beach","beach","kelp","kelp","fish","fish","beach","beach"),
desig = c("mpa","reference","mpa","reference","mpa","reference","mpa","reference","mpa","reference","mpa","reference"),
indicator = c("density","density","density","density","density","density","biomass","biomass","biomass","biomass","biomass","biomass"),
n = c(1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118),
m = c(0.35, 4.28, 1.16, 106.35, 13.44,0.63,0.35, 4.28, 1.16, 106.35, 13.44,0.63),
sd = c(1.19, 8.48, 4.25, 118, 31.77,2.79,1.19, 8.48, 4.25, 118, 31.77,2.79)
)
data_long
I want to keep id and indicator, split by "desig",and move "n", "m", and "sd" into new columns. The final data frame structure I'm trying to obtain is:
data_wide <- data.frame(
id = c("kelp","fish","beach","kelp","fish","beach"),
indicator = c("density","density","density","biomass","biomass","biomass"),
mpa.n = c(1118,1118,1118,1118,1118,1118),
mpa.m = c(0.35, 4.28, 1.16, 106.35, 13.44,0.63),
mpa.sd = c(1.19, 8.48, 4.25, 118, 31.77,2.79),
reference.n = c(1118,1118,1118,1118,1118,1118),
reference.m = c(0.35, 4.28, 1.16, 106.35, 13.44,0.63),
reference.sd = c(1.19, 8.48, 4.25, 118, 31.77,2.79)
)
data_wide
I can't seem to get this right using reshape2. Any suggestions?
We may use pivot_wider
library(tidyr)
library(dplyr)

# Spread n, m and sd across the two desig levels; names_glue builds the
# "mpa.n", "reference.sd", ... column names, and select() fixes the order.
data_long %>%
  pivot_wider(names_from = desig,
              values_from = c(n, m, sd),
              names_glue = "{desig}.{.value}") %>%
  select(id, indicator, starts_with("mpa"), starts_with("reference"))
-output
# A tibble: 6 × 8
id indicator mpa.n mpa.m mpa.sd reference.n reference.m reference.sd
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 kelp density 1118 0.35 1.19 1118 4.28 8.48
2 fish density 1118 1.16 4.25 1118 106. 118
3 beach density 1118 13.4 31.8 1118 0.63 2.79
4 kelp biomass 1118 0.35 1.19 1118 4.28 8.48
5 fish biomass 1118 1.16 4.25 1118 106. 118
6 beach biomass 1118 13.4 31.8 1118 0.63 2.79

How to create a new column with moving averages based on a variable number of rows?

I am trying to write code that creates a new column with moving averages based on 'year', where the number of rows for each year is variable and each year has only one unique value repeated across each of its rows. I want to calculate moving averages based on these unique values, independent of the number of rows per year.
Just an FYI: I'm very new to R and programming, so if I missed something needed to understand my problem, please let me know.
For example, the type of data I'm working with looks like this:
df <- data.frame(year = c(1702, 1702, 1702, 1702, 1702, 1703, 1703, 1703, 1704, 1704, 1705, 1705, 1705, 1705, 1706, 1706, 1707, 1707, 1707, 1708, 1708, 1708, 1708, 1708, 1709, 1709, 1709, 1709, 1709), avgtemp = c(5.3, 5.3, 5.3, 5.3, 5.3, 3.9, 3.9, 3.9, 6.12, 6.12, 4.16, 4.16, 4.16, 4.16, 5.65, 5.65, 3.11, 3.11, 3.11, 5.17, 5.17, 5.17, 5.17, 5.17, 4.75, 4.75, 4.75, 4.75, 4.75))
I found this post, Moving Average by Unique Date with multiple observations per date, and tried the solution offered there by Mark Peterson but it doesnt work for me.
I've tried the following code.
# Collapse to one row per year, then compute a right-aligned 3-year rolling
# mean of the yearly values (rollapply from the zoo package); partial = TRUE
# lets the first two years average over fewer than 3 values.
# NOTE(review): the "argument 1 is not a vector" error from order(year)
# usually means a dplyr verb was masked by another attached package (e.g.
# plyr loaded after dplyr) — TODO confirm the search path when this runs.
rolledavg <-
df %>%
group_by(year) %>%
summarise(rollavg = mean(avgtemp)) %>%
ungroup() %>%
arrange(year) %>%
mutate( ma3temp = rollapply(rollavg
, 3
, mean
, align= "right"
, partial=T
, fill = NA))
I get the following error:
"Error in order(year) : argument 1 is not a vector".
The expected output should be something like this:
expected output df
I would appreciate any help I can get. Don't mind working with other packages/solutions than the one offered above.
Something like this with sapply()?
# Expanding (cumulative) mean over all rows so far — not a fixed window and
# not per-year. O(n^2) because each step re-averages the whole prefix;
# cumsum(dat$avgtemp) / seq_len(nrow(dat)) would be O(n) if speed mattered.
dat$ra <- sapply(1:nrow(dat), function(n) mean(dat$avgtemp[1:n]))
# year avgtemp ra
# 1 1702 5.30 5.300000
# 2 1702 5.30 5.300000
# 3 1702 5.30 5.300000
# 4 1702 5.30 5.300000
# 5 1702 5.30 5.300000
# 6 1703 3.90 5.066667
# 7 1703 3.90 4.900000
# 8 1703 3.90 4.775000
# 9 1704 6.12 4.924444
# 10 1704 6.12 5.044000
# 11 1705 4.16 4.963636
# 12 1705 4.16 4.896667
# 13 1705 4.16 4.840000
# 14 1705 4.16 4.791429
# 15 1706 5.65 4.848667
# 16 1706 5.65 4.898750
# 17 1707 3.11 4.793529
# 18 1707 3.11 4.700000
# 19 1707 3.11 4.616316
Note: If you want just two digits, use round(mean(.), 2).
Update
Following the update of your question, you may calculate the moving average with filter()1 from a unique version of your data frame and merge the result with the original data frame.
# filter() here is stats::filter() (a moving-average/convolution filter), NOT
# dplyr::filter() — write stats::filter() explicitly if dplyr is attached.
# unique() drops the repeated per-year rows before averaging, and merge()
# copies each yearly result back onto every row of that year.
dat <- merge(dat, transform(unique(dat), ra=filter(avgtemp, rep(1/3, 3), sides=1)))
# year avgtemp ra
# 1 1702 5.30 NA
# 2 1702 5.30 NA
# 3 1702 5.30 NA
# 4 1702 5.30 NA
# 5 1702 5.30 NA
# 6 1703 3.90 NA
# 7 1703 3.90 NA
# 8 1703 3.90 NA
# 9 1704 6.12 5.106667
# 10 1704 6.12 5.106667
# 11 1705 4.16 4.726667
# 12 1705 4.16 4.726667
# 13 1705 4.16 4.726667
# 14 1705 4.16 4.726667
# 15 1706 5.65 5.310000
# 16 1706 5.65 5.310000
# 17 1707 3.11 4.306667
# 18 1707 3.11 4.306667
# 19 1707 3.11 4.306667
This is also possible with the zoo::rollmean() function.
dat <- merge(dat, transform(unique(dat), ra=c(rep(NA, 3 - 1), zoo::rollmean(avgtemp, 3))))
Data
dat <- structure(list(year = c(1702, 1702, 1702, 1702, 1702, 1703, 1703,
1703, 1704, 1704, 1705, 1705, 1705, 1705, 1706, 1706, 1707, 1707,
1707), avgtemp = c(5.3, 5.3, 5.3, 5.3, 5.3, 3.9, 3.9, 3.9, 6.12,
6.12, 4.16, 4.16, 4.16, 4.16, 5.65, 5.65, 3.11, 3.11, 3.11)), row.names = c(NA,
-19L), class = "data.frame")

R: obtain single data frame from list of zoo-objects

I have a list of zoo objects consisting of irregular time-series, lodf, in the following format:
> head(lodf)
[[1]]
2014-08-08 2014-08-14 2014-09-12
1.15 1.32 2.39
[[2]]
2014-07-22 2014-07-24 2014-08-14 2014-08-20 2014-08-27 2014-09-12
0.50 0.75 1.29 1.36 1.28 1.28
[[3]]
2012-11-01 2012-11-02 2013-07-12 2013-08-13 2013-09-11 2014-07-01
1.00 1.27 0.91 1.00 0.99 0.98
...
I am ultimately trying to sum all these time-series into one combined time-series, i.e. sum down each column. To do this, I am trying to convert into a zoo/xts time-series for further manipulation, i.e. to apply na.locf and other zoo-library capabilities before summing across the individual data frames/dates using rowsum. That is, I am trying to get my list of data frames above into a combined zoo object resembling this:
Value
12/09/2014 1.07
14/08/2014 1.32
08/08/2014 1.15
12/09/2014 0.48
27/08/2014 0.53
20/08/2014 0.61
14/08/2014 0.54
24/07/2014 0.75
22/07/2014 0.5
01/07/2014 0.98
01/07/2014 0
...
There is often overlap between the individual data frames, i.e. several values corresponding to the same date index, and what I would like to do in those cases is to sum the values. E.g. if I have
2012-11-01
0.7
2012-11-01
1.5
2012-11-01
0.7
I would like to have
2012-11-01
2.9
as the value for this date index in the resulting large data frame.
I have tried merge, reading as a zoo object, do.call(rbind) etc. in the current format, but I am stumped. For further context, this question is part of a larger project outlined here: R: time series with duplicate time index entries. Any help would be most appreciated!
Update: please find a data object below as requested:
> dput(head(lodf))
list(structure(c(1.15, 1.32, 2.39), index = structure(c(16290L,
16296L, 16325L), class = "Date"), class = "zoo"), structure(c(0.5,
0.75, 1.29, 1.36, 1.28, 1.28), index = structure(c(16273L, 16275L,
16296L, 16302L, 16309L, 16325L), class = "Date"), class = "zoo"),
structure(c(1, 1.27, 0.91, 1, 0.99, 0.98), index = structure(c(15645L,
15646L, 15898L, 15930L, 15959L, 16252L), class = "Date"), class = "zoo"),
structure(c(1.27, 1.29, 1.28, 1.17, 0.59, 0), index = structure(c(15645L,
15651L, 15665L, 15679L, 15686L, 15747L), class = "Date"), class = "zoo"),
structure(c(1.9, 1.35, 0.66, 1.16, 0.66, 1.16, 1.26, 1.23,
1.28, 1.23, 1.17, 0.66, 1.18, 0.66, 1.29, 1.35, 1.45, 1.53,
1.61, 1.82, 1.8, 1.89, 1.8, 1.81, 1.78, 1.68, 2.18, 1.68,
1.56, 1.93, 1.84, 1.69, 1.18, 1.73, 1.18, 1.72, 1.83, 1.9,
1.99, 1.93, 1.87, 1.96, 2.1, 2.22, 2.33, 2.38, 2.35, 2.23,
2.16, 2.18, 2.17, 2.2, 2.29, 2.27, 2.28, 2.42, 2.48, 2.99,
2.56, 2.65, 2.69, 3.21, 2.7, 2.8, 2.79, 2.8, 2.78, 2.26,
2.78, 2.26, 2.12, 2.07, 1.97, 1.84, 1.77, 1.18, 1.7, 1.78,
1.91, 1.98, 1.93, 1.83, 1.76, 1.18, 1.01, 0.97, 0.86, 0.69,
0.56), index = structure(c(15645L, 15652L, 15660L, 15740L,
15797L, 15841L, 15860L, 15867L, 15876L, 15887L, 15890L, 15897L,
15901L, 15905L, 15908L, 15909L, 15910L, 15911L, 15915L, 15926L,
15931L, 15932L, 15938L, 15953L, 15954L, 15975L, 15978L, 15979L,
15981L, 15982L, 15985L, 15986L, 15987L, 16001L, 16003L, 16006L,
16008L, 16010L, 16014L, 16016L, 16021L, 16022L, 16023L, 16027L,
16029L, 16031L, 16045L, 16052L, 16059L, 16072L, 16077L, 16078L,
16084L, 16091L, 16098L, 16100L, 16101L, 16106L, 16132L, 16133L,
16134L, 16139L, 16146L, 16150L, 16153L, 16157L, 16160L, 16163L,
16167L, 16169L, 16170L, 16171L, 16175L, 16177L, 16182L, 16184L,
16212L, 16216L, 16220L, 16224L, 16248L, 16254L, 16258L, 16261L,
16297L, 16301L, 16309L, 16310L, 16317L), class = "Date"), class = "zoo"),
structure(c(3.35, 3.44, 3.41, 3.14, 3.11, 2.55, 2.65, 2.87,
3.14, 3.24, 3.41, 4.04, 4.19, 4.34, 4.44, 1.2, 1.3, 1.29,
1.3, 1.27, 0.77, 0.69, 0.55, 0), index = structure(c(15645L,
15650L, 15694L, 15740L, 15741L, 15742L, 15743L, 15749L, 15750L,
15751L, 15755L, 15756L, 15758L, 15762L, 15784L, 15800L, 15805L,
15810L, 15824L, 15835L, 15838L, 15840L, 15847L, 15849L), class = "Date"), class = "zoo"))
>
The input displayed at the top of the question appears to be the first three components of the input specified at the bottom of the question. The variable name used at the bottom of the question, lodf, seems to suggest that it contains a list of data frames but in fact it contains a list of zoo objects.
The question asks for a single data frame result but we are assuming that the output should be a single zoo series too, for consistency. Also we shall use the name L for the input as lodf would wrongly suggest a list of data frames. If z is the result as a zoo series then
data.frame(index = index(z), data = coredata(z))
could be used if a data frame really were desired.
In the output section near the end of this answer we show the result of using as our input L <- lodf[1:3] (i.e. first 3 components only) and separately show the output using L <- lodf (i.e. all components) as our input.
1) Reduce. We merge the zoo series in the list, L, returning a list and filling in missing values with 0. Then use Reduce to sum the components:
# merge(..., retclass = "list") aligns every series on the union of all
# indexes and returns a list; fill = 0 replaces the NAs introduced by the
# alignment so Reduce(`+`) can sum the components position-wise.
Reduce(`+`, do.call(merge, c(L, retclass = "list", fill = 0)))
1a) A variation of this is to return a zoo object from merge (which is the default if we do not specify retclass), then fill in its NAs with 0, turn it back into a list and use Reduce:
Reduce(`+`, as.list(na.fill(do.call(merge, L), 0)))
2) rowSums In this solution we merge the lists to give zoo object z, optionally add column names and then add across rows producing the final zoo object.
z <- do.call(merge, L)      # align all series on the union of their indexes
# Fixed: the column names belong on the merged zoo object z, not on the input
# list L — `colnames<-` on a plain list errors and never touched the result.
colnames(z) <- seq_along(L) # optionally add names
# Sum across each row (NA = series has no value on that date) into one series.
zoo(rowSums(z, na.rm = TRUE), time(z))
Note that a rowSums solution of zoo objects previously appeared here
3) + If we knew that there were exactly 3 components to the list then an alternate way to write the above would be this. We optionally add names 1, 2, 3, merge the zoo objects and fill NAs with 0. Finally we add the series together. Modify in the obvious way if the number of components differs.
# Merge, replace the NAs produced by the alignment with 0, then add the
# columns explicitly — only valid when the list has exactly 3 components.
z0 <- na.fill(do.call(merge, L), 0)
colnames(z0) <- 1:3 # optionally add names 1, 2, 3
z0[, 1] + z0[, 2] + z0[, 3]
Output Using L <- lodf[1:3] as displayed at the start of the question where lodf is shown at the bottom of the question our output is:
2012-11-01 2012-11-02 2013-07-12 2013-08-13 2013-09-11 2014-07-01 2014-07-22
1.00 1.27 0.91 1.00 0.99 0.98 0.50
2014-07-24 2014-08-08 2014-08-14 2014-08-20 2014-08-27 2014-09-12
0.75 1.15 2.61 1.36 1.28 3.67
or using L <- lodf in the above we get the following (except for solution 3 which would have to be modified in an obvious way to use 6 rather than 3 components):
2012-11-01 2012-11-02 2012-11-06 2012-11-07 2012-11-08 2012-11-16 2012-11-21
7.52 1.27 3.44 1.29 1.35 0.66 1.28
2012-12-05 2012-12-12 2012-12-20 2013-02-04 2013-02-05 2013-02-06 2013-02-07
1.17 0.59 3.41 4.30 3.11 2.55 2.65
2013-02-11 2013-02-13 2013-02-14 2013-02-15 2013-02-19 2013-02-20 2013-02-22
0.00 2.87 3.14 3.24 3.41 4.04 4.19
2013-02-26 2013-03-20 2013-04-02 2013-04-05 2013-04-10 2013-04-15 2013-04-29
4.34 4.44 0.66 1.20 1.30 1.29 1.30
2013-05-10 2013-05-13 2013-05-15 2013-05-16 2013-05-22 2013-05-24 2013-06-04
1.27 0.77 0.69 1.16 0.55 0.00 1.26
2013-06-11 2013-06-20 2013-07-01 2013-07-04 2013-07-11 2013-07-12 2013-07-15
1.23 1.28 1.23 1.17 0.66 0.91 1.18
2013-07-19 2013-07-22 2013-07-23 2013-07-24 2013-07-25 2013-07-29 2013-08-09
0.66 1.29 1.35 1.45 1.53 1.61 1.82
2013-08-13 2013-08-14 2013-08-15 2013-08-21 2013-09-05 2013-09-06 2013-09-11
1.00 1.80 1.89 1.80 1.81 1.78 0.99
2013-09-27 2013-09-30 2013-10-01 2013-10-03 2013-10-04 2013-10-07 2013-10-08
1.68 2.18 1.68 1.56 1.93 1.84 1.69
2013-10-09 2013-10-23 2013-10-25 2013-10-28 2013-10-30 2013-11-01 2013-11-05
1.18 1.73 1.18 1.72 1.83 1.90 1.99
2013-11-07 2013-11-12 2013-11-13 2013-11-14 2013-11-18 2013-11-20 2013-11-22
1.93 1.87 1.96 2.10 2.22 2.33 2.38
2013-12-06 2013-12-13 2013-12-20 2014-01-02 2014-01-07 2014-01-08 2014-01-14
2.35 2.23 2.16 2.18 2.17 2.20 2.29
2014-01-21 2014-01-28 2014-01-30 2014-01-31 2014-02-05 2014-03-03 2014-03-04
2.27 2.28 2.42 2.48 2.99 2.56 2.65
2014-03-05 2014-03-10 2014-03-17 2014-03-21 2014-03-24 2014-03-28 2014-03-31
2.69 3.21 2.70 2.80 2.79 2.80 2.78
2014-04-03 2014-04-07 2014-04-09 2014-04-10 2014-04-11 2014-04-15 2014-04-17
2.26 2.78 2.26 2.12 2.07 1.97 1.84
2014-04-22 2014-04-24 2014-05-22 2014-05-26 2014-05-30 2014-06-03 2014-06-27
1.77 1.18 1.70 1.78 1.91 1.98 1.93
2014-07-01 2014-07-03 2014-07-07 2014-07-10 2014-07-22 2014-07-24 2014-08-08
0.98 1.83 1.76 1.18 0.50 0.75 1.15
2014-08-14 2014-08-15 2014-08-19 2014-08-20 2014-08-27 2014-08-28 2014-09-04
2.61 1.01 0.97 1.36 2.14 0.69 0.56
2014-09-12
Updates Added additional solutions and re-arranged and expanded presentation.
Try this (if the list elements are zoo objects and you need to sum the values that share the same index):
library(xts)
library(zoo)

# Align every series on the union of all dates, label the columns
# Value1..Valuen, then sum each row (NAs ignored) and wrap the totals
# as a one-column xts object on the combined index.
z1 <- setNames(do.call(merge, lodf), paste0("Value", seq_along(lodf)))
xts(data.frame(value = rowSums(z1, na.rm = TRUE)), order.by = index(z1))
# value
#2012-11-01 1.00
#2012-11-02 1.27
#2013-07-12 0.91
#2013-08-13 1.00
#2013-09-11 0.99
#2014-07-01 0.98
#2014-07-22 0.50
#2014-07-24 0.75
#2014-08-08 1.15
#2014-08-14 2.61
#2014-08-20 1.36
#2014-08-27 1.28
#2014-09-12 3.67
If you need to use na.locf before summing
z2 <- na.locf(z1)
xts(data.frame(value=rowSums(z2, na.rm=TRUE)), order.by=index(z2))
data
lodf <- list(structure(c(1.15, 1.32, 2.39), index = structure(c(16290,
16296, 16325), class = "Date"), class = "zoo"), structure(c(0.5,
0.75, 1.29, 1.36, 1.28, 1.28), index = structure(c(16273, 16275,
16296, 16302, 16309, 16325), class = "Date"), class = "zoo"),
structure(c(1, 1.27, 0.91, 1, 0.99, 0.98), index = structure(c(15645,
15646, 15898, 15930, 15959, 16252), class = "Date"), class = "zoo"))
With base R:
lodf = list(structure(list(`014-08-08` = 1.15, `2014-08-14` = 1.32,
`2014-09-12` = 2.39), .Names = c("014-08-08", "2014-08-14",
"2014-09-12"), class = "data.frame", row.names = c(NA, -1L)),
structure(list(`2014-07-22` = 0.5, `2014-07-24` = 0.75, `2014-08-14` = 1.29,
`2014-08-20` = 1.36, `2014-08-27` = 1.28, `2014-09-12` = 1.28), .Names = c("2014-07-22",
"2014-07-24", "2014-08-14", "2014-08-20", "2014-08-27", "2014-09-12"
), class = "data.frame", row.names = c(NA, -1L)), structure(list(
`2012-11-01` = 1, `2012-11-02` = 1.27, `2013-07-12` = 0.91,
`2013-08-13` = 1, `2013-09-11` = 0.99, `2014-07-01` = 0.98), .Names = c("2012-11-01",
"2012-11-02", "2013-07-12", "2013-08-13", "2013-09-11", "2014-07-01"
), class = "data.frame", row.names = c(NA, -1L)))
lodf
[[1]]
014-08-08 2014-08-14 2014-09-12
1 1.15 1.32 2.39
[[2]]
2014-07-22 2014-07-24 2014-08-14 2014-08-20 2014-08-27 2014-09-12
1 0.5 0.75 1.29 1.36 1.28 1.28
[[3]]
2012-11-01 2012-11-02 2013-07-12 2013-08-13 2013-09-11 2014-07-01
1 1 1.27 0.91 1 0.99 0.98
ddf = data.frame(full=character(), stringsAsFactors=F)
ll = unlist(lapply(lodf, function(x) paste(names(x), x, sep='_')))
ddf[1:length(ll),1]=ll
ddf
full
1 014-08-08_1.15
2 2014-08-14_1.32
3 2014-09-12_2.39
4 2014-07-22_0.5
5 2014-07-24_0.75
6 2014-08-14_1.29
7 2014-08-20_1.36
8 2014-08-27_1.28
9 2014-09-12_1.28
10 2012-11-01_1
11 2012-11-02_1.27
12 2013-07-12_0.91
13 2013-08-13_1
14 2013-09-11_0.99
15 2014-07-01_0.98
# Split "date_value" back into its two parts. Type-stable vapply() extraction
# instead of unlist(lapply(...)); strsplit is computed once and reused.
parts <- strsplit(ddf$full, "_")
ddf$date <- vapply(parts, `[`, character(1), 1)
ddf$value <- as.numeric(vapply(parts, `[`, character(1), 2))
ddf$full <- NULL  # drop the helper column by name instead of by position
ddf
date value
1 014-08-08 1.15
2 2014-08-14 1.32
3 2014-09-12 2.39
4 2014-07-22 0.50
5 2014-07-24 0.75
6 2014-08-14 1.29
7 2014-08-20 1.36
8 2014-08-27 1.28
9 2014-09-12 1.28
10 2012-11-01 1.00
11 2012-11-02 1.27
12 2013-07-12 0.91
13 2013-08-13 1.00
14 2013-09-11 0.99
15 2014-07-01 0.98
Finally:
aggregate(value~date, ddf, sum)
date value
1 2012.11.01 1.00
2 2012.11.02 1.27
3 2013.07.12 0.91
4 2013.08.13 1.00
5 2013.09.11 0.99
6 2014.07.01 0.98
7 2014.07.22 0.50
8 2014.07.24 0.75
9 2014.08.08 1.15
10 2014.08.14 2.61
11 2014.08.20 1.36
12 2014.08.27 1.28
13 2014.09.12 3.67

Resources