Fetching data from a data table in R - r

I have two data tables: MP and MPSubSample. MP has monthly data from 1965 to 2018, and MPSubSample contains a few data points from MP. I want to expand MPSubSample so that, if it contains data for 196801 (January 1968), the rows for the three months before and the three months after January 1968 are fetched from the MP data table and added to the MPSubSample data table. An example is as follows:
MPSubSample:
Month ER SENT SENT+ TS DS D12 E12 Inf
196608 -7.905 -1.12 -1.22 0.26 0.52 2.870 5.493 32.650
MP:
Month ER SENT SENT+ TS DS D12 E12 Inf
196604 2.1373 -1.66 -1.62 0.13 0.45 2.7967 5.38 32.28
196605 2.445 -1.56 -1.55 0.14 0.5 2.8133 5.42 32.35
196606 -1.443 -1.41 -1.49 0.31 0.51 2.83 5.46 32.38
196607 -1.622 -1.31 -1.39 0.22 0.52 2.85 5.4767 32.45
196608 -7.905 -1.12 -1.22 0.26 0.52 2.87 5.4933 32.65
196609 -1.066 -1.36 -1.33 -0.19 0.6 2.89 5.51 32.75
196610 3.8619 -1.31 -1.33 -0.34 0.69 2.8833 5.5233 32.85
196611 1.3946 -1.28 -1.29 -0.16 0.78 2.8767 5.5367 32.88
196612 0.1325 -1.23 -1.18 -0.12 0.79 2.87 5.55 32.92
196701 8.1534 -1.06 -1.08 -0.14 0.77 2.88 5.5167 32.9
I want the final data set to be:
Month ER SENT SENT+ TS DS D12 E12 Inf
196605 2.445 -1.56 -1.55 0.14 0.5 2.8133 5.42 32.35
196606 -1.44 -1.41 -1.49 0.31 0.51 2.83 5.46 32.38
196607 -1.622 -1.31 -1.39 0.22 0.52 2.85 5.4767 32.45
196608 -7.905 -1.12 -1.22 0.26 0.52 2.87 5.4933 32.65
196609 -1.066 -1.36 -1.33 -0.19 0.6 2.89 5.51 32.75
196610 3.8619 -1.31 -1.33 -0.34 0.69 2.8833 5.5233 32.85
196611 1.3946 -1.28 -1.29 -0.16 0.78 2.8767 5.5367 32.88

Try this,
library(data.table)
setDT(MP); setDT(MPSubSample)
# Shift a month encoded as YYYYMM (e.g. 196608) by `b` months.
# `a` may be a vector; a negative `b` moves backwards in time. The result
# uses the same YYYYMM encoding as the input.
YM_plus <- function(a, b) {
  # Work on a zero-based month count so that crossing a year boundary is
  # plain integer division / modulo arithmetic.
  shifted <- (a %% 100) + b - 1
  year <- (a %/% 100) + shifted %/% 12
  100 * year + (shifted %% 12 + 1)
}
# Add to every MP row the first and last month of the 7-month window centred
# on that row's own Month (3 months back, 3 months forward).
MP[, c("fromdate", "todate") := .(YM_plus(Month, -3), YM_plus(Month, +3)) ]
# Non-equi join: for each MPSubSample row keep the MP rows whose window
# contains the sample's Month, i.e. MP rows at most 3 months away from it.
# The trailing .SD/.SDcols step keeps only the columns named in MPSubSample.
# NOTE(review): if two sample months are less than 7 months apart their
# windows overlap and the shared MP rows would appear once per match -- confirm
# whether duplicates need to be removed for the full data.
MP[MPSubSample, on = .(fromdate <= Month, todate >= Month)][, .SD, .SDcols = names(MPSubSample)]
# Month ER SENT SENT. TS DS D12 E12 Inf.
# 1: 196605 2.4450 -1.56 -1.55 0.14 0.50 2.8133 5.4200 32.35
# 2: 196606 -1.4430 -1.41 -1.49 0.31 0.51 2.8300 5.4600 32.38
# 3: 196607 -1.6220 -1.31 -1.39 0.22 0.52 2.8500 5.4767 32.45
# 4: 196608 -7.9050 -1.12 -1.22 0.26 0.52 2.8700 5.4933 32.65
# 5: 196609 -1.0660 -1.36 -1.33 -0.19 0.60 2.8900 5.5100 32.75
# 6: 196610 3.8619 -1.31 -1.33 -0.34 0.69 2.8833 5.5233 32.85
# 7: 196611 1.3946 -1.28 -1.29 -0.16 0.78 2.8767 5.5367 32.88
Data:
MPSubSample <- structure(list(Month = 196608L, ER = -7.905, SENT = -1.12, SENT. = -1.22, TS = 0.26, DS = 0.52, D12 = 2.87, E12 = 5.493, Inf. = 32.65), class = "data.frame", row.names = c(NA, -1L))
MP <- structure(list(Month = c(196604L, 196605L, 196606L, 196607L, 196608L, 196609L, 196610L, 196611L, 196612L, 196701L), ER = c(2.1373, 2.445, -1.443, -1.622, -7.905, -1.066, 3.8619, 1.3946, 0.1325, 8.1534), SENT = c(-1.66, -1.56, -1.41, -1.31, -1.12, -1.36, -1.31, -1.28, -1.23, -1.06), SENT. = c(-1.62, -1.55, -1.49, -1.39, -1.22, -1.33, -1.33, -1.29, -1.18, -1.08), TS = c(0.13, 0.14, 0.31, 0.22, 0.26, -0.19, -0.34, -0.16, -0.12, -0.14), DS = c(0.45, 0.5, 0.51, 0.52, 0.52, 0.6, 0.69, 0.78, 0.79, 0.77), D12 = c(2.7967, 2.8133, 2.83, 2.85, 2.87, 2.89, 2.8833, 2.8767, 2.87, 2.88), E12 = c(5.38, 5.42, 5.46, 5.4767, 5.4933, 5.51, 5.5233, 5.5367, 5.55, 5.5167), Inf. = c(32.28, 32.35, 32.38, 32.45, 32.65, 32.75, 32.85, 32.88, 32.92, 32.9)), class = "data.frame", row.names = c(NA, -10L))

Related

format table to have mean (sd) instead of separate columns R

I have a data frame of several water quality measures. For each measure I have a calculated mean and SD, with a value for each of 6 sites and 4 seasons. Currently my data frame stores the mean in one column (for example 'Temp_1') and the standard deviation in another (for example 'Temp_2'). I want to export the file with one column for each water quality measure, in the format mean (SD).
current output
This is an example for the first water measure, but I'd like to code it so it is also done to remaining factors as well.
desired output
Head of dataframe
structure(list(season = structure(c(1L, 1L, 1L, 1L, 1L, 1L), levels = c("Winter",
"Spring", "Summer", "Autumn"), class = "factor"), Site = structure(1:6, levels = c("1",
"2", "3", "4", "5", "6"), class = "factor"), Temp_1 = c(7.2,
7.05, 6.3, 6.25, 6.2, 5.4), Temp_2 = c(1.55563491861041, 1.90918830920368,
1.69705627484771, 2.33345237791561, 2.40416305603426, 2.40416305603426
), pH_1 = c(7.435, 7.38, 7.52, 7.525, 7.38, 7.565), pH_2 = c(0.289913780286484,
0.282842712474619, 0.0989949493661164, 0.120208152801713, 0.0565685424949239,
0.261629509039023), DO_1 = c(9, 9.1, 8.25, 8.85, 9.25, 9), DO_2 = c(0,
0.424264068711928, 0.0707106781186558, 0.494974746830583, 0.636396103067892,
0.42426406871193), EC_1 = c(337.5, 333, 321.5, 322, 309, 300.5
), EC_2 = c(55.8614357137373, 41.0121933088198, 51.618795026618,
32.5269119345812, 25.4558441227157, 30.4055915910215), SS_1 = c(5.945,
3.65, 5.025, 2.535, 10.22, 4.595), SS_2 = c(0.728319984622144,
1.06066017177982, 2.93449314192417, 0.473761543394987, 8.23072293301141,
0.67175144212722), TP_1 = c(73.5, 75, 61.5, 66.5, 83, 87), TP_2 = c(3.53553390593274,
12.7279220613579, 9.19238815542512, 6.36396103067893, 26.8700576850888,
24.0416305603426), SRP_1 = c(19, 19, 10, 14, 13.5, 23.5), SRP_2 = c(2.82842712474619,
1.4142135623731, 2.82842712474619, 0, 0.707106781186548, 3.53553390593274
), PP_1 = c(54.5, 56, 51.5, 52.5, 69.5, 63.5), PP_2 = c(6.36396103067893,
11.3137084989848, 6.36396103067893, 6.36396103067893, 26.1629509039023,
20.5060966544099), DA_1 = c(0.083, 0.0775, 0.0775, 0.044, 0.059,
0.051), DA_2 = c(0.00282842712474619, 0.0120208152801713, 0.00919238815542513,
0.0014142135623731, 0.0127279220613579, 0.00848528137423857),
DNI_1 = c(0.048739437, 0.041015562, 0.0617723365, 0.0337441755,
0.041480944, 0.0143461675), DNI_2 = c(0.0345079125942686,
0.0223312453226695, 0.0187360224120165, 0.0162032493604065,
0.0258169069873252, 0.0202885446465761), DNA_1 = c(20.43507986,
20.438919615, 14.98692746, 19.953408625, 17.03060377, 8.5767502525
), DNA_2 = c(1.80288106961836, 1.2687128010491, 2.28839365291436,
1.03116172040732, 0.396528484042397, 1.72350828181138), DF_1 = c(0.0992379715,
0.0947268395, 0.094323125, 0.098064875, 0.0980304675, 0.085783911
), DF_2 = c(0.00372072305060515, 0.00724914346231915, 0.0142932471712976,
0.0116895470668939, 0.00255671780854136, 0.00830519117656529
), DC_1 = c(12.18685357, 12.73924378, 13.09550326, 13.417557825,
15.140975265, 21.429763715), DC_2 = c(0.57615880774946, 0.0430071960969884,
0.702539578486863, 0.134642528587041, 0.66786605299916, 0.17012889453292
), DS_1 = c(15.834380095, 15.69623116, 14.37636388, 15.444235935,
14.647596185, 11.9877372), DS_2 = c(1.67153135346354, 1.69978765863781,
2.47560570280853, 1.03831263471691, 1.24488755930594, 0.975483163720397
), DOC_1 = c(19.74, 20.08, 21.24, 20.34, 21.88, 24.92), DOC_2 = c(2.7435743110038,
1.69705627484772, 2.60215295476649, 1.04651803615609, 0.226274169979695,
0.452548339959388)), row.names = c(NA, 6L), class = "data.frame")
Using mutate with across, plus a small trick to pair each mean column with its SD column, we can do it this way. Further adaptation is possible; for example, to keep only the mean_sd columns, use transmute instead of mutate:
Update:
library(dplyr)
library(stringr)
df %>%
  # Round every measurement column first so the pasted labels stay short.
  mutate(across(-c(season, Site), ~ round(.x, 2))) %>%
  # For each mean column (`*_1`), look up the matching SD column (`*_2`) by
  # name and paste the pair together as "mean(sd)" in a new mean_sd_* column.
  mutate(across(
    ends_with("_1"),
    ~ paste0(.x, "(", get(str_replace(cur_column(), "_1$", "_2")), ")"),
    .names = "mean_sd_{.col}"
  )) %>%
  # Strip only the trailing "_1" (mean_sd_Temp_1 -> mean_sd_Temp). The pattern
  # is anchored so a "_1" occurring earlier in a name (e.g. inside "_10") is
  # left alone. rename_with() replaces the superseded rename_at().
  rename_with(~ str_remove(.x, "_1$"), starts_with("mean_sd"))
season Site Temp_1 Temp_2 pH_1 pH_2 DO_1 DO_2 EC_1 EC_2 SS_1 SS_2 TP_1 TP_2 SRP_1 SRP_2 PP_1 PP_2 DA_1 DA_2 DNI_1 DNI_2 DNA_1 DNA_2 DF_1
1 Winter 1 7.20 1.56 7.43 0.29 9.00 0.00 337.5 55.86 5.94 0.73 73.5 3.54 19.0 2.83 54.5 6.36 0.08 0.00 0.05 0.03 20.44 1.80 0.10
2 Winter 2 7.05 1.91 7.38 0.28 9.10 0.42 333.0 41.01 3.65 1.06 75.0 12.73 19.0 1.41 56.0 11.31 0.08 0.01 0.04 0.02 20.44 1.27 0.09
3 Winter 3 6.30 1.70 7.52 0.10 8.25 0.07 321.5 51.62 5.03 2.93 61.5 9.19 10.0 2.83 51.5 6.36 0.08 0.01 0.06 0.02 14.99 2.29 0.09
4 Winter 4 6.25 2.33 7.53 0.12 8.85 0.49 322.0 32.53 2.54 0.47 66.5 6.36 14.0 0.00 52.5 6.36 0.04 0.00 0.03 0.02 19.95 1.03 0.10
5 Winter 5 6.20 2.40 7.38 0.06 9.25 0.64 309.0 25.46 10.22 8.23 83.0 26.87 13.5 0.71 69.5 26.16 0.06 0.01 0.04 0.03 17.03 0.40 0.10
6 Winter 6 5.40 2.40 7.57 0.26 9.00 0.42 300.5 30.41 4.60 0.67 87.0 24.04 23.5 3.54 63.5 20.51 0.05 0.01 0.01 0.02 8.58 1.72 0.09
DF_2 DC_1 DC_2 DS_1 DS_2 DOC_1 DOC_2 mean_sd_Temp mean_sd_pH mean_sd_DO mean_sd_EC mean_sd_SS mean_sd_TP mean_sd_SRP mean_sd_PP mean_sd_DA
1 0.00 12.19 0.58 15.83 1.67 19.74 2.74 7.2(1.56) 7.43(0.29) 9(0) 337.5(55.86) 5.94(0.73) 73.5(3.54) 19(2.83) 54.5(6.36) 0.08(0)
2 0.01 12.74 0.04 15.70 1.70 20.08 1.70 7.05(1.91) 7.38(0.28) 9.1(0.42) 333(41.01) 3.65(1.06) 75(12.73) 19(1.41) 56(11.31) 0.08(0.01)
3 0.01 13.10 0.70 14.38 2.48 21.24 2.60 6.3(1.7) 7.52(0.1) 8.25(0.07) 321.5(51.62) 5.03(2.93) 61.5(9.19) 10(2.83) 51.5(6.36) 0.08(0.01)
4 0.01 13.42 0.13 15.44 1.04 20.34 1.05 6.25(2.33) 7.53(0.12) 8.85(0.49) 322(32.53) 2.54(0.47) 66.5(6.36) 14(0) 52.5(6.36) 0.04(0)
5 0.00 15.14 0.67 14.65 1.24 21.88 0.23 6.2(2.4) 7.38(0.06) 9.25(0.64) 309(25.46) 10.22(8.23) 83(26.87) 13.5(0.71) 69.5(26.16) 0.06(0.01)
6 0.01 21.43 0.17 11.99 0.98 24.92 0.45 5.4(2.4) 7.57(0.26) 9(0.42) 300.5(30.41) 4.6(0.67) 87(24.04) 23.5(3.54) 63.5(20.51) 0.05(0.01)
mean_sd_DNI mean_sd_DNA mean_sd_DF mean_sd_DC mean_sd_DS mean_sd_DOC
1 0.05(0.03) 20.44(1.8) 0.1(0) 12.19(0.58) 15.83(1.67) 19.74(2.74)
2 0.04(0.02) 20.44(1.27) 0.09(0.01) 12.74(0.04) 15.7(1.7) 20.08(1.7)
3 0.06(0.02) 14.99(2.29) 0.09(0.01) 13.1(0.7) 14.38(2.48) 21.24(2.6)
4 0.03(0.02) 19.95(1.03) 0.1(0.01) 13.42(0.13) 15.44(1.04) 20.34(1.05)
5 0.04(0.03) 17.03(0.4) 0.1(0) 15.14(0.67) 14.65(1.24) 21.88(0.23)
6 0.01(0.02) 8.58(1.72) 0.09(0.01) 21.43(0.17) 11.99(0.98) 24.92(0.45)
First answer:
We could do this like so:
library(dplyr)
df %>% mutate(mean_sd = paste0(Temp_1, " (", round(Temp_2,2), ")"), .before=5)
season Site Temp_1 Temp_2 mean_sd pH_1 pH_2 DO_1 DO_2 EC_1 EC_2 SS_1 SS_2 TP_1 TP_2 SRP_1 SRP_2 PP_1
1 Winter 1 7.20 1.555635 7.2 (1.56) 7.435 0.28991378 9.00 0.00000000 337.5 55.86144 5.945 0.7283200 73.5 3.535534 19.0 2.8284271 54.5
2 Winter 2 7.05 1.909188 7.05 (1.91) 7.380 0.28284271 9.10 0.42426407 333.0 41.01219 3.650 1.0606602 75.0 12.727922 19.0 1.4142136 56.0
3 Winter 3 6.30 1.697056 6.3 (1.7) 7.520 0.09899495 8.25 0.07071068 321.5 51.61880 5.025 2.9344931 61.5 9.192388 10.0 2.8284271 51.5
4 Winter 4 6.25 2.333452 6.25 (2.33) 7.525 0.12020815 8.85 0.49497475 322.0 32.52691 2.535 0.4737615 66.5 6.363961 14.0 0.0000000 52.5
5 Winter 5 6.20 2.404163 6.2 (2.4) 7.380 0.05656854 9.25 0.63639610 309.0 25.45584 10.220 8.2307229 83.0 26.870058 13.5 0.7071068 69.5
6 Winter 6 5.40 2.404163 5.4 (2.4) 7.565 0.26162951 9.00 0.42426407 300.5 30.40559 4.595 0.6717514 87.0 24.041631 23.5 3.5355339 63.5
PP_2 DA_1 DA_2 DNI_1 DNI_2 DNA_1 DNA_2 DF_1 DF_2 DC_1 DC_2 DS_1 DS_2 DOC_1
1 6.363961 0.0830 0.002828427 0.04873944 0.03450791 20.43508 1.8028811 0.09923797 0.003720723 12.18685 0.5761588 15.83438 1.6715314 19.74
2 11.313708 0.0775 0.012020815 0.04101556 0.02233125 20.43892 1.2687128 0.09472684 0.007249143 12.73924 0.0430072 15.69623 1.6997877 20.08
3 6.363961 0.0775 0.009192388 0.06177234 0.01873602 14.98693 2.2883937 0.09432312 0.014293247 13.09550 0.7025396 14.37636 2.4756057 21.24
4 6.363961 0.0440 0.001414214 0.03374418 0.01620325 19.95341 1.0311617 0.09806487 0.011689547 13.41756 0.1346425 15.44424 1.0383126 20.34
5 26.162951 0.0590 0.012727922 0.04148094 0.02581691 17.03060 0.3965285 0.09803047 0.002556718 15.14098 0.6678661 14.64760 1.2448876 21.88
6 20.506097 0.0510 0.008485281 0.01434617 0.02028854 8.57675 1.7235083 0.08578391 0.008305191 21.42976 0.1701289 11.98774 0.9754832 24.92
DOC_2
1 2.7435743
2 1.6970563
3 2.6021530
4 1.0465180
5 0.2262742
6 0.4525483
You can create a new column like this
df$Temp <- paste0(df$Temp_1, ' (', df$Temp_2, ')')
And select only the desired output columns
df[, c('season', 'Site', 'Temp')]
library(tidyverse)
df %>%
pivot_longer(-c(season, Site)) %>%
mutate(name = name %>% str_remove_all("[^a-zA-Z]")) %>%
group_by(season, Site, name) %>%
summarise(value = str_c(round(value, 2), collapse = ", ")) %>%
pivot_wider(names_from = name,
values_from = value)
# A tibble: 6 x 17
# Groups: season, Site [6]
season Site DA DC DF DNA DNI DO DOC DS EC pH PP SRP SS Temp TP
<fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Winter 1 0.08, 0 12.19, 0.58 0.1, 0 20.44, 1.8 0.05, 0.03 9, 0 19.7~ 15.8~ 337.~ 7.43~ 54.5~ 19, ~ 5.94~ 7.2,~ 73.5~
2 Winter 2 0.08, 0.01 12.74, 0.04 0.09, 0.01 20.44, 1.27 0.04, 0.02 9.1, 0.~ 20.0~ 15.7~ 333,~ 7.38~ 56, ~ 19, ~ 3.65~ 7.05~ 75, ~
3 Winter 3 0.08, 0.01 13.1, 0.7 0.09, 0.01 14.99, 2.29 0.06, 0.02 8.25, 0~ 21.2~ 14.3~ 321.~ 7.52~ 51.5~ 10, ~ 5.03~ 6.3,~ 61.5~
4 Winter 4 0.04, 0 13.42, 0.13 0.1, 0.01 19.95, 1.03 0.03, 0.02 8.85, 0~ 20.3~ 15.4~ 322,~ 7.53~ 52.5~ 14, 0 2.54~ 6.25~ 66.5~
5 Winter 5 0.06, 0.01 15.14, 0.67 0.1, 0 17.03, 0.4 0.04, 0.03 9.25, 0~ 21.8~ 14.6~ 309,~ 7.38~ 69.5~ 13.5~ 10.2~ 6.2,~ 83, ~
6 Winter 6 0.05, 0.01 21.43, 0.17 0.09, 0.01 8.58, 1.72 0.01, 0.02 9, 0.42 24.9~ 11.9~ 300.~ 7.57~ 63.5~ 23.5~ 4.6,~ 5.4,~ 87, ~

Translating loop syntax from Stata to R

I need to write a for loop to calculate the product of year variables (e.g. var1874) * price variables (e.g. num1874), creating a new variable for each year and its corresponding price value (e.g. newvar1874).
Here's my data in R
A tibble: 4 x 7
cty var1874 var1875 var1876 num1874 num1875 num1876
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.78 0.83 0.99 2.64 2.8 3.1
2 2 0.69 0.69 0.89 2.3 2.3 2.58
3 3 0.42 0.48 0.59 2.28 2.44 2.64
4 4 0.82 0.94 1.09 2.28 2.36 3
I've been able to do this using the 'foreach' loop in Stata:
* For each year, create newvarYYYY as the product of varYYYY and numYYYY.
local vn 1874 1875 1876
foreach v of local vn {
    * `v' expands to the current year, so var`v' becomes e.g. var1874.
    gen newvar`v' = var`v'*num`v'
}
Does anyone know how I would do this same type of command using the for loop in R? I know there may be simpler ways to do this without the for loop, but I need to know how to do this using the for loop.
Using a for loop you could do:
# Years for which `d` holds a var<year> / num<year> column pair.
vn <- 1874:1876
for (v in vn) {
  # Build the column names from the year and store the product as newvar<year>.
  d[[sprintf("newvar%d", v)]] <- d[[sprintf("var%d", v)]] * d[[sprintf("num%d", v)]]
}
d
#> cty var1874 var1875 var1876 num1874 num1875 num1876 newvar1874 newvar1875
#> 1 1 0.78 0.83 0.99 2.64 2.80 3.10 2.0592 2.3240
#> 2 2 0.69 0.69 0.89 2.30 2.30 2.58 1.5870 1.5870
#> 3 3 0.42 0.48 0.59 2.28 2.44 2.64 0.9576 1.1712
#> 4 4 0.82 0.94 1.09 2.28 2.36 3.00 1.8696 2.2184
#> newvar1876
#> 1 3.0690
#> 2 2.2962
#> 3 1.5576
#> 4 3.2700
Or using lapply you could do:
d[, paste0("newvar", vn)] <- lapply(vn, function(v) d[[paste0("var", v)]] * d[[paste0("num", v)]])
DATA
d <- structure(list(
cty = 1:4, var1874 = c(0.78, 0.69, 0.42, 0.82),
var1875 = c(0.83, 0.69, 0.48, 0.94), var1876 = c(
0.99, 0.89,
0.59, 1.09
), num1874 = c(2.64, 2.3, 2.28, 2.28), num1875 = c(
2.8,
2.3, 2.44, 2.36
), num1876 = c(3.1, 2.58, 2.64, 3)
), class = "data.frame", row.names = c(
"1",
"2", "3", "4"
))

Percentage changes in shares

My data contains the closing prices of 10 shares of the S&P 500 index.
Data :
> dput(head(StocksData))
structure(list(ACE = c(56.86, 56.82, 56.63, 56.39, 55.97, 55.23
), AMD = c(8.47, 8.77, 8.91, 8.69, 8.83, 9.19), AFL = c(51.83,
50.88, 50.78, 50.5, 50.3, 49.65), APD = c(81.59, 80.38, 80.03,
79.61, 79.76, 79.77), AA = c(15.12, 15.81, 15.85, 15.66, 15.71,
15.78), ATI = c(53.54, 52.37, 52.53, 51.91, 51.32, 51.45), AGN = c(69.77,
69.53, 69.69, 69.98, 68.99, 68.75), ALL = c(29.32, 29.03, 28.99,
28.66, 28.47, 28.2), MO = c(20.09, 20, 20.07, 20.16, 20, 19.88
), AMZN = c(184.22, 185.01, 187.42, 185.86, 185.49, 184.68)), row.names = c(NA,
6L), class = "data.frame")
For each of the 10 shares, i want to calculate their daily percentage changes.
I am struggling with this because the data contains no time information: there is no year, month, or day column.
Any thoughts on this would be helpful.
You can use :
perc_change <- (StocksData[-1, ] - StocksData[-nrow(StocksData), ])/StocksData[-nrow(StocksData), ] * 100
perc_change
# ACE AMD AFL APD AA ATI AGN ALL MO AMZN
#2 -0.07 3.5 -1.83 -1.483 4.56 -2.19 -0.34 -0.99 -0.45 0.43
#3 -0.33 1.6 -0.20 -0.435 0.25 0.31 0.23 -0.14 0.35 1.30
#4 -0.42 -2.5 -0.55 -0.525 -1.20 -1.18 0.42 -1.14 0.45 -0.83
#5 -0.74 1.6 -0.40 0.188 0.32 -1.14 -1.41 -0.66 -0.79 -0.20
#6 -1.32 4.1 -1.29 0.013 0.45 0.25 -0.35 -0.95 -0.60 -0.44
Note that you have 1 row less than your original dataframe since we don't have previous value to compare for 1st day.

R: obtain single data frame from list of zoo-objects

I have a list of zoo objects consisting of irregular time-series, lodf, in the following format:
> head(lodf)
[[1]]
2014-08-08 2014-08-14 2014-09-12
1.15 1.32 2.39
[[2]]
2014-07-22 2014-07-24 2014-08-14 2014-08-20 2014-08-27 2014-09-12
0.50 0.75 1.29 1.36 1.28 1.28
[[3]]
2012-11-01 2012-11-02 2013-07-12 2013-08-13 2013-09-11 2014-07-01
1.00 1.27 0.91 1.00 0.99 0.98
...
I am ultimately trying to sum all these time-series into one combined time-series, i.e. sum down each column. To do this, I am trying to convert them into a zoo/xts time-series for further manipulation, i.e. to apply na.locf and other zoo-library capabilities before summing across the individual data frames/dates using rowsum. In other words, I am trying to get my list of data frames above into a combined zoo object resembling this:
Value
12/09/2014 1.07
14/08/2014 1.32
08/08/2014 1.15
12/09/2014 0.48
27/08/2014 0.53
20/08/2014 0.61
14/08/2014 0.54
24/07/2014 0.75
22/07/2014 0.5
01/07/2014 0.98
01/07/2014 0
...
There is often over-lap between the individual data frames i.e. several values corresponding to the same date index, and What I would like to do in those cases is to sum the values. E.g. if I have
2012-11-01
0.7
2012-11-01
1.5
2012-11-01
0.7
I would like to have
2012-11-01
2.9
as the value for this date index in the resulting large data frame.
I have tried merge, reading as a zoo object, do.call(rbind) etc. in the current format, but I am stumped. For further context, this question is part of a larger project outlined here: R: time series with duplicate time index entries. Any help would be most appreciated!
Update: please find a data object below as requested:
> dput(head(lodf))
list(structure(c(1.15, 1.32, 2.39), index = structure(c(16290L,
16296L, 16325L), class = "Date"), class = "zoo"), structure(c(0.5,
0.75, 1.29, 1.36, 1.28, 1.28), index = structure(c(16273L, 16275L,
16296L, 16302L, 16309L, 16325L), class = "Date"), class = "zoo"),
structure(c(1, 1.27, 0.91, 1, 0.99, 0.98), index = structure(c(15645L,
15646L, 15898L, 15930L, 15959L, 16252L), class = "Date"), class = "zoo"),
structure(c(1.27, 1.29, 1.28, 1.17, 0.59, 0), index = structure(c(15645L,
15651L, 15665L, 15679L, 15686L, 15747L), class = "Date"), class = "zoo"),
structure(c(1.9, 1.35, 0.66, 1.16, 0.66, 1.16, 1.26, 1.23,
1.28, 1.23, 1.17, 0.66, 1.18, 0.66, 1.29, 1.35, 1.45, 1.53,
1.61, 1.82, 1.8, 1.89, 1.8, 1.81, 1.78, 1.68, 2.18, 1.68,
1.56, 1.93, 1.84, 1.69, 1.18, 1.73, 1.18, 1.72, 1.83, 1.9,
1.99, 1.93, 1.87, 1.96, 2.1, 2.22, 2.33, 2.38, 2.35, 2.23,
2.16, 2.18, 2.17, 2.2, 2.29, 2.27, 2.28, 2.42, 2.48, 2.99,
2.56, 2.65, 2.69, 3.21, 2.7, 2.8, 2.79, 2.8, 2.78, 2.26,
2.78, 2.26, 2.12, 2.07, 1.97, 1.84, 1.77, 1.18, 1.7, 1.78,
1.91, 1.98, 1.93, 1.83, 1.76, 1.18, 1.01, 0.97, 0.86, 0.69,
0.56), index = structure(c(15645L, 15652L, 15660L, 15740L,
15797L, 15841L, 15860L, 15867L, 15876L, 15887L, 15890L, 15897L,
15901L, 15905L, 15908L, 15909L, 15910L, 15911L, 15915L, 15926L,
15931L, 15932L, 15938L, 15953L, 15954L, 15975L, 15978L, 15979L,
15981L, 15982L, 15985L, 15986L, 15987L, 16001L, 16003L, 16006L,
16008L, 16010L, 16014L, 16016L, 16021L, 16022L, 16023L, 16027L,
16029L, 16031L, 16045L, 16052L, 16059L, 16072L, 16077L, 16078L,
16084L, 16091L, 16098L, 16100L, 16101L, 16106L, 16132L, 16133L,
16134L, 16139L, 16146L, 16150L, 16153L, 16157L, 16160L, 16163L,
16167L, 16169L, 16170L, 16171L, 16175L, 16177L, 16182L, 16184L,
16212L, 16216L, 16220L, 16224L, 16248L, 16254L, 16258L, 16261L,
16297L, 16301L, 16309L, 16310L, 16317L), class = "Date"), class = "zoo"),
structure(c(3.35, 3.44, 3.41, 3.14, 3.11, 2.55, 2.65, 2.87,
3.14, 3.24, 3.41, 4.04, 4.19, 4.34, 4.44, 1.2, 1.3, 1.29,
1.3, 1.27, 0.77, 0.69, 0.55, 0), index = structure(c(15645L,
15650L, 15694L, 15740L, 15741L, 15742L, 15743L, 15749L, 15750L,
15751L, 15755L, 15756L, 15758L, 15762L, 15784L, 15800L, 15805L,
15810L, 15824L, 15835L, 15838L, 15840L, 15847L, 15849L), class = "Date"), class = "zoo"))
>
The input displayed at the top of the question appears to be the first three components of the input specified at the bottom of the question. The variable name used at the bottom of the question, lodf, seems to suggest that it contains a list of data frames but in fact it contains a list of zoo objects.
The question asks for a single data frame result but we are assuming that the output should be a single zoo series too, for consistency. Also we shall use the name L for the input as lodf would wrongly suggest a list of data frames. If z is the result as a zoo series then
data.frame(index = index(z), data = coredata(z))
could be used if a data frame really were desired.
In the output section near the end of this answer we show the result of using as our input L <- lodf[1:3] (i.e. first 3 components only) and separately show the output using L <- lodf (i.e. all components) as our input.
1) Reduce. We merge the zoo series in the list, L, returning a list and filling in missing values with 0. Then use Reduce to sum the components:
Reduce(`+`, do.call(merge, c(L, retclass = "list", fill = 0)))
1a) A variation of this is to return a zoo object from merge (which is the default if we do not specify retclass), then fill in its NAs with 0, turn it back into a list and use Reduce:
Reduce(`+`, as.list(na.fill(do.call(merge, L), 0)))
2) rowSums In this solution we merge the lists to give zoo object z, optionally add column names and then add across rows producing the final zoo object.
# Merge every series in the list L into one multi-column zoo object; a date
# that is missing from a series becomes NA in that series' column.
z <- do.call(merge, L)
# Name the columns of the merged object (not of the input list L, which has
# no dimensions and would make `colnames<-` fail).
colnames(z) <- seq_along(L) # optionally add names
# Sum across the columns for each date, ignoring the NAs, and rebuild a zoo
# series on the merged index.
zoo(rowSums(z, na.rm = TRUE), time(z))
Note that a rowSums solution of zoo objects previously appeared here
3) + If we knew that there were exactly 3 components to the list then an alternate way to write the above would be this. We optionally add names 1, 2, 3, merge the zoo objects and fill NAs with 0. Finally we add the series together. Modify in the obvious way if the number of components differs.
z0 <- na.fill(do.call(merge, L), 0)
colnames(z0) <- 1:3 # optionally add names 1, 2, 3
z0[, 1] + z0[, 2] + z0[, 3]
Output Using L <- lodf[1:3] as displayed at the start of the question where lodf is shown at the bottom of the question our output is:
2012-11-01 2012-11-02 2013-07-12 2013-08-13 2013-09-11 2014-07-01 2014-07-22
1.00 1.27 0.91 1.00 0.99 0.98 0.50
2014-07-24 2014-08-08 2014-08-14 2014-08-20 2014-08-27 2014-09-12
0.75 1.15 2.61 1.36 1.28 3.67
or using L <- lodf in the above we get the following (except for solution 3 which would have to be modified in an obvious way to use 6 rather than 3 components):
2012-11-01 2012-11-02 2012-11-06 2012-11-07 2012-11-08 2012-11-16 2012-11-21
7.52 1.27 3.44 1.29 1.35 0.66 1.28
2012-12-05 2012-12-12 2012-12-20 2013-02-04 2013-02-05 2013-02-06 2013-02-07
1.17 0.59 3.41 4.30 3.11 2.55 2.65
2013-02-11 2013-02-13 2013-02-14 2013-02-15 2013-02-19 2013-02-20 2013-02-22
0.00 2.87 3.14 3.24 3.41 4.04 4.19
2013-02-26 2013-03-20 2013-04-02 2013-04-05 2013-04-10 2013-04-15 2013-04-29
4.34 4.44 0.66 1.20 1.30 1.29 1.30
2013-05-10 2013-05-13 2013-05-15 2013-05-16 2013-05-22 2013-05-24 2013-06-04
1.27 0.77 0.69 1.16 0.55 0.00 1.26
2013-06-11 2013-06-20 2013-07-01 2013-07-04 2013-07-11 2013-07-12 2013-07-15
1.23 1.28 1.23 1.17 0.66 0.91 1.18
2013-07-19 2013-07-22 2013-07-23 2013-07-24 2013-07-25 2013-07-29 2013-08-09
0.66 1.29 1.35 1.45 1.53 1.61 1.82
2013-08-13 2013-08-14 2013-08-15 2013-08-21 2013-09-05 2013-09-06 2013-09-11
1.00 1.80 1.89 1.80 1.81 1.78 0.99
2013-09-27 2013-09-30 2013-10-01 2013-10-03 2013-10-04 2013-10-07 2013-10-08
1.68 2.18 1.68 1.56 1.93 1.84 1.69
2013-10-09 2013-10-23 2013-10-25 2013-10-28 2013-10-30 2013-11-01 2013-11-05
1.18 1.73 1.18 1.72 1.83 1.90 1.99
2013-11-07 2013-11-12 2013-11-13 2013-11-14 2013-11-18 2013-11-20 2013-11-22
1.93 1.87 1.96 2.10 2.22 2.33 2.38
2013-12-06 2013-12-13 2013-12-20 2014-01-02 2014-01-07 2014-01-08 2014-01-14
2.35 2.23 2.16 2.18 2.17 2.20 2.29
2014-01-21 2014-01-28 2014-01-30 2014-01-31 2014-02-05 2014-03-03 2014-03-04
2.27 2.28 2.42 2.48 2.99 2.56 2.65
2014-03-05 2014-03-10 2014-03-17 2014-03-21 2014-03-24 2014-03-28 2014-03-31
2.69 3.21 2.70 2.80 2.79 2.80 2.78
2014-04-03 2014-04-07 2014-04-09 2014-04-10 2014-04-11 2014-04-15 2014-04-17
2.26 2.78 2.26 2.12 2.07 1.97 1.84
2014-04-22 2014-04-24 2014-05-22 2014-05-26 2014-05-30 2014-06-03 2014-06-27
1.77 1.18 1.70 1.78 1.91 1.98 1.93
2014-07-01 2014-07-03 2014-07-07 2014-07-10 2014-07-22 2014-07-24 2014-08-08
0.98 1.83 1.76 1.18 0.50 0.75 1.15
2014-08-14 2014-08-15 2014-08-19 2014-08-20 2014-08-27 2014-08-28 2014-09-04
2.61 1.01 0.97 1.36 2.14 0.69 0.56
2014-09-12
Updates Added additional solutions and re-arranged and expanded presentation.
Try (If the list elements are list of zoo objects and if you need to get the sum of the matching index).
library(xts)
library(zoo)
z1 <- setNames(do.call(`merge`, lodf), paste0("Value", seq_along(lodf)))
xts(data.frame(value=rowSums(z1, na.rm=TRUE)), order.by=index(z1))
# value
#2012-11-01 1.00
#2012-11-02 1.27
#2013-07-12 0.91
#2013-08-13 1.00
#2013-09-11 0.99
#2014-07-01 0.98
#2014-07-22 0.50
#2014-07-24 0.75
#2014-08-08 1.15
#2014-08-14 2.61
#2014-08-20 1.36
#2014-08-27 1.28
#2014-09-12 3.67
If you need to use na.locf before summing
z2 <- na.locf(z1)
xts(data.frame(value=rowSums(z2, na.rm=TRUE)), order.by=index(z2))
data
lodf <- list(structure(c(1.15, 1.32, 2.39), index = structure(c(16290,
16296, 16325), class = "Date"), class = "zoo"), structure(c(0.5,
0.75, 1.29, 1.36, 1.28, 1.28), index = structure(c(16273, 16275,
16296, 16302, 16309, 16325), class = "Date"), class = "zoo"),
structure(c(1, 1.27, 0.91, 1, 0.99, 0.98), index = structure(c(15645,
15646, 15898, 15930, 15959, 16252), class = "Date"), class = "zoo"))
With base R:
lodf = list(structure(list(`014-08-08` = 1.15, `2014-08-14` = 1.32,
`2014-09-12` = 2.39), .Names = c("014-08-08", "2014-08-14",
"2014-09-12"), class = "data.frame", row.names = c(NA, -1L)),
structure(list(`2014-07-22` = 0.5, `2014-07-24` = 0.75, `2014-08-14` = 1.29,
`2014-08-20` = 1.36, `2014-08-27` = 1.28, `2014-09-12` = 1.28), .Names = c("2014-07-22",
"2014-07-24", "2014-08-14", "2014-08-20", "2014-08-27", "2014-09-12"
), class = "data.frame", row.names = c(NA, -1L)), structure(list(
`2012-11-01` = 1, `2012-11-02` = 1.27, `2013-07-12` = 0.91,
`2013-08-13` = 1, `2013-09-11` = 0.99, `2014-07-01` = 0.98), .Names = c("2012-11-01",
"2012-11-02", "2013-07-12", "2013-08-13", "2013-09-11", "2014-07-01"
), class = "data.frame", row.names = c(NA, -1L)))
lodf
[[1]]
014-08-08 2014-08-14 2014-09-12
1 1.15 1.32 2.39
[[2]]
2014-07-22 2014-07-24 2014-08-14 2014-08-20 2014-08-27 2014-09-12
1 0.5 0.75 1.29 1.36 1.28 1.28
[[3]]
2012-11-01 2012-11-02 2013-07-12 2013-08-13 2013-09-11 2014-07-01
1 1 1.27 0.91 1 0.99 0.98
# Flatten every data frame in `lodf` into "date_value" strings (one string per
# column) and collect them in a one-column data frame. Building the data frame
# directly from the vector avoids the 1:length(ll) fill, which misbehaves when
# `lodf` is empty; as.character() keeps the column character in that case too.
ll <- as.character(
  unlist(lapply(lodf, function(x) paste(names(x), x, sep = "_")), use.names = FALSE)
)
ddf <- data.frame(full = ll, stringsAsFactors = FALSE)
ddf
full
1 014-08-08_1.15
2 2014-08-14_1.32
3 2014-09-12_2.39
4 2014-07-22_0.5
5 2014-07-24_0.75
6 2014-08-14_1.29
7 2014-08-20_1.36
8 2014-08-27_1.28
9 2014-09-12_1.28
10 2012-11-01_1
11 2012-11-02_1.27
12 2013-07-12_0.91
13 2013-08-13_1
14 2013-09-11_0.99
15 2014-07-01_0.98
# Split each "date_value" string once and reuse the pieces for both columns;
# vapply() guarantees exactly one string per row.
parts <- strsplit(ddf$full, "_")
ddf$date <- vapply(parts, function(x) x[1], character(1), USE.NAMES = FALSE)
ddf$value <- as.numeric(
  vapply(parts, function(x) x[2], character(1), USE.NAMES = FALSE)
)
# Drop the combined helper column now that it has been split.
ddf <- ddf[, -1]
ddf
date value
1 014-08-08 1.15
2 2014-08-14 1.32
3 2014-09-12 2.39
4 2014-07-22 0.50
5 2014-07-24 0.75
6 2014-08-14 1.29
7 2014-08-20 1.36
8 2014-08-27 1.28
9 2014-09-12 1.28
10 2012-11-01 1.00
11 2012-11-02 1.27
12 2013-07-12 0.91
13 2013-08-13 1.00
14 2013-09-11 0.99
15 2014-07-01 0.98
Finally:
aggregate(value~date, ddf, sum)
date value
1 2012.11.01 1.00
2 2012.11.02 1.27
3 2013.07.12 0.91
4 2013.08.13 1.00
5 2013.09.11 0.99
6 2014.07.01 0.98
7 2014.07.22 0.50
8 2014.07.24 0.75
9 2014.08.08 1.15
10 2014.08.14 2.61
11 2014.08.20 1.36
12 2014.08.27 1.28
13 2014.09.12 3.67

Merging data frames based on column and row names, conditional column creation

I have a data frame with monthly returns and their corresponding month.
Data <- read.csv("C:/Users/h/Desktop/overflow.csv", sep=";", dec=",")
Data$Date <- as.Date(as.character(Data$Date), format="%Y-%m-%d")
The data frame looks like this now:
> Data
Fund.A Fund.B Fund.C Fund.D
2012-01-01 -0.01 0.04 0.11 0.10
2012-02-01 -0.04 -0.06 0.08 0.11
2012-03-01 -0.04 -0.07 0.15 -0.03
2012-04-01 0.00 -0.08 -0.04 0.13
2012-05-01 -0.07 0.10 0.06 0.02
2012-06-01 -0.05 0.06 0.06 -0.02
2012-07-01 0.12 -0.06 -0.09 -0.06
2012-08-01 0.08 -0.03 0.05 0.13
2012-09-01 0.10 0.07 -0.02 0.15
2012-10-01 -0.08 0.14 0.00 -0.04
2012-11-01 -0.09 0.11 -0.07 0.12
2012-12-01 -0.01 -0.09 0.07 -0.02
Now I want to continue the time series with new returns from a new csv, by simply matching the new return with the appropriate Fund in "Data". My problem is that new assets might have been added, messing up the order.
import <- read.csv("C:/Users/h/Desktop/import.csv", sep=";", dec=",")
import
2013-01-01
1 Funds: NA
2 Fund A 0.04
3 Fund AA -0.09
4 Fund C -0.10
5 Fund D 0.03
6 Fund B 0.14
As you can see, the "import" csv has new assets (Fund AA) as well as assets already seen in "Data" (Fund A to D), and the funds are in rows rather than columns. How can I write code that matches and adds a row to "Data" so that the values in "import" fall under the right column (Fund) in "Data"? And, if a new asset has been added, creates a column for the new asset?
As a bonus, the code would only add a row if the date in "import" is more recent date than the most recent one in "Data". To only import new returns.
Appreciate it!
For time series purpose, I would recommend using xts. It makes life a bit easier. Borrowing from Arun's usable data:
olddata <- structure(list(Date = structure(c(15340, 15371, 15400, 15431,
15461, 15492, 15522, 15553, 15584, 15614, 15645, 15675), class = "Date"),
Fund.A = c(-0.01, -0.04, -0.04, 0, -0.07, -0.05, 0.12, 0.08, 0.1, -0.08,
-0.09, -0.01), Fund.B = c(0.04, -0.06, -0.07, -0.08, 0.1, 0.06, -0.06,
-0.03, 0.07, 0.14, 0.11, -0.09), Fund.C = c(0.11, 0.08, 0.15, -0.04,
0.06, 0.06, -0.09, 0.05, -0.02, 0, -0.07, 0.07), Fund.D = c(0.1, 0.11,
-0.03, 0.13, 0.02, -0.02, -0.06, 0.13, 0.15, -0.04, 0.12, -0.02)),
.Names = c("Date", "Fund.A", "Fund.B", "Fund.C", "Fund.D"),
row.names = c(NA, 12L), class = "data.frame")
newimport <- structure(list(funds = c("Fund.A", "Fund.AA", "Fund.C",
"Fund.D", "Fund.B"), `2013-01-01` = c(0.04, -0.09, -0.1, 0.03, 0.14)),
.Names = c("funds", "2013-01-01"), row.names = c(NA, -5L),
class = "data.frame")
Convert data to xts for easy datewise subsetting:
olddata <- xts(olddata[,-1], olddata$Date)
newdata <- xts(t(newimport[,-1]), as.Date(colnames(newimport)[-1]))
colnames(newdata) <- newimport[,1]
Merge data together while taking care of any new columns:
# Flag which imported funds are already tracked in olddata.
cols <- names(newdata) %in% names(olddata)
# Put the shared funds in olddata's column order before row-binding. The
# import lists funds in a different order, and rbind() on xts objects is not
# guaranteed to realign columns by name, so order them explicitly to keep
# each return under its own fund.
shared <- intersect(colnames(olddata), colnames(newdata))
# Append the new row for the known funds, then merge in the brand-new funds
# as extra columns (NA for all earlier dates).
combineData <- merge(rbind(olddata, newdata[, shared]), newdata[, !cols])
combineData
Fund.A Fund.B Fund.C Fund.D Fund.AA
2012-01-01 -0.01 0.04 0.11 0.10 NA
2012-02-01 -0.04 -0.06 0.08 0.11 NA
2012-03-01 -0.04 -0.07 0.15 -0.03 NA
2012-04-01 0.00 -0.08 -0.04 0.13 NA
2012-05-01 -0.07 0.10 0.06 0.02 NA
2012-06-01 -0.05 0.06 0.06 -0.02 NA
2012-07-01 0.12 -0.06 -0.09 -0.06 NA
2012-08-01 0.08 -0.03 0.05 0.13 NA
2012-09-01 0.10 0.07 -0.02 0.15 NA
2012-10-01 -0.08 0.14 0.00 -0.04 NA
2012-11-01 -0.09 0.11 -0.07 0.12 NA
2012-12-01 -0.01 -0.09 0.07 -0.02 NA
2013-01-01 0.04 0.14 -0.10 0.03 -0.09

Resources