pivot_wider removing a date column when pivoting in R - r
I have some long data and I am trying to use pivot_wider. What I currently have is:
df %>% group_by(TICKER) %>%
mutate(row_id_for_pivot = row_number()) %>%
pivot_wider(names_from = TICKER, values_from = RET, id_cols = row_id_for_pivot)
Which gives me:
row_id_for_pivot JEQ RLH PMC
<int> <chr> <chr> <chr>
1 1 0.007634 0.200405 0.025189
2 2 0.041667 0.065767 0.053440
3 3 0.060000 0.142405 0.062391
4 4 0.012007 0.058172 0.059276
However I lose the date column from the original data.
How can I keep the date column?
Data:
structure(list(date = structure(c(14638, 14666, 14699, 14729,
14757, 14790, 14820, 14852, 14882, 14911, 14943, 14974, 15005,
15033, 15064, 15093, 15125, 15155, 15184, 15217, 15247, 15278,
15308, 15338, 15370, 15399, 15429, 15460, 15491, 15520, 15552,
15583, 15611, 15644, 15674, 15705, 15736, 15764, 15792, 15825,
15856, 15884, 15917, 15947, 15978, 16009, 16038, 16070, 16101,
16129, 16160, 16190, 16220, 16251, 16282, 16311, 16343, 16374,
16402, 16435, 16435, 16465, 16493, 16525, 16555, 16584, 16616,
16647, 16678, 16708, 16738, 16769, 16800, 16800, 16829, 16860,
16891, 16920, 16952, 16982, 17011, 17044, 17074, 17105, 17135,
17165, 17165, 17197, 17225, 17256, 17284, 17317, 17347, 17378,
17409, 17438, 17470, 17500, 17529, 17529, 17562, 17590, 17619,
17651, 17682, 17711, 17743, 17774, 17802, 17835, 17865, 17896,
17896, 14638, 14666, 14699, 14729, 14757, 14790, 14820, 14852,
14882, 14911, 14943, 14974, 15005, 15033, 15064, 15093, 15125,
15155, 15184, 15217, 15247, 15278, 15308, 15338, 15370, 15399,
15429, 15460, 15491, 15520, 15552, 15583, 15611, 15644, 15674,
15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917, 15947,
15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190, 16220,
16251, 16282, 16311, 16343, 16374, 16402, 16435, 16465, 16493,
16525, 16555, 16584, 16616, 16647, 16678, 16708, 16738, 16769,
16800, 16829, 16860, 16891, 16920, 16952, 16982, 17011, 17044,
17074, 17105, 17135, 17165, 17197, 17225, 17256, 17284, 17317,
17347, 17378, 17409, 17438, 17470, 17500, 17529, 17562, 17590,
17619, 17651, 17682, 17711, 17743, 17774, 17802, 17835, 17865,
17896, 14638, 14666, 14699, 14729, 14757, 14790, 14820, 14852,
14882, 14911, 14943, 14974, 15005, 15033, 15064, 15093, 15125,
15155, 15184, 15217, 15247, 15278, 15308, 15338, 15370, 15399,
15429, 15460, 15491, 15520, 15552, 15583, 15611, 15644, 15674,
15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917, 15947,
15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190, 16220,
16251, 16282, 16311, 16343, 16374, 16402, 16435, 16465, 16493,
16525, 16555, 16584, 16616, 16647, 16678, 16708, 16738, 16769,
16800, 16829, 16860, 16891, 16920, 16952, 16982, 17011, 17044,
17074, 17105, 17135, 17165, 17197, 17225, 17256, 17284, 17317,
17347, 17378, 17409, 17438, 17470, 17500, 17529), class = "Date"),
TICKER = c("JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ",
"JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "JEQ", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH",
"RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "RLH", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC", "PMC",
"PMC", "PMC", "PMC", "PMC"), RET = c("0.007634", "0.041667",
"0.060000", "0.012007", "-0.113559", "-0.034417", "0.041584",
"-0.013308", "0.042389", "0.005545", "0.031250", "0.100713",
"0.003268", "0.076629", "-0.066636", "-0.024311", "-0.008306",
"0.021776", "0.011475", "-0.085900", "-0.063830", "0.013258",
"-0.031776", "-0.024710", "0.064000", "0.054511", "0.010695",
"-0.033510", "-0.080292", "0.027778", "-0.025096", "-0.001287",
"0.009220", "-0.015717", "0.029940", "0.091968", "0.026930",
"0.048951", "0.043333", "0.063898", "-0.018018", "-0.012232",
"-0.003096", "-0.026398", "0.090909", "-0.001462", "0.023426",
"0.021259", "-0.060086", "0.018265", "-0.024439", "-0.019383",
"0.062500", "0.045588", "0.023910", "-0.002747", "-0.020661",
"0.035162", "-0.022323", "-0.049509", "-0.049509", "0.048449",
"0.084813", "0.051948", "0.007407", "-0.008578", "0.021014",
"-0.035109", "-0.061481", "-0.082888", "0.090379", "0.029412",
"0.039734", "0.039734", "-0.089610", "-0.012839", "0.060694",
"0.013624", "0.034946", "-0.003896", "0.044329", "0.011236",
"0.014815", "-0.008516", "-0.040491", "-0.017762", "-0.017762",
"0.031908", "0.025000", "0.021823", "0.028894", "0.035409",
"0.002359", "0.018729", "0.007021", "-0.001147", "0.052813",
"0.034896", "0.000821", "0.000821", "0.049071", "-0.027484",
"-0.004348", "-0.008734", "-0.015419", "-0.011186", "-0.038462",
"-0.021177", "0.008414", "-0.117998", "0.027027", "-0.060471",
"-0.060471", "0.200405", "0.065767", "0.142405", "0.058172",
"-0.137435", "-0.094082", "0.247906", "-0.161074", "0.190400",
"0.057796", "-0.025413", "0.040417", "-0.038847", "0.113429",
"-0.039813", "0.067073", "-0.072000", "-0.027094", "-0.044304",
"-0.074172", "-0.040057", "0.035767", "0.007194", "-0.010000",
"0.056277", "0.050546", "0.067620", "0.015834", "0.015588",
"0.021251", "-0.135260", "-0.026738", "-0.141484", "0.054400",
"0.100152", "0.088276", "-0.053232", "-0.034806", "-0.013870",
"-0.081575", "-0.047473", "-0.017685", "0.088380", "-0.171429",
"-0.043557", "0.142315", "-0.086379", "0.100000", "-0.041322",
"0.029310", "-0.023451", "-0.013722", "0.015652", "-0.061644",
"0.000000", "0.049288", "-0.010452", "-0.003515", "0.040564",
"0.074576", "0.012618", "0.059190", "-0.019118", "0.049475",
"0.044286", "0.047880", "0.062663", "-0.009828", "0.054591",
"-0.036471", "-0.089133", "-0.060322", "-0.194009", "0.306195",
"0.142276", "-0.062871", "0.017722", "-0.097015", "0.093664",
"-0.176322", "0.275229", "0.007194", "0.047619", "-0.051136",
"-0.041916", "-0.068750", "-0.053691", "-0.078014", "0.007692",
"0.122137", "-0.027211", "-0.020979", "0.235714", "0.017341",
"0.028409", "0.088398", "0.060914", "-0.071770", "0.005155",
"0.010256", "0.106599", "0.068807", "0.072961", "0.092000",
"-0.084249", "-0.125600", "-0.174748", "-0.090909", "0.025189",
"0.053440", "0.062391", "0.059276", "-0.150259", "-0.106098",
"-0.109140", "-0.405819", "0.228093", "0.053515", "0.081673",
"0.054328", "-0.012227", "0.038904", "-0.026383", "0.150350",
"-0.062310", "0.034036", "0.000784", "0.153485", "-0.031229",
"0.093203", "0.003205", "-0.030032", "-0.173254", "-0.023108",
"0.013866", "-0.045052", "-0.163437", "0.099698", "-0.057692",
"0.224490", "0.004762", "-0.034755", "0.181669", "-0.013850",
"0.016854", "-0.010359", "-0.023029", "-0.079286", "0.211016",
"-0.112108", "0.056277", "-0.159836", "0.078862", "0.112283",
"0.529810", "-0.047830", "0.132093", "-0.009860", "0.160996",
"-0.028234", "-0.001839", "0.053427", "-0.055964", "-0.077807",
"-0.018481", "0.174376", "-0.239805", "-0.050436", "0.111058",
"0.086484", "0.127600", "0.016673", "0.160502", "0.001203",
"0.026126", "-0.042435", "-0.129890", "0.003512", "0.190760",
"0.028807", "-0.151714", "-0.221623", "-0.043271", "0.069199",
"0.123942", "-0.071886", "0.077048", "-0.048946", "0.111243",
"-0.152120", "0.010504", "0.045738", "-0.013917", "-0.008064",
"-0.048781", "0.008547", "0.046610", "0.062753", "-0.041905",
"0.168986", "-0.003401", "0.000000", "-0.001706", ""), row_id_for_pivot = 1:317), class = "data.frame", row.names = c(NA,
-317L))
edit: After running
x1 <- df2 %>%
group_by(TICKER) %>%
mutate(row_id_for_pivot = row_number()) %>%
pivot_wider(names_from = TICKER, values_from = RET,
id_cols = c(date, row_id_for_pivot))
x1 %>%
filter(date == "2015-01-30")
(where df2 is the dput data shown above)
I get:
# A tibble: 2 x 5
date row_id_for_pivot JEQ RLH PMC
<date> <int> <chr> <chr> <chr>
1 2015-01-30 62 0.048449 NA NA
2 2015-01-30 61 NA 0.012618 0.111058
EDIT 2:
Using df2 as the data above I run:
df2 %>%
distinct(date)
Which gives me 108 observations
I then run
out <- df2 %>%
group_by(TICKER, year = lubridate::year(date)) %>%
mutate(row_id_for_pivot = row_number()) %>%
pivot_wider(names_from = TICKER, values_from = RET,
id_cols = c(date, row_id_for_pivot)) %>%
arrange(date) %>%
group_by(date,row_id_for_pivot ) %>%
summarise_at(vars(-group_cols()), toString)
Which gives me 113 observations.
Taking a look at it I see I have some duplicates at dates:
2018-12-31, 2017-12-29, 2016-12-30, 2015-12-31, 2014-12-31
Doing the following:
> df2 %>%
+ filter(date == "2018-12-31")
date TICKER RET row_id_for_pivot
1 2018-12-31 JEQ -0.060471 112
2 2018-12-31 JEQ -0.060471 113
3 2018-12-31 RLH -0.090909 221
Tells me I have a duplicate in the original data. I now start to think it is a problem when I created the row_id_for_pivot column.
So I have added new data with a few more observations:
using df3 I run
xN <- df3 %>%
distinct() %>%
group_by(TICKER, year = lubridate::year(date)) %>%
mutate(row_id_for_pivot = row_number()) %>%
pivot_wider(names_from = TICKER, values_from = RET,
id_cols = c(date, row_id_for_pivot)) %>%
arrange(date) %>%
group_by(date,row_id_for_pivot ) %>%
summarise_at(vars(-group_cols()), toString)
Which gives me 126 observations when it should return 108, i.e. one row per unique date (unique(xN$date)).
Taking a look at the xN data after pivot_wider the first duplicate is 2012-07-31
So I run on the new data df3
> df3 %>%
+ filter(date == "2012-07-31")
date TICKER RET
1 2012-07-31 AMRE C
2 2012-07-31 AA -0.032000
3 2012-07-31 CHE 0.038551
4 2012-07-31 MLR 0.030760
5 2012-07-31 UMC 0.038568
There are no duplicates but there is a C. Could that be messing with my pivot?
Running the following:
> xN %>%
+ filter(date == "2012-07-31")
# A tibble: 2 x 7
# Groups: date [1]
date row_id_for_pivot AMRE AA CHE MLR UMC
<date> <int> <chr> <chr> <chr> <chr> <chr>
1 2012-07-31 1 C NA NA NA NA
2 2012-07-31 7 NA -0.032000 0.038551 0.030760 0.038568
Gives me 2 results.
Should I first set C to NA?
New data:
df3 <- structure(list(date = structure(c(15552, 15583, 15611, 15644,
15674, 15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917,
15947, 15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190,
16220, 16251, 16282, 16311, 16343, 16374, 16402, 16435, 16465,
16493, 16493, 17135, 17165, 17197, 17225, 17256, 17284, 17317,
17347, 17378, 17409, 17438, 17470, 17500, 17529, 17562, 17590,
17619, 17651, 17682, 17711, 17743, 17774, 17802, 17835, 17865,
17896, 14638, 14666, 14699, 14729, 14757, 14790, 14820, 14852,
14882, 14911, 14943, 14974, 15005, 15033, 15064, 15093, 15125,
15155, 15184, 15217, 15247, 15278, 15308, 15338, 15370, 15399,
15429, 15460, 15491, 15520, 15552, 15583, 15611, 15644, 15674,
15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917, 15947,
15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190, 16220,
16251, 16282, 16311, 16343, 16374, 16402, 16435, 16465, 16493,
16525, 16555, 16584, 16616, 16647, 16678, 16708, 16738, 16769,
16800, 16829, 16860, 16891, 16920, 16952, 16982, 17011, 17044,
17074, 17105, 14638, 14666, 14699, 14729, 14757, 14790, 14820,
14852, 14882, 14911, 14943, 14974, 15005, 15033, 15064, 15093,
15125, 15155, 15184, 15217, 15247, 15278, 15308, 15338, 15370,
15399, 15429, 15460, 15491, 15520, 15552, 15583, 15611, 15644,
15674, 15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917,
15947, 15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190,
16220, 16251, 16282, 16311, 16343, 16374, 16402, 16435, 16465,
16493, 16525, 16555, 16584, 16616, 16647, 16678, 16708, 16738,
16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982, 17011,
17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256, 17284,
17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529, 17562,
17590, 17619, 17651, 17682, 17711, 17743, 17774, 17802, 17835,
17865, 17896, 14638, 14666, 14699, 14729, 14757, 14790, 14820,
14852, 14882, 14911, 14943, 14974, 15005, 15033, 15064, 15093,
15125, 15155, 15184, 15217, 15247, 15278, 15308, 15338, 15370,
15399, 15429, 15460, 15491, 15520, 15552, 15583, 15611, 15644,
15674, 15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917,
15947, 15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190,
16220, 16251, 16282, 16311, 16343, 16374, 16402, 16435, 16465,
16493, 16525, 16555, 16584, 16616, 16647, 16678, 16708, 16738,
16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982, 17011,
17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256, 17284,
17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529, 17562,
17590, 17619, 17651, 17682, 17711, 17743, 17774, 17802, 17835,
17865, 17896, 14638, 14666, 14699, 14729, 14757, 14790, 14820,
14852, 14882, 14911, 14943, 14974, 15005, 15033, 15064, 15093,
15125, 15155, 15184, 15217, 15247, 15278, 15308, 15338, 15370,
15399, 15429, 15460, 15491, 15520, 15552, 15583, 15611, 15644,
15674, 15705, 15736, 15764, 15792, 15825, 15856, 15884, 15917,
15947, 15978, 16009, 16038, 16070, 16101, 16129, 16160, 16190,
16220, 16251, 16282, 16282, 16311, 16343, 16374, 16402, 16435,
16465, 16493, 16525, 16555, 16584, 16616, 16647, 16678, 16708,
16738, 16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982,
17011, 17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256,
17284, 17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529,
17562, 17590, 17619, 17651, 17682, 17711, 17743, 17774, 17802,
17835, 17865, 17896), class = "Date"), TICKER = c("AMRE", "AMRE",
"AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE",
"AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE",
"AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE",
"AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AMRE", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA",
"AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE",
"CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "CHE", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR",
"MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "MLR", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC",
"UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC", "UMC"), RET = c("C",
"0.029099", "0.035862", "0.099190", "0.014119", "0.050242", "0.025656",
"-0.016487", "0.136416", "-0.023124", "0.045765", "-0.017103",
"-0.054292", "-0.077638", "0.040308", "0.014409", "-0.010795",
"-0.023550", "-0.030952", "0.065111", "-0.032872", "0.005432",
"0.066627", "0.041080", "0.264481", "0.006914", "-0.005579",
"0.072704", "0.080763", "0.004131", "0.001507", "", "", "C",
"-0.030721", "0.298077", "-0.051029", "-0.005493", "-0.019477",
"-0.023421", "-0.008804", "0.114855", "0.205494", "0.062443",
"0.024882", "-0.131226", "0.297760", "-0.034342", "-0.135525",
"-0.000222", "0.138790", "-0.061133", "-0.024756", "-0.077005",
"0.032355", "-0.095590", "-0.133911", "-0.090883", "-0.164414",
"-0.210298", "0.047133", "0.070677", "-0.056882", "-0.131050",
"-0.135739", "0.110338", "-0.082587", "0.185221", "0.085054",
"0.001142", "0.172571", "0.076673", "0.018709", "0.048071", "-0.037373",
"-0.009412", "-0.056514", "-0.071248", "-0.128988", "-0.252344",
"0.124347", "-0.065985", "-0.136727", "0.174567", "0.003937",
"-0.014749", "-0.028942", "-0.118191", "0.023392", "-0.032000",
"0.014168", "0.034463", "-0.028797", "-0.018670", "0.032105",
"0.018433", "-0.032805", "0.000000", "-0.002347", "0.003529",
"-0.080000", "0.016624", "-0.027673", "0.054545", "0.141626",
"0.039914", "0.106139", "0.082785", "0.022589", "0.096252", "0.046620",
"0.012621", "0.094049", "0.100739", "0.015253", "-0.031306",
"0.041641", "0.033413", "-0.086755", "-0.008866", "-0.053035",
"-0.126437", "0.038700", "-0.066319", "-0.108000", "-0.114798",
"-0.039514", "0.022222", "-0.075569", "0.051512", "0.054487",
"-0.261398", "0.229081", "0.072788", "0.165971", "-0.167413",
"0.000000", "0.145631", "-0.048023", "0.005952", "-0.055884",
"-0.030644", "0.154409", "0.015310", "0.011585", "0.036721",
"-0.039888", "-0.031479", "-0.054422", "0.141683", "0.034580",
"0.036308", "0.042173", "-0.020154", "0.053833", "0.017879",
"0.045338", "-0.027574", "-0.030339", "-0.071886", "-0.042756",
"-0.053230", "0.080058", "-0.093329", "-0.045658", "0.096270",
"0.104026", "0.013911", "-0.037332", "-0.076732", "0.088029",
"0.038551", "0.054803", "0.049372", "-0.029441", "0.015019",
"0.007491", "0.101473", "0.024090", "0.036145", "0.020505", "-0.139917",
"0.034419", "-0.025404", "-0.010625", "0.026709", "-0.051469",
"0.152020", "-0.016810", "0.030018", "0.074506", "0.057329",
"-0.069089", "0.060166", "0.064033", "0.086748", "0.039077",
"-0.025660", "0.004470", "0.067434", "-0.040323", "-0.042869",
"0.153846", "0.025069", "-0.034757", "0.079479", "0.055641",
"0.132418", "-0.079954", "-0.021122", "0.178467", "-0.016276",
"-0.030358", "-0.063284", "-0.082526", "0.054086", "-0.041860",
"0.006704", "0.045242", "0.079451", "-0.081215", "0.045505",
"0.002481", "0.055226", "0.076794", "0.035409", "0.076585", "0.023187",
"0.102304", "0.017579", "-0.000635", "-0.034371", "0.000354",
"0.024127", "0.105815", "0.102001", "-0.011873", "0.072216",
"-0.002533", "0.050957", "0.129590", "0.058595", "-0.012853",
"-0.017961", "0.024713", "-0.012240", "-0.047719", "0.041895",
"-0.105752", "-0.012335", "0.052632", "0.061864", "0.150442",
"0.048951", "-0.102000", "-0.007424", "-0.077038", "0.096434",
"-0.005174", "0.041605", "0.014979", "0.072382", "0.089122",
"-0.015644", "-0.025862", "0.094817", "0.086628", "-0.123596",
"0.193529", "-0.107051", "0.178098", "-0.212818", "-0.014916",
"0.036872", "-0.028817", "0.076389", "-0.030733", "-0.114634",
"0.106061", "0.030760", "-0.065773", "0.054759", "-0.043614",
"-0.068404", "0.075524", "0.001967", "0.041885", "0.016960",
"-0.058567", "0.083389", "-0.051924", "0.078674", "-0.034358",
"0.068664", "0.103651", "0.014941", "-0.013144", "-0.015566",
"-0.015812", "0.090305", "-0.008193", "0.048529", "0.020679",
"-0.067541", "-0.014070", "-0.098837", "0.169823", "-0.093576",
"0.168527", "-0.028379", "0.097030", "0.112816", "-0.086531",
"-0.083557", "-0.019503", "-0.122306", "0.238721", "-0.091747",
"0.160696", "-0.032187", "-0.000456", "-0.013315", "-0.099116",
"0.056302", "0.048323", "0.004704", "-0.028090", "0.042253",
"0.030289", "0.038444", "-0.036858", "0.161731", "0.043922",
"0.001890", "-0.056604", "0.061200", "-0.036053", "0.007874",
"-0.022266", "0.050302", "-0.038314", "0.120717", "0.010733",
"-0.012389", "-0.068817", "0.009690", "-0.065259", "0.034086",
"-0.010000", "0.076768", "-0.034522", "0.019569", "0.105566",
"-0.059722", "-0.101487", "0.168391", "-0.037776", "-0.095361",
"-0.022792", "0.096210", "-0.055851", "-0.067606", "-0.120846",
"0.068086", "-0.151815", "0.081712", "0.111511", "-0.077670",
"0.108772", "0.006329", "-0.119497", "-0.025000", "0.040293",
"-0.042253", "-0.058824", "-0.026072", "-0.134783", "-0.040201",
"0.167539", "0.026906", "-0.065502", "0.266355", "0.003690",
"-0.099265", "0.093878", "-0.212687", "0.028436", "0.038568",
"-0.064516", "0.009852", "-0.082927", "0.015957", "0.041885",
"-0.015075", "-0.056122", "-0.027027", "0.050000", "0.148148",
"0.073733", "-0.026732", "-0.109091", "0.051020", "-0.004854",
"-0.019512", "0.014925", "-0.009804", "-0.004950", "0.034826",
"0.048077", "0.055046", "0.047826", "-0.056703", "-0.056703",
"0.013699", "-0.103604", "0.100503", "-0.013699", "0.050926",
"0.048458", "0.054622", "-0.027888", "-0.008197", "-0.070248",
"-0.088889", "-0.078670", "-0.094444", "-0.006135", "0.141975",
"0.000000", "0.016216", "0.042553", "0.056122", "0.004831", "-0.091346",
"-0.005291", "0.058511", "-0.006194", "-0.015873", "-0.005376",
"0.027027", "-0.042105", "-0.038462", "0.034286", "0.127072",
"-0.053922", "0.005181", "0.061856", "0.184466", "-0.039865",
"0.106195", "0.000000", "0.040000", "-0.019231", "-0.062745",
"0.020920", "-0.012295", "0.074689", "0.030888", "0.041198",
"0.014388", "0.034730", "-0.003571", "-0.075269", "-0.271318",
"-0.031915", "-0.016484")), class = "data.frame", row.names = c(NA,
-466L))
If we include the date column in 'id_cols', it will be kept in the output:
library(dplyr)
library(tidyr)
# Index the observations within each (TICKER, year) group, then spread the
# tickers into columns. Listing `date` in id_cols is what keeps it in the
# pivoted result.
out <- df %>%
  mutate(year = lubridate::year(date)) %>%
  group_by(TICKER, year) %>%
  mutate(row_id_for_pivot = seq_len(n())) %>%
  pivot_wider(
    id_cols = c(date, row_id_for_pivot),
    values_from = RET,
    names_from = TICKER
  )
out
# A tibble: 113 x 5
# date row_id_for_pivot JEQ RLH PMC
# <date> <int> <chr> <chr> <chr>
# 1 2010-01-29 1 0.007634 0.200405 0.025189
# 2 2010-02-26 2 0.041667 0.065767 0.053440
# 3 2010-03-31 3 0.060000 0.142405 0.062391
# 4 2010-04-30 4 0.012007 0.058172 0.059276
# 5 2010-05-28 5 -0.113559 -0.137435 -0.150259
# 6 2010-06-30 6 -0.034417 -0.094082 -0.106098
# 7 2010-07-30 7 0.041584 0.247906 -0.109140
# 8 2010-08-31 8 -0.013308 -0.161074 -0.405819
# 9 2010-09-30 9 0.042389 0.190400 0.228093
#10 2010-10-29 10 0.005545 0.057796 0.053515
# … with 103 more rows
Checking the counts:
count(out, row_id_for_pivot)
# A tibble: 13 x 2
# row_id_for_pivot n
# <int> <int>
# 1 1 9
# 2 2 9
# 3 3 9
# 4 4 9
# 5 5 9
# 6 6 9
# 7 7 9
# 8 8 9
# 9 9 9
#10 10 9
#11 11 9
#12 12 9
#13 13 5
and also checking the issue shown in the OP's edit:
out %>%
filter(date == "2015-01-30")
# A tibble: 1 x 5
# date row_id_for_pivot JEQ RLH PMC
# <date> <int> <chr> <chr> <chr>
#1 2015-01-30 1 0.048449 0.012618 0.111058
Update
Based on the new dataset, we can first convert the column 'RET' to numeric, so that it has a single class and non-numeric entries such as 'C' become NA. Then, after the pivot_wider step, we group by 'date' and summarise each remaining column to its first non-NA element:
# Coerce RET to numeric first: non-numeric codes such as "C" become NA (with
# a coercion warning), so every pivoted ticker column has one numeric class.
out1 <- df3 %>%
  mutate(RET = as.numeric(RET)) %>%
  # Index the observations within each ticker-year; together with `date`
  # this index forms the pivot key.
  group_by(TICKER, year = lubridate::year(date)) %>%
  mutate(row_id_for_pivot = row_number()) %>%
  pivot_wider(names_from = TICKER, values_from = RET,
              id_cols = c(date, row_id_for_pivot)) %>%
  # The helper index can spread a single date over several rows, so drop it
  # and collapse back to one row per date.
  select(-row_id_for_pivot) %>%
  group_by(date) %>%
  # Ordering on is.na() moves the non-NA values to the front, so [1] picks
  # the first non-NA element (or NA when the whole group is NA). across() is
  # the current replacement for the superseded summarise_at().
  summarise(across(everything(), ~ .x[order(is.na(.x))][1]))
The warning arises because the non-numeric elements are converted to NA in the as.numeric step.
out1 %>%
filter(date == "2012-07-31")
# A tibble: 1 x 6
# date AMRE AA CHE MLR UMC
# <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2012-07-31 NA -0.032 0.0386 0.0308 0.0386
NOTE: the columns can be unquoted inside the c(...)
Related
How to bind more than 2 dataframes with different column number in R
I want to bind 4 dataframes. One of them, the "B8A_EVI_EOS_KRR05" has 11 rows while the others have 19. I've used cbind but I get a repetition of "B8A_EVI_EOS_KRR" first rows after the 11th row. I want to be able to bind the 4 dataframes by ID (choose the ID of B8A_NDVI_EOS_KRR, B8A_NIRv_EOS_KRR or B8A_kNDVI_EOS_KRR) and fill the empty cells with NA. My 4 dataframes look like this: dput(B8A_EVI_EOS_KRR05) structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR", "P1", "P14", "P15", "P17", "P2"), D_EOS = structure(c(17067, 17353, 17712, 18082, 18360, 18516, 17714, 17007, 16987, 16988, 17715), class = "Date"), EVI_EOS = structure(c(17042, 17344, 17813, 18107, 18385, 18548, 17705, 17144, 17027, 17003, 17827 ), class = "Date")), row.names = c(NA, -11L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>) dput(B8A_NDVI_EOS_KRR05) structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR", "LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P1", "P14", "P15", "P17", "P2", "P3"), D_EOS = structure(c(17067, 17353, 17712, 18082, 18360, 18516, 17002, 17123, 17414, 17722, 18148, 18446, 17359, 17714, 17007, 16987, 16988, 17715, 17716), class = "Date"), NDVI_EOS = structure(c(17071, 17379, 17814, 18095, 18384, 18577, 16996, 17248, 17501, 17715, 18176, 18461, 17393, 17705, 17076, 16994, 17050, 17829, 17755), class = "Date")), row.names = c(NA, -19L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>) dput(B8A_NIRv_EOS_KRR05) structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR", "LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P1", "P14", "P15", "P17", "P2", "P3"), D_EOS = structure(c(17067, 17353, 17712, 18082, 18360, 18516, 17002, 17123, 17414, 17722, 18148, 18446, 17359, 17714, 17007, 16987, 16988, 17715, 17716), class = "Date"), NIRv_EOS = structure(c(17077, 17385, 17810, 18096, 18385, 18574, 17085, 17085, 17494, 17709, 18179, 18534, 17387, 17705, 17062, 16997, 17027, 17822, 17749), 
class = "Date")), row.names = c(NA, -19L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>) dput(B8A_kNDVI_EOS_KRR05) structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR", "LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P1", "P14", "P15", "P17", "P2", "P3"), D_EOS = structure(c(17067, 17353, 17712, 18082, 18360, 18516, 17002, 17123, 17414, 17722, 18148, 18446, 17359, 17714, 17007, 16987, 16988, 17715, 17716), class = "Date"), kNDVI_EOS = structure(c(17074, 17380, 17812, 18093, 18385, 18569, 16997, 17247, 17487, 17715, 18177, 18454, 17369, 17775, 17078, 16991, 17028, 17770, 17742), class = "Date")), row.names = c(NA, -19L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>)` On the left is an example of how the dataframes looks like when I use cbind and on the right it is shown the desired output.
We may nest the datasets in a list, do a join and then unnest library(purrr) library(dplyr) library(tidyr) mget(ls(pattern = "^B8A_.*_EOS_KRR05$")) %>% imap(~ .x %>% nest(data = -ID) %>% rename(!! .y := data)) %>% reduce(full_join, by = "ID") %>% unnest(where(is.list), names_sep = "_") -output # A tibble: 19 × 9 ID B8A_EVI_EOS_KRR05_D… B8A_EVI_EOS_KRR… B8A_kNDVI_EOS_K… B8A_kNDVI_EOS_K… B8A_NDVI_EOS_KR… B8A_NDVI_EOS_KR… B8A_NIRv_EOS_KR… B8A_NIRv_EOS_KR… <chr> <date> <date> <date> <date> <date> <date> <date> <date> 1 AUR 2016-09-23 2016-08-29 2016-09-23 2016-09-30 2016-09-23 2016-09-27 2016-09-23 2016-10-03 2 AUR 2017-07-06 2017-06-27 2017-07-06 2017-08-02 2017-07-06 2017-08-01 2017-07-06 2017-08-07 3 AUR 2018-06-30 2018-10-09 2018-06-30 2018-10-08 2018-06-30 2018-10-10 2018-06-30 2018-10-06 4 AUR 2019-07-05 2019-07-30 2019-07-05 2019-07-16 2019-07-05 2019-07-18 2019-07-05 2019-07-19 5 AUR 2020-04-08 2020-05-03 2020-04-08 2020-05-03 2020-04-08 2020-05-02 2020-04-08 2020-05-03 6 AUR 2020-09-11 2020-10-13 2020-09-11 2020-11-03 2020-09-11 2020-11-11 2020-09-11 2020-11-08 7 P1 2018-07-02 2018-06-23 2018-07-02 2018-09-01 2018-07-02 2018-06-23 2018-07-02 2018-06-23 8 P14 2016-07-25 2016-12-09 2016-07-25 2016-10-04 2016-07-25 2016-10-02 2016-07-25 2016-09-18 9 P15 2016-07-05 2016-08-14 2016-07-05 2016-07-09 2016-07-05 2016-07-12 2016-07-05 2016-07-15 10 P17 2016-07-06 2016-07-21 2016-07-06 2016-08-15 2016-07-06 2016-09-06 2016-07-06 2016-08-14 11 P2 2018-07-03 2018-10-23 2018-07-03 2018-08-27 2018-07-03 2018-10-25 2018-07-03 2018-10-18 12 LAM NA NA 2016-07-20 2016-07-15 2016-07-20 2016-07-14 2016-07-20 2016-10-11 13 LAM NA NA 2016-11-18 2017-03-22 2016-11-18 2017-03-23 2016-11-18 2016-10-11 14 LAM NA NA 2017-09-05 2017-11-17 2017-09-05 2017-12-01 2017-09-05 2017-11-24 15 LAM NA NA 2018-07-10 2018-07-03 2018-07-10 2018-07-03 2018-07-10 2018-06-27 16 LAM NA NA 2019-09-09 2019-10-08 2019-09-09 2019-10-07 2019-09-09 2019-10-10 17 LAM NA NA 2020-07-03 2020-07-11 2020-07-03 
2020-07-18 2020-07-03 2020-09-29 18 P0 NA NA 2017-07-12 2017-07-22 2017-07-12 2017-08-15 2017-07-12 2017-08-09 19 P3 NA NA 2018-07-04 2018-07-30 2018-07-04 2018-08-12 2018-07-04 2018-08-06
map over list and insert variables into a function
I have a list that I want to loop over and insert the variables into a function. However, the function I am using does not like the outputs I am getting from applying the map() function from {purr} package. Here is my list: $AAPL # A tibble: 10 x 2 ticker string <chr> <date> 1 AAPL 2020-01-28 2 AAPL 2020-04-30 3 AAPL 2020-07-30 4 AAPL 2020-10-29 5 AAPL 2021-01-27 6 AAPL 2020-01-29 7 AAPL 2020-05-01 8 AAPL 2020-07-31 9 AAPL 2020-10-30 10 AAPL 2021-01-28 $ABEV # A tibble: 8 x 2 ticker string <chr> <date> 1 ABEV 2020-02-26 2 ABEV 2020-05-06 3 ABEV 2020-07-29 4 ABEV 2020-10-28 5 ABEV 2020-02-27 6 ABEV 2020-05-07 7 ABEV 2020-07-30 8 ABEV 2020-10-29 my.list = list(AAPL = structure(list(ticker = c("AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL"), string = structure(c(18289, 18382, 18473, 18564, 18654, 18290, 18383, 18474, 18565, 18655 ), class = "Date")), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(305:380, .Names = c("305", "306", "307", "308", "309", "310", "311", "312", "313", "314", "315", "316", "317", "318", "319", "320", "321", "322", "323", "324", "325", "326", "327", "328", "329", "330", "331", "332", "333", "334", "335", "336", "337", "338", "339", "340", "341", "342", "343", "344", "345", "346", "347", "348", "349", "350", "351", "352", "353", "354", "355", "356", "357", "358", "359", "360", "361", "362", "363", "364", "365", "366", "367", "368", "369", "370", "371", "372", "373", "374", "375", "376", "377", "378", "379", "380"), class = "omit")), ABEV = structure(list( ticker = c("ABEV", "ABEV", "ABEV", "ABEV", "ABEV", "ABEV", "ABEV", "ABEV"), string = structure(c(18318, 18388, 18472, 18563, 18319, 18389, 18473, 18564), class = "Date")), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(305:380, .Names = c("305", "306", "307", "308", "309", "310", "311", "312", "313", "314", "315", "316", "317", "318", "319", "320", "321", "322", "323", "324", 
"325", "326", "327", "328", "329", "330", "331", "332", "333", "334", "335", "336", "337", "338", "339", "340", "341", "342", "343", "344", "345", "346", "347", "348", "349", "350", "351", "352", "353", "354", "355", "356", "357", "358", "359", "360", "361", "362", "363", "364", "365", "366", "367", "368", "369", "370", "371", "372", "373", "374", "375", "376", "377", "378", "379", "380"), class = "omit"))) I want to loop over this and add the variables into a Quanld fucntion. The Quandl function works with the following inputs. Quandl.datatable('ORATS/VOL', tradedate=c('2021-02-19', "2020-01-20"), ticker='AAPL') So what I am trying to do is loop over the list and insert the dates (string) and the ticker (ticker) into this function. Here is what I have: library(tidyverse) map(my.list, ~Quandl.datatable('ORATS/VOL', tradedate=.x$string, ticker=.x$ticker[1])) This gives an error because it looks like the format is not in a vector when being input into the function. What am I missing here? Thank you for your help.
If we look at the tradedate values in the manually entered call, they are of character class, while the 'string' column is of Date class. We can convert it to character with as.character: library(purrr) out <- map(my.list, ~Quandl.datatable('ORATS/VOL', tradedate = as.character(.x$string), ticker=.x$ticker[1])) The reason could be that the API call converts its inputs to character anyway, but when a Date-class object is passed, the coercion may go through its underlying numeric storage values instead of the formatted date strings, which may prevent the call from executing.
How can I fill in missing rows for monthly time series data?
Here's the dput of my data: structure(list(date = structure(c(8596, 8631, 8659, 8687, 8733, 8743, 8796, 8806, 8853, 8880, 8908, 8932, 8971, 8999, 9027, 9069, 9097, 9111, 9160, 9188, 9212, 9230, 9279, 9309, 9328, 9363, 9391, 9434, 9449, 9482, 9519, 9541, 9580, 9610, 9643, 9672, 9708, 9736, 9764, 9799, 9827, 9850, 9890, 9920, 9947, 9975, 10007, 10038, 10072, 10100, 10122, 10163, 10191, 10213, 10254, 10282, 10310, 10345, 10354, 10385, 10418, 10469, 10497, 10528, 10556, 10570, 10612, 10641, 10668, 10710, 10742, 10759, 10802, 10830, 10858, 10893, 10914, 10947, 10984, 11010, 11038, 11066, 11096, 11135, 11164, 11193, 11229, 11257, 11285, 11313, 11346, 11374, 11411, 11435, 11467, 11502, 11514, 11565, 11592, 11621, 11649, 11677, 11718, 11746, 11776, 11797, 11838, 11867, 11894, 11923, 11951, 11979, 12021, 12035, 12077, 12105, 12133, 12160, 12189, 12231, 12259, 12273, 12315, 12356, 12385, 12399, 12441, 12472, 12497, 12538, 12553, 12591, 12630, 12658, 12686, 12714, 12742, 12770, 12804, 12832, 12860, 12903, 12917, 12938, 12986, 13015, 13056, 13085, 13116, 13139, 13169, 13204, 13232, 13260, 13288, 13301, 13357, 13385, 13414, 13442, 13470, 13498, 13533, 13561, 13603, 13631, 13658, 13694, 13722, 13750, 13778, 13805, 13846, 13862, 13896, 13925, 13967, 13995, 14009, 14050, 14078, 14121, 14149, 14177, 14205, 14233, 14268, 14296, 14323, 14352, 14380, 14449, 14474, 14506, 14548, 14575, 14590, 14618, 14661, 14688, 14729, 14758, 14761, 14821, 14849, 14877, 14905, 14933, 14961, 14995, 15024, 15038, 15093, 15121, 15135, 15185, 15212, 15241, 15269, 15297, 15325, 15360, 15387, 15430, 15458, 15485, 15513, 15542, 15583, 15611, 15639, 15667, 15696, 15731, 15745, 15786, 15815, 15842, 15917, 15945, 15966, 16001, 16030, 16076, 16129, 16143, 16184, 16276, 16303, 16343, 16374, 16400, 16417, 16455, 16482, 16525, 16553, 16585, 16612, 16646, 16678, 16706, 16729, 16752, 16777, 16819, 16860, 16891, 16916, 16925, 16976, 17002, 17042, 17072, 17100, 17120, 17141, 17178, 17224, 17245, 17261, 17304, 17330, 
17373, 17401, 17459, 17488, 17512, 17548, 17581, 17598, 17631), tzone = "UTC", tclass = "Date", class = "Date"), AverageTemp = c(16.5027083333333, 17.325, 17.1888888888889, 15.8277777777778, 16.6583333333333, 17.3333333333333, 16.64375, 17.1133333333333, 17.895119047619, 18.5694444444444, 18.8222222222222, 17.4305555555556, 17.6555555555556, 17.025, 17.3222222222222, 17.2770833333333, 17.4805555555556, 16.9708333333333, 17.9666666666667, 17.1222222222222, 18.0166666666667, 17.25, 18.1875, 17.6577777777778, 16.6541666666667, 17.1083333333333, 16.4666666666667, 17.5972756410256, 17.2, 17.4444444444444, 16.95, 17.7, 17.9222222222222, 18.4875, 17.8229166666667, 16.9166666666667, 16.7083333333333, 17.1666666666667, 17.3111111111111, 18.2333333333333, 16.6277777777778, 17.5875, 17.3833333333333, 17.4638888888889, 17.725, 18.1388888888889, 17.7001111111111, 17.7222222222222, 17.2041666666667, 17.8255952380952, 17.1833333333333, 17.8103070175439, 17.8194444444444, 17.952, 18.158412414966, 18.4910714285714, 18.3488562091503, 19.1341830065359, 18.45, 18.9107142857143, 17.2275, 19.0828761904762, 18.1599701591512, 18.965739220457, 18.6720606060606, 18.8786057692308, 18.602656449553, 18.6327347883598, 19.2925198412698, 20.1952463624339, 18.8900384227765, 18.0934444444444, 18.0554871794872, 17.8405270655271, 17.5540598290598, 17.454122110648, 17.5764155982906, 16.9989942528736, 16.4252032967033, 16.5388571428571, 17.0108695652174, 17.7725308641975, 18.4252564102564, 17.2278899240856, 17.3102091315453, 17.3627204585538, 17.280641025641, 17.3746616809117, 17.3014601139601, 17.2238271604938, 16.379012345679, 16.6044444444444, 17.624415954416, 18.4023148148148, 18.0341435185185, 17.3016666666667, 17.8204861111111, 17.827264957265, 17.2772467320261, 17.8786954365079, 17.84375, 17.1732638888889, 16.9219907407407, 17.3826388888889, 17.7413333333333, 18.4948412698413, 18.2363425925926, 17.3282057823129, 17.5083333333333, 17.414898989899, 16.9453125, 17.4988095238095, 17.6704012345679, 
18.1333333333333, 18.11875, 17.4805555555556, 17.4271367521368, 17.9006944444444, 17.9818181818182, 17.3125, 16.73625, 17.2666666666667, 17.4279340277778, 17.8584444444444, 17.2966666666667, 17.1, 18.3420833333333, 18.5814285714286, 17.6430555555556, 18.2307122507123, 18.0830687830688, 16.7563492063492, 16.9055555555556, 17.0090277777778, 17.3863095238095, 16.9139880952381, 16.7479166666667, 17.0888888888889, 17.7648148148148, 18.2277777777778, 19.3694444444444, 17.7064021164021, 18.7371527777778, 17.94375, 17.9416666666667, 17.8736111111111, 18.5354166666667, 18.1919444444444, 18.2555555555556, 17.7704365079365, 17.3509259259259, 17.3931216931217, 18.3355923202614, 17.9180555555556, 18.2104166666667, 18.0171121593291, 17.6840277777778, 17.5509259259259, 16.9631313131313, 17.4478070175439, 17.6916666666667, 17.6143376068376, 18.7415656565657, 19.0048611111111, 18.285462962963, 18.3816964285714, 18.2041310541311, 17.2343518518519, 17.2149382716049, 17.3684027777778, 17.5229861111111, 16.8517857142857, 19.0929141414141, 19.300404040404, 18.735, 17.9280277777778, 18.4470274170274, 19.0686597406425, 18.325, 18.5, 18.4388888888889, 18.7291666666667, 18.3708333333333, 18.0234918630752, 19.4925980392157, 19.2101488095238, 19.3890625, 18.5150793650794, 19.1944444444444, 19.0815277777778, 19.5192658730159, 17.2212418300654, 17.8081168831169, 18.2517361111111, 17.7775555555556, 18.012962962963, 17.0347222222222, 16.5888888888889, 18.8123101604278, 18.9187091503268, 19.0161111111111, 19.2625, 20.875, 18.8092592592593, 18.6526515151515, 18.9083333333333, 18.9835227272727, 18.1829292929293, 17.9060606060606, 17.7835227272727, 17.8237719298246, 19.7386363636364, 18.4961051693405, 18.5332727272727, 18.3787878787879, 18.5134199134199, 17.8098930481283, 18.4179292929293, 17.230303030303, 18.9035064935065, 17.8935897435897, 17.6211966604824, 17.9238095238095, 18.8382886904762, 19.42625, 18.6395833333333, 18.0652777777778, 19.3354166666667, 18.75359375, 17.951123043623, 
17.6063068181818, 17.828022875817, 17.5528846153846, 18.5647727272727, 19.0318181818182, 19.1659090909091, 18.8997564935065, 19.1301136363636, 18.1705882352941, 17.1361570247934, 18.6090909090909, 18.1429951690821, 17.8829545454545, 18.3387983091787, 18.41875, 19.7, 20.2508333333333, 17.6387426900585, 18.1770897832817, 17.5400297619048, 17.7547246376812, 17.246412037037, 17.0846153846154, 17.7060185185185, 18.325, 18.5408333333333, 19.4251587301587, 18.3706018518519, 17.917, 17.91, 18.6451388888889, 18.29375, 17.2316666666667, 18.7189393939394, 18.1669193548387, 18.367297979798, 17.7043055555556, 18.1879520697168, 19.12, 20.425, 18.6663888888889, 17.5108796296296, 18.1883333333333, 18.3060049019608, 18.32625, 18.2861111111111, 18.0375, 17.3445175438596, 18.6451058201058, 18.97875, 19.4583333333333, 18.2597222222222, 19.9197222222222, 18.2342307692308, 18.7666666666667, 19.8277777777778, 17.6464285714286, 18.690873015873, 18.4520833333333, 19.8696428571429, 19.9833333333333, 18.2416666666667)), class = "data.frame", row.names = c(NA, -292L)) My data is in YYYY-MM-DD format and is monthly data. Right now, there's missing data for a few months (e.g. 2017-09, 2014-05, 2014-06, 2013-12), but they are not specified in the data frame. How do I create a new row for possible missing months across my entire dataset? Since my dataset has two columns, the other column besides the date column should have an NA value specified for the new missing month row. I'm looking for a tidyverse, lubridate, or data.table solution.
You can use tidyr::complete for this, but you have the additional wrinkle that you have dates on different days in each month. First then you need to make a column to count months on, which we can do with the day(x) <- setter from lubridate. Here's an example using the provided data truncated to 2014 for conciseness. Note that you should use seq.Date to specify the full range of dates that you want to be included in the month column, and you also will have NAs in the date column. (you can replace with the first of the month if you want) library(tidyverse) library(lubridate) tbl <- structure(list(date = structure(c(16076, 16129, 16143, 16184, 16276, 16303, 16343, 16374, 16400, 16417), tzone = "UTC", tclass = "Date", class = "Date"), AverageTemp = c(18.3387983091787, 18.41875, 19.7, 20.2508333333333, 17.6387426900585, 18.1770897832817, 17.5400297619048, 17.7547246376812, 17.246412037037, 17.0846153846154)), row.names = c(NA, -10L), class = "data.frame") tbl %>% mutate(month = date %>% `day<-`(1)) %>% complete(month = seq.Date(min(month), max(month), by = "month")) #> # A tibble: 12 x 3 #> month date AverageTemp #> <date> <date> <dbl> #> 1 2014-01-01 2014-01-06 18.3 #> 2 2014-02-01 2014-02-28 18.4 #> 3 2014-03-01 2014-03-14 19.7 #> 4 2014-04-01 2014-04-24 20.3 #> 5 2014-05-01 NA NA #> 6 2014-06-01 NA NA #> 7 2014-07-01 2014-07-25 17.6 #> 8 2014-08-01 2014-08-21 18.2 #> 9 2014-09-01 2014-09-30 17.5 #> 10 2014-10-01 2014-10-31 17.8 #> 11 2014-11-01 2014-11-26 17.2 #> 12 2014-12-01 2014-12-13 17.1 As an alternative, you can instead just get the year and month components and use complete on the combination of the two: tbl %>% mutate(year = year(date), month = month(date)) %>% complete(year = min(year):max(year), month = 1:12) #> # A tibble: 12 x 4 #> year month date AverageTemp #> <dbl> <dbl> <date> <dbl> #> 1 2014 1 2014-01-06 18.3 #> 2 2014 2 2014-02-28 18.4 #> 3 2014 3 2014-03-14 19.7 #> 4 2014 4 2014-04-24 20.3 #> 5 2014 5 NA NA #> 6 2014 6 NA NA #> 7 2014 7 
2014-07-25 17.6 #> 8 2014 8 2014-08-21 18.2 #> 9 2014 9 2014-09-30 17.5 #> 10 2014 10 2014-10-31 17.8 #> 11 2014 11 2014-11-26 17.2 #> 12 2014 12 2014-12-13 17.1 Created on 2019-03-20 by the reprex package (v0.2.1)
Finding Month on Month Turnover
I seem to be stuck on a very basic problem; I know it's easy, but I am not able to figure it out. My data has HireDate and TermDate columns. TermDate is the last day of any employee. I want to compute the following: Leavers = current month count taken from TermDate; Turnover for a particular month = current month Leavers / AVG(row count for last month and current month). Reproducible data: structure(list(HireDate = structure(c(17702, 13242, 16895, 17167, 12335, 13879, 12303, 13745, 14789, 16785, 15390, 17167, 12886, 13472, 15569, 13796, 16811, 11484, 13062, 17592, 16113, 13437, 15614, 17167, 17167, 16251, 17623, 13312, 14165, 17167, 17167, 10695, 15764, 13749, 16801, 17167, 13594, 13874, 17167, 17167, 13157, 17167, 12501, 13243, 12192, 12287, 12965, 13328, 17167, 13343, 17167, 17167, 11839, 17167, 13262, 13326, 14124, 16161, 17167, 17226, 12786, 13823, 13822, 13255, 17704, 17653, 12258, 12769, 13727, 10712, 17400, 13952, 14048, 14333, 17233, 17690, 13108, 13383, 13517, 13829, 17213, 13696, 16741, 17167, 17241, 12198, 14018, 12902, 16801, 17167, 17591, 12843, 13627, 14553, 15593, 16097, 16801, 13075, 13529, 17167), class = "Date"), TermDate = structure(c(NA, 13439, 17712, NA, 12880, 15408, 12877, 16493, 17135, 16944, 17135, NA, 14054, 15670, 17531, 14327, NA, 13889, NA, NA, 16741, 17135, 17620, 17620, 17354, 17316, NA, 13312, 17166, NA, NA, 15705, NA, 15112, NA, NA, 15705, 13970, 17655, NA, 13612, NA, 15418, 15917, 15705, NA, 14274, 13449, NA, 13559, 17417, NA, 14400, NA, NA, 14334, 14813, 16343, 17703, NA, 12824, 15711, 15411, 14484, NA, NA, NA, 15309, 16493, 17197, NA, 14911, 16957, 15882, NA, NA, 14435, 13768, 13517, 14907, NA, 17284, NA, NA, NA, 12772, 17166, NA, 16881, 17439, NA, 14944, NA, 15028, 16581, 16778, NA, 13788, 14064, 17620), class = "Date")), row.names = 14296:14395, class = "data.frame")
A bit lengthy but it would work: library(data.table) df_leavers <- setDT(df)[, `:=` (TermDate = as.Date(as.character(TermDate)), HireDate = as.Date(as.character(HireDate)))] df_presences <- copy(df_leavers) df_leavers <- df_leavers[, TermDate := format(TermDate, "%Y-%m")][!is.na(TermDate), (Leavers = .N), , by = TermDate] df_presences <- df_presences[, maxTerm := max(TermDate, na.rm = T)][ is.na(TermDate), TermDate := maxTerm][ , .(YearMonth = format(seq(HireDate, TermDate, by = "month"), "%Y-%m")), by = 1:nrow(df)][ , (Presences = .N), by = YearMonth] df_final <- df_leavers[df_presences, on = .(TermDate = YearMonth)] setnames(df_final, c("YearMonth", "Leavers", "Presences")) df_final <- df_final[is.na(Leavers), Leavers := 0][order(YearMonth),][, previousMonth := shift(Presences)][ is.na(previousMonth), previousMonth := 0][, AvgPresences := (Presences + previousMonth) / 2][ , Turnover := round(Leavers / AvgPresences, 2)][, "previousMonth" := NULL] Output (beginning and end of dataset): YearMonth Leavers Presences AvgPresences Turnover 1: 1999-04 0 1 0.5 0.00 2: 1999-05 0 2 1.5 0.00 3: 1999-06 0 2 2.0 0.00 4: 1999-07 0 2 2.0 0.00 5: 1999-08 0 2 2.0 0.00 --- 227: 2018-02 0 32 32.5 0.00 228: 2018-03 3 36 34.0 0.09 229: 2018-04 0 33 34.5 0.00 230: 2018-05 1 34 33.5 0.03 231: 2018-06 2 36 35.0 0.06
library(dplyr) df %>% mutate(leavemonth=strftime(TermDate,format="%m-%Y")) %>% group_by(leavemonth) %>% summarize(n=n()) # A tibble: 51 x 2 leavemonth n <chr> <int> 1 01-2007 1 2 01-2008 1 3 01-2009 1 4 01-2013 1 5 01-2017 1 6 02-2005 1 7 02-2007 1 8 02-2011 1 9 02-2015 2 10 03-2009 2 # ... with 41 more rows I create a column with a unique identifier for the month-year of the termination date of each row, then count them using summarize. If you'd like to just add n to the existing table, we can replace the summarize with add_count: df %>% mutate(leavemonth=strftime(TermDate,format="%m-%Y")) %>% add_count(leavemonth) # A tibble: 100 x 4 HireDate TermDate leavemonth n <date> <date> <chr> <int> 1 2018-06-20 NA NA 34 2 2006-04-04 2006-10-18 10-2006 2 3 2016-04-04 2018-06-30 06-2018 2 4 2017-01-01 NA NA 34 5 2003-10-10 2005-04-07 04-2005 2 6 2008-01-01 2012-03-09 03-2012 3 7 2003-09-08 2005-04-04 04-2005 2 8 2007-08-20 2015-02-27 02-2015 2 9 2010-06-29 2016-11-30 11-2016 3 10 2015-12-16 2016-05-23 05-2016 1 # ... with 90 more rows
justifying labels and label transparency in ggplot
I have the following data: new_pairs x y Freq start.latittude start.longitude start.station end.latitude 1 359 519 929 40.75188 -73.97770 Pershing\nSquare N 40.75510 2 477 465 5032 40.75514 -73.98658 Broadway &\nW 41 St 40.75641 3 484 519 1246 40.75188 -73.97770 Pershing\nSquare N 40.75500 4 484 318 2654 40.75320 -73.97799 E 43 St &\nVanderbilt\nAve 40.75500 5 492 267 1828 40.75098 -73.98765 Broadway &\nW 36 St 40.75020 6 492 498 957 40.74855 -73.98808 Broadway &\nW 32 St 40.75020 7 492 362 1405 40.75173 -73.98754 Broadway &\nW 37 St 40.75020 8 493 477 1582 40.75641 -73.99003 W 41 St &\n8 Ave 40.75680 9 493 529 728 40.75757 -73.99099 W 42 St &\n8 Ave 40.75680 10 529 2021 1748 40.75929 -73.98860 W 45 St &\n8 Ave 40.75757 end.longitude end.station interaction 1 -73.97499 E 47 St &\nPark Av E 47 St &Park Av > PershingSquare N 2 -73.99003 W 41 St &\n8 Ave W 41 St &8 Ave > Broadway &W 41 St 3 -73.98014 W 44 St &\n5 Ave W 44 St &5 Ave > PershingSquare N 4 -73.98014 W 44 St &\n5 Ave W 44 St &5 Ave > E 43 St &VanderbiltAve 5 -73.99093 W 33 St &\n7 Ave W 33 St &7 Ave > Broadway &W 36 St 6 -73.99093 W 33 St &\n7 Ave W 33 St &7 Ave > Broadway &W 32 St 7 -73.99093 W 33 St &\n7 Ave W 33 St &7 Ave > Broadway &W 37 St 8 -73.98291 W 45 St &\n6 Ave W 45 St &6 Ave > W 41 St &8 Ave 9 -73.98291 W 45 St &\n6 Ave W 45 St &6 Ave > W 42 St &8 Ave 10 -73.99099 W 42 St &\n8 Ave W 42 St &8 Ave > W 45 St &8 Ave I would like to change the plot so that the labels are all justified to the center and change transparency based on their Freq so that the lower Freq are more transparent and the higher Freq are less transparent ggplot(data= new_pairs, aes(x= reorder(interaction, -Freq), y=Freq))+ geom_bar(stat="identity", aes(fill = Freq, alpha = .7)) + ylab("Bikes received")+ xlab("Station")+ geom_text(aes(x = interaction, label = interaction), vjust="inward",hjust = "inward", size = 4, nudge_y = 1, fontface ="bold")+theme(axis.text.y=element_blank())+ggtitle("Bikes received 
viarebalancing")+coord_flip()+theme(legend.position = "none") dput(new_pairs) structure(list(x = structure(c(146L, 253L, 260L, 260L, 268L, 268L, 268L, 269L, 269L, 304L), .Label = c("72", "79", "82", "83", "116", "119", "120", "127", "128", "137", "143", "144", "146", "147", "150", "151", "152", "153", "157", "160", "161", "164", "167", "168", "173", "174", "195", "212", "216", "217", "218", "223", "224", "225", "228", "229", "232", "233", "236", "237", "238", "239", "241", "242", "243", "244", "245", "247", "248", "249", "250", "251", "252", "253", "254", "257", "258", "259", "260", "261", "262", "263", "264", "265", "266", "267", "268", "270", "271", "274", "275", "276", "278", "279", "280", "281", "282", "284", "285", "289", "290", "291", "293", "294", "295", "296", "297", "298", "300", "301", "302", "303", "304", "305", "306", "307", "308", "309", "310", "311", "312", "313", "314", "315", "316", "317", "318", "319", "320", "321", "322", "323", "324", "325", "326", "327", "328", "329", "330", "331", "332", "334", "335", "336", "337", "339", "340", "341", "342", "343", "344", "345", "346", "347", "348", "349", "350", "351", "352", "353", "354", "355", "356", "357", "358", "359", "360", "361", "362", "363", "364", "365", "366", "367", "368", "369", "372", "373", "375", "376", "377", "379", "380", "382", "383", "384", "385", "386", "387", "388", "389", "390", "391", "392", "393", "394", "395", "396", "397", "398", "399", "400", "401", "402", "403", "404", "405", "406", "407", "408", "409", "410", "411", "412", "414", "415", "416", "417", "418", "419", "420", "421", "422", "423", "426", "427", "428", "430", "431", "432", "433", "434", "435", "436", "437", "438", "439", "440", "441", "442", "443", "444", "445", "446", "447", "448", "449", "450", "453", "454", "455", "456", "457", "458", "459", "460", "461", "462", "463", "464", "465", "466", "467", "468", "469", "470", "471", "472", "473", "474", "475", "476", "477", "478", "479", "480", "481", "482", "483", "484", 
"485", "486", "487", "488", "489", "490", "491", "492", "493", "494", "495", "496", "497", "498", "499", "500", "501", "502", "503", "504", "505", "507", "508", "509", "510", "511", "512", "513", "514", "515", "516", "517", "518", "519", "520", "521", "522", "523", "524", "525", "526", "527", "528", "529", "530", "531", "532", "533", "534", "536", "537", "538", "539", "540", "545", "546", "2000", "2002", "2003", "2004", "2005", "2006", "2008", "2009", "2010", "2012", "2017", "2021", "2022", "2023", "3002"), class = "factor"), y = structure(c(294L, 241L, 294L, 107L, 66L, 274L, 149L, 253L, 304L, 327L), .Label = c("72", "79", "82", "83", "116", "119", "120", "127", "128", "137", "143", "144", "146", "147", "150", "151", "152", "153", "157", "160", "161", "164", "167", "168", "173", "174", "195", "212", "216", "217", "218", "223", "224", "225", "228", "229", "232", "233", "236", "237", "238", "239", "241", "242", "243", "244", "245", "247", "248", "249", "250", "251", "252", "253", "254", "257", "258", "259", "260", "261", "262", "263", "264", "265", "266", "267", "268", "270", "271", "274", "275", "276", "278", "279", "280", "281", "282", "284", "285", "289", "290", "291", "293", "294", "295", "296", "297", "298", "300", "301", "302", "303", "304", "305", "306", "307", "308", "309", "310", "311", "312", "313", "314", "315", "316", "317", "318", "319", "320", "321", "322", "323", "324", "325", "326", "327", "328", "329", "330", "331", "332", "334", "335", "336", "337", "339", "340", "341", "342", "343", "344", "345", "346", "347", "348", "349", "350", "351", "352", "353", "354", "355", "356", "357", "358", "359", "360", "361", "362", "363", "364", "365", "366", "367", "368", "369", "372", "373", "375", "376", "377", "379", "380", "382", "383", "384", "385", "386", "387", "388", "389", "390", "391", "392", "393", "394", "395", "396", "397", "398", "399", "400", "401", "402", "403", "404", "405", "406", "407", "408", "409", "410", "411", "412", "414", "415", "416", 
"417", "418", "419", "420", "421", "422", "423", "426", "427", "428", "430", "431", "432", "433", "434", "435", "436", "437", "438", "439", "440", "441", "442", "443", "444", "445", "446", "447", "448", "449", "450", "453", "454", "455", "456", "457", "458", "459", "460", "461", "462", "463", "464", "465", "466", "467", "468", "469", "470", "471", "472", "473", "474", "475", "476", "477", "478", "479", "480", "481", "482", "483", "484", "485", "486", "487", "488", "489", "490", "491", "492", "493", "494", "495", "496", "497", "498", "499", "500", "501", "502", "503", "504", "505", "507", "508", "509", "510", "511", "512", "513", "514", "515", "516", "517", "518", "519", "520", "521", "522", "523", "524", "525", "526", "527", "528", "529", "530", "531", "532", "533", "534", "536", "537", "538", "539", "540", "545", "546", "2000", "2002", "2003", "2004", "2006", "2008", "2009", "2010", "2012", "2017", "2021", "2022", "2023", "3002"), class = "factor"), Freq = c(929L, 5032L, 1246L, 2654L, 1828L, 957L, 1405L, 1582L, 728L, 1748L), start.latittude = c(40.75188406, 40.75513557, 40.75188406, 40.75320159, 40.75097711, 40.74854862, 40.75172632, 40.75640548, 40.7575699, 40.75929124), start.longitude = c(-73.97770164, -73.98658032, -73.97770164, -73.9779874, -73.98765428, -73.98808416, -73.98753523, -73.9900262, -73.99098507, -73.98859651), start.station = c("Pershing\nSquare N", "Broadway &\nW 41 St", "Pershing\nSquare N", "E 43 St &\nVanderbilt\nAve", "Broadway &\nW 36 St", "Broadway &\nW 32 St", "Broadway &\nW 37 St", "W 41 St &\n8 Ave", "W 42 St &\n8 Ave", "W 45 St &\n8 Ave" ), end.latitude = c(40.75510267, 40.75640548, 40.75500254, 40.75500254, 40.75019995, 40.75019995, 40.75019995, 40.7568001, 40.7568001, 40.7575699), end.longitude = c(-73.97498696, -73.9900262, -73.98014437, -73.98014437, -73.99093085, -73.99093085, -73.99093085, -73.98291153, -73.98291153, -73.99098507), end.station = c("E 47 St &\nPark Av", "W 41 St &\n8 Ave", "W 44 St &\n5 Ave", "W 44 St &\n5 Ave", 
"W 33 St &\n7 Ave", "W 33 St &\n7 Ave", "W 33 St &\n7 Ave", "W 45 St &\n6 Ave", "W 45 St &\n6 Ave", "W 42 St &\n8 Ave"), interaction = c("E 47 St &Park Av > PershingSquare N", "W 41 St &8 Ave > Broadway &W 41 St", "W 44 St &5 Ave > PershingSquare N", "W 44 St &5 Ave > E 43 St &VanderbiltAve", "W 33 St &7 Ave > Broadway &W 36 St", "W 33 St &7 Ave > Broadway &W 32 St", "W 33 St &7 Ave > Broadway &W 37 St", "W 45 St &6 Ave > W 41 St &8 Ave", "W 45 St &6 Ave > W 42 St &8 Ave", "W 42 St &8 Ave > W 45 St &8 Ave")), .Names = c("x", "y", "Freq", "start.latittude", "start.longitude", "start.station", "end.latitude", "end.longitude", "end.station", "interaction" ), row.names = c(NA, -10L), class = "data.frame")`
Here's an option: ggplot(data= new_pairs, aes(x= reorder(interaction, -Freq), y=Freq))+ geom_bar(stat="identity", aes(fill = Freq, alpha = Freq)) + ylab("Bikes received")+ xlab("Station")+ ylim(0, max(new_pairs$Freq)+50) + geom_text(aes(label = interaction,y=(max(new_pairs$Freq)+50)/2,alpha = Freq), vjust="center",hjust = "center", size = 4, nudge_y = 1, fontface ="bold")+ theme(axis.text.y=element_blank())+ggtitle("Bikes received viarebalancing")+ coord_flip()+theme(legend.position = "none") You can set a y value in your geom_text aes to put the labels where you want them to be (you use coord_flip, so changing y controls the horizontal placement of the text). I set ylim manually to run from 0 to max(new_pairs$Freq)+50, so that the midpoint of the axis, (max(new_pairs$Freq)+50)/2, is known and can be used as the y position that centers the text labels.
If you want to center the text to each bar, here's a solution (based on the thread I linked above): library(plyr) new_pairs <- ddply(new_pairs, .(interaction), transform, pos = cumsum(Freq) - (0.5 * Freq)) ggplot(data= new_pairs, aes(x= reorder(interaction, -Freq), y=Freq))+ geom_bar(stat="identity", aes(fill = Freq, alpha = Freq)) + ylab("Bikes received")+ xlab("Station")+ geom_text(aes(label = interaction, y = pos, alpha = Freq), vjust="center",hjust = "center", size = 4, nudge_y = 1, fontface ="bold")+ theme(axis.text.y=element_blank())+ggtitle("Bikes received via rebalancing")+ coord_flip()+theme(legend.position = "none")