the randomly generated data frame contains ID, Dates, and Earnings. I changed up the data frame format so that each column represents a date and its values corresponds to the earnings.
I want to create a new variable named "Date_over100 " that would determine the date when one's cumulative earnings have exceeded 100. I have put below a reproducible code that would generate the data frame. I assume conditional statements or loops would be used to achieve this. I would appreciate all the help there is. Thanks in advance!
ID <- c(1:10)
Date <- sample(seq(as.Date('2021/01/01'), as.Date('2021/01/11'), by="day", replace=T), 10)
Earning <- round(runif(10,30,50),digits = 2)
df <- data.frame(ID,Date,Earning,check.names = F)
df1 <- df%>%
arrange(Date)%>%
pivot_wider(names_from = Date, values_from = Earning)
df1 <- as.data.frame(df1)
df1[is.na(df1)] <- round(runif(sum(is.na(df1)),min=30,max=50),digits = 2)
I go back to long format for the calculation, then join to the wide data:
library(dplyr)
library(tidyr)
df1 %>% pivot_longer(cols = -ID, names_to = "date") %>%
group_by(ID) %>%
summarize(Date_over_100 = Date[which.max(cumsum(value) > 100)]) %>%
right_join(df1, by = "ID")
# # A tibble: 10 × 12
# ID Date_over_100 `2021-01-04` `2021-01-01` `2021-01-08` `2021-01-11` `2021-01-02` `2021-01-09`
# <int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2021-01-08 45.0 46.2 40.1 47.4 47.5 48.8
# 2 2 2021-01-08 36.7 30.3 36.2 47.5 41.4 41.7
# 3 3 2021-01-08 49.5 46.0 45.0 43.9 45.4 37.1
# 4 4 2021-01-08 31.0 48.7 47.3 40.4 40.8 35.5
# 5 5 2021-01-08 48.2 35.2 32.1 44.2 35.4 49.7
# 6 6 2021-01-08 40.8 37.6 31.8 40.3 38.3 42.5
# 7 7 2021-01-08 37.9 42.9 36.8 46.0 39.8 33.6
# 8 8 2021-01-08 47.7 47.8 39.7 46.4 43.8 46.5
# 9 9 2021-01-08 32.9 42.0 41.8 32.8 33.9 35.5
# 10 10 2021-01-08 34.5 40.1 42.7 35.9 44.8 31.8
# # … with 4 more variables: 2021-01-10 <dbl>, 2021-01-03 <dbl>, 2021-01-07 <dbl>, 2021-01-05 <dbl>
Related
I have data that is in the following format:
(data <- tribble(
~Date, ~ENRSxOPEN, ~ENRSxCLOSE, ~INFTxOPEN, ~INFTxCLOSE,
"1989-09-11",82.97,82.10,72.88,72.56,
"1989-09-12",83.84,83.96,73.52,72.51,
"1989-09-13",83.16,83.88,72.91,72.12))
# A tibble: 3 x 5
Date ENRSxOPEN ENRSxCLOSE INFTxOPEN INFTxCLOSE
<chr> <dbl> <dbl> <dbl> <dbl>
1 1989-09-11 83.0 82.1 72.9 72.6
2 1989-09-12 83.8 84.0 73.5 72.5
3 1989-09-13 83.2 83.9 72.9 72.1
For analysis, I want to pivot this tibble longer to the following format:
tribble(
~Ticker, ~Date, ~OPEN, ~CLOSE,
"ENRS","1989-09-11",82.97,82.10,
"ENRS","1989-09-12",83.84,83.96,
"ENRS","1989-09-13",83.16,83.88,
"INFT","1989-09-11",72.88,72.56,
"INFT","1989-09-12",73.52,72.51,
"INFT","1989-09-13",72.91,72.12)
# A tibble: 3 x 5
Date ENRSxOPEN ENRSxCLOSE INFTxOPEN INFTxCLOSE
<chr> <dbl> <dbl> <dbl> <dbl>
1 1989-09-11 83.0 82.1 72.9 72.6
2 1989-09-12 83.8 84.0 73.5 72.5
3 1989-09-13 83.2 83.9 72.9 72.1
I.e., I want to separate the Open/Close prices from the ticker, and put the latter as an entirely new column in the beginning.
I've tried to use the function pivot_longer:
pivot_longer(data, cols = ENRSxOPEN:INFTxCLOSE)
While this goes into the direction of what I wanna achieve, it does not separate the prices and keep them in one row for each Ticker.
Is there a way to add additional arguments to pivot_longer()to achieve that?
pivot_longer(data, -Date, names_to = c('Ticker', '.value'), names_sep = 'x')
# A tibble: 6 x 4
Date Ticker OPEN CLOSE
<dbl> <chr> <dbl> <dbl>
1 1969 ENRS 83.0 82.1
2 1969 INFT 72.9 72.6
3 1968 ENRS 83.8 84.0
4 1968 INFT 73.5 72.5
5 1967 ENRS 83.2 83.9
6 1967 INFT 72.9 72.1
So I'm using the quantmod library to calculate historical returns, but while I can get the past prices, how can I calculate the returns and add it on to the dataframe???
My code looks like this
tickers <- c('KO', 'AAPL')
getSymbols(tickers, from = '2020-07-01', to = '2021-07-01')
history <- cbind(KO$KO.Close,AAPL$AAPL.Close)
First I did a way to better import and structure data
Import
library(quantmod)
library(tidyverse)
tickers <- c('KO', 'AAPL')
df <-
map_df(
.x = tickers,
.f = function(x){
getSymbols(x, from = '2020-07-01', to = '2021-07-01',auto.assign = FALSE) %>%
as_tibble() %>%
set_names(c("open","high","low","close","volume","adjusted")) %>%
mutate(symbol = x)
}
)
# A tibble: 504 x 7
open high low close volume adjusted symbol
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 44.9 45.5 44.7 44.8 14316500 43.1 KO
2 45.3 45.4 44.8 44.9 15111900 43.2 KO
3 45.1 45.3 44.6 45.2 15146000 43.5 KO
4 45 45.5 44.8 45.2 13043600 43.5 KO
5 45.1 45.2 44.5 45.1 13851200 43.3 KO
6 45.0 45.0 43.8 43.9 16087100 42.2 KO
7 43.9 45.2 43.9 45.2 15627800 43.4 KO
8 45.5 45.7 45.0 45.2 16705300 43.5 KO
9 44.9 45.9 44.7 45.9 17080100 44.1 KO
10 46.3 47.2 46.2 46.4 23738000 44.6 KO
Return
I do not know if this is the right formula for return, but you can change later inside mutate
df %>%
group_by(symbol) %>%
mutate(return = 100*((open/lag(open))-1))
# A tibble: 504 x 8
# Groups: symbol [2]
open high low close volume adjusted symbol return
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 44.9 45.5 44.7 44.8 14316500 43.1 KO NA
2 45.3 45.4 44.8 44.9 15111900 43.2 KO 0.801
3 45.1 45.3 44.6 45.2 15146000 43.5 KO -0.331
4 45 45.5 44.8 45.2 13043600 43.5 KO -0.310
5 45.1 45.2 44.5 45.1 13851200 43.3 KO 0.311
6 45.0 45.0 43.8 43.9 16087100 42.2 KO -0.199
7 43.9 45.2 43.9 45.2 15627800 43.4 KO -2.60
8 45.5 45.7 45.0 45.2 16705300 43.5 KO 3.76
9 44.9 45.9 44.7 45.9 17080100 44.1 KO -1.36
10 46.3 47.2 46.2 46.4 23738000 44.6 KO 3.10
# ... with 494 more rows
Assuming the return you're looking for as today's value/yesterday's value, and using the tidyverse:
library(tidyverse)
library(timetk)
tickers <- c('KO', 'AAPL')
quantmod::getSymbols(tickers, from = '2020-07-01', to = '2021-07-01')
# Convert to a tibble to keep the dates
equity1 <- tk_tbl(KO) %>%
select(date = index, 5)
equity2 <- tk_tbl(AAPL) %>%
select(date = index, 5)
# Combine the series using a join, in case dates don't line up exactly.
history <- full_join(equity1, equity2, by = "date")
# Make data long, group by equity, do the calculation, turn back into wide data:
return <- history %>%
pivot_longer(-date) %>%
group_by(name) %>%
mutate(return = value/lag(value)-1) %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = c(value, return))
# A tibble: 252 x 5
date value_KO.Close value_AAPL.Close return_KO.Close return_AAPL.Close
<date> <dbl> <dbl> <dbl> <dbl>
1 2020-07-01 44.8 91.0 NA NA
2 2020-07-02 44.9 91.0 0.00134 0
3 2020-07-06 45.2 93.5 0.00780 0.0268
4 2020-07-07 45.2 93.2 -0.000442 -0.00310
5 2020-07-08 45.1 95.3 -0.00310 0.0233
6 2020-07-09 43.9 95.8 -0.0257 0.00430
7 2020-07-10 45.2 95.9 0.0282 0.00175
8 2020-07-13 45.2 95.5 0.00221 -0.00461
9 2020-07-14 45.9 97.1 0.0137 0.0165
10 2020-07-15 46.4 97.7 0.0116 0.00688
# ... with 242 more rows
I am trying to convert my yahoo price downloads to a "tidy" format, but in the reprex below, the dates lose their format and are converted to rownumbers. Stated differently, how do I convert from xts to tibble and preserve the dates?
prices <- getSymbols("QQQ", adjustOHLC = TRUE, auto.assign = FALSE) %>%
as_tibble() %>%
rownames_to_column(var = "Date")
head(prices)
To keep it in all in a single tidyverse pipe, simply convert to a data frame first:
library(quantmod)
library(tibble)
getSymbols("QQQ", adjustOHLC = TRUE, auto.assign = FALSE) %>%
as.data.frame() %>%
rownames_to_column(var = "Date") %>%
as_tibble()
#> # A tibble: 3,419 x 7
#> Date QQQ.Open QQQ.High QQQ.Low QQQ.Close QQQ.Volume QQQ.Adjusted
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2007-01-03 43.5 44.1 42.5 43.2 167689500 38.3
#> 2 2007-01-04 43.3 44.2 43.2 44.1 136853500 39.1
#> 3 2007-01-05 44.0 44.0 43.5 43.8 138958800 38.9
#> 4 2007-01-08 43.9 44.1 43.6 43.9 106401600 38.9
#> 5 2007-01-09 44.0 44.3 43.6 44.1 121577500 39.1
#> 6 2007-01-10 44.0 44.7 43.8 44.6 121070100 39.6
#> 7 2007-01-11 44.7 45.2 44.7 45.1 174029800 40.0
#> 8 2007-01-12 45.0 45.3 45.0 45.3 104217300 40.2
#> 9 2007-01-16 45.3 45.4 45.1 45.3 95690500 40.1
#> 10 2007-01-17 45.1 45.3 44.8 44.9 127142600 39.8
#> # ... with 3,409 more rows
Created on 2020-08-02 by the reprex package (v0.3.0)
I think you should use index() on the .xts rather than rownames_to_column() on the tibble
library(quantmod)
library(dplyr)
price.xts <-getSymbols("QQQ", adjustOHLC = TRUE, auto.assign = FALSE)
price<-as_tibble(price.xts)
price$Date <-index(price.xts)
head(price)
tail(price)
I've got a data frame like below with vector coordinates:
df <- structure(list(x0 = c(22.6, 38.5, 73.7), y0 = c(62.9, 56.6, 27.7
), x1 = c(45.8, 49.3, 80.8), y1 = c(69.9, 21.9, 14)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 3 x 4
x0 y0 x1 y1
<dbl> <dbl> <dbl> <dbl>
1 22.6 62.9 45.8 69.9
2 38.5 56.6 49.3 21.9
3 73.7 27.7 80.8 14
For visualisation purposes I need to manually interpolate points, i.e. add an intermediate row between each two rows of df, where the starting coordinates x0, y0 are the ending coordinates of original, previous row, while ending coordinates x1, y1 are the starting coordinates of original, next row. I also need to preserve information if an observation is from original dataset or it is manually added. So the expected output would be:
# A tibble: 5 x 5
x y pass_end_x pass_end_y source
<dbl> <dbl> <dbl> <dbl> <chr>
1 22.6 62.9 45.8 69.9 original
2 45.8 69.9 38.5 56.6 added
3 38.5 56.6 49.3 21.9 original
4 49.3 21.9 73.7 27.7 added
5 73.7 27.7 80.8 14 original
How can I do that in efficient and elegant way (preferably in tidyverse)?
To do this, all I'm going to do is swap the column names of the start and end points, and then use lead to get the next value of x1 and y1. Then we just add the source tag, and bind_rows
library(tidyverse)
df2 <- df
names(df2) <- names(df2)[c(3,4,1,2)] # swap names
df2 <- df2 %>% mutate(x1 = lead(x1), y1 = lead(y1),source = "added")
df <- df %>% mutate(source = "original") %>% bind_rows(., df2)
Resulting in:
# A tibble: 6 x 5
x0 y0 x1 y1 source
<dbl> <dbl> <dbl> <dbl> <chr>
1 22.6 62.9 45.8 69.9 original
2 38.5 56.6 49.3 21.9 original
3 73.7 27.7 80.8 14 original
4 45.8 69.9 38.5 56.6 added
5 49.3 21.9 73.7 27.7 added
6 80.8 14 NA NA added
If you need the rows in order:
df2 <- df2 %>% mutate(x1 = lead(x1), y1 = lead(y1),source = "added", ID = seq(1,n()*2, by =2)+1)
df <- df %>% mutate(source = "original", ID = seq(1,n()*2, by =2)) %>% bind_rows(., df2) %>% arrange(ID)
# A tibble: 6 x 6
x0 y0 x1 y1 source ID
<dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 22.6 62.9 45.8 69.9 original 1
2 45.8 69.9 38.5 56.6 added 2
3 38.5 56.6 49.3 21.9 original 3
4 49.3 21.9 73.7 27.7 added 4
5 73.7 27.7 80.8 14 original 5
6 80.8 14 NA NA added 6
Let's use the following example:
set.seed(2409)
N=5
T=10
id<- rep(LETTERS[1:N],each=T)
time<-rep(1:T, times=N)
var1<-runif(N*T,0,100)
var2<-runif(N*T,0,100)
var3<-runif(N*T,0,100)
var4<-runif(N*T,0,100)
var5<-runif(N*T,0,100)
df<-data.frame(id,time,var1,var2,var3,var4,var5); rm(N,T,id,time,var1,var2,var3,var4,var5)
I now try to execute a function for several of these variables (not the whole series of variables!) and create new variables accordingly.
I already have a suitable code for creating log variables. For this I would use the following code:
cols <- c("var1",
"var3",
"var5")
log <- log(df[cols])
colnames(log) <- paste(colnames(log), "log", sep = "_")
df <- cbind(df,log); rm(log, cols)
This would give me my additional log variables. But now I also want to create lagged and z-transformed variables. These functions refer to the individual IDs. So I wrote the following code that of course works, but is extremely long and inefficient in my real dataset where I apply the function to 38 variables each:
library(Hmisc)
library(dplyr)
df<-df %>%
group_by(id) %>%
mutate(var1_1=Lag(var1, shift=1),
var3_1=Lag(var3, shift=1),
var5_1=Lag(var5, shift=1),
var1_2=Lag(var1, shift=2),
var3_2=Lag(var3, shift=2),
var5_2=Lag(var5, shift=2),
var1_z=scale(var1),
var3_z=scale(var3),
var5_z=scale(var5)
)
I am very sure that there is also a way to make this more efficient. It would be desirable if I could define the original variable once and execute different functions and create new variables as a result.
Thank you very much!
You can use mutate_at with funs. This will apply the three functions in funs to each of the three variables in vars, creating 9 new columns.
library(dplyr)
df %>%
group_by(id) %>%
mutate_at(vars(var1, var3, var5),
funs(lag1 = lag(.), lag2 = lag(., 2), scale))
# # A tibble: 50 x 16
# # Groups: id [5]
# id time var1 var2 var3 var4 var5 var1_lag1 var3_lag1 var5_lag1
# <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 38.8 25.7 29.2 91.1 35.3 NA NA NA
# 2 A 2 87.1 22.3 8.27 31.5 93.7 38.8 29.2 35.3
# 3 A 3 61.7 38.8 0.887 63.0 50.4 87.1 8.27 93.7
# 4 A 4 0.692 60.1 71.5 74.0 41.6 61.7 0.887 50.4
# 5 A 5 60.1 13.3 90.4 80.6 47.5 0.692 71.5 41.6
# 6 A 6 46.4 3.67 36.7 86.9 67.5 60.1 90.4 47.5
# 7 A 7 80.4 72.1 82.2 25.5 70.3 46.4 36.7 67.5
# 8 A 8 48.8 25.7 93.4 19.8 81.2 80.4 82.2 70.3
# 9 A 9 48.2 31.5 82.1 47.2 49.2 48.8 93.4 81.2
# 10 A 10 21.8 32.6 76.5 19.7 41.1 48.2 82.1 49.2
# # ... with 40 more rows, and 6 more variables: var1_lag2 <dbl>, var3_lag2 <dbl>,
# # var5_lag2 <dbl>, var1_scale <dbl>, var3_scale <dbl>, var5_scale <dbl>
Here is an option with data.table
library(data.table)
nm1 <- c('var1', 'var3', 'var5')
nm2 <- paste0(nm1, rep(c('_lag1', '_lag2'), each = 3))
nm3 <- paste0(nm1, '_scale')
setDT(df)[, c(nm2, nm3) := c(shift(.SD, n = 1:2), lapply(.SD,
function(x) as.vector(scale(x)))), by = id, .SDcols = nm1]'