I have data that is in the following format:
(data <- tribble(
~Date, ~ENRSxOPEN, ~ENRSxCLOSE, ~INFTxOPEN, ~INFTxCLOSE,
"1989-09-11",82.97,82.10,72.88,72.56,
"1989-09-12",83.84,83.96,73.52,72.51,
"1989-09-13",83.16,83.88,72.91,72.12))
# A tibble: 3 x 5
Date ENRSxOPEN ENRSxCLOSE INFTxOPEN INFTxCLOSE
<chr> <dbl> <dbl> <dbl> <dbl>
1 1989-09-11 83.0 82.1 72.9 72.6
2 1989-09-12 83.8 84.0 73.5 72.5
3 1989-09-13 83.2 83.9 72.9 72.1
For analysis, I want to pivot this tibble longer to the following format:
tribble(
~Ticker, ~Date, ~OPEN, ~CLOSE,
"ENRS","1989-09-11",82.97,82.10,
"ENRS","1989-09-12",83.84,83.96,
"ENRS","1989-09-13",83.16,83.88,
"INFT","1989-09-11",72.88,72.56,
"INFT","1989-09-12",73.52,72.51,
"INFT","1989-09-13",72.91,72.12)
# A tibble: 3 x 5
Date ENRSxOPEN ENRSxCLOSE INFTxOPEN INFTxCLOSE
<chr> <dbl> <dbl> <dbl> <dbl>
1 1989-09-11 83.0 82.1 72.9 72.6
2 1989-09-12 83.8 84.0 73.5 72.5
3 1989-09-13 83.2 83.9 72.9 72.1
I.e., I want to separate the Open/Close prices from the ticker, and put the latter as an entirely new column in the beginning.
I've tried to use the function pivot_longer:
pivot_longer(data, cols = ENRSxOPEN:INFTxCLOSE)
While this goes into the direction of what I wanna achieve, it does not separate the prices and keep them in one row for each Ticker.
Is there a way to add additional arguments to pivot_longer()to achieve that?
pivot_longer(data, -Date, names_to = c('Ticker', '.value'), names_sep = 'x')
# A tibble: 6 x 4
Date Ticker OPEN CLOSE
<dbl> <chr> <dbl> <dbl>
1 1969 ENRS 83.0 82.1
2 1969 INFT 72.9 72.6
3 1968 ENRS 83.8 84.0
4 1968 INFT 73.5 72.5
5 1967 ENRS 83.2 83.9
6 1967 INFT 72.9 72.1
Related
the randomly generated data frame contains ID, Dates, and Earnings. I changed up the data frame format so that each column represents a date and its values corresponds to the earnings.
I want to create a new variable named "Date_over100 " that would determine the date when one's cumulative earnings have exceeded 100. I have put below a reproducible code that would generate the data frame. I assume conditional statements or loops would be used to achieve this. I would appreciate all the help there is. Thanks in advance!
ID <- c(1:10)
Date <- sample(seq(as.Date('2021/01/01'), as.Date('2021/01/11'), by="day", replace=T), 10)
Earning <- round(runif(10,30,50),digits = 2)
df <- data.frame(ID,Date,Earning,check.names = F)
df1 <- df%>%
arrange(Date)%>%
pivot_wider(names_from = Date, values_from = Earning)
df1 <- as.data.frame(df1)
df1[is.na(df1)] <- round(runif(sum(is.na(df1)),min=30,max=50),digits = 2)
I go back to long format for the calculation, then join to the wide data:
library(dplyr)
library(tidyr)
df1 %>% pivot_longer(cols = -ID, names_to = "date") %>%
group_by(ID) %>%
summarize(Date_over_100 = Date[which.max(cumsum(value) > 100)]) %>%
right_join(df1, by = "ID")
# # A tibble: 10 × 12
# ID Date_over_100 `2021-01-04` `2021-01-01` `2021-01-08` `2021-01-11` `2021-01-02` `2021-01-09`
# <int> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2021-01-08 45.0 46.2 40.1 47.4 47.5 48.8
# 2 2 2021-01-08 36.7 30.3 36.2 47.5 41.4 41.7
# 3 3 2021-01-08 49.5 46.0 45.0 43.9 45.4 37.1
# 4 4 2021-01-08 31.0 48.7 47.3 40.4 40.8 35.5
# 5 5 2021-01-08 48.2 35.2 32.1 44.2 35.4 49.7
# 6 6 2021-01-08 40.8 37.6 31.8 40.3 38.3 42.5
# 7 7 2021-01-08 37.9 42.9 36.8 46.0 39.8 33.6
# 8 8 2021-01-08 47.7 47.8 39.7 46.4 43.8 46.5
# 9 9 2021-01-08 32.9 42.0 41.8 32.8 33.9 35.5
# 10 10 2021-01-08 34.5 40.1 42.7 35.9 44.8 31.8
# # … with 4 more variables: 2021-01-10 <dbl>, 2021-01-03 <dbl>, 2021-01-07 <dbl>, 2021-01-05 <dbl>
So I'm using the quantmod library to calculate historical returns, but while I can get the past prices, how can I calculate the returns and add it on to the dataframe???
My code looks like this
tickers <- c('KO', 'AAPL')
getSymbols(tickers, from = '2020-07-01', to = '2021-07-01')
history <- cbind(KO$KO.Close,AAPL$AAPL.Close)
First I did a way to better import and structure data
Import
library(quantmod)
library(tidyverse)
tickers <- c('KO', 'AAPL')
df <-
map_df(
.x = tickers,
.f = function(x){
getSymbols(x, from = '2020-07-01', to = '2021-07-01',auto.assign = FALSE) %>%
as_tibble() %>%
set_names(c("open","high","low","close","volume","adjusted")) %>%
mutate(symbol = x)
}
)
# A tibble: 504 x 7
open high low close volume adjusted symbol
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 44.9 45.5 44.7 44.8 14316500 43.1 KO
2 45.3 45.4 44.8 44.9 15111900 43.2 KO
3 45.1 45.3 44.6 45.2 15146000 43.5 KO
4 45 45.5 44.8 45.2 13043600 43.5 KO
5 45.1 45.2 44.5 45.1 13851200 43.3 KO
6 45.0 45.0 43.8 43.9 16087100 42.2 KO
7 43.9 45.2 43.9 45.2 15627800 43.4 KO
8 45.5 45.7 45.0 45.2 16705300 43.5 KO
9 44.9 45.9 44.7 45.9 17080100 44.1 KO
10 46.3 47.2 46.2 46.4 23738000 44.6 KO
Return
I do not know if this is the right formula for return, but you can change later inside mutate
df %>%
group_by(symbol) %>%
mutate(return = 100*((open/lag(open))-1))
# A tibble: 504 x 8
# Groups: symbol [2]
open high low close volume adjusted symbol return
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 44.9 45.5 44.7 44.8 14316500 43.1 KO NA
2 45.3 45.4 44.8 44.9 15111900 43.2 KO 0.801
3 45.1 45.3 44.6 45.2 15146000 43.5 KO -0.331
4 45 45.5 44.8 45.2 13043600 43.5 KO -0.310
5 45.1 45.2 44.5 45.1 13851200 43.3 KO 0.311
6 45.0 45.0 43.8 43.9 16087100 42.2 KO -0.199
7 43.9 45.2 43.9 45.2 15627800 43.4 KO -2.60
8 45.5 45.7 45.0 45.2 16705300 43.5 KO 3.76
9 44.9 45.9 44.7 45.9 17080100 44.1 KO -1.36
10 46.3 47.2 46.2 46.4 23738000 44.6 KO 3.10
# ... with 494 more rows
Assuming the return you're looking for as today's value/yesterday's value, and using the tidyverse:
library(tidyverse)
library(timetk)
tickers <- c('KO', 'AAPL')
quantmod::getSymbols(tickers, from = '2020-07-01', to = '2021-07-01')
# Convert to a tibble to keep the dates
equity1 <- tk_tbl(KO) %>%
select(date = index, 5)
equity2 <- tk_tbl(AAPL) %>%
select(date = index, 5)
# Combine the series using a join, in case dates don't line up exactly.
history <- full_join(equity1, equity2, by = "date")
# Make data long, group by equity, do the calculation, turn back into wide data:
return <- history %>%
pivot_longer(-date) %>%
group_by(name) %>%
mutate(return = value/lag(value)-1) %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = c(value, return))
# A tibble: 252 x 5
date value_KO.Close value_AAPL.Close return_KO.Close return_AAPL.Close
<date> <dbl> <dbl> <dbl> <dbl>
1 2020-07-01 44.8 91.0 NA NA
2 2020-07-02 44.9 91.0 0.00134 0
3 2020-07-06 45.2 93.5 0.00780 0.0268
4 2020-07-07 45.2 93.2 -0.000442 -0.00310
5 2020-07-08 45.1 95.3 -0.00310 0.0233
6 2020-07-09 43.9 95.8 -0.0257 0.00430
7 2020-07-10 45.2 95.9 0.0282 0.00175
8 2020-07-13 45.2 95.5 0.00221 -0.00461
9 2020-07-14 45.9 97.1 0.0137 0.0165
10 2020-07-15 46.4 97.7 0.0116 0.00688
# ... with 242 more rows
I am trying to reshape my data to long instead of wide format using the same code provided earlier link , however it doesn't work even after several trials to modify names_pattern = "(.*)_(pre|post.*)",
My data sample is
data1<-read.table(text="
Serial_ID pre_EDV pre_ESV pre_LVEF post_EDV post_ESV post_LVEF
1 76.2 32.9 56.8 86.3 36.6 57.6
2 65.4 35.9 45.1 60.1 26.1 56.7
3 64.4 35.1 45.5 72.5 41.1 43.3
4 50 13.9 72.1 46.4 18.4 60.4
5 89.6 32 64.3 70.9 19.3 72.8
6 62 20.6 66.7 55.9 17.8 68.2
7 91.2 37.7 58.6 61.9 23.8 61.6
8 62 24 61.3 69.3 34.9 49.6
9 104.1 22.7 78.8 38.6 11.5 70.1
10 90.6 31.2 65.6 48 16.1 66.4", sep="", header=T)
I want to reshape my data to
put identical column headings below each other eg post_EDV below
pre_EDV
Create new column Pre vs. post
Fix column heading (remove "pre_" and "post_" to be "EDV" only (as shown in the screenshot below)).
This is the used code:
library(dplyr);library(tidyr);library(stringr)
out <- data %>% pivot_longer(cols = -Serial_ID,
names_to = c(".value", "prevspost"),
names_pattern = "(.*)_(pre|post.*)",
names_sep="_") #%>% as.data.frame
Also I tried names_prefix = c("pre_","post_") instead of names_pattern = "(.*)_(pre|post.*)", but it doesn't work.
Any advice will be greatly appreciated.
Edit I recommend using #Dave2e's superior approach.
The reason your attempt didn't work is because the pattern has to match in order. You could try this:
library(tidyr)
library(dplyr)
data1 %>% pivot_longer(cols = -Serial_ID,
names_to = c("prevspost",".value"),
names_pattern = "(pre|post)_(\\w+)") %>%
dplyr::arrange(desc(prevspost),Serial_ID)
# A tibble: 20 x 5
Serial_ID prevspost EDV ESV LVEF
<int> <chr> <dbl> <dbl> <dbl>
1 1 pre 76.2 32.9 56.8
2 2 pre 65.4 35.9 45.1
3 3 pre 64.4 35.1 45.5
4 4 pre 50 13.9 72.1
5 5 pre 89.6 32 64.3
6 6 pre 62 20.6 66.7
7 7 pre 91.2 37.7 58.6
8 8 pre 62 24 61.3
9 9 pre 104. 22.7 78.8
10 10 pre 90.6 31.2 65.6
11 1 post 86.3 36.6 57.6
12 2 post 60.1 26.1 56.7
13 3 post 72.5 41.1 43.3
14 4 post 46.4 18.4 60.4
15 5 post 70.9 19.3 72.8
16 6 post 55.9 17.8 68.2
17 7 post 61.9 23.8 61.6
18 8 post 69.3 34.9 49.6
19 9 post 38.6 11.5 70.1
20 10 post 48 16.1 66.4
Your initial approach very close, it needed some simplification. Use only "names_sep" or "names_pattern"
library(tidyr)
library(dplyr)
data1 %>% pivot_longer(cols = -Serial_ID,
names_to = c("Pre vs. post", '.value'),
names_sep="_")
# A tibble: 20 x 5
Serial_ID `Pre vs. post` EDV ESV LVEF
<int> <chr> <dbl> <dbl> <dbl>
1 1 pre 76.2 32.9 56.8
2 1 post 86.3 36.6 57.6
3 2 pre 65.4 35.9 45.1
4 2 post 60.1 26.1 56.7
5 3 pre 64.4 35.1 45.5
6 3 post 72.5 41.1 43.3
7 4 pre 50 13.9 72.1
8 4 post 46.4 18.4 60.4
9 5 pre 89.6 32 64.3
10 5 post 70.9 19.3 72.8
11 6 pre 62 20.6 66.7
12 6 post 55.9 17.8 68.2
13 7 pre 91.2 37.7 58.6
14 7 post 61.9 23.8 61.6
15 8 pre 62 24 61.3
16 8 post 69.3 34.9 49.6
17 9 pre 104. 22.7 78.8
18 9 post 38.6 11.5 70.1
19 10 pre 90.6 31.2 65.6
20 10 post 48 16.1 66.4
try this:
library(dplyr);library(tidyr);library(stringr)
out <- data1 %>% pivot_longer(-Serial_ID,
names_to = c("measurement", "names"),
values_to = "values",
names_sep = "_")
out
# # A tibble: 60 x 4
# Serial_ID measurement names values
# <int> <chr> <chr> <dbl>
# 1 1 pre EDV 76.2
# 2 1 pre ESV 32.9
# 3 1 pre LVEF 56.8
# 4 1 post EDV 86.3
# 5 1 post ESV 36.6
# 6 1 post LVEF 57.6
# 7 2 pre EDV 65.4
# 8 2 pre ESV 35.9
# 9 2 pre LVEF 45.1
# 10 2 post EDV 60.1
# # ... with 50 more rows
Your code snipped passed the object "data" instead of "data1" into the pipe which produced an error:
"Error: No tidyselect variables were registered".
I used the codes below to add a regression line after a boxplot.
boxplot(yield~Year, data=dfreg.raw,
ylab = 'Yield (bushels/acre)',
col = 'orange')
yield.year <- lm(yield~Year, data = dfreg.raw)
abline(reg = yield.year)
However, the regression line did not show up. The plot I got is below
My data looks like this. It's a panel data, which might end up problems with regression line.
> head(dfreg.raw)
# A tibble: 6 x 15
index Year yield State.Code harv frez_j dd_j cupc_j sm7_j fitted_j max_spring_j sp_spring_j
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 16001 1984 105 16 7200 330. 2438. 7.32 53.4 49.1 19.7 0.863
2 16001 1985 96.8 16 8200 413. 2407. 5.71 52.5 48.4 23.9 -0.391
3 16001 1986 94.9 16 7400 476. 2638. 8.34 52.5 48.4 23.4 -0.122
4 16001 1987 106. 16 9700 154. 2838. 5.44 54.4 49.9 25.6 -0.485
5 16001 1988 89.6 16 7600 184. 2944. 3.28 54.5 50.0 23.9 0.115
6 16001 1989 96.4 16 7300 383. 2766. 5.91 52.6 48.4 23.5 -1.02
# … with 3 more variables: pc_spring_j <dbl>, lt <dbl>, qt <dbl>
Anyone has any idea on this?
The x values are 1:max(levels of x variable), so the abline doesn't work. You can try something like this below.
First simulate a dataset:
dfreg.raw= data.frame(
yield=rpois(100,lambda=rep(seq(60,100,by=10),each=20)),
Year=rep(1995:1999,each=20)
)
Then plot:
boxplot(yield~Year, data=dfreg.raw,
ylab = 'Yield (bushels/acre)',
col = 'orange')
yield.year <- lm(yield~Year, data = dfreg.raw)
Get a unique ascending vector of Years, and predict
X = sort(unique(dfreg.raw$Year))
lines(x=1:length(X),
y=predict(yield.year,data.frame(Year=X)),col="blue",lty=8)
Let's use the following example:
set.seed(2409)
N=5
T=10
id<- rep(LETTERS[1:N],each=T)
time<-rep(1:T, times=N)
var1<-runif(N*T,0,100)
var2<-runif(N*T,0,100)
var3<-runif(N*T,0,100)
var4<-runif(N*T,0,100)
var5<-runif(N*T,0,100)
df<-data.frame(id,time,var1,var2,var3,var4,var5); rm(N,T,id,time,var1,var2,var3,var4,var5)
I now try to execute a function for several of these variables (not the whole series of variables!) and create new variables accordingly.
I already have a suitable code for creating log variables. For this I would use the following code:
cols <- c("var1",
"var3",
"var5")
log <- log(df[cols])
colnames(log) <- paste(colnames(log), "log", sep = "_")
df <- cbind(df,log); rm(log, cols)
This would give me my additional log variables. But now I also want to create lagged and z-transformed variables. These functions refer to the individual IDs. So I wrote the following code that of course works, but is extremely long and inefficient in my real dataset where I apply the function to 38 variables each:
library(Hmisc)
library(dplyr)
df<-df %>%
group_by(id) %>%
mutate(var1_1=Lag(var1, shift=1),
var3_1=Lag(var3, shift=1),
var5_1=Lag(var5, shift=1),
var1_2=Lag(var1, shift=2),
var3_2=Lag(var3, shift=2),
var5_2=Lag(var5, shift=2),
var1_z=scale(var1),
var3_z=scale(var3),
var5_z=scale(var5)
)
I am very sure that there is also a way to make this more efficient. It would be desirable if I could define the original variable once and execute different functions and create new variables as a result.
Thank you very much!
You can use mutate_at with funs. This will apply the three functions in funs to each of the three variables in vars, creating 9 new columns.
library(dplyr)
df %>%
group_by(id) %>%
mutate_at(vars(var1, var3, var5),
funs(lag1 = lag(.), lag2 = lag(., 2), scale))
# # A tibble: 50 x 16
# # Groups: id [5]
# id time var1 var2 var3 var4 var5 var1_lag1 var3_lag1 var5_lag1
# <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 38.8 25.7 29.2 91.1 35.3 NA NA NA
# 2 A 2 87.1 22.3 8.27 31.5 93.7 38.8 29.2 35.3
# 3 A 3 61.7 38.8 0.887 63.0 50.4 87.1 8.27 93.7
# 4 A 4 0.692 60.1 71.5 74.0 41.6 61.7 0.887 50.4
# 5 A 5 60.1 13.3 90.4 80.6 47.5 0.692 71.5 41.6
# 6 A 6 46.4 3.67 36.7 86.9 67.5 60.1 90.4 47.5
# 7 A 7 80.4 72.1 82.2 25.5 70.3 46.4 36.7 67.5
# 8 A 8 48.8 25.7 93.4 19.8 81.2 80.4 82.2 70.3
# 9 A 9 48.2 31.5 82.1 47.2 49.2 48.8 93.4 81.2
# 10 A 10 21.8 32.6 76.5 19.7 41.1 48.2 82.1 49.2
# # ... with 40 more rows, and 6 more variables: var1_lag2 <dbl>, var3_lag2 <dbl>,
# # var5_lag2 <dbl>, var1_scale <dbl>, var3_scale <dbl>, var5_scale <dbl>
Here is an option with data.table
library(data.table)
nm1 <- c('var1', 'var3', 'var5')
nm2 <- paste0(nm1, rep(c('_lag1', '_lag2'), each = 3))
nm3 <- paste0(nm1, '_scale')
setDT(df)[, c(nm2, nm3) := c(shift(.SD, n = 1:2), lapply(.SD,
function(x) as.vector(scale(x)))), by = id, .SDcols = nm1]'