I am working on some portfolio data and I'm stumped by this data manipulation. I have this sample data
df <- tibble(
date = as.Date(c("2020-01-31", "2020-01-31", "2020-01-31",
"2020-02-29", "2020-02-29", "2020-02-29",
"2020-03-31", "2020-03-31", "2020-03-31") ),
id = c("KO", "AAPL", "MSFT",
"KO", "AAPL", "GOOG",
"KO", "AAPL", "MSFT"),
weight = c(0.3, 0.4, 0.3,
0.5, 0.3, 0.2,
0.6, 0.2, 0.2),
`weight_change (desired column)` = c(NA, NA, NA,
0.2, -0.1, 0.2,
0.1, -0.1, 0.2)
)
These are the positions in a sample portfolio. The portfolio gets new weights every month. What I want to calculate is the change in weight for each item in terms of the previous months weight. In this example we see that at the end of February, KO's current weight is 0.5 which is up 0.2 from the previous month. AAPL is down 0.1, while GOOG replaces MSFT so the change with the previous month is its entire current weight: 0.2. How can I set up a mutate such that it looks for the stock in the previous date and calculates the difference between the weights?
If the data is monthly for each 'id', we can do a complete to take account of the missing months, then do a group by diff
library(dplyr)
library(tidyr)
library(zoo)
df %>%
mutate(yearmonth = as.Date(as.yearmon(date))) %>%
group_by(id) %>%
complete(yearmonth = seq(first(yearmonth), last(yearmonth), by = '1 month')) %>%
mutate(weight_change = if(n() == 1) weight else c(NA, diff(replace_na(weight, 0)))) %>%
ungroup %>%
select(names(df), weight_change) %>%
filter(!is.na(date))
# A tibble: 9 x 5
# date id weight `weight_change (desired column)` weight_change
# <date> <chr> <dbl> <dbl> <dbl>
#1 2020-01-31 AAPL 0.4 NA NA
#2 2020-02-29 AAPL 0.3 -0.1 -0.1
#3 2020-03-31 AAPL 0.2 -0.1 -0.100
#4 2020-02-29 GOOG 0.2 0.2 0.2
#5 2020-01-31 KO 0.3 NA NA
#6 2020-02-29 KO 0.5 0.2 0.2
#7 2020-03-31 KO 0.6 0.1 0.100
#8 2020-01-31 MSFT 0.3 NA NA
#9 2020-03-31 MSFT 0.2 0.2 0.2
Here is my not so compact solution. I just use some helper columns, which I leave in so that one can follow.
library(tidyverse)
library(lubridate)
df <- tibble(
date = c("2020-01-31", "2020-01-31", "2020-01-31",
"2020-02-29", "2020-02-29", "2020-02-29",
"2020-03-31", "2020-03-31", "2020-03-31"),
id = c("KO", "AAPL", "MSFT", "KO", "AAPL", "GOOG", "KO", "AAPL", "MSFT"),
weight = c(0.3, 0.4, 0.3, 0.5, 0.3, 0.2, 0.6, 0.2, 0.2),
`weight_change (desired_column)` = c(NA, NA, NA, 0.2, -0.1, 0.2, 0.1, -0.1, 0.2)
) %>% #new code starts here
mutate(
date = as_date(date),
date_ym = floor_date(date,
unit = "month"))%>%
group_by(id)%>%
arrange(date)%>%
mutate(id_n = row_number(),
prev_exist = case_when(lag(date_ym) == date_ym - months(1) ~ "immediate month", #if there is an immediate month
id_n == 1 & date != min(df$date)~ "new month", #if this is a new month
TRUE ~ "no immediate month"),
weight_change = case_when(prev_exist == "new month"~ weight,
prev_exist == "no immediate month" & id_n > 1~ weight,
TRUE ~ weight-lag(weight)),
date_ym = NULL,
id_n = NULL,
prev_exist = NULL)
A timetk approach:
library(timetk)
df %>%
mutate(Month = lubridate::floor_date(date, "month")) %>%
group_by(id) %>%
timetk::pad_by_time(.date_var = Month, .by="month") %>%
select(-Month) %>%
mutate(WC = if(n() == 1) weight else c(NA, diff(weight)))
A tibble: 10 x 5
Groups: id [4]
id date weight weight_change WC
<chr> <date> <dbl> <dbl> <dbl>
1 KO 2020-01-31 0.3 NA NA
2 KO 2020-02-29 0.5 0.2 0.2
3 KO 2020-03-31 0.6 0.1 0.100
4 AAPL 2020-01-31 0.4 NA NA
5 AAPL 2020-02-29 0.3 -0.1 -0.1
6 AAPL 2020-03-31 0.2 -0.1 -0.100
7 MSFT 2020-01-31 0.3 NA NA
8 MSFT NA NA NA NA
9 MSFT 2020-03-31 0.2 0.2 NA
10 GOOG 2020-02-29 0.2 0.2 0.2
Related
I have some struggles converting the following data (from an Excel-sheet) into a tidy format:
input <- structure(list(...11 = c(
NA, NA, "<1000", ">=1000 and <2000",
"2000", ">2000 and < 3000", ">=3000"
), ...13 = c(
"male", "female",
NA, NA, NA, NA, NA
), ...14 = c(
"<777", "<555", "0.3", "0.1",
"0.15", "0.13", "0.15"
), ...15 = c(
"888-999", "555-999", "0.23",
"0.21", "0", "0.21", "0.36"
), ...16 = c(
"556-899", "1020-1170",
"0.13", "0.29", "0.7", "0.8", "0.2"
), ...17 = c(
">960", ">11000",
"0.58", "0.31", "0.22", "0.65", "0.7"
)), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 7 × 6
...11 ...13 ...14 ...15 ...16 ...17
<chr> <chr> <chr> <chr> <chr> <chr>
1 NA male <777 888-999 556-899 >960
2 NA female <555 555-999 1020-1170 >11000
3 <1000 NA 0.3 0.23 0.13 0.58
4 >=1000 and <2000 NA 0.1 0.21 0.29 0.31
5 2000 NA 0.15 0 0.7 0.22
6 >2000 and < 3000 NA 0.13 0.21 0.8 0.65
7 >=3000 NA 0.15 0.36 0.2 0.7
I would like to bring it into the following structure:
output <- tibble::tribble(
~gender, ~x, ~y, ~share,
"male", "<777", "<1000", 0.3,
"female", "<555", "<1000", 0.3,
"male", "<777", ">=1000 and <2000", 0.1,
"female", "<555", ">=1000 and <2000", 0.1,
)
# A tibble: 4 × 4
gender x y share
<chr> <chr> <chr> <dbl>
1 male <777 <1000 0.3
2 female <555 <1000 0.3
3 male <777 >=1000 and <2000 0.1
4 female <555 >=1000 and <2000 0.1
Any hints are much appreciated!
As outlined in the comments, here's a suggested approach:
Import the excel sheet twice using readxl's read_excel using the skip argument:
library(readxl)
df1 <- read_excel(file = "yourfile.xlsx", skip = 2)
df2 <- read_excel(file = "yourfile.xlsx", skip = 1)
That should give you (note X1 might be called ...1):
df1 <- read_table("NA male <777 888-999 556-899 >960
<1000 NA 0.3 0.23 0.13 0.58
>=1000and<2000 NA 0.1 0.21 0.29 0.31
2000 NA 0.15 0 0.7 0.22
>2000and<3000 NA 0.13 0.21 0.8 0.65
>=3000 NA 0.15 0.36 0.2 0.7")
df2 <- read_table("NA female <555 555-999 1020-1170 >11000
<1000 NA 0.3 0.23 0.13 0.58
>=1000and<2000 NA 0.1 0.21 0.29 0.31
2000 NA 0.15 0 0.7 0.22
>2000and<3000 NA 0.13 0.21 0.8 0.65
>=3000 NA 0.15 0.36 0.2 0.7")
Then do a little wrangling; most importantly turn into a long format:
library(dplyr)
library(tidyr)
df1 <- df1 |>
select(-male) |>
rename(y = X1) |>
mutate(gender = "male") |>
pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")
df2 <- df2 |>
select(-female) |>
rename(y = X1) |>
mutate(gender = "female") |>
pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")
And voila, a tidy frame:
bind_rows(df1, df2) |> arrange(y)
Output:
# A tibble: 40 × 4
y gender x share
<chr> <chr> <chr> <dbl>
1 <1000 male <777 0.3
2 <1000 male 888-999 0.23
3 <1000 male 556-899 0.13
4 <1000 male >960 0.58
5 <1000 female <555 0.3
6 <1000 female 555-999 0.23
7 <1000 female 1020-1170 0.13
8 <1000 female >11000 0.58
9 >=1000and<2000 male <777 0.1
10 >=1000and<2000 male 888-999 0.21
# … with 30 more rows
It's a bit unclear, but I think you'd need to do something like this
df <- input[3:nrow(input),]
input <- input[1:2, 2:3]
t <- input[rep(1:nrow(input), nrow(df)),]
s <- df[rep(1:nrow(df), 2), ]
t <- cbind(t,s)
, and repeat as needed if you need this for multiple columns.
I would like to make a connection between the x and df2 datasets. Notice that the dataset x, I have a percentage value, which in this case for the day 03-01-2021 is 0.1 and for the days 01-02-2021 and 01-01-2022 it is 0.45. So from that information, I know the percentage value for 03-01-2021 is 0.1, so this value falls into category I of my dataset df2 (since the values range from 0.1 to 0.2). As for the days 02-01-2021 and 01-01-2022, they correspond to category F of the df2,since the values range from 0.4 to 0.5. So, I would like to generate an output table as follows:
library(dplyr)
df1<- structure(
list(date2= c("01-01-2022","01-01-2022","03-01-2021","03-01-2021","01-02-2021","01-02-2021"),
Category= c("ABC","CDE","ABC","CDE","ABC","CDE"),
coef= c(5,4,0,2,4,5)),
class = "data.frame", row.names = c(NA, -6L))
x<-df1 %>%
group_by(date2) %>%
summarize(across("coef", sum),.groups = 'drop')%>%
arrange(date2 = as.Date(date2, format = "%d-%m-%Y"))
number<-20
x$Percentage<-x$coef/number
date2 coef Percentage
<chr> <dbl> <dbl>
1 03-01-2021 2 0.1
2 01-02-2021 9 0.45
3 01-01-2022 9 0.45
df2 <- structure(
list(
Category = c("A", "B", "C", "D",
"E", "F", "G", "H", "I", "J"),
From = c(0.9,
0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0),
Until = c(
1,
0.8999,
0.7999,
0.6999,
0.5999,
0.4999,
0.3999,
0.2999,
0.1999,
0.0999
),
`1 Val` = c(
2222,
2017.8,
1793.6,
1621.5,
1522.4,
1457.3,
1325.2,
1229.15,
1223.1,
1177.05
),
`2 Val` = c(3200, 2220, 2560,
2200, 2220, 2080, 1220, 1240, 1720, 1620),
`3 Val` = c(
4665,
4122.5,
3732,
3498.75,
3265.5,
3032.25,
2799,
2682.375,
2565.75,
2449.125
),
`4 Val` = c(
6112,
5222.8,
4889.6,
4224,
4278.4,
3972.8,
3667.2,
3224.4,
3361.6,
3222.8
)
),
row.names = c(NA,-10L),
class = c("tbl_df",
"tbl", "data.frame")
)
Category From Until 1 Val 2 Val 3 Val 4 Val
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0.9 1 2222 3200 4665 6112
2 B 0.8 0.900 2018 2220 4122 5223
3 C 0.7 0.800 1794 2560 3732 4890
4 D 0.6 0.700 1622 2200 3499 4224
5 E 0.5 0.600 1522 2220 3266 4278
6 F 0.4 0.500 1457 2080 3032 3973
7 G 0.3 0.400 1325 1220 2799 3667
8 H 0.2 0.300 1229 1240 2682 3224
9 I 0.1 0.200 1223 1720 2566 3362
10 J 0 0.0999 1177 1620 2449 3223
Using tidyverse, we do a rowwise on the 'x' dataset, slice the rows of 'df2' where the 'Percentage' falls between the 'From' and 'Until', and unpack the data.frame/tibble column
library(dplyr)
library(tidyr)
x %>%
rowwise %>%
mutate(out = df2 %>%
slice(which(Percentage>= From &
Percentage <= Until)[1]) %>%
select(-(1:3)) ) %>%
ungroup %>%
unpack(out)
-output
# A tibble: 3 × 7
date2 coef Percentage `1 Val` `2 Val` `3 Val` `4 Val`
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 03-01-2021 2 0.1 1223. 1720 2566. 3362.
2 01-02-2021 9 0.45 1457. 2080 3032. 3973.
3 01-01-2022 9 0.45 1457. 2080 3032. 3973.
Or this could be done with a non-equi join
library(data.table)
nm1 <- names(df2)[endsWith(names(df2), 'Val')]
setDT(x)[setDT(df2), (nm1) := mget(nm1),
on = .(Percentage >= From, Percentage <= Until)]
-output
> x
date2 coef Percentage 1 Val 2 Val 3 Val 4 Val
1: 03-01-2021 2 0.10 1223.1 1720 2565.75 3361.6
2: 01-02-2021 9 0.45 1457.3 2080 3032.25 3972.8
3: 01-01-2022 9 0.45 1457.3 2080 3032.25 3972.8
I want to convert the column time to be in time decimal format and then find the time interval within each group of the user_id. I have tried the answer below, but I could not get it to work:
Days difference between two dates in same column in R
structure(list(question_id = c(5502L, 5502L, 5502L, 5502L, 5502L
), user_id = c(112197L, 112197L, 112197L, 114033L, 114033L),
time = structure(c(1603720173, 1603720388, 1603720702, 1603603115,
1603949442), class = c("POSIXct", "POSIXt"), tzone = ""),
prediction = c(0.9, 0.95, 0.9, 0.99, 0.94), log_score = c(0.84799690655495,
0.925999418556223, 0.84799690655495, 0.985500430304885, 0.910732661902913
)), row.names = 156182:156186, class = "data.frame")
Perhaps this is what you're looking for?
library(dplyr)
user_data %>%
group_by(user_id) %>%
summarise(day.interval = difftime(max(time), min(time),units = "days"))
# A tibble: 2 x 2
user_id day.interval
<int> <drtn>
1 112197 0.006122685 days
2 114033 4.008414352 days
library(tidyverse)
library(lubridate)
df <- tibble::tribble(
~question_id, ~user_id, ~time, ~prediction, ~log_score,
5502L, 112197L, "2020-10-26 14:49:33", 0.9, 0.84799690655495,
5502L, 112197L, "2020-10-26 14:53:08", 0.95, 0.925999418556223,
5502L, 112197L, "2020-10-26 14:58:22", 0.9, 0.84799690655495,
5502L, 114033L, "2020-10-25 06:18:35", 0.99, 0.985500430304885,
5502L, 114033L, "2020-10-29 06:30:42", 0.94, 0.910732661902913
)
df %>%
as_tibble() %>%
mutate(time = lubridate::ymd_hms(time)) %>%
group_by(user_id) %>%
mutate(diff = time - lag(time),
diff2 = hms::hms(seconds_to_period(diff)))
#> # A tibble: 5 x 7
#> # Groups: user_id [2]
#> question_id user_id time prediction log_score diff diff2
#> <int> <int> <dttm> <dbl> <dbl> <drtn> <time>
#> 1 5502 112197 2020-10-26 14:49:33 0.9 0.848 NA secs NA
#> 2 5502 112197 2020-10-26 14:53:08 0.95 0.926 215 secs 00:03:35
#> 3 5502 112197 2020-10-26 14:58:22 0.9 0.848 314 secs 00:05:14
#> 4 5502 114033 2020-10-25 06:18:35 0.99 0.986 NA secs NA
#> 5 5502 114033 2020-10-29 06:30:42 0.94 0.911 346327 secs 96:12:07
This question already has an answer here:
How to use Pivot_longer to reshape from wide-type data to long-type data with multiple variables
(1 answer)
Closed 2 years ago.
Overview
So, I'm looking to tidy my data frame. I have found a solution to my problem but it seems highly inefficient when I am working with my large dataset. Currently my code gathers my data frame, applies a separate function to split the ticker from the metric, and then spreads the data appropriately. See the example below
Data frame
structure(list(date = c("2009-07-01", "2009-07-02", "2009-07-06",
"2009-07-07", "2009-07-08"), PRED.Open = c(0.5, 0.5, 0.7, 0.7,
0.7), PRED.High = c(0.5, 0.6, 0.7, 0.7, 0.7), PRED.Low = c(0.5,
0.5, 0.5, 0.7, 0.7), PRED.Close = c(0.5, 0.6, 0.5, 0.7, 0.7),
PRED.Volume = c(0L, 300L, 200L, 0L, 0L), PRED.Adjusted = c(0.5,
0.6, 0.5, 0.7, 0.7), GDM.Open = c(1041.02002, 1085.109985,
1052.02002, 1011.429993, 1006.630005), GDM.High = c(1097.790039,
1085.109985, 1052.02002, 1029.290039, 1006.630005), GDM.Low = c(1041.02002,
1038.540039, 995.450012, 1005.280029, 948.73999), GDM.Close = c(1085.109985,
1052.02002, 1011.429993, 1006.630005, 966.22998), GDM.Volume = c(0L,
0L, 0L, 0L, 0L), GDM.Adjusted = c(1085.109985, 1052.02002,
1011.429993, 1006.630005, 966.22998), NBL.Open = c(29.885,
29.325001, 27.370001, 27.485001, 26.815001), NBL.High = c(30.35,
29.325001, 27.545, 27.610001, 27.18), NBL.Low = c(29.83,
28.07, 26.825001, 26.605, 25.745001)), row.names = c(NA,
-5L), class = "data.frame")
Current Solution
df <- df %>% gather(c(2:ncol(df)), key = "ticker", value = "val")
df <- separate(df, col = "ticker", into = c("ticker", "metric"), sep = "\\.") %>%
ungroup() %>%
spread(key = "metric", value = "val") %>%
arrange(ticker, date)
Desired Outcome
Question
Is there a more efficient way to accomplish this?
If you use pivot_longer from tidyr 1.0.0 you can do this in one line :
tidyr::pivot_longer(df,
cols = -date,
names_to = c('ticker', '.value'),
names_sep = '\\.') %>%
dplyr::arrange(ticker, date)
# A tibble: 15 x 8
# date ticker Open High Low Close Volume Adjusted
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
# 1 2009-07-01 GDM 1041.0 1097.8 1041.0 1085.1 0 1085.1
# 2 2009-07-02 GDM 1085.1 1085.1 1038.5 1052.0 0 1052.0
# 3 2009-07-06 GDM 1052.0 1052.0 995.45 1011.4 0 1011.4
# 4 2009-07-07 GDM 1011.4 1029.3 1005.3 1006.6 0 1006.6
# 5 2009-07-08 GDM 1006.6 1006.6 948.74 966.23 0 966.23
# 6 2009-07-01 NBL 29.885 30.35 29.83 NA NA NA
# 7 2009-07-02 NBL 29.325 29.325 28.07 NA NA NA
# 8 2009-07-06 NBL 27.370 27.545 26.825 NA NA NA
# 9 2009-07-07 NBL 27.485 27.610 26.605 NA NA NA
#10 2009-07-08 NBL 26.815 27.18 25.745 NA NA NA
#11 2009-07-01 PRED 0.5 0.5 0.5 0.5 0 0.5
#12 2009-07-02 PRED 0.5 0.6 0.5 0.6 300 0.6
#13 2009-07-06 PRED 0.7 0.7 0.5 0.5 200 0.5
#14 2009-07-07 PRED 0.7 0.7 0.7 0.7 0 0.7
#15 2009-07-08 PRED 0.7 0.7 0.7 0.7 0 0.7
Say I have some data with 2 numeric variables ranging from 0 to 1 (it1, it2), a name variable, which has the name of the subject the numeric variable belongs to and then some date for every measure, ranging from year 2014 to 2017. Now, what I want to do is create a data set that only contains measures of people that have values for every year of my measure, and then in the future maybe specify that I only want measures for people with data ranging from 2015 to 2017. Does anybody have any hint on what package or code could help me with my problem? Thanks in advance.
date <- c("2015-11-26", "2015-12-30","2016-11-13", "2014-09-22", "2014-01-13", "2014-07-26", "2016-11-26", "2016-04-04", "2017-04-09", "2017-02-23", "2015-03-22")
names <- c("Max", "Allen", "Allen", "Bob", "Max", "Sarah", "Max", "Sarah", "Max", "Sarah", "Sarah")
it1 <- c(0.6, 0.3, 0.1, 0.2, 0.3, 0.8, 0.8, 0.5, 0.5, 0.3, 0.7)
it2 <- c(0.5, 0.8, 0.1, 0.4, 0.4, 0.4, 0.5, 0.8, 0.6, 0.5, 0.4)
date <- as.Date(date, format = "%Y-%m-%d")
myframe <- data.frame(date, names, it1, it2)
Desired output:
date <- c("2015-11-26", "2014-01-13", "2014-07-26", "2016-11-26", "2016-04-04", "2017-04-09", "2017-02-23", "2015-03-22")
names <- c("Max", "Max", "Sarah", "Max", "Sarah", "Max", "Sarah", "Sarah")
it1 <- c(0.6, 0.3, 0.8, 0.8, 0.5, 0.5, 0.3, 0.7)
it2 <- c(0.5, 0.4, 0.4, 0.5, 0.8, 0.6, 0.5, 0.4)
date <- as.Date(date, format = "%Y-%m-%d")
myframe <- data.frame(date, names, it1, it2)
Create a table of year vs. name and for those names in all years select out those rows. No packages are used.
tab <- table(as.POSIXlt(myframe$date)$year + 1900, myframe$names)
subset(myframe, names %in% colnames(tab)[colSums(sign(tab)) == nrow(tab)])
giving:
date names it1 it2
1 2015-11-26 Max 0.6 0.5
5 2014-01-13 Max 0.3 0.4
6 2014-07-26 Sarah 0.8 0.4
7 2016-11-26 Max 0.8 0.5
8 2016-04-04 Sarah 0.5 0.8
9 2017-04-09 Max 0.5 0.6
10 2017-02-23 Sarah 0.3 0.5
11 2015-03-22 Sarah 0.7 0.4
library(lubridate)
myframe[with(data = myframe[year(myframe$date) >= 2014 & year(myframe$date) <= 2017,],
expr = ave(year(date), names, FUN = function(x)
all(year(date) %in% x))) == 1,]
# date names it1 it2
#1 2015-11-26 Max 0.6 0.5
#5 2014-01-13 Max 0.3 0.4
#6 2014-07-26 Sarah 0.8 0.4
#7 2016-11-26 Max 0.8 0.5
#8 2016-04-04 Sarah 0.5 0.8
#9 2017-04-09 Max 0.5 0.6
#10 2017-02-23 Sarah 0.3 0.5
#11 2015-03-22 Sarah 0.7 0.4