R - cut function duplicates values - r

I have this code
bucket <- seq(0, 100000, by = 5000)
dt <-
data.frame(sold_amount = bucket) %>%
mutate(bucket = cut(bucket, breaks = bucket, include.lowest = T, dig.lab = 10))
If I execute it, bucket [0, 5000] is duplicated, with include.lowest = T bucket for amount 0 is na How can i get bins [0,5000] for sold amount 0 and (5000,10000] for sold amount 5000?

Maybe this?
cut(bucket, breaks = c(bucket,Inf), include.lowest = T, right = FALSE, dig.lab = 10)
such that
> dt <-
+ data.frame(sold_amount = bucket) %>%
+ mutate(bucket = cut(bucket, breaks = c(bucket, Inf), include.lowest = T, right = FALSE, dig.lab = .... [TRUNCATED]
> dt
sold_amount bucket
1 0 [0,5000)
2 5000 [5000,10000)
3 10000 [10000,15000)
4 15000 [15000,20000)
5 20000 [20000,25000)
6 25000 [25000,30000)
7 30000 [30000,35000)
8 35000 [35000,40000)
9 40000 [40000,45000)
10 45000 [45000,50000)
11 50000 [50000,55000)
12 55000 [55000,60000)
13 60000 [60000,65000)
14 65000 [65000,70000)
15 70000 [70000,75000)
16 75000 [75000,80000)
17 80000 [80000,85000)
18 85000 [85000,90000)
19 90000 [90000,95000)
20 95000 [95000,100000)
21 100000 [100000,Inf]

A pragmatic approach would be to just filter out the offending line:
library(tidyverse)
bucket <- seq(0, 100000, by = 5000)
dt <-
data.frame(sold_amount = bucket) %>%
mutate(bucket = cut(bucket, breaks = bucket, include.lowest = T, dig.lab = 10)) %>%
dplyr::filter(sold_amount != 0)
> dt
sold_amount bucket
1 5000 [0,5000]
2 10000 (5000,10000]
3 15000 (10000,15000]
4 20000 (15000,20000]
5 25000 (20000,25000]
6 30000 (25000,30000]
7 35000 (30000,35000]
8 40000 (35000,40000]
9 45000 (40000,45000]
10 50000 (45000,50000]
11 55000 (50000,55000]
12 60000 (55000,60000]
13 65000 (60000,65000]
14 70000 (65000,70000]
15 75000 (70000,75000]
16 80000 (75000,80000]
17 85000 (80000,85000]
18 90000 (85000,90000]
19 95000 (90000,95000]
20 100000 (95000,100000]

Maybe just remove the first row
dt <-
data.frame(sold_amount = bucket) %>%
mutate(bucket = cut(bucket, breaks = bucket, include.lowest = T, dig.lab = 10))%>%
.[-1,]
dt

An approach with my santoku package:
library(santoku)
dt$bucket <- chop_width(dt$sold_amount, 5000, labels = lbl_intervals("%d"))
dt
sold_amount bucket
1 0 [0, 5000)
2 5000 [5000, 10000)
3 10000 [10000, 15000)
4 15000 [15000, 20000)
5 20000 [20000, 25000)
6 25000 [25000, 30000)
7 30000 [30000, 35000)
8 35000 [35000, 40000)
9 40000 [40000, 45000)
10 45000 [45000, 50000)
11 50000 [50000, 55000)
12 55000 [55000, 60000)
13 60000 [60000, 65000)
14 65000 [65000, 70000)
15 70000 [70000, 75000)
16 75000 [75000, 80000)
17 80000 [80000, 85000)
18 85000 [85000, 90000)
19 90000 [90000, 95000)
20 95000 [95000, 100000)
21 100000 {100000}

Related

How I use accumulate function from dplyr to rowwise reproduce a data frame or tibble in R?

I want to reproduce the following table from Hull's book Options & Derivatives.
I believe that accumulate function from dplyr might be suitable but I cannot reproduce it.
my effort is the following.
library(tidyverse)
initial_deposit = 12000
contract_value = 9000
closing_stock_indices = c(1250,1241,1238.3,1244.6,1241.3,1240.1, 1236.2, 1229.9,
1230.8,1225.4,1228.1,1211,1211,1214.3,1216.1,1223,1226.9)
(seq_along(closing_stock_indices) - 1) %>%
as.data.frame() %>%
setNames('Day') %>%
mutate(Closing_SI = closing_stock_indices,
Daily_change = c(0, diff(Closing_SI*200)),
Cum_gain = cumsum( Daily_change),
Margin_balance = accumulate(Cum_gain[-1], .init = initial_deposit,
~ if (.x >= initial_deposit).x + .y else initial_deposit + .y),
Variation_Margin = -1 * pmin(Margin_balance - initial_deposit, 0),
margin_call = c(0, Variation_Margin[-n()]))
or
tibble(closing_stock_indices)%>%
dplyr::mutate(Day = row_number())%>%
mutate(Close =closing_stock_indices )%>%
dplyr::mutate(y = as.numeric( Close - (dplyr::lag(Close, 1))))%>%
dplyr::select(-closing_stock_indices)%>%
dplyr::mutate(y = replace_na(y,0),
Cum_gain = 200*cumsum(y))%>%
dplyr::mutate(Margin_balance = initial_deposit+Cum_gain)
Is there any other alternative using rowwise() from dplyr?
I would appreciate any help.
You need to accumulate over the daily change not the cumulative change, otherwise you are cumulating twice. I have used a base lambda function instead of the purrr one for expressiveness:
library(tidyverse)
initial_deposit = 12000
contract_value = 9000
closing_stock_indices = c(1250,1241,1238.3,1244.6,1241.3,1240.1, 1236.2, 1229.9,
1230.8,1225.4,1228.1,1211,1211,1214.3,1216.1,1223,1226.9)
(seq_along(closing_stock_indices) - 1) %>%
as.data.frame() %>%
setNames('Day') %>%
mutate(
Closing_SI = closing_stock_indices,
Daily_change = c(0, diff(Closing_SI*200)),
Cum_gain = cumsum(Daily_change),
Margin_balance = accumulate(
Daily_change[-1],
.init = initial_deposit,
\(bal, gain) if (bal > contract_value) bal + gain
else initial_deposit + gain
),
margin_call = ifelse(Margin_balance < contract_value,
initial_deposit - Margin_balance,
0)
)
#> Day Closing_SI Daily_change Cum_gain Margin_balance margin_call
#> 1 0 1250.0 0 0 12000 0
#> 2 1 1241.0 -1800 -1800 10200 0
#> 3 2 1238.3 -540 -2340 9660 0
#> 4 3 1244.6 1260 -1080 10920 0
#> 5 4 1241.3 -660 -1740 10260 0
#> 6 5 1240.1 -240 -1980 10020 0
#> 7 6 1236.2 -780 -2760 9240 0
#> 8 7 1229.9 -1260 -4020 7980 4020
#> 9 8 1230.8 180 -3840 12180 0
#> 10 9 1225.4 -1080 -4920 11100 0
#> 11 10 1228.1 540 -4380 11640 0
#> 12 11 1211.0 -3420 -7800 8220 3780
#> 13 12 1211.0 0 -7800 12000 0
#> 14 13 1214.3 660 -7140 12660 0
#> 15 14 1216.1 360 -6780 13020 0
#> 16 15 1223.0 1380 -5400 14400 0
#> 17 16 1226.9 780 -4620 15180 0

How can I reproduce the following finance table in r

I want to recreate in R the following table :
I have been provided with only these three parameters:
initial_deposit = 15000
contract_value = 250
closing_stock_indices = c(1000, 1002, 994, 998, 997)
My effort in R is this:
cl =c(1000,1002,994,998,997) # The closing stock indices.
re = c(0,diff(cl))
t = time(cl)
mtm = re*250 # The contract value of 250.
mb = 15000+mtm # The initial deposit of 15000.
vm = ifelse(mb>0,0,mtm)
d = data.frame(t,cl,re,mtm,mb,vm);d
but I cannot do the last two columns.Any help ?
You may do the following
library(tidyverse)
initial_deposit = 15000
contract_value = 250
closing_stock_indices = c(1000, 1002, 994, 998, 997)
(seq_along(closing_stock_indices) - 1) %>%
as.data.frame() %>%
setNames('Day') %>%
mutate(Closing_SI = closing_stock_indices,
Daily_change = c(0, diff(Closing_SI)),
Marking_to_market = contract_value * Daily_change,
Margin_balance = accumulate(Marking_to_market[-1], .init = initial_deposit,
~ if (.x >= initial_deposit) .x + .y else initial_deposit + .y),
Variation_Margin = -1 * pmin(Margin_balance - initial_deposit, 0),
REquired_Deposit = c(initial_deposit, Variation_Margin[-n()]))
Day Closing_SI Daily_change Marking_to_market Margin_balance Variation_Margin REquired_Deposit
1 0 1000 0 0 15000 0 15000
2 1 1002 2 500 15500 0 0
3 2 994 -8 -2000 13500 1500 0
4 3 998 4 1000 16000 0 1500
5 4 997 -1 -250 15750 0 0
Check on another vector
initial_deposit = 15000
contract_value = 250
closing_stock_indices = c(1000, 1002, 994, 998, 997, 990, 1000)
(seq_along(closing_stock_indices) - 1) %>%
as.data.frame() %>%
setNames('Day') %>%
mutate(Closing_SI = closing_stock_indices,
Daily_change = c(0, diff(Closing_SI)),
Marking_to_market = contract_value * Daily_change,
Margin_balance = accumulate(Marking_to_market[-1], .init = initial_deposit,
~ if (.x >= initial_deposit) .x + .y else initial_deposit + .y),
Variation_Margin = -1 * pmin(Margin_balance - initial_deposit, 0),
REquired_Deposit = c(initial_deposit, Variation_Margin[-n()]))
Day Closing_SI Daily_change Marking_to_market Margin_balance Variation_Margin REquired_Deposit
1 0 1000 0 0 15000 0 15000
2 1 1002 2 500 15500 0 0
3 2 994 -8 -2000 13500 1500 0
4 3 998 4 1000 16000 0 1500
5 4 997 -1 -250 15750 0 0
6 5 990 -7 -1750 14000 1000 0
7 6 1000 10 2500 17500 0 1000

Applying a function for multiple groups using dplyr

I have some data for multiple location and year
big.data <- data.frame(loc.id = rep(1:3, each = 10*3),
year = rep(rep(1981:1983, each = 10),times = 3),
day = rep(1:10, times = 3*3),
CN = rep(c(50,55,58), each = 10*3),
top.FC = rep(c(72,76,80),each = 10*3),
DC = rep(c(0.02,0.5,0.8), each = 10*3),
WAT0 = rep(c(20,22,26), each = 10*3),
Precp = sample(1:100,90, replace = T),
ETo = sample(1:10,90, replace = T))
I have a function: water.model which uses a second function internally called water.update
water.model <- function(dat){
top.FC <- unique(dat$top.FC)
dat$WAT <- -9.9
dat$RO <- -9.9
dat$DR <- -9.9
dat$WAT[1] <- top.FC/2 # WAT.i is a constant
dat$RO[1] <- NA
dat$DR[1] <- NA
for(d in 1:(nrow(dat)-1)){
dat[d + 1,10:12] <- water.update(WAT0 = dat$WAT[d],
RAIN.i = dat$Precp[d + 1],
ETo.i = dat$ETo[d + 1],
CN = unique(dat$CN),
DC = unique(dat$DC),
top.FC = unique(dat$top.FC))
}
return(dat)
}
water.update <- function(WAT0, RAIN.i, ETo.i, CN, DC, top.FC){
S = 25400/CN - 254; IA = 0.2*S
if (RAIN.i > IA) { RO = (RAIN.i - 0.2 * S)^2/(RAIN.i + 0.8 * S)
} else {
RO = 0
}
if (WAT0 + RAIN.i - RO > top.FC) {
DR = DC * (WAT0 + RAIN.i - RO - top.FC)
} else {
DR = 0
}
dWAT = RAIN.i - RO - DR - ETo.i
WAT1 = WAT0 + dWAT
WAT1 <- ifelse(WAT1 < 0, 0, WAT1)
return(list(WAT1,RO,DR))
}
If I run the above function for a single location X year
big.data.sub <- big.data[big.data$loc.id == 1 & big.data$year == 1981,]
water.model(big.data.sub)
loc.id year day CN top.FC DC WAT0 Precp ETo WAT RO DR
1 1 1981 1 50 72 0.02 20 52 5 36.0000 NA NA
2 1 1981 2 50 72 0.02 20 12 9 39.0000 0.0000000 0.000000
3 1 1981 3 50 72 0.02 20 3 2 40.0000 0.0000000 0.000000
4 1 1981 4 50 72 0.02 20 81 9 107.8750 3.2091485 0.915817
5 1 1981 5 50 72 0.02 20 37 10 133.4175 0.0000000 1.457501
6 1 1981 6 50 72 0.02 20 61 7 184.5833 0.3937926 2.440475
7 1 1981 7 50 72 0.02 20 14 10 186.0516 0.0000000 2.531665
8 1 1981 8 50 72 0.02 20 9 6 186.5906 0.0000000 2.461032
9 1 1981 9 50 72 0.02 20 77 9 248.3579 2.4498216 3.782815
10 1 1981 10 50 72 0.02 20 18 6 256.4708 0.0000000 3.887159
How do I run this for all location and year?
big.data %>% group_by(loc.id, year) %>% # apply my function here.
My final data should look like the above with three new columns called WAT, RO and DR which are generated when the function is run.
We can split the data and apply the water.model by looping over the list with map
library(tidyverse)
split(big.data, big.data[c('loc.id', 'year')], drop = TRUE) %>%
map_df(water.model)
Or apply the function within do after group_by
big.data %>%
group_by(loc.id, year) %>%
do(data.frame(water.model(.)))

R - ggplot - Error in function, arguments imply differing number of rows: 25, 0

I'm having trouble plotting fixed effects from an lmer model.
library(ggplot2)
library(lme4)
library(lmerTest)
library(effects)
data(diamonds)
m1 <- lmer(carat ~ price * depth + (1 | cut), diamonds)
summary(m1)
ee <- Effect(c("price", "depth"), m1)
ggplot(data.frame(ee), aes(price, fit, color = cut)) + geom_line()
When I use ggplot I get this error:
Don't know how to automatically pick scale for object of type
function. Defaulting to continuous. Error in (function (..., row.names
= NULL, check.rows = FALSE, check.names = TRUE, : arguments imply differing number of rows: 25, 0
but a simple plot(ee) yields 5 tiled plots:
A different model yields a plot:
m3 <- lmer(price ~ depth * clarity + (1 | cut), diamonds)
summary(m3)
eg <- Effect(c("depth", "clarity"), m3)
ggplot(as.data.frame(eg), aes(depth, fit, color = clarity)) + geom_line()
There does not appear to be a mismatch in the number of rows per column:
> as.data.frame(ee)
price depth fit se lower upper
1 330 40 0.4618286 0.04227714 0.3789651 0.5446922
2 5000 40 0.7931920 0.04074246 0.7133365 0.8730476
3 9600 40 1.1195885 0.04366662 1.0340016 1.2051754
4 14000 40 1.4317938 0.04988618 1.3340165 1.5295711
5 19000 40 1.7865726 0.05966749 1.6696239 1.9035214
6 330 50 0.4566107 0.03977778 0.3786459 0.5345754
7 5000 50 0.8690398 0.03930531 0.7920010 0.9460785
8 9600 50 1.2752869 0.04021511 1.1964649 1.3541088
9 14000 50 1.6638710 0.04228263 1.5809968 1.7467453
10 19000 50 2.1054440 0.04584704 2.0155834 2.1953045
11 330 60 0.4513927 0.03870323 0.3755341 0.5272513
12 5000 60 0.9448875 0.03868697 0.8690608 1.0207143
13 9600 60 1.4309852 0.03871997 1.3550938 1.5068767
14 14000 60 1.8959482 0.03879694 1.8199059 1.9719906
15 19000 60 2.4243153 0.03893796 2.3479966 2.5006340
16 330 70 0.4461747 0.03917094 0.3693994 0.5229501
17 5000 70 1.0207353 0.03892648 0.9444391 1.0970315
18 9600 70 1.5866836 0.03940454 1.5094504 1.6639168
19 14000 70 2.1280255 0.04050652 2.0486324 2.2074186
20 19000 70 2.7431867 0.04245996 2.6599648 2.8264085
21 330 80 0.4409568 0.04112833 0.3603449 0.5215686
22 5000 80 1.0965831 0.04000842 1.0181662 1.1749999
23 9600 80 1.7423820 0.04216278 1.6597426 1.8250214
24 14000 80 2.3601027 0.04684598 2.2682842 2.4519212
25 19000 80 3.0620580 0.05442428 2.9553860 3.1687300
What causes this error?

if statement and mutate

EMPLTOT_N FIRMTOT average min
12289593 4511051 5 1
26841282 1074459 55 10
15867437 81243 300 100
6060684 8761 750 500
52366969 8910 1000 1000
137003 47573 5 1
226987 10372 55 10
81011 507 300 100
23379 52 750 500
13698 42 1000 1000
67014 20397 5 1
My data look like the data above. I want to create a new column EMP using mutate function that:
emp= average*FIRMTOT if EMPLTOT_N/FIRMTOT<min
and emp=EMPLTOT_N if EMPLTOT_N/FIRMTOT>min
In your sample data EMPLTOT_N / FIRMTOT is never less than min, but this should work:
df <- read.table(text = "EMPLTOT_N FIRMTOT average min
12289593 4511051 5 1
26841282 1074459 55 10
15867437 81243 300 100
6060684 8761 750 500
52366969 8910 1000 1000
137003 47573 5 1
226987 10372 55 10
81011 507 300 100
23379 52 750 500
13698 42 1000 1000
67014 20397 5 1", header = TRUE)
library('dplyr')
mutate(df, emp = ifelse(EMPLTOT_N / FIRMTOT < min, average * FIRMTOT, EMPLTOT_N))
In the above if EMPLTOT_N / FIRMTOT == min, emp will be given the value of EMPLTOT_N since you didn't specify what you want to happen in this case.

Resources