Hi ladies and gentlemen,
here is my input data:
Input <- data.table(
id = c("x1","x2","x3"),
from = c("2020-01-01","2020-02-15","2019-01-15"),
to = c("2020-12-31","2021-02-14","2021-02-14"),
Amount = c(120, 120, 240)
)
Basically, I would like to split the amount on a monthly basis. Can you suggest a fast and elegant way to do it?
Here is what I would like to get for one of the ids (the logic is the same for the other ids).
Output_x2 <- data.table(
id = c("x2"),
Period = c(2021),
to = c(202002, 202003, 202004, 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102),
Amount_ProRata = c(5, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,5)
)
A possible solution:
library(data.table)
library(tidyverse)
library(lubridate)
spread <- function(x, y, z, value) {
  # number of months spanned by the interval, rounded to the nearest whole month
  nMonths <- as.duration(y %--% z) %>%
    as.numeric("months") %>%
    round(0)
  # rough month fractions at the two ends: day of month over days in that month,
  # rounded to 1 decimal
  frac1 <- y %>% {day(.) / days_in_month(.)} %>% round(1)
  frac2 <- z %>% {day(.) / days_in_month(.)} %>% round(1)
  # number of full months in between
  wholem <- round(nMonths - frac1 - frac2, 0)
  # split the value proportionally across the partial and full months,
  # dropping any zero-sized slices
  splita <- (value / nMonths) * c(frac1, rep(1, wholem), frac2)
  splita <- splita[splita > 0]
  # one calendar month per slice
  months <- seq(ymd(y), ymd(z) + 2, by = "months")
  months <- months[1:length(splita)]
  data.frame(id = x,
             Period = year(months),
             to = paste0(year(months), sprintf("%02d", month(months))),
             Amount_ProRata = splita)
}
map_dfr(1:nrow(Input),
~ spread(Input$id[.x], Input$from[.x], Input$to[.x], Input$Amount[.x]))
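For comparison, here is a data.table-only sketch that prorates by calendar days rather than by rounded month fractions, so the monthly figures differ slightly from the example output above. The intermediate names DT and daily are just for illustration.
library(data.table)
# Work on a copy so the original Input is not modified by reference;
# from/to are characters in Input, so convert them to dates first
DT <- copy(Input)[, `:=`(from = as.IDate(from), to = as.IDate(to))]
# One row per day, each day carrying an equal share of the Amount
daily <- DT[, .(date = seq(from, to, by = "day"),
                share = Amount / as.numeric(to - from + 1)), by = id]
# Sum the daily shares by calendar month
daily[, .(Amount_ProRata = sum(share)),
      keyby = .(id, Period = year(date), to = format(date, "%Y%m"))]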
Related
In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same count of TRUE and FALSE values; in the above data frame, 5 TRUEs and 5 FALSEs in each of the 3 categories). How can I do this?
You can sample a vector of TRUE and FALSE values without replacement so you have a randomized and balanced column in your data frame.
sample(rep(c(TRUE, FALSE), each = 5), 10, replace = FALSE)
Based on Yacine Hajji's answer:
addRandomBool <- function(df, p){
  n <- ceiling(nrow(df) * p)  # number of TRUE values in this group
  df$bool <- sample(rep(c(TRUE, FALSE), times = c(n, nrow(df) - n)))
  df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where the parameter p determines the proportion of TRUE values.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to sample and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but it still assumes an (approximately) even split of TRUE and FALSE within a group. It can accommodate variable group sizes, and even groups with an odd number of records (though it will favor FALSE when a group has an odd number of records).
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by - nest, and then use select to set the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place:
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.
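For example, here is one more base R variant (a sketch, assuming df still holds the original 30-row data frame from the question, since the base R example above overwrote df with a list of pieces): ave() runs the sampler within each category and writes the result straight back in the original row order, so nothing has to be split or re-bound.
set.seed(pi)
# ave() splits the template vector by category, applies FUN to each piece,
# and reassembles the pieces in the original row order
df$new_col <- ave(rep(FALSE, nrow(df)), df$category,
                  FUN = function(x) sample(rep(c(FALSE, TRUE), length.out = length(x))))
table(df$category, df$new_col)  # 5 FALSE and 5 TRUE per category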
I have a data frame with many rows. Here it is:
library(lubridate)
date_ <- date(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
hour_ <- hour(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
game1 <- sort(round(runif(length(date_), 10, 50), 0), decreasing = TRUE)
game2 <- sort(round(runif(length(date_), 20, 100), 0), decreasing = TRUE)
game3 <- sort(round(runif(length(date_), 30, 150), 0), decreasing = TRUE)
game4 <- sort(round(runif(length(date_), 40, 200), 0), decreasing = TRUE)
game_data <- data.frame(date_, hour_, game1, game2, game3, game4)
I just want to subset game_data to get the last 6 months of data. How do I get it?
The last 6 months of data, counted from the max date in the data?
You can try -
library(lubridate)
library(dplyr)
result <- subset(game_data, date_ > max(date_) %m-% months(6))
Or with dplyr -
result <- game_data %>% filter(date_ > max(date_) %m-% months(6))
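For reference, a quick sketch of what the cutoff works out to for the example data generated above (where the latest date is 2021-03-31); %m-% is lubridate's month-aware subtraction, so it never rolls past the end of a shorter month. The names cutoff and last_six are just for illustration.
library(lubridate)
cutoff <- max(game_data$date_) %m-% months(6)      # 2021-03-31 minus 6 months -> 2020-09-30
last_six <- game_data[game_data$date_ > cutoff, ]  # base R equivalent of the subset()/filter() calls
range(last_six$date_)                              # 2020-10-01 ... 2021-03-31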
I'm looking to run a function on each group of a dataset and bind the output to the existing set inside the tidyverse environment. After the example data, I've added how I do it right now, which requires splitting the set and running lapply (I want to move everything towards the tidyverse).
library(TTR)
test = data.frame('high'=rnorm(100,10,0.1),'low'=rnorm(100,0,0.1), 'close'=rnorm(100,5,0.1))
stoch(test,
nFastK = 14, nFastD = 3, nSlowD = 3,
maType=list(list(SMA), list(SMA), list(SMA)),
bounded = TRUE,
smooth = 1)
Here is how it used to be done with lists:
library(dplyr)  # for %>%, select() and bind_rows()
get_stoch = function(dat_) {
  stochs = stoch(dat_ %>% select(-ticker), nFastK = 14, nFastD = 3, nSlowD = 3,
                 maType = list(list(SMA), list(SMA), list(SMA)),
                 bounded = TRUE, smooth = 1)
  cbind(dat_, stochs)
}
test = data.frame('ticker' = c(rep('A', 50), rep('B', 50)),
                  'high' = rnorm(100, 10, 0.1), 'low' = rnorm(100, 0, 0.1),
                  'close' = rnorm(100, 5, 0.1)) %>%
  split(., .$ticker) %>%
  lapply(., get_stoch) %>%
  bind_rows
If you want to translate your code to tidyverse you can use:
library(dplyr)
library(purrr)
df %>% group_split(ticker) %>% map_dfr(get_stoch)
You can use plyr::ddply to run a split-apply-bind method in tidyverse-like language:
df <- data.frame(ticker = c(rep('A', 50), rep('B', 50)),
high = rnorm(100, 10, 0.1),
low = rnorm(100, 0, 0.1),
close = rnorm(100, 5, 0.1))
test1 <- df %>%
split(.,.$ticker) %>%
lapply(.,get_stoch) %>%
bind_rows
test2 <- df %>%
  plyr::ddply("ticker", get_stoch)
identical(test1, test2)
#> [1] TRUE
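Another tidyverse-style route is dplyr's group_modify() (a sketch, assuming dplyr >= 1.0 and TTR loaded as above; test3 is just an illustrative name). It hands each group to the function with the grouping column already removed, so there is no need for select(-ticker) inside the helper:
library(dplyr)
library(TTR)
test3 <- df %>%
  group_by(ticker) %>%
  group_modify(function(piece, key) {
    # `piece` holds only high/low/close; `key` is a one-row tibble with the ticker
    s <- stoch(piece, nFastK = 14, nFastD = 3, nSlowD = 3,
               maType = list(list(SMA), list(SMA), list(SMA)),
               bounded = TRUE, smooth = 1)
    bind_cols(piece, as.data.frame(s))
  }) %>%
  ungroup()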
I have a df such as:
name <- rep(c("a","b","c"),5)
QV.low <- runif(15, 2, 5)
QV.med <- runif(15, 5.0, 7.5)
QV.high <- runif(15, 7.5, 10)
df <- data.frame(name, QV.low, QV.med, QV.high)  # data.frame() keeps the QV columns numeric
and a list of names:
name.list <- c("a","b")
I want to do an operation, eg:
df %>%
subset(name %in% name.list) %>%
summarise(.,sum = sum(QV.low))
but I want to do this for each QV. variable via a loop.
I tried:
QV.list <- c("QV.low", "QV.med", "QV.high")
for(qv in 1:length(QV.list)){
QV <- noquote(QV.list[qv])
print(QV)
df %>%
subset(name %in% name.list) %>%
summarise(.,sum = sum(QV))
}
But it does not work.
How can I "extract" the character value from QV.list in order to use it as a df variable later?
You need to have at least 3 different names in namecol, otherwise namecol %in% name.list1 is useless. If there's no filter and no pipe, there's no need for a loop: a simple colSums(df[,-1]) will do the job.
library(tidyverse)
QV.low <- runif(10, 2, 5)
QV.med <- runif(10, 5.0, 7.5)
QV.high <- runif(10, 7.5, 10)
namecol <- sample(c("a","b", "c"), 10, replace = T)
df <- data.frame(namecol, QV.low, QV.med,QV.high)
df
name.list1 <- c("a","b") # select some names
QV.list <- c("QV.low", "QV.med", "QV.high")
for(i in QV.list){
QV <- noquote(i)
print(QV)
qv <- sym(i)
print(df %>%
filter(namecol %in% name.list1) %>%
summarise(sum = sum(!!qv)))
}
will give you
[1] QV.low
sum
1 29.093
[1] QV.med
sum
1 61.07034
[1] QV.high
sum
1 86.02611
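A variant of the same loop (a sketch, assuming a dplyr/rlang version that supports the .data pronoun) uses .data[[i]] instead of sym() and !!:
for (i in QV.list) {
  print(i)  # which column we are summing
  print(df %>%
          filter(namecol %in% name.list1) %>%
          summarise(sum = sum(.data[[i]])))
}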
If I understood your problem correctly, you can solve it by indexing the data frame with the column name as a string:
for (name in QV.list) {
  sub <- subset(df, namecol %in% name.list1)  # keep only the selected names
  print(sum(sub[, name]))                     # string indexing works in base R
}
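If the loop itself is not a requirement, a loop-free variant (a sketch, assuming dplyr >= 1.0) sums all three QV columns in one call with across():
library(dplyr)
df %>%
  filter(namecol %in% name.list1) %>%
  summarise(across(all_of(QV.list), sum))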
Suppose I have the following:
myDF <- cbind.data.frame("Id" = rep(1:5, each = 4), values = c(rnorm(4,0,1), rnorm(4, 10, 1), rnorm(4, 20,1 ), rnorm(4, 30,1), rnorm(4, 40,1)))
idVector <- sample(1:5, size = 5, replace = TRUE)
If my idVector is c(4, 4, 3, 2, 1), I want to pull all the rows with Id 4, then Id 4 again, then Id 3, then Id 2, then Id 1.
I can do it using the following:
do.call("rbind", lapply(idVector, function(x, currentDF){
currentDF[currentDF$Id == x,]}
, myDF))
Is there a neater way to do it using dplyr or plyr?
With dplyr
library(dplyr)
left_join(data.frame(Id=idVector), myDF)
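A small refinement (a sketch): spelling out the join key documents the intent and avoids the automatic join-column message; the result is the rows of myDF repeated in the order of idVector.
library(dplyr)
left_join(data.frame(Id = idVector), myDF, by = "Id")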