R, Cumulative Sum in Reverse - r

Let's say we have two tables:
A table of budgets:
Item Budget
A 900
B 350
C 100
D 0
bDT = structure(list(Item = c("A", "B", "C", "D"), Budget = c(900L,
350L, 100L, 0L)), .Names = c("Item", "Budget"), row.names = c(NA,
-4L), class = "data.frame")
and a table of expected expenses by item per date.
Item Date Expense
A 2017-08-24 850
B 2017-08-18 300
B 2017-08-11 50
C 2017-08-18 50
C 2017-08-11 100
D 2017-08-01 500
expDF = structure(list(Item = c("A", "B", "B", "C", "C", "D"), Date = structure(c(17402,
17396, 17389, 17396, 17389, 17379), class = "Date"), Expense = c(850L,
300L, 50L, 50L, 100L, 500L)), .Names = c("Item", "Date", "Expense"
), row.names = c(NA, -6L), class = "data.frame")
I'm looking to summarize the amount we can spend per item per date like this:
Item Date Spend
A 8/24/2017 850
B 8/18/2017 300
B 8/11/2017 50
C 8/18/2017 50
C 8/11/2017 50
D 8/1/2017 0

This works:
library(data.table)
setDT(bDF); setDT(expDF)
expDF[bDF, on=.(Item), Spending :=
pmin(
Expense,
pmax(
0,
Budget - cumsum(shift(Expense, fill=0))
)
)
, by=.EACHI]
Item Date Expense Spending
1: A 2017-08-24 850 850
2: B 2017-08-18 300 300
3: B 2017-08-11 50 50
4: C 2017-08-18 50 50
5: C 2017-08-11 100 50
6: D 2017-08-01 500 0
How it works
cumsum(shift(Expense, fill = 0)) is prior spending**
max(0, Budget - prior spending) is remaining budget
min(Expense, remaining budget) is current spending
The data.table syntax x[i, on=, j, by=.EACHI] is a join. In this case j takes the form v := expr, which adds a new column to x. See ?data.table for details.
** Well, "prior" in ordering of the table. I'll ignore the OP's weird reversed dates.

Related

How to merge two dataframes based on range value of one table

DF1
SIC Value
350 100
460 500
140 200
290 400
506 450
DF2
SIC1 AREA
100-200 Forest
201-280 Hospital
281-350 Education
351-450 Government
451-550 Land
Note:class of SIC1 is having character,we need to convert to numeric range
i am trying to get the output like below
Desired output:
DF3
SIC Value AREA
350 100 Education
460 500 Land
140 200 Forest
290 400 Education
506 450 Land
i have tried first to convert character class of SIC1 to numeric
then tried to merge,but no luck,can someone guide on this?
An option can be to use tidyr::separate along with sqldf to join both tables on range of values.
library(sqldf)
library(tidyr)
DF2 <- separate(DF2, "SIC1",c("Start","End"), sep = "-")
sqldf("select DF1.*, DF2.AREA from DF1, DF2
WHERE DF1.SIC between DF2.Start AND DF2.End")
# SIC Value AREA
# 1 350 100 Education
# 2 460 500 Lan
# 3 140 200 Forest
# 4 290 400 Education
# 5 506 450 Lan
Data:
DF1 <- read.table(text =
"SIC Value
350 100
460 500
140 200
290 400
506 450",
header = TRUE, stringsAsFactors = FALSE)
DF2 <- read.table(text =
"SIC1 AREA
100-200 Forest
201-280 Hospital
281-350 Education
351-450 Government
451-550 Lan",
header = TRUE, stringsAsFactors = FALSE)
We could do a non-equi join. Split (tstrsplit) the 'SIC1' column in 'DF2' to numeric columns and then do a non-equi join with the first dataset.
library(data.table)
setDT(DF2)[, c('start', 'end') := tstrsplit(SIC1, '-', type.convert = TRUE)]
DF2[, -1, with = FALSE][DF1, on = .(start <= SIC, end >= SIC),
mult = 'last'][, .(SIC = start, Value, AREA)]
# SIC Value AREA
#1: 350 100 Education
#2: 460 500 Land
#3: 140 200 Forest
#4: 290 400 Education
#5: 506 450 Land
Or as #Frank mentioned we can do a rolling join to extract the 'AREA' and update it on the first dataset
setDT(DF1)[, AREA := DF2[DF1, on=.(start = SIC), roll=TRUE, x.AREA]]
data
DF1 <- structure(list(SIC = c(350L, 460L, 140L, 290L, 506L), Value = c(100L,
500L, 200L, 400L, 450L)), .Names = c("SIC", "Value"),
class = "data.frame", row.names = c(NA, -5L))
DF2 <- structure(list(SIC1 = c("100-200", "201-280", "281-350", "351-450",
"451-550"), AREA = c("Forest", "Hospital", "Education", "Government",
"Land")), .Names = c("SIC1", "AREA"), class = "data.frame",
row.names = c(NA, -5L))

R lapply update list of data.tables with list - no such index at level 1

I am trying to update a list of date.tables with a list, that seems like it should work as it does in this example:
set.seed(1965)
dt_lst <- list(dt1 <- data.table(a = rnorm(1:4),
b = c(4,3,2,1)), dt2 <- data.table(c = rnorm(1:5),
d = letters[1:5]))
> dt_lst
[[1]]
a b
1: 0.8428429 4
2: 0.2958355 3
3: -1.0520980 2
4: 0.9628192 1
[[2]]
c d
1: -0.05033855 a
2: -0.94065157 b
3: 1.20459624 c
4: -0.47791557 d
5: -0.30362496 e
Now a list for the update (someone said dt1 was group 1 and dt2 was group2 and group needed to be in the results):
group1 <- list(1,2)
And lapply update:
dt_lst_tst <- lapply(seq_along(dt_lst),
function(x)
dt_lst[[x]][, group:= group1[[x]]])
> dt_lst_tst
[[1]]
a b group
1: 0.8428429 4 1
2: 0.2958355 3 1
3: -1.0520980 2 1
4: 0.9628192 1 1
[[2]]
c d group
1: -0.05033855 a 2
2: -0.94065157 b 2
3: 1.20459624 c 2
4: -0.47791557 d 2
5: -0.30362496 e 2
Perfect, and characteristic of my data where I never know how big a data.table I'll have (nrows) nor which 'group' it is supposed to be in
until after the fact, hence updating.
So now, with a very small amount of my data:
> dput(combine_sub1)
list(structure(list(smp = 1:4, x = c(491, 491, 491, 491), y = c(798,
798, 798, 798)), .Names = c("smp", "x", "y"), class = c("data.table",
"data.frame"), row.names = c(NA, -4L), .internal.selfref = <pointer:
0x2b859d8>),
structure(list(smp = 1:6, x = c(650, 650, 650, 650, 650,
650), y = c(437, 437, 437, 437, 437, 437)), .Names = c("smp",
"x", "y"), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x2b859d8>), structure(list(
smp = 1:5, x = c(480, 485, 540, 572, 589), y = c(462,
462, 455, 451, 450)), .Names = c("smp", "x", "y"), class =
c("data.table",
"data.frame"), row.names = c(NA, -5L), .internal.selfref = <pointer:
0x2b859d8>))
> combine_sub1
[[1]]
smp x y
1: 1 491 798
2: 2 491 798
3: 3 491 798
4: 4 491 798
[[2]]
smp x y
1: 1 650 437
2: 2 650 437
3: 3 650 437
4: 4 650 437
5: 5 650 437
6: 6 650 437
[[3]]
smp x y
1: 1 480 462
2: 2 485 462
3: 3 540 455
4: 4 572 451
5: 5 589 450
group3_lst <- list(1,2,3)
> group3_lst
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
And using similar lapply as above:
> combine_sub1_tst <- lapply(seq_along(combine_sub1),
+ function(x)
+ combine_sub1[[x]][ , group := group3_lst[[x]]])
Error in group3_lst[[x]] : no such index at level 1
And I can't figure out why the difference. Any help appreciated.
The problem seems to have been caused by the use of variable x in the function call and it interferes with the x column in the data.tables in group3_lst. Use a difference variable name that's not in those data.tables it will work fine, e.g. use i: combine_sub1_tst <- lapply(seq_along(combine_sub1), function(i) combine_sub1[[i]][ , group := group3_lst[[i]]])

How to Matching conditions in R Dataframe

I am having dataframe which looks like:
Count_ID Stats Date
123 A 10-01-2017
123 A 12-01-2017
123 B 15-01-2017
456 B 18-01-2017
456 C 17-01-2017
789 A 20-01-2017
486 A 25-01-2017
486 A 28-01-2017
I want to add a Status & Count column in Dataframe which give me below mention status.
Match oldest Count_ID as per date having Stats as "A" compare if any Count_ID with same value (i.e 123) is having date > than that Previous same Count_ID having Stats as "A", than show it "False" in status column.
If there are multiple Count_ID with same value (i.e 123) than check Stats "A" than match any same Count_ID with Stats other than "A" or "A" are having date > than of those having Stats "A", than show status as "False"
If there are multiple same Count_ID (i.e 123) having Stats as "A" with date difference <30 days (w.r.t the previous Count_ID as per Date) show status as "False-B".
In count column, show difference of days between same Count_ID created from previous Count_ID.
Where no condition show it as "-".
Required Output:
Count_ID Stats Date Status Count
123 A 10-01-2017 False-B 0
123 A 12-01-2017 False-B 2
123 B 15-01-2017 False 3
456 B 18-01-2017 - 0
456 C 17-01-2017 False 1
789 A 20-01-2017 - 0
486 A 25-01-2017 False-B 0
486 A 28-01-2017 False-B 3
Dput:
structure(list(Count_ID = c(123L, 123L, 123L, 456L, 456L, 789L,
486L, 486L), Stats = c("A", "A", "B", "B", "C", "A", "A", "A"
), Date = c("10/01/2017", "12/01/2017", "15/01/2017", "18/01/2017",
"17/01/2017", "20/01/2017", "25/01/2017", "28/01/2017")), .Names = c("Count_ID",
"Stats", "Date"), class = "data.frame", row.names = c(NA, -8L
))
If I understood the question correctly then you can try this
library(dplyr)
df %>%
group_by(Count_ID) %>%
mutate(Count = c(0, abs(as.numeric(diff(Date)))),
Status = ifelse((Date==min(Date[Stats=='A']) | Date>min(Date[Stats=='A'])) & (n()>1), "FALSE", "-")) %>%
mutate(Status = ifelse(Stats=='A' & Count < 30 & Status=='FALSE', 'FALSE-B', Status)) %>%
data.frame()
Note that the condition for "row item 5" is not clear so I have left it as -. I am not sure how you want to go about this row as there is no Stats = A for Count_ID = 456.
Output is:
Count_ID Stats Date Count Status
1 123 A 2017-01-10 0 FALSE-B
2 123 A 2017-01-12 2 FALSE-B
3 123 B 2017-01-15 3 FALSE
4 456 B 2017-01-18 0 -
5 456 C 2017-01-17 1 -
6 789 A 2017-01-20 0 -
7 486 A 2017-01-25 0 FALSE-B
8 486 A 2017-01-28 3 FALSE-B
Sample data:
df <- structure(list(Count_ID = c(123L, 123L, 123L, 456L, 456L, 789L,
486L, 486L), Stats = c("A", "A", "B", "B", "C", "A", "A", "A"
), Date = structure(c(17176, 17178, 17181, 17184, 17183, 17186,
17191, 17194), class = "Date")), .Names = c("Count_ID", "Stats",
"Date"), row.names = c(NA, -8L), class = "data.frame")

R conditional filter

I have two dataframes in R.
Release dataframe
Date Product
2011-01-13 A
2011-02-15 A
2011-01-14 B
2011-02-15 B
Casedata dataframe
Date Product Numberofcases
2011-01-13 A 50
2011-01-12 A 20
2011-01-11 A 100
2011-01-10 A 120
2011-01-09 A 150
2011-01-08 A 180
2011-01-07 A 200
2011-01-06 A 220
2011-01-23 A 500
2011-01-31 A 450
2011-02-08 A 50
2011-02-09 A 1000
2011-02-10 A 1200
2011-02-11 A 1500
2011-02-12 A 1800
2011-02-13 A 2000
2011-02-14 A 2200
2011-02-15 A 5000
2011-01-31 A 4500
:::
:::
2011-01-15 B 1000
My requirement is for every product release date(from release dataframe), I should obtain the corresponding sum(numberofcases) one week prior to the release date(in the casedata dataframe). ie., for product A and release date 2011-01-13, it should be sum of all cases in the previous week (from 2011-01-06 to 2011-01-13) ie., (50+20+100+120+150+180+200+220)
Releasedate Product Numberofcasesoneweekpriorrelease
2011-01-13 A 1040
2011-02-15 A 19250
2011-01-14 B ...
2011-02-15 B ...
What I have tried :
beforerelease <- sqldf("select product,release.date_release,sum(numberofcasescreated) as numberofcasesbeforerelease from release left join casedata using (product) where date_case>=weekbeforerelease and date_case<=date_release group by product,date_release")
finaldf <- merge(beforerelease,afterelease,by=c("monthyear","product"))
I am struck and it is not giving me the expected outcome. Can somebody help me ?
Using the recently implemented non-equi joins feature in the current development version of data.table, v1.9.7, this can be done simply as (assuming all Date columns are of class Date):
require(data.table)
setDT(release)[, Date2 := Date-7L]
setDT(casedata)[release, on = .(Product, Date >= Date2, Date <= Date),
.(count = sum(Numberofcases)), by = .EACHI]
# Product Date Date count
# 1: A 2011-01-06 2011-01-13 1040
# 2: A 2011-02-08 2011-02-15 14750
# 3: B 2011-01-07 2011-01-14 NA
# 4: B 2011-02-08 2011-02-15 NA
With the data.table package you could follow two approaches:
1) Using the foverlaps functionality:
library(data.table)
# convert to a 'data.table' with 'setDT()'
# and create a release window
setDT(release)[, `:=` (bdat = as.Date(Date)-7, edat = as.Date(Date))][, Date := NULL]
# convert to a 'data.table' and create a 2nd date column for use with 'foverlaps
setDT(casedata)[, `:=` (bdat = as.Date(Date), edat = as.Date(Date))][, Date := NULL]
# set the key for use in 'foverlaps'
setkey(release, Product, bdat, edat)
setkey(casedata, Product, bdat, edat)
# do an overlap join ('foverlaps') and summarise
foverlaps(casedata, release, type = 'within', nomatch = 0L)[, .(cases.prior.release = sum(Numberofcases)), by = .(Product, release.date = edat)]
which gives:
Product release.date cases.prior.release
1: A 2011-01-13 1040
2: A 2011-02-15 14750
2) Using the standard join functionality of data.table:
setDT(release)
setDT(casedata)
casedata[, Date := as.Date(Date)
][release[, `:=` (Date = as.Date(Date), idx = .I)
][, .(dates = seq(Date-7,Date,'day')), by = .(Product,idx)],
on = c('Product', Date = 'dates'), nomatch = 0L
][, .(releasedate = Date[.N], cases.prior.release = sum(Numberofcases)), by = .(Product,idx)
][, idx := NULL]
which will get you the same result.
Used data:
release <- structure(list(Date = c("2011-01-13", "2011-02-15", "2011-01-14", "2011-02-15"),
Product = c("A", "A", "B", "B")),
.Names = c("Date", "Product"), class = "data.frame", row.names = c(NA, -4L))
casedata <- structure(list(Date = c("2011-01-13", "2011-01-12", "2011-01-11", "2011-01-10", "2011-01-09", "2011-01-08", "2011-01-07", "2011-01-06", "2011-01-23", "2011-01-31", "2011-02-08", "2011-02-09", "2011-02-10", "2011-02-11", "2011-02-12", "2011-02-13", "2011-02-14", "2011-02-15", "2011-01-31"),
Product = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"),
Numberofcases = c(50L, 20L, 100L, 120L, 150L, 180L, 200L, 220L, 500L, 450L, 50L, 1000L, 1200L, 1500L, 1800L, 2000L, 2200L, 5000L, 4500L)),
.Names = c("Date", "Product", "Numberofcases"), class = "data.frame", row.names = c(NA, -19L))

Number of active items in bins

I have a list of items with 2 dates (start date and end date) and duration in days (end date - start date). I want to cut them into bins to show the number of "active items" in each bin, i.e. if start date <= bin date and end date > bin date, the item should be counted in the bin.
Item StartDate EndDate Duration
Machine1 2005/01/21 2011/03/29 2258
Machine2 2004/05/12 2012/05/08 2918
Machine3 2004/10/15 2005/09/10 330
Machine4 2004/08/30 2011/08/02 2528
Machine5 2005/06/06 2010/12/03 2006
Machine6 2004/05/11 2007/03/17 1040
Machine7 2005/08/09 2011/05/30 2120
Machine8 2005/01/06 2012/06/07 2709
Machine9 2005/06/13 2008/08/28 1172
Machine10 2005/06/28 2010/04/08 1745
Machine11 2004/11/09 2007/05/14 916
Machine12 2005/05/26 2012/09/16 2670
Machine13 2004/05/28 2009/06/09 1838
Machine14 2005/01/06 2012/05/25 2696
Machine15 2005/08/20 2012/02/11 2366
Machine16 2004/08/02 2011/10/23 2638
Machine17 2004/08/10 2009/03/15 1678
Machine18 2005/05/08 2006/04/17 344
Machine19 2005/08/26 2006/07/24 332
Machine20 2004/03/30 2006/05/07 768
Bin counts that I want to produce:
2004/01/01 0
2005/01/01 9
2006/01/01 19
2007/01/01 16
2008/01/01 14
2009/01/01 13
2010/01/01 11
2011/01/01 9
2012/01/01 5
2013/01/01 0
As you can see, the totals of the bins do not add up to the total number of items, as you would expect with a traditional histogram.
I can do this with some verbose code, but I'm sure there must be some short way, using cut or split. I'm aware that the bin labels are off by one according to my definition above, but let's ignore that for now.
A way is:
#turn dates to actual dates
DF$StartDate <- as.Date(DF$StartDate, "%Y/%m/%d")
DF$EndDate <- as.Date(DF$EndDate, "%Y/%m/%d")
binDF[,1] <- as.Date(binDF[,1], "%Y/%m/%d")
counts <- colSums(sapply(binDF[,1], function(x) {DF$StartDate <= x & DF$EndDate > x}))
#> counts
#[1] 0 9 19 16 14 13 11 9 5 0
And as a complete dataframe:
resDF <- data.frame(dates = binDF[,1], counts = counts, stringsAsFactors = F)
#> resDF
# dates counts
#1 2004-01-01 0
#2 2005-01-01 9
#3 2006-01-01 19
#4 2007-01-01 16
#5 2008-01-01 14
#6 2009-01-01 13
#7 2010-01-01 11
#8 2011-01-01 9
#9 2012-01-01 5
#10 2013-01-01 0
The dataframes DF and binDF:
DF <- structure(list(Item = c("Machine1", "Machine2", "Machine3", "Machine4",
"Machine5", "Machine6", "Machine7", "Machine8", "Machine9", "Machine10",
"Machine11", "Machine12", "Machine13", "Machine14", "Machine15",
"Machine16", "Machine17", "Machine18", "Machine19", "Machine20"
), StartDate = c("2005/01/21", "2004/05/12", "2004/10/15", "2004/08/30",
"2005/06/06", "2004/05/11", "2005/08/09", "2005/01/06", "2005/06/13",
"2005/06/28", "2004/11/09", "2005/05/26", "2004/05/28", "2005/01/06",
"2005/08/20", "2004/08/02", "2004/08/10", "2005/05/08", "2005/08/26",
"2004/03/30"), EndDate = c("2011/03/29", "2012/05/08", "2005/09/10",
"2011/08/02", "2010/12/03", "2007/03/17", "2011/05/30", "2012/06/07",
"2008/08/28", "2010/04/08", "2007/05/14", "2012/09/16", "2009/06/09",
"2012/05/25", "2012/02/11", "2011/10/23", "2009/03/15", "2006/04/17",
"2006/07/24", "2006/05/07"), Duration = c(2258L, 2918L, 330L,
2528L, 2006L, 1040L, 2120L, 2709L, 1172L, 1745L, 916L, 2670L,
1838L, 2696L, 2366L, 2638L, 1678L, 344L, 332L, 768L)), .Names = c("Item",
"StartDate", "EndDate", "Duration"), class = "data.frame", row.names = c(NA,
-20L))
binDF <- structure(list(V1 = c("2004/01/01", "2005/01/01", "2006/01/01",
"2007/01/01", "2008/01/01", "2009/01/01", "2010/01/01", "2011/01/01",
"2012/01/01", "2013/01/01"), V2 = c(0L, 9L, 19L, 16L, 14L, 13L,
11L, 9L, 5L, 0L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-10L))

Resources