My dataframe looks like this and I want two separate cumulative columns, one for fund A and the other for fund B
Name Event SalesAmount Fund Cum-A(desired) Cum-B(desired)
John Webinar NA NA NA NA
John Sale 1000 A 1000 NA
John Sale 2000 B 1000 2000
John Sale 3000 A 4000 2000
John Email NA NA 4000 2000
Tom Webinar NA NA NA NA
Tom Sale 1000 A 1000 NA
Tom Sale 2000 B 1000 2000
Tom Sale 3000 A 4000 2000
Tom Email NA NA 4000 2000
I have tried:
df<-
df %>%
group_by(Name)%>%
mutate(Cum-A = as.numeric(ifelse(Fund=="A",cumsum(SalesAmount),0)))%>%
mutate(Cum-B = as.numeric(ifelse(Fund=="B",cumsum(SalesAmount),0)))
but this is not what I want: it shows the running total of both funds combined, albeit only on the rows where the fund matches.
Kindly help.
How about:
library(dplyr)
# Per-person running totals of SalesAmount, one column per fund. Rows whose
# Fund is NA or belongs to the other fund contribute 0, so the running total
# simply carries over on those rows.
# The pipeline is stored in `result` because the zero-to-NA replacement shown
# further down indexes into `result`; the grouping is kept so the printed
# table still reports "Groups: Name".
result <- d %>%
  group_by(Name) %>%
  mutate(
    cA = cumsum(ifelse(!is.na(Fund) & Fund == "A", SalesAmount, 0)),
    cB = cumsum(ifelse(!is.na(Fund) & Fund == "B", SalesAmount, 0))
  )
result
The output:
Source: local data frame [10 x 8]
Groups: Name
Name Event SalesAmount Fund Cum.A.desired. Cum.B.desired. cA cB
1 John Webinar NA NA NA NA 0 0
2 John Sale 1000 A 1000 NA 1000 0
3 John Sale 2000 B 1000 2000 1000 2000
4 John Sale 3000 A 4000 2000 4000 2000
5 John Email NA NA 4000 2000 4000 2000
6 Tom Webinar NA NA NA NA 0 0
7 Tom Sale 1000 A 1000 NA 1000 0
8 Tom Sale 2000 B 1000 2000 1000 2000
9 Tom Sale 3000 A 4000 2000 4000 2000
10 Tom Email NA NA 4000 2000 4000 2000
Zeroes in the resulting columns can be replaced by NA afterwards if needed:
# A running total that is still 0 means no sale for that fund has happened yet;
# show those entries as NA instead.
result$cA <- replace(result$cA, result$cA == 0, NA)
result$cB <- replace(result$cB, result$cB == 0, NA)
Your input data set:
d <- structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("John", "Tom"), class = "factor"), Event = structure(c(3L, 2L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 1L), .Label = c("Email", "Sale", "Webinar"), class = "factor"), SalesAmount = c(NA, 1000L, 2000L, 3000L, NA, NA, 1000L, 2000L, 3000L, NA), Fund = structure(c(NA, 1L, 2L, 1L, NA, NA, 1L, 2L, 1L, NA), .Label = c("A", "B"), class = "factor"), Cum.A.desired. = c(NA, 1000L, 1000L, 4000L, 4000L, NA, 1000L, 1000L, 4000L, 4000L), Cum.B.desired. = c(NA, NA, 2000L, 2000L, 2000L, NA, NA, 2000L, 2000L, 2000L)), .Names = c("Name", "Event", "SalesAmount", "Fund", "Cum.A.desired.", "Cum.B.desired." ), class = "data.frame", row.names = c(NA, -10L))
Here's an approach generalizing to more funds, using zoo and data.table:
# prep
# library() stops with an error when a package is missing; require() would only
# return FALSE and let the script fail later with a less obvious message.
library(data.table)
library(zoo)
setDT(d) # convert to data.table by reference
d[, Fund := as.character(Fund)] # work with plain strings rather than factor codes
uf <- unique(d[Event == "Sale"]$Fund) # collect set of funds
First, assign cumulative sales on the relevant subset of observations:
# One pass per fund: on that fund's sale rows only, write the per-Name running
# total into a column named "c<fund>" (other rows stay NA).
for (f in uf) {
  fund_col <- paste0("c", f)
  d[Event == "Sale" & Fund == f, (fund_col) := cumsum(SalesAmount), by = Name]
}
Then, carry the last observation forward:
# Fill the gaps in every "c<fund>" column with the last observed value, per
# Name; na.rm = FALSE keeps the leading NAs so the row count is unchanged.
cum_cols <- paste0("c", uf)
d[
  ,
  (cum_cols) := lapply(.SD, na.locf, na.rm = FALSE),
  .SDcols = cum_cols,
  by = Name
]
You can shorten @Marat's answer slightly by rolling it all into a single mutate:
# Running total per fund within each Name, computed in a single mutate().
# `Fund %in% "A"` is FALSE for NA funds, so those rows add 0 to the total;
# totals that are still 0 are then shown as NA.
df %>%
  group_by(Name) %>%
  mutate(
    cA = cumsum(ifelse(Fund %in% "A", SalesAmount, 0)),
    cB = cumsum(ifelse(Fund %in% "B", SalesAmount, 0)),
    cA = replace(cA, cA == 0, NA),
    cB = replace(cB, cB == 0, NA)
  )
Related
I have a dataframe that looks like this:
# A tibble: 9 x 5
# Groups: group [3]
group year value1 value2 value3
<int> <dbl> <int> <int> <int>
1 1 2000 NA 3 4
2 1 2001 8 3 4
3 1 2002 4 3 NA
4 2 2000 NA NA 1
5 2 2001 9 NA 1
6 2 2002 1 NA NA
7 3 2000 NA 5 NA
8 3 2001 9 5 NA
9 3 2002 NA 5 NA
I need a script that returns the years of the first and last non-na value for each column, irrespective of group. Ideally, the output would look like this. Beware the actual dataset is much larger.
start end
value 1 2001 2002
value 2 2000 2002
value 3 2000 2001
We can reshape into 'long' format and then do a group by the 'name' and summarise to get the min and max 'year'
library(dplyr)
library(tidyr)
library(tibble)
# Stack the value columns into name/value pairs, dropping NA values, so that
# every remaining row is a year in which that column was observed.
non_missing <- pivot_longer(
  select(df1, -group),
  cols = starts_with("value"),
  values_drop_na = TRUE
)
# Earliest and latest observed year per original column, with the column
# names moved into the row names.
year_span <- summarise(
  group_by(non_missing, name),
  start = min(year),
  end = max(year)
)
column_to_rownames(year_span, "name")
# start end
#value1 2001 2002
#value2 2000 2002
#value3 2000 2001
Or with melt from data.table
library(data.table)
# Long format with NA values removed, then the year range per original column.
# Note that setDT() converts df1 to a data.table by reference.
long_dt <- melt(setDT(df1), id.vars = c("year", "group"), na.rm = TRUE)
long_dt[, list(start = min(year), end = max(year)), by = variable]
Or we could also make use of summarise_at
# For every value* column, collect range() of the years in which the column is
# non-NA. Each range is wrapped in a list so summarise() can return one row;
# unnest() then expands it to two rows (min, max) and pivot_longer() stacks
# the columns into name/value pairs.
# across() replaces the superseded summarise_at(); inside the lambda, `.x` is
# the current column and `year` is looked up in the data frame.
df1 %>%
  summarise(across(
    starts_with("value"),
    ~ list(range(year[!is.na(.x)]))
  )) %>%
  unnest(everything()) %>%
  pivot_longer(everything())
data
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L),
year = c(2000L, 2001L, 2002L, 2000L, 2001L, 2002L, 2000L,
2001L, 2002L), value1 = c(NA, 8L, 4L, NA, 9L, 1L, NA, 9L,
NA), value2 = c(3L, 3L, 3L, NA, NA, NA, 5L, 5L, 5L), value3 = c(4L,
4L, NA, 1L, 1L, NA, NA, NA, NA)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
A base solution where we find the years of the first and last non-NA values for the columns value1 to value3.
# Earliest and latest year with a non-NA entry, for every value* column.
#
# The years are compared by value (min/max of the matching years), not by row
# position: the rows are ordered by group, so the last non-NA *row* of a column
# is not necessarily its latest year (value1's last non-NA row is 2001, while
# its latest observed year is 2002).
# vapply() with a named numeric template yields plain numeric columns instead
# of list columns; a column with no observations gets NA for both bounds.
value_cols <- grep("^value", names(df1), value = TRUE)
year_span <- vapply(
  value_cols,
  function(col) {
    observed_years <- df1$year[!is.na(df1[[col]])]
    if (length(observed_years) == 0L) {
      return(c(start = NA_real_, end = NA_real_))
    }
    c(start = min(observed_years), end = max(observed_years))
  },
  FUN.VALUE = c(start = 0, end = 0)
)
# vapply() returns bounds in rows and columns in columns; transpose so each
# value column becomes one row with `start` and `end`.
data.frame(t(year_span))
Another data.table option. Not sure if using names(.SD) is recommended but it should scale pretty well
library(data.table)
# Restrict .SD to the value* columns, then for each of them take the smallest
# and largest year among the rows where the column is not NA.
value_cols <- names(df1)[startsWith(names(df1), "value")]
setDT(df1)[
  ,
  list(
    val = names(.SD),
    start = lapply(.SD, function(col) min(year[!is.na(col)])),
    end = lapply(.SD, function(col) max(year[!is.na(col)]))
  ),
  .SDcols = value_cols
]
val start end
1: value1 2001 2002
2: value2 2000 2002
3: value3 2000 2001
I have a long-format balanced data frame (df1) that has 7 columns:
df1 <- structure(list(Product_ID = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3), Product_Category = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
Manufacture_Date = c(1950, 1950, 1950, 1950, 1950, 1960,
1960, 1960, 1960, 1960, 1940, 1940, 1940, 1940, 1940), Control_Date = c(1961L,
1962L, 1963L, 1964L, 1965L, 1961L, 1962L, 1963L, 1964L, 1965L,
1961L, 1962L, 1963L, 1964L, 1965L), Country_Code = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("ABC",
"DEF", "GHI"), class = "factor"), Var1 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Var2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
15L), class = "data.frame")
Each Product_ID in this data set is linked with a unique Product_Category and Country_Code and Manufacture_Date, and is followed over time (Control_Date). Product_Category has two possible values (A or B); Country_Code and Manufacture_Date have 190 and 90 unique values, respectively. There are 400,000 unique Product_ID's, that are followed over a period of 50 years (Control_Date from 1961 to 2010). This means that df1 has 20,000,000 rows. The last two columns of this data frame are NA at the beginning and have to be filled using the data available in another data frame (df2):
df2 <- structure(list(Product_ID = 1:6, Product_Category = structure(c(1L,
2L, 1L, 1L, 1L, 2L), .Label = c("A", "B"), class = "factor"),
Manufacture_Date = c(1950, 1960, 1940, 1950, 1940, 2000),
Country_Code = structure(c(1L, 2L, 3L, 1L, 2L, 3L), .Label = c("ABC",
"DEF", "GHI"), class = "factor"), Year_1961 = c(5, NA, 10,
NA, 6, NA), Year_1962 = c(NA, NA, 4, 5, 3, NA), Year_1963 = c(8,
6, NA, 5, 6, NA), Year_1964 = c(NA, NA, 9, NA, 10, NA), Year_1965 = c(6,
NA, 7, 4, NA, NA)), row.names = c(NA, 6L), class = "data.frame")
This second data frame contains another type of information on the exact same 400,000 products, in wide-format. Each row represents a unique product (Product_ID) accompanied by its Product_Category, Manufacture_Date and Country_Code. There are 50 other columns (for each year from 1961 to 2010) that contain a measured value (or NA) for each product in each of those years.
Now what I would like to do is to fill in the Var1 & Var2 columns in the first data frame, by doing some calculation on the data available in the second data frame. More precisely, for each row in the first data frame (i.e. a product at Control_Date "t"), the last two columns are defined as follows:
Var1: total number of products in df2 with the same Product_Category, Manufacture_Date and Country_Code that have non-NA value in Year_t;
Var2: total number of products in df2 with different Product_Category but the same Manufacture_Date and Country_Code that have non-NA value in Year_t.
My initial solution with nested for-loops is as follows:
# Baseline implementation: for every product in df1, find its peer products in
# df2 and, year by year, count how many of those peers have a non-NA value.
# Every assignment re-scans df1/df2 with which(), which is why this is slow on
# large inputs.
for (i in unique(df1$Product_ID)){
# Attributes of product i, read from its rows in df1.
Category <- unique(df1[which(df1$Product_ID==i),"Product_Category"])
# The "other" category; this relies on there being exactly two levels, A and B.
Opposite_Category <- ifelse(Category=="A","B","A")
Manufacture <- unique(df1[which(df1$Product_ID==i),"Manufacture_Date"])
Country <- unique(df1[which(df1$Product_ID==i),"Country_Code"])
# df2 products sharing category, manufacture date and country with product i.
ID_Similar_Product <- df2[which(df2$Product_Category==Category & df2$Manufacture_Date==Manufacture & df2$Country_Code==Country),"Product_ID"]
# df2 products with the opposite category but the same manufacture date and country.
ID_Quasi_Similar_Product <- df2[which(df2$Product_Category==Opposite_Category & df2$Manufacture_Date==Manufacture & df2$Country_Code==Country),"Product_ID"]
for (j in unique(df1$Control_Date)){
# Var1/Var2 for (product i, year j): number of peers whose Year_<j> column in
# df2 is not NA. An empty peer set gives a count of 0.
df1[which(df1$Product_ID==i & df1$Control_Date==j),"Var1"] <- length(which(!is.na(df2[which(df2$Product_ID %in% ID_Similar_Product),paste0("Year_",j)])))
df1[which(df1$Product_ID==i & df1$Control_Date==j),"Var2"] <- length(which(!is.na(df2[which(df2$Product_ID %in% ID_Quasi_Similar_Product),paste0("Year_",j)])))
}
}
The problem with this approach is that it takes a lot of time to be run. So I would like to know if anybody could suggest a vectorized version that would do the job in less time.
See if this does what you want. I'm using the data.table package since you have a rather large (20M) dataset.
library(data.table)
setDT(df1)
setDT(df2)

# Reshape df2 to long form: one row per (product, year). The year is parsed
# out of the "Year_<yyyy>" column name as an integer so that it can be matched
# against df1$Control_Date.
year_cols <- grep("^Year_", names(df2), value = TRUE)
df2_long <- melt(
  df2,
  measure.vars = year_cols,
  variable.name = "Control_Date",
  value.name = "value"
)
df2_long[, Control_Date := as.integer(sub("^Year_", "", Control_Date))]

# Non-NA counts *per year*. Control_Date has to be part of the grouping:
# without it every year would receive the total over all years.
#   same_counts: products sharing category, manufacture date and country
#   all_counts : products sharing manufacture date and country, any category
same_counts <- df2_long[
  ,
  .(n_same = sum(!is.na(value))),
  by = .(Product_Category, Manufacture_Date, Country_Code, Control_Date)
]
all_counts <- df2_long[
  ,
  .(n_all = sum(!is.na(value))),
  by = .(Manufacture_Date, Country_Code, Control_Date)
]

# Start from 0 so that combinations absent from df2 mean "no such products",
# as in the nested-loop version, instead of NA.
df1[, c("Var1", "Var2") := 0L]

# Update joins by reference. No setkey() is used, so df1 keeps its row order.
df1[
  same_counts,
  Var1 := i.n_same,
  on = .(Product_Category, Manufacture_Date, Country_Code, Control_Date)
]
# Products of the other category = all categories minus the same category.
df1[
  all_counts,
  Var2 := i.n_all - Var1,
  on = .(Manufacture_Date, Country_Code, Control_Date)
]
df1
# Result for the example df1/df2 defined above:
#     Product_ID Product_Category Manufacture_Date Control_Date Country_Code Var1 Var2
#  1:          1                A             1950         1961          ABC    1    0
#  2:          1                A             1950         1962          ABC    1    0
#  3:          1                A             1950         1963          ABC    2    0
#  4:          1                A             1950         1964          ABC    0    0
#  5:          1                A             1950         1965          ABC    2    0
#  6:          2                B             1960         1961          DEF    0    0
#  7:          2                B             1960         1962          DEF    0    0
#  8:          2                B             1960         1963          DEF    1    0
#  9:          2                B             1960         1964          DEF    0    0
# 10:          2                B             1960         1965          DEF    0    0
# 11:          3                A             1940         1961          GHI    1    0
# 12:          3                A             1940         1962          GHI    1    0
# 13:          3                A             1940         1963          GHI    0    0
# 14:          3                A             1940         1964          GHI    1    0
# 15:          3                A             1940         1965          GHI    1    0
df2
Product_ID Product_Category Manufacture_Date Country_Code Year_1961 Year_1962 Year_1963 Year_1964 Year_1965
1: 5 A 1940 DEF 6 3 6 10 NA
2: 3 A 1940 GHI 10 4 NA 9 7
3: 1 A 1950 ABC 5 NA 8 NA 6
4: 4 A 1950 ABC NA 5 5 NA 4
5: 2 B 1940 DEF NA NA 6 NA NA
6: 6 B 2000 GHI NA NA NA NA NA
I am dealing with a dataset like this
Id Value Date
1 250 NA
1 250 2010-06-21
2 6 NA
2 6 2012-08-23
3 545 NA
7 3310 NA
My goal is to remove entire rows if there is an NA in Date column and ID is duplicate. The final output should look like:
Id Value Date
1 250 2010-06-21
2 6 2012-08-23
3 545 NA
7 3310 NA
# Drop a row only when its Date is missing AND its Id occurs more than once.
# `&` binds tighter than `|`, so the two duplicated() scans are combined first;
# written inline without parentheses, every row that has a later duplicate
# would be dropped even when it carries a date.
# Scanning from both ends flags every occurrence of a repeated Id, not just
# the second and later ones.
id_is_repeated <- duplicated(df1$Id) | duplicated(df1$Id, fromLast = TRUE)
df1[!(is.na(df1$Date) & id_is_repeated), ]
# Id Value Date
#2 1 250 2010-06-21
#4 2 6 2012-08-23
#5 3 545 <NA>
#6 7 3310 <NA>
DATA
df1 = structure(list(Id = c(1L, 1L, 2L, 2L, 3L, 7L), Value = c(250L,
250L, 6L, 6L, 545L, 3310L), Date = c(NA, "2010-06-21", NA, "2012-08-23",
NA, NA)), .Names = c("Id", "Value", "Date"), class = "data.frame", row.names = c(NA,
-6L))
This is a query that comes from an earlier thread I chanced upon, two tables DT1 and DT2
DT1
Country State City Start End
1 IN Telangana Hyderabad 100 200
2 IN Maharashtra Pune 300 400
3 IN Haryana Gurgaon 500 600
4 IN Maharashtra Pune 700 800
5 IN Gujarat Ahmedabad 900 1000
DT2 with 7 rows
ID No
1 157
2 346
3 389
4 453
5 562
6 9874
7 98745
When they are joined using this code,
# Non-equi join in the X[Y] form: for each row of DT1 (the table inside the
# brackets), look up the DT2 rows with Start < No < End.
# The result is driven by DT1: a DT1 row appears once per matching DT2 row, or
# once with NA in DT2's columns when nothing matches. DT2 rows that fall in no
# range are not part of the result at all.
# In the output below, the No and No.1 columns show DT1's Start and End values
# rather than DT2's own No.
DT2[DT1, on=.(No>Start,No<End), ]
produces this output, with 6 rows
ID No No.1 Country State City
1: 1 100 200 IN Telangana Hyderabad
2: 2 300 400 IN Maharashtra Pune
3: 3 300 400 IN Maharashtra Pune
4: 5 500 600 IN Haryana Gurgaon
5: NA 700 800 IN Maharashtra Pune
6: NA 900 1000 IN Gujarat Ahmedabad
i can understand the NAs corresponding to IDs 6 and 7 (rownumbers 5 and 6), but why is the NA corresponding to ID 4 missing.
ID 4, whose No is 453, falls within none of the ranges in DT1, so shouldn't it have produced an NA row as well?
EDIT1: Providing Code to create the datasets
DT1<-
structure(list(Country = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "IN", class = "factor"),
State = structure(c(4L, 3L, 2L, 3L, 1L), .Label = c("Gujarat",
"Haryana", "Maharashtra", "Telangana"), class = "factor"),
City = structure(c(3L, 4L, 2L, 4L, 1L), .Label = c("Ahmedabad",
"Gurgaon", "Hyderabad", "Pune"), class = "factor"), Start = c(100L,
300L, 500L, 700L, 900L), End = c(200L, 400L, 600L, 800L,
1000L)), .Names = c("Country", "State", "City", "Start",
"End"), class = c("data.table", "data.frame"))
DT2<-
structure(list(ID = 1:7, No = c(157L, 346L, 389L, 453L, 562L,
9874L, 98745L)), .Names = c("ID", "No"), class = c("data.table",
"data.frame"))
I am trying to get the maximum value in the column event until an agreement (dummy) is reached; Events are nested in agreements, agreements are nested in dyad which run over year. Note that years are not always continuous, meaning there are breaks between the years (1986, 1987,2001,2002).
I am able to get the maximum values within the dyad with a ddply and max(event); but I struggle how to ‘assign’ the different events to the right agreement (until/after). I am basically lacking an 'identifier' which assigns each observation to an agreement.
The results which I am looking for are already in the column "result".
dyad year event agreement agreement.name result
1 1985 9
1 1986 4 1 agreement1 9
1 1987
1 2001 3
1 2002 1 agreement2 3
2 1999 1
2 2000 5
2 2001 1 agreement3 5
2 2002 2
2 2003
2 2004 1 agreement 4 2
Here is the data in a format which is hopefully easier to use:
df<-structure(list(dyad = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), year = c(1985L, 1986L, 1987L, 2001L, 2002L, 1999L, 2000L,
2001L, 2002L, 2003L, 2004L), event = c(9L, 4L, NA, 3L, NA, 1L,
5L, NA, 2L, NA, NA), agreement = c(NA, 1L, NA, NA, 1L, NA, NA,
1L, NA, NA, 1L), agreement.name = c("", "agreement1", "", "",
"agreement2", "", "", "agreement3", "", "", "agreement 4"), result = c(NA,
9L, NA, NA, 3L, NA, NA, 5L, NA, NA, 2L)), .Names = c("dyad",
"year", "event", "agreement", "agreement.name", "result"), class = "data.frame", row.names = c(NA,
-11L))
Here is an option using data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), then create another grouping variable ('ind') based on the non-empty elements in 'agreement.name'. Grouped by both the 'dyad' and 'ind' columns, we create a new column 'result', using ifelse to fill the rows where 'agreement.name' is non-empty with the max of 'event'.
library(data.table)
# Number the stretches of rows leading up to each agreement within a dyad: the
# counter advances on the first empty-name row that follows an agreement row.
setDT(df)
df[, ind := cumsum(c(TRUE, diff(agreement.name == "") > 0)), by = dyad]
# Within each (dyad, stretch), write the largest event onto the agreement
# row(s) and NA everywhere else.
df[
  ,
  result := ifelse(agreement.name != "", max(event, na.rm = TRUE), NA),
  by = list(dyad, ind)
]
df[, ind := NULL] # the helper column is no longer needed
df[]
# dyad year event agreement agreement.name result
# 1: 1 1985 9 NA NA
# 2: 1 1986 4 1 agreement1 9
# 3: 1 1987 NA NA NA
# 4: 1 2001 3 NA NA
# 5: 1 2002 NA 1 agreement2 3
# 6: 2 1999 1 NA NA
# 7: 2 2000 5 NA NA
# 8: 2 2001 NA 1 agreement3 5
# 9: 2 2002 2 NA NA
#10: 2 2003 NA NA NA
#11: 2 2004 NA 1 agreement 4 2
Or instead of ifelse, we can use numeric index
# Same result without ifelse(): build the pair c(NA, group max) and pick
# element 1 (NA) for empty-name rows and element 2 (the max) for agreement rows.
# The stretch counter is computed on the fly inside `by`, together with dyad.
setDT(df)
df[
  ,
  result := c(NA, max(event, na.rm = TRUE))[(agreement.name != "") + 1L],
  by = list(ind = cumsum(c(TRUE, diff(agreement.name == "") > 0)), dyad)
]
df[]
data
df <- structure(list(dyad = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), year = c(1985L, 1986L, 1987L, 2001L, 2002L, 1999L, 2000L,
2001L, 2002L, 2003L, 2004L), event = c(9L, 4L, NA, 3L, NA, 1L,
5L, NA, 2L, NA, NA), agreement = c(NA, 1L, NA, NA, 1L, NA, NA,
1L, NA, NA, 1L), agreement.name = c("", "agreement1", "", "",
"agreement2", "", "", "agreement3", "", "", "agreement 4")),
.Names = c("dyad",
"year", "event", "agreement", "agreement.name"), row.names = c(NA,
-11L), class = "data.frame")