I have transactional data like this:
library(data.table)
library(stringr)
sample <- data.table (customerid=c(1,1,2,2,2,3,4,4,5,5,6,6,6,7,7),
product=c("A","A+B","A","A+B+C","A+C","B","B+C+D","C+D","A+D","A+B+D","A+B","A","A+C","B+D","D"))
I am trying to count how many products each customer bought in total and add the result as a column named total_product.
I tried this code in data.table
# Flag, for every transaction row, whether each product letter occurs in the
# '+'-separated product string; each call adds one logical column.
sample[, A := str_detect(product, "A")]
sample[, B := str_detect(product, "B")]
sample[, C := str_detect(product, "C")]
# The "D" flag must go into its own column D; assigning it to C again would
# overwrite the product-C flag and leave D missing.
sample[, D := str_detect(product, "D")]
sample
the code returns
customerid product A B C D
1: 1 A TRUE FALSE FALSE FALSE
2: 1 A+B TRUE TRUE FALSE FALSE
3: 2 A TRUE FALSE FALSE FALSE
4: 2 A+B+C TRUE TRUE TRUE FALSE
5: 2 A+C TRUE FALSE TRUE FALSE
6: 3 B FALSE TRUE FALSE FALSE
7: 4 B+C+D FALSE TRUE TRUE TRUE
8: 4 C+D FALSE FALSE TRUE TRUE
9: 5 A+D TRUE FALSE FALSE TRUE
10: 5 A+B+D TRUE TRUE FALSE TRUE
11: 6 A+B TRUE TRUE FALSE FALSE
12: 6 A TRUE FALSE FALSE FALSE
13: 6 A+C TRUE FALSE TRUE FALSE
14: 7 B+D FALSE TRUE FALSE TRUE
15: 7 D FALSE FALSE FALSE TRUE
I saw a question on Stack that I should merge four c(A,B,C,D) column and count the TRUE
But in my case, I will have the same product count more than one time.
Thanks for your advice!
We can use lapply on the pattern vector (LETTERS[1:4]) and either specify the arguments of the function str_detect
sample[, LETTERS[1:4] := lapply(LETTERS[1:4], str_detect, string = product)]
Or use anonymous/lambda function
sample[, LETTERS[1:4] := lapply(LETTERS[1:4], function(x)
str_detect(product, x))]
Then create the 'total_product' count as the row wise sum of logical vector i.e. TRUE -> 1 and FALSE -> 0
sample[, total_product := rowSums(.SD), .SDcols = A:D]
If we want to count the unique elements from 'product' for each 'customerid', an option is to split the column with strsplit, get the unique count with uniqueN
# Per customerid: split every product string on the literal "+" (fixed = TRUE,
# so "+" is not interpreted as a regex), pool the pieces across that customer's
# rows, and count the distinct values.
sample[, .(total_product = uniqueN(unlist(strsplit(product,
'+', fixed = TRUE)))), by = customerid]
-output
# customerid total_product
#1: 1 2
#2: 2 3
#3: 3 1
#4: 4 3
#5: 5 3
#6: 6 3
#7: 7 2
Related
Thanks in advance for your kind help. This is my dataframe:
df <- data.frame('a'=c(1,2,3,4,5), 'b'=c("A",NA,"B","C","A"))
df
And I want to create a new column for each value of df$b, indicating whether that value is present or absent (TRUE/FALSE) in each row. I'm using grepl for this, but I'm not sure how to dynamically create the new columns.
I'm creating a vector with the unique values of df$b
list <- as.vector(unique(df$b))
And want to iterate with a for in df$b, in order to get a dataframe like this:
a b A B C
1 1 A TRUE FALSE FALSE
2 2 NA FALSE FALSE FALSE
3 3 B FALSE TRUE FALSE
4 4 C FALSE FALSE TRUE
5 5 A TRUE FALSE FALSE
But I'm not sure how to generate the new column inside the for loop. I'm trying to do something like this:
for (i in list) {
logical <- grepl(df$b, i)
df$i <- logical
But it generates an error. Any help will be appreciated
This may need table
df <- cbind(df, as.data.frame.matrix(table(df) > 0))
-output
df
a b A B C
1 1 A TRUE FALSE FALSE
2 2 <NA> FALSE FALSE FALSE
3 3 B FALSE TRUE FALSE
4 4 C FALSE FALSE TRUE
5 5 A TRUE FALSE FALSE
You can use this for loop
# Distinct non-missing values of df$b; each one becomes a logical column.
# (Named b_values rather than `list` so the base function is not shadowed.)
b_values <- as.vector(unique(na.omit(df$b)))
# Iterate over the values themselves: with an empty b_values the loop simply
# does not run, whereas 1:length(...) would have produced the indices 1 and 0.
for (value in b_values) {
  # %in% never returns NA, so rows where b is missing come out FALSE directly.
  df[[as.character(value)]] <- df$b %in% value
}
output
a b A B C
1 1 A TRUE FALSE FALSE
2 2 <NA> FALSE FALSE FALSE
3 3 B FALSE TRUE FALSE
4 4 C FALSE FALSE TRUE
5 5 A TRUE FALSE FALSE
a<-c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE)
b<-c(TRUE,FALSE,TRUE,FALSE,FALSE,FALSE)
c<-c(TRUE,TRUE,TRUE,FALSE,TRUE,FALSE)
costumer<-c("one","two","three","four","five","six")
df<-data.frame(costumer,a,b,c)
That's an example code. It looks like this printed:
costumer a b c
1 one TRUE TRUE TRUE
2 two FALSE FALSE TRUE
3 three TRUE TRUE TRUE
4 four FALSE FALSE FALSE
5 five TRUE FALSE TRUE
6 six FALSE FALSE FALSE
I want to create a new column df$items that contains only the column names that are TRUE for each row in the data. Something like this:
costumer a b c items
1 one TRUE TRUE TRUE a,b,c
2 two FALSE FALSE TRUE c
3 three TRUE TRUE TRUE a,b,c
4 four FALSE FALSE FALSE
5 five TRUE FALSE TRUE a,c
6 six FALSE FALSE FALSE
I thought of using apply function or use which for selecting indexes, but couldn't figure it out. Can anyone help me?
# Build the label from the logical flag columns only. Passing the whole data
# frame to apply() coerces every cell to character, so the customer name would
# also be compared against TRUE.
flag_cols <- names(df)[vapply(df, is.logical, logical(1))]
df$items <- apply(df[flag_cols], 1, function(x) paste0(flag_cols[which(x)], collapse = ","))
df
custumer a b c items
1 one TRUE TRUE TRUE a,b,c
2 two FALSE FALSE TRUE c
3 three TRUE TRUE TRUE a,b,c
4 four FALSE FALSE FALSE
5 five TRUE FALSE TRUE a,c
6 six FALSE FALSE FALSE
df$items = apply(df[2:4], 1, function(x) toString(names(df[2:4])[x]))
df
# custumer a b c items
# 1 one TRUE TRUE TRUE a, b, c
# 2 two FALSE FALSE TRUE c
# 3 three TRUE TRUE TRUE a, b, c
# 4 four FALSE FALSE FALSE
# 5 five TRUE FALSE TRUE a, c
# 6 six FALSE FALSE FALSE
You could use
# Restrict apply() to the logical columns: on the full data frame every row is
# coerced to character before the comparison with TRUE.
is_flag <- vapply(df, is.logical, logical(1))
df$items <- apply(df[is_flag], 1, function(x) toString(names(x)[which(x)]))
Output
# custumer a b c items
# 1 one TRUE TRUE TRUE a, b, c
# 2 two FALSE FALSE TRUE c
# 3 three TRUE TRUE TRUE a, b, c
# 4 four FALSE FALSE FALSE
# 5 five TRUE FALSE TRUE a, c
# 6 six FALSE FALSE FALSE
We can use pivot_longer to reshape to 'long' format and then do a group by paste
library(dplyr)
library(tidyr)
library(stringr)
df %>%
pivot_longer(cols = a:c) %>%
group_by(costumer) %>%
summarise(items = toString(name[value])) %>%
left_join(df)
I'm a relative newcomer to R so I'm sorry if there's an obvious answer to this. I've looked at other questions and I think 'apply' is the answer but I can't work out how to use it in this case.
I've got a longitudinal survey where participants are invited every year. In some years they fail to take part, and sometimes they die. I need to identify which participants have taken part in a consistent 'streak' from the start of the survey (i.e. if they stop, they stop for good).
I've done this with a 'for' loop, which works fine in the example below. But I have many years and many participants, and the loop is very slow. Is there a faster approach I could use?
In the example, TRUE means they participated in that year. The loop creates two vectors - 'finalyear' for the last year they took part, and 'streak' to show if they completed all years before the finalyear (i.e. cases 1, 3 and 5).
# Participation by year: TRUE means the participant took part in that year.
# data.frame() prefixes the numeric names with "X" (X1999, ..., X2003).
dat <- data.frame(
  ids = 1:5,
  "1999" = c(TRUE, TRUE, TRUE, FALSE, TRUE),
  "2000" = c(TRUE, FALSE, TRUE, FALSE, TRUE),
  "2001" = c(TRUE, TRUE, TRUE, TRUE, TRUE),
  "2002" = c(FALSE, TRUE, TRUE, TRUE, TRUE),
  "2003" = c(FALSE, TRUE, TRUE, TRUE, FALSE)
)

year_cols <- 2:6
n_participants <- nrow(dat)

# Preallocate the results instead of growing them from NULL on every pass.
finalyear <- integer(n_participants)
streak <- logical(n_participants)

for (i in seq_len(n_participants)) {
  took_part <- unlist(dat[i, year_cols])
  # Position (1-5) of the last year with participation. which() works on the
  # logical values directly; grep(1, x) would pattern-match on numbers that
  # have been coerced to strings.
  last_idx <- max(which(took_part))
  finalyear[i] <- last_idx
  # A streak means every year up to the final one was attended, i.e. the
  # number of TRUEs equals the position of the last TRUE.
  streak[i] <- sum(took_part) == last_idx
}

dat$finalyear <- finalyear
dat$streak <- streak
Thanks!
We could use max.col and rowSums as a vectorized approach.
dat$finalyear <- max.col(dat[-1], 'last')
If there are rows without TRUE values, we can make sure to return 0 for that row by multiplying with the double negation of rowSums. The FALSE will be coerced to 0 and multiplying with 0 returns 0 for that row.
# Select the five year columns explicitly: dat[-1] would also pick up the
# finalyear column once it has been added, and max.col() would then return
# that column's index instead of the last TRUE year.
dat$finalyear <- max.col(dat[2:6], 'last') * !!rowSums(dat[2:6])
Then, we create the 'streak' column by comparing the rowSums of columns 2:6 with that of 'finalyear'
dat$streak <- rowSums(dat[,2:6])==dat$finalyear
dat
# ids X1999 X2000 X2001 X2002 X2003 finalyear streak
#1 1 TRUE TRUE TRUE FALSE FALSE 3 TRUE
#2 2 TRUE FALSE TRUE TRUE TRUE 5 FALSE
#3 3 TRUE TRUE TRUE TRUE TRUE 5 TRUE
#4 4 FALSE FALSE TRUE TRUE TRUE 5 FALSE
#5 5 TRUE TRUE TRUE TRUE FALSE 4 TRUE
Or a one-liner (it could fit on one line, but is split over two here for readability), as suggested by @ColonelBeauvel:
library(dplyr)
# Use the year columns (2:6) explicitly so the result does not change when dat
# already carries finalyear/streak columns from an earlier step; dat[-1] would
# include them in max.col() and rowSums().
mutate(dat, finalyear = max.col(dat[2:6], 'last'),
       streak = rowSums(dat[2:6]) == finalyear)
For-loops are not inherently bad in R, but they are slow if you grow vectors iteratively (like you are doing). There are often better ways to do things. Example of a solution with only apply-functions:
dat$finalyear <- apply(dat[,2:6],MARGIN=1,function(x){max(which(x))})
dat$streak <- apply(dat[,2:7],MARGIN=1,function(x){sum(x[1:5])==x[6]})
Or option 2, based on a comment by @Spacedman:
dat$finalyear <- apply(dat[,2:6],MARGIN=1,function(x){max(which(x))})
dat$streak <- apply(dat[,2:6],MARGIN=1,function(x){max(which(x))==sum(x)})
> dat
ids X1999 X2000 X2001 X2002 X2003 finalyear streak
1 1 TRUE TRUE TRUE FALSE FALSE 3 TRUE
2 2 TRUE FALSE TRUE TRUE TRUE 5 FALSE
3 3 TRUE TRUE TRUE TRUE TRUE 5 TRUE
4 4 FALSE FALSE TRUE TRUE TRUE 5 FALSE
5 5 TRUE TRUE TRUE TRUE FALSE 4 TRUE
Here is a solution with dplyr and tidyr.
# Reshape the year columns to long form (one row per participant and year),
# then summarise each participant.
dat %>%
  pivot_longer(cols = X1999:X2003, names_to = "year", values_to = "value") %>%
  mutate(year = as.integer(gsub("X", "", year))) %>%
  group_by(ids) %>%
  summarize(finalyear = last(year[value]),
            # Compare on the year values themselves. Indexing value by
            # first(year):finalyear would use the calendar years (1999, ...)
            # as positions, which are out of range and yield NA.
            streak = all(value[year <= finalyear]))
output
ids finalyear streak
1 1 2001 TRUE
2 2 2003 FALSE
3 3 2003 TRUE
4 4 2003 FALSE
5 5 2002 TRUE
Here's a base version using apply to loop over rows and rle to see how often the state changes. Your condition seems to be equivalent to the state starting as TRUE and only ever changing to FALSE at most once, so I test the rle as being shorter than 3 and the first value being TRUE:
> # A streak row starts TRUE and changes state at most once (at most 2 runs).
> # Uses the full component name `lengths` rather than relying on `$` partial
> # matching, and the scalar `&&` since both operands have length one.
> dat$streak <- apply(dat[, 2:6], 1, function(r) { r[1] && length(rle(r)$lengths) <= 2 })
>
> dat
ids X1999 X2000 X2001 X2002 X2003 streak
1 1 TRUE TRUE TRUE FALSE FALSE TRUE
2 2 TRUE FALSE TRUE TRUE TRUE FALSE
3 3 TRUE TRUE TRUE TRUE TRUE TRUE
4 4 FALSE FALSE TRUE TRUE TRUE FALSE
5 5 TRUE TRUE TRUE TRUE FALSE TRUE
There's probably loads of ways of working out finalyear, this just finds the last element of each row which is TRUE:
> dat$finalyear = apply(dat[,2:6], 1, function(r){max(which(r))})
> dat
ids X1999 X2000 X2001 X2002 X2003 streak finalyear
1 1 TRUE TRUE TRUE FALSE FALSE TRUE 3
2 2 TRUE FALSE TRUE TRUE TRUE FALSE 5
3 3 TRUE TRUE TRUE TRUE TRUE TRUE 5
4 4 FALSE FALSE TRUE TRUE TRUE FALSE 5
5 5 TRUE TRUE TRUE TRUE FALSE TRUE 4
I'm trying to use R to find the average number of attempts before a success in a dataframe with 300,000+ rows. Data is structured as below.
EventID SubjectID ActionID Success DateUpdated
a b c TRUE 2014-06-21 20:20:08.575032+00
b a c FALSE 2014-06-20 02:58:40.70699+00
I'm still learning my way through R. It looks like I can use ddply to separate the frame out based on Subject and Action (I want to see how many times a given subject tries an action before achieving a success), but I can't figure out how to write the formula I need to apply.
library(data.table)
# example data
dt = data.table(group = c(1,1,1,1,1,2,2), success = c(F,F,T,F,T,F,T))
# group success
#1: 1 FALSE
#2: 1 FALSE
#3: 1 TRUE
#4: 1 FALSE
#5: 1 TRUE
#6: 2 FALSE
#7: 2 TRUE
dt[, which(success)[1] - 1, by = group]
# group V1
#1: 1 2
#2: 2 1
Replace group with list(subject, action) or whatever is appropriate for your data (after converting it to data.table from data.frame).
To follow up on Tarehman's suggestion, since I like rle,
foo <- rle(data$Success)
mean(foo$lengths[foo$values==FALSE])
This might be an answer to a totally different question, but does this get close to what you want?
# Simulate 50 attempts; each is TRUE (success) with probability 0.2.
tfs <- sample(c(FALSE, TRUE), size = 50, replace = TRUE, prob = c(0.8, 0.2))
# Running count of FALSE values seen so far.
tfs_sums <- cumsum(!tfs)
# duplicated() flags positions whose running count was already seen, which can
# only happen at a TRUE entry (a FALSE always increases the count).
repsums <- tfs_sums[duplicated(tfs_sums)]
# Successive differences, starting from 0, are the numbers of FALSE values
# between those entries; report their mean.
mean(diff(c(0, repsums)))
tfs
[1] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[20] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
[39] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
repsums
1 6 8 9 20 20 20 20 24 26 31 36
repsums - c(0,repsums[-length(repsums)])
1 5 2 1 11 0 0 0 4 2 5 5
The last vector shown is the length of each continuous "run" of FALSE values in the vector tfs
You could use a data.table workaround to get what you need, as follows:
library (data.table)
df=data.frame(EventID=c("a","b","c","d"),SubjectID=c("b","a","a","a"),ActionID=c("c","c","c","c"),Success=c(TRUE,FALSE,FALSE,TRUE))
dt=data.table(df)
dt[ , Index := 1:.N , by = c("SubjectID" , "ActionID","Success") ]
Now this Index column will hold the number that you need for each subject/action consecutive experiments. You need to aggregate to get that number (max number)
# Largest Index per SubjectID/ActionID combination. Call the aggregate()
# generic, which dispatches on the formula, instead of reaching into the stats
# namespace with `:::` for the unexported method.
result <- aggregate(Index ~ SubjectID + ActionID, data = dt, FUN = max)
This gives you the maximum index, which is the number of FALSEs before you hit a TRUE. Note that you might need further processing to filter out subjects that have never had a TRUE.
I'd like to summarize a set of observations in a datatable and could use some help with the syntax.
I think this is as simple as a join but I'm trying to identify that specific values were seen on a specific observation DAY even if its across multiple measurements or sensors on that day.
observations are summarized by date
observations date have varied counts of measurements (rows per date)
'M'easurement columns indicate that a specific value was observed in ANY sensor for the day.
I've created 2 sample sets of data that I hope will clarify the goal. I've also created an image of an excel spreadsheet that hopes to show the relationship between the data.
library(data.table)
raw <- data.table(
Date = as.Date(c("2013-5-4","2013-5-4","2013-5-4", "2013-5-9","2013-5-9", "2013-5-16","2013-5-16","2013-5-16", "2013-5-30")),
S1 = c(4, 2, 3, 1, 1, 8, 7, 3, 3),
S2 = c(2, 5, 2, 4, 4, 9, 1, 6, 4),
S3 = c(6, 2, 2, 7, 3, 2, 7, 2, 1)
)
summarized <- data.table(
Date = as.Date(c("2013-5-4", "2013-5-9", "2013-5-16", "2013-5-30")),
M1 = c(FALSE,TRUE,TRUE,TRUE),
M2 = c(TRUE,FALSE,TRUE,FALSE),
M3 = c(TRUE,TRUE,TRUE,TRUE),
M4 = c(TRUE,FALSE,FALSE,TRUE),
M5 = c(TRUE,FALSE,FALSE,FALSE),
M6 = c(TRUE,FALSE,TRUE,FALSE),
M7 = c(FALSE,TRUE,TRUE,FALSE),
M8 = c(FALSE,FALSE,TRUE,FALSE),
M9 = c(FALSE,FALSE,TRUE,FALSE),
M10 = c(FALSE,FALSE,TRUE,FALSE)
)
Excel
Raw is the measurements input. Multiple measurements can happen on the same observation date (i.e. multiple rows).
Summarized is what I'm hoping to get out. Rows are summarized, and the 'M'easurement columns merely indicate that the value (the number following the M, i.e. M1, M2) was observed on that day in any of the sensor (S) columns. For example, the number 2 was seen on the first and last observation on 5/16, but the number 5 was not seen in any of the 9 values on 5/16.
I think I need to use a join but how to calculate the M columns escapes me.
Any help is much appreciated.
Question: is there a name for this type of operation in data science or mathematics?
Update:
I'm trying the following
setkey(raw,Date)
s <- data.table( Date=unique(raw$Date)) # get a datatable of the unique dates
setkey(s,Date)
s[raw, M1:=(length(na.omit(match(c(raw$V1,raw$v2,raw$v3),1)))>=1)]
Note that the values are not what's expected for 5-4 (should be FALSE). I think this is because the raw rows are not being constrained in my match statement.
Date M1
1: 2013-05-04 TRUE
2: 2013-05-09 TRUE
3: 2013-05-16 TRUE
4: 2013-05-30 TRUE
My guess is I need to use something different to subset the raw rows in the join.
This seems to work:
# For each Date, pool all values in .SD (the non-grouping columns) and test each
# of the numbers 1..10 for membership; lapply() yields one logical column per number.
raw[,lapply(1:10,`%in%`,unique(unlist(.SD))),by=Date]
The result is
Date V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
1: 2013-05-04 FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
2: 2013-05-09 TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
3: 2013-05-16 TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE
4: 2013-05-30 TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
If you want the columns with "M" instead of "V", use c(M=1:10) in place of 1:10.
This is a reshaping problem.
First, since it doesn't matter which sensor the data came from, let's aggregate your three columns into one column.
temp <- raw[,Reduce(union,list(S1,S2,S3)),by=Date]
Now we want to reshape from "long" to "wide" format.
A data table solution borrowed from this answer:
setkey(temp,Date,V1)
# Join temp against every (Date, value) combination and count the matches per
# combination. by = .EACHI is required for per-row counts: since data.table
# 1.9.4 a join no longer groups by each row of i implicitly, so without it .N
# would be a single total. Combinations with no match get N = 0.
temp[CJ(unique(Date), unique(V1)), .N, by = .EACHI][,
  # One logical column per value, TRUE where that value was seen on the Date.
  setNames(as.list(as.logical(N)), paste0("M", unique(V1))), by = Date]
# Date M1 M2 M3 M4 M5 M6 M7 M8 M9
# 1: 2013-05-04 FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# 2: 2013-05-09 TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
# 3: 2013-05-16 TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
# 4: 2013-05-30 TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Base reshape works this way:
as.data.table(reshape(temp, timevar = "V1", v.names = "V1", idvar = "Date", direction = "wide"))
# Date V1.4 V1.2 V1.3 V1.5 V1.6 V1.1 V1.7 V1.8 V1.9
# 1: 2013-05-04 4 2 3 5 6 NA NA NA NA
# 2: 2013-05-09 4 NA 3 NA NA 1 7 NA NA
# 3: 2013-05-16 NA 2 3 NA 6 1 7 8 9
# 4: 2013-05-30 4 NA 3 NA NA 1 NA NA NA
## to order by column
temp2 <- as.data.table(reshape(temp[order(V1)], timevar = "V1", v.names = "V1", idvar = "Date", direction = "wide"))
# Date V1.1 V1.2 V1.3 V1.4 V1.5 V1.6 V1.7 V1.8 V1.9
# 1: 2013-05-09 1 NA 3 4 NA NA 7 NA NA
# 2: 2013-05-16 1 2 3 NA NA 6 7 8 9
# 3: 2013-05-30 1 NA 3 4 NA NA NA NA NA
# 4: 2013-05-04 NA 2 3 4 5 6 NA NA NA
##converts to logical true/false
temp2[,lapply(.SD,function(x) {x[is.na(x)] <- 0; as.logical(x)}), by = Date]
# Date vv V1.1 V1.2 V1.3 V1.4 V1.5 V1.6 V1.7 V1.8 V1.9
# 1: 2013-05-09 TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
# 2: 2013-05-16 TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
# 3: 2013-05-30 TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
# 4: 2013-05-04 TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
The package reshape2 is a bit more intuitive:
# library() stops with an error if reshape2 is not installed; require() would
# only return FALSE and let the later calls fail less clearly.
library(reshape2)
## dummy variable for TRUE/FALSE
temp[, vv := TRUE]
## one column per observed value, TRUE where it was seen on that Date
temp_reshape2 <- as.data.table(dcast(temp, Date ~ V1, value.var = "vv"))
## replace NA with FALSE
temp_reshape2[, lapply(.SD, function(x) {x[is.na(x)] <- FALSE; x}), by = Date]
# Date 1 2 3 4 5 6 7 8 9
# 1: 2013-05-04 FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# 2: 2013-05-09 TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
# 3: 2013-05-16 TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
# 4: 2013-05-30 TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
And for completion, a lame parse-eval solution:
limits <- temp[,c(min(V1),max(V1))]
sapply(temp[,min(V1) : max(V1)], function(x) {
temp[,eval(parse(text=paste0("M",x," := any(abs(V1 - ",x,") < .Machine$double.eps)"))),by = Date]
})