I have the following panel data:
firmid date return
1 1 1
1 2 1
1 3 1
2 2 2
2 3 2
3 1 2
3 3 2
I want to transform this long format to wide, but only for date 1, so that it looks like this:
firmid return in date=1
1 1
3 2
I appreciate any advice!
df <- read.table(header = TRUE, text = "firmid date return
1 1 1
1 2 1
1 3 1
2 2 2
2 3 2
3 1 2
3 3 2")
Base R solution:
df <- df[df$date == 1, ]
df$date <- NULL
df
firmid return
1 1 1
6 3 2
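Equivalently, subset() can do the row filter and the column drop in one call; a minimal sketch, applied to the original df before it is overwritten above:
subset(df, date == 1, select = -date)
#   firmid return
# 1      1      1
# 6      3      2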
data.table solution:
library(data.table)
setDT(df)
df <- df[date == 1, ]
df[, date := NULL]
firmid return
1: 1 1
2: 3 2
You can use dplyr to achieve it too:
library(dplyr)
df2 <- df %>%
  filter(date == 1) %>%
  select(-date)
# firmid return
#1 1 1
#2 3 2
A different dplyr solution that allows you to have multiple values of return within firmid:
df %>%
  filter(date == 1) %>%
  group_by(firmid, return) %>%
  summarise()
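If you prefer to state the deduplication explicitly, distinct() expresses the same idea (a minimal sketch):
df %>%
  filter(date == 1) %>%
  distinct(firmid, return)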
I have the following dataframe:
df <- read.table(header = TRUE, text = "id code
1 A
1 B
1 C
2 A
2 A
2 A
3 A
3 B
3 A")
Per id, I would love to find those individuals that have at least 2 conditions, namely:
conditionA = "A"
conditionB = "B"
conditionC = "C"
and create a new column index: 1 if two or more conditions are met and 0 otherwise:
df_output <- read.table(header = TRUE, text = "id code index
1 A 1
1 B 1
1 C 1
2 A 0
2 A 0
2 A 0
3 A 1
3 B 1
3 A 1")
So far I have tried the following:
df_output = df %>%
  group_by(id) %>%
  mutate(index = ifelse(grepl(conditionA|conditionB|conditionC, code), 1, 0))
and as you can see I am struggling to get the threshold count into the code.
You can create a vector of conditions, then use %in% and sum to count how many of them occur in each group. Use + (or ifelse) to convert the logical to 1 and 0:
conditions = c("A", "B", "C")
df %>%
  group_by(id) %>%
  mutate(index = +(sum(unique(code) %in% conditions) >= 2))
id code index
1 1 A 1
2 1 B 1
3 1 C 1
4 2 A 0
5 2 A 0
6 2 A 0
7 3 A 1
8 3 B 1
9 3 A 1
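The unary + is simply a compact logical-to-integer coercion, for example:
+(c(TRUE, FALSE, TRUE))
# [1] 1 0 1
sum(unique(c("A", "A", "B")) %in% conditions)
# [1] 2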
You could use n_distinct(), a faster and more concise equivalent of length(unique(x)). (It works here because every code in this data is one of the three conditions, so counting distinct codes counts the conditions met.)
df %>%
  group_by(id) %>%
  mutate(index = +(n_distinct(code) >= 2)) %>%
  ungroup()
# # A tibble: 9 × 3
# id code index
# <int> <chr> <int>
# 1 1 A 1
# 2 1 B 1
# 3 1 C 1
# 4 2 A 0
# 5 2 A 0
# 6 2 A 0
# 7 3 A 1
# 8 3 B 1
# 9 3 A 1
You can check conditions with the intersect() function and test whether the resulting vector has at least the minimum (e.g., 2) length.
conditions = c('A', 'B', 'C')
df_output2 = df %>%
  group_by(id) %>%
  mutate(index = as.integer(length(intersect(code, conditions)) >= 2))
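Note that intersect() returns only the unique elements common to both vectors, so repeated codes within an id are not double-counted:
intersect(c("A", "A", "B"), conditions)
# [1] "A" "B"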
dat <- data.frame(id = c(1,1,1,2,2,2,3,3,3,3,3,3,4,4,4,4,4,5,5,6,6,6,7,7,7),
                  a  = c(1,2,2,1,1,2,1,1,1,2,2,2,1,1,1,1,2,1,2,2,2,2,1,1,1))
View(dat)
Desired Output:
id a
2 1
2 1
2 2
3 1
3 1
3 1
3 2
3 2
3 2
4 1
4 1
4 1
4 1
4 2
I tried with the following code:
library(dplyr)
dat1 <- dat %>%
  group_by(id) %>%
  filter(n() > 2) %>%
  filter(any(a != 2)) %>%
  filter(any(a != 1)) %>%
  filter(a == 1 | (cumsum(a == 2) == 1))
But group id = 1 must also be removed, since it has only one row with a == 1; I need at least 2 rows with a == 1.
You can use n_distinct to get the count of unique values and sum to count the occurrences of a == 1 for each id.
Using dplyr, you can do -
library(dplyr)
dat %>%
  group_by(id) %>%
  filter(n_distinct(a) >= 2 & sum(a == 1) >= 2) %>%
  ungroup()
# id a
# <dbl> <dbl>
# 1 2 1
# 2 2 1
# 3 2 2
# 4 3 1
# 5 3 1
# 6 3 1
# 7 3 2
# 8 3 2
# 9 3 2
#10 4 1
#11 4 1
#12 4 1
#13 4 1
#14 4 2
This can also be written in base R and data.table.
#Base R
subset(dat, as.logical(ave(a, id,
       FUN = function(x) length(unique(x)) >= 2 && sum(x == 1) >= 2)))
#data.table
library(data.table)
setDT(dat)[, .SD[uniqueN(a) >= 2 && sum(a == 1) >= 2], id]
I need to fill $Year with the missing values of the sequence, within each level of $Country. The $Count column can just be padded with 0s.
Country Year Count
A 1 1
A 2 1
A 4 2
B 1 1
B 3 1
So I end up with
Country Year Count
A 1 1
A 2 1
A 3 0
A 4 2
B 1 1
B 2 0
B 3 1
Hope that's clear guys, thanks in advance!
This is a dplyr/tidyr solution using complete and full_seq:
library(dplyr)
library(tidyr)
df %>%
  group_by(Country) %>%
  complete(Year = full_seq(Year, 1), fill = list(Count = 0))
Country Year Count
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 0
4 A 4 2
5 B 1 1
6 B 2 0
7 B 3 1
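The completion is driven by full_seq(), which fills the gaps of a numeric sequence at the given period, for example:
full_seq(c(1, 2, 4), period = 1)
# [1] 1 2 3 4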
library(data.table)
# d is your original data.frame
setDT(d)
foo <- d[, .(Year = min(Year):max(Year)), Country]
res <- merge(d, foo, all.y = TRUE)[is.na(Count), Count := 0]
Similar to #PoGibas' answer:
library(data.table)
# set default values
def = list(Count = 0L)
# create table with all levels
fullDT = setkey(DT[, .(Year = seq(min(Year), max(Year))), by=Country])
# initialize to defaults
fullDT[, names(def) := def ]
# overwrite from data
fullDT[DT, names(def) := mget(sprintf("i.%s", names(def))) ]
which gives
Country Year Count
1: A 1 1
2: A 2 1
3: A 3 0
4: A 4 2
5: B 1 1
6: B 2 0
7: B 3 1
This generalizes to having more columns (besides Count). I guess similar functionality exists in the "tidyverse", with a name like "expand" or "complete"; a sketch follows below.
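For illustration, here is a sketch of that tidyr equivalent with more than one default column, assuming the data sits in a data frame d and has a hypothetical extra column Other:
library(dplyr)
library(tidyr)
# complete() expands Year within each Country; fill supplies a
# default for every listed column (Other is a made-up example)
d %>%
  group_by(Country) %>%
  complete(Year = full_seq(Year, 1),
           fill = list(Count = 0, Other = 0)) %>%
  ungroup()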
Another base R idea is to split on Country, use setdiff to find the Year values missing from seq(max(Year)), and rbind them to the original data frame. Then use do.call to rbind the list back into a single data frame, i.e.
d1 <- do.call(rbind, c(lapply(split(df, df$Country), function(i){
  x <- rbind(i, data.frame(Country = i$Country[1],
                           Year = setdiff(seq(max(i$Year)), i$Year),
                           Count = 0))
  x[with(x, order(Year)), ]
}), make.row.names = FALSE))
which gives,
Country Year Count
1 A 1 1
2 A 2 1
3 A 3 0
4 A 4 2
5 B 1 1
6 B 2 0
7 B 3 1
> setkey(DT,Country,Year)
> DT[setkey(DT[, .(min(Year):max(Year)), by = Country], Country, V1)]
Country Year Count
1: A 1 1
2: A 2 1
3: A 3 NA
4: A 4 2
5: B 1 1
6: B 2 NA
7: B 3 1
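This variant leaves NA rather than 0 in the completed rows; storing the join result first lets you fill them in afterwards:
res <- DT[setkey(DT[, .(min(Year):max(Year)), by = Country], Country, V1)]
res[is.na(Count), Count := 0]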
Another dplyr and tidyr solution.
library(dplyr)
library(tidyr)
dt2 <- dt %>%
  group_by(Country) %>%
  do(data_frame(Country = unique(.$Country),
                Year = full_seq(.$Year, 1))) %>%
  full_join(dt, by = c("Country", "Year")) %>%
  replace_na(list(Count = 0))
Here is a base R approach that uses tapply, do.call, range, and seq to calculate the year sequences. It then constructs a data.frame from the named list that is returned, merges this onto the original (which adds the desired rows), and finally fills in the missing values.
# get named list with year sequences
temp <- tapply(dat$Year, dat$Country, function(x) do.call(seq, as.list(range(x))))
# construct data.frame
mydf <- data.frame(Year=unlist(temp), Country=rep(names(temp), lengths(temp)))
# merge onto original
mydf <- merge(dat, mydf, all=TRUE)
# fill in missing values
mydf[is.na(mydf)] <- 0
This returns
mydf
Country Year Count
1 A 1 1
2 A 2 1
3 A 3 0
4 A 4 2
5 B 1 1
6 B 2 0
7 B 3 1
I'm trying to build a churn model that includes the maximum number of consecutive UX failures for each customer, and I'm having trouble. Here's my simplified data and desired output:
library(dplyr)
df <- data.frame(customerId = c(1, 2, 2, 3, 3, 3),
                 date = c('2015-01-01', '2015-02-01', '2015-02-02',
                          '2015-03-01', '2015-03-02', '2015-03-03'),
                 isFailure = c(0, 0, 1, 0, 1, 1))
> df
customerId date isFailure
1 1 2015-01-01 0
2 2 2015-02-01 0
3 2 2015-02-02 1
4 3 2015-03-01 0
5 3 2015-03-02 1
6 3 2015-03-03 1
desired results:
> desired.df
customerId maxConsecutiveFailures
1 1 0
2 2 1
3 3 2
I'm flailing quite a bit, and searching through other rle questions isn't helping me yet. This is what I was "expecting" a solution to resemble:
df %>%
  group_by(customerId) %>%
  summarise(maxConsecutiveFailures =
              max(rle(isFailure[isFailure == 1])$lengths))
We group by 'customerId' and use do to run rle on the 'isFailure' column. We extract the run lengths whose values are TRUE (lengths[values]) and create the 'Max' column with an if/else condition that returns 0 for groups that have no 1 values.
df %>%
  group_by(customerId) %>%
  do({
    tmp <- with(rle(.$isFailure == 1), lengths[values])
    data.frame(customerId = .$customerId,
               Max = if (length(tmp) == 0) 0 else max(tmp))
  }) %>%
  slice(1L)
# customerId Max
#1 1 0
#2 2 1
#3 3 2
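To see what the rle() piece extracts, run it on customer 3's failure vector; lengths[values] keeps only the lengths of the TRUE (failure) runs:
r <- rle(c(0, 1, 1) == 1)
r$lengths
# [1] 1 2
r$values
# [1] FALSE  TRUE
r$lengths[r$values]
# [1] 2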
Here is my try, only using standard dplyr functions:
df %>%
  # grouping key(s):
  group_by(customerId) %>%
  # check if there is any value change;
  # if yes, a new sequence id is generated through cumsum
  mutate(last_one = lag(isFailure, 1, default = 100),
         not_eq = last_one != isFailure,
         seq = cumsum(not_eq)) %>%
  # the following just finds the largest sequence
  count(customerId, isFailure, seq) %>%
  group_by(customerId, isFailure) %>%
  summarise(max_consecutive_event = max(n))
Output:
# A tibble: 5 x 3
# Groups: customerId [3]
customerId isFailure max_consecutive_event
<dbl> <dbl> <int>
1 1 0 1
2 2 0 1
3 2 1 1
4 3 0 1
5 3 1 2
A final filter on the isFailure value yields the wanted result (though the customers with zero failures need to be added back), as sketched below. This approach accepts any values in the isFailure column and counts the maximum number of consecutive rows with the same value.
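A sketch of that final step, assuming the result of the pipeline above is stored in res (a name I've introduced); the right join restores the zero-failure customers:
library(dplyr)
res %>%
  ungroup() %>%
  filter(isFailure == 1) %>%
  # bring back customers whose only runs have isFailure == 0 ...
  right_join(distinct(df, customerId), by = "customerId") %>%
  # ... and turn their NA counts into 0
  transmute(customerId,
            maxConsecutiveFailures = coalesce(max_consecutive_event, 0L)) %>%
  arrange(customerId)
# customerId maxConsecutiveFailures
#          1                      0
#          2                      1
#          3                      2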
I have a dataframe as follows. It is ordered by column time.
Input -
df = data.frame(time = 1:20,
                grp = sort(rep(1:5, 4)),
                var1 = rep(c('A', 'B'), 10))
head(df,10)
time grp var1
1 1 1 A
2 2 1 B
3 3 1 A
4 4 1 B
5 5 2 A
6 6 2 B
7 7 2 A
8 8 2 B
9 9 3 A
10 10 3 B
I want to create another variable var2 that computes the number of distinct var1 values so far, i.e., up to that point in time, within each group grp. This is a little different from what I'd get if I were to use n_distinct.
Expected output -
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
I want to create a function, say cum_n_distinct, for this and use it as:
d_out = df %>%
  arrange(time) %>%
  group_by(grp) %>%
  mutate(var2 = cum_n_distinct(var1))
A dplyr solution inspired by @akrun's answer. The logic is to set the 1st occurrence of each unique value of var1 to 1 and the rest to 0 within each group grp, and then apply cumsum:
df = df %>%
  arrange(time) %>%
  group_by(grp, var1) %>%
  mutate(var_temp = ifelse(row_number() == 1, 1, 0)) %>%
  group_by(grp) %>%
  mutate(var2 = cumsum(var_temp)) %>%
  select(-var_temp)
head(df,10)
Source: local data frame [10 x 4]
Groups: grp
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
Assuming the data is already ordered by time, first define a cumulative distinct function:
dist_cum <- function(var)
  sapply(seq_along(var), function(x) length(unique(head(var, x))))
Then a base solution that uses ave to create groups (note: this assumes var1 is a factor) and applies our function to each group:
transform(df, var2=ave(as.integer(var1), grp, FUN=dist_cum))
A data.table solution, basically doing the same thing:
library(data.table)
(data.table(df)[, var2:=dist_cum(var1), by=grp])
And dplyr, again, same thing:
library(dplyr)
df %>% group_by(grp) %>% mutate(var2=dist_cum(var1))
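A quick check of what dist_cum() produces on its own:
dist_cum(c("A", "B", "A", "B"))
# [1] 1 2 2 2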
Update
With your new dataset, here is an approach in base R:
df$var2 <- unlist(lapply(split(df, df$grp), function(x) {
  x$var2 <- 0
  indx <- match(unique(x$var1), x$var1)
  x$var2[indx] <- 1
  cumsum(x$var2)
}))
head(df,7)
# time grp var1 var2
# 1 1 1 A 1
# 2 2 1 B 2
# 3 3 1 A 2
# 4 4 1 B 2
# 5 5 2 A 1
# 6 6 2 B 2
# 7 7 2 A 2
Here's another solution using data.table that's pretty quick.
Generic Function
cum_n_distinct <- function(x, na.include = TRUE){
  # Given a vector x, returns a corresponding vector y
  # where the ith element of y gives the number of unique
  # elements observed up to and including index i.
  # If na.include = TRUE (the default), NA is counted as an
  # additional unique element; otherwise it's essentially ignored.
  temp <- data.table(x, idx = seq_along(x))
  firsts <- temp[temp[, .I[1L], by = x]$V1]
  if (na.include == FALSE) firsts <- firsts[!is.na(x)]
  y <- rep(0, times = length(x))
  y[firsts$idx] <- 1
  y <- cumsum(y)
  return(y)
}
Example Use
cum_n_distinct(c(5,10,10,15,5)) # 1 2 2 3 3
cum_n_distinct(c(5,NA,10,15,5)) # 1 2 3 4 4
cum_n_distinct(c(5,NA,10,15,5), na.include = FALSE) # 1 1 2 3 3
Solution To Your Question
d_out = df %>%
  arrange(time) %>%
  group_by(grp) %>%
  mutate(var2 = cum_n_distinct(var1))