Count with conditions in R dataframe

I have the following DF:
Week SKU Discount(%)
1 111 5
2 111 5
3 111 0
4 111 10
1 222 0
2 222 10
3 222 15
4 222 20
1 333 5
2 333 0
3 333 0
I would like to have this outcome:
Week SKU Discount(%) Duration LastDiscount
1 111 5 2 0
2 111 5 2 0
3 111 0 0 0
4 111 10 1 2
1 222 0 0 0
2 222 10 3 0
3 222 15 3 0
4 222 20 3 0
1 333 5 1 0
2 333 0 0 0
3 333 0 0 0
Duration is the number of weeks that 1 SKU had discounts continuously.
LastDiscount counts the number of weeks from the last time the SKU was on a continuous discount, only if there are weeks with 0 in between discounts.

One option for computing 'Duration' is, after grouping by 'SKU', to apply rle (run-length encoding) to the logical vector Discount > 0, take the lengths and values, and replicate each run's length across its rows. Similarly, 'LastDiscount' can be obtained by counting the discount runs (the sum of the logical run values).
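To see what rle() produces here, take SKU 111, whose discounts are 5, 5, 0, 10 (a small sketch on the sample data):
rle(c(5, 5, 0, 10) > 0)
# Run Length Encoding
#   lengths: int [1:3] 2 1 1
#   values : logi [1:3] TRUE FALSE TRUE
# rep(lengths * values, lengths) then gives 2 2 0 1, i.e. the 'Duration' column for SKU 111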
library(dplyr)
df1 %>%
  group_by(SKU) %>%
  mutate(Duration = with(rle(Discount > 0), rep(lengths * values, lengths)),
         temp = with(rle(Discount > 0), sum(values != 0)),
         LastDiscount = if (temp[1] > 1) c(rep(0, n() - 1), temp[1]) else 0) %>%
  select(-temp)
# A tibble: 11 x 5
# Groups: SKU [3]
# Week SKU Discount Duration LastDiscount
# <int> <int> <int> <int> <dbl>
# 1 1 111 5 2 0
# 2 2 111 5 2 0
# 3 3 111 0 0 0
# 4 4 111 10 1 2
# 5 1 222 0 0 0
# 6 2 222 10 3 0
# 7 3 222 15 3 0
# 8 4 222 20 3 0
# 9 1 333 5 1 0
#10 2 333 0 0 0
#11 3 333 0 0 0
Or using data.table
library(data.table)
i1 <- setDT(df1)[, grp := rleid(Discount > 0), SKU][
    Discount > 0, Duration := .N, .(grp, SKU)][
    , LastDiscount := uniqueN(grp[Discount > 0]), .(SKU)][
    , tail(.I[Discount > 0 & LastDiscount > 1], 1), SKU]$V1
df1[-i1, LastDiscount := 0][]
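For reference, rleid() numbers consecutive runs of equal values, so each block of discount/no-discount weeks gets its own id; for SKU 111, for example:
library(data.table)
rleid(c(5, 5, 0, 10) > 0)
# [1] 1 1 2 3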
data
df1 <- structure(list(Week = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L), SKU = c(111L, 111L, 111L, 111L, 222L, 222L, 222L, 222L,
333L, 333L, 333L), Discount = c(5L, 5L, 0L, 10L, 0L, 10L, 15L,
20L, 5L, 0L, 0L)), class = "data.frame", row.names = c(NA, -11L
))


R Only Keep Rows up to a certain condition

I have a dataframe as follows
head(data)
subject block trial timeLeft timeRight stim1 stim2 Chosen
1 1 13 0 0 0 2 1 2
2 1 13 1 0 1 3 2 2
3 1 13 3 0 0 3 1 1
4 1 13 4 2 0 2 3 3
5 1 13 6 1 1 1 3 1
6 1 13 7 2 2 2 1 1
...
454 1006 14 0 0 0 6 5 5
455 1006 14 1 0 0 6 4 6
456 1006 14 3 0 1 4 5 4
457 1006 14 4 1 1 4 5 4
458 1006 14 6 1 2 6 4 6
My objective is to group by subject and block and to keep only the rows up to and including the row where both timeLeft and timeRight are 0.
in this case the output would be
subject block trial timeLeft timeRight stim1 stim2 Chosen
1 1 13 0 0 0 2 1 2
2 1 13 1 0 1 3 2 2
3 1 13 3 0 0 3 1 1
...
454 1006 14 0 0 0 6 5 5
455 1006 14 1 0 0 6 4 6
Thank you in advance!
here is the structure of the data
'data.frame': 64748 obs. of 8 variables:
$ subject : num 1 1 1 1 1 1 1 1 1 1 ...
$ block : int 13 13 13 13 13 13 13 13 13 13 ...
$ trial : int 0 1 3 4 6 7 9 10 12 13 ...
$ timeLeft : int 0 0 0 2 1 2 2 1 3 4 ...
$ timeRight: int 0 1 0 0 1 2 1 3 4 4 ...
$ stim1 : int 2 3 3 2 1 2 2 3 2 2 ...
$ stim2 : int 1 2 1 3 3 1 3 1 1 1 ...
$ Chosen : int 2 2 1 3 1 1 2 1 2 2 ...
You may do this with the help of a custom function:
library(dplyr)
select_rows <- function(timeLeft, timeRight) {
  inds <- which(timeLeft == 0 & timeRight == 0)
  if (length(inds) >= 2) inds[1]:inds[2]
  else 0
}
data %>%
  group_by(subject, block) %>%
  slice(select_rows(timeLeft, timeRight)) %>%
  ungroup
# subject block trial timeLeft timeRight stim1 stim2 Chosen
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 1 13 0 0 0 2 1 2
#2 1 13 1 0 1 3 2 2
#3 1 13 3 0 0 3 1 1
#4 1006 14 0 0 0 6 5 5
#5 1006 14 1 0 0 6 4 6
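As a quick check, calling the helper on the subject 1 / block 13 values from the sample data finds zero-zero rows at positions 1 and 3, so it keeps rows 1:3:
select_rows(timeLeft = c(0, 0, 0, 2, 1, 2), timeRight = c(0, 1, 0, 0, 1, 2))
# [1] 1 2 3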
If the data is huge you may also do this with data.table -
library(data.table)
setDT(data)[, .SD[select_rows(timeLeft, timeRight)], .(subject, block)]
data
It is easier to help if you provide data in a reproducible format
data <- structure(list(subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1006L, 1006L,
1006L, 1006L, 1006L), block = c(13L, 13L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 14L), trial = c(0L, 1L, 3L, 4L, 6L, 7L, 0L,
1L, 3L, 4L, 6L), timeLeft = c(0L, 0L, 0L, 2L, 1L, 2L, 0L, 0L,
0L, 1L, 1L), timeRight = c(0L, 1L, 0L, 0L, 1L, 2L, 0L, 0L, 1L,
1L, 2L), stim1 = c(2L, 3L, 3L, 2L, 1L, 2L, 6L, 6L, 4L, 4L, 6L
), stim2 = c(1L, 2L, 1L, 3L, 3L, 1L, 5L, 4L, 5L, 5L, 4L), Chosen = c(2L,
2L, 1L, 3L, 1L, 1L, 5L, 6L, 4L, 4L, 6L)), class = "data.frame", row.names =
c("1", "2", "3", "4", "5", "6", "454", "455", "456", "457", "458"))
If you want to keep all rows up to where timeLeft and timeRight are both 0, you can try it this way.
Data
subject block trial timeLeft timeRight stim1 stim2 Chosen
1 1 13 0 0 0 2 1 2
2 1 13 1 0 1 3 2 2
3 1 13 3 0 0 3 1 1
4 1 13 4 2 0 2 3 3
5 1 13 6 1 1 1 3 1
6 1 13 7 2 2 2 1 1
7 1006 14 0 0 1 6 5 5
8 1006 14 0 0 0 6 5 5
9 1006 14 1 0 0 6 4 6
10 1006 14 3 0 1 4 5 4
11 1006 14 4 1 1 4 5 4
12 1006 14 6 1 2 6 4 6
I added one more row for subject 1006 so that its first row is not 0,0.
Code
df %>%
  group_by(subject) %>%
  mutate(key = max(which(timeLeft == 0 & timeRight == 0))) %>%
  slice(1:key)
subject block trial timeLeft timeRight stim1 stim2 Chosen key
<int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 13 0 0 0 2 1 2 3
2 1 13 1 0 1 3 2 2 3
3 1 13 3 0 0 3 1 1 3
4 1006 14 0 0 1 6 5 5 3
5 1006 14 0 0 0 6 5 5 3
6 1006 14 1 0 0 6 4 6 3
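If the helper column should not appear in the result, the grouping and the key column can be dropped afterwards (a sketch of the same pipeline, assuming df is the modified data frame above and dplyr is loaded):
library(dplyr)
df %>%
  group_by(subject) %>%
  mutate(key = max(which(timeLeft == 0 & timeRight == 0))) %>%
  slice(seq_len(first(key))) %>%
  ungroup() %>%
  select(-key)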
You can filter for only rows that meet the condition and then group
data %>%
filter(timeLeft > 0 & timeRight > 0) %>%
group_by(subject, block)

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
library(dplyr)
bb <- as_tibble(expand.grid(v1 = 0:2, v2 = 0:2)) %>%
  arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criterion ...
Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
shifter <- function(x, n) if (n == 0) x else c(tail(x, -n), head(x, n))
res <- `rownames<-`(Reduce(rbind, lapply(seq(length(dfs <- split(df, rep(0:2, 3)))),
  function(k) {
    dfs[[k]][, 2] <- shifter(dfs[[k]][, 1], k - 1)
    dfs[[k]]
  })), seq(nrow(df)))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations in the desired format is given below:
genAllCombn <- function(n) {
  v1 <- rep(0:(n-1), n)
  v2 <- (v1 + rep(0:(n-1), 1, each = n)) %% n
  return(data.frame(v1, v2))
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2
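As a sanity check, genAllCombn(3) reproduces the nine-row arrangement asked for in the question:
genAllCombn(3)
#   v1 v2
# 1  0  0
# 2  1  1
# 3  2  2
# 4  0  1
# 5  1  2
# 6  2  0
# 7  0  2
# 8  1  0
# 9  2  1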

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

I need every four rows to be sorted by the 4th column, separately from the next four rows, so I made a function:
for (i in seq(1, nrow(data_frame), by = 4)) {
  data_frame[i:(i+3), 4] <- sort(data_frame[i:(i+3), 4], decreasing = TRUE)
}
The problem is that only the 4th column gets sorted while the rest of each row stays in place.
from
x y z userID
1 2 5 1
1 1 2 2
0 0 5 5
1 2 5 3
1 1 2 6
0 0 5 4
1 1 2 1
0 0 5 5
to
x y z userID
1 2 5 5
1 1 2 3
0 0 5 2
1 2 5 1
1 1 2 6
0 0 5 5
1 1 2 4
0 0 5 1
With tidyverse, we can use %/% to create a grouping column and use that to sort the 'userID':
library(tidyverse)
df1 %>%
  group_by(grp = (row_number() - 1) %/% 4 + 1) %>%
  # or use
  # group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
  mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
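The integer division simply maps each row position to a block of four; for the eight rows here:
(seq_len(8) - 1) %/% 4 + 1
# [1] 1 1 1 1 2 2 2 2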
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")
In base R, we can split the data into chunks of 4 rows, order each chunk by the fourth column, and return the updated data frame.
df[] <- do.call(rbind, lapply(split(df, gl(nrow(df)/4, 4)),
function(p) p[order(p[[4]], decreasing = TRUE), ]))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
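gl() builds the block factor directly: gl(n, k) creates a factor with n levels, each repeated k times.
gl(2, 4)
# [1] 1 1 1 1 2 2 2 2
# Levels: 1 2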
A tidyverse approach using the same logic would be
library(tidyverse)
df %>%
  group_split(gl(n()/4, 4), keep = FALSE) %>%
  map_dfr(. %>% arrange(desc(userID)))

R - Insert Missing Numbers in A Sequence by Group's Max Value

I'd like to insert missing numbers in the index column following these conditions:
Partitioned by multiple columns
The minimum value is always 1
The maximum value is always the maximum for the group and type
Current Data:
group type index vol
A 1 1 200
A 1 2 244
A 1 5 33
A 2 2 66
A 2 3 2
A 2 4 199
A 2 10 319
B 1 4 290
B 1 5 188
B 1 6 573
B 1 9 122
Desired Data:
group type index vol
A 1 1 200
A 1 2 244
A 1 3 0
A 1 4 0
A 1 5 33
A 2 1 0
A 2 2 66
A 2 3 2
A 2 4 199
A 2 5 0
A 2 6 0
A 2 7 0
A 2 8 0
A 2 9 0
A 2 10 319
B 1 1 0
B 1 2 0
B 1 3 0
B 1 4 290
B 1 5 188
B 1 6 573
B 1 7 0
B 1 8 0
B 1 9 122
I've just added in spaces between the partitions for clarity.
Hope you can help out!
You can do the following
library(dplyr)
library(tidyr)
my_df %>%
group_by(group, type) %>%
complete(index = 1:max(index), fill = list(vol = 0))
# group type index vol
# 1 A 1 1 200
# 2 A 1 2 244
# 3 A 1 3 0
# 4 A 1 4 0
# 5 A 1 5 33
# 6 A 2 1 0
# 7 A 2 2 66
# 8 A 2 3 2
# 9 A 2 4 199
# 10 A 2 5 0
# 11 A 2 6 0
# 12 A 2 7 0
# 13 A 2 8 0
# 14 A 2 9 0
# 15 A 2 10 319
# 16 B 1 1 0
# 17 B 1 2 0
# 18 B 1 3 0
# 19 B 1 4 290
# 20 B 1 5 188
# 21 B 1 6 573
# 22 B 1 7 0
# 23 B 1 8 0
# 24 B 1 9 122
With group_by you specify the groups you indicated with the white spaces. With complete you specify which columns should be completed and what values should be filled in for the remaining columns (the default would be NA).
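A minimal illustration of complete() with fill on a toy example (assuming dplyr and tidyr are loaded as above):
data.frame(index = c(1, 3), vol = c(10, 20)) %>%
  complete(index = 1:max(index), fill = list(vol = 0))
# A tibble: 3 x 2
#   index   vol
# 1     1    10
# 2     2     0
# 3     3    20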
Data
my_df <-
structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
type = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L),
index = c(1L, 2L, 5L, 2L, 3L, 4L, 10L, 4L, 5L, 6L, 9L),
vol = c(200L, 244L, 33L, 66L, 2L, 199L, 319L, 290L, 188L, 573L, 122L)),
class = "data.frame", row.names = c(NA, -11L))
One dplyr and tidyr possibility could be:
df %>%
  group_by(group, type) %>%
  complete(index = full_seq(1:max(index), 1), fill = list(vol = 0))
group type index vol
<fct> <int> <dbl> <dbl>
1 A 1 1 200
2 A 1 2 244
3 A 1 3 0
4 A 1 4 0
5 A 1 5 33
6 A 2 1 0
7 A 2 2 66
8 A 2 3 2
9 A 2 4 199
10 A 2 5 0
11 A 2 6 0
12 A 2 7 0
13 A 2 8 0
14 A 2 9 0
15 A 2 10 319
16 B 1 1 0
17 B 1 2 0
18 B 1 3 0
19 B 1 4 290
20 B 1 5 188
21 B 1 6 573
22 B 1 7 0
23 B 1 8 0
24 B 1 9 122

Is there a way to get rowIDs of the first occurrences of a value from code chunks?

Is there a way to get this done?
Example (the distribution is random):
ID size
1 x
2 x
3 x
4 x
5 x
0 2
0 x
0 x
0 x
4 x
5 x
0 4
0 x
0 x
0 x
4 x
5 x
0 3
0 x
0 x
0 x
4 x
5 x
This is just an example, but it is very hard for me to code. The x's are random numeric values and not relevant. The values I need are the integers shown in the size column, so every time ID == 0 occurs, I need the first size value.
Use dplyr::lag() to create a new column which lags one row behind ID. If d is your data.frame:
d <- d %>% dplyr::mutate(prevID = lag(ID))
ID size prevID
1 1 44 NA
2 2 55 1
3 3 66 2
4 4 77 3
5 5 88 4
6 0 2 5
7 0 33 0
8 0 44 0
9 0 55 0
10 4 66 0
11 5 77 4
12 0 4 5
13 0 11 0
14 0 22 0
15 0 33 0
16 4 44 0
17 5 55 4
18 0 3 5
19 0 44 0
20 0 55 0
21 0 66 0
22 4 77 0
23 5 88 4
Then get the rows where ID is 0 and prevID is not 0 - these are the first 0 rows of each run:
> which(d$ID == 0 & d$prevID != 0)
[1] 6 12 18
Or use this to filter the original data.frame:
> d[which(d$ID == 0 & d$prevID != 0), ]
# A tibble: 3 x 3
ID size prevID
<int> <int> <int>
1 0 2 5
2 0 4 5
3 0 3 5
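A more compact variant of the same idea, filtering directly on the lagged ID (a sketch, assuming dplyr is loaded):
library(dplyr)
d %>% filter(ID == 0 & lag(ID, default = 1) != 0)
# keeps rows 6, 12 and 18 - the first ID == 0 row of each run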
Here is an idea using rleid from data.table,
library(data.table)
setDT(dt)[, grp := rleid(ID == 0)][ID == 0, .(size = first(size)), by = grp]
which gives,
grp size
1: 2 2
2: 4 4
3: 6 3
In the tidyverse, one idea can be,
library(tidyverse)
df %>%
  mutate(grp = cumsum(ID != 0)) %>%
  filter(ID == 0) %>%
  group_by(grp) %>%
  summarise(size = first(size))
which gives,
# A tibble: 3 x 2
grp size
<int> <fctr>
1 5 2
2 7 4
3 9 3
Or a base R solution:
df <- read.table(text = "
ID size
1 1
2 5
3 6
4 7
5 8
0 2
0 5
0 7
0 9
4 0
5 3
0 4
0 5
0 1
0 4
4 7
5 9
0 3
0 5
0 6
0 9
4 9
5 4", header = T)
ids <- which(df$ID == 0)
temp <- c(TRUE, (diff(ids) != 1))
df$size[ids[temp]]
#[1] 2 4 3
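To see why this picks out the run starts: ids holds the row positions where ID is 0, diff(ids) equals 1 inside a run of consecutive zeros and jumps at a new run, so prefixing TRUE marks the first zero row of every run.
ids
# [1]  6  7  8  9 12 13 14 15 18 19 20 21
c(TRUE, diff(ids) != 1)
# [1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE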
library(dplyr)
df %>%
  mutate(row_idx = row_number()) %>%
  filter(ID == 0) %>%
  filter(row_idx - lag(row_idx) > 1 | row_number() == 1) %>%
  select(-row_idx)
Output is:
1 0 2
2 0 4
3 0 3
#sample data
> dput(df)
structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 0L, 0L, 0L, 0L, 4L,
5L, 0L, 0L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 4L, 5L), size = structure(c(4L,
4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 4L, 4L, 4L, 4L, 4L), .Label = c("2", "3", "4", "x"), class = "factor")), .Names = c("ID",
"size"), class = "data.frame", row.names = c(NA, -23L))
