I have a dataframe which looks like the following:
d b c a
1 3400 100 3 -1
2 3400 50 3 1
3 3400 100 1 -1
4 3408 50 1 1
5 3412 100 3 1
6 3423 50 1 1
7 3434 100 1 1
8 3436 100 3 1
9 3438 50 3 1
10 3445 50 1 1
11 3454 100 3 1
12 3465 100 1 1
and I want to group by column a and b based on the condition that the group starts with column c value= 3 and the group ends if the column d value is + 30 ahead of the first group entry(So the interval length = 30, but the starting point of every interval can be in another interval). Then I want to count the rows in each group.
So the expected output for this sample should be:
b a rowcount
100 -1 2 ( starting at d = 3400)
50 1 3 ( starting at d = 3400)
100 1 3 (starting at d= 3412)
50 1 2 (starting at d= 3438)
100 1 2 (starting at d= 3454)
I tried:
df<-df%>%
group_by(b,a,first(c) == 3 & lead(d) - d < 30)
summarise(number = n())
but this does not give me the desired output. Any comments are appreciated!
UPDATE: New Example:
d b c a
1 3400 100 3 1
2 3400 100 3 1
3 3400 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3434 100 3 1
7 3436 100 1 1
8 3438 100 3 1
9 3445 100 1 1
10 3443 100 3 1
11 3444 100 1 1
12 3463 100 3 1
13 3463 100 1 1
14 3463 100 3 1
Your code gives as output:
a b count desc addition_info
<dbl> <dbl> <int> <chr> <chr>
1 1 100 5 ( starting at d = 3400) There is 3 `c == 3` in this group
2 1 100 6 ( starting at d = 3434) There is 3 `c == 3` in this group
3 1 100 3 ( starting at d = 3463) There is 2 `c == 3` in this group
but the third group is wrong, since the difference in d = 29 and therefore <30. Why is this the case? So the right output in this example should be:
a b count desc addition_info
<dbl> <dbl> <int> <chr> <chr>
1 1 100 5 ( starting at d = 3400) There is 3 `c == 3` in this group
2 1 100 9 ( starting at d = 3434) There is 3 `c == 3` in this group
Tweaked your example a bit, still I think this will suffice largely. (Also see notes below the code)
# Sample data for the updated example. The header names four columns while
# each data row has five fields, so read.table() uses the first field of every
# row as its row name.
df <- read.table(text = " d b c a
1 3400 100 3 1
2 3400 100 3 1
3 3400 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3434 100 3 1
7 3436 100 1 1
8 3438 100 3 1
9 3445 100 1 1
10 3443 100 3 1
11 3444 100 1 1
12 3463 100 3 1
13 3463 100 1 1
14 3463 100 3 1
15 3465 100 3 1", header = TRUE)
#added one row in df
> df
d b c a
1 3400 100 3 1
2 3400 100 3 1
3 3400 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3434 100 3 1
7 3436 100 1 1
8 3438 100 3 1
9 3445 100 1 1
10 3443 100 3 1
11 3444 100 1 1
12 3463 100 3 1
13 3463 100 1 1
14 3463 100 3 1
15 3465 100 3 1
Now follow this strategy
library(tidyverse)
library(data.table) # for rleid()
df %>% mutate(r = row_number()) %>%
group_by(b, a) %>% mutate(grp_no = rleid(accumulate(d, ~ifelse(.y - .x > 30, .y, .x)))) %>%
group_by(b, a, grp_no) %>%
summarise(row_count = n(), r = first(r), d = first(d)) %>%
arrange(r) %>%
mutate(additional = paste("group starts at d =", d)) %>%
select(-r, -d)
# A tibble: 3 x 5
# Groups: b, a [1]
b a grp_no row_count additional
<int> <int> <int> <int> <chr>
1 100 1 1 5 group starts at d = 3400
2 100 1 2 9 group starts at d = 3434
3 100 1 3 1 group starts at d = 3465
With first example, its output is
# A tibble: 5 x 5
# Groups: b, a [3]
b a grp_no row_count additional
<int> <int> <int> <int> <chr>
1 100 -1 1 2 group starts at d = 3400
2 50 1 1 3 group starts at d = 3400
3 100 1 1 3 group starts at d = 3412
4 50 1 2 2 group starts at d = 3438
5 100 1 2 2 group starts at d = 3454
Note: you may also use dplyr::dense_rank instead of rleid in above syntax, like this
df %>% mutate(r = row_number()) %>%
group_by(b, a) %>%
mutate(grp_no = dense_rank(accumulate(d, ~ifelse(.y - .x > 30, .y, .x)) )) %>%
group_by(b, a, grp_no) %>%
summarise(row_count = n(), r = first(r), d = first(d)) %>%
arrange(r) %>%
mutate(additional = paste("group starts at d =", d)) %>%
select(-r, -d)
EndNote: Now I am not sure how your logic of c==3 fits into this. If you clarify, I may try again.
library(dplyr, warn.conflicts = FALSE)
library(purrr)
options(scipen = 999)
data <- structure(list(d = c(3400L, 3400L, 3400L, 3408L, 3412L, 3423L,
3434L, 3436L, 3438L, 3445L, 3454L, 3645L), b = c(100L, 50L, 100L,
50L, 100L, 50L, 100L, 100L, 50L, 50L, 100L, 100L), c = c(3L,
3L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L), a = c(-1L, 1L, -1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA, -12L), class = "data.frame")
# split data into group of a,b
new_data <- data %>%
group_by(a, b) %>%
group_split()
# Count rows per window within one (a, b) slice of the data.
#
# Assumes that every c == 1 row has a c == 3 row before it, so windows only
# need to be formed from d: a window opens at the first row of the slice and
# stays open while `d` is less than `width` ahead of the row that opened it.
# The first row that is `width` or more ahead opens the next window.
#
# df:    data frame with columns a, b, c and d, rows in processing order
# width: window length in units of d (default 30)
#
# Returns one row per (a, b, window) with columns a, b, count, desc and
# addition_info.
group_function <- function(df, width = 30) {
  # anchor[i] is the d value of the row that opened row i's window. Fixed bins
  # laid out from min(d) are not used because they split rows such as d = 3434
  # and d = 3463 (29 apart) whenever a bin edge falls between them.
  anchor <- Reduce(
    function(start, current) if (current - start >= width) current else start,
    df$d,
    accumulate = TRUE
  )
  df %>%
    # for width > 0 a new anchor is always larger than the previous one, so
    # numbering anchors by first appearance numbers the windows in order
    mutate(group_index = match(anchor, unique(anchor))) %>%
    group_by(a, b, group_index) %>%
    summarize(count = n(),
              desc = sprintf("( starting at d = %s)", first(d)),
              # the count of c == 3 rows is reported so that windows which do
              # not follow the "starts with c == 3" logic are easy to spot
              addition_info = paste0("There is ",
                                     sum(c == 3), " `c == 3` in this group"),
              .groups = "drop") %>%
    select(-group_index)
}
new_data %>%
map_dfr(group_function)
#> # A tibble: 6 x 5
#> a b count desc addition_info
#> <int> <int> <int> <chr> <chr>
#> 1 -1 100 2 ( starting at d = 3400) There is 1 `c == 3`` in this group
#> 2 1 50 3 ( starting at d = 3400) There is 1 `c == 3`` in this group
#> 3 1 50 2 ( starting at d = 3438) There is 1 `c == 3`` in this group
#> 4 1 100 3 ( starting at d = 3412) There is 2 `c == 3`` in this group
#> 5 1 100 1 ( starting at d = 3454) There is 1 `c == 3`` in this group
#> 6 1 100 1 ( starting at d = 3645) There is 0 `c == 3`` in this group
Created on 2021-04-04 by the reprex package (v1.0.0)
Update: included the logics described.
Related
the data example is like this below:
A B C
0 1 2
0 1 2
1 10 15
1 5 15
0 2 5
0 3 5
1 20 50
1 30 50
Above, A and B are the original data, and C is the column that I want to create. C is the sum of B within each run of adjacent rows that share the same A value. Rows that have the same A value but are not adjacent (for example, the two separate blocks of A=0) must not be summed together.
tidyverse
library(tidyverse)
df <- data.frame(
A = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
B = c(1L, 1L, 10L, 5L, 2L, 3L, 20L, 30L)
)
df %>%
group_by(grp = data.table::rleid(A)) %>%
mutate(res = sum(B)) %>%
ungroup() %>%
select(-grp)
#> # A tibble: 8 × 3
#> A B res
#> <int> <int> <int>
#> 1 0 1 2
#> 2 0 1 2
#> 3 1 10 15
#> 4 1 5 15
#> 5 0 2 5
#> 6 0 3 5
#> 7 1 20 50
#> 8 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)[, res := sum(B), by = rleid(A)][]
#> A B res
#> 1: 0 1 2
#> 2: 0 1 2
#> 3: 1 10 15
#> 4: 1 5 15
#> 5: 0 2 5
#> 6: 0 3 5
#> 7: 1 20 50
#> 8: 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
base
# Number the runs of identical adjacent A values with rle(), so that this
# variant needs nothing beyond base R, then sum B within each run.
df$res <- with(df, {
  runs <- rle(A)
  ave(B, rep(seq_along(runs$lengths), runs$lengths), FUN = sum)
})
df
#> A B res
#> 1 0 1 2
#> 2 0 1 2
#> 3 1 10 15
#> 4 1 5 15
#> 5 0 2 5
#> 6 0 3 5
#> 7 1 20 50
#> 8 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
library(tidyverse)
data <- tribble(
~A, ~B, ~C,
0, 1, 2,
0, 1, 2,
1, 10, 15,
1, 5, 15,
0, 2, 5,
0, 3, 5,
1, 20, 50,
1, 30, 50
)
data <- data %>% select(-C)
# Label each run of identical adjacent values in `x` with a run number.
#
# x: an atomic vector.
#
# Returns an integer vector as long as `x`: 1 for the first run, 2 for the
# next one, and so on. A zero-length `x` gives a zero-length result.
rle_group <- function(x) {
  runs <- rle(x)
  # seq_along() rather than seq(length(.)): with zero runs the latter yields
  # c(1, 0), which inverse.rle() cannot expand.
  runs$values <- seq_along(runs$values)
  inverse.rle(runs)
}
data %>%
mutate(
group = rle_group(A)
) %>%
group_by(group) %>%
mutate(
C = sum(B)
)
#> # A tibble: 8 × 4
#> # Groups: group [4]
#> A B group C
#> <dbl> <dbl> <int> <dbl>
#> 1 0 1 1 2
#> 2 0 1 1 2
#> 3 1 10 2 15
#> 4 1 5 2 15
#> 5 0 2 3 5
#> 6 0 3 3 5
#> 7 1 20 4 50
#> 8 1 30 4 50
Created on 2022-04-27 by the reprex package (v2.0.0)
A dplyr solution without any rle functions:
library(dplyr)
df %>%
group_by(grp = cumsum(c(TRUE, diff(A) != 0))) %>%
mutate(C = sum(B)) %>%
ungroup() %>%
select(-grp)
# A tibble: 8 x 3
A B C
<int> <int> <int>
1 0 1 2
2 0 1 2
3 1 10 15
4 1 5 15
5 0 2 5
6 0 3 5
7 1 20 50
8 1 30 50
Its base equivalent:
within(df, C <- ave(B, cumsum(c(TRUE, diff(A) != 0)), FUN = sum))
A B C
1 0 1 2
2 0 1 2
3 1 10 15
4 1 5 15
5 0 2 5
6 0 3 5
7 1 20 50
8 1 30 50
I have a dataframe of the following form:
id A B C
1 2 1 3
2 2 1 1
3 1 1 3
. . . .
. . . .
id is a unique identifier variable and A is a categorical variable with p levels. What is the most straightforward way to transform the dataframe into a form where each row is repeated p times, and instead of A as a categorical variable for each row, it is a dummy variable which displays 1 in the row corresponding to the level of A for that id, and 0 otherwise? For example, the transformed dataframe above would look like this for 3 levels of A:
id A B C
1 0 1 3
1 1 1 3
1 0 1 3
2 0 1 1
2 1 1 1
2 0 1 1
3 1 1 3
3 0 1 3
3 0 1 3
. . . .
. . . .
Apologies if the title didn't specify the nature of this problem properly or it has already been asked: I'm not well versed in R so I don't really know how to ask this question in a concise way or search for it. Thanks!
Use uncount to replicate the rows and then change the values of 'A' by creating a logical vector with sequence of rows (row_number()) after doing a group by 'id'
library(dplyr)
library(tidyr)
p <- 3
df1 %>%
uncount(p) %>%
group_by(id) %>%
mutate( A = +(row_number() == A)) %>%
ungroup
-output
# A tibble: 9 × 4
id A B C
<int> <int> <int> <int>
1 1 0 1 3
2 1 1 1 3
3 1 0 1 3
4 2 0 1 1
5 2 1 1 1
6 2 0 1 1
7 3 1 1 3
8 3 0 1 3
9 3 0 1 3
Or the similar option in base R with rep and ave
transform(df1[rep(seq_len(nrow(df1)), each = p),],
A = +(A == ave(A, id, FUN = seq_along)))
data
df1 <- structure(list(id = 1:3, A = c(2L, 2L, 1L), B = c(1L, 1L, 1L),
C = c(3L, 1L, 3L)), class = "data.frame", row.names = c(NA,
-3L))
This might work:
library(tidyverse)
df <- data.frame(id = c(1:3),
a = c(2, 2, 1),
b = c(1, 1, 1),
c = c(3, 1, 3))
df
# rep() applied to a data frame repeats its columns and returns a plain list,
# so each row is repeated three times by indexing with repeated row numbers.
df[rep(seq_len(nrow(df)), each = 3), ] %>%
  arrange(id) %>%
  group_by(id) %>%
  # within each id, the row whose position equals the level stored in `a`
  # gets 1, every other row gets 0
  mutate(a = ifelse(row_number() == a, 1, 0)) %>%
  ungroup()
# id a b c
# <int> <dbl> <dbl> <dbl>
# 1 1 0 1 3
# 2 1 1 1 3
# 3 1 0 1 3
# 4 2 0 1 1
# 5 2 1 1 1
# 6 2 0 1 1
# 7 3 1 1 3
# 8 3 0 1 3
# 9 3 0 1 3
This is my df:
group value
1 10
1 20
1 25
2 5
2 10
2 15
I now want to compute differences between each value of a group and a reference value, which is the first row of a group. More precisely:
group value diff
1 10 NA # because this is the reference for group 1
1 20 10 # value[2] - value[1]
1 25 15 # value[3] - value[1]
2 5 NA # because this is the reference for group 2
2 10 5 # value[5] - value[4]
2 15 10 # value[6] - value[4]
I found good answers for difference scores of the previous line (e.g., lag-function in dplyr, shift-function in data.table). However, I am looking for a fixed reference point and I couldn't make it work.
Try the code below
# Subtract each group's first value from its later values. The first row of a
# group is the reference and gets NA. (diff(x) would give the difference to
# the previous row instead, e.g. 25 - 20 = 5 rather than 25 - 10 = 15.)
transform(
  df,
  Diff = ave(value, group, FUN = function(x) c(NA, x[-1] - x[1]))
)
# which gives
#   group value Diff
# 1     1    10   NA
# 2     1    20   10
# 3     1    25   15
# 4     2     5   NA
# 5     2    10    5
# 6     2    15   10
I think you can also use this:
library(dplyr)
df %>%
group_by(group) %>%
mutate(diff = value - value[1],
diff = replace(diff, row_number() == 1, NA))
# A tibble: 6 x 3
# Groups: group [2]
group value diff
<int> <int> <int>
1 1 10 NA
2 1 20 10
3 1 25 15
4 2 5 NA
5 2 10 5
6 2 15 10
df <-
structure(list(
group = c(1L, 1L, 1L, 2L, 2L, 2L),
value = c(10L,
20L, 25L, 5L, 10L, 15L)
),
class = "data.frame",
row.names = c(NA,
-6L))
library(tidyverse)
df %>%
group_by(group) %>%
mutate(DIFF = ifelse(row_number() == 1, NA, value - first(value))) %>%
ungroup()
#> # A tibble: 6 x 3
#> group value DIFF
#> <int> <int> <int>
#> 1 1 10 NA
#> 2 1 20 10
#> 3 1 25 15
#> 4 2 5 NA
#> 5 2 10 5
#> 6 2 15 10
Created on 2021-06-18 by the reprex package (v2.0.0)
I have a dataframe:
a<-c(1,1,1,1,1,1,1,1,1,1,1)
b<-c(100,100,100,100,100,100,100,100,100,100,100)
c<-c(1,3,1,1,3,1,1,3,1,1,3)
d<-c(3400,3403,3407,3408,3412,3423,3434,3436,3445,3454,3645)
df<-data.frame(d,b,c,a)
df
d b c a
1 3400 100 1 1
2 3403 100 3 1
3 3407 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3423 100 1 1
7 3434 100 1 1
8 3436 100 3 1
9 3445 100 1 1
10 3454 100 1 1
11 3645 100 3 1
and i want to filter always a rowpair, which fulfills the following condition: the column c value of the first row must be 3, the column c value of the second row must be 1 and the column d value between the pair has to be <10.
So the expected output in this example should be:
d b c a
2 3403 100 3 1
3 3407 100 1 1
8 3436 100 3 1
9 3445 100 1 1
I tried the following:
filter(df,first(c)==3,nth(c,2)==1,any(diff(d) < 10))
but for some reason, it does not work. Thanks for your help!
You can first establish the indices of the first-pair parts using which:
library(dplyr)
inds <- which(df$c == 3 & lead(df$c) == 1 & lead(df$d) - df$d < 10)
and then subset your dataframe on the indices plus 1:
df[sort(unique(c(inds, inds + 1))),]
d b c a
2 3403 100 3 1
3 3407 100 1 1
8 3436 100 3 1
9 3445 100 1 1
Alternatively, you can do:
library(dplyr)
df1 <- df %>% # get the first row
filter(c == 3 & lead(c) == 1 & lead(d) - d < 10)
df2 <- df %>% # get the second row
filter(lag(c) == 3 & c == 1 & d - lag(d) < 10)
arrange(rbind(df1, df2), d) # bind the two together and arange by d
The following code is not simple but it produces the expected result.
library(dplyr)
df %>%
mutate(flag = cumsum(c == 3)) %>%
group_by(flag) %>%
slice_head(n = 2) %>%
filter(n() > 1) %>%
mutate(flag = flag*(diff(d) < 10)) %>%
ungroup() %>%
filter(flag > 0) %>%
select(-flag)
## A tibble: 4 x 4
# d b c a
# <dbl> <dbl> <dbl> <dbl>
#1 3403 100 3 1
#2 3407 100 1 1
#3 3436 100 3 1
#4 3445 100 1 1
segment1<-c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
segment2<-c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
rating <-c(1,1,2,2,1,3,1,1,2,2,3,2,2,2,1,1,2,3,3,1,1,2,3,2)
data<-data.frame(segment1,segment2,rating)
df1<-aggregate(
x = data.frame(list("count"=data$rating)),
by = data.frame(list("segment1"=data$segment1, "segment2"=data$segment2, "rating"=data$rating)),
FUN = length
)
df2<-aggregate(
x = data.frame(list("count"=df1$count)),
by = data.frame(list("segment1"=df1$segment1, "segment2"=df1$segment2)),
FUN = sum
)
df2$rating1<-0
df2$rating2<-0
df2$rating3<-0
df1
segment1 segment2 rating count
1 1 1 1 6
2 1 2 1 3
3 1 1 2 7
4 1 2 2 3
5 1 1 3 2
6 1 2 3 3
df2
segment1 segment2 count rating1 rating2 rating3
1 1 1 15 0 0 0
2 1 2 9 0 0 0
I need this output for df2
segment1 segment2 count rating1 rating2 rating3
1 1 1 15 6 7 2
2 1 2 9 3 3 3
You can do this with just a group_by/summarize combination. You do not need to create any intermediate data frames and join them. Here is the code:
data %>%
group_by(segment1, segment2) %>%
summarize(
count = n(),
rating1 = sum(rating == 1),
rating2 = sum(rating == 2),
rating3 = sum(rating == 3)
)
This gives you this:
# A tibble: 2 x 6
# Groups: segment1 [?]
segment1 segment2 count rating1 rating2 rating3
<dbl> <dbl> <int> <int> <int> <int>
1 1 1 15 6 7 2
2 1 2 9 3 3 3
My input to get the same dataframe as you :
(Maybe post the way you aggregate the first dataframe to get the second one)
> df1 <- read.table(text ="segment1 segment2 rating count
1 1 1 1 1415
2 2 2 1 272
3 1 1 2 1970
4 2 2 2 363
5 1 1 3 8484
6 2 2 3 1465
7 1 1 4 33619
8 2 2 4 5332
9 1 1 5 58173
0 2 2 5 12031",header=T)
> df2 <- read.table(text =" segment1 segment2 score count avg rating_1 rating_2 rating_3 rating_4 rating_5
1 1 1 456148 103661 4.4 0 0 0 0 0
2 2 2 86876 19463 4.5 0 0 0 0 0",header=T)
> df2
segment1 segment2 score count avg rating_1 rating_2 rating_3 rating_4 rating_5
1 1 1 456148 103661 4.4 0 0 0 0 0
2 2 2 86876 19463 4.5 0 0 0 0 0
My script maybe isn't the best looking one but it will work for any number of segments only if segment1==segment2==segment3... (like in the example you provided, is that always the case ???)
> n_seg=2 # number of segments
n_rat <- 5L # number of ratings
# Address the target columns by name rather than by position (5 + j), so the
# loop does not depend on how many columns precede rating_1 in df2.
rating_cols <- paste0("rating_", seq_len(n_rat))
for (i in seq_len(n_seg)) { # segment; row i of df2 is taken to be segment i
  for (j in seq_len(n_rat)) { # rating
    matched <- df1$count[df1$segment1 == i & df1$rating == j]
    # sum() adds 0 when a segment has no row for rating j, where assigning the
    # empty subset directly would stop with an error
    df2[i, rating_cols[j]] <- df2[i, rating_cols[j]] + sum(matched)
  }
}
My output :
> df2
segment1 segment2 score count avg rating_1 rating_2 rating_3 rating_4 rating_5
1 1 1 456148 103661 4.4 1415 1970 8484 33619 58173
2 2 2 86876 19463 4.5 272 363 1465 5332 12031
Maybe this will work for you
library(dplyr)
library(tidyr)
df3 <- df2 %>%
select(segment1, segment2, count)
df3
# segment1 segment2 count
# 1 1 1 15
# 2 1 2 9
df1 %>%
mutate(rating = paste0("rating", rating)) %>%
spread(rating, count) %>%
left_join(., df3, by=c("segment1", "segment2")) %>%
select(segment1, segment2, count, rating1, rating2, rating3)
# segment1 segment2 count rating1 rating2 rating3
# 1 1 1 15 6 7 2
# 2 1 2 9 3 3 3
To save the answer
ans <- df1 %>%
mutate(rating = paste0("rating", rating)) %>%
spread(rating, count) %>%
left_join(., df3, by=c("segment1", "segment2")) %>%
select(segment1, segment2, count, rating1, rating2, rating3)