How to mark rows in groups between row numbers based on value? - r

here is a table example:
dt <- data.frame(cat = rep(c("A", "B", "C"), c(10, 8, 10)), value=c(0,0,0,3,5,0,8,0,0,0,0,0,2,2,3,0,1,0,0,0,0,0,1,2,3,4,0,0))
dt
cat value
1 A 0
2 A 0
3 A 0
4 A 3
5 A 5
6 A 0
7 A 8
8 A 0
9 A 0
10 A 0
11 B 0
12 B 0
13 B 2
14 B 2
15 B 3
16 B 0
17 B 1
18 B 0
19 C 0
20 C 0
21 C 0
22 C 0
23 C 1
24 C 2
25 C 3
26 C 4
27 C 0
28 C 0
What I would like to do is flag, within each group, the rows between the first and the last non-zero value (flag 1), and mark the rows after the last non-zero value differently (flag 2); rows before the first non-zero value keep flag 0. So, the final table should look like:
cat value flag
1 A 0 0
2 A 0 0
3 A 0 0
4 A 3 1
5 A 5 1
6 A 0 1
7 A 8 1
8 A 0 2
9 A 0 2
10 A 0 2
11 B 0 0
12 B 0 0
13 B 2 1
14 B 2 1
15 B 3 1
16 B 0 1
17 B 1 1
18 B 0 2
19 C 0 0
20 C 0 0
21 C 0 0
22 C 0 0
23 C 1 1
24 C 2 1
25 C 3 1
26 C 4 1
27 C 0 2
28 C 0 2
Thanks a lot in advance,

Does this work:
library(dplyr)
dt %>%
  group_by(cat) %>%
  # c1 is the running total of `value` within each category. Assuming values
  # are non-negative, it is 0 before the first non-zero value and equals the
  # group maximum from the last non-zero value onwards.
  mutate(
    c1 = cumsum(value),
    flat = case_when(
      c1 == 0 ~ 0,                     # still before the first non-zero value
      value == 0 & c1 == max(c1) ~ 2,  # zero rows after the last non-zero value
      TRUE ~ 1                         # everything in between
    )
  ) %>%
  # Drop the helper column c1 and print all rows
  select(cat, value, flat) %>%
  print(n = 50)
# A tibble: 28 x 3
# Groups: cat [3]
cat value flat
<chr> <dbl> <dbl>
1 A 0 0
2 A 0 0
3 A 0 0
4 A 3 1
5 A 5 1
6 A 0 1
7 A 8 1
8 A 0 2
9 A 0 2
10 A 0 2
11 B 0 0
12 B 0 0
13 B 2 1
14 B 2 1
15 B 3 1
16 B 0 1
17 B 1 1
18 B 0 2
19 C 0 0
20 C 0 0
21 C 0 0
22 C 0 0
23 C 1 1
24 C 2 1
25 C 3 1
26 C 4 1
27 C 0 2
28 C 0 2

Write a function which assigns the values 0, 1 and 2 based on the condition.
library(dplyr)
# Flag each position of `x` relative to its first and last non-zero values.
#
# x: numeric vector (typically the values of one group).
# Returns a numeric vector of the same length as `x`:
#   0 before the first non-zero value,
#   1 from the first through the last non-zero value (inclusive),
#   2 after the last non-zero value.
# If `x` contains no non-zero value (or is empty), every position is 0.
assign_flag <- function(x) {
  # Positions of non-zero values; which() drops NA, so NA is never "non-zero"
  nonzero <- which(x != 0)
  if (length(nonzero) == 0L) {
    return(rep(0, length(x)))
  }
  first <- nonzero[1L]
  last <- nonzero[length(nonzero)]
  idx <- seq_along(x)
  # (idx >= first) contributes 1 from the first non-zero value onwards;
  # (idx > last) adds another 1 after the last non-zero value.
  as.numeric((idx >= first) + (idx > last))
}
and apply it for each group.
# Compute the flag separately inside every `cat` group, then drop the grouping
dt %>%
  group_by(cat) %>%
  mutate(flag = assign_flag(value)) %>%
  ungroup()
# cat value flag
#1 A 0 0
#2 A 0 0
#3 A 0 0
#4 A 3 1
#5 A 5 1
#6 A 0 1
#7 A 8 1
#8 A 0 2
#9 A 0 2
#10 A 0 2
#11 B 0 0
#12 B 0 0
#13 B 2 1
#14 B 2 1
#15 B 3 1
#16 B 0 1
#17 B 1 1
#18 B 0 2
#19 C 0 0
#20 C 0 0
#21 C 0 0
#22 C 0 0
#23 C 1 1
#24 C 2 1
#25 C 3 1
#26 C 4 1
#27 C 0 2
#28 C 0 2

A base R solution
# Split the table by category, add a `flag` column to each piece, and
# stack the pieces back together.
dt_split <- lapply(split(dt, f = dt$cat), function(x) {
  # Row positions (within this category) of the non-zero values
  nonzero <- which(x$value != 0)
  if (length(nonzero) == 0L) {
    # No non-zero value in this category: nothing to bracket, flag everything 0
    x$flag <- rep(0, nrow(x))
    return(x)
  }
  first <- nonzero[1L]
  last <- nonzero[length(nonzero)]
  x$flag <- c(
    rep(0, first - 1),        # rows before the first non-zero value
    rep(1, last - first + 1), # first through last non-zero value
    rep(2, nrow(x) - last)    # rows after the last non-zero value
  )
  x
})
dt <- do.call(rbind, dt_split)

Related

add column with total count of rows meeting a condition in dplyr

Trying to get totals by class and condition but not grouping data.
Reproducible example:
df <- data.frame("class" = c("a","b","c","d","b","b","b","b","c","c","a"),"increment" = c(0,0,0,0,0,0,32,12,0,0,0))
R> df
class increment
1 a 0
2 b 0
3 c 0
4 d 0
5 b 0
6 b 0
7 b 32
8 b 12
9 c 0
10 c 0
11 a 0
I want the total cases where increment is different from Zero but for every class.
Desired output:
R> df
class increment increment_count_per_class
1 a 0 0
2 b 0 2
3 c 0 0
4 d 0 0
5 b 0 2
6 b 0 2
7 b 32 2
8 b 12 2
9 c 0 0
10 c 0 0
11 a 0 0
My first approach is here below, but I know there must be a less convoluted way using dplyr:
df <- df %>% mutate(has.increment = ifelse(increment>0,1,0))
R> df
class increment has.increment
1 a 0 0
2 b 0 0
3 c 0 0
4 d 0 0
5 b 0 0
6 b 0 0
7 b 32 1
8 b 12 1
9 c 0 0
10 c 0 0
11 a 0 0
Get totals per class when increment exists
N <- df %>% group_by(class,has.increment) %>% tally() %>% filter(has.increment == 1)
R> N
# A tibble: 1 x 3
# Groups: class [1]
class has.increment n
<chr> <dbl> <int>
1 b 1 2
Then join:
merge(N,df, by = "class", all = TRUE)
R> merge(N,df, by = "class", all = TRUE)
class has.increment.x n increment has.increment.y
1 a NA NA 0 0
2 a NA NA 0 0
3 b 1 2 0 0
4 b 1 2 12 1
5 b 1 2 0 0
6 b 1 2 0 0
7 b 1 2 32 1
8 c NA NA 0 0
9 c NA NA 0 0
10 c NA NA 0 0
11 d NA NA 0 0
Try this:
df %>%
group_by(class) %>%
mutate(increment_count_per_class = sum(increment!=0))

How to aggregate a data frame by a list of variables, preserve ungrouped columns and indicate subsets?

By grouping a data frame by multiple grouping variables a:c I want to divide it virtually into subsets. After that I want to add two columns, one that contains the count of the subset size, and another that is the ID of the subset.
set.seed(67)
n <- 1000
df1 <- data.frame(
a=rbinom(n, 1, .5),
b=sample(20:40, n, replace = TRUE),
c=sample(seq(3000, 4000, 100), n, replace = TRUE),
d=rbinom(n, 1, .13),
k=rbinom(n, 1, .88),
l=rbinom(n, 1, .075),
m=rbinom(n, 1, .05),
n=rbinom(n, 1, .3)
)
> head(df1)
a b c d k l m n
1 1 21 3900 0 1 0 0 0
2 0 26 3600 0 1 0 0 0
3 0 23 3900 0 1 0 0 0
4 1 23 3900 0 1 0 0 0
5 0 32 4000 1 1 0 0 0
6 1 23 3200 0 0 0 0 0
I already got the group counts right, but I need the other variables to be preserved.
> with(df1, aggregate(d, list(a, b, c), length))
Group.1 Group.2 Group.3 x
1 0 20 3000 2
2 1 20 3000 3
3 0 21 3000 2
4 1 21 3000 3
5 0 22 3000 3
6 1 22 3000 1
...
When I define the whole data frame as object, it displays also the counts but the values are overwritten:
> with(df1, aggregate(df1, list(a, b, c), length))
Group.1 Group.2 Group.3 a b c d k l m n
1 0 20 3000 2 2 2 2 2 2 2 2
2 1 20 3000 3 3 3 3 3 3 3 3
3 0 21 3000 2 2 2 2 2 2 2 2
4 1 21 3000 3 3 3 3 3 3 3 3
5 0 22 3000 3 3 3 3 3 3 3 3
6 1 22 3000 1 1 1 1 1 1 1 1
...
Actually I want something like this:
a b c d k l m n count id
847 0 20 3000 1 1 0 0 1 2 1
939 0 20 3000 0 0 0 0 0 2 1
264 1 21 3000 0 1 0 0 0 3 2
569 1 21 3000 0 1 0 0 0 3 2
876 1 21 3000 0 1 0 0 1 3 2
346 0 22 3000 0 1 0 0 1 3 3
846 0 22 3000 0 1 0 0 0 3 3
929 0 22 3000 0 1 0 0 1 3 3
...
How would I do that?
In base R, you could use ave...
df1 <- df1[order(df1$c,df1$b,df1$a),]
df1$id <- cumsum(!duplicated(df1[,c("a","b","c")]))
df1$count <- ave(df1$a,df1$id,FUN=length)
head(df1)
a b c d k l m n id count
847 0 20 3000 1 1 0 0 1 1 2
939 0 20 3000 0 0 0 0 0 1 2
217 1 20 3000 0 1 0 0 0 2 3
458 1 20 3000 0 1 0 0 0 2 3
631 1 20 3000 0 1 0 0 0 2 3
360 0 21 3000 0 1 1 0 0 3 2
The order of the df will affect what id values you get, but hopefully this does not matter too much.
Is this what you're after?
library(tidyverse);
df1 %>%
  group_by(a, b, c) %>%
  # count: number of rows sharing this row's (a, b, c) combination
  mutate(count = n()) %>%
  ungroup() %>%
  # Sort so that rows of the same combination are adjacent; the running
  # count below only yields one id per combination when they are contiguous.
  arrange(b, c, a) %>%
  # id: number of distinct combinations seen so far. The separator keeps
  # distinct keys from collapsing into the same string (e.g. 1,23 vs 12,3).
  mutate(id = cumsum(!duplicated(paste(a, b, c, sep = "_"))))
## A tibble: 1,000 x 10
# a b c d k l m n count id
# <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
# 1 0 20 3000. 1 1 0 0 1 2 1
# 2 0 20 3000. 0 0 0 0 0 2 1
# 3 1 20 3000. 0 1 0 0 0 3 2
# 4 1 20 3000. 0 1 0 0 0 3 2
# 5 1 20 3000. 0 1 0 0 0 3 2
# 6 0 20 3100. 0 1 0 0 0 2 3
# 7 0 20 3100. 0 1 1 0 0 2 3
# 8 1 20 3100. 0 1 0 0 0 1 4
# 9 0 20 3200. 1 1 0 0 0 3 5
#10 0 20 3200. 0 1 0 0 0 3 5
## ... with 990 more rows
With data.table, this can be done in few lines using inbuilt .GRP and .N variables.
setDT(df1)
df1 <- df1[order(c,b,a)]
df1[,':='(count = .N, id = .GRP),.(a,b,c)]
print(head(df1))
a b c d k l m n count id
1: 0 20 3000 1 1 0 0 1 2 1
2: 0 20 3000 0 0 0 0 0 2 1
3: 1 20 3000 0 1 0 0 0 3 2
4: 1 20 3000 0 1 0 0 0 3 2
5: 1 20 3000 0 1 0 0 0 3 2
6: 0 21 3000 0 1 1 0 0 2 3
Here is similar answer to Maurits Evers, utilizing group_indices
library(tidyverse)
df1 %>%
mutate(id = group_indices(., a,b,c)) %>% #extract the group indices when grouped by a, b and c
group_by(a, b, c) %>% #group by a, b and c
mutate(count = n()) %>% #get the number of elements in each group
arrange(a, b, c) #arrange by a, b, c or however you prefer
#output
# A tibble: 1,000 x 10
# Groups: a, b, c [414]
a b c d k l m n id count
<int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
1 0 20 3000 1 1 0 0 1 1 2
2 0 20 3000 0 0 0 0 0 1 2
3 0 20 3100 0 1 0 0 0 2 2
4 0 20 3100 0 1 1 0 0 2 2
5 0 20 3200 1 1 0 0 0 3 3
6 0 20 3200 0 1 0 0 0 3 3
7 0 20 3200 0 1 0 0 0 3 3
8 0 20 3300 1 1 0 0 1 4 2
9 0 20 3300 0 1 0 0 0 4 2
10 0 20 3400 0 1 0 0 1 5 1
# ... with 990 more rows

How can I count occurrences of different integer values (in a particular column) for rows with different factor values (another column)?

I have a dataframe which looks something like this:
# Example data: 10 names repeated 3 times, each with a random number in 0..3.
# NOTE: sample() is random and no seed is set, so the values will differ from run to run.
My_Data <- data.frame(name = rep(LETTERS[1:10], 3), number = sample(0:3, 30, replace = TRUE))
name number
1 A 3
2 B 3
3 C 0
4 D 3
5 E 2
6 F 2
7 G 2
8 H 2
9 I 1
10 J 3
11 A 1
12 B 2
13 C 0
14 D 1
15 E 3
16 F 0
17 G 2
18 H 2
19 I 2
20 J 2
21 A 0
22 B 1
23 C 3
24 D 0
25 E 2
26 F 0
27 G 1
28 H 1
29 I 3
30 J 0
Now I would like to get a dataframe which has one column for each of the possible values in the number column, containing the count of occurrences of that value for each value in the name column:
name number_0 number_1 number_2 number_3
1 A 1 1 0 1
2 B 0 1 1 1
3 C 2 0 0 1
4 D 1 1 0 1
5 E 0 0 2 1
6 F 2 0 1 0
7 G 0 1 2 0
8 H 0 1 2 0
9 I 0 1 1 1
10 J 1 0 1 1
How can I do that?
Thanks!
Edit: I am not looking for a conversion to the wide format. I am looking for a way to count occurrences for each of the possible values.
You can also use xtabs() function.
xtabs(~My_Data$name + My_Data$number)
We could get the count and then spread to 'wide' format
library(dplyr)
library(tidyr)
My_Data %>%
count(name, number) %>%
mutate(number = paste('number', number, sep='_')) %>%
spread(number, n, fill = 0)
# A tibble: 10 x 5
# name number_0 number_1 number_2 number_3
# * <chr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 0 1
# 2 B 0 1 1 1
# 3 C 2 0 0 1
# 4 D 1 1 0 1
# 5 E 0 0 2 1
# 6 F 2 0 1 0
# 7 G 0 1 2 0
# 8 H 0 1 2 0
# 9 I 0 1 1 1
#10 J 1 0 1 1
Try also:
table(My_Data)
or, if you need a data.frame:
as.data.frame.matrix(table(My_Data))

Finding variance of columns from 2 dataframes

I have 2 dataframes
DataFrame A and Dataframe B.
A <- data.frame(a=c(1,2,3,4,5),b=c(2,4,6,8,10),c=c(3,6,9,12,15),x=c(4,8,12,16,20),y=c(5,10,15,20,25))
B <- data.frame(a=c(1,2,3,4,5),b=c(2,4,6,8,10),c=c(3,6,9,12,15),x=c(4,8,12,16,20),y=c(5,10,15,20,25))
A
a b c x y
1 2 3 4 5
2 4 6 8 10
3 6 9 12 15
4 8 12 16 20
5 10 15 20 25
B
a b c x y
1 2 3 4 5
2 4 6 8 10
3 6 9 12 15
4 8 12 16 20
5 10 15 20 25
Expected Output:
C
a b c x y
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
5 0 0 0 0
Both have a key column which is alpha-numeric.
Both dataframes have 260 columns in all out of which 250 are float.
Is there an easier way to compute the variance of each of the 250 columns and store the variance in another dataframe?
I think you want the difference between the respective columns of the two dataframes
temp = names(A)
data.frame(A["a"], do.call(cbind, lapply(temp[!temp %in% "a"], function(x) A[x] - B[x])))
# a b c x y
#1 1 0 0 0 0
#2 2 0 0 0 0
#3 3 0 0 0 0
#4 4 0 0 0 0
#5 5 0 0 0 0
We can use Map/mapply to find the difference between the corresponding columns of 'A' and 'B'
cbind(A[1], mapply(`-`, A[-1], B[names(A)[-1]]))
# a b c x y
#1 1 0 0 0 0
#2 2 0 0 0 0
#3 3 0 0 0 0
#4 4 0 0 0 0
#5 5 0 0 0 0
Or just
cbind(A[1], A[-1] - B[-1])

Creating a new variable with if statement in R

I’ve the following situation
In my dataset, there are two variables (x1 and x2).
mydata <- data.frame(x1=c("a","b","c","n","a","d","b","l","a","c","t","a","b","d","c","l","n","b"), x2=c(1,NA,5,2,NA,5,4,NA,2,NA,2,6,NA,6,NA,2,6,NA))
mydata[is.na(mydata)] <- " "
x1 x2
1 a 1
2 b
3 c 5
4 n 2
5 a
6 d 5
7 b 4
8 l
9 a 2
10 c
11 t 2
12 a 6
13 b
14 d 6
15 c
16 l 2
17 n 6
18 b
I want to create a third variable, let’s call it x3, which is defined as
If (x1 = a or b or c AND x2= “ ”), then x3=1, else x3=0
The output is
x1 x2 x3
1 a 1 0
2 b 1
3 c 5 0
4 n 2 0
5 a 1
6 d 5 0
7 b 4 0
8 l 0
9 a 2 0
10 c 1
11 t 2 0
12 a 6 0
13 b 1
14 d 6 0
15 c 1
16 l 2 0
17 n 6 0
18 b 1
You can try
mydata$x3 <- (mydata$x1 %in% c('a', 'b', 'c') & mydata$x2==' ')+0
mydata$x3
#[1] 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1
Or, as commented by @David Arenburg, other options to create a column include functions such as transform, within, etc.
transform(mydata, x3 = (x1 %in% letters[1:3] & x2 == ' ') + 0L)

Resources