add column with total count of rows meeting a condition in dplyr - r

Trying to get totals by class and condition but not grouping data.
Reproducible example:
df <- data.frame("class" = c("a","b","c","d","b","b","b","b","c","c","a"),"increment" = c(0,0,0,0,0,0,32,12,0,0,0))
R> df
class increment
1 a 0
2 b 0
3 c 0
4 d 0
5 b 0
6 b 0
7 b 32
8 b 12
9 c 0
10 c 0
11 a 0
I want the total cases where increment is different from Zero but for every class.
Desired output:
R> df
class increment increment_count_per_class
1 a 0 0
2 b 0 2
3 c 0 0
4 d 0 0
5 b 0 2
6 b 0 2
7 b 32 2
8 b 12 2
9 c 0 0
10 c 0 0
11 a 0 0
My first approach is here below, but I know there must be a less convoluted way using dplyr:
df <- df %>% mutate(has.increment = ifelse(increment>0,1,0))
R> df
class increment has.increment
1 a 0 0
2 b 0 0
3 c 0 0
4 d 0 0
5 b 0 0
6 b 0 0
7 b 32 1
8 b 12 1
9 c 0 0
10 c 0 0
11 a 0 0
Get totals per class when increment exists
N <- df %>% group_by(class,has.increment) %>% tally() %>% filter(has.increment == 1)
R> N
# A tibble: 1 x 3
# Groups: class [1]
class has.increment n
<chr> <dbl> <int>
1 b 1 2
Then join:
merge(N,df, by = "class", all = TRUE)
R> merge(N,df, by = "class", all = TRUE)
class has.increment.x n increment has.increment.y
1 a NA NA 0 0
2 a NA NA 0 0
3 b 1 2 0 0
4 b 1 2 12 1
5 b 1 2 0 0
6 b 1 2 0 0
7 b 1 2 32 1
8 c NA NA 0 0
9 c NA NA 0 0
10 c NA NA 0 0
11 d NA NA 0 0

Try this:
df %>%
group_by(class) %>%
mutate(increment_count_per_class = sum(increment!=0))

Related

Add new columns to data frame for each unique value in another column

I'd like to add a new column for each unique value in another column. Then for each of these columns the value will either be the value from the other column, or zero.
For the example below i would like to add 4 new columns (A:D), for column B would be (0,0,0,0,5,6,7,8,0,0,0,0,0,0,0,0) etc.
df <- data.frame(Group=rep(c('A', 'B', 'C', 'D'), each=4),
score=1:16)
df
Using map_dfc:
library(purrr)
library(dplyr)
map_dfc(setNames(unique(df$Group), unique(df$Group)),
~ ifelse(df$Group == .x, df$score, 0)) %>%
bind_cols(df, .)
Group score A B C D
1 A 1 1 0 0 0
2 A 2 2 0 0 0
3 A 3 3 0 0 0
4 A 4 4 0 0 0
5 B 5 0 5 0 0
6 B 6 0 6 0 0
7 B 7 0 7 0 0
8 B 8 0 8 0 0
9 C 9 0 0 9 0
10 C 10 0 0 10 0
11 C 11 0 0 11 0
12 C 12 0 0 12 0
13 D 13 0 0 0 13
14 D 14 0 0 0 14
15 D 15 0 0 0 15
16 D 16 0 0 0 16
Or in base R:
cbind(df,
sapply(unique(df$Group), \(x) ifelse(df$Group == x, df$score, 0)))

How to mark rows in groups between row numbers based on value?

here is a table example:
dt <- data.frame(cat = rep(c("A", "B", "C"), c(10, 8, 10)), value=c(0,0,0,3,5,0,8,0,0,0,0,0,2,2,3,0,1,0,0,0,0,0,1,2,3,4,0,0))
dt
cat value
1 A 0
2 A 0
3 A 0
4 A 3
5 A 5
6 A 0
7 A 8
8 A 0
9 A 0
10 A 0
11 B 0
12 B 0
13 B 2
14 B 2
15 B 3
16 B 0
17 B 1
18 B 0
19 C 0
20 C 0
21 C 0
22 C 0
23 C 1
24 C 2
25 C 3
26 C 4
27 C 0
28 C 0
What I would like to do is to flag rows between first and last value which is different than 0 for each group (and also those after last value marked in the other way). So, the final table should look like:
cat value flag
1 A 0 0
2 A 0 0
3 A 0 0
4 A 3 1
5 A 5 1
6 A 0 1
7 A 8 1
8 A 0 2
9 A 0 2
10 A 0 2
11 B 0 0
12 B 0 0
13 B 2 1
14 B 2 1
15 B 3 1
16 B 0 1
17 B 1 1
18 B 0 2
19 C 0 0
20 C 0 0
21 C 0 0
22 C 0 0
23 C 1 1
24 C 2 1
25 C 3 1
26 C 4 1
27 C 0 2
28 C 0 2
Thanks a lot in advance,
Does this work:
library(dplyr)
dt %>% group_by(cat) %>% mutate(c1 = cumsum(value)) %>%
mutate(flat = case_when(c1 == 0 ~ 0,
c1 == max(c1) & value == 0 ~ 2,
TRUE ~ 1)) %>%
select(1,2,4) %>% print(n = 50)
# A tibble: 28 x 3
# Groups: cat [3]
cat value flat
<chr> <dbl> <dbl>
1 A 0 0
2 A 0 0
3 A 0 0
4 A 3 1
5 A 5 1
6 A 0 1
7 A 8 1
8 A 0 2
9 A 0 2
10 A 0 2
11 B 0 0
12 B 0 0
13 B 2 1
14 B 2 1
15 B 3 1
16 B 0 1
17 B 1 1
18 B 0 2
19 C 0 0
20 C 0 0
21 C 0 0
22 C 0 0
23 C 1 1
24 C 2 1
25 C 3 1
26 C 4 1
27 C 0 2
28 C 0 2
Write a function which assign 0, 1 and 2 value based on condition.
library(dplyr)
assign_flag <- function(x) {
#First non-zero value
first <- match(TRUE, x > 0)
#last non-zero value
last <- which.max(cumsum(x))
case_when(row_number() < first ~ 0,
row_number() <= last ~ 1,
TRUE ~ 2)
}
and apply it for each group.
dt %>%
group_by(cat) %>%
mutate(flag = assign_flag(value)) %>%
ungroup
# cat value flag
#1 A 0 0
#2 A 0 0
#3 A 0 0
#4 A 3 1
#5 A 5 1
#6 A 0 1
#7 A 8 1
#8 A 0 2
#9 A 0 2
#10 A 0 2
#11 B 0 0
#12 B 0 0
#13 B 2 1
#14 B 2 1
#15 B 3 1
#16 B 0 1
#17 B 1 1
#18 B 0 2
#19 C 0 0
#20 C 0 0
#21 C 0 0
#22 C 0 0
#23 C 1 1
#24 C 2 1
#25 C 3 1
#26 C 4 1
#27 C 0 2
#28 C 0 2
A base R solution
dt_split = lapply( split(dt, f = dt$cat), function(x){
# Find nonzero elements
flag_tmp = which(x$value!=0)
# Define flags
x$flag =c(rep(0,flag_tmp[1]-1), # The leading zeros
rep(1,tail(flag_tmp, n=1)+1 - flag_tmp[1]), # The nonzero flag
rep(2, nrow(x) -tail(flag_tmp, n=1)) # The trailing zero flag
)
x
})
dt = do.call(rbind, dt_split)

R: df header columns are ordinal ranking and spread across columns for each observation

I have a questionnaire data that look like below:
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1
Basically, the header columns (no. of stars rating and satisfactory) are ordinal ranking for each Items. I would like to summarize the no_stars(col 2:4) and satisfactory(col 5:7) into one column so that the output would look like this :
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
$no_stars <- 1 is for no_stars1, 2 for no_stars2, 3 for no_stars3
$satisfactory <- 1 is for bad, 2 for average, 3 for good
I have tried the code below
df$no_stars2[df$no_stars2 == 1] <- 2
df$no_stars3[df$no_stars3 == 1] <- 3
df$average[df$average == 1] <- 2
df$satisfied[df$satisfied == 1] <- 3
no_stars <- df$no_stars1 + df$no_stars2 + df$no_stars3
satisfactory <- df$bad + df$average + df$satisfied
tidy_df <- data.frame(df$Items, no_stars, satisfactory)
tidy_df
Is there any function in R that can do the same thing? or
anyone got better and simpler solution ?
Thanks
Just use max.col and set preferences:
starsOrder<-c("no_stars1","no_stars2","no_stars3")
satOrder<-c("bad","average","satisfied")
data.frame(items=df$items,no_stars=max.col(df[,starsOrder]),
satisfactory=max.col(df[,satOrder]))
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
Another tidyverse solution making use of factor to integer conversions to encode no_stars and satisfactory and spreading from wide to long twice:
library(tidyverse)
df %>%
gather(no_stars, v1, starts_with("no_stars")) %>%
mutate(no_stars = as.integer(factor(no_stars))) %>%
gather(satisfactory, v2, average, satisfied, bad) %>%
filter(v1 > 0 & v2 > 0) %>%
mutate(satisfactory = as.integer(factor(
satisfactory, levels = c("bad", "average", "satisfied")))) %>%
select(-v1, -v2) %>%
arrange(items)
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
While there may be more elegant solutions, using dplyr::case_when() gives you the flexibility to code things however you want:
library(dplyr)
df %>%
dplyr::mutate(
no_stars = dplyr::case_when(
no_stars1 == 1 ~ 1,
no_stars2 == 1 ~ 2,
no_stars3 == 1 ~ 3)
, satisfactory = dplyr::case_when(
average == 1 ~ 2,
satisfied == 1 ~ 3,
bad == 1 ~ 1)
)
# items no_stars1 no_stars2 no_stars3 average satisfied bad no_stars satisfactory
# 1 A 1 0 0 0 0 1 1 1
# 2 B 0 1 0 1 0 0 2 2
# 3 C 0 0 1 0 1 0 3 3
# 4 D 0 1 0 0 1 0 2 3
# 5 E 0 0 1 1 0 0 3 2
# 6 F 0 0 1 0 1 0 3 3
# 7 G 1 0 0 0 0 1 1 1
dat%>%
replace(.==1,NA)%>%
replace_na(setNames(as.list(names(.)),names(.)))%>%
replace(.==0,NA)%>%
mutate(s=coalesce(!!!.[2:4]),
no_stars=as.numeric(factor(s,unique(s))),
t=coalesce(!!!.[5:7]),
satisfactory=as.numeric(factor(t,unique(t))))%>%
select(items,no_stars,satisfactory)
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
using apply and match :
data.frame(
items = df1$items,
no_stars = apply(df1[2:4], 1, match, x=1),
satisfactory = apply(df1[c(7,5:6)], 1, match, x=1))
# items no_stars satisfactory
# 1 A 1 1
# 2 B 2 2
# 3 C 3 3
# 4 D 2 3
# 5 E 3 2
# 6 F 3 3
# 7 G 1 1
data
df1 <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1")

How to replace certain values with their column name

I have a following table in R
df <- data.frame('a' = c(1,0,0,1,0),
'b' = c(1,0,0,1,0),
'c' = c(1,1,0,1,1))
df
a b c
1 1 1 1
2 0 0 1
3 0 0 0
4 1 1 1
4 0 0 1
What I want is to replace the row value with the column name whenever the row is equal to 1. The output would be this one:
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
4 0 0 c
How can I do this in R? Thanks.
I would use Map and replace:
df[] <- Map(function(n, x) replace(x, x == 1, n), names(df), df)
df
# a b c
# 1 a b c
# 2 0 0 c
# 3 0 0 0
# 4 a b c
# 5 0 0 c
We can use
df[] <- names(df)[(NA^!df) * col(df)]
df[is.na(df)] <- 0
df
# a b c
#1 a b c
#2 0 0 c
#3 0 0 0
#4 a b c
#4 0 0 c
You can try stack and unstack
a=stack(df)
a
values ind
1 1 a
2 0 a
3 0 a
4 1 a
5 0 a
6 1 b
7 0 b
8 0 b
9 1 b
10 0 b
11 1 c
12 1 c
13 0 c
14 1 c
15 1 c
a$values[a$values==1]=as.character(a$ind)[a$values==1]
unstack(a)
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
We can try iterating over the names of the data frame, and then handling each column, for a base R option:
df <- data.frame(a=c(1,0,0,1,0), b=c(1,0,0,1,0), c=c(1,1,0,1,1))
df <- data.frame(sapply(names(df), function(x) {
y <- df[[x]]
y[y == 1] <- x
return(y)
}))
df
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
Demo
You can do it with ifelse, but you have to do some intermediate transposing to account for R's column-major order processing.
data.frame(t(ifelse(t(df)==1,names(df),0)))
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c

Dplyr create dummy variable/indicator columns, for sql tables [duplicate]

This question already has answers here:
Generate a dummy-variable
(17 answers)
Closed 5 years ago.
test <- data.frame(
x=rep(letters[1:3],each=2),
y=c(4,4,5,5,5,6)
)
x y
1 a 4
2 a 4
3 b 5
4 b 5
5 c 5
6 c 6
How do i create new columns which contains dummy variables 1 and 0 to indicate the row's observation.
I wish to create something like this.. for column x
x y x_a x_b x_c
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 0 1
6 c 6 0 0 1
Or for column y
x y y_4 y_5 x_6
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 1 0
6 c 6 0 0 1
I managed to this is in base R using ifelse in new columns.
I wish to do this in dplyr so it can work on sql tables.
con <- DBI::dbConnect(RSQLite::SQLite(), path = "")
dbWriteTable(con, "test",test)
testdb <- tbl(con, "test")
testdb %>% mutate(i = row_number(), i2 = 1) %>% spread(x, i2, fill = 0)
the row_number() function do not work on sql tables.
Error: Window function row_number() is not supported by this database. Im using SQLite..
For x:
library(dplyr)
test %>% bind_cols(as_data_frame(setNames(lapply(unique(test$x),
function(x){as.integer(test$x == x)}),
paste0('x_', unique(test$x)))))
x y x_a x_b x_c
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 0 1
6 c 6 0 0 1
For y:
test %>% bind_cols(as_data_frame(setNames(lapply(unique(test$y),
function(x){as.integer(test$y == x)}),
paste0('y_', unique(test$y)))))
x y y_4 y_5 y_6
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 1 0
6 c 6 0 0 1

Resources