dplyr: create dummy variable/indicator columns for SQL tables [duplicate] - r

This question already has answers here:
Generate a dummy-variable
(17 answers)
Closed 5 years ago.
test <- data.frame(
  x = rep(letters[1:3], each = 2),
  y = c(4, 4, 5, 5, 5, 6)
)
x y
1 a 4
2 a 4
3 b 5
4 b 5
5 c 5
6 c 6
How do I create new columns containing dummy variables (1 and 0) that indicate each row's observation?
I wish to create something like this for column x:
x y x_a x_b x_c
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 0 1
6 c 6 0 0 1
Or for column y
x y y_4 y_5 y_6
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 1 0
6 c 6 0 0 1
I managed to do this in base R using ifelse to create the new columns.
I wish to do it in dplyr so that it also works on SQL tables.
con <- DBI::dbConnect(RSQLite::SQLite(), path = "")
dbWriteTable(con, "test",test)
testdb <- tbl(con, "test")
testdb %>% mutate(i = row_number(), i2 = 1) %>% spread(x, i2, fill = 0)
The row_number() function does not work on SQL tables:
Error: Window function row_number() is not supported by this database.
I'm using SQLite.

For x:
library(dplyr)
test %>%
  bind_cols(as_data_frame(setNames(lapply(unique(test$x),
                                          function(x) as.integer(test$x == x)),
                                   paste0('x_', unique(test$x)))))
x y x_a x_b x_c
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 0 1
6 c 6 0 0 1
For y:
test %>%
  bind_cols(as_data_frame(setNames(lapply(unique(test$y),
                                          function(x) as.integer(test$y == x)),
                                   paste0('y_', unique(test$y)))))
x y y_4 y_5 y_6
1 a 4 1 0 0
2 a 4 1 0 0
3 b 5 0 1 0
4 b 5 0 1 0
5 c 5 0 1 0
6 c 6 0 0 1
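Both of these build the indicator columns in R, so they only help once the data has been collected. If the columns need to be computed on the database side, one workaround that avoids window functions entirely is to spell out one mutate() expression per known level. A minimal sketch, assuming the levels of x are known in advance and dbplyr is translating the query:
library(dplyr)

# Each as.integer(x == "a") translates to a plain comparison (wrapped in a CAST),
# so no window functions are needed on the SQLite side.
testdb %>%
  mutate(x_a = as.integer(x == "a"),
         x_b = as.integer(x == "b"),
         x_c = as.integer(x == "c"))
If the levels are not known up front, they can be pulled from the table first (e.g. with distinct() and collect()) and the mutate() expressions generated programmatically.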

Related

add column with total count of rows meeting a condition in dplyr

Trying to get totals by class and condition, but without collapsing the data.
Reproducible example:
df <- data.frame("class" = c("a","b","c","d","b","b","b","b","c","c","a"),"increment" = c(0,0,0,0,0,0,32,12,0,0,0))
R> df
class increment
1 a 0
2 b 0
3 c 0
4 d 0
5 b 0
6 b 0
7 b 32
8 b 12
9 c 0
10 c 0
11 a 0
I want the total number of cases where increment is different from zero, for every class.
Desired output:
R> df
class increment increment_count_per_class
1 a 0 0
2 b 0 2
3 c 0 0
4 d 0 0
5 b 0 2
6 b 0 2
7 b 32 2
8 b 12 2
9 c 0 0
10 c 0 0
11 a 0 0
My first approach is below, but I know there must be a less convoluted way using dplyr:
df <- df %>% mutate(has.increment = ifelse(increment>0,1,0))
R> df
class increment has.increment
1 a 0 0
2 b 0 0
3 c 0 0
4 d 0 0
5 b 0 0
6 b 0 0
7 b 32 1
8 b 12 1
9 c 0 0
10 c 0 0
11 a 0 0
Get totals per class when increment exists
N <- df %>% group_by(class,has.increment) %>% tally() %>% filter(has.increment == 1)
R> N
# A tibble: 1 x 3
# Groups: class [1]
class has.increment n
<chr> <dbl> <int>
1 b 1 2
Then join:
merge(N,df, by = "class", all = TRUE)
R> merge(N,df, by = "class", all = TRUE)
class has.increment.x n increment has.increment.y
1 a NA NA 0 0
2 a NA NA 0 0
3 b 1 2 0 0
4 b 1 2 12 1
5 b 1 2 0 0
6 b 1 2 0 0
7 b 1 2 32 1
8 c NA NA 0 0
9 c NA NA 0 0
10 c NA NA 0 0
11 d NA NA 0 0
Try this:
df %>%
  group_by(class) %>%
  mutate(increment_count_per_class = sum(increment != 0))
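The same thing can be written as a single add_count() call, which also leaves the rows uncollapsed. A sketch, assuming a dplyr version whose add_count() accepts a wt expression and a name argument:
library(dplyr)

# The logical weights (increment != 0) are summed within each class,
# and the result is repeated on every row of that class.
df %>%
  add_count(class, wt = increment != 0, name = "increment_count_per_class")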

R: df header columns are ordinal ranking and spread across columns for each observation

I have a questionnaire data that look like below:
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1
Basically, the header columns (number-of-stars rating and satisfaction) are ordinal rankings for each item. I would like to collapse no_stars (columns 2:4) and satisfactory (columns 5:7) into one column each, so that the output looks like this:
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
$no_stars: 1 is for no_stars1, 2 for no_stars2, 3 for no_stars3
$satisfactory: 1 is for bad, 2 for average, 3 for satisfied
I have tried the code below
df$no_stars2[df$no_stars2 == 1] <- 2
df$no_stars3[df$no_stars3 == 1] <- 3
df$average[df$average == 1] <- 2
df$satisfied[df$satisfied == 1] <- 3
no_stars <- df$no_stars1 + df$no_stars2 + df$no_stars3
satisfactory <- df$bad + df$average + df$satisfied
tidy_df <- data.frame(df$items, no_stars, satisfactory)
tidy_df
Is there any function in R that can do the same thing, or does anyone have a better and simpler solution?
Thanks.
Just use max.col (which returns the column index of each row's maximum) and order the columns to match the desired ranking:
starsOrder<-c("no_stars1","no_stars2","no_stars3")
satOrder<-c("bad","average","satisfied")
data.frame(items = df$items,
           no_stars = max.col(df[, starsOrder]),
           satisfactory = max.col(df[, satOrder]))
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
Another tidyverse solution, making use of factor-to-integer conversions to encode no_stars and satisfactory, and gathering from wide to long twice:
library(tidyverse)
df %>%
gather(no_stars, v1, starts_with("no_stars")) %>%
mutate(no_stars = as.integer(factor(no_stars))) %>%
gather(satisfactory, v2, average, satisfied, bad) %>%
filter(v1 > 0 & v2 > 0) %>%
mutate(satisfactory = as.integer(factor(
satisfactory, levels = c("bad", "average", "satisfied")))) %>%
select(-v1, -v2) %>%
arrange(items)
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
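The same idea can be written with tidyr's newer pivot_longer() instead of the superseded gather(); a sketch, assuming tidyr >= 1.0:
library(dplyr)
library(tidyr)

df %>%
  # long form over the star columns, then recode the names to 1/2/3
  pivot_longer(starts_with("no_stars"), names_to = "no_stars", values_to = "v1") %>%
  mutate(no_stars = as.integer(factor(no_stars))) %>%
  # long form over the satisfaction columns, then recode in rank order
  pivot_longer(c(average, satisfied, bad), names_to = "satisfactory", values_to = "v2") %>%
  filter(v1 > 0 & v2 > 0) %>%
  mutate(satisfactory = as.integer(factor(
    satisfactory, levels = c("bad", "average", "satisfied")))) %>%
  select(-v1, -v2) %>%
  arrange(items)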
While there may be more elegant solutions, using dplyr::case_when() gives you the flexibility to code things however you want:
library(dplyr)
df %>%
dplyr::mutate(
no_stars = dplyr::case_when(
no_stars1 == 1 ~ 1,
no_stars2 == 1 ~ 2,
no_stars3 == 1 ~ 3)
, satisfactory = dplyr::case_when(
average == 1 ~ 2,
satisfied == 1 ~ 3,
bad == 1 ~ 1)
)
# items no_stars1 no_stars2 no_stars3 average satisfied bad no_stars satisfactory
# 1 A 1 0 0 0 0 1 1 1
# 2 B 0 1 0 1 0 0 2 2
# 3 C 0 0 1 0 1 0 3 3
# 4 D 0 1 0 0 1 0 2 3
# 5 E 0 0 1 1 0 0 3 2
# 6 F 0 0 1 0 1 0 3 3
# 7 G 1 0 0 0 0 1 1 1
library(dplyr)
library(tidyr)

# dat is the questionnaire data frame shown in the question
dat %>%
  replace(. == 1, NA) %>%                                # turn every 1 into NA ...
  replace_na(setNames(as.list(names(.)), names(.))) %>%  # ... then into its column name
  replace(. == 0, NA) %>%                                # drop the 0s
  mutate(s = coalesce(!!!.[2:4]),                        # the star column picked per row
         no_stars = as.numeric(factor(s, unique(s))),
         t = coalesce(!!!.[5:7]),                        # the satisfaction column per row
         satisfactory = as.numeric(factor(t, unique(t)))) %>%
  select(items, no_stars, satisfactory)
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
Using apply and match:
data.frame(
  items = df1$items,
  no_stars = apply(df1[2:4], 1, match, x = 1),
  satisfactory = apply(df1[c(7, 5:6)], 1, match, x = 1))
# items no_stars satisfactory
# 1 A 1 1
# 2 B 2 2
# 3 C 3 3
# 4 D 2 3
# 5 E 3 2
# 6 F 3 3
# 7 G 1 1
data
df1 <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1")

Creating a new variable by detecting max value for each id

My data set contains three variables:
id <- c(1,1,1,1,1,1,2,2,2,2,5,5,5,5,5,5)
ind <- c(0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1)
price <- c(1,2,3,4,5,6,1,2,3,4,1,2,3,4,5,6)
mdata <- data.frame(id,ind,price)
I need to create a new variable (ind2): if ind = 0, then ind2 = 0; if ind = 1, then ind2 = 0 as well, unless the price is the maximum for that id, in which case ind2 = 1.
The new data should look like:
id ind ind2 price
1 0 0 1
1 0 0 2
1 0 0 3
1 0 0 4
1 0 0 5
1 0 0 6
2 1 0 1
2 1 0 2
2 1 0 3
2 1 1 4
5 1 0 1
5 1 0 2
5 1 0 3
5 1 0 4
5 1 0 5
5 1 1 6
library(dplyr)
mdata %>%
  group_by(id) %>%
  mutate(ind2 = +(ind == 1L & price == max(price)))
# id ind price ind2
# 1 1 0 1 0
# 2 1 0 2 0
# 3 1 0 3 0
# 4 1 0 4 0
# 5 1 0 5 0
# 6 1 0 6 0
# 7 2 1 1 0
# 8 2 1 2 0
# 9 2 1 3 0
# 10 2 1 4 1
# 11 5 1 1 0
# 12 5 1 2 0
# 13 5 1 3 0
# 14 5 1 4 0
# 15 5 1 5 0
# 16 5 1 6 1
Or if you prefer data.table
setDT(mdata)[, ind2 := +(ind == 1L & price == max(price)), by = id]
Or with base R
mdata$ind2 <- unlist(lapply(split(mdata, mdata$id),
                            function(x) +(x$ind == 1L & x$price == max(x$price))))
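One small design note: all three versions flag every row that ties for the maximum price within an id. If only the first such row per id should get ind2 = 1, a variation with which.max() does that; a sketch:
library(dplyr)

mdata %>%
  group_by(id) %>%
  # which.max() returns the position of the first maximum within the group
  mutate(ind2 = +(ind == 1L & row_number() == which.max(price))) %>%
  ungroup()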

Sum rows in a group, starting when a specific value occurs

I want to accumulate the values of a column until the end of the group, but only start adding once a specific value occurs in another column. I am only interested in the first instance of that value within a group, so if it occurs again in the same group, the running total should simply keep adding the values. I know this sounds like a rather strange problem, so hopefully the example table makes sense.
The following data frame is what I have now:
> df = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0))
> df
group numToAdd occurs
1 1 1 0
2 1 1 0
3 1 3 1
4 1 2 0
5 2 4 0
6 2 2 1
7 2 1 0
8 2 3 0
9 2 2 0
10 3 1 0
11 3 2 1
12 3 1 1
13 4 2 0
14 4 3 0
15 4 2 0
Thus, whenever a 1 occurs within a group, I want a cumulative sum of the values from the column numToAdd, until a new group starts. This would look like the following:
> finalDF = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0),added = c(0,0,3,5,0,2,3,6,8,0,2,3,0,0,0))
> finalDF
group numToAdd occurs added
1 1 1 0 0
2 1 1 0 0
3 1 3 1 3
4 1 2 0 5
5 2 4 0 0
6 2 2 1 2
7 2 1 0 3
8 2 3 0 6
9 2 2 0 8
10 3 1 0 0
11 3 2 1 2
12 3 1 1 3
13 4 2 0 0
14 4 3 0 0
15 4 2 0 0
Thus, the added column is 0 until a 1 occurs within the group, then accumulates the values from numToAdd until a new group starts, at which point it resets to 0. In group 3, a value of 1 is found a second time, yet the cumulative sum simply continues. In group 4, a value of 1 never occurs, so the added column stays 0 throughout.
I've played around with dplyr but can't get it to work. The following attempt only outputs the total sum, not the growing cumulative total at each row.
library(dplyr)
df <- df %>%
  mutate(added = ifelse(occurs == 1, cumsum(numToAdd), 0)) %>%
  group_by(group)
Try
df %>%
  group_by(group) %>%
  mutate(added = cumsum(numToAdd * cummax(occurs)))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Or using data.table
library(data.table) # v1.9.5+
i1 <- setDT(df)[, .I[(rleid(occurs) + (occurs > 0)) > 1], group]$V1
df[, added := 0][i1, added := cumsum(numToAdd), by = group]
Or a similar option as in dplyr
setDT(df)[, added := cumsum(numToAdd * cummax(occurs)), by = group]
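To see why the cummax() trick works: cummax(occurs) stays at 0 until the first 1 appears in the group and is 1 from then on, so multiplying by it zeroes out everything before that point. A quick illustration on group 1 of the example data:
occurs   <- c(0, 0, 1, 0)
numToAdd <- c(1, 1, 3, 2)

cummax(occurs)                     # 0 0 1 1  -- "switched on" from the first 1
numToAdd * cummax(occurs)          # 0 0 3 2
cumsum(numToAdd * cummax(occurs))  # 0 0 3 5  -- matches the added column above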
You can use split-apply-combine in base R with something like:
df$added <- unlist(lapply(split(df, df$group), function(x) {
  y <- rep(0, nrow(x))
  pos <- cumsum(x$occurs) > 0
  y[pos] <- cumsum(x$numToAdd[pos])
  y
}))
df
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
To add another base R approach:
df$added <- unlist(lapply(split(df, df$group), function(x) {
  c(x[, 'occurs'][cumsum(x[, 'occurs']) == 0L],
    cumsum(x[, 'numToAdd'][cumsum(x[, 'occurs']) != 0L]))
}))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Another base R approach:
df$added <- unlist(lapply(split(df, df$group), function(x) {
  cumsum((cumsum(x$occurs) > 0) * x$numToAdd)
}))

Create a new data frame from existing ones

Suppose I have the following data frames
treatment1 <- data.frame(id = c(1, 2, 7))
treatment2 <- data.frame(id = c(3, 7, 10))
control    <- data.frame(id = c(4, 5, 8, 9))
I want to create a new data frame that is the union of those three, with an indicator column for each of them that takes the value 1 when the id belongs to that group.
experiment <- data.frame(id = c(1:10), treatment1 = 0, treatment2 = 0, control = 0)
where experiment$treatment1[1] = 1, etc.
What is the best way of doing this in R?
Thanks!
Updated as per @flodel:
kk <- rbind(treatment1, treatment2, control)
var1 <- c("treatment1", "treatment2", "control")
kk$df <- rep(var1, c(dim(treatment1)[1], dim(treatment2)[1], dim(control)[1]))
kk
id df
1 1 treatment1
2 2 treatment1
3 7 treatment1
4 3 treatment2
5 7 treatment2
6 10 treatment2
7 4 control
8 5 control
9 8 control
10 9 control
If you want it in the form of 1s and 0s, you can use table:
ll<-table(kk)
ll
df
id control treatment1 treatment2
1 0 1 0
2 0 1 0
3 0 0 1
4 1 0 0
5 1 0 0
7 0 1 1
8 1 0 0
9 1 0 0
10 0 0 1
If you want it as a data.frame, then you can use reshape:
kk2 <- reshape(data.frame(ll), timevar = "df", idvar = "id", direction = "wide")
names(kk2)[-1] <- sort(var1)
> kk2
id control treatment1 treatment2
1 1 0 1 0
2 2 0 1 0
3 3 0 0 1
4 4 1 0 0
5 5 1 0 0
6 7 0 1 1
7 8 1 0 0
8 9 1 0 0
9 10 0 0 1
df.bind <- function(...) {
  # names of the data frames as passed in the call
  df.names <- all.names(substitute(list(...)))[-1L]
  # named list of each data frame's id column
  ids.list <- setNames(lapply(list(...), `[[`, "id"), df.names)
  num.ids <- max(unlist(ids.list))
  # tabulate() counts how often each id in 1..num.ids occurs in each group (0 or 1 here)
  tabs <- lapply(ids.list, tabulate, num.ids)
  data.frame(id = seq(num.ids), tabs)
}
df.bind(treatment1, treatment2, control)
# id treatment1 treatment2 control
# 1 1 1 0 0
# 2 2 1 0 0
# 3 3 0 1 0
# 4 4 0 0 1
# 5 5 0 0 1
# 6 6 0 0 0
# 7 7 1 1 0
# 8 8 0 0 1
# 9 9 0 0 1
# 10 10 0 1 0
(Notice how it does include a row for id == 6.)
Taking
treatment1<-data.frame(id=c(1,2,7))
treatment2<-data.frame(id=c(3,7,10))
control<-data.frame(id=c(4,5,8,9))
You can use this:
x <- c("treatment1", "treatment2", "control")
f <- function(s) within(get(s), assign(s, 1))
r <- Reduce(function(x,y) merge(x,y,all=TRUE), lapply(x, f))
r[is.na(r)] <- 0
Result:
> r
id treatment1 treatment2 control
1 1 1 0 0
2 2 1 0 0
3 3 0 1 0
4 4 0 0 1
5 5 0 0 1
6 7 1 1 0
7 8 0 0 1
8 9 0 0 1
9 10 0 1 0
This illustrates what I was imagining to be the rbind strategy:
alldf <- rbind(treatment1, treatment2, control)
alldf$grps <- model.matrix(~ factor(c(rep(1, nrow(treatment1)),
                                      rep(2, nrow(treatment2)),
                                      rep(3, nrow(control)))) - 1)
dimnames(alldf[[2]])[2] <- list(c("trt1", "trt2", "ctrl"))
alldf
#-------------------
id grps.trt1 grps.trt2 grps.ctrl
1 1 1 0 0
2 2 1 0 0
3 7 1 0 0
4 3 0 1 0
5 7 0 1 0
6 10 0 1 0
7 4 0 0 1
8 5 0 0 1
9 8 0 0 1
10 9 0 0 1
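For completeness, the same table can be built with current dplyr/tidyr by stacking the data frames with an id column and pivoting wide. A sketch, assuming tidyr >= 1.0; the complete() step is only needed if ids that occur in no group (such as 6) should still get a row:
library(dplyr)
library(tidyr)

# lst() keeps the object names, which bind_rows() turns into the "group" column
bind_rows(lst(treatment1, treatment2, control), .id = "group") %>%
  mutate(flag = 1L) %>%
  pivot_wider(names_from = group, values_from = flag,
              values_fill = list(flag = 0L)) %>%
  complete(id = 1:10,
           fill = list(treatment1 = 0L, treatment2 = 0L, control = 0L)) %>%
  arrange(id)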
