I have this example dataset:
df <- data.frame(subjects = 1:12,
                 Why_are_you_not_happy = c(1, 2, "1,2,5", 5, 1, 2, "3,4", 3, 2, "1,5", 3, 4),
                 why_are_you_sad = c("1,2,3", 1, 2, 3, "4,5,3", 2, 1, 4, 3, 1, 1, 1))
I would like to convert it into a dummy-variable format (based on the 5 possible answers to each question). Can someone guide me through an effective way to do this? Thanks.
You can use separate_rows to split the multiple choices, convert them to dummies, and summarise by subjects (to get one row per subject, with all of their choices).
library(fastDummies)
library(tidyr)
library(dplyr)
df %>%
  separate_rows(Why_are_you_not_happy, why_are_you_sad) %>%
  dummy_cols(c("Why_are_you_not_happy", "why_are_you_sad"),
             remove_selected_columns = TRUE) %>%
  group_by(subjects) %>%
  summarise(across(everything(), max))
output
# A tibble: 12 × 11
subjects Why_are_you…¹ Why_a…² Why_a…³ Why_a…⁴ Why_a…⁵ why_a…⁶ why_a…⁷ why_a…⁸ why_a…⁹ why_a…˟
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 0 0 0 0 1 1 1 0 0
2 2 0 1 0 0 0 1 0 0 0 0
3 3 1 1 0 0 1 0 1 0 0 0
4 4 0 0 0 0 1 0 0 1 0 0
5 5 1 0 0 0 0 0 0 1 1 1
6 6 0 1 0 0 0 0 1 0 0 0
7 7 0 0 1 1 0 1 0 0 0 0
8 8 0 0 1 0 0 0 0 0 1 0
9 9 0 1 0 0 0 0 0 1 0 0
10 10 1 0 0 0 1 1 0 0 0 0
11 11 0 0 1 0 0 1 0 0 0 0
12 12 0 0 0 1 0 1 0 0 0 0
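If you prefer to avoid the fastDummies dependency, here is a tidyr-only sketch of the same idea (assuming, as in the example, that answers are comma-separated and each answer appears at most once per subject and question):
library(tidyr)
library(dplyr)

df %>%
  pivot_longer(-subjects, names_to = "question", values_to = "answer") %>%
  separate_rows(answer, sep = ",") %>%                 # one row per chosen answer
  mutate(value = 1L) %>%                               # indicator for each chosen answer
  pivot_wider(names_from = c(question, answer),
              values_from = value,
              values_fill = 0L,                        # non-chosen answers become 0
              names_sort = TRUE)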
Let's say we have a df as follows:
A B C D E
1 1 0 0 1
0 0 1 0 0
0 0 0 0 1
1 1 1 1 0
0 1 1 0 1
1 0 1 0 0
So I would like to make another variable F which says: if the sum of A:D is greater than 1, F is 1 and A:D are set to 0.
Additionally, if E == 1, then F = 0.
Here's how I wrote it, but it's not working...
#Counter
df<- df %>%
mutate(case_count = A+B+C+D)
df$F <- ifelse(df$E == 1, 0,
ifelse(df$case_count > 1,
df$A == 0 &
df$B == 0 &
df$C == 0 &
df$D == 0 &
df$F == 1, 0))
And the correct result here should then be
A B C D E case_count F
1 1 0 0 1 2 0
0 0 1 0 0 1 0
0 0 0 0 1 0 0
0 0 0 0 0 4 1
0 1 1 0 1 2 0
0 0 0 0 0 2 1
Using dplyr and the new functions across and c_across
df %>%
rowwise() %>%
mutate(
case_count = sum(c_across(A:D)),
F_ = ifelse(E == 1, 0, ifelse(case_count > 1, 1, 0))
) %>%
mutate(across(A:D, ~ifelse(F_ == 1, 0, .)))
I named the new column F_ instead of just F because the latter may be confused with the abbreviation for FALSE.
Output
# A tibble: 6 x 7
# Rowwise:
# A B C D E case_count F_
# <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl>
# 1 1 1 0 0 1 2 0
# 2 0 0 1 0 0 1 0
# 3 0 0 0 0 1 0 0
# 4 0 0 0 0 0 4 1
# 5 0 1 1 0 1 2 0
# 6 0 0 0 0 0 2 1
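A vectorized sketch of the same logic that avoids rowwise() (assuming the same df of 0/1 columns A:D plus E; rowSums(across(A:D)) works in dplyr >= 1.0):

df %>%
  mutate(
    case_count = rowSums(across(A:D)),          # row sums without rowwise()
    F_ = as.numeric(E != 1 & case_count > 1)    # 1 only when E != 1 and the count exceeds 1
  ) %>%
  mutate(across(A:D, ~ ifelse(F_ == 1, 0, .)))  # zero out A:D wherever F_ is 1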
You can try this solution (DF is your original data):
#Create index
DF$I1 <- rowSums(DF[,1:4])
DF[DF[,6]>1,1:4]<-0
#Create F
DF$F <- ifelse(DF$I1>1,1,0)
DF$F <- ifelse(DF$E==1,0,DF$F)
A B C D E I1 F
1 0 0 0 0 1 2 0
2 0 0 1 0 0 1 0
3 0 0 0 0 1 0 0
4 0 0 0 0 0 4 1
5 0 0 0 0 1 2 0
6 0 0 0 0 0 2 1
I have a data frame like this:
data.frame(previous = c(1,2,2,1,3,3), next = c(1,1,2,3,1,3), id = c(1,2,3,4,5,6))
How is it possible to extract a data frame which checks the previous and next columns and creates 9 new columns that contain 1 only if the corresponding combination of previous and next occurs? For example, if previous is 2 and next is 1, the combination is 2 1 and that column receives a 1.
Example of expected output:
data.frame(previous = c(1,2,2,1,3,3), next = c(1,1,2,3,1,3),
col1_1 = c(1,0,0,0,0,0),
col1_2 = c(0,0,0,0,0,0),
col1_3 = c(0,0,0,1,0,0),
col2_1 = c(0,1,0,0,0,0),
col2_2 = c(0,0,1,0,0,0),
col2_3 = c(0,0,0,0,0,0),
col3_1 = c(0,0,0,0,1,0),
col3_2 = c(0,0,0,0,0,0),
col3_3 = c(0,0,0,0,0,1), id = c(1,2,3,4,5,6))
You could use expand.grid to get all the combinations.
Assuming your data frame is called df and the next column is actually named next. (to avoid clashing with the keyword next):
as.data.frame(apply(expand.grid(1:3, 1:3), 1, function(x) {
as.numeric(x[1] == df$previous & x[2] == df$next.)}))
#> V1 V2 V3 V4 V5 V6 V7 V8 V9
#> 1 1 0 0 0 0 0 0 0 0
#> 2 0 1 0 0 0 0 0 0 0
#> 3 0 0 0 0 1 0 0 0 0
#> 4 0 0 0 0 0 0 1 0 0
#> 5 0 0 1 0 0 0 0 0 0
#> 6 0 0 0 0 0 0 0 0 1
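If you also want the columns named after the combinations, a small follow-up sketch (using the same df and the same expand.grid order as above):

combs <- expand.grid(previous = 1:3, next. = 1:3)
out <- as.data.frame(apply(combs, 1, function(x) {
  as.numeric(x[1] == df$previous & x[2] == df$next.)}))
names(out) <- paste0("col", combs$previous, "_", combs$next.)  # e.g. col2_1 for previous = 2, next. = 1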
A step-by-step approach might be the following. I have renamed the next column to next1 to avoid problems:
AllComb <- expand.grid(unique(df$previous), unique(df$next1)) # all possible combinations
myframe <- matrix(0, ncol = nrow(AllComb), nrow = nrow(df))
colnames(myframe) <- paste("col_", AllComb$Var1, "_", AllComb$Var2, sep = "")
for (id_row in 1:nrow(df)) {
  myvec <- df[id_row, ]
  Word <- paste("col_", myvec[1], "_", myvec[2], sep = "") # build the column name for this row
  Colindex <- which(colnames(myframe) == Word)             # find the column index
  myframe[id_row, Colindex] <- 1                           # set the matching cell to 1
}
dfRes<-cbind(previous =df$previous, "next"= df$next1, myframe, id=df$id)
# previous next col_1_1 col_2_1 col_3_1 col_1_2 col_2_2 col_3_2 col_1_3 col_2_3 col_3_3 id
# [1,] 1 1 1 0 0 0 0 0 0 0 0 1
# [2,] 2 1 0 1 0 0 0 0 0 0 0 2
# [3,] 2 2 0 0 0 0 1 0 0 0 0 3
# [4,] 1 3 0 0 0 0 0 0 1 0 0 4
# [5,] 3 1 0 0 1 0 0 0 0 0 0 5
# [6,] 3 3 0 0 0 0 0 0 0 0 1 6
Inside a by you could use a switch, because your values are nicely consecutive 1:3. Finally we merge to get the result.
tmp <- by(dat, dat$next., function(x) {
x1 <- x$previous
o <- `colnames<-`(t(sapply(x1, function(z)
switch(z, c(1, 0, 0), c(0, 1, 0), c(0, 0, 1)))),
paste(el(x1), 1:3, sep="_"))
cbind(x, col=o)
})
res <- Reduce(function(...) merge(..., all=TRUE), tmp)
res[is.na(res)] <- 0 ## set NA to zero if wanted
Result
res[order(res$id),] ## order by ID if needed
# previous next. id col.1_1 col.1_2 col.1_3 col.2_1 col.2_2 col.2_3
# 1 1 1 1 1 0 0 0 0 0
# 3 2 1 2 0 1 0 0 0 0
# 4 2 2 3 0 0 0 0 1 0
# 2 1 3 4 1 0 0 0 0 0
# 5 3 1 5 0 0 1 0 0 0
# 6 3 3 6 0 0 1 0 0 0
Data
dat <- structure(list(previous = c(1, 2, 2, 1, 3, 3), next. = c(1, 1,
2, 3, 1, 3), id = c(1, 2, 3, 4, 5, 6)), class = "data.frame", row.names = c(NA,
-6L))
Note: next as column name is not particularly a good idea, since it has a special meaning in R.
Here is a tidyverse approach (assuming the next column has been renamed to nxt):
library(tidyr)
library(dplyr)
library(tibble) # for rowid_to_column()
df %>%
rowid_to_column() %>%
complete(previous, nxt) %>%
unite(col , previous, nxt, sep = "_", remove = FALSE) %>%
pivot_wider(names_from = col, values_from = rowid, values_fn = list(rowid = ~1), values_fill = list(rowid = 0)) %>%
na.omit() %>%
arrange(id)
# A tibble: 6 x 12
previous nxt id `1_1` `1_2` `1_3` `2_1` `2_2` `2_3` `3_1` `3_2` `3_3`
<dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 1 1 0 0 0 0 0 0 0 0
2 2 1 2 0 0 0 1 0 0 0 0 0
3 2 2 3 0 0 0 0 1 0 0 0 0
4 1 3 4 0 0 1 0 0 0 0 0 0
5 3 1 5 0 0 0 0 0 0 1 0 0
6 3 3 6 0 0 0 0 0 0 0 0 1
This is another tidyverse solution that differs a little (and is maybe more concise) from #H1's answer above.
library(dplyr)
library(tidyr)
df %>%
mutate(n = 1) %>%
complete(id, previous, next., fill = list(n = 0)) %>%
unite(col, previous, next.) %>%
pivot_wider(names_from = col, names_prefix = "col", values_from = n) %>%
right_join(df)
# # A tibble: 6 x 12
# id col1_1 col1_2 col1_3 col2_1 col2_2 col2_3 col3_1 col3_2 col3_3 previous next.
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 0 0 0 0 0 0 0 0 1 1
# 2 2 0 0 0 1 0 0 0 0 0 2 1
# 3 3 0 0 0 0 1 0 0 0 0 2 2
# 4 4 0 0 1 0 0 0 0 0 0 1 3
# 5 5 0 0 0 0 0 0 1 0 0 3 1
# 6 6 0 0 0 0 0 0 0 0 1 3 3
You can try the code below. The trick is that (Previous - 1) * 3 + Next maps each (Previous, Next) pair to a unique position in 1:9, which replace() then sets to 1 in a vector of zeros:
dfout <- within(df,
col <- `colnames<-`(t(sapply((Previous-1)*3+Next,
function(v) replace(rep(0,9),v,1))),
do.call(paste,c(expand.grid(1:3,1:3),sep = "_"))))
such that
> dfout
Previous Next id col.1_1 col.2_1 col.3_1 col.1_2 col.2_2 col.3_2 col.1_3 col.2_3 col.3_3
1 1 1 1 1 0 0 0 0 0 0 0 0
2 2 1 2 0 0 0 1 0 0 0 0 0
3 2 2 3 0 0 0 0 1 0 0 0 0
4 1 3 4 0 0 1 0 0 0 0 0 0
5 3 1 5 0 0 0 0 0 0 1 0 0
6 3 3 6 0 0 0 0 0 0 0 0 1
I have a 1000 x 16 matrix from a simulation with team names as characters. I want to count the number of occurrences of each team across all 16 columns.
I know I could do apply(test, 2, table), but that makes the data hard to work with afterwards, since not every team appears in every column.
If you have a vector of all the unique team names, you could do something like this. Counting against a factor with explicit levels gives every team (in this case, letter) its own column, even when it does not occur in a given row.
set.seed(15)
letter_mat <- matrix(
sample(
LETTERS,
size = 1000*16,
replace = TRUE
),
ncol = 16,
nrow = 1000
)
output <- t(
apply(
letter_mat,
1,
function(x) table(factor(x, levels = LETTERS))
)
)
head(output)
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
[1,] 1 2 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1
[2,] 0 1 0 2 2 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 2 2 1
[3,] 1 1 0 0 1 0 1 2 1 0 0 0 0 0 1 0 1 0 1 1 0 0 3 0 1 1
[4,] 0 1 0 0 0 1 0 0 0 2 0 1 0 0 1 1 1 1 2 0 2 3 0 0 0 0
[5,] 2 1 0 0 0 0 0 2 0 2 1 1 1 0 0 2 0 2 1 0 0 1 0 0 0 0
[6,] 0 0 0 0 0 1 3 1 0 0 0 0 1 1 3 0 1 0 0 1 0 0 0 1 0 3
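Applied to the data in the question, the same pattern would look roughly like this (a sketch; test is the assumed name of the 1000 x 16 character matrix of team names):

teams <- sort(unique(as.vector(test)))    # every team name that occurs anywhere in the matrix
counts <- t(apply(test, 1, function(x) table(factor(x, levels = teams))))
head(counts)                              # 1000 x n_teams matrix, one column per team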
I have a data frame where the data are grouped by ID. I need to know how many cells make up 10% of each group so that I can draw a sample of that size, but the sample should only pick cells where EP is 1.
I've tried a nested for loop: one loop to find how many cells make up 10% of each group, and an outer one to sample that number of cells meeting the condition EP == 1.
x <- data.frame("ID"=rep(1:2, each=10),"EP" = rep(0:1, times=10))
x
ID EP
1 1 0
2 1 1
3 1 0
4 1 1
5 1 0
6 1 1
7 1 0
8 1 1
9 1 0
10 1 1
11 2 0
12 2 1
13 2 0
14 2 1
15 2 0
16 2 1
17 2 0
18 2 1
19 2 0
20 2 1
for(j in 1:1000){
for (i in 1:nrow(x)){
d <- x[x$ID==i,]
npix <- 10*nrow(d)/100
}
r <- sample(d[d$EP==1,],npix)
print(r)
}
data frame with 0 columns and 0 rows
data frame with 0 columns and 0 rows
data frame with 0 columns and 0 rows
...
(and so on, up to 1000)
I would like to get this data frame, where each sample is a new column in x, and the sampled cells contain a 1:
ID EP s1 s2 ... s1000
1 1 0 0 0 ....
2 1 1 0 1
3 1 0 0 0
4 1 1 0 0
5 1 0 0 0
6 1 1 0 0
7 1 0 0 0
8 1 1 0 0
9 1 0 0 0
10 1 1 1 0
11 2 0 0 0
12 2 1 0 0
13 2 0 0 0
14 2 1 0 1
15 2 0 0 0
16 2 1 0 0
17 2 0 0 0
18 2 1 1 0
19 2 0 0 0
20 2 1 0 0
Note that each 1 in s1 and s2 marks a sampled cell and corresponds to 10% of the cells in each group (1, 2) that meet the condition EP == 1.
You can try:
set.seed(1231)
x <- data.frame("ID"=rep(1:2, each=10),"EP" = rep(0:1, times=10))
library(tidyverse)
x %>%
group_by(ID) %>%
mutate(index= ifelse(EP==1, 1:n(),0)) %>%
mutate(s1 = ifelse(index %in% sample(index[index!=0], n()*0.1), 1, 0)) %>%
mutate(s2 = ifelse(index %in% sample(index[index!=0], n()*0.1), 1, 0))
# A tibble: 20 x 5
# Groups: ID [2]
ID EP index s1 s2
<int> <int> <dbl> <dbl> <dbl>
1 1 0 0 0 0
2 1 1 2 0 0
3 1 0 0 0 0
4 1 1 4 0 0
5 1 0 0 0 0
6 1 1 6 1 1
7 1 0 0 0 0
8 1 1 8 0 0
9 1 0 0 0 0
10 1 1 10 0 0
11 2 0 0 0 0
12 2 1 2 0 0
13 2 0 0 0 0
14 2 1 4 0 1
15 2 0 0 0 0
16 2 1 6 0 0
17 2 0 0 0 0
18 2 1 8 0 0
19 2 0 0 0 0
20 2 1 10 1 0
We can write a function which, for each ID, places 1s in 10% of the rows, choosing only rows where EP == 1.
library(dplyr)
rep_func <- function() {
x %>%
group_by(ID) %>%
mutate(s1 = 0,
s1 = replace(s1, sample(which(EP == 1), floor(0.1 * n())), 1)) %>%
pull(s1)
}
Then use replicate to repeat it n times:
n <- 5
x[paste0("s", seq_len(n))] <- replicate(n, rep_func())
x
# ID EP s1 s2 s3 s4 s5
#1 1 0 0 0 0 0 0
#2 1 1 0 0 0 0 0
#3 1 0 0 0 0 0 0
#4 1 1 0 0 0 0 0
#5 1 0 0 0 0 0 0
#6 1 1 1 0 0 1 0
#7 1 0 0 0 0 0 0
#8 1 1 0 1 0 0 0
#9 1 0 0 0 0 0 0
#10 1 1 0 0 1 0 1
#11 2 0 0 0 0 0 0
#12 2 1 0 0 1 0 0
#13 2 0 0 0 0 0 0
#14 2 1 1 1 0 0 0
#15 2 0 0 0 0 0 0
#16 2 1 0 0 0 0 1
#17 2 0 0 0 0 0 0
#18 2 1 0 0 0 1 0
#19 2 0 0 0 0 0 0
#20 2 1 0 0 0 0 0
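For the 1000 samples asked about in the question, the same call works with n set to 1000 (using the same x and rep_func as above):

n <- 1000
x[paste0("s", seq_len(n))] <- replicate(n, rep_func())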