From a factor column to two separate columns of 0 and 1 - r

I have a column containing the values group1 and group2 in a data frame.
# Example input: five observations, each labelled with one of two groups.
value <- 1:5
group <- paste0("group", c(1, 1, 2, 1, 2))
dat <- data.frame(value = value, group = group)
I want to make it like this-
group1 <- c(1, 1, 0, 1, 0)
group2 <- c(0, 0, 1, 0, 1)
dat<- data.frame(value, group1, group2)
I tried this but have to remove the group column later
dat<- dat %>%
mutate( group1 = ifelse(data1$group =="group1", 1, 0 ),
group2 = ifelse(data1$group =="group2", 1, 0 ) )
Is there any other nice way to do this job?
Thanks in advance for your help.

You could create a dummy column and get data in wide format.
library(dplyr)
library(tidyr)
dat %>%
mutate(n = 1) %>%
pivot_wider(names_from = group, values_from = n, values_fill = 0) -> result
# value group1 group2
# <int> <dbl> <dbl>
#1 1 1 0
#2 2 1 0
#3 3 0 1
#4 4 1 0
#5 5 0 1
Or in base R use table :
table(dat)
# group
#value group1 group2
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 0 1

A base R option using reshape
# Add a constant indicator column q, then spread it into one column per
# group (q.group1, q.group2), keyed by value.
out <- reshape(
  cbind(dat, q = 1),
  direction = "wide",
  idvar = "value",
  timevar = "group"
)
# Cells are NA where a value did not belong to that group; turn them into 0.
# Done as a separate step instead of assigning `out` inside a call to
# replace(), which depended on argument evaluation order.
out[is.na(out)] <- 0
out
giving
value q.group1 q.group2
1 1 1 0
2 2 1 0
3 3 0 1
4 4 1 0
5 5 0 1

We can use data.table
library(data.table)
dcast(setDT(dat), value ~ group, length)
# value group1 group2
#1: 1 1 0
#2: 2 1 0
#3: 3 0 1
#4: 4 1 0
#5: 5 0 1
Or this can be done with pivot_wider in a single step by specifying values_fn
library(dplyr)
library(tidyr)
dat %>%
pivot_wider(names_from = group, values_from = group,
values_fn = length, values_fill = 0)
# A tibble: 5 x 3
# value group1 group2
# <int> <int> <int>
#1 1 1 0
#2 2 1 0
#3 3 0 1
#4 4 1 0
#5 5 0 1

Append %>% select(!"group") to the end of the dplyr pipe. Also remove data1$ from it - you probably meant dat, but even that is not needed, because mutate() looks the column up in the piped data frame.
dat %>%
mutate(group1 = ifelse(group =="group1", 1, 0 ),
group2 = ifelse(group =="group2", 1, 0 )) %>%
select(!"group")
value group1 group2
1 1 1 0
2 2 1 0
3 3 0 1
4 4 1 0
5 5 0 1

Related

dplyr mutate ifelse returning first value of group instead of by-row

I'm trying to mutate a data.frame using ifelse:
# Example data: two groups of three rows; group "a" has only zeros in value1.
df <- data.frame(
  grp = rep(c("a", "b"), each = 3),
  value1 = c(rep(0, 4), 1, 2),
  value2 = seq_len(6)
)
df %>%
group_by(grp) %>%
mutate(value2 = ifelse(all(value1 == 0), 0, value2))
which returns
# # A tibble: 6 x 3
# # Groups: grp [2]
# grp value1 value2
# <chr> <dbl> <dbl>
# 1 a 0 0
# 2 a 0 0
# 3 a 0 0
# 4 b 0 4
# 5 b 1 4
# 6 b 2 4
instead of
# # A tibble: 6 x 3
# # Groups: grp [2]
# grp value1 value2
# <chr> <dbl> <dbl>
# 1 a 0 0
# 2 a 0 0
# 3 a 0 0
# 4 b 0 4
# 5 b 1 5
# 6 b 2 6
How can I change the mutate so that the rows of "value2" are unchanged if the condition is false?
You can use if and else instead of ifelse():
df %>%
group_by(grp) %>%
mutate(value2 = if(all(value1 == 0)) 0 else value2)
grp value1 value2
<fct> <dbl> <dbl>
1 a 0 0
2 a 0 0
3 a 0 0
4 b 0 4
5 b 1 5
6 b 2 6
You can try ifelse as a mask, e.g.,
df %>%
group_by(grp) %>%
mutate(value2 = ifelse(all(value1 == 0), 0, 1)*value2)
or (thanks to @tmfmnk's comment)
df %>%
group_by(grp) %>%
mutate(value2 = any(value1 != 0)*value2)
which gives
grp value1 value2
<chr> <dbl> <dbl>
1 a 0 0
2 a 0 0
3 a 0 0
4 b 0 4
5 b 1 5
6 b 2 6
The problem you encountered is due to the fact that all(value1 == 0) returns a single logical value. You need a vector of logical values to get your desired output, e.g.,
df %>%
group_by(grp) %>%
mutate(value2 = ifelse(rep(all(value1 == 0),n()), 0, value2))

r count values in rows after dcast

I want to sum all values in a row of a dataframe after performing a dcast operation from the reshape2 package. Problem is that all values are the same (10) and are the sum of all rows combined. Values should be 4,2,4
Example data with code:
# Example data: x is the row key, y the category that becomes columns,
# z an extra label attached to each observation.
df <- data.frame(
  x = factor(rep(c("A", "B", "C"), times = c(4, 2, 4))),
  y = factor(c("AA", "AB", "AA", "AC", "BB", "BA", "CC", "CC", "CC", "CD")),
  z = paste0("var", c(1, 1, 2, 1, 2, 1, 1, 2, 2, 1))
)
df2 <- df %>%
group_by(x,y) %>%
summarise(num = n()) %>%
ungroup()
df3 <- dcast(df2,x~y, fill = 0 )
df3$total <- sum(df3$AA,df3$AB,df3$AC,df3$BA,df3$BB,df3$CC,df3$CD)
sum gives you 1 combined value and that value is repeated for all other rows.
sum(df3$AA,df3$AB,df3$AC,df3$BA,df3$BB,df3$CC,df3$CD)
#[1] 10
You need rowSums to get sum of each row separately.
df3$total <- rowSums(df3[-1])
Here is a simplified tidyverse approach starting from df :
library(dplyr)
library(tidyr)
df %>%
count(x, y, name = 'num') %>%
pivot_wider(names_from = y, values_from = num, values_fill = 0) %>%
mutate(total = rowSums(select(., AA:CD)))
# x AA AB AC BA BB CC CD total
# <fct> <int> <int> <int> <int> <int> <int> <int> <dbl>
#1 A 2 1 1 0 0 0 0 4
#2 B 0 0 0 1 1 0 0 2
#3 C 0 0 0 0 0 3 1 4
We can specify the values_fn in pivot_wider and also use adorn_totals from janitor
library(dplyr)
library(tidyr)
library(janitor)
df %>%
pivot_wider(names_from = y, values_from = z, values_fill = 0,
values_fn = length) %>%
adorn_totals("col")
-output
# x AA AB AC BB BA CC CD Total
# A 2 1 1 0 0 0 0 4
# B 0 0 0 1 1 0 0 2
# C 0 0 0 0 0 3 1 4
Or using base R with xtabs and addmargins
addmargins(xtabs(z ~ x + y, transform(df, z = 1)), 2)
# y
#x AA AB AC BA BB CC CD Sum
# A 2 1 1 0 0 0 0 4
# B 0 0 0 1 1 0 0 2
# C 0 0 0 0 0 3 1 4

Create new columns based on comma-separated values in another column in R [duplicate]

This question already has answers here:
Convert column with pipe delimited data into dummy variables [duplicate]
(4 answers)
Closed 2 years ago.
I have some data similar to that below.
df <- data.frame(id = 1:5, tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"))
df
# id tags
# 1 1 A,B,AB,C
# 2 2 C
# 3 3 AB,E
# 4 4 <NA>
# 5 5 B,C
I'd like to create a new dummy variable column for each tag in the "tags" column, resulting in a dataframe like the following:
correct_df <- data.frame(id = 1:5,
tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"),
A = c(1, 0, 0, 0, 0),
B = c(1, 0, 0, 0, 1),
C = c(1, 1, 0, 0, 1),
E = c(0, 0, 1, 0, 0),
AB = c(1, 0, 1, 0, 0)
)
correct_df
# id tags A B C E AB
# 1 1 A,B,AB,C 1 1 1 0 1
# 2 2 C 0 0 1 0 0
# 3 3 AB,E 0 0 0 1 1
# 4 4 <NA> 0 0 0 0 0
# 5 5 B,C 0 1 1 0 0
One of the challenges is ensuring that the "A" column has 1 only for the "A" tag, so that it doesn't have 1 for the "AB" tag, for example. The following won't work for this reason, since "A" gets 1 for the "AB" tag:
df <- df %>%
mutate(A = ifelse(grepl("A", tags, fixed = T), 1, 0))
df
# id tags A
# 1 1 A,B,AB,C 1
# 2 2 C 0
# 3 3 AB,E 1 < Incorrect
# 4 4 <NA> 0
# 5 5 B,C 0
Another challenge is doing this programmatically. I can probably deal with a solution that manually creates a column for each tag, but a solution that doesn't assume which tag columns need to be created beforehand is best, since there can potentially be many different tags. Is there some relatively simple solution that I'm overlooking?
Does this work:
> library(tidyr)
> library(dplyr)
> df %>% separate_rows(tags) %>% mutate(A = case_when(tags == 'A' ~ 1, TRUE ~ 0),
+ B = case_when(tags == 'B' ~ 1, TRUE ~ 0),
+ C = case_when(tags == 'C' ~ 1, TRUE ~ 0),
+ E = case_when(tags == 'E' ~ 1, TRUE ~ 0),
+ AB = case_when(tags == 'AB' ~ 1, TRUE ~ 0)) %>%
+ group_by(id) %>% mutate(tags = toString(tags)) %>% group_by(id, tags) %>% summarise(across(A:AB, sum))
`summarise()` regrouping output by 'id' (override with `.groups` argument)
# A tibble: 5 x 7
# Groups: id [5]
id tags A B C E AB
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A, B, AB, C 1 1 1 0 1
2 2 C 0 0 1 0 0
3 3 AB, E 0 0 0 1 1
4 4 NA 0 0 0 0 0
5 5 B, C 0 1 1 0 0
>
Here's a solution:
library(dplyr)
library(stringr)
library(magrittr)
library(tidyr)
# Data
df <- data.frame(id = 1:5, tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"))
# Build one 0/1 column per tag while keeping the original tags column.
df <- df %>%
  # Work on a copy (t2) so that tags itself survives the split.
  mutate(t2 = tags) %>%
  # One row per individual tag.
  separate_rows(t2, sep = ",") %>%
  # Presence/absence marker that becomes the cell value after pivoting.
  mutate(pa = 1) %>%
  # One column per tag; fill with 0 where the tag is absent.
  pivot_wider(names_from = t2, values_from = pa, values_fill = 0) %>%
  # A row whose tags value is NA produces a spurious column literally named
  # "NA"; drop it (if present) so that such rows end up as all zeros.
  select(-any_of("NA"))
df
# # A tibble: 5 x 7
# id tags A B AB C E
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 A,B,AB,C 1 1 1 1 0
# 2 2 C 0 0 0 1 0
# 3 3 AB,E 0 0 1 0 1
# 4 4 NA 0 0 0 0 0
# 5 5 B,C 0 1 0 1 0
Edit: updated the code to enable it to retain the tags column. Sorry.

Add a column identifying the first row of each group defined by other columns

I have a dataset data with the columns X0 and value and would like to group by X0 after sorting and generate an indicator for the first row in each group which would look like the column first below:
X0 value first
1 A 26509 1
2 A 28146 0
3 B 19950 1
4 B 19981 0
5 B 20304 0
Another dplyr method.
library(dplyr)
dat2 <- dat %>%
group_by(X0) %>%
mutate(first = as.integer(row_number() == 1L)) %>%
ungroup()
dat2
# # A tibble: 5 x 3
# X0 value first
# <chr> <int> <int>
# 1 A 26509 1
# 2 A 28146 0
# 3 B 19950 1
# 4 B 19981 0
# 5 B 20304 0
Or use the data.table package.
library(data.table)
setDT(dat)
dat2 <- dat[, first := as.integer(rowid(X0) == 1L)]
dat2[]
# X0 value first
# 1: A 26509 1
# 2: A 28146 0
# 3: B 19950 1
# 4: B 19981 0
# 5: B 20304 0
DATA
dat <- read.table(text = "X0 value
1 A 26509
2 A 28146
3 B 19950
4 B 19981
5 B 20304",
header = TRUE, stringsAsFactors = FALSE)
There are multiple ways to do this. A dplyr way could be
library(dplyr)
df %>%
group_by(X0) %>%
mutate(new_first = if_else(row_number() == 1, 1, 0))
# X0 value first new_first
# <fct> <int> <int> <dbl>
#1 A 26509 1 1.00
#2 A 28146 0 0
#3 B 19950 1 1.00
#4 B 19981 0 0
#5 B 20304 0 0
The same logic can be used in base R ave method
df$new_first <- ave(df$value, df$X0, FUN = function(x)
ifelse(seq_along(x) == 1, 1, 0))
df
# X0 value first new_first
#1 A 26509 1 1
#2 A 28146 0 0
#3 B 19950 1 1
#4 B 19981 0 0
#5 B 20304 0 0
More concisely,
as.integer(ave(df$value, df$X0, FUN = seq_along) == 1)
#[1] 1 0 1 0 0
We can use duplicated from base R to get a logical vector based on the duplicate values of 'X0', convert it to binary with as.integer
df1$first <- as.integer(!duplicated(df1$X0))
df1$first
#[1] 1 0 1 0 0
If the 'value' column is not sorted
library(dplyr)
# Flag exactly one row per X0 group: the row holding the smallest value.
# row_number(value) ranks the values within the group and breaks ties by
# position, so a duplicated minimum does not yield several flagged rows
# (value == min(value) would flag all of them).
df1 %>%
  group_by(X0) %>%
  mutate(first = as.integer(row_number(value) == 1L)) %>%
  # Drop the grouping so later steps are not silently computed per group.
  ungroup()
data
df1 <- structure(list(X0 = c("A", "A", "B", "B", "B"), value = c(26509L,
28146L, 19950L, 19981L, 20304L), first = c(1L, 0L, 1L, 0L, 0L
)), .Names = c("X0", "value", "first"), class = "data.frame",
row.names = c("1", "2", "3", "4", "5"))

reshape or table data from long to wide [duplicate]

This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 4 years ago.
I'm using R and I'm really at a loss right now. I have data like this:
df <- data.frame(
group = c(2, 2, 2, 1, 1, 0, 0, 1, 1, 0, 1, 0),
grade = c(2, 4, 3, 1, 3, 2, 5, 1, 1, 2, 3, 1)
)
I want to have it like this:
group0 group1 group2
1 1 3 0
2 2 0 1
3 0 2 1
4 0 0 1
5 1 0 0
6 0 0 0
I've been trying for hours using subset, tapply, table, for loops and what not but I can't seem to figure it out. I'd be really happy if someone could help me, I can't help but think I'm missing something really easy and obvious.
How can I produce my target output?
/ Solved, see below. Thanks for finding a fitting title btw, you guys are the best!
You can do something like this with dplyr and tidyr:
df %>%
count(group, grade) %>%
mutate(group = paste0('group', group)) %>%
spread(group, n, fill = 0)
# A tibble: 5 x 4
grade group0 group1 group2
* <int> <dbl> <dbl> <dbl>
1 1 1 3 0
2 2 2 0 1
3 3 0 2 1
4 4 0 0 1
5 5 1 0 0
If you don't want the additional 'grade' column, you can do:
df %>%
count(group, grade) %>%
mutate(group = paste0('group', group)) %>%
spread(group, n, fill = 0) %>%
select(-grade)
group0 group1 group2
* <dbl> <dbl> <dbl>
1 1 3 0
2 2 0 1
3 0 2 1
4 0 0 1
5 1 0 0
Alternatively, consider a base R approach using: by for grouping, aggregate for counts, setNames for group## column names, and Reduce for chain merge of dataframes:
# One data frame of per-grade counts for each group; the count column is
# named after the group (group0, group1, ...).
grp_list <- by(df, df$group, function(d) {
  counts <- aggregate(. ~ grade, data = d, FUN = length)
  names(counts) <- c("grade", paste0("group", max(d$group)))
  counts
})
# Outer-join the per-group tables on grade, one after another.
final_df <- Reduce(
  function(left, right) merge(left, right, by = "grade", all = TRUE),
  grp_list
)
# A grade missing from a group comes out of the join as NA; count it as 0.
final_df[is.na(final_df)] <- 0
final_df
# grade group0 group1 group2
# 1 1 1 3 0
# 2 2 2 0 1
# 3 3 0 2 1
# 4 4 0 0 1
# 5 5 1 0 0
And to remove grade, use transform after chain merge or directly on final_df:
final_df <- transform(Reduce(function(x,y) merge(x,y, by="grade", all=TRUE), grp_list),
grade = NULL)
final_df <- transform(final_df, grade = NULL)

Resources