adding 1/0 columns from list all at once - r

I have a dataframe with identifiers and storm categories. Right now, the categories are in one column, but I want to add columns for each category with a 1 or 0 value. I don't think I want to reshape the data as wide, because in the actual dataset there are a number of long format variables I want to keep. I am using a series of ifelse statements currently, but it feels like there is probably a much better way:
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c("A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"),
cat = c("TS", NA, NA, "TS", "1", "1", NA, NA, "2", NA, NA, NA)
)
df$cat_TS <- ifelse(df$cat == "TS", 1, 0) %>% replace_na(., 0)
df$cat_1 <- ifelse(df$cat == "1", 1, 0) %>% replace_na(., 0)
df$cat_2 <- ifelse(df$cat == "2", 1, 0) %>% replace_na(., 0)

We may use pivot_wider - create a sequence column 'rn', and then use pivot_wider to reshape to wide with values_fn as length and values_fill as 0
library(dplyr)
library(tidyr)
library(stringr)
df %>%
  # Give every row its own id so pivot_wider keeps one output row per input row.
  mutate(rn = row_number(), cat1 = cat) %>%
  pivot_wider(
    names_from = cat1, values_from = cat1,
    values_fn = length, values_fill = 0, names_prefix = "cat_"
  ) %>%
  # any_of() tolerates data without missing categories, where no cat_NA
  # column is created and a bare -cat_NA would raise an error.
  select(-any_of("cat_NA"), -rn)
-output
# A tibble: 12 × 5
ID cat cat_TS cat_1 cat_2
<chr> <chr> <int> <int> <int>
1 A TS 1 0 0
2 B <NA> 0 0 0
3 C <NA> 0 0 0
4 D TS 1 0 0
5 A 1 0 1 0
6 B 1 0 1 0
7 C <NA> 0 0 0
8 D <NA> 0 0 0
9 A 2 0 0 1
10 B <NA> 0 0 0
11 C <NA> 0 0 0
12 D <NA> 0 0 0
or use fastDummies
library(fastDummies)
df %>%
dummy_cols("cat", remove_selected_columns = FALSE, ignore_na = TRUE) %>%
mutate(across(starts_with('cat_'), ~ replace_na(.x, 0)))
-output
ID cat cat_1 cat_2 cat_TS
1 A TS 0 0 1
2 B <NA> 0 0 0
3 C <NA> 0 0 0
4 D TS 0 0 1
5 A 1 1 0 0
6 B 1 1 0 0
7 C <NA> 0 0 0
8 D <NA> 0 0 0
9 A 2 0 1 0
10 B <NA> 0 0 0
11 C <NA> 0 0 0
12 D <NA> 0 0 0

An idea using base R
First, get all unique category names
cats <- unique(df$cat[!is.na(df$cat)])
cats
[1] "TS" "1" "2"
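Then look for matches in column cat for each entry in cats. PS: I left the cat column in to show that the matching is right. To remove it, use df["ID"] instead of df as the first argument in cbind (df["ID"] keeps the column name, whereas df$ID would produce a column literally named df$ID).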
# One 0/1 indicator column per category. %in% treats NA as "no match", so rows
# with a missing category get 0 rather than NA.
# vapply() pins the result to one numeric column of nrow(df) values per
# category, instead of relying on sapply()'s input-dependent simplification.
dummies <- vapply(
  cats,
  function(category) as.numeric(df$cat %in% category),
  numeric(nrow(df))
)
# data.frame() would rename "1" and "2" to "X1" and "X2"; setNames() restores
# the raw category labels as column names.
cbind(df, setNames(data.frame(dummies), cats))
ID cat TS 1 2
1 A TS 1 0 0
2 B <NA> 0 0 0
3 C <NA> 0 0 0
4 D TS 1 0 0
5 A 1 0 1 0
6 B 1 0 1 0
7 C <NA> 0 0 0
8 D <NA> 0 0 0
9 A 2 0 0 1
10 B <NA> 0 0 0
11 C <NA> 0 0 0
12 D <NA> 0 0 0

Related

Make a new column for every variable and tally [duplicate]

This question already has answers here:
R Split delimited strings in a column and insert as new column (in binary) [duplicate]
(3 answers)
Closed 4 months ago.
I have the following dataframe:
sample name
1 a cobra, tiger, reptile
2 b tiger, spynx
3 c reptile, cobra
4 d sphynx, tiger
5 e cat, dog, tiger
6 f dog, spynx
and what I want to make from that is.
sample cobra tiger spynx reptile cat dog
1 a 1 1 0 1 0 0
2 b 0 1 1 0 0 0
3 c 1 0 0 1 0 0
4 d 0 1 1 0 0 0
5 e 0 1 0 0 1 1
6 f 0 0 1 0 0 1
So basically, I want to make a new column for each value that appears in the name column, and put a 1 if that value is present in df$name for the row and a 0 if it is not.
# Collect every distinct value that appears in the comma-separated name column.
all <- unique(unlist(strsplit(as.character(df$name), ", ")))
all <- all[!is.na(all)]
# Add one zero-filled column per value. The loop has to run over the names in
# `all`; iterating over `df` itself would walk the existing columns instead.
for (i in all) {
  df[i] <- 0
}
this gives me all the variables as 0's, and now I want to match it to the name column, and if it is present make a 1 out of the 0
How would you approach this?
With tidyr and dplyr...
library(tidyr)
library(dplyr, warn = FALSE)
df1 |>
separate_rows(name) |>
group_by(sample, name) |>
summarise(count = n(), .groups = "drop") |>
pivot_wider(names_from = "name", values_from = "count", values_fill = 0)
#> # A tibble: 6 × 8
#> sample cobra reptile tiger spynx sphynx cat dog
#> <chr> <int> <int> <int> <int> <int> <int> <int>
#> 1 a 1 1 1 0 0 0 0
#> 2 b 0 0 1 1 0 0 0
#> 3 c 1 1 0 0 0 0 0
#> 4 d 0 0 1 0 1 0 0
#> 5 e 0 0 1 0 0 1 1
#> 6 f 0 0 0 1 0 0 1
Created on 2022-10-19 with reprex v2.0.2
data
df1 <- data.frame(sample = letters[1:6],
name = c("cobra, tiger, reptile",
"tiger, spynx",
"reptile, cobra",
"sphynx, tiger",
"cat, dog, tiger",
"dog, spynx"))

Separate rows to make dummy rows

Consider this dataframe:
dat <- structure(list(col1 = c(1, 2, 0), col2 = c(0, 3, 2), col3 = c(1, 2, 3)), class = "data.frame", row.names = c(NA, -3L))
col1 col2 col3
1 1 0 1
2 2 3 2
3 0 2 3
How can one dummify rows? i.e. whenever there is a row with more than 1 non-0 value, separate the row into multiple rows with one non-0 value per row.
In this case, this would be:
col1 col2 col3
1 1 0 0
2 0 0 1
3 2 0 0
4 0 3 0
5 0 0 2
6 0 2 0
7 0 0 3
You can do:
library(tidyverse)
dat |>
pivot_longer(everything()) |>
mutate(id = 1:n()) |>
pivot_wider(values_fill = 0) |>
filter(!if_all(-id, ~ . == 0)) |>
select(-id)
# A tibble: 7 x 3
col1 col2 col3
<dbl> <dbl> <dbl>
1 1 0 0
2 0 0 1
3 2 0 0
4 0 3 0
5 0 0 2
6 0 2 0
7 0 0 3
Another approach, here I used data.table
library(data.table)
x <- rbindlist(apply(dat, 1, function(row) {
  # Put the values of one input row on a diagonal: one candidate output row
  # per column, each holding a single value from the input row.
  expanded <- data.table(diag(row, ncol(dat)))
  # Keep only the rows that carry a non-zero value. Filtering on rowSums with
  # != 0 states the row-wise intent and also keeps negative values.
  expanded[rowSums(expanded) != 0]
}))
setnames(x, names(dat))
x
# col1 col2 col3
# 1: 1 0 0
# 2: 0 0 1
# 3: 2 0 0
# 4: 0 3 0
# 5: 0 0 2
# 6: 0 2 0
# 7: 0 0 3
A very ugly way is:
library(tidyverse)
dat %>%
apply(1, diag) %>%
matrix(nrow = 3) %>%
t() %>%
as.data.frame() %>%
rename_with(~ names(dat), everything()) %>%
filter(rowSums(.) != 0)
col1 col2 col3
1 1 0 0
2 0 0 1
3 2 0 0
4 0 3 0
5 0 0 2
6 0 2 0
7 0 0 3

How to summarize values across multiple columns?

I have a dataframe that looks like this:
1 2 3 4 5
A B A B A
C B B B B
A C A B B
And I would like to summarize my data in a frequency table like this:
A B C
1 2 0 1
2 0 2 1
3 2 1 0
4 0 3 0
5 1 2 0
How would I be able to do this?
You can use the following solution:
library(tidyr)
library(janitor)
tab %>%
pivot_longer(everything(), names_to = "nm", values_to = "val",
names_prefix = "X") %>%
tabyl(nm, val)
nm A B C
1 2 0 1
2 0 2 1
3 2 1 0
4 0 3 0
5 1 2 0
You could use
library(tidyr)
data %>%
pivot_longer(everything()) %>%
{ table(.$name, .$value) }
which returns
A B C
1 2 0 1
2 0 2 1
3 2 1 0
4 0 3 0
5 1 2 0
Another table option with stack
> t(table(stack(df)))
values
ind A B C
1 2 0 1
2 0 2 1
3 2 1 0
4 0 3 0
5 1 2 0
An option with base R with table
# col() numbers the cells in column-major order, so the values must be taken
# in the same order (unlist(df1)). c(t(df1)) walks the data row by row and
# would pair each value with the wrong column index.
table(c(col(df1)), unlist(df1, use.names = FALSE))
# A B C
# 1 2 0 1
# 2 0 2 1
# 3 2 1 0
# 4 0 3 0
# 5 1 2 0
Data
df1 <- structure(list(`1` = c("A", "C", "A"), `2` = c("B", "B", "C"),
`3` = c("A", "B", "A"), `4` = c("B", "B", "B"), `5` = c("A",
"B", "B")), class = "data.frame", row.names = c(NA, -3L))

adding together multiple sets of columns in r

I'm trying to add several sets of columns together.
Example df:
df <- data.frame(
key = 1:5,
ab0 = c(1,0,0,0,1),
ab1 = c(0,2,1,0,0),
ab5 = c(1,0,0,0,1),
bc0 = c(0,1,0,2,0),
bc1 = c(2,0,0,0,0),
bc5 = c(0,2,1,0,1),
df0 = c(0,0,0,1,0),
df1 = c(1,0,3,0,0),
df5 = c(1,0,0,0,6)
)
Giving me:
key ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5
1 1 1 0 1 0 2 0 0 1 1
2 2 0 2 0 1 0 2 0 0 0
3 3 0 1 0 0 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 1 0 1 0 0 1 0 0 6
I want to add all sets of columns with 0s and 5s in them together and place them in the 0 column.
So the end result would be:
key ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5
1 1 2 0 1 0 2 0 1 1 1
2 2 0 2 0 3 0 2 0 0 0
3 3 0 1 0 1 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 2 0 1 1 0 1 6 0 6
I could add the columns together using 3 lines:
df$ab0 <- df$ab0 + df$ab5
df$bc0 <- df$bc0 + df$bc5
df$df0 <- df$df0 + df$df5
But my real example has over a hundred columns so I'd like to iterate over them and use apply.
The column names of the first set are contained in col0 and the names of the second set are in col5.
col0 <- c("ab0","bc0","df0")
col5 <- c("ab5","bc5","df5")
I created a function to add the columns together using mapply:
# Adds column y to column x inside the function's own copy of df. The value of
# the function is the result of that assignment, not the updated data frame,
# so the caller's df is never modified.
fun1 <- function(df,x,y) {
df[,x] <- df[,x] + df[,y]
}
# mapply() steps through the elements of every argument in parallel, so fun1
# receives one column of df (a plain vector) at a time; two-index subsetting
# such as df[, x] on a vector raises "incorrect number of dimensions".
mapply(fun1,df,col0,col5)
But I get an error: Error in df[, x] : incorrect number of dimensions
Thoughts?
Simply add two data frames together by their subsetted columns, assuming they will be the same length. No loops needed. All vectorized operation.
# Pair the columns through the names listed in col0/col5 rather than a loose
# grep("0") / grep("5"), which would also pick up names such as "ab10" or
# "ab50" and relies on both matches coming back in the same order.
final_df <- df[col0] + df[col5]
# Re-attach every column that was not replaced, then sort the columns by name.
final_df <- cbind(final_df, df[setdiff(names(df), col0)])
final_df <- final_df[order(names(final_df))]
final_df
# ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5 key
# 1 2 0 1 0 2 0 1 1 1 1
# 2 0 2 0 3 0 2 0 0 0 2
# 3 0 1 0 1 0 1 0 3 0 3
# 4 0 0 0 2 0 0 1 0 0 4
# 5 2 0 1 1 0 1 6 0 6 5
Rextester demo
You could use map2 from the purrr package to iterate over the two vectors at once:
df <- data.frame(
key = 1:5,
ab0 = c(1,0,0,0,1),
ab1 = c(0,2,1,0,0),
ab5 = c(1,0,0,0,1),
bc0 = c(0,1,0,2,0),
bc1 = c(2,0,0,0,0),
bc5 = c(0,2,1,0,1),
df0 = c(0,0,0,1,0),
df1 = c(1,0,3,0,0),
df5 = c(1,0,0,0,6)
)
col0 <- c("ab0","bc0","df0")
col5 <- c("ab5","bc5","df5")
# Sum each "0" column with its "5" partner and write all results back in one
# assignment, so the callback needs no global assignment (`<<-`).
df[col0] <- purrr::map2(df[col0], df[col5], function(x, y) x + y)
> df
key ab0 ab1 ab5 bc0 bc1 bc5 df0 df1 df5
1 1 2 0 1 0 2 0 1 1 1
2 2 0 2 0 3 0 2 0 0 0
3 3 0 1 0 1 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 2 0 1 1 0 1 6 0 6
Here's an approach using tidyr and dplyr from the tidyverse meta-package.
First, I bring the table into long ("tidy") format, and split out the column into two components, and spread by the number part of those components.
Then I do the calculation you describe.
Finally, I bring it back into the original format using the inverse of step 1.
library(tidyverse)
df_tidy <- df %>%
  # Step 1: go long, splitting each name after its second character
  # (e.g. "ab0" -> grp = "ab", num = "0"), then make one column per num.
  pivot_longer(-key, names_to = c("grp", "num"), names_sep = 2) %>%
  pivot_wider(names_from = num, values_from = value) %>%
  # Step 2
  mutate(`0` = `0` + `5`) %>%
  # Step 3, which is just the inverse of Step 1.
  pivot_longer(-c(key, grp), names_to = "num", values_to = "value") %>%
  pivot_wider(names_from = c(grp, num), values_from = value) %>%
  # Return a plain data frame, as the original gather()/spread() pipeline did.
  as.data.frame()
df_tidy
key ab_0 ab_1 ab_5 bc_0 bc_1 bc_5 df_0 df_1 df_5
1 1 2 0 1 0 2 0 1 1 1
2 2 0 2 0 3 0 2 0 0 0
3 3 0 1 0 1 0 1 0 3 0
4 4 0 0 0 2 0 0 1 0 0
5 5 2 0 1 1 0 1 6 0 6

How to split data in each records in R?

I have a dataframe which has a column,
service-id
ids-1-2-3-4-5
ids-1-2-6
ids-5
ids-7-8
with many other columns.
I want to split values such as ids-1-2-3-4-5 into separate columns 1, 2, 3, ..., 8, as in one-hot encoding: each of the columns 1 to 8 should contain 1 if that id is present in the record and 0 if it is not.
col.1 col.2 col.3 col.4 col.5 col.6 ..... col.8
1 1 1 1 1 0 ..... 0 for ids-1-2-3-4-5
1 1 0 0 0 1 ...... 0 for ids-1-2-6
I tried tidyverse but it is not helpful.
A solution using basic R code.
Your data
db<-data.frame("service-id"=c("ids-1-2-3-4-5","ids-1-2-6","ids-5","ids-7-8"))
Identify number of columns
# The "ids" prefix cannot be parsed as a number and becomes NA (hence
# suppressWarnings and na.rm); the largest remaining id is the number of
# indicator columns needed. TRUE is spelled out because T can be reassigned.
ncol <- max(
  suppressWarnings(as.numeric(unlist(strsplit(as.character(db$service.id), "-")))),
  na.rm = TRUE
)
Extract numeric id list
number_list<-strsplit(as.character(db$service.id),"-")
number_list<-suppressWarnings(lapply(number_list,as.numeric))
number_list <- lapply(number_list, function(x) x[!is.na(x)])
Create output dataframe
# Build a 0/1 indicator vector of length `ncol`: position i is 1 when the id i
# occurs in `x` and 0 otherwise.
#   x    numeric vector of the ids found in one record
#   ncol total number of indicator columns to produce
f <- function(x, ncol) {
  # seq_len() yields an empty sequence for ncol = 0, where seq(1:ncol) would
  # expand c(1, 0) into the positions 1 and 2.
  as.numeric(seq_len(ncol) %in% x)
}
# Stack the per-record indicator vectors as the rows of a matrix. rbind() on
# the unnamed list gives no row names, so no data.frame/transposition round
# trip and no row-name reset are needed.
out <- do.call(rbind, lapply(number_list, f, ncol = ncol))
# seq_len() instead of seq(1:ncol) keeps the labels correct for any ncol.
colnames(out) <- paste0("col.", seq_len(ncol))
Your output
out
col.1 col.2 col.3 col.4 col.5 col.6 col.7 col.8
[1,] 1 1 1 1 1 0 0 0
[2,] 1 1 0 0 0 1 0 0
[3,] 0 0 0 0 1 0 0 0
[4,] 0 0 0 0 0 0 1 1
If we need tidyverse option, here is a way
library(tidyverse)
df1 %>%
rownames_to_column('rn') %>%
extract(service.id, into = c('id', 'col'), "^([^-]+)-(.*)") %>%
separate_rows(col) %>%
mutate(n = 1, col = paste0("col.", col)) %>%
spread(col, n, fill = 0) %>%
select(-rn, -id)
# col.1 col.2 col.3 col.4 col.5 col.6 col.7 col.8
#1 1 1 1 1 1 0 0 0
#2 1 1 0 0 0 1 0 0
#3 0 0 0 0 1 0 0 0
#4 0 0 0 0 0 0 1 1
data
df1 <- structure(list(service.id = c("ids-1-2-3-4-5", "ids-1-2-6", "ids-5",
"ids-7-8")), .Names = "service.id", class = "data.frame", row.names = c(NA,
-4L))

Resources