Factors in many columns to boolean convert - r

My question is a little like this but the problem with the structure of data is different:
Sample data:
df <-data.frame(id = c(1,2,3), stock_1 = c("Google","Microsoft","Yahoo"), stock_2 = c("Yahoo","Google","NA"))
and I would like to convert to this:
df <-data.frame(id = c(1,2,3), Google = c(1,1,0), Microsoft = c(0,1,0), Yahoo= c(1,0,1))
I tried to use sapply() but from the answer to the linked question it is only for one column.

Here is a way to do it with data.table:
library(data.table)
# Convert df to a data.table so the data.table melt/dcast methods are used.
setDT(df)
# Stack the stock columns into long form (one row per id/stock pair), drop
# the literal "NA" strings, then count how often each stock occurs per id.
# id.vars and value.var are spelled out rather than relying on partial
# argument matching and on dcast guessing the value column.
dcast(
  melt(df, id.vars = "id")[value != "NA"],
  id ~ value,
  fun.aggregate = length,
  value.var = "value"
)
# id Google Microsoft Yahoo
# 1: 1 1 0 1
# 2: 2 1 1 0
# 3: 3 0 0 1
fill = 0 is unnecessary, and to tolerate duplicates, we can try:
# Same reshape, but the aggregate collapses any non-empty group to 1, so a
# stock listed twice for the same id still yields 1 rather than 2.
# id.vars and value.var are named explicitly instead of being matched
# partially / guessed.
dcast(
  melt(df, id.vars = "id")[value != "NA"],
  id ~ value,
  fun.aggregate = function(x) as.numeric(length(x) > 0),
  value.var = "value"
)
2017-01-01
As mentioned by Uwe, we can remove NAs from the molten data by setting na.rm = TRUE, provided they are real NA values rather than the hard-coded string "NA". The commands then look like this:
# Count of each stock per id; na.rm = TRUE drops real NA values while melting.
# id.vars and value.var are named explicitly instead of being matched
# partially / guessed.
dcast(
  melt(df, id.vars = "id", na.rm = TRUE),
  id ~ value,
  fun.aggregate = length,
  value.var = "value"
)
# or, capping every count at 1 so duplicated stocks are tolerated
dcast(
  melt(df, id.vars = "id", na.rm = TRUE),
  id ~ value,
  fun.aggregate = function(x) as.numeric(length(x) > 0),
  value.var = "value"
)

We can also do this with tidyverse
library(tidyverse)
# Stack the stock columns, keep one row per id/stock pair, and spread the
# stocks back out as 0/1 indicator columns.
df %>%
  pivot_longer(cols = -id, names_to = "key", values_to = "val") %>%
  # Drop real NAs and the literal "NA" string used in the sample data, which
  # is.na() alone does not catch and which would otherwise become a column.
  filter(!is.na(val), val != "NA") %>%
  # One row per id/stock pair, so a stock repeated for the same id cannot
  # produce duplicate cells when widening.
  distinct(id, val) %>%
  mutate(ind = 1) %>%
  pivot_wider(
    names_from = val,
    values_from = ind,
    values_fill = 0,
    names_sort = TRUE  # sorted column order, as spread() produced
  )
NOTE: It is better to use NA instead of "NA" as we can take care of NA with is.na or na.omit or complete.cases

Related

Trying to pivot a table in R

I am trying to pivot a table in R for example:
To
Code to create the starting table:
df <- data.frame (ID = c("A","A","A","B","B","C"),
Dates = c("01/01/2021", "10/02/2021", "30/03/2021","04/04/2021","06/05/2021","20/06/2021"))
Assume there is a max of three dates for the above example.
You are missing a column with the identifier "Date1", "Date2", "Date3". You can create it with mutate(), then use pivot_wider() from the tidyverse library.
library(tidyverse)

# Same sample data as in the question.
dt <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C"),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021", "20/06/2021"
  )
)

# Label every date with its position inside its ID ("Date1", "Date2", ...)
# and use that label as the column name when widening.
dt %>%
  group_by(ID) %>%
  mutate(col = paste0("Date", seq_len(n()))) %>%
  pivot_wider(id_cols = ID, names_from = col, values_from = Dates)
This is my approach :
# dplyr supplies %>%, group_by() and mutate(); dcast() is called through its
# namespace so the snippet runs on its own.
library(dplyr)

my_df <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C"),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021", "20/06/2021"
  ),
  stringsAsFactors = FALSE
)
# Label each date by its position within its ID: "Date1", "Date2", ...
my_df <- my_df %>%
  group_by(ID) %>%
  mutate(value = paste0("Date", seq_along(ID))) %>%
  ungroup()  # the grouping is only needed for the numbering
# One row per ID, one column per label, cells holding the dates.
my_df <- reshape2::dcast(my_df, ID ~ value, value.var = "Dates")
Here's an approach similar to what you're requesting.
# Cross-tabulate dates against IDs: the formula puts one row per Dates value
# and one column per ID, and fun.aggregate = length fills each cell with the
# number of matching rows. df is overwritten with the result.
# NOTE(review): this yields a Dates-by-ID count table, not the
# ID/Date1/Date2/Date3 layout shown by the other answers.
library("maditr")
df <- dcast(df, Dates ~ ID,fun.aggregate = length)
Another solution, using data.table
library(data.table)

df <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C"),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021", "20/06/2021"
  )
)
setDT(df)

# Collapse each ID's dates into a single comma-separated string ...
df <- df[, .(dates = paste(Dates, collapse = ", ")), by = ID]
# ... then split that string back out into three fixed columns; IDs with
# fewer than three dates are padded with NA.
df[, c("Date1", "Date2", "Date3") := tstrsplit(dates, ", ")]
# The helper column is no longer needed.
df[, dates := NULL]
df
# ID Date1 Date2 Date3
# 1: A 01/01/2021 10/02/2021 30/03/2021
# 2: B 04/04/2021 06/05/2021 <NA>
# 3: C 20/06/2021 <NA> <NA>
A base R option using reshape
# Append a per-ID running index `q`, then use it as the "time" variable so
# that each ID's dates fan out into Dates.1, Dates.2, ... columns.
reshape(
  cbind(df, q = with(df, ave(seq_along(ID), ID, FUN = seq_along))),
  idvar = "ID",
  timevar = "q",
  direction = "wide"
)
gives
ID Dates.1 Dates.2 Dates.3
1 A 01/01/2021 10/02/2021 30/03/2021
4 B 04/04/2021 06/05/2021 <NA>
6 C 20/06/2021 <NA> <NA>

Splitting column values by field value in data frame

I need to separate the values in a column in R. I went through this article, but it does not name the columns according to the split values.
https://tidyr.tidyverse.org/reference/separate.html
My column value is like this:
df <- data.frame(c("['78', '79', '80', '98']",
"['1', '78', '80']"))
colnames(df) <- c("list")
Required Output:
val_1
val_78
val_79
val_80
val_98
0
1
1
1
1
1
1
0
1
0
Thank you!
Here are a couple of options, both of which require cleaning the column first by removing characters like [, ] and ' from the text.
Using splitstackshape::cSplit_e.
library(dplyr)
library(tidyr)

# Strip the brackets and quotes so each cell is a plain comma-separated list,
# then expand it into one 0/1 indicator column per distinct value; the
# original text column is dropped.
df %>%
  mutate(val = gsub("\\[|\\]|'", "", val)) %>%
  splitstackshape::cSplit_e(
    split.col = "val",
    sep = ',\\s',
    type = "character",
    fixed = FALSE,
    drop = TRUE,
    fill = 0
  )
# val_1 val_78 val_79 val_80 val_98
#1 0 1 1 1 1
#2 1 1 0 1 0
tidyverse -
# Clean the text and remember which original row each value came from, put
# every value on its own row, then count values per original row in wide form.
df %>%
  mutate(
    val = gsub("\\[|\\]|'", "", val),
    row = row_number()
  ) %>%
  separate_rows(val, sep = ',\\s+') %>%
  pivot_wider(
    names_from = val,
    values_from = val,
    names_prefix = 'val_',
    values_fn = length,
    values_fill = 0
  ) %>%
  # The row id was only needed to keep the original rows apart.
  select(-row)
data
# Two rows, each holding a bracketed, quoted, comma-separated list as text.
df <- data.frame(
  val = c(
    "['78', '79', '80', '98']",
    "['1', '78', '80']"
  )
)

Assign group id start from 0 and end with 1 in R

I have the following dataset:
# Sample data: a single 0/1 column; the desired ID restarts after each 1.
DT <- data.frame(v1 = c(0, 0, 0, 1, 0, 0, 1))
I want to create an ID that counts up cumulatively and restarts after each value of 1.
The ID should be
ID<-c(1,2,3,4,1,2,3)
If you are using dplyr, this will do the trick.
# dplyr must be attached: the pipe and row_number() are used unqualified.
library(dplyr)

DT <- data.frame(v1 = c(0, 0, 0, 1, 0, 0, 1))
DT %>%
  mutate(
    rno = row_number(),
    # Only rows where v1 is 1 get a group label (their own row number) ...
    group = ifelse(v1 == 0, NA, rno)
  ) %>%
  # ... which is then copied upwards onto the preceding 0 rows.
  tidyr::fill(group, .direction = "up") %>%
  group_by(group) %>%
  # Running counter inside each group.
  mutate(ID = row_number()) %>%
  ungroup() %>%
  select(v1, ID)
In base R, we can use ave :
# Group key = running total of v1 shifted down by one row, so that each 1
# stays in the group it closes; seq_along then numbers the rows per group.
with(DT, {
  grp <- c(0, head(cumsum(v1), -1))
  ave(v1, grp, FUN = seq_along)
})
#[1] 1 2 3 4 1 2 3
In dplyr , we can use lag to create groups and assign row number in each group.
library(dplyr)

# Lagging the running total of v1 keeps each 1 in the group it closes;
# ID is then the row position inside that group.
DT %>%
  group_by(gr = lag(cumsum(v1), default = 0)) %>%
  mutate(ID = seq_len(n()))
and we can use the same logic in data.table :
library(data.table)

# rowid() numbers the rows within each distinct value of the lagged running
# total of v1; ID is added to DT by reference.
setDT(DT)[, ID := rowid(shift(cumsum(v1), fill = 0))]

Summarizing and spreading data

I have data similar to below :
df=data.frame(
company=c("McD","McD","McD","KFC","KFC"),
Title=c("Crew Member","Manager","Trainer","Crew Member","Manager"),
Manhours=c(12,NA,5,13,10)
)
df
I would wish to manipulate it and obtain the data frame as below:
df=data.frame(
company=c("KFC", "McD"),
Manager=c(1,1),
Surbodinate=c(1,2),
TotalEmp=c(2,3),
TotalHours=c(23,17)
)
I have managed to manipulate and categorise the employees as well as their count as below:
df<- df %>%
mutate(Role = if_else((Title=="Manager" ),
"Manager","Surbodinate"))%>%
count(company, Role) %>%
spread(Role, n, fill=0)%>%
as.data.frame() %>%
mutate(TotalEmp= select(., Manager:Surbodinate) %>%
apply(1, sum, na.rm=TRUE))
Also, I have summarised the man hours as below:
df <- df %>%group_by(company) %>%
summarize(TotalHours = sum(Manhours, na.rm = TRUE))
How would I combine these two steps at once or is there a cleaner/simpler way of getting the desired output?
dplyr solution:
# Per company: recode every non-manager title, then count managers,
# subordinates and all employees, and total the hours ignoring NAs.
df %>%
  group_by(company) %>%
  mutate(Title = if_else(Title == "Manager", "Manager", "Surbodinate")) %>%
  summarise(
    Manager = sum(Title == "Manager"),
    Subordinate = sum(Title == "Surbodinate"),
    TotalEmp = n(),
    Manhours = sum(Manhours, na.rm = TRUE)
  )
company Manager Subordinate TotalEmp Manhours
<fct> <int> <int> <int> <dbl>
1 KFC 1 1 2 23
2 McD 1 2 3 17
how about something like this:
# Tag each row with its role, attach the per-company totals to every row,
# then count roles per company in wide form.
df %>%
  mutate(Role = ifelse(Title == "Manager", "Manager", "Surbodinate")) %>%
  group_by(company) %>%
  mutate(
    TotalEmp = n(),
    TotalHours = sum(Manhours, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # Name the value column and the aggregate explicitly instead of letting
  # dcast guess the column and fall back to length on its own.
  reshape2::dcast(
    company + TotalEmp + TotalHours ~ Role,
    fun.aggregate = length,
    value.var = "Role"
  )
This is not tidyverse nor is it a one step process. But if you use data.table you could do:
library(data.table)
# Convert df to a keyed data.table by reference; everything below works on
# df itself (no separate DT object is ever created).
setDT(df, key = "company")
# Head count and total hours per company.
totals <- df[
  ,
  .(TotalEmp = .N, TotalHours = sum(Manhours, na.rm = TRUE)),
  by = company
]
# Count rows per company and role, then join the totals on company.
# length() counts rows regardless of NA, so any column works as value.var;
# Manhours is the one dcast would have guessed.
dcast(
  df,
  company ~ ifelse(Title == "Manager", "Manager", "Surbodinate"),
  fun.aggregate = length,
  value.var = "Manhours"
)[totals, on = "company"]
# company Manager Surbodinate TotalEmp TotalHours
# 1 KFC 1 1 2 23
# 2 McD 1 2 3 17

removing groups with a certain NA number

Sorry to bother with a relatively simple question perhaps.
I have this type of dataframe:
A long list of names in the column "NAME" c(a, b, c, d, e ...) , two potential classes in the column "SURNAME" c(A, B) and a third column containing values.
I want to remove all NAMES for which at least in one of the SURNAME classes I have more than 2 "NA" in the VALUE column.
I wanted to post an example dataset but I am struggling to format it properly
I was trying to use
df <- df %>%
group_by(NAME) %>%
group_by(SURNAME) %>%
filter(!is.na(VALUE)) %>%
filter(length(VALUE)>=3)
it does not throw an error but I have the impression that something is wrong. Any suggestion? Many thanks
Let's create a dataset to work with:
# Reproducible toy data: 1000 rows of random name/surname pairs with a value
# drawn from 10, 20, ..., 100 or NA.
set.seed(1234)
df <- data.frame(
  name = sample(letters, size = 1000, replace = TRUE),
  surname = sample(c("A", "B"), size = 1000, replace = TRUE),
  value = sample(c(seq(10, 100, by = 10), NA), size = 1000, replace = TRUE),
  stringsAsFactors = FALSE
)
Here's how to do it with Base R:
# Count NAs per name/surname pair. na.action = NULL is essential here:
# without it the NA rows never reach FUN.
agg <- aggregate(
  value ~ name + surname,
  data = df,
  FUN = function(x) sum(is.na(x)),
  na.action = NULL
)
# The aggregated column now holds a count of NAs, so rename it accordingly.
names(agg)[names(agg) == "value"] <- "number_of_na"
# Attach each pair's NA count to the original rows.
df <- merge(df, agg, by = c("name", "surname"))
# Keep only the rows whose pair has fewer than 3 NAs.
result <- df[which(df$number_of_na < 3), ]
Here's how to do it with data.table:
library(data.table)

dt <- as.data.table(df)
# Write each name/surname pair's NA count onto every row of that pair.
dt[, number_of_na := sum(is.na(value)), by = c("name", "surname")]
# Keep only the rows whose pair has at most 2 NAs.
result <- dt[number_of_na <= 2]
Here's how to do it with dplyr/tidyverse:
library(dplyr) # or library(tidyverse)
# Count NAs per name/surname pair (one row per pair), join those counts back
# onto every row of the original df, then keep the rows whose pair has fewer
# than 3 NAs.
# NOTE(review): the filter applies per name/surname pair, not per name; a
# name is kept for one surname even if its other surname has 3 or more NAs.
result <- df %>%
group_by(name, surname) %>%
summarize(number_of_na = sum(is.na(value))) %>%
right_join(df, by=c("name", "surname")) %>%
filter(number_of_na < 3)
After grouping by 'NAME', 'SURNAME', create a column with the number of NA elements in that group and then filter out any 'NAME' that have an 'ind' greater than or equal to 3
# ind = number of NAs in each NAME/SURNAME pair; a NAME is kept only when
# every one of its pairs stays below 3.
df %>%
  group_by(NAME, SURNAME) %>%
  mutate(ind = sum(is.na(VALUE))) %>%
  group_by(NAME) %>%
  filter(all(ind < 3)) %>%
  select(-ind)
Or do an anti_join after doing the filtering by 'NAME', 'SURNAME' based on the condition
# Collect the NAMEs that have at least one NAME/SURNAME pair with 3 or more
# NAs, then remove every row of df belonging to those NAMEs.
df %>%
  group_by(NAME, SURNAME) %>%
  filter(sum(is.na(VALUE)) > 2) %>%
  ungroup() %>%
  distinct(NAME) %>%
  anti_join(x = df, y = .)
data
# Reproducible toy data: five NAMEs with 20 rows each, a random SURNAME
# class and a VALUE drawn from NA, 1, 2, 3.
set.seed(24)
df <- data.frame(
  NAME = rep(head(letters, 5), each = 20),
  SURNAME = sample(LETTERS[1:4], size = 100, replace = TRUE),
  VALUE = sample(c(NA, 1:3), size = 100, replace = TRUE),
  stringsAsFactors = FALSE
)

Resources