Splitting column values by field value in data frame - r

I need to separate the values in a column in R. I went through this article, but it does not give name of the column according to the splitted value.
https://tidyr.tidyverse.org/reference/separate.html
My column value is like this:
df <- data.frame(c("['78', '79', '80', '98']",
"['1', '78', '80']"))
colnames(df) <- c("list")
Required Output:
val_1
val_78
val_79
val_80
val_98
0
1
1
1
1
1
1
0
1
0
Thank you!

Here are couple of options both of which require cleaning of the column by removing values like []' in the text.
Using splitstackshape::cSplit_e.
library(dplyr)
library(tidyr)
df %>%
mutate(val = gsub("\\[|\\]|'", "", val)) %>%
splitstackshape::cSplit_e('val', sep = ',\\s', fixed = FALSE,
type = 'character', fill = 0, drop = TRUE)
# val_1 val_78 val_79 val_80 val_98
#1 0 1 1 1 1
#2 1 1 0 1 0
tidyverse -
df %>%
mutate(val = gsub("\\[|\\]|'", "", val)) %>%
mutate(row = row_number()) %>%
separate_rows(val, sep = ',\\s+') %>%
pivot_wider(names_from = val, values_from = val, values_fn = length,
values_fill = 0, names_prefix = 'val_') %>%
select(-row)
data
df <- data.frame(val = c("['78', '79', '80', '98']","['1', '78', '80']"))

Related

Assigning Counts from Unstructed Text

I'm looking for a more efficient method to assign animal counts to their respective columns.
have:
data.frame(
size = c(50,50,50),
type = c('50 monkeys', '25 monkeys | 25 apes', '30 monkeys | 10 monkeys | 10 monkeys')
)
want:
data.frame(
size = c(50,50,50),
monkeys = c(50, 25, 50),
apes = c(0, 25, 0)
)
current method:
dat_ %>%
mutate(monkeys = unlist(lapply(str_extract_all(type,paste0('[0-9]+',’monkeys’)), function(x) sum(parse_number(x)))))
You can use separate_rows to split the | delimited data in different rows, separate to split count and name of animal in different columns and pivot_wider to get data in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number()) %>%
separate_rows(type, sep = '\\s*\\|\\s*') %>%
separate(type, c('count', 'type'), sep = '\\s+', convert = TRUE) %>%
pivot_wider(names_from = type, values_from = count,
values_fn = sum, values_fill = 0) %>%
select(-row)
# size monkeys apes
# <dbl> <int> <int>
#1 50 50 0
#2 50 25 25
#3 50 50 0

dplyr count unique and repeat id's by months

I have a df that looks like the following:
ID DATE
12 10-20-20
12 10-22-20
10 10-15-20
9 10-10-20
11 11-01-20
7 11-02-20
I would like to group by month and then create a column for unique id count and repeat id count like below:
MONTH Unique_Count Repeat_Count
10-1-20 2 2
11-1-20 2 0
I am able to get the date down to the first of the month and group by ID but I am not sure how to count unique instances within the months.
df %>%
mutate(month = floor_date(as.Date(DATE), "month")) %>%
group_by(ID) %>%
mutate(count = n())
Are you perhaps looking for:
df %>%
mutate(month = strftime(floor_date(as.Date(DATE, "%m-%d-%y"), "month"),
"%m-%d-%y")) %>%
group_by(month) %>%
summarize(unique_count = length(which(table(ID) == 1)),
repeat_count = sum(table(ID)[(which(table(ID) > 1))]))
#> # A tibble: 2 x 3
#> month unique_count repeat_count
#> <chr> <int> <int>
#> 1 10-01-20 2 2
#> 2 11-01-20 2 0
Here's a shot at it:
library(lubridate)
library(dplyr)
dates <- as.Date(c("2020-10-15", "2020-10-15", "2020-11-16", "2020-11-16", "2020-11-16"))
ids <- c(12, 12, 13, 13, 14)
df <- data.frame(dates, ids)
duplicates <- df %>%
group_by(dates_floored = floor_date(dates, unit = "month"), ids) %>%
mutate(duplicate_count = n()) %>%
filter(duplicate_count > 1) %>%
distinct(ids, .keep_all = TRUE)
uniques <- df %>%
group_by(dates_floored = floor_date(dates, unit = "month"), ids) %>%
mutate(unique_count = n()) %>%
filter(unique_count < 2) %>%
distinct(ids, .keep_all = TRUE)
df_cleaned <- full_join(uniques, duplicates, by = c("ids", "dates", "dates_floored")) %>%
group_by(dates_floored) %>%
summarize(count_duplicates = sum(duplicate_count, na.rm = TRUE),
count_unique = sum(unique_count, na.rm = TRUE))
df_cleaned

Assign group id start from 0 and end with 1 in R

I have a dataset the following
DT <- data.drame(v1 = c(0,0,0,1,0,0,1))
I want to create a ID cumulatively stopped at a value of 1.
The ID should be
ID<-c(1,2,3,4,1,2,3)
If you are using dplyr, this will do the trick.
DT = data.frame(v1 = c(0,0,0,1,0,0,1))
DT %>%
dplyr::mutate(rno = row_number()) %>%
dplyr::mutate(group = ifelse(v1 == 0, NA, rno)) %>%
tidyr::fill(group, .direction = "up") %>%
dplyr::group_by(group) %>%
dplyr::mutate(ID = row_number()) %>%
dplyr::ungroup() %>%
dplyr::select(v1, ID)
In base R, we can use ave :
with(DT, ave(v1, c(0, cumsum(v1)[-length(v1)]), FUN = seq_along))
#[1] 1 2 3 4 1 2 3
In dplyr , we can use lag to create groups and assign row number in each group.
library(dplyr)
DT %>% group_by(gr = lag(cumsum(v1), default = 0)) %>% mutate(ID = row_number())
and we can use the same logic in data.table :
library(data.table)
setDT(DT)[, ID := seq_len(.N), shift(cumsum(v1), fill = 0)]

Summarizing and spreading data

I have data similar to below :
df=data.frame(
company=c("McD","McD","McD","KFC","KFC"),
Title=c("Crew Member","Manager","Trainer","Crew Member","Manager"),
Manhours=c(12,NA,5,13,10)
)
df
I would wish to manipulate it and obtain the data frame as below:
df=data.frame(
company=c("KFC", "McD"),
Manager=c(1,1),
Surbodinate=c(1,2),
TotalEmp=c(2,3),
TotalHours=c(23,17)
)
I have managed to manipulate and categorise the employees as well as their count as below:
df<- df %>%
mutate(Role = if_else((Title=="Manager" ),
"Manager","Surbodinate"))%>%
count(company, Role) %>%
spread(Role, n, fill=0)%>%
as.data.frame() %>%
mutate(TotalEmp= select(., Manager:Surbodinate) %>%
apply(1, sum, na.rm=TRUE))
Also, I have summarised the man hours as below:
df <- df %>%group_by(company) %>%
summarize(TotalHours = sum(Manhours, na.rm = TRUE))
How would I combine these two steps at once or is there a cleaner/simpler way of getting the desired output?
dplyr solution:
df %>%
mutate(Title = if_else((Title=="Manager" ),
"Manager","Surbodinate")) %>%
group_by(company) %>%
summarise(Manager = sum(Title == "Manager"), Subordinate = sum(Title == "Surbodinate"), TotalEmp = n(), Manhours = sum(Manhours, na.rm = TRUE))
company Manager Subordinate TotalEmp Manhours
<fct> <int> <int> <int> <dbl>
1 KFC 1 1 2 23
2 McD 1 2 3 17
how about something like this:
df %>%
mutate(Role = ifelse(Title=="Manager" ,
"Manager", "Surbodinate"))%>%
group_by(company) %>%
mutate(TotalEmp = n(),
TotalHours = sum(Manhours, na.rm=TRUE)) %>%
reshape2::dcast(company + TotalEmp + TotalHours ~ Role)
This is not tidyverse nor is it a one step process. But if you use data.table you could do:
library(data.table)
setDT(df, key = "company")
totals <- DT[, .(TotalEmp = .N, TotalHours = sum(Manhours, na.rm = TRUE)), by = company]
dcast(DT, company ~ ifelse(Title == "Manager", "Manager", "Surbodinate"))[totals]
# company Manager Surbodinate TotalEmp TotalHours
# 1 KFC 1 1 2 23
# 2 McD 1 2 3 17

Factors in many columns to boolean convert

My question is a little like this but the problem with the structure of data is different:
Sample data:
df <-data.frame(id = c(1,2,3), stock_1 = c("Google","Microsoft","Yahoo"), stock_2 = c("Yahoo","Google","NA"))
and I would like to convert to this:
df <-data.frame(id = c(1,2,3), Google = c(1,1,0), Microsoft = c(0,1,0), Yahoo= c(1,0,1))
I tried to use sapply() but from the answer to the linked question it is only for one column.
Here is a way to do it with data.table:
library(data.table)
setDT(df)
dcast(melt(df, id = 'id')[value != 'NA'],
id ~ value, fun.aggregate = length)
# id Google Microsoft Yahoo
# 1: 1 1 0 1
# 2: 2 1 1 0
# 3: 3 0 0 1
fill = 0 is unnecessary, and to tolerate duplicates, we can try:
dcast(melt(df, id = 'id')[value != 'NA'],
id ~ value, fun.aggregate = function(x){ 1 * (length(x) != 0)})
2017-01-01
As mentationed by Uwe, we can removed from NAs from the molten data by setting na.rm = TRUE if it is not hard coded as a string ("NA"), the commands finally looks this:
dcast(melt(df, id = 'id', na.rm = TRUE), id ~ value, fun.aggregate = length)
# or
dcast(melt(df, id = 'id', na.rm = TRUE),
id ~ value, fun.aggregate = function(x){ 1 * (length(x) != 0)})
We can also do this with tidyverse
library(tidyverse)
df %>%
gather(key, val, -id) %>%
filter(!is.na(val)) %>%
mutate(ind = 1) %>%
select(-key) %>%
spread(val, ind, fill = 0)
NOTE: It is better to use NA instead of "NA" as we can take care of NA with is.na or na.omit or complete.cases

Resources