dplyr count certain text - r

In my data frame I'm trying to count certain text '000', 'xxx' and not (000|xxx)
My dataframe is this:
Name per1 per2 per3
a1 000 xxx 230
a1 xxx 000 NA
a2 000 340 xxx
a3 000 xxx NA
Desired result count:
000 xxx Others
a1 2 2 1
a2 1 1 1
a3 1 1 0
Using dplyr: I tried but going wrong, please help on how to attain this
df %>% groupby(Name) %>% filter(grepl('000')) %>% summarize(000 = n())

An option is to convert data to long format and then use reshape2::dcast to get count as:
library(tidyverse)
library(reshape2)
df %>% gather(key, value, -Name) %>%
mutate(value = ifelse(is.na(value), "Others", value)) %>%
dcast(Name~value, fun.aggregate = length)
# Name 000 230 340 Others xxx
# 1 a1 2 1 0 1 2
# 2 a2 1 0 1 0 1
# 3 a3 1 0 0 1 1
OR: If OP is interested in having count for 000, xxx and Others categories then:
library(tidyverse)
library(reshape2)
df %>% gather(key, value, -Name) %>%
mutate(value =
ifelse(is.na(value) | !(value %in% c("000", "xxx")), "Others", value)) %>%
dcast(Name~value, fun.aggregate = length)
# Name 000 Others xxx
# 1 a1 2 2 2
# 2 a2 1 1 1
# 3 a3 1 1 1
Data:
df<-read.table(text="
Name per1 per2 per3
a1 000 xxx 230
a1 xxx 000 NA
a2 000 340 xxx
a3 000 xxx NA",
header=TRUE, stringsAsFactor = FALSE)

Here are a few tidyverse possibilities, all variations on the same idea:
library(tidyverse)
df %>%
nest(-Name) %>%
rowwise %>%
summarize(`000` = sum(data =='000',na.rm=T),
xxx = sum(data =='xxx',na.rm=T),
Others = sum(!is.na(data))-`000` - xxx)
df %>%
nest(-Name) %>%
group_by(Name) %>%
summarize(`000` = sum(data[[1]]=='000',na.rm=T),
xxx = sum(data[[1]]=='xxx',na.rm=T),
Others = sum(!is.na(data[[1]]))-`000` - xxx)
df %>%
group_by(Name) %>%
do(tibble(`000` = sum(.[-1]=='000',na.rm=T),
xxx = sum(.[-1]=='xxx',na.rm=T),
Others = sum(!is.na(.[-1]))-`000` - xxx)) %>%
ungroup
# # A tibble: 3 x 4
# Name `000` xxx Others
# <chr> <int> <int> <int>
# 1 a1 2 2 1
# 2 a2 1 1 1
# 3 a3 1 1 0
Note how rowwise and grouping by row work slightly differently.
Here's a base R translation too:
do.call(
rbind,
by(df,df$Name,function(x) data.frame(
Name = x$Name[1],
`000` = sum(x[-1]=='000',na.rm=T),
xxx = sum(x[-1]=='xxx',na.rm=T),
Others = sum(x[-1]!='000' & x[-1]!='xxx',na.rm=T))))
# Name X000 xxx Others
# a1 a1 2 2 1
# a2 a2 1 1 1
# a3 a3 1 1 0

If I understood it correctly, and the task is to count all xxx, 000, and !000&!xxx by Name we can also use base::table() to obtain desired output:
df <- data.frame(Name = c("a1", "a1", "a2", "a3"),
per1 = c("000", "xxx", "000", "000"),
per2 = c("xxx", "000", 340, "xxx"),
per3 = c(230, NA, "xxx", NA),
stringsAsFactors = F
)
Vals <- unlist(df[,-1]) # convert to the vector
Vals[!(Vals %in% c("000", "xxx")) & !is.na(Vals)] <- "Others" # !(xxx|000) <- Others
#
as.data.frame.matrix( # group by Name, count
table(rep(df$Name, ncol(df) - 1), Vals, useNA = "no") # don't count NAs
) # convert to data.frame
# 000 Others xxx
#a1 2 1 2
#a2 1 1 1
#a3 1 0 1

Related

Concatenate column names in one column conditional on using mutate, across and case_when

I would like to:
Use across and case_when to check if columns A1-A3 == 1
Concatenate the column names of the columns where A1-A3 == 1 and
mutate a new column with the concatenated column names
My dataframe:
df <- tribble(
~ID, ~A1, ~A2, ~A3,
1, 0, 1, 1,
2, 0, 1, 1,
3, 1, 1, 1,
4, 1, 0, 1,
5, 0, 1, 0)
Desired Output:
# A tibble: 5 x 5
ID A1 A2 A3 New_Col
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 1 1 A2 A3
2 2 0 1 1 A2 A3
3 3 1 1 1 A1 A2 A3
4 4 1 0 1 A1 A3
5 5 0 1 0 A2
So far I have tried:
df %>%
rowwise() %>%
mutate(New_Col = across(A1:A3, ~ case_when(. == 1 ~ paste0("colnames(.)", collapse = " "))))
Not working Output:
ID A1 A2 A3 New_Col$A1 $A2 $A3
<dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 1 0 1 1 NA colnames(.) colnames(.)
2 2 0 1 1 NA colnames(.) colnames(.)
3 3 1 1 1 colnames(.) colnames(.) colnames(.)
4 4 1 0 1 colnames(.) NA colnames(.)
5 5 0 1 0 NA colnames(.) NA
What I want to learn:
Is it possible to use across to check for conditions across multiple columns
If yes how looks the part after ~ of case_when to get the specific colnames
How can I get only one column after using mutate, across and case_when and not 3 like here.
I thought I already was able to master this task, but somehow I lost it...
To use across with case_when you can do -
library(dplyr)
library(tidyr)
df %>%
mutate(across(A1:A3, ~case_when(. == 1 ~ cur_column()), .names = 'new_{col}')) %>%
unite(New_Col, starts_with('new'), na.rm = TRUE, sep = ' ')
# ID A1 A2 A3 New_Col
# <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 1 1 A2 A3
#2 2 0 1 1 A2 A3
#3 3 1 1 1 A1 A2 A3
#4 4 1 0 1 A1 A3
#5 5 0 1 0 A2
across creates 3 new columns named new_A1, new_A2 and new_A3 with the column name if the value is 1 or NA otherwise. Using unite we combine the 3 columns into one New_col.
Also we can use rowwise with c_across -
df %>%
rowwise() %>%
mutate(New_Col = paste0(names(.[-1])[c_across(A1:A3) == 1], collapse = ' '))
without rowwise/ across you may also obtain same using cur_data()
df %>% group_by(ID) %>%
mutate(new_col = paste0(names(df[-1])[as.logical(cur_data())], collapse = ' '))
# A tibble: 5 x 5
# Groups: ID [5]
ID A1 A2 A3 new_col
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 1 1 A2 A3
2 2 0 1 1 A2 A3
3 3 1 1 1 A1 A2 A3
4 4 1 0 1 A1 A3
5 5 0 1 0 A2
a . instead of df inside mutate will also do
df %>% group_by(ID) %>%
mutate(new_col = paste0(names(.[-1])[as.logical(cur_data())], collapse = ' '))
Using base R
df$New_Col <- apply(df[-1], 1, \(x) paste(names(x)[as.logical(x)], collapse=' '))
df$New_Col
#[1] "A2 A3" "A2 A3" "A1 A2 A3" "A1 A3" "A2"
Or using tidyverse
library(dplyr)
library(purrr)
library(stringr)
df %>%
mutate(New_Col = across(A1:A3, ~ c('', cur_column())[. + 1] ) %>%
invoke(str_c, .))
One option involving also purrr could be:
df %>%
mutate(New_Col = pmap_chr(across(-ID),
~ paste(names(c(...))[which(c(...) == 1)], collapse = " ")))
ID A1 A2 A3 New_Col
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 1 1 A2 A3
2 2 0 1 1 A2 A3
3 3 1 1 1 A1 A2 A3
4 4 1 0 1 A1 A3
5 5 0 1 0 A2

Mutate multiple columns if grouping variable has all observations missing

I'm trying to mutate the columns "a" and "b" only if the grouping variable "group" has all observations missing. The attempted solution changes the group "blue", in which not all observations are missing. Thanks in advance for your valuable time!
Code below:
library(tidyverse)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
# a b c group
# <dbl> <dbl> <fct> <fct>
# 1 NA 1 a yellow
# 2 NA 1 b yellow
# 3 1 NA c black
# 4 1 NA d black
# 5 NA 1 e blue
# 6 1 NA f blue
# failed attempt: observations from group "blue" change
(data %>%
dplyr::group_by(group) %>%
dplyr::mutate(across(1:2, ~ ifelse(all(is.na(.x)), 99999,.x))))
# a b c group
# <dbl> <dbl> <fct> <fct>
# 1 99999 1 a yellow
# 2 99999 1 b yellow
# 3 1 99999 c black
# 4 1 99999 d black
# 5 NA 1 e blue
# 6 NA 1 f blue
# desired output - observations from blue remain the same
a2 <- c(99999,99999,1,1,NA,1)
b2 <- c(1,1,99999,99999,1,NA)
c2 <- letters[1:6]
group2 <- c("yellow","yellow","black","black", "blue", "blue")
(data_desired <- as_tibble(data.frame(a2,b2,c2,group2)))
# a2 b2 c2 group2
# <dbl> <dbl> <fct> <fct>
# 1 99999 1 a yellow
# 2 99999 1 b yellow
# 3 1 99999 c black
# 4 1 99999 d black
# 5 NA 1 e blue
# 6 1 NA f blue
You could try this:
library(tidyverse)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
(data %>%
dplyr::group_by(group) %>%
dplyr::mutate(across(1:2, ~ ifelse(is.na(.x), 99999,.x))))
# A tibble: 6 x 4
# Groups: group [3]
a b c group
<dbl> <dbl> <fct> <fct>
1 99999 1 a yellow
2 99999 1 b yellow
3 1 99999 c black
4 1 99999 d black
5 99999 1 e blue
6 1 99999 f blue
Not the best solution but you can deal with it...
data <- data %>%
group_by(group) %>%
mutate(new = paste0(a, "_", b),
new1 = if_else(new == lag(new), str_replace(new, "NA", "99999"), new),
new2 = if_else(new == lead(new), str_replace(new, "NA", "99999"), new)
) %>%
separate(col = new1, into = c("a_new1", "b_new1"), sep = "_", extra = "drop") %>%
separate(col = new2, into = c("a_new2", "b_new2"), sep = "_", extra = "drop") %>%
mutate(a2 = if_else(is.na(a_new1), replace_na(a_new2), a_new1),
b2 = if_else(is.na(b_new1), replace_na(b_new2), b_new1)
) %>%
select(a, b, c, group, a2, b2) %>%
type_convert()
data
# A tibble: 6 x 6
# Groups: group [3]
a b c group a2 b2
<dbl> <dbl> <fct> <fct> <dbl> <dbl>
1 NA 1 a yellow 99999 1
2 NA 1 b yellow 99999 1
3 1 NA c black 1 99999
4 1 NA d black 1 99999
5 NA 1 e blue NA 1
6 1 NA f blue 1 NA
Thanks all for the input!
Finally, this is how I resolved this with lists and purrr.
library(tidyverse)
library(purrr)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
# list with groups in which all cases are NA
list1 <- data %>%
split(.,.$group) %>%
map(~select(.x,as.vector(which(colSums(is.na(.)) == nrow(.))))) %>%
map(~mutate_all(.x, replace_na, 99999))
# list with groups in which there is at least one valid observation
list2 <- data %>%
split(.,.$group) %>%
map(~select(.x, as.vector(which(colSums(is.na(.)) != nrow(.)))))
# putting the groups together into a dataframe
list3 <- mapply(cbind, list1, list2, SIMPLIFY=FALSE)
(desired_output <- do.call(rbind.data.frame, list3))

Group by, summarize, spread in R not working

I have a data frame that looks like the following:
ID Code Desc
1 0A Red
1 NA Red
2 1A Blue
3 2B Green
I want to first create a new column where I concatenate the values in the Code column where the IDs are the same. So:
ID Combined_Code Desc
1 0A | NA Red
2 1A Blue
3 2B Green
Then I want to take the original Code column and spread it. The values in this case would be a count of how many times each Code shows up for a given ID. So:
ID Combined_Code 0A NA 1A 2B Desc
1 0A | NA 1 1 0 0 Red
2 1A 0 0 1 0 Blue
3 2B 0 0 0 1 Green
I've tried:
sample_data %>%
group_by(ID) %>%
summarise(Combined_Code = paste(unique(Combined_Code), collapse ='|'))
This works for creating the concatenation. However, I can't get this to work in tandem with spread:
sample_data %>%
group_by(ID) %>%
summarise(Combined_Code = paste(unique(Combined_Code), collapse ='|'))
sample_data <- spread(count(sample_data, ID, Combined_Code, Desc., Code), Code, n, fill = 0)
Doing this spreads, but drops the concatenation. I've also tried this with filter instead of summarise, which gives the same result. This results in:
ID Combined_Code 0A NA 1A 2B Desc
1 0A 1 0 0 0 Red
1 NA 0 1 0 0 Red
2 1A 0 0 1 0 Blue
3 2B 0 0 0 1 Green
Finally, I've tried piping spread through the summarise function:
sample_data %>%
group_by(ID) %>%
summarise(Combined_Code = paste(unique(Combined_Code), collapse ='|')) %>%
spread(count(sample_data, ID, Combined_Code, Desc., Code), Code, n, fill = 0)
This results in the error:
Error: `var` must evaluate to a single number or a column name, not a list
Run `rlang::last_error()` to see where the error occurred.
What can I do to solve these problems?
We can do a group by paste
library(dplyr)
library(stringr)
df1 %>%
group_by(ID, Desc) %>%
summarise(Combined_Code = str_c(Code, collapse="|"))
# A tibble: 3 x 3
# Groups: ID [3]
# ID Desc Combined_Code
# <int> <chr> <chr>
#1 1 Red 0A|0B
#2 2 Blue 1A
#3 3 Green 2B
For the second case, after creating a 'val' column of 1s, paste the 'Code' elements afte grouping by 'ID', 'Desc', then use pivot_wider from tidyr to reshape from 'long' to 'wide format.
library(tidyr)
df1 %>%
mutate(val = 1) %>%
group_by(ID, Desc) %>%
mutate(Combined_Code = str_c(Code, collapse="|")) %>%
pivot_wider(names_from = Code, values_from = val, values_fill = list(val = 0))
# A tibble: 3 x 7
# Groups: ID, Desc [3]
# ID Desc Combined_Code `0A` `0B` `1A` `2B`
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 1 Red 0A|0B 1 1 0 0
#2 2 Blue 1A 0 0 1 0
#3 3 Green 2B 0 0 0 1
The OP's expected output is
ID Combined_Code 0A 0B 1A 2B Desc
1 0A | 0B 1 1 0 0 Red
2 1A 0 0 1 0 Blue
3 2B 0 0 0 1 Green
Update
For the updated dataset, there are NA elements in the 'Code', and by default str_c returns NA if there any NA as one of the elements, while paste still returns the NA along with the other elements. Here, we replace the str_c with paste
df2 %>%
mutate(val = 1) %>%
group_by(ID, Desc) %>%
mutate(Combined_Code = paste(Code, collapse="|")) %>%
pivot_wider(names_from = Code, values_from = val, values_fill = list(val = 0))
# A tibble: 3 x 7
# Groups: ID, Desc [3]
# ID Desc Combined_Code `0A` `NA` `1A` `2B`
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 1 Red 0A|NA 1 1 0 0
#2 2 Blue 1A 0 0 1 0
#3 3 Green 2B 0 0 0 1
data
df1 <- structure(list(ID = c(1L, 1L, 2L, 3L), Code = c("0A", "0B", "1A",
"2B"), Desc = c("Red", "Red", "Blue", "Green")),
class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(ID = c(1L, 1L, 2L, 3L), Code = c("0A", NA, "1A",
"2B"), Desc = c("Red", "Red", "Blue", "Green")), class = "data.frame",
row.names = c(NA,
-4L))

Pairwise count data from long format

Example data
I have the following data:
df <- data.frame(
id = c('X1','X1','X1','X1','X2','X2','X2','X2'),
pos = c(1,2,3,4,1,2,3,4),
group = c(100,200,100,300,100,200,100,200)
)
Which thus looks like:
id pos group
1 X1 1 100
2 X1 2 200
3 X1 3 100
4 X1 4 300
5 X2 1 100
6 X2 2 200
7 X2 3 100
8 X2 4 200
What I try to achieve
I want to plot this data using geom_segment(), where pos will be on the x-xis, and group on the y-axis. Then for each of these segments I want to count how often they are present in the dataset (based on the id column). When doing this for the example dataset the result would be:
pos1 pos2 group1 group2 id.count
1 2 100 200 2
2 3 200 100 2
3 4 100 300 1
3 4 100 200 1
I have no clue how to start with this, while I'm familiar with group_by from dplyr I can not figure out how to build the initial four columns.
If the ordering in your data set is as in your example you can try this:
library(dplyr)
df %>% group_by(id) %>%
transmute(pos1 = pos, pos2 = lead(pos),
group1 = group, group2 = lead(group)) %>%
na.omit() %>% ungroup()%>%
count(pos1, pos2, group1, group2, name = "id.count")
# A tibble: 4 x 5
# pos1 pos2 group1 group2 id.count
# <dbl> <dbl> <dbl> <dbl> <int>
# 1 2 100 200 2
# 2 3 200 100 2
# 3 4 100 200 1
# 3 4 100 300 1
I tried the following that works, but wonder if there is a more elegant solution for this:
# Simple stats
vals <- unique(df$pos)
min.val = min(vals)
max.val = max(vals)
# Combination
comb.df <- data.frame(
pos1 = min.val:(max.val - 1),
pos2 = (min.val + 1): max.val
)
# Combine
comb.df <- comb.df %>%
left_join(df %>% select(pos1 = pos, group1 = group, id )) %>%
left_join(df %>% select(pos2 = pos, group2 = group, id ))
# Count
comb.df <- comb.df %>%
group_by(pos1, pos2, group1, group2) %>%
summarise(n.ids = n_distinct(id))

Mutating based on multiple columns in a data frame

Ok so my dataframe looks like this let's call if df
KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4
1 120 100 NA 110 1 1 NA 1 NA NA NA NA
2 100 NA 115 NA NA NA NA NA Y N Y N
So what I'm trying to do is make it so that when an A columns has a value of 100 and the corresponding B or C column has a value of 1 or "Y" respectively that makes a new column with a X with a value of 1. In Row 1 that would be A2 and B2 and in row that would be A1 and C1.
I tried doing three sets of gather and then using the mutate function using case_when. like so
df<- df %>%
gather(key="A",value="code",dx)%>%
gather(key="B",value="number",dxadm)%>%
gather(key="C",value="character",dxpoa) %>%
mutate(X=case_when(
code == 100 & present >0 ~ 1,
code ==100 & character == "Y"~1)
)
Except my spread function of these rows came back with rows all array and my X out of place.
Alternatively, I considered something like
df <- df %>%
mutate(X=case_when(
A1 == 100 & B1 >0 ~ 1,
A1 ==100 & C1 == "Y"~1,
A2 == 100 & B2 >0 ~ 1,
A2 ==100 & C2 == "Y"~1,)
and so on for all permutations. The two problems with this are that I have a lot of columns and I'd like to this for multiple different values of A.
Can anyone recommend an alternative or at least a way to make the second solution into something that would only require one annoying long piece of code that I could make into a more generalizable function? Thanks!
A suggestion
require(read.so) #awesome package to read from Stackoverflow,
# available on GitHub [https://alistaire47.github.io/read.so/][1]
require(tidyr)
require(reshape2)
require(dplyr)
dat <- read.so()
dat %>% gather(var, value, 2:13) %>% #make it long
mutate(var = gsub('([A-Z])', '\\1_', .[['var']])) %>% #add underscore
separate(var, c('var', 'number') ) %>% #separate your column
dcast(KEY+number ~ var) %>% #dcast is a bit complex but quite powerful
group_by(KEY) %>%
filter(A == 100)
# A tibble: 2 x 5
# Groups: KEY [2]
KEY number A B C
<int> <chr> <chr> <chr> <chr>
1 1 2 100 1 <NA>
2 2 1 100 <NA> Y
A solution using dplyr and tidyr. We can gather all the columns except KEY, separate the letters and numbers, and then spread the letter so that we can create the X column without specifying the numbers. Notice that I assume if the condition is not met, X would be 0, and based on your description, I used any(A %in% 100 & (B %in% 1 | C %in% "Y")) to test the condition as any given numbers met the condition, X would be 1.
library(dplyr)
library(tidyr)
df2 <- df %>%
gather(Column, Value, -KEY) %>%
separate(Column, into = c("Letter", "Number"), sep = 1) %>%
spread(Letter, Value, convert = TRUE) %>%
group_by(KEY) %>%
mutate(X = ifelse(any(A %in% 100 & (B %in% 1 | C %in% "Y")), 1L, 0L))
df2 %>% as.data.frame()
# KEY Number A B C X
# 1 1 1 120 1 <NA> 1
# 2 1 2 100 1 <NA> 1
# 3 1 3 NA NA <NA> 1
# 4 1 4 110 1 <NA> 1
# 5 2 1 100 NA Y 1
# 6 2 2 NA NA N 1
# 7 2 3 115 NA Y 1
# 8 2 4 NA NA N 1
I think the structure of df2 is good, but if you really want the original structure, we can do the following.
df3 <- df2 %>%
gather(Letter, Value, A:C) %>%
unite(Column, Letter, Number, sep = "") %>%
spread(Column, Value) %>%
select(names(df), X)
df3 %>% as.data.frame()
# KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4 X
# 1 1 120 100 <NA> 110 1 1 <NA> 1 <NA> <NA> <NA> <NA> 1
# 2 2 100 <NA> 115 <NA> <NA> <NA> <NA> <NA> Y N Y N 1
df3 is the final output.
DATA
df <- read.table(text = "KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4
1 120 100 NA 110 1 1 NA 1 NA NA NA NA
2 100 NA 115 NA NA NA NA NA Y N Y N",
header = TRUE, stringsAsFactors = FALSE)
Same idea as Tjebo, but sticking to the tidyverse....
library(tidyverse)
dat <- data.frame(stringsAsFactors=FALSE,
KEY = c(1L, 2L),
A1 = c(120L, 100L),
A2 = c(100L, NA),
A3 = c(NA, 115L),
A4 = c(110L, NA),
B1 = c(1L, NA),
B2 = c(1L, NA),
B3 = c(NA, NA),
B4 = c(1L, NA),
C1 = c(NA, "Y"),
C2 = c(NA, "N"),
C3 = c(NA, "Y"),
C4 = c(NA, "N"))
dat %>%
gather(var, value, -KEY) %>% #make it long
extract(var, regex = "(.)(.)", into = c("var", "number") ) %>%
spread(var, value) %>%
filter( A %in% 100 )
#> KEY number A B C
#> 1 1 2 100 1 <NA>
#> 2 2 1 100 <NA> Y
Created on 2018-02-27 by the reprex package (v0.2.0).

Resources