Add more rows based on a grouping variable R - r

I'd like to add more rows to my dataset based on a grouping variable. Right now, my data has 2 rows but I would like 3 rows and the var app to be repeated for the third row.
This is what my data currently looks like:
my_data <- data.frame(app = c('a','b'), type = c('blue','red'), code = c(1:2), type_2 = c(NA, 'blue'), code_2 = c(NA, 3))
app type code type_2 code_2
a blue 1 NA NA
b red 2 blue 3
I would like the data to look like this:
app type code
a blue 1
b red 2
b blue 3

library(data.table)
setDT(my_data)
res <-
melt(
my_data,
id.vars = "app",
measure.vars = patterns(c("^type", "^code")),
value.name = c("type", "code")
)[!is.na(type), .(app, type, code)]

Using tidyverse
library(dplyr)
library(stringr)
library(tidyr)
my_data %>%
rename_at(vars(c(type, code)), ~ str_c(., "_1")) %>%
pivot_longer(cols = -app, names_to = c(".value", "grp"), names_sep = "_",
values_drop_na = TRUE) %>% select(-grp)
# A tibble: 3 x 3
# app type code
# <chr> <chr> <dbl>
#1 a blue 1
#2 b red 2
#3 b blue 3

Related

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine columns B and C, using mutate and unite, but how would I scale up this function so that I can reuse this for multiple columns (think 100+), instead of having to write out the variables each time? Or is there a function that's already built in to do this?
My current solution is this:
library(tidyverse)
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the columns, replacing each value equal to 'x' with the name of its column (cur_column())
library(dplyr)
library(tidyr)
Data %>%
mutate(across(B:C, ~ replace(., .== 'x', cur_column()))) %>%
unite(Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are a couple of options.
Using dplyr -
library(dplyr)
# Every column except the first (ID) contributes to the unified label.
cols <- names(Data)[-1]
# Row by row, keep the names of the columns that are not NA and join them
# with "_". Selecting with all_of(cols) instead of a hard-coded B:C range
# lets the same code work for any number of columns.
Data <- Data %>%
  rowwise() %>%
  mutate(Unified = paste0(cols[!is.na(c_across(all_of(cols)))], collapse = "_")) %>%
  ungroup()
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
Data$Unified <- apply(Data[cols], 1, function(x)
paste0(cols[!is.na(x)], collapse = '_'))

pivot_wider a dataframe with complex names R

so I have a dataframe that looks like this:
datInput <- tibble(id = 1:2,
c.0.opt = c("a,b", "c,d"),
c.0.optI = c("1,2", "3,4"),
c.0.sel = c("a", "c"),
c.1.opt = c("e,f", "g,h"),
c.1.optI = c("5,6", "7,8"),
c.1.sel = c("e", "g"))
datInput
# id c.0.opt c.0.optI c.0.sel c.1.opt c.1.optI c.1.sel
#1 1 a,b 1,2 a e,f 5,6 e
#2 2 c,d 3,4 c g,h 7,8 g
And I need it to look like this:
datOutput <- tibble(id = c(1,1,2,2),
c_opt = c("a,b", "e,f", "c,d", "g,h"),
c_optI = c("1,2", "5,6", "3,4", "7,8"),
c_sel = c("a", "e", "c", "g"))
# id c_opt c_optI c_sel
#1 1 a,b 1,2 a
#2 1 e,f 5,6 e
#3 2 c,d 3,4 c
#4 2 g,h 7,8 g
I usually use tidyr::pivot_longer for this kind of task, but I don't know how to do it with those complicated column names, where the row identifier is in the middle. Is there a way to do this?
Thanks
We can use pivot_longer as well with names_sep as regex lookaround to match the . in column names that succeeds a digit
library(dplyr)
library(tidyr)
library(stringr)
pivot_longer(datInput, cols = -id, names_to = c("grp", ".value"),
names_sep = "(?<=\\d)\\.") %>%
select(-grp) %>%
rename_with(~ str_c('c_', .), -id)
# A tibble: 4 x 4
# id c_opt c_optI c_sel
# <int> <chr> <chr> <chr>
#1 1 a,b 1,2 a
#2 1 e,f 5,6 e
#3 2 c,d 3,4 c
#4 2 g,h 7,8 g
datInput %>%
gather(colname, val,-1 ) %>%
mutate(colname = gsub("\\.\\d\\.","_",colname)) %>%
pivot_wider(id_cols = id, names_from = colname, values_from = val, values_fn = list) %>%
unnest(cols = c(colnames(.)))
# A tibble: 4 x 4
id c_opt c_optI c_sel
<int> <chr> <chr> <chr>
1 1 a,b 1,2 a
2 1 e,f 5,6 e
3 2 c,d 3,4 c
4 2 g,h 7,8 g
I modified Akrun's answer with comments from zimia like this:
datOutput <- datInput %>%
pivot_longer(-id, names_to = "colname", values_to = "val") %>%
mutate(colname = gsub("\\.\\d\\.","_",colname)) %>%
pivot_wider(id_cols = id, names_from = colname, values_from = val, values_fn = list) %>%
unnest(cols = c(colnames(.)))
It works perfectly. Thank you both.

How to set tibble column types programmatically

I have a tibble from reading a tall XLSX file using:
> file = readxl::read_xlsx(filename, "sheetname")
A toy example avoiding an actual XLSX file:
> file = tibble(
+ names = c("name1", "name2", "name3"),
+ values = c(TRUE, 1, "chr")
+ )
> file
# A tibble: 3 x 2
names values
<chr> <chr>
1 name1 TRUE
2 name2 1
3 name3 chr
I want to convert it into this:
# A tibble: 1 x 3
name1 name2 name3
<lgl> <dbl> <chr>
1 TRUE 1 chr
but because pivot_wider() determines that the values column is of type <chr>, pivot_wider() keeps that type for all widened columns.
> file %>% pivot_wider(names_from = names, values_from = values)
# A tibble: 1 x 3
name1 name2 name3
<chr> <chr> <chr>
1 TRUE 1 chr
This requires me to manually set each column type again. Is there another (automated) way? The format of this file is fixed, but the contents might change so I can't rely on hardcoded type setting. Ideally for me, we could do something like
readxl::read_xlsx(filename, "sheetname") %>%
pivot_wider(names_from = column1, values_from = column2, col_types = NULL)
If we wrap with type.convert, it would automatically change the type
library(dplyr)
library(tidyr)
file %>%
pivot_wider(names_from = names, values_from = values) %>%
type.convert(as.is = TRUE)
# A tibble: 1 x 3
# name1 name2 name3
# <lgl> <int> <chr>
#1 TRUE 1 chr
or use deframe/as_tibble_row, convert to tibble
library(tibble)
deframe(file) %>%
as_tibble_row %>%
type.convert(as.is = TRUE)
Another option is data.table::transpose
type.convert(data.table::transpose(file, make.names = 'names'), as.is = TRUE)
# name1 name2 name3
#1 TRUE 1 chr
data
file <- structure(list(names = c("name1", "name2", "name3"), values = c("TRUE",
"1", "chr")), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))

Splitting values in a column

sorry I'm new to R but I've got some data that looks like the following:
I'd like to count the number of times each object is mentioned in the findings. So the result would look like this:
I've tried tidyverse and separate but can't seem to get the hang of it, any help would be amazing, thanks in advance!
To recreate my data:
df <- data.frame(
col_1 = paste0("image", 1:5),
findings = c("rock|cat|sun", "cat", "cat|dog|fish|sun", "sun", "dog|cat")
)
You can use separate_rows() and then count().
library(tidyverse)
df %>%
separate_rows(findings) %>%
count(findings)
# # A tibble: 5 x 2
# findings n
# <chr> <int>
# 1 cat 4
# 2 dog 2
# 3 fish 1
# 4 rock 1
# 5 sun 3
Data
df <- structure(list(col_1 = c("image_1", "image_2", "image_3", "image_4",
"image_5"), findings = c("rock|cat|sun", "cat", "cat|dog|fish|sun",
"sun", "dog|cat")), class = "data.frame", row.names = c(NA, -5L))
In base R:
as.data.frame(table(unlist(strsplit(df$col_2, "|", fixed = TRUE))))
# Var1 Freq
# 1 cat 4
# 2 dog 2
# 3 fish 1
# 4 rock 1
# 5 sun 3
Reproducible data (please provide it in your next post):
df <- data.frame(
col_1 = paste0("image", 1:5),
col_2 = c("rock|cat|sun", "cat", "cat|dog|fish|sun", "sun", "dog|cat")
)
An option with cSplit
library(splitstackshape)
cSplit(df, 'col_2', 'long', sep="|")[, .N, col_2]
# col_2 N
#1: rock 1
#2: cat 4
#3: sun 3
#4: dog 2
#5: fish 1
data
df <- structure(list(col_1 = c("image1", "image2", "image3", "image4",
"image5"), col_2 = c("rock|cat|sun", "cat", "cat|dog|fish|sun",
"sun", "dog|cat")), class = "data.frame", row.names = c(NA, -5L
))
Using tidyverse:
df %>%
separate_rows(findings) %>%
group_by(findings) %>%
summarize(total_count_col=n())
First we convert the data into a long format using separate_rows, then group and count the number of rows with each finding.
Example:
df<-data.frame(col1=c(rep(letters[1:3],3),"d"),col2=c(rep("moose|cat|dog",9),"rock"), stringsAsFactors = FALSE)
df %>% separate_rows(col2) %>% group_by(col2) %>% summarize(total_count_col=n())
# A tibble: 4 x 2
col2 total_count_col
<chr> <int>
1 cat 9
2 dog 9
3 moose 9
4 rock 1

Grouping Over All Possible Combinations of Several Variables With dplyr

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dplyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convoluted, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
# For every combination of grouping column names in myGroups, group myData by
# those columns and compute the number of rows and the mean of var1 per group.
# across(all_of(x)) groups by the column names held in the character vector x;
# it replaces the deprecated group_by_() that was previously called via do.call.
results <- lapply(myGroups, function(x) {
  myData %>%
    group_by(across(all_of(x))) %>%
    summarise(n = length(var1),
              avgVar1 = mean(var1))
})
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
# Summarise `data` once for every non-empty combination of the grouping
# columns named in `variables`, then stack the per-combination results.
#
# data:      a data frame (or tibble).
# variables: character vector of grouping column names.
# summarise: character vector of summary expressions, e.g. "mean(var1)".
#            Each string is parsed and evaluated inside dplyr::summarise(),
#            so only pass strings you trust.
#
# Returns a data frame with the grouping columns first, followed by one
# column per summary expression. Grouping columns that were not part of a
# given combination are filled in by plyr::rbind.fill.
combSummarise <- function(data, variables=..., summarise=...){
  # Get all different combinations of selected variables (credit to #Michael)
  myGroups <- lapply(seq_along(variables), function(x) {
    combn(c(variables), x, simplify = FALSE)}) %>%
    unlist(recursive = FALSE)

  # Turn each summary string into an expression once, up front, instead of
  # pasting the whole pipeline into a string and eval(parse())-ing it.
  summaryExprs <- lapply(summarise, function(s) parse(text = s)[[1]])

  # Group by selected variables (credit to #konvas)
  df <- lapply(myGroups, function(x) {
    data %>%
      dplyr::group_by(dplyr::across(dplyr::all_of(x))) %>%
      dplyr::summarise(!!!summaryExprs)
  }) %>%
    do.call(plyr::rbind.fill, .)

  # The last combination contains every grouping variable.
  groupNames <- c(myGroups[[length(myGroups)]])
  newNames <- names(df)[!(names(df) %in% groupNames)]

  # Reorder by name. Selecting with a single character index keeps a data
  # frame even when there is only one grouping column and one summary column
  # (cbind() of two single columns would collapse to a matrix).
  df[c(groupNames, newNames)]
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
# Run summarise(...) on `data` for combinations of its existing grouping
# variables and return all results stacked in one tibble.
#
# data: a grouped data frame; its grouping variables are read with group_vars().
# ...:  summary expressions, forwarded unchanged to summarise().
#
# Returns the row-bound summaries with the grouping columns moved to the front.
summarise_combo <- function(data, ...) {
  # Grouping variable names as symbols, so they can be spliced with !!!.
  groupVars <- map(group_vars(data), as.name)

  # Ask combn() for combinations of every size from 0 up to the number of
  # grouping variables, then flatten one level into a single list of combos.
  comboSizes <- 0:length(groupVars)
  combosBySize <- map(comboSizes, function(k) combn(groupVars, k, simplify = FALSE))
  groupCombos <- unlist(combosBySize, recursive = FALSE)

  # Regroup the data by one combination and apply the caller's summaries.
  summariseBy <- function(combo) {
    regrouped <- group_by(data, !!!combo)
    summarise(regrouped, ...)
  }

  results <- bind_rows(map(groupCombos, summariseBy))
  select(results, !!!groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(c(1, 5)) = 3; avg 1_4 = mean(c(2, 6)) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6

Resources