Split multiple columns with delimiter and have consistent column names in r - r

I have a large data set and here is a sample (the raw data has more columns):
dta0 = data.frame(cbind(paste(seq(10,15),seq(20,25),sep = ";"),
paste(seq(30,35),seq(40,45),sep = ";") ) )
colnames(dta0) = c("H1","H2")
Here is my desired output
desired_dta = data.frame(cbind(seq(10,15),seq(20,25),seq(30,35),seq(40,45)))
colnames(desired_dta) = c("H1_x","H1_y","H2_x","H2_y")
How can I name columns like "H1_x", "H1_y","H2_x", "H2_y" ....?

You could try
library(tidyr)
dta0 %>%
separate(H1, c("H1_x", "H1_y"), ";") %>%
separate(H2, c("H2_x", "H2_y"), ";")
#> H1_x H1_y H2_x H2_y
#> 1 10 20 30 40
#> 2 11 21 31 41
#> 3 12 22 32 42
#> 4 13 23 33 43
#> 5 14 24 34 44
#> 6 15 25 35 45
Or in base R
# Split every column of dta0 on ";" and bind the pieces side by side,
# naming them <column>_x and <column>_y.
# as.character() keeps strsplit() working when the columns are factors
# (the data.frame() default before R 4.0); type.convert() turns the
# character pieces back into numbers so the result matches desired_dta.
setNames(
  type.convert(
    as.data.frame(
      do.call(cbind, lapply(
        dta0,
        function(x) do.call(rbind, strsplit(as.character(x), ";"))
      )),
      stringsAsFactors = FALSE
    ),
    as.is = TRUE
  ),
  unlist(lapply(names(dta0), paste0, c("_x", "_y")))
)
#> H1_x H1_y H2_x H2_y
#> 1 10 20 30 40
#> 2 11 21 31 41
#> 3 12 22 32 42
#> 4 13 23 33 43
#> 5 14 24 34 44
#> 6 15 25 35 45

Here is an option with cSplit
library(splitstackshape)
cSplit(dta0, names(dta0), sep=";")
# H1_1 H1_2 H2_1 H2_2
#1: 10 20 30 40
#2: 11 21 31 41
#3: 12 22 32 42
#4: 13 23 33 43
#5: 14 24 34 44
#6: 15 25 35 45

Another option is to use separate_rows(), then reshape to long and after that reshape to wide. Here is the code:
library(tidyverse)
# Code
dta0 %>%
  # Row id, so the split pieces can be brought back onto their source row.
  mutate(id = row_number()) %>%
  # One row per ";"-separated piece, splitting H1 and H2 in parallel.
  separate_rows(c(H1, H2), sep = ';') %>%
  # Var is the position of the piece within its source row (1 or 2).
  group_by(id) %>%
  mutate(Var = row_number()) %>%
  pivot_longer(-c(id, Var)) %>%
  mutate(Var = ifelse(Var == 1, 'x', 'y'),
         name = paste0(name, '.', Var)) %>%
  select(-c(Var)) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  ungroup() %>%
  select(-id) %>%
  # Order the columns alphabetically; replaces the deprecated current_vars().
  select(all_of(sort(names(.)))) %>%
  # Convert every column to numeric; replaces the deprecated mutate_each().
  mutate(across(everything(), as.numeric))
Output:
# A tibble: 6 x 4
H1.x H1.y H2.x H2.y
<dbl> <dbl> <dbl> <dbl>
1 10 20 30 40
2 11 21 31 41
3 12 22 32 42
4 13 23 33 43
5 14 24 34 44
6 15 25 35 45

Assuming your data has an even number of columns:
nc = ncol(desired_dta)
colnames(desired_dta) = paste0("H",rep(1:(nc/2),each = 2),rep(c("_x","_y"),nc/2))

Related

How to filter DataFrame columns by value keeping original column order?

I'm trying to filter a DataFrame, keeping only columns containing "_time" or "___" in their column names.
I tried using df %>% select(contains(c("_time", "___"))). However, this changes the order of the columns in the output: all columns with "_time" are displayed first and the columns with "___" are displayed last.
How can filtering be done without changing the column order?
We can use matches
library(dplyr)
df %>%
select(matches("_time|___"))
-output
h_time l_time f___d m_time s___hello
1 11 16 21 26 31
2 12 17 22 27 32
3 13 18 23 28 33
4 14 19 24 29 34
5 15 20 25 30 35
compared to
df %>%
select(contains(c("_time", "___")))
h_time l_time m_time f___d s___hello
1 11 16 26 21 31
2 12 17 27 22 32
3 13 18 28 23 33
4 14 19 29 24 34
5 15 20 30 25 35
data
df <- data.frame(col1 = 1:5, col2 = 6:10, h_time = 11:15,
l_time = 16:20, f___d = 21:25, m_time = 26:30,
col_new = 41:45, s___hello = 31:35)
Base R: Data from @akrun (many thanks)
df[,grepl("_time|___", colnames(df))]
h_time l_time f___d m_time s___hello
1 11 16 21 26 31
2 12 17 22 27 32
3 13 18 23 28 33
4 14 19 24 29 34
5 15 20 25 30 35

Data manipulation: gather or spread or both?

I am trying to change my data frame so I can look at it with some different plots. Essentially I want to compare different models. This is what I have:
variable = c('A','B','C','A','B','C')
optimal = c(10,20,30,40,80,100)
control = c(15,15,15,15,15,15)
method_1 = c(11,22,28,44,85,95)
method_2 = c(9, 19,31,39,79,102)
df = data.frame(variable, optimal, control, method_1, method_2)
df
and so it looks like this:
variable optimal control method_1 method_2
1 A 10 15 11 9
2 B 20 15 22 19
3 C 30 15 28 31
4 A 40 15 44 39
5 B 80 15 85 79
6 C 100 15 95 102
And I need something that looks like this:
variable A B C
1 optimal 10 20 30
2 optimal 40 80 100
3 control 15 15 15
4 control 15 15 15
5 method_1 11 22 28
6 method_1 44 85 95
7 method_2 9 19 31
8 method_2 39 79 102
I've tried gather, spread and transpose, but nothing worked. Any thoughts? It feels like this should be an easy fix, but I could not get my head around it. Thanks in advance.
You have to go long first and then wide, i.e.
library(dplyr)
library(tidyr)
df %>%
  # Go long first: one row per (variable, column name, value).
  pivot_longer(-1) %>%
  # Each name/variable pair occurs twice, so the A/B/C cells become
  # list-columns holding both values.
  pivot_wider(names_from = variable, values_from = value) %>%
  # Expand the list-columns into rows; `cols` is required since tidyr 1.0.0.
  unnest(cols = c(A, B, C))
name A B C
<chr> <dbl> <dbl> <dbl>
1 optimal 10 20 30
2 optimal 40 80 100
3 control 15 15 15
4 control 15 15 15
5 method_1 11 22 28
6 method_1 44 85 95
7 method_2 9 19 31
8 method_2 39 79 102
I think you need both. Also note that gather and spread have been retired and replaced by pivot_longer and pivot_wider.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -variable) %>%
group_by(variable) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = variable, values_from = value) %>%
select(-row)
# name A B C
# <chr> <dbl> <dbl> <dbl>
#1 optimal 10 20 30
#2 control 15 15 15
#3 method_1 11 22 28
#4 method_2 9 19 31
#5 optimal 40 80 100
#6 control 15 15 15
#7 method_1 44 85 95
#8 method_2 39 79 102

Create multiple new columns in tibble in R based on value of previous row giving prefix to all

I have a tibble as so:
df <- tibble(a = seq(1:10),
b = seq(21,30),
c = seq(31,40))
I want to create a new tibble in which some of the columns are lagged. The new columns should be named prev_ followed by the name of the lagged column, e.g. prev_a.
In my actual data there are a lot of columns, so I don't want to write this out manually. Additionally, I only want to do it for some columns. In this example I have done it manually, but I would like to know if there is a way to use a function to do it.
# Lag the selected columns by one row; each new column is named
# prev_<source column>.
df_new <- df %>%
  mutate(prev_a = lag(a),
         prev_b = lag(b),
         prev_c = lag(c))  # was lag(d): df only has columns a, b and c
Thanks for your help!
With the current dplyr version you can create new variable names with mutate_at, using a named list will take the name of the list as suffix. If you want it as a prefix as in your example you can use rename_at to correct the variable naming. With your real data, you need to adjust the vars() selection. For your example data matches("[a-c]") did work.
library(dplyr)
df <- tibble(a = seq(1:10),
b = seq(21,30),
c = seq(31,40))
df %>%
mutate_at(vars(matches("[a-c]")), list(prev = ~ lag(.x)))
#> # A tibble: 10 x 6
#> a b c a_prev b_prev c_prev
#> <int> <int> <int> <int> <int> <int>
#> 1 1 21 31 NA NA NA
#> 2 2 22 32 1 21 31
#> 3 3 23 33 2 22 32
#> 4 4 24 34 3 23 33
#> 5 5 25 35 4 24 34
#> 6 6 26 36 5 25 35
#> 7 7 27 37 6 26 36
#> 8 8 28 38 7 27 37
#> 9 9 29 39 8 28 38
#> 10 10 30 40 9 29 39
df %>%
mutate_at(vars(matches("[a-c]")), list(prev = ~ lag(.x))) %>%
rename_at(vars(contains( "_prev") ), list( ~paste("prev", gsub("_prev", "", .), sep = "_")))
#> # A tibble: 10 x 6
#> a b c prev_a prev_b prev_c
#> <int> <int> <int> <int> <int> <int>
#> 1 1 21 31 NA NA NA
#> 2 2 22 32 1 21 31
#> 3 3 23 33 2 22 32
#> 4 4 24 34 3 23 33
#> 5 5 25 35 4 24 34
#> 6 6 26 36 5 25 35
#> 7 7 27 37 6 26 36
#> 8 8 28 38 7 27 37
#> 9 9 29 39 8 28 38
#> 10 10 30 40 9 29 39
Created on 2020-04-29 by the reprex package (v0.3.0)
You could do it this way
df_new <- bind_cols(
df,
df %>% mutate_at(.vars = vars("a","b","c"), function(x) lag(x))
)
The names are a bit nasty, but you can rename them (check here). Or see @Bas's comment to get the names with a suffix.
# A tibble: 10 x 6
a b c a1 b1 c1
<int> <int> <int> <int> <int> <int>
1 1 21 31 NA NA NA
2 2 22 32 1 21 31
3 3 23 33 2 22 32
4 4 24 34 3 23 33
5 5 25 35 4 24 34
6 6 26 36 5 25 35
7 7 27 37 6 26 36
8 8 28 38 7 27 37
9 9 29 39 8 28 38
10 10 30 40 9 29 39
If you have dplyr 1.0 you can use the new across() function.
See some examples from the docs; instead of mean you want lag:
df %>% mutate_if(is.numeric, mean, na.rm = TRUE)
# ->
df %>% mutate(across(is.numeric, mean, na.rm = TRUE))
df %>% mutate_at(vars(x, starts_with("y")), mean, na.rm = TRUE)
# ->
df %>% mutate(across(c(x, starts_with("y")), mean, na.rm = TRUE))
df %>% mutate_all(mean, na.rm = TRUE)
# ->
df %>% mutate(across(everything(), mean, na.rm = TRUE))

How to add a column after the last column containing a certain string?

I have a dataset like this
df <- data.frame(abc_1 = 1:10, abc_2 = 11:20, abc_3 = 21:30, somevar = 31:40)
head(df)
abc_1 abc_2 abc_3 somevar
1 1 11 21 31
2 2 12 22 32
3 3 13 23 33
4 4 14 24 34
5 5 15 25 35
6 6 16 26 36
I would like to insert a new column after abc_3 (in my case the row sums of abc_1, abc_2, abc_3). Since (a) the dataset is huge, (b) I might decide to manipulate the dataset before adding the column (i.e. messing up the column indices), and since (c) I would like to do this for many variables that contain some different string, I am looking to use a way of doing this without referencing the column index but rather by matching the string abc.
I have found add_column in the tibble package but it only allows adding the column by its index like so
library(tibble)
add_column(df, abc_sum = rowSum(abc_1, abc_2, abc_3), .after = 3)
What I want is something like this:
abc_1 abc_2 abc_3 abc_sum somevar
1 1 11 21 33 31
2 2 12 22 36 32
3 3 13 23 39 33
4 4 14 24 42 34
5 5 15 25 45 35
6 6 16 26 48 36
I am looking to replace the .after = 3 with an expression that inserts it after abc_3 by matching the string abc_3.
You can do:
add_column(df,
abc_sum = rowSums(df[startsWith(names(df), "abc")]),
.after = "abc_3")
abc_1 abc_2 abc_3 abc_sum somevar
1 1 11 21 33 31
2 2 12 22 36 32
3 3 13 23 39 33
4 4 14 24 42 34
5 5 15 25 45 35
6 6 16 26 48 36
7 7 17 27 51 37
8 8 18 28 54 38
9 9 19 29 57 39
10 10 20 30 60 40
We can use reduce
library(dplyr)
library(purrr)
df %>%
mutate(abc_sum = select(., starts_with('abc')) %>%
reduce(`+`)) %>%
select(starts_with('abc'), everything())
# abc_1 abc_2 abc_3 abc_sum somevar
#1 1 11 21 33 31
#2 2 12 22 36 32
#3 3 13 23 39 33
#4 4 14 24 42 34
#5 5 15 25 45 35
#6 6 16 26 48 36
#7 7 17 27 51 37
#8 8 18 28 54 38
#9 9 19 29 57 39
#10 10 20 30 60 40

Group_by / summarize by two variables within a function

I would like to write a function that summarize the provided data by some specified criteria, in this case by age
The example data is a table of users' age and their stats.
df <- data.frame('Age'=rep(18:25,2), 'X1'=10:17, 'X2'=28:35,'X4'=22:29)
Next I define the output columns that are relevant for the analysis
output_columns <- c('Age', 'X1', 'X2', 'X3')
This function computes the sum of X1, X2 and X3 grouped by age.
# Sum the numeric columns of `data` per group.
#
# data:           data frame to summarise.
# criteria:       name(s) of the grouping column(s), as character.
# output_columns: names of the columns to keep before summarising;
#                 names that are not in `data` are ignored.
# Returns one row per group with the sum of each kept numeric column.
aggr <- function(data, criteria, output_columns) {
  # Keep only the requested columns that actually exist in `data`.
  keep <- colnames(data) %in% output_columns
  selected <- data[, keep]
  grouped <- group_by_(selected, .dots = criteria)
  # Earlier attempt at a per-group count, left disabled:
  # summarise_each(funs(count), age)
  summarize_if(grouped, is.numeric, sum)
}
When I call it like this
> e <- aggr(df, "Age", output_columns)
> e
# A tibble: 8 x 3
Age X1 X2
<int> <int> <int>
1 18 20 56
2 19 22 58
3 20 24 60
4 21 26 62
5 22 28 64
6 23 30 66
7 24 32 68
8 25 34 70
I want to have another column called count which shows the number of observations in each age group. Desired output is
> desired
Age X1 X2 count
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2
I have tried different ways to do that, e.g. tally(), summarize_each
etc. They all deliver wrong results.
I believe there should be an easy and simple way to do that.
Any help is appreciated.
Since you're already summing all variables, you can just add a column of all 1s before the summary function
# Sum the numeric columns of `data` per group and add a row count `n`.
#
# data:           data frame to summarise.
# criteria:       name(s) of the grouping column(s), as character.
# output_columns: names of the columns to keep before summarising;
#                 names that are not in `data` are ignored.
# Returns one row per group with the column sums and `n`.
aggr <- function(data, criteria, output_columns) {
  # Keep only the requested columns that actually exist in `data`.
  wanted <- data[, colnames(data) %in% output_columns]
  by_criteria <- group_by_(wanted, .dots = criteria)
  # A constant 1 per row sums to the number of rows in each group.
  with_ones <- mutate(by_criteria, n = 1L)
  summarize_if(with_ones, is.numeric, sum)
}
# A tibble: 8 x 4
Age X1 X2 n
<int> <int> <int> <int>
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2
We could create the 'count' column before summarise_if
# Sum the numeric columns of `data` per group and append the group size
# as a trailing `count` column.
#
# data:           data frame to summarise.
# criteria:       name(s) of the grouping column(s), as character.
# output_columns: names of the columns to keep before summarising;
#                 names that are not in `data` are ignored.
aggr <- function(data, criteria, output_columns) {
  kept <- select(data, intersect(names(data), output_columns))
  by_criteria <- group_by_at(kept, criteria)
  # The group size is added as an extra grouping key before summing.
  by_count <- group_by(by_criteria, count = n(), add = TRUE)
  totals <- summarize_if(by_count, is.numeric, sum)
  # Move `count` to the last position.
  select(totals, setdiff(names(totals), 'count'), count)
}
aggr(df,"Age",output_columns)
# A tibble: 8 x 4
# Groups: Age [8]
# Age X1 X2 count
# <int> <int> <int> <int>
#1 18 20 56 2
#2 19 22 58 2
#3 20 24 60 2
#4 21 26 62 2
#5 22 28 64 2
#6 23 30 66 2
#7 24 32 68 2
#8 25 34 70 2
In base R you could do
# Sum the selected columns per group and append the group size.
#
# data:           data frame to aggregate.
# criteria:       vector of grouping values, one per row of `data`
#                 (e.g. data$Age); passed to aggregate() as the single
#                 `by` variable.
# output_columns: names of the columns to keep; names that are not in
#                 `data` are ignored. The first kept column is expected
#                 to be the grouping column itself.
# Returns a data frame with one row per group: the group value (under the
# first kept column's name), the sum of every other kept column, and `n`,
# the number of rows in the group.
aggr <- function(data, criteria, output_columns) {
  ds <- data[, colnames(data) %in% output_columns, drop = FALSE]
  k <- ncol(ds)
  if (k < 1L) {
    stop("none of 'output_columns' is present in 'data'", call. = FALSE)
  }
  # Each aggregated column becomes a two-column matrix: (sum, count).
  d <- aggregate(ds, by = list(criteria), function(x) c(sum(x), length(x)))
  # Flatten to: group value, then a (sum, count) pair per column of ds.
  flat <- do.call(data.frame, d)
  # Keep the group value, the sums of columns 2..k and the last count.
  # Positions are derived from k instead of being hard-coded, so any
  # number of output columns works (fixed indices only fit exactly three).
  keep <- c(1L, 2L * seq_len(k)[-1L], 2L * k + 1L)
  setNames(flat[, keep, drop = FALSE], c(names(ds), "n"))
}
> with(df, aggr(df, Age, output_columns))
Age X1 X2 n
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2

Resources