Data manipulation: gather or spread or both? - r

I am trying to change my data frame so I can look at it with some different plots. Essentially I want to compare different models. This is what I have:
# Example data: two observations for each of the variables A, B and C, holding
# the optimal value, a constant control and the values from two methods.
variable <- rep(c("A", "B", "C"), times = 2)
optimal <- c(10, 20, 30, 40, 80, 100)
control <- rep(15, times = 6)
method_1 <- c(11, 22, 28, 44, 85, 95)
method_2 <- c(9, 19, 31, 39, 79, 102)
df <- data.frame(variable, optimal, control, method_1, method_2)
df
and so it looks like this:
variable optimal control method_1 method_2
1 A 10 15 11 9
2 B 20 15 22 19
3 C 30 15 28 31
4 A 40 15 44 39
5 B 80 15 85 79
6 C 100 15 95 102
And I need something that looks like this:
variable A B C
1 optimal 10 20 30
2 optimal 40 80 100
3 control 15 15 15
4 control 15 15 15
5 method_1 11 22 28
6 method_1 44 85 95
7 method_2 9 19 31
8 method_2 39 79 102
I've tried gather and spread and transpose but nothing worked. Any thoughts? It feels like this should be an easy fix, but I could not get my head around it. Thanks in advance.

You have to go long first and then wide, i.e.
library(dplyr)
library(tidyr)
# Go long first (one row per variable/model pair), then wide again with one
# column per variable.
df %>%
  pivot_longer(-1) %>%
  # Every name/variable pair occurs twice, so collect the duplicates into
  # list-columns explicitly instead of relying on the warning fallback.
  pivot_wider(names_from = variable, values_from = value, values_fn = list) %>%
  # unnest() requires `cols`; the three list-columns are expanded in parallel.
  unnest(cols = c(A, B, C))
name A B C
<chr> <dbl> <dbl> <dbl>
1 optimal 10 20 30
2 optimal 40 80 100
3 control 15 15 15
4 control 15 15 15
5 method_1 11 22 28
6 method_1 44 85 95
7 method_2 9 19 31
8 method_2 39 79 102

I think you need both. Also note that gather and spread have been retired and replaced with pivot_longer and pivot_wider.
library(dplyr)
library(tidyr)
# Reshape to long format: one row per (variable, model name) pair.
df %>%
pivot_longer(cols = -variable) %>%
# Number the rows within each variable so that repeated name/variable
# combinations get distinct ids and pivot_wider() has a unique cell to fill.
group_by(variable) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = variable, values_from = value) %>%
# The helper index is only needed for the reshape.
select(-row)
# name A B C
# <chr> <dbl> <dbl> <dbl>
#1 optimal 10 20 30
#2 control 15 15 15
#3 method_1 11 22 28
#4 method_2 9 19 31
#5 optimal 40 80 100
#6 control 15 15 15
#7 method_1 44 85 95
#8 method_2 39 79 102

Related

How to keep grouped variables together in training and test data

I'm making and testing the accuracy of age extrapolations from growth measurements and to do this I have to split my data into my training and test data.
The issue is that individuals in my data set were measured multiple times and sometimes they were measured twice, sometimes 3 times. In the dataset Birds is the individual chick, age is the age at measurement, and wing is that measurement value.
I've tried using the group_by function to keep their measurements together, but this doesn't seem to work. I also tried nesting the data but that puts the data in a new table and my code doesn't like that. Is there another way I could keep the groups together while still randomly assigning them to training and test data?
library('tidyverse')
library("ggplot2")
library("readxl")
library("writexl")
library('dplyr')
library('Rmisc')
library('cowplot')
library('purrr')
library('caTools')
library('MLmetrics')
# Repeated growth measurements: one row per measurement of a chick.
# The vectors end on their last value; a trailing comma inside c() is an
# "argument is empty" error in R.
Bird <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8,
          9, 9, 9, 10, 10)
Age <- c(10, 17, 27, 17, 28, 10, 17, 27, 10, 17, 10, 17, 28, 10, 17, 28, 10,
         17, 28, 10, 17, 28, 10, 17, 28, 11, 18)
Wing <- c(39, 63, 98, 61, 99, 34, 48, 80, 30, 37, 35, 51, 71, 40, 55, 79, 34,
          47, 77, 36, 55, 84, 35, 55, 88, 36, 59)
# Group by individual, then store the individual id as a factor.
Set14 <- data.frame(Bird, Age, Wing) %>%
  group_by(Bird)
Set14$Bird <- as.factor(Set14$Bird)
Set14
# Put 70% of the ROWS into the training set. The indices are drawn over
# individual measurements, so rows of the same Bird can end up in both
# Training and Test.
sample_size = floor(0.7*nrow(Set14))
picked = sample(seq_len(nrow(Set14)),size = sample_size)
Training =Set14[picked,]
Training
Test =Set14[-picked,]
Test
# Linear model predicting Age from Wing, fitted on the training rows only.
trm<-lm(Age~Wing, data=Training)
# Predicted ages for the held-out rows.
predval<-predict(object=trm,
newdata=Test)
predval
# Side-by-side table of observed and predicted ages.
error<-data.frame(actual=Test$Age, calculated=predval)
error
# NOTE(review): MAPE presumably comes from MLmetrics, whose argument order
# looks like (y_pred, y_true); the observed ages are passed first here - confirm.
MAPE(error$actual, error$calculated)
In Base R you could do:
# Integer code of each row's bird; all measurements of one bird share a code.
a <- as.integer(Set14$Bird)
# Distinct birds and how many there are.
birds <- unique(a)
n <- length(birds)
# Draw 70% of the birds (not of the rows) and flag every row of a drawn bird,
# so the measurements of one bird never end up in both sets.
train_index <- a %in% birds[sample.int(n, floor(0.7 * n))]
train <- Set14[train_index, ]
test <- Set14[!train_index, ]
in Tidyverse:
# Drop the existing grouping, then collapse to one row per Bird with its
# measurements stored in the list-column `data`.
ungroup(Set14) %>%
nest_by(Bird) %>%
ungroup() %>%
# tt first holds floor(70% of the number of birds); it is then overwritten
# with a shuffled label vector containing that many 'train' labels and
# 'test' for the remaining birds.
mutate(tt = floor(.7*n()),
tt = sample(rep(c('train', 'test'), c(tt[1], n()-tt[1])))) %>%
# Expand back to one row per measurement; each row carries its bird's label.
unnest(data) %>%
# One tibble per label, with the label column dropped from the pieces.
# NOTE(review): the pieces are presumably ordered by label, i.e. 'test'
# before 'train' - confirm.
group_split(tt, .keep = FALSE)
[[1]]
# A tibble: 9 x 3
Bird Age Wing
<fct> <dbl> <dbl>
1 1 10 39
2 1 17 63
3 1 27 98
4 3 10 34
5 3 17 48
6 3 27 80
7 7 10 34
8 7 17 47
9 7 28 77
[[2]]
# A tibble: 18 x 3
Bird Age Wing
<fct> <dbl> <dbl>
1 2 17 61
2 2 28 99
3 4 10 30
4 4 17 37
5 5 10 35
6 5 17 51
7 5 28 71
8 6 10 40
9 6 17 55
10 6 28 79
11 8 10 36
12 8 17 55
13 8 28 84
14 9 10 35
15 9 17 55
16 9 28 88
17 10 11 36
18 10 18 59

Split multiple columns with delimiter and have consistent column names in r

I have a large data set and here is a sample (the raw data has more columns):
# Sample input: two columns, each holding an "x;y" pair per row.
dta0 <- data.frame(
  H1 = paste(10:15, 20:25, sep = ";"),
  H2 = paste(30:35, 40:45, sep = ";")
)
Here is my desired output
# Target layout: every "x;y" pair split into two integer columns.
desired_dta <- data.frame(
  H1_x = 10:15,
  H1_y = 20:25,
  H2_x = 30:35,
  H2_y = 40:45
)
How can I name columns like "H1_x", "H1_y","H2_x", "H2_y" ....?
You could try
library(tidyr)
# Split each column on ";" into its _x and _y parts. convert = TRUE turns the
# resulting pieces into numbers, matching the numeric desired output; without
# it the new columns stay character.
dta0 %>%
  separate(H1, into = c("H1_x", "H1_y"), sep = ";", convert = TRUE) %>%
  separate(H2, into = c("H2_x", "H2_y"), sep = ";", convert = TRUE)
#> H1_x H1_y H2_x H2_y
#> 1 10 20 30 40
#> 2 11 21 31 41
#> 3 12 22 32 42
#> 4 13 23 33 43
#> 5 14 24 34 44
#> 6 15 25 35 45
Or in base R
# Split every column on ";" into a character matrix (one row per input row),
# bind the matrices side by side and label the pieces <column>_x / <column>_y.
setNames(
  as.data.frame(
    do.call(cbind, lapply(dta0, function(column) {
      do.call(rbind, strsplit(column, ";"))
    }))
  ),
  unlist(lapply(names(dta0), function(nm) paste0(nm, c("_x", "_y"))))
)
#> H1_x H1_y H2_x H2_y
#> 1 10 20 30 40
#> 2 11 21 31 41
#> 3 12 22 32 42
#> 4 13 23 33 43
#> 5 14 24 34 44
#> 6 15 25 35 45
Here is an option with cSplit
library(splitstackshape)
# Split every column of dta0 on ";" in a single call. The new columns are
# named <column>_1, <column>_2 rather than _x / _y.
cSplit(dta0, names(dta0), sep=";")
# H1_1 H1_2 H2_1 H2_2
#1: 10 20 30 40
#2: 11 21 31 41
#3: 12 22 32 42
#4: 13 23 33 43
#5: 14 24 34 44
#6: 15 25 35 45
Another option is to use separate_rows(), then reshape to long and after that reshape to wide. Here is the code:
library(tidyverse)
#Code
dta0 %>%
  # Remember which input row each piece came from.
  mutate(id = 1:n()) %>%
  # One row per piece: every "x;y" value becomes two rows.
  separate_rows(c(H1, H2), sep = ";") %>%
  # Position of the piece within its input row (1 = x, 2 = y).
  group_by(id) %>%
  mutate(Var = 1:n()) %>%
  pivot_longer(-c(id, Var)) %>%
  # Build the final column names, e.g. "H1.x".
  mutate(
    Var = ifelse(Var == 1, "x", "y"),
    name = paste0(name, ".", Var)
  ) %>%
  select(-c(Var)) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  ungroup() %>%
  select(-id) %>%
  # Order the columns alphabetically (replaces the deprecated current_vars()).
  select(all_of(sort(names(.)))) %>%
  # Convert the split strings to numbers (replaces the deprecated mutate_each()).
  mutate(across(everything(), as.numeric))
Output:
# A tibble: 6 x 4
H1.x H1.y H2.x H2.y
<dbl> <dbl> <dbl> <dbl>
1 10 20 30 40
2 11 21 31 41
3 12 22 32 42
4 13 23 33 43
5 14 24 34 44
6 15 25 35 45
Assuming your data has an even number of columns:
# Name the columns pairwise: H1_x, H1_y, H2_x, H2_y, ...
nc <- ncol(desired_dta)
colnames(desired_dta) <- paste0(
  "H",
  rep(1:(nc / 2), each = 2),
  rep(c("_x", "_y"), times = nc / 2)
)

Create multiple new columns in tibble in R based on value of previous row giving prefix to all

I have a tibble as so:
# Example data: three integer columns of length 10.
df <- tibble(
  a = 1:10,
  b = 21:30,
  c = 31:40
)
I want to create a new tibble in which some of the columns are lagged. The new columns should be called prev_ followed by the lagged column's name, e.g. prev_a.
In my actual data, there are a lot of cols so I don't want to manually write it out. Additionally, I only want to do it for some cols. In this example I have done it manually, but I wanted to know if there is a way to use a function to do it.
# Manually add a lagged copy of each column, named prev_<column>.
# The tibble only has the columns a, b and c, so the third lag uses c.
df_new <- df %>%
  mutate(
    prev_a = lag(a),
    prev_b = lag(b),
    prev_c = lag(c)
  )
Thanks for your help!
With the current dplyr version you can create new variable names with mutate_at, using a named list will take the name of the list as suffix. If you want it as a prefix as in your example you can use rename_at to correct the variable naming. With your real data, you need to adjust the vars() selection. For your example data matches("[a-c]") did work.
library(dplyr)
# Example data: three integer columns of length 10.
df <- tibble(a = seq(1:10),
b = seq(21,30),
c = seq(31,40))
# Apply lag() to every column whose name matches the regex "[a-c]". Because
# the function list is named (`prev`), the results are added as new columns
# with that name as a suffix (a_prev, b_prev, c_prev).
df %>%
mutate_at(vars(matches("[a-c]")), list(prev = ~ lag(.x)))
#> # A tibble: 10 x 6
#> a b c a_prev b_prev c_prev
#> <int> <int> <int> <int> <int> <int>
#> 1 1 21 31 NA NA NA
#> 2 2 22 32 1 21 31
#> 3 3 23 33 2 22 32
#> 4 4 24 34 3 23 33
#> 5 5 25 35 4 24 34
#> 6 6 26 36 5 25 35
#> 7 7 27 37 6 26 36
#> 8 8 28 38 7 27 37
#> 9 9 29 39 8 28 38
#> 10 10 30 40 9 29 39
# Add the lagged columns with a "_prev" suffix, then turn the suffix into a
# prefix: for every column containing "_prev", strip that part and paste
# "prev" in front, separated by "_".
df %>%
mutate_at(vars(matches("[a-c]")), list(prev = ~ lag(.x))) %>%
rename_at(vars(contains( "_prev") ), list( ~paste("prev", gsub("_prev", "", .), sep = "_")))
#> # A tibble: 10 x 6
#> a b c prev_a prev_b prev_c
#> <int> <int> <int> <int> <int> <int>
#> 1 1 21 31 NA NA NA
#> 2 2 22 32 1 21 31
#> 3 3 23 33 2 22 32
#> 4 4 24 34 3 23 33
#> 5 5 25 35 4 24 34
#> 6 6 26 36 5 25 35
#> 7 7 27 37 6 26 36
#> 8 8 28 38 7 27 37
#> 9 9 29 39 8 28 38
#> 10 10 30 40 9 29 39
Created on 2020-04-29 by the reprex package (v0.3.0)
You could do this this way
# Column-bind the original tibble with a copy in which a, b and c are lagged.
df_new <- bind_cols(
  df,
  mutate_at(df, vars("a", "b", "c"), lag)
)
The names are a bit nasty, but you can rename them (check here). Or see @Bas's comment to get the names with a suffix.
# A tibble: 10 x 6
a b c a1 b1 c1
<int> <int> <int> <int> <int> <int>
1 1 21 31 NA NA NA
2 2 22 32 1 21 31
3 3 23 33 2 22 32
4 4 24 34 3 23 33
5 5 25 35 4 24 34
6 6 26 36 5 25 35
7 7 27 37 6 26 36
8 8 28 38 7 27 37
9 9 29 39 8 28 38
10 10 30 40 9 29 39
If you have dplyr 1.0 you can use the new across() function.
See some examples from the docs; instead of mean you want lag:
# Each pair shows a scoped verb (above the arrow) and its across() equivalent
# (below it). In across(), predicates are wrapped in where() and extra
# arguments are supplied through a lambda rather than through `...`.
df %>% mutate_if(is.numeric, mean, na.rm = TRUE)
# ->
df %>% mutate(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
df %>% mutate_at(vars(x, starts_with("y")), mean, na.rm = TRUE)
# ->
df %>% mutate(across(c(x, starts_with("y")), ~ mean(.x, na.rm = TRUE)))
df %>% mutate_all(mean, na.rm = TRUE)
# ->
df %>% mutate(across(everything(), ~ mean(.x, na.rm = TRUE)))

Group_by / summarize by two variables within a function

I would like to write a function that summarizes the provided data by some specified criteria, in this case by age.
The example data is a table of users' age and their stats.
# Example data: ages 18-25 repeated twice; the shorter stat columns are
# recycled to the same 16 rows.
df <- data.frame(
  Age = rep(18:25, times = 2),
  X1 = 10:17,
  X2 = 28:35,
  X4 = 22:29
)
Next I define the output columns that are relevant for the analysis
# Columns to keep for the analysis. The example data has X4 but no X3, so a
# name-based filter with %in% keeps only Age, X1 and X2.
output_columns <- c('Age', 'X1', 'X2', 'X3')
This function computes the sum of X1, X2 and X3 grouped by age.
#' Sum the numeric output columns within each group.
#'
#' @param data A data frame.
#' @param criteria Character vector with the names of the grouping columns.
#' @param output_columns Character vector of columns to keep; names that do
#'   not occur in `data` are ignored.
#' @return A tibble with one row per group holding the sum of every kept
#'   numeric column.
aggr <- function(data, criteria, output_columns) {
  data %>%
    # drop = FALSE keeps a data frame even when a single column matches.
    .[, colnames(.) %in% output_columns, drop = FALSE] %>%
    # group_by_() is deprecated; select the grouping columns by name instead.
    group_by(across(all_of(criteria))) %>%
    # summarise_each(funs(count), age) %>%
    summarise(across(where(is.numeric), sum))
}
When I call it like this
> e <- aggr(df, "Age", output_columns)
> e
# A tibble: 8 x 3
Age X1 X2
<int> <int> <int>
1 18 20 56
2 19 22 58
3 20 24 60
4 21 26 62
5 22 28 64
6 23 30 66
7 24 32 68
8 25 34 70
I want to have another column called count which shows the number of observations in each age group. Desired output is
> desired
Age X1 X2 count
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2
I have tried different ways to do that, e.g. tally(), summarize_each
etc. They all deliver wrong results.
I believe there should be an easy and simple way to do that.
Any help is appreciated.
Since you're already summing all variables, you can just add a column of all 1s before the summary function
#' Sum the numeric output columns within each group and count the rows.
#'
#' @param data A data frame.
#' @param criteria Character vector with the names of the grouping columns.
#' @param output_columns Character vector of columns to keep; names that do
#'   not occur in `data` are ignored.
#' @return A tibble with one row per group: the sums of the kept numeric
#'   columns plus `n`, the number of rows in the group.
aggr <- function(data, criteria, output_columns) {
  data %>%
    # drop = FALSE keeps a data frame even when a single column matches.
    .[, colnames(.) %in% output_columns, drop = FALSE] %>%
    # group_by_() is deprecated; select the grouping columns by name instead.
    group_by(across(all_of(criteria))) %>%
    # A column of ones sums up to the group size.
    mutate(n = 1L) %>%
    summarise(across(where(is.numeric), sum))
}
# A tibble: 8 x 4
Age X1 X2 n
<int> <int> <int> <int>
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2
We could create the 'count' column before summarise_if
#' Sum the numeric output columns within each group and count the rows.
#'
#' @param data A data frame.
#' @param criteria Character vector with the names of the grouping columns.
#' @param output_columns Character vector of columns to keep; names that do
#'   not occur in `data` are ignored.
#' @return A tibble, still grouped by `criteria`, with the sums of the kept
#'   numeric columns followed by `count`, the number of rows per group.
aggr <- function(data, criteria, output_columns) {
  data %>%
    select(intersect(names(.), output_columns)) %>%
    group_by(across(all_of(criteria))) %>%
    # Add the group size as a further grouping column so it is carried through
    # the summary instead of being summed (`add` is deprecated, use `.add`).
    group_by(count = n(), .add = TRUE) %>%
    summarise(across(where(is.numeric), sum)) %>%
    # Move `count` to the last position.
    select(setdiff(names(.), "count"), count)
}
aggr(df,"Age",output_columns)
# A tibble: 8 x 4
# Groups: Age [8]
# Age X1 X2 count
# <int> <int> <int> <int>
#1 18 20 56 2
#2 19 22 58 2
#3 20 24 60 2
#4 21 26 62 2
#5 22 28 64 2
#6 23 30 66 2
#7 24 32 68 2
#8 25 34 70 2
In base R you could do
#' Sum the output columns within each group and count the rows (base R).
#'
#' @param data A data frame.
#' @param criteria Vector of group values, one per row of `data` (e.g. the
#'   grouping column itself, as in `with(df, aggr(df, Age, output_columns))`).
#' @param output_columns Character vector of columns to keep; names that do
#'   not occur in `data` are ignored. The first kept column is expected to be
#'   the grouping column: its name labels the group values in the result.
#' @return A data frame with the group values, the per-group sum of every
#'   other kept column and `n`, the number of rows per group.
aggr <- function(data, criteria, output_columns) {
  ds <- data[, colnames(data) %in% output_columns, drop = FALSE]
  groups <- list(criteria)
  # Sum all value columns, i.e. everything except the grouping column. This
  # works for any number of value columns, not just two.
  sums <- aggregate(ds[-1L], by = groups, FUN = sum)
  # Group sizes; both aggregate() calls order the groups identically.
  counts <- aggregate(ds[[1L]], by = groups, FUN = length)
  out <- cbind(sums, n = counts$x)
  names(out) <- c(names(ds), "n")
  out
}
> with(df, aggr(df, Age, output_columns))
Age X1 X2 n
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2

Appending many columns - functions of existing columns - to data frame

I have a data frame with 200 columns: A_1, ..., A_100, B_1, ..., B_100. The entries of A are integers from 1 to 5 or NA, while the entries of B are -1, 0, 1, NA.
I want to append 100 more columns: C_1, ..., C_100 where C_i = A_i + B_i, except when it would yield 0 or 6, in which case it should stay as is.
What would be the best way to do this in R, in terms of clarity and computational complexity? There has to be a better way than a for loop or something like that, perhaps there are functions for this in some library? I'm going to have to do similar operations a lot so I'd like a streamlined method.
You can try:
library(tidyverse)
# Example data: three A columns and three B columns with ten rows each.
d <- data.frame(
  A_1 = 1:10,
  A_2 = 1:10,
  A_3 = 1:10,
  B_1 = 11:20,
  B_2 = 21:30,
  B_3 = 31:40
)
d %>%
  # Long format: one row per cell, `key` holding the column name.
  gather(key, value) %>%
  # Split e.g. "A_1" into the letter (a) and the column index (b).
  separate(key, into = c("a", "b")) %>%
  # Row position within each original column.
  group_by(b, a) %>%
  mutate(n = row_number()) %>%
  # Combine column index and row position into one id, then put the A and B
  # values of that id side by side.
  unite(a2, b, n) %>%
  spread(a, value) %>%
  mutate(Sum = A + B) %>%
  # Recover column index (a) and row position (b) from the id.
  separate(a2, into = c("a", "b"), remove = TRUE) %>%
  select(-A, -B) %>%
  # Back to wide format: one C_<index> column per A/B pair.
  mutate(a = paste0("C_", a)) %>%
  spread(a, Sum) %>%
  arrange(as.numeric(b)) %>%
  # Attach the original columns again, matched on the row number.
  left_join(d %>% rownames_to_column(), by = c("b" = "rowname"))
# A tibble: 10 x 10
b C_1 C_2 C_3 A_1 A_2 A_3 B_1 B_2 B_3
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 12 22 32 1 1 1 11 21 31
2 2 14 24 34 2 2 2 12 22 32
3 3 16 26 36 3 3 3 13 23 33
4 4 18 28 38 4 4 4 14 24 34
5 5 20 30 40 5 5 5 15 25 35
6 6 22 32 42 6 6 6 16 26 36
7 7 24 34 44 7 7 7 17 27 37
8 8 26 36 46 8 8 8 18 28 38
9 9 28 38 48 9 9 9 19 29 39
10 10 30 40 50 10 10 10 20 30 40
The idea is to use tidyr's gather and spread to get the columns A and B side by side. Then you can calculate the sum and transform it back to the expected data.frame. As long as your data.frame has the same number of A and B columns, it works.

Resources