I am having some issues translating a dataframe into wide format using pivot_wider. My dataframe looks like this:
# Example input: three time-ordered observations of A-D for each ID.
# header = TRUE is spelled out because T is an ordinary, reassignable variable.
Data <- read.table(header = TRUE, text = "
ID A B C D
1 6.01764 0.00409222 0.000500143 101.816
1 6.01769 0.00431931 0.000565946 101.334
1 6.01774 0.00454617 0.00063163 101.923
2 6.01779 0.00477308 0.000697374 101.914
2 6.01784 0.00500005 0.000763118 101.905
2 6.0179 0.00522703 0.000828803 101.926
3 6.01795 0.005454 0.000894606 101.889
3 6.018 0.00568086 0.000960231 101.895
3 6.01805 0.00590783 0.00102603 101.87
")
I would like to create unique column names by combining the "ID" with the column name so that it looks like this:
# Desired output: one column per ID/variable combination.
# check.names = FALSE keeps the headers as "1A", "1B", ...; with the default,
# read.table() rewrites them into syntactic names ("X1A", "X1B", ...).
Datalong <- read.table(header = TRUE, check.names = FALSE, text = "
1A 1B 1C 1D 2A 2B 2C 2D 3A 3B 3C 3D
6.01764 0.00409222 0.000500143 101.816 6.01779 0.00477308 0.000697374 101.914 6.01795 0.005454 0.000894606 101.889
6.01769 0.00431931 0.000565946 101.334 6.01784 0.00500005 0.000763118 101.905 6.018 0.00568086 0.000960231 101.895
6.01774 0.00454617 0.00063163 101.923 6.0179 0.00522703 0.000828803 101.926 6.01805 0.00590783 0.00102603 101.87
")
I am thinking I might need to add a new column that counts each instance of the ID column (as it is time series data)
I have tried:
DataNew <- Data %>% pivot_wider(names_from = ID, values_from = c(ID, colnames(Data)))
And
Data %>% group_by(ID) %>% mutate(time = row_number()) %>% pivot_wider(names_from = time, values_from = c(ID, colnames(Data)))
but to no avail. Any support would be greatly appreciated!
How about this:
# Reshape to long form, number the observations within each ID/variable pair,
# then spread back out with one column per "<ID><variable>" combination.
DataNew <- Data %>%
  pivot_longer(-ID, names_to = "var", values_to = "vals") %>%
  group_by(ID, var) %>%
  # row_number() rather than 1:n(): 1:n() evaluates to c(1, 0) when n() is 0.
  mutate(
    obs = row_number(),
    vnames = paste0(ID, var)
  ) %>%
  ungroup() %>%
  select(-c(ID, var)) %>%
  # obs keeps the rows distinct during the pivot and is dropped afterwards.
  pivot_wider(names_from = vnames, values_from = vals) %>%
  select(-obs)
DataNew
# # A tibble: 3 x 12
# `1A` `1B` `1C` `1D` `2A` `2B` `2C` `2D` `3A` `3B`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 6.02 0.00409 5.00e-4 102. 6.02 0.00477 6.97e-4 102. 6.02 0.00545
# 2 6.02 0.00432 5.66e-4 101. 6.02 0.00500 7.63e-4 102. 6.02 0.00568
# 3 6.02 0.00455 6.32e-4 102. 6.02 0.00523 8.29e-4 102. 6.02 0.00591
# # … with 2 more variables: `3C` <dbl>, `3D` <dbl>
We can use dcast from data.table
library(data.table)
dcast(setDT(Data), rowid(ID) ~ ID, value.var = c('A', 'B', 'C', 'D'))
Related
My data take this shape:
set.seed(666)
grouping <- rep(c("A", "B"), 3)
theMonth <- c("2022_01", "2022_01", "2022_02", "2022_02", "2022_03", "2022_03")
revenue <- sample(100:1000, 6)
df <- tibble(grouping, theMonth, revenue)
I'm being asked to spread these data by month...
step1 <- spread(df, theMonth, revenue)
step1
# A tibble: 2 × 4
grouping `2022_01` `2022_02` `2022_03`
<chr> <int> <int> <int>
1 A 673 707 639
2 B 737 222 753
...but also, within the same table, I'm being asked for the cumulative progress of B (and only B) toward a target, say in this case 10000. So the desired output is something like:
grouping `2022_01` `2022_02` `2022_03`
<chr> <int> <int> <int>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37% 9.59% 17.12%
What's the best way to attack this? Should I do it before I spread, probably using mutate? Or is there a clean way to do it after the spread?
(Answer does not have to use dplyr, but that is my preferred package for this sort of work.)
We may filter the data first, get the cumulative sum column, bind the data with the original data and then create the row for 'Progress' with add_row
library(dplyr)
library(tidyr)
library(tibble)
df %>%
filter(grouping == 'B') %>%
mutate(grouping = 'CumSumB', revenue = cumsum(revenue)) %>%
bind_rows(df, .) %>%
pivot_wider(names_from = theMonth, values_from = revenue) %>%
add_row(., tibble(grouping = "Progress", .[3, -1]/10000 * 100))
-output
# A tibble: 4 × 4
grouping `2022_01` `2022_02` `2022_03`
<chr> <dbl> <dbl> <dbl>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37 9.59 17.1
Adding the % would make the whole column character. If needed, it can be done
library(stringr)
df %>%
filter(grouping == 'B') %>%
mutate(grouping = 'CumSumB', revenue = cumsum(revenue)) %>%
bind_rows(df, .) %>%
pivot_wider(names_from = theMonth, values_from = revenue) %>%
add_row(., tibble(grouping = "Progress", .[3, -1]/10000 * 100)) %>%
mutate(across(-grouping, ~ replace(.x, n(), str_c(.x[n()], "%"))))
# A tibble: 4 × 4
grouping `2022_01` `2022_02` `2022_03`
<chr> <chr> <chr> <chr>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37% 9.59% 17.12%
Here is an alternative approach:
library(dplyr)
library(tidyr)
df %>%
mutate(revenueA = lag(revenue, default = revenue[1])) %>%
filter(row_number() %% 2 == 0) %>%
mutate(CumSum = cumsum(revenue),
Progres = paste0(CumSum/100, "%")) %>%
pivot_longer(-c(grouping, theMonth),
names_to = "key",
values_to = "val",
values_transform = list(val = as.character)) %>%
pivot_wider(names_from = theMonth, values_from = val) %>%
mutate(grouping = case_when(key == "revenue" ~"B",
key == "revenueA" ~ "A",
TRUE ~ key)) %>%
arrange(grouping) %>%
select(-key)
grouping `2022_01` `2022_02` `2022_03`
<chr> <chr> <chr> <chr>
1 A 673 707 639
2 B 737 222 753
3 CumSum 737 959 1712
4 Progres 7.37% 9.59% 17.12%
Here is another option:
library(dplyr)
library(tidyr)
df %>%
pivot_wider(names_from = grouping, values_from = revenue) %>%
mutate(
CumSumB = cumsum(B),
Progress = (CumSumB / 10000) * 100
) %>%
pivot_longer(-theMonth, names_to = "grouping") %>%
pivot_wider(names_from = theMonth, values_from = value)
Returns:
grouping `2022_01` `2022_02` `2022_03`
<chr> <dbl> <dbl> <dbl>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37 9.59 17.1
I got a data frame with a lot of columns and want to summarise them with multiple functions.
test_df <- data.frame(Group = sample(c("A", "B", "C"), 10, T), var1 = sample(1:5, 10, T), var2 = sample(3:7, 10, T))
test_df %>%
group_by(Group) %>%
summarise_all(c(Mean = mean, Sum = sum))
# A tibble: 3 x 5
Group var1_Mean var2_Mean var1_Sum var2_Sum
<chr> <dbl> <dbl> <int> <int>
1 A 3.14 5.14 22 36
2 B 4.5 4.5 9 9
3 C 4 6 4 6
This results in a tibble with Group as the first column and column names that combine the previous column name and the function name.
The desired result is a table with the previous column names in the first column and the groups and functions in the column names.
I can achieve this with
test_longer <- test_df %>% pivot_longer(cols = starts_with("var"), names_to = "var", values_to = "val")
# Add row number because spread needs unique identifiers for rows
test_longer <- test_longer %>%
group_by(Group) %>%
mutate(grouped_id = row_number())
spread(test_longer, Group, val) %>%
select(-grouped_id) %>%
group_by(var) %>%
summarise_all(c(Mean = mean, Sum = sum), na.rm = T)
# A tibble: 2 x 7
var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
<chr> <dbl> <dbl> <dbl> <int> <int> <int>
1 var1 3.14 4.5 4 22 9 4
2 var2 5.14 4.5 6 36 9 6
But this seems to be a rather long detour... There probably is a better way, but I could not find it. Any suggestions? Thank you
There's lots of ways to go about it, but I would simplify it by pivoting to a longer data frame initially, and then grouping by var and group. Then you can just pivot wider to get the final result you want. Note that I used summarize(across()) which replaces the deprecated summarize_all(), even though with a single column could've just manually specified Mean = ... and Sum = ....
set.seed(123)
test_df %>%
pivot_longer(
var1:var2,
names_to = "var"
) %>%
group_by(Group, var) %>%
summarize(
across(
everything(),
list(Mean = mean, Sum = sum),
.names = "{.fn}"
),
.groups = "drop"
) %>%
pivot_wider(
names_from = "Group",
values_from = c(Mean, Sum),
names_glue = "{Group}_{.value}"
)
#> # A tibble: 2 × 7
#> var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
#> <chr> <dbl> <dbl> <dbl> <int> <int> <int>
#> 1 var1 1 2.5 3.2 1 10 16
#> 2 var2 5 4.5 4.4 5 18 22
I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter representative example of the data, using only 2 variables could be the following;
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally the end result will be a summarised data frame containing four columns; one for the variable being tested (var1, var2 etc.), two for the groups being tested every time and a final one for the p-value.
I've tried duplicating the group column in the long form, and doing a double group_by in order to do the comparisons but with no result
# Reshape to long form, duplicate the group column, and count the rows in
# each (group_a, group_b) combination.
result <- df %>%
  # names_to / values_to are passed by name: extra positional arguments to
  # pivot_longer() are not matched to these parameters.
  pivot_longer(var1:var2, names_to = "var", values_to = "value") %>%
  rename(group_a = group) %>%
  mutate(group_b = group_a) %>%
  group_by(group_a, group_b) %>%
  summarise(n = n())
We can reshape the data into 'long' format with pivot_longer, then grouped by 'group', apply the pairwise.t.test, extract the list elements and transform into tibble with tidy (from broom) and unnest the list column
library(dplyr)
library(tidyr)
library(broom)
df %>%
pivot_longer(cols = -group, names_to = 'grp') %>%
group_by(group) %>%
summarise(out = list(pairwise.t.test(value, grp
) %>%
tidy)) %>%
unnest(c(out))
-output
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244
In case you end up wanting more information about the t-tests, here is an approach that will allow you to extract more information such as the degrees of freedom and value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
var1 = rnorm(90, 1, 1),
var2 = rnorm(90, 1.5, 1),
group = rep(1:3, each = 30)
)
df %>%
select(-group) %>%
names() %>%
map_dfr(~ {
y <- .
combn(3, 2) %>%
t() %>%
as.data.frame() %>%
pmap_dfr(function(V1, V2) {
df %>%
select(group, all_of(y)) %>%
filter(group %in% c(V1, V2)) %>%
t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
tidy() %>%
transmute(y = y,
group_1 = V1,
group_2 = V2,
df = parameter,
t_value = statistic,
p_value = p.value
)
})
})
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is @akrun's answer tweaked to give the same p-values as the above approach. Note the p.adjust.method = "none", which gives independent t-tests and will therefore inflate your Type I error rate.
df %>%
pivot_longer(
cols = -group,
names_to = "y"
) %>%
group_by(y) %>%
summarise(
out = list(
tidy(
pairwise.t.test(
value,
group,
p.adjust.method = "none",
pool.sd = FALSE
)
)
)
) %>%
unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)
I often have to transform long tidy data frames into wide format. To do so I use the following standard procedure:
# Example data frame
df <- data.frame("ID" = rep(1:5, each = 4), "score" = runif(20, 0, 100), "location" = rep(c("a", "b", "c", "d"), 5))
# Transform into wide format
df_wide <- df %>%
group_by_at(vars(-score)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=location, value=score) %>% # spread
dplyr::select(-row_id)
Instead of typing this little script over and over again, I wanted to define a function to do it automatically. I found many useful posts on how to include column names as function inputs, but somehow it doesn't work or I get error messages. What am I doing wrong?
Below are a few of my attempts (none of them works), following these and this suggestions:
# Attempt 1: convert the column arguments with sym() and unquote them with !!.
# NOTE(review): `dat` is never referenced in the body, so the pipeline has no
# data frame at its head and group_by_at() receives the vars() selection as
# its first argument -- presumably `dat %>%` was intended before it.
wide_fun <- function(dat, key_name, value_name) {
group_by_at(vars(- !! sym(value_name))) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=!! sym(key_name), value=!! sym(value_name)) %>% # spread
dplyr::select(-row_id)
}
# Attempt 2: capture the column arguments as quosures with enquo() and
# unquote them with !!.
# NOTE(review): `dat` is never referenced in the body, so the pipeline has no
# data frame at its head and group_by_at() receives the vars() selection as
# its first argument -- presumably `dat %>%` was intended before it.
wide_fun2 <- function(dat, key_name, value_name) {
key_col <- enquo(key_name)
value_col <- enquo(value_name)
group_by_at(vars(- !!value_col)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key= !!key_col, value= !!value_col) %>% # spread
dplyr::select(-row_id)
}
# Attempt 3: pass the arguments straight through without any unquoting.
# NOTE(review): `dat` is never referenced in the body, so the pipeline has no
# data frame at its head and group_by_at() receives the vars() selection as
# its first argument -- presumably `dat %>%` was intended before it.
wide_fun3 <- function(dat, key_name, value_name) {
group_by_at(vars(- value_name)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=key_name, value=value_name) %>% # spread
dplyr::select(-row_id)
}
wide_fun3(df, quote(location), quote(score))
Thanks for your help!
I've slightly updated your code to dplyr 1.0.0 and tidyr. Then you can make use of the new dplyr programming feature {{}} to specify variables that are arguments of a function.
# Example data frame
df <- data.frame("ID" = rep(1:5, each = 4), "score" = runif(20, 0, 100), "location" = rep(c("a", "b", "c", "d"), 5))
library(dplyr)
#' Spread a long data frame into wide format
#'
#' Groups by every column except `value_name`, numbers the rows within each
#' group so repeated keys stay distinguishable, pivots wider, and then drops
#' the helper index again.
#'
#' @param .data A data frame in long format.
#' @param key_name Unquoted column whose values become the new column names.
#' @param value_name Unquoted column whose values fill the new columns.
#' @return The output of `tidyr::pivot_wider()` with the temporary `row_id`
#'   column removed.
wide_fun <- function(.data, key_name, value_name) {
  .data %>%
    # Group by everything other than the value column.
    group_by(across(-{{ value_name }})) %>%
    # Build the group index. row_number() is used instead of 1:n() because
    # 1:n() evaluates to c(1, 0) when n() is 0.
    mutate(row_id = row_number()) %>%
    ungroup() %>%
    tidyr::pivot_wider(
      names_from = {{ key_name }},
      values_from = {{ value_name }}
    ) %>%
    select(-row_id)
}
wide_fun(df, location, score)
#> # A tibble: 5 x 5
#> ID a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 90.8 38.9 28.7 39.0
#> 2 2 94.5 24.9 84.6 54.6
#> 3 3 61.1 97.2 12.2 57.7
#> 4 4 52.7 85.6 41.4 100.
#> 5 5 17.8 86.1 92.3 33.7
Created on 2020-09-11 by the reprex package (v0.3.0)
Edit
This function should also work with older versions of dplyr:
library(dplyr)
#' Spread a long data frame into wide format (older dplyr versions)
#'
#' Same steps as a group-index-then-pivot reshape, but written with
#' `group_by_at()` and `ensym()` so it does not rely on `across()` or `{{ }}`.
#'
#' @param .data A data frame in long format.
#' @param key_name Unquoted column whose values become the new column names.
#' @param value_name Unquoted column whose values fill the new columns.
#' @return The output of `tidyr::pivot_wider()` with the temporary `row_id`
#'   column removed.
wide_fun_2 <- function(.data, key_name, value_name) {
  .data %>%
    # Group by everything other than the value column.
    group_by_at(vars(-!!ensym(value_name))) %>%
    # Build the group index. row_number() is used instead of 1:n() because
    # 1:n() evaluates to c(1, 0) when n() is 0.
    mutate(row_id = row_number()) %>%
    ungroup() %>%
    tidyr::pivot_wider(
      names_from = !!ensym(key_name),
      values_from = !!ensym(value_name)
    ) %>%
    select(-row_id)
}
df %>%
wide_fun_2(location, score)
A tibble: 5 x 5
ID a b c d
<int> <dbl> <dbl> <dbl> <dbl>
1 1 72.2 81.4 52.5 48.8
2 2 36.1 27.5 82.2 73.0
3 3 83.9 68.2 80.9 15.7
4 4 0.451 70.0 18.5 43.2
5 5 82.6 68.2 22.8 63.0
If you just provide the argument that specifies the column, you only need to deal with symbols and not quosures, therefore you need to use ensym.
Using tidyr/dplyr, I have some factor columns which I'd like to Z-score, and then mutate an average Z-score, whilst retaining the original data for reference.
I'd like to avoid using a for loop in tidyr/dplyr, thus I'm gathering my data and performing my calculation (Z-score) on a single column. However, I'm struggling with restoring the wide format.
Here is a MWE:
library(dplyr)
library(tidyr)
# Original Data
dfData <- data.frame(
Name = c("Steve","Jwan","Ashley"),
A = c(10,20,12),
B = c(0.2,0.3,0.5)
) %>% tbl_df()
# Gather to Z-score
dfLong <- dfData %>% gather("Factor","Value",A:B) %>%
mutate(FactorZ = paste0("Z_",Factor)) %>%
group_by(Factor) %>%
mutate(ValueZ = (Value - mean(Value,na.rm = TRUE))/sd(Value,na.rm = TRUE))
# Now go wide to do some mutations (e.g. Z_Avg = (Z_A + Z_B)/2)
# This does not work
dfWide <- dfLong %>%
spread(Factor,Value) %>%
spread(FactorZ,ValueZ)%>%
mutate(Z_Avg = (Z_A+Z_B)/2)
# This is the desired result
dfDesired <- dfData %>% mutate(Z_A = (A - mean(A,na.rm = TRUE))/sd(A,na.rm = TRUE)) %>% mutate(Z_B = (B - mean(B,na.rm = TRUE))/sd(B,na.rm = TRUE)) %>%
mutate(Z_Avg = (Z_A+Z_B)/2)
Thanks for any help/input!
Another approach using dplyr (version 0.5.0)
library(dplyr)
dfData %>%
mutate_each(funs(Z = scale(.)), -Name) %>%
mutate(Z_Avg = (A_Z+B_Z)/2)
# Mean that ignores missing values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
means <- function(x) mean(x, na.rm = TRUE)
dfWide %>% group_by(Name) %>% summarise_each(funs(means)) %>% mutate(Z_Avg = (Z_A + Z_B)/2)
# A tibble: 3 x 6
Name A B Z_A Z_B Z_Avg
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
3 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
Here is one approach with long and wide format. For z-transformation, you can use the base function scale. Furthermore, this approach includes a join to combine the original data frame and the one including the new values.
dfLong <- dfData %>%
gather(Factor, Value, A:B) %>%
group_by(Factor) %>%
mutate(ValueZ = scale(Value))
# Name Factor Value ValueZ
# <fctr> <chr> <dbl> <dbl>
# 1 Steve A 10.0 -0.7559289
# 2 Jwan A 20.0 1.1338934
# 3 Ashley A 12.0 -0.3779645
# 4 Steve B 0.2 -0.8728716
# 5 Jwan B 0.3 -0.2182179
# 6 Ashley B 0.5 1.0910895
dfWide <- dfData %>% inner_join(dfLong %>%
ungroup %>%
select(-Value) %>%
mutate(Factor = paste0("Z_", Factor)) %>%
spread(Factor, ValueZ) %>%
mutate(Z_Avg = (Z_A + Z_B) / 2))
# Name A B Z_A Z_B Z_Avg
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
# 2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
# 3 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
I would just do it all in wide format. No need to keep switching between the long and wide formats.
# Z-score A and B directly in wide format, then average the two scores.
# Columns are referenced by bare name so the pipeline uses whatever data frame
# is piped in, instead of reaching back to dfData via `$`. na.rm = TRUE and
# the Z_Avg name match the dfDesired definition.
dfData %>%
  mutate(
    Z_A = (A - mean(A, na.rm = TRUE)) / sd(A, na.rm = TRUE),
    Z_B = (B - mean(B, na.rm = TRUE)) / sd(B, na.rm = TRUE)
  ) %>%
  mutate(Z_Avg = (Z_A + Z_B) / 2)