My data looks like this :
structure(
list(
ID = c(1, 2, 3, 4, 5, 6),
Compagny = c("x", "x", "x", "y", "y", "y"),
Variable = c("size", "lenght", "diameter", "size", "lenght", "diameter"),
Score = c(12, 15, 8, 20, 4, 7)
),
row.names = c(NA, -6L),
class = "data.frame"
)
ID
Compagny
Variable
Score
1
x
size
12
2
x
lenght
15
3
x
diameter
8
4
y
size
20
5
y
lenght
4
6
y
diameter
7
i want to pivot wider so that variables are columns :
ID
Compagny
size
lenght
diameter
1
x
12
15
8
2
y
20
4
7
I've Followed this tutorial
because i had the same problem
pivot_wider issue "Values in `values_from` are not uniquely identified; output will contain list-cols"
I Copy/paste this lines of codes found above :
d %>%
group_by(name) %>%
mutate(row = row_number()) %>%
tidyr::pivot_wider(names_from = name, values_from = val) %>%
select(-row)
That became
PivoTable <- LongTable %>%
group_by(score) %>%
mutate(row = row_number()) %>%
tidyr::pivot_wider(names_from = score, values_from = mean) %>%
select(-row)
And I also have a special identifier for each row.
It still doesn't work even though I dont have a propre table, but a matrix with NA values instead (cf. picture)
You can do this:
df %>%
pivot_wider(id_cols = -ID, names_from = Variable, values_from = Score) %>%
mutate(ID = row_number(), .before = Compagny)
# A tibble: 2 x 5
ID Compagny size lenght diameter
<int> <chr> <int> <int> <int>
1 1 x 12 15 8
2 2 y 20 4 7
ESGTable <- select(EU_ESG_PILLARS,compagny, variable_name, mean_value)
ESGTable <- tibble::rowid_to_column(ESGTable, "ID")
# Petittable <- tibble::rowid_to_column(Petittable, "ID")
StackTry <- ESGTable %>%
pivot_wider(id_cols = -ID, names_from = variable_name, values_from = mean_value) %>%
mutate(ID = row_number(), .before = compagny)
## --> worked
Adding and index column (1-->n rows)
delete one colomn that was redundant
pivot_wider(id_cols --> identify each row uniquely
Thank you very much for such quick help !
Output
Related
I have a dataframe that is set up as follows:
set.seed(1234)
id <- rep(1:50, each = 3)
stimuli <- rep(c("a", "b", "c"), each = 1, times = 50)
dv_1 <- rnorm(150, mean = 2, sd = 0.7)
dv_2 <- rnorm(150, mean = 4, sd = 1.5)
dv_3 <- rnorm(150, mean = 7.5, sd = 1)
simdat <- data.frame(id, stimuli, dv_1, dv_2, dv_3)
simranks <- t(apply(simdat[,-1], 1, function(x) rank(x, ties.method = "min") ))
colnames(simranks) <- paste(colnames(simranks), "_rank", sep="")
simdat <- data.frame(simdat, simranks)
I then have split the dataframe according to the different types of stimuli, e.g.,
dat_a <- simdat %>%
dplyr::filter(stimuli == "a") %>%
select(id, dv_1_rank, dv_2_rank, dv_3_rank)
Then I would like to perform a bunch of different analyses on the subsetted data:
a_melt <- melt(dat_a, id.vars = c("id"), measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
a_perc <- a_melt %>%
group_by(variable, value) %>%
summarise(count = n()) %>%
mutate(perc = count/sum(count))
ggplot(a_perc, aes(x = variable, y = perc, fill = value)) +
geom_col(position = "stack") +
scale_y_continuous(labels=scales::percent)
How can I write the code so that, rather than copying and pasting the code chunks for stimuli b and stimuli c, it loops over all of them (the "stimuli" column in the original dataset)
Here is how I usually deal with this.
I split my dataset into chunks gathered in a list then,
I use lapply ou purrr::map function to apply a function, that does the analysis for one chunk, to each chunck.
Here you could do something like:
analyses <- function(.df){
require(dplyr)
require(ggplot2)
df_melt <- data.table::melt(.df, id.vars = c("id"), measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
df_perc <- df_melt %>%
group_by(variable, value) %>%
summarise(count = n()) %>%
mutate(perc = count/sum(count))
ggplot(df_perc, aes(x = variable, y = perc, fill = value)) +
geom_col(position = "stack") +
scale_y_continuous(labels=scales::percent)
}
lapply(split(simdat, ~ stimuli), analyses)
Note: this is just a proof of concept.
Does this produce your desired output?
library(tidyverse)
simdat <- expand_grid(stimuli = c("a", "b", "c"), id = 1:20) %>%
mutate(
dv_1_rank = floor(runif(nrow(.), 1, 5)),
dv_2_rank = floor(runif(nrow(.), 1, 5)),
dv_3_rank = floor(runif(nrow(.), 1, 5)),
)
a_perc <- simdat %>%
pivot_longer(dv_1_rank:dv_3_rank) %>%
group_by(stimuli, name, value) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(stimuli, name) %>%
mutate(perc = count/sum(count)) %>%
ungroup()
print(a_perc)
#> # A tibble: 36 x 5
#> stimuli name value count perc
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 a dv_1_rank 1 4 0.2
#> 2 a dv_1_rank 2 8 0.4
#> 3 a dv_1_rank 3 5 0.25
#> 4 a dv_1_rank 4 3 0.15
#> 5 a dv_2_rank 1 5 0.25
#> 6 a dv_2_rank 2 6 0.3
#> 7 a dv_2_rank 3 4 0.2
#> 8 a dv_2_rank 4 5 0.25
#> 9 a dv_3_rank 1 3 0.15
#> 10 a dv_3_rank 2 5 0.25
#> # ... with 26 more rows
Created on 2022-03-14 by the reprex package (v2.0.1)
I have a dataframe such as this, where most columns contain strings of values; the values in columns A_aoi, B_aoi, and C_aoi denote gaze directions (A, B, and C to speakers, * nowhere/elsewhere); the values in columns A_aoi_dur, B_aoi_dur, and C_aoi_dur denote the durations of these gazes:
df
# A tibble: 5 x 7
speaker A_aoi A_aoi_dur B_aoi B_aoi_dur C_aoi C_aoi_dur
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID01.B B*B*B 494,251,416,217,35 A* 153,1260 A 1413
2 ID01.A *B*C*C 445,412,116,533,600,153 A 2259 A*A*A 379,123,1300,144,313
3 ID01.A B*B*B 1098,249,168,184,526 A*A 1090,313,822 A*A 817,626,782
4 ID01.C C*C*B* 1794,1561,158,208,125,63 C* 2735,1174 *A 152,3757
5 ID01.B B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 * 5810 *B*B* 835,173,3827,661,314
For each speaker (identifiable by the suffixes A, B, and C in column speaker) I want to compute the summed durations and proportions of their gazes directions. The table I want to obtain is this:
Expected result:
A_aoi Total Prop B_aoi Total Prop C_aoi Total Prop
1 * 5431 34.77843 * 8557 54.79636 * 6021 38.556609
2 B 5533 35.43161 A 4324 27.68955 A 8761 56.102715
3 C 4652 29.78996 C 2735 17.51409 B 834 5.340676
My feeling is that this is best done by converting the dataframe to a long format. So using separate_rows and constructing many intermediate dataframes for each speaker's gazes directions and gaze durations I'v ended up with this convoluted code -- it does what it's supposed to do. But I'm pretty sure there's a more economical and more elegant way!
What would that be? Help is much appreciated!
library(dplyr)
library(tidyr)
### A:
a_dur <- df %>%
separate_rows(A_aoi_dur, sep = ",") %>%
select(A_aoi_dur)
a_aoi <- df %>%
separate_rows(A_aoi, sep = "") %>%
select(A_aoi) %>%
filter(!A_aoi == "")
A <- cbind(a_dur, a_aoi)
# get grouped total durations and proportions:
A_stat <- A %>%
group_by(A_aoi) %>%
summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### B:
b_dur <- df %>%
separate_rows(B_aoi_dur, sep = ",") %>%
select(B_aoi_dur)
b_aoi <- df %>%
separate_rows(B_aoi, sep = "") %>%
select(B_aoi) %>%
filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)
# get grouped total durations and proportions:
B_stat <- B %>%
group_by(B_aoi) %>%
summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### C:
c_dur <- df %>%
separate_rows(C_aoi_dur, sep = ",") %>%
select(C_aoi_dur)
c_aoi <- df %>%
separate_rows(C_aoi, sep = "") %>%
select(C_aoi) %>%
filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)
# get grouped total durations and proportions:
C_stat <- C %>%
group_by(C_aoi) %>%
summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
# get final table:
cbind(A_stat, B_stat, C_stat)
Reproducible data:
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C",
"ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153",
"1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260",
"2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A",
"A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313",
"817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
One way of doing it (avoiding duplicated column names though):
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
map_columns <- function(aoi, dur){
tibble(
speaker = aoi,
duration = as.integer(dur)
)
}
df %>%
select(-1) %>% #This column seems irrelevant
mutate(
A_aoi = str_split(A_aoi, ''),
B_aoi = str_split(B_aoi, ''),
C_aoi = str_split(C_aoi, ''),
A_aoi_dur = str_split(A_aoi_dur, ','),
B_aoi_dur = str_split(B_aoi_dur, ','),
C_aoi_dur = str_split(C_aoi_dur, ','),
A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
C_aoi = map2(C_aoi, C_aoi_dur, map_columns),
) %>%
select(1, 3, 5) %>%
gather() %>%
unnest(cols = value) %>%
group_by(key, speaker) %>%
summarise(
total = sum(duration)
) %>%
mutate(
prop = total/sum(total)*100
) %>%
ungroup() %>%
nest(data = -key) %>%
spread(key, data) %>%
unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))
Output:
# A tibble: 3 x 9
speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
<chr> <int> <dbl> <chr> <int> <dbl> <chr> <int> <dbl>
1 * 5431 34.8 * 8557 54.8 * 6021 38.6
2 B 5533 35.4 A 4324 27.7 A 8761 56.1
3 C 4652 29.8 C 2735 17.5 B 834 5.34
Here is a shot still need to sort the column a bit at the end but I think it is a tidy version compare with your code though the output is a bit different as it have all the aoi in one columns instead of have 3 columns differently as yours.
library(dplyr)
library(tidyr)
library(purrr)
# Using group_split to separate duration & attention group
split_df <- df %>%
pivot_longer(cols = contains("aoi"), names_to = "aoi",
values_to = "aoi_values") %>%
mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
group_split(aoi_names)
# For each group apply the same logics you do then combined them together
tidy_df <- bind_cols(split_df[[1]] %>%
separate_rows(aoi_values, sep = "") %>%
filter(aoi_values != "") %>%
select(speaker, aoi, aoi_values),
split_df[[2]] %>%
separate_rows(aoi_values, sep = ",") %>%
mutate(aoi = gsub("_dur", "", aoi)) %>%
select(duration = aoi_values))
# Finally calculate and pivot wider to have your desire output
tidy_df %>%
group_by(aoi, aoi_values) %>%
summarize(total_duration = sum(as.numeric(duration)),
.groups = "drop") %>%
group_by(aoi) %>%
mutate(prop = total_duration / sum(total_duration) * 100) %>%
pivot_wider(id_cols = aoi_values, names_from = aoi,
names_glue = "{aoi}_{.value}",
values_fill = 0,
values_from = c(total_duration, prop)) %>%
select(aoi_values, sort(names(.)))
Output
# A tibble: 4 x 7
aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 * 34.8 5431 54.8 8557 38.6 6021
2 B 35.4 5533 0 0 5.34 834
3 C 29.8 4652 17.5 2735 0 0
4 A 0 0 27.7 4324 56.1 8761
I have a tibble with multiple columns with multiple list columns I'd like to unnest_wider.
df1 <- tibble(
gr = c('a', 'b', 'c'),
values1 = list(1:2, 3:4, 5:6),
values2 = list(1:2, 3:4, 5:6)
)
I have tried many approaches that have not worked including adding a vector into col
df1 %>% # unnest_wider doesn't take multiple inputs
unnest_wider(col = c(values, values2),
names_sep = c("_1", "_2"),
names_repair = "unique")
and trying mutate_at
df1 %>% # mutate_at doesn't send data
mutate_at(vars(values, values2),
~unnest_wider(col = .,
names_sep = c("_1", "_2"),
names_repair = "unique"))
How can I unnest multiple columns wider?
Here is one option with map
library(dplyr)
library(purrr)
map_dfc(names(df1[-1]), ~
df1 %>%
select(.x) %>%
unnest_wider(c(!!.x), names_sep=c("_1", "_2"),
names_repair = 'unique')) %>%
bind_cols(df1 %>%
select(gr), .)
# A tibble: 3 x 5
# gr values1_1 values1_2 values2_1 values2_2
#* <chr> <int> <int> <int> <int>
#1 a 1 2 1 2
#2 b 3 4 3 4
#3 c 5 6 5 6
Answer I used
df1 %>%
unnest() %>%
mutate(q_name = rep(c("1", "2"), nrow(.)/2)) %>%
pivot_wider(id_cols = gr,
names_from = q_name,
values_from = values1:values2)
I have a dataframe that looks like this:
df <-
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
For some years, I have multiple measures, therefore. I want to create a new set of columns, which are the averages of the 1 or 2 measurements take for that year. I could do this manually as follows to get the desired output:
out <-
df %>%
mutate(
avg_1995 = rowMeans(select(., contains("1995"))),
avg_1996 = rowMeans(select(., contains("1996"))),
avg_1997 = rowMeans(select(., contains("1997"))),
avg_1998 = rowMeans(select(., contains("1998"))),
avg_1999 = rowMeans(select(., contains("1999"))))
Is there a way to automate this using purrr or dplyr functions? (I have hundreds of columns like this.)
One option could be:
map_dfc(.x = as.character(1995:1999),
~ df %>%
transmute(!!paste("ave", .x, sep = "_") := rowMeans(select(., contains(.x)))))
ave_1995 ave_1996 ave_1997 ave_1998 ave_1999
1 6 21 26 31 21
2 7 22 27 32 22
3 8 23 28 33 23
4 9 24 29 34 24
Here is another base R solution using aggregate
u<-aggregate(.~year,data.frame(year = gsub("\\D+","avg_",names(df)),t(df)),mean)
dfout <- setNames(data.frame(t(u[-1]),row.names = NULL),u$year)
such that
> dfout
avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
1 6 21 26 31 21
2 7 22 27 32 22
3 8 23 28 33 23
4 9 24 29 34 24
Base R
d = data.frame(lapply(split.default(df, gsub("\\D+", "", names(df))), rowMeans), check.names = FALSE)
names(d) = paste0("avg_", names(d))
cbind(df, d)
tidyverse
library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
left_join(df %>%
mutate(rn = row_number()) %>%
gather(key, val, -rn) %>%
mutate(year = paste0("avg_", gsub("\\D+", "", key))) %>%
group_by(rn, year) %>%
summarise(val = mean(val)) %>%
spread(year, val),
by = "rn") %>%
select(-rn)
Here is a solution with tidyr and dplyr :
df <-
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
suppressPackageStartupMessages( library(dplyr) )
suppressPackageStartupMessages( library(tidyr) )
df %>%
pivot_longer(data = ., cols = names(.),
names_to = "type_year"
) %>%
separate(col = "type_year", into = c("type", "year"), sep = "_") %>%
group_by(year) %>%
summarise(mean_value = mean(value)) %>%
pivot_wider(names_from = year, values_from = mean_value) %>%
rename_all(~paste0("avg_", .))
#> # A tibble: 1 x 5
#> avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 7.5 22.5 27.5 32.5 22.5
additional solution
result <- df %>%
mutate(n = row_number()) %>%
pivot_longer(-n) %>%
tidyr::extract(name, "year", "(\\d{4})") %>%
group_by(n, year) %>%
summarise(value = mean(value, na.rm = T)) %>%
pivot_wider(n, names_from = year, values_from = value, names_prefix = "avg_") %>%
ungroup() %>%
select(-n) %>%
bind_cols(df, .)
I have a dataframe I've created in the form
FREQ CNT
0 5
1 20
2 1000
3 3
4 3
I want to further group my results to be in the following form:
CUT CNT
0+1 25
2+3 1003
4+5 ...
.....
I've tried using the between and cut functions in dplyr but it just adds a new interval column to my dataframe can anyone give me a good indication as to where to go to achieve this?
Here is a way to do it in dplyr:
library(dplyr)
df <- df %>%
mutate(id = 1:n()) %>%
mutate(new_freq = ifelse(id %% 2 != 0, paste0(FREQ, "+", lead(FREQ, 1)), paste0(lag(FREQ, 1), "+", FREQ)))
df <- df %>%
group_by(new_freq) %>%
mutate(new_cnt = sum(CNT))
unique(df[, 4:5])
# A tibble: 2 x 2
# Groups: new_freq [2]
# new_freq new_cnt
# <chr> <int>
#1 0+1 25
#2 2+3 1003
data
df <- structure(list(FREQ = 0:3, CNT = c(5L, 20L, 1000L, 3L)), class = "data.frame", row.names = c(NA, -4L))
A non-elegant solution using dplyr... probably a better way to do this.
dat <- data.frame(FREQ = c(0,1,2,3,4), CNT = c(5,20,1000, 3, 3))
dat2 <- dat %>%
mutate(index = 0:(nrow(dat)-1)%/%2) %>%
group_by(index)
dat2 %>%
summarise(new_CNT = sum(CNT)) %>%
left_join(dat2 %>%
mutate(CUT = paste0(FREQ[1], "+", FREQ[2])) %>%
distinct(index, CUT),
by = "index") %>%
select(-index)
# A tibble: 3 x 2
new_CNT CUT
<dbl> <chr>
1 25 0+1
2 1003 2+3
3 3 4+NA