I have this sample dataset and I want to convert it into a different format:
Type <- c("AGE", "AGE", "REGION", "REGION", "REGION", "DRIVERS", "DRIVERS")
Level <- c("18-25", "26-70", "London", "Southampton", "Newcastle", "1", "2")
Estimate <- c(1.5,1,2,3,1,2,2.5)
df_before <- data.frame(Type, Level, Estimate)
Type Level Estimate
1 AGE 18-25 1.5
2 AGE 26-70 1.0
3 REGION London 2.0
4 REGION Southampton 3.0
5 REGION Newcastle 1.0
6 DRIVERS 1 2.0
7 DRIVERS 2 2.5
Basically, I would like to transform the dataset into the following format. I have tried the function dcast(), but it does not seem to work.
AGE Estimate_AGE REGION Estimate_REGION DRIVERS Estimate_DRIVERS
1 18-25 1.5 London 2 1 2.0
2 26-70 1.0 Southampton 3 2 2.5
3 <NA> NA Newcastle 1 <NA> NA
# Number the rows within each Type, stack Level and Estimate into long form,
# then spread them out again into one column per Type/variable pair.
df_before %>%
  group_by(Type) %>%
  # Estimate is cast to character so it can share one value column with Level.
  mutate(id = row_number(), Estimate = as.character(Estimate)) %>%
  # The grouping is only needed for row_number(); drop it before reshaping.
  ungroup() %>%
  pivot_longer(-c(Type, id)) %>%
  # Name id_cols explicitly: supplying it by position is deprecated in tidyr.
  pivot_wider(id_cols = id, names_from = c(Type, name)) %>%
  # Re-infer the column types that were flattened to character above.
  type.convert(as.is = TRUE)
# A tibble: 3 x 7
id AGE_Level AGE_Estimate REGION_Level REGION_Estimate DRIVERS_Level DRIVERS_Estimate
<int> <chr> <dbl> <chr> <int> <int> <dbl>
1 1 18-25 1.5 London 2 1 2
2 2 26-70 1 Southampton 3 2 2.5
3 3 NA NA Newcastle 1 NA NA
In data.table:
library(data.table)
setDT(df_before)
dcast(melt(df_before, 'Type'), rowid(Type, variable)~Type + variable)
Note that you will get a lot of warnings because of the type mismatch. You could use reshape2::melt to avoid this.
Anyway, your data frame is not in a standard format.
In base R >= 4.1 (the native pipe `|>` requires R 4.1 or later):
transform(df_before, id = ave(Estimate, Type, FUN = seq_along)) |>
reshape(v.names = c('Level', 'Estimate'), dir = 'wide', timevar = 'Type', sep = "_")
id Level_AGE Estimate_AGE Level_REGION Estimate_REGION Level_DRIVERS Estimate_DRIVERS
1 1 18-25 1.5 London 2 1 2.0
2 2 26-70 1.0 Southampton 3 2 2.5
5 3 <NA> NA Newcastle 1 <NA> NA
In base R < 4.1:
reshape(transform(df_before, id = ave(Estimate, Type, FUN = seq_along)),
v.names = c('Level', 'Estimate'), dir = 'wide', timevar = 'Type', sep = "_")
Update:
The exact output as the desired output:
df_before %>%
group_by(Type) %>%
mutate(id = row_number()) %>%
pivot_wider(
names_from = Type,
values_from = c(Level, Estimate)
) %>%
select(AGE = Level_AGE, Estimate_AGE, REGION = Level_REGION,
Estimate_REGION, DRIVERS = Level_DRIVERS, Estimate_DRIVERS) %>%
type.convert(as.is=TRUE)
AGE Estimate_AGE REGION Estimate_REGION DRIVERS Estimate_DRIVERS
<chr> <dbl> <chr> <int> <int> <dbl>
1 18-25 1.5 London 2 1 2
2 26-70 1 Southampton 3 2 2.5
3 NA NA Newcastle 1 NA NA
First answer:
The main step is to group by Type, as already shown in Onyambu's solution. After that we can use a single pivot_wider:
library(dplyr)
library(tidyr)
df_before %>%
group_by(Type) %>%
mutate(id = row_number()) %>%
pivot_wider(
names_from = Type,
values_from = c(Level, Estimate)
)
id Level_AGE Level_REGION Level_DRIVERS Estimate_AGE Estimate_REGION Estimate_DRIVERS
<int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 1 18-25 London 1 1.5 2 2
2 2 26-70 Southampton 2 1 3 2.5
3 3 NA Newcastle NA NA 1 NA
We can try this:
library(tidyverse)
Type <- c("AGE", "AGE", "REGION", "REGION", "REGION", "DRIVERS", "DRIVERS")
Level <- c("18-25", "26-70", "London", "Southampton", "Newcastle", "1", "2")
Estimate <- c(1.5, 1, 2, 3, 1, 2, 2.5)
df_before <- data.frame(Type, Level, Estimate)
data <-
df_before %>% group_split(Type)
data <-
map2(
data, map(data, ~ unique(.$Type)),
~ mutate(., "{.y}" := Level, "Estimate_{.y}" := Estimate) %>%
select(-c("Type", "Level", "Estimate"))
)
# Find the largest group size so every piece can be padded to that height
# before the columns are joined.
max_rows <- map_dbl(data, nrow) %>%
  max()
# Pad each piece with NA rows up to max_rows. Extending every column with
# `length<-` adds as many rows as are missing, whereas rbind(., NA) adds only
# one and would leave bind_cols() with pieces of unequal height.
data %>%
  map(~ as_tibble(map(.x, `length<-`, max_rows))) %>%
  bind_cols()
#> # A tibble: 3 × 6
#> AGE Estimate_AGE DRIVERS Estimate_DRIVERS REGION Estimate_REGION
#> <chr> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 18-25 1.5 1 2 London 2
#> 2 26-70 1 2 2.5 Southampton 3
#> 3 <NA> NA <NA> NA Newcastle 1
Created on 2021-12-07 by the reprex package (v2.0.1)
A solution based on tidyr::pivot_wider and purrr::map_dfc:
library(tidyverse)
Type <- c("AGE", "AGE", "REGION", "REGION", "REGION", "DRIVERS", "DRIVERS")
Level <- c("18-25", "26-70", "London", "Southampton", "Newcastle", "1", "2")
Estimate <- c(1.5,1,2,3,1,2,2.5)
df_before <- data.frame(Type, Level, Estimate)
df_before %>%
pivot_wider(names_from=Type, values_from=c(Level, Estimate), values_fn=list) %>%
map_dfc(~ c(unlist(.x), rep(NA, max(table(df_before$Type))-length(unlist(.x)))))
#> # A tibble: 3 × 6
#> Level_AGE Level_REGION Level_DRIVERS Estimate_AGE Estimate_REGION
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 18-25 London 1 1.5 2
#> 2 26-70 Southampton 2 1 3
#> 3 <NA> Newcastle <NA> NA 1
#> # … with 1 more variable: Estimate_DRIVERS <dbl>
Another solution, based on dplyr:: group_split and purrr::map_dfc:
library(tidyverse)
# Pad each Type's Level and Estimate columns with NA up to the size of the
# largest group, then bind the padded pieces side by side.
df_before %>%
  # maxn is the row count of the largest Type, repeated on every row so it
  # is still available inside each split piece.
  mutate(maxn = max(table(.$Type))) %>%
  group_by(Type) %>%
  group_split() %>%
  map_dfc(
    ~ data.frame(
      c(.x$Level, rep(NA, .x$maxn[1] - nrow(.x))),
      c(.x$Estimate, rep(NA, .x$maxn[1] - nrow(.x)))
    ) %>%
      set_names(c(.x$Type[1], paste0("Estimate_", .x$Type[1])))
  ) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  type.convert(as.is = TRUE)
#> AGE Estimate_AGE DRIVERS Estimate_DRIVERS REGION Estimate_REGION
#> 1 18-25 1.5 1 2.0 London 2
#> 2 26-70 1.0 2 2.5 Southampton 3
#> 3 <NA> NA NA NA Newcastle 1
Related
This is an example dataframe
means2 <- as.data.frame(matrix(runif(n=25, min=1, max=20), nrow=5))
names(means2) <- c("B_T0|B_T0", "B_T0|B_T1", "B_T0|Fibro_T0", "B_T5|Endo_T5", "Macro_T1|Fibro_T1")
I have column names in my dataframe in R in this format
\S+_T\d+|\S+_T\d+
The syntax is something like (Name)_ (T)(Number) | (Name)_ (T)(Number)
Step 1) I want to select columns which contain the same (T)(Number) on both sides of the "|"
I did this with some manual labor :
means_t0 <- means2 %>% select(matches("\\S+_T0\\|\\S+_T0")) %>% rownames_to_column("id_cp_interaction")
means_t1 <- means2 %>% select(matches("\\S+_T1\\|\\S+_T1")) %>% rownames_to_column("id_cp_interaction")
means_t5 <- means2 %>% select(matches("\\S+_T5\\|\\S+_T5")) %>% rownames_to_column("id_cp_interaction")
means3 <- full_join(means_t0, means_t1) %>% full_join(means_t5)
This gives me what I want and it was easy to do because I only had 3 types - T0, T1 and T5. What do I do if I had a huge number?
Step 2) From the output of Step1, I want to do a negation of the last question i.e. select only those columns with Names which are not the same
For example B_T0|B_T0 should be removed but B_T0|Fibro_T0 should be retained
Is there a way to regex capture the part in front of the pipe(|) and match it to the part at the back of the pipe(|)
Thank you
If you have that much information in your column names, I like to transform the data into the long format and then separate the info from the column name into several columns. Then it's easy to filter by these columns:
means2 <- as.data.frame(matrix(runif(n=25, min=1, max=20), nrow=5))
names(means2) <- c("B_T0|B_T0", "B_T0|B_T1", "B_T0|Fibro_T0", "B_T5|Endo_T5", "Macro_T1|Fibro_T1")
means2 <- cbind(data.frame(id_cp_interaction = 1:5), means2)
library(tidyr)
library(dplyr)
library(stringr)
# Reshape to one row per (id, column name) and split each column name of the
# form "<celltype>_T<n>|<celltype>_T<n>" into its four components.
res <- means2 %>%
  pivot_longer(
    cols = -id_cp_interaction,
    names_to = "names",
    values_to = "values"
  ) %>%
  mutate(
    # Everything before the first underscore.
    celltype_1 = str_extract(names, "^[^_]*"),
    # Digits immediately before the literal "|". The pipe must be escaped;
    # an unescaped "|" inside the look-ahead is an empty alternation that
    # always matches. "+" keeps multi-digit time points such as T10 intact.
    timepoint_1 = str_extract(names, "[0-9]+(?=\\|)"),
    # Text between the "|" and the next underscore.
    celltype_2 = str_extract(names, "(?<=\\|)(.*?)(?=_)"),
    # Trailing digits of the name.
    timepoint_2 = str_extract(names, "[0-9]+$")
  )
head(res, n = 7)
#> # A tibble: 7 × 7
#> id_cp_interaction names values celltype_1 timepoint_1 celltype_2 timepoint_2
#> <int> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 B_T0|B… 1.68 B 0 B 0
#> 2 1 B_T0|B… 19.3 B 0 B 1
#> 3 1 B_T0|F… 10.6 B 0 Fibro 0
#> 4 1 B_T5|E… 12.5 B 5 Endo 5
#> 5 1 Macro_… 2.84 Macro 1 Fibro 1
#> 6 2 B_T0|B… 2.17 B 0 B 0
#> 7 2 B_T0|B… 10.1 B 0 B 1
# only keep interactions of different cell types
res %>%
filter(celltype_1 != celltype_2) %>%
head()
#> # A tibble: 6 × 7
#> id_cp_interaction names values celltype_1 timepoint_1 celltype_2 timepoint_2
#> <int> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 B_T0|F… 10.6 B 0 Fibro 0
#> 2 1 B_T5|E… 12.5 B 5 Endo 5
#> 3 1 Macro_… 2.84 Macro 1 Fibro 1
#> 4 2 B_T0|F… 1.47 B 0 Fibro 0
#> 5 2 B_T5|E… 11.3 B 5 Endo 5
#> 6 2 Macro_… 13.0 Macro 1 Fibro 1
Created on 2022-09-19 by the reprex package (v1.0.0)
I have over a hundred variables for which I'm trying to calculate frequency and percent. How can I maintain the factor order of each variables' values in the output? Please note that specifying the order for each variable outside the dataset is not practical as I have over 100 variables.
Example data:
df <- data.frame(gender=factor(c("male", "female", "male", NA), levels=c("male", "female")),
disease=factor(c("yes","yes","no", NA), levels=c("yes", "no")))
df
gender disease
1 male yes
2 female yes
3 male no
4 <NA> <NA>
Attempt:
df %>% gather(key, value, factor_key = T) %>%
group_by(key, value) %>%
summarise(n=n()) %>%
ungroup() %>%
group_by(key) %>%
mutate(percent=n/sum(n))
Output:
# A tibble: 6 x 4
# Groups: key [2]
key value n percent
<fct> <chr> <int> <dbl>
1 gender female 1 0.25
2 gender male 2 0.5
3 gender NA 1 0.25
4 disease no 1 0.25
5 disease yes 2 0.5
6 disease NA 1 0.25
Desired output would order gender as male, female and disease as yes, no.
Update: if you use pivot_longer (the new gather), it retains the factor levels! You can also fine-tune the column types with arguments names_transform and values_transform in pivot_longer.
library(tidyverse)
df <- data.frame(gender=factor(c("male", "female", "male", NA), levels=c("male", "female")),
disease=factor(c("yes","yes","no", NA), levels=c("yes", "no")))
df %>%
pivot_longer(everything()) %>%
group_by(name, value) %>%
summarise(n=n(), .groups = "drop_last") %>%
mutate(percent=n/sum(n))
#> # A tibble: 6 x 4
#> # Groups: name [2]
#> name value n percent
#> <chr> <fct> <int> <dbl>
#> 1 disease yes 2 0.5
#> 2 disease no 1 0.25
#> 3 disease <NA> 1 0.25
#> 4 gender male 2 0.5
#> 5 gender female 1 0.25
#> 6 gender <NA> 1 0.25
Created on 2020-10-16 by the reprex package (v0.3.0)
Because gather drops the factor for the value variable and summarise also appears to drop data frame attributes, you'll have to re-add them. You can re-add them in a semi-automated way by reading in and combining the factor levels like this:
library(tidyverse)
df <- data.frame(gender=factor(c("male", "female", "male", NA), levels=c("male", "female")),
disease=factor(c("yes","yes","no", NA), levels=c("yes", "no")))
# Count and percentage of each value per variable, with the original factor
# ordering restored on the value column after gather() has dropped it.
df %>%
  gather(key, value, factor_key = TRUE) %>%
  group_by(key, value) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(key) %>%
  mutate(
    percent = n / sum(n),
    # Pool the levels of every column of df. unique() is required because
    # factor() rejects duplicated levels, which occur as soon as two
    # variables share a level (e.g. several yes/no columns).
    value = factor(value, levels = unique(unlist(map(df, levels))))
  ) %>%
  arrange(key, value)
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
#> `summarise()` regrouping output by 'key' (override with `.groups` argument)
#> # A tibble: 6 x 4
#> # Groups: key [2]
#> key value n percent
#> <fct> <fct> <int> <dbl>
#> 1 gender male 2 0.5
#> 2 gender female 1 0.25
#> 3 gender <NA> 1 0.25
#> 4 disease yes 2 0.5
#> 5 disease no 1 0.25
#> 6 disease <NA> 1 0.25
Created on 2020-10-16 by the reprex package (v0.3.0)
I am trying to find a way to rename my factor levels (1, 2, 3) with girl, boy, other in the dplyr tibble output.
This is the code:
library(dplyr)
df1 %>%
dplyr::group_by(sex)%>%
dplyr::summarise(percent=100*n()/nrow(df1), n=n())
And my result is:
# A tibble: 3 x 3
sexs percent n
<int> <dbl> <int>
1 1 52.1 731
2 2 47.1 661
3 NA 0.855 12
The desired result would be:
# A tibble: 3 x 3
sexs percent n
<int> <dbl> <int>
Girl 1 52.1 731
Boy 2 47.1 661
Other NA 0.855 12
I happen to love the forcats package because when I am done I can actually see what I did. Here is another solution, which simply adds a step to the pipe before your existing code.
library(dplyr)
library(forcats)
sex <- sample(1:2, 100, replace = TRUE)
sex[[88]] <- NA
df1 <- data.frame(sex)
df1 %>%
mutate(newsex = fct_explicit_na(fct_recode(as_factor(sex),
Girl = "1",
Boy = "2" ),
na_level = "Other")) %>%
group_by(newsex, sex) %>%
summarise(percent = 100 * n() / nrow(df1), n=n())
#> # A tibble: 3 x 4
#> # Groups: newsex [3]
#> newsex sex percent n
#> <fct> <int> <dbl> <int>
#> 1 Girl 1 56 56
#> 2 Boy 2 43 43
#> 3 Other NA 1 1
Created on 2020-05-11 by the reprex package (v0.3.0)
When posting please provide some sample data to work with, it will help others test and make sure everything is working properly. This problem is relatively simple so it shouldn't be a problem.
If you want to replace the NA with literally any other number you can do this
df1 %>%
dplyr::mutate(sex = ifelse(is.na(sex), 0, sex),
sex = factor(sex,
levels = c(1,2,0),
labels = c("Girl", "Boy", "Other"))) %>%
dplyr::group_by(sex)%>%
dplyr::summarise(percent=100*n()/nrow(df1), n=n())
Otherwise you can use case_when to assign the factors and then convert the column to a factor
df1 %>%
dplyr::mutate(sex = case_when(
sex == 1 ~ "Girl",
sex == 2 ~ "Boy",
is.na(sex) ~ "Other") %>%
as_factor(.)) %>%
dplyr::group_by(sex)%>%
dplyr::summarise(percent=100*n()/nrow(df1), n=n())
I have some data with missing values (i.e. NA values); the simplified format is below (code for input at the end):
#> id x country
#> 1 1 2.0 USA
#> 2 2 4.0 USA
#> 3 3 3.5 JPN
#> 4 4 NA JPN
For each country, I'd like to take the mean of x, and a count of usable values of x (i.e. not NA), so I've used group_by, and it works for the mean:
df <- df %>% group_by(country) %>%
mutate(mean_x = mean(x, na.rm = TRUE),
#count_x = count(x))
)
df
#> # A tibble: 4 x 4
#> # Groups: country [2]
#> id x country mean_x
#> <dbl> <dbl> <fct> <dbl>
#> 1 1 2 USA 3
#> 2 2 4 USA 3
#> 3 3 3.5 JPN 3.5
#> 4 4 NA JPN 3.5
but when I try to add the count(), I'm getting an error
library(tidyverse)
df <- data.frame(id = c(1, 2, 3, 4),
x = c(2, 4, 3.5, NA),
country = c("USA", "USA", "JPN", "JPN")
)
df
# Reproduces the reported error: count() expects a data frame as its first
# argument but is handed the bare column x inside mutate().
# (A stray closing parenthesis was removed so the snippet parses.)
df <- df %>%
  group_by(country) %>%
  mutate(
    mean_x = mean(x, na.rm = TRUE),
    count_x = count(x)
  )
df
#> Error in UseMethod("summarise_") : no applicable method for 'summarise_' applied to an
#> object of class "c('double', 'numeric')"
My desired output would be:
#> id x country mean_x count
#> <dbl> <dbl> <fct> <dbl>
#> 1 1 2 USA 3 2
#> 2 2 4 USA 3 2
#> 3 3 3.5 JPN 3.5 1
#> 4 4 NA JPN 3.5 1
Reproducible code below:
library(tidyverse)
df <- data.frame(id = c(1, 2, 3, 4),
x = c(2, 4, 3.5, NA),
country = c("USA", "USA", "JPN", "JPN")
)
df
# Reproduces the reported error: count() expects a data frame as its first
# argument but is handed the bare column x inside mutate().
# (A stray closing parenthesis was removed so the snippet parses.)
df <- df %>%
  group_by(country) %>%
  mutate(
    mean_x = mean(x, na.rm = TRUE),
    count_x = count(x)
  )
df
count is not the right function here. The first argument to count is a dataframe or tibble specifically. However, what you are passing is a vector hence you get the error. Also count summarises the dataframe so that you have only one row per group. See for example,
library(dplyr)
df %>%
group_by(country) %>%
mutate(mean_x = mean(x, na.rm = TRUE)) %>%
count(country)
# country n
# <fct> <int>
#1 JPN 2
#2 USA 2
If you want to add a new column without summarising, use add_count instead
df %>%
group_by(country) %>%
mutate(mean_x = mean(x, na.rm = TRUE)) %>%
add_count(country)
# id x country mean_x n
# <dbl> <dbl> <fct> <dbl> <int>
#1 1 2 USA 3 2
#2 2 4 USA 3 2
#3 3 3.5 JPN 3.5 2
#4 4 NA JPN 3.5 2
However, neither of these functions does what you need. To count non-NA values per group, you need
df %>%
group_by(country) %>%
mutate(mean_x = mean(x, na.rm = TRUE),
count = length(na.omit(x)))
#OR
#count = sum(!is.na(x)))#as #Humpelstielzchen mentioned
# id x country mean_x count
# <dbl> <dbl> <fct> <dbl> <int>
#1 1 2 USA 3 2
#2 2 4 USA 3 2
#3 3 3.5 JPN 3.5 1
#4 4 NA JPN 3.5 1
We can also create the 'count' with a group_by n()
library(dplyr)
df %>%
group_by(country) %>%
mutate(mean_x = mean(x, na.rm = TRUE)) %>%
summarise(n = n())
# A tibble: 2 x 2
# country n
# <fct> <int>
#1 JPN 2
#2 USA 2
I'm trying to assess which unit in a pair is the "winner". group_by() %>% mutate() is close to the right thing, but it's not quite there. in particular
dat %>% group_by(pair) %>% mutate(winner = ifelse(score[1] > score[2], c(1, 0), c(0, 1))) doesn't work.
The below does, but is clunky with an intermediate summary data frame. Can we improve this?
library(tidyverse)
set.seed(343)
# units within pairs get scores
# Three pairs of two units each, with a random score per unit.
# tibble() replaces the deprecated data_frame() constructor.
dat <- tibble(
  pair = rep(1:3, each = 2),
  unit = rep(1:2, 3),
  score = rnorm(6)
)
# figure out who won in each pair
summary_df <-
dat %>%
group_by(pair) %>%
summarize(winner = which.max(score))
# merge back and determine whether each unit won
dat <-
left_join(dat, summary_df, "pair") %>%
mutate(won = as.numeric(winner == unit))
dat
#> # A tibble: 6 x 5
#> pair unit score winner won
#> <int> <int> <dbl> <int> <dbl>
#> 1 1 1 -1.40 2 0
#> 2 1 2 0.523 2 1
#> 3 2 1 0.142 1 1
#> 4 2 2 -0.847 1 0
#> 5 3 1 -0.412 1 1
#> 6 3 2 -1.47 1 0
Created on 2018-09-26 by the reprex
package (v0.2.0).
maybe related to Weird group_by + mutate + which.max behavior
You could do:
# Flag the top scorer within each pair and record which unit that was.
dat %>%
  group_by(pair) %>%
  mutate(
    won = score == max(score),
    # which.max() selects exactly one unit per pair, so a tie cannot yield
    # more than one winner.
    winner = unit[which.max(score)]
  )
# A tibble: 6 x 5
# Groups: pair [3]
pair unit score won winner
<int> <int> <dbl> <lgl> <int>
1 1 1 -1.40 FALSE 2
2 1 2 0.523 TRUE 2
3 2 1 0.142 TRUE 1
4 2 2 -0.847 FALSE 1
5 3 1 -0.412 TRUE 1
6 3 2 -1.47 FALSE 1
Using rank:
dat %>% group_by(pair) %>% mutate(won = rank(score) - 1)
More for fun (and slightly faster), using the outcome of the comparison (score[1] > score[2]) to index a vector with 'won alternatives' :
dat %>% group_by(pair) %>%
mutate(won = c(0, 1, 0)[1:2 + (score[1] > score[2])])