Rearranging a dataframe in R

I have an excel file that looks like this:
ID  strength_score_week_1  agility_score_week_1  strength_score_week_2  agility_score_week_2
1   3                      6                     4                      6
2   5                      6                     6                      6
3   8                      8                     9                      8
4   6                      7                     6                      4
I want to rearrange/rewrite the data above into a data frame with this format:
Week  training type  mean score
1     agility
1     strength
2     agility
2     strength
Essentially, with the final table I want to group by training type and plot two line graphs showing the mean score for agility and strength over a period of 40 weeks.
Any help would be very much appreciated!

df <- data.frame(
  ID = c(1L, 2L, 3L, 4L),
  strength_score_week_1 = c(3L, 5L, 8L, 6L),
  agility_score_week_1 = c(6L, 6L, 8L, 7L),
  strength_score_week_2 = c(4L, 6L, 9L, 6L),
  agility_score_week_2 = c(6L, 6L, 8L, 4L)
)
df
#> ID strength_score_week_1 agility_score_week_1 strength_score_week_2
#> 1 1 3 6 4
#> 2 2 5 6 6
#> 3 3 8 8 9
#> 4 4 6 7 6
#> agility_score_week_2
#> 1 6
#> 2 6
#> 3 8
#> 4 4
library(tidyverse)
df %>%
  pivot_longer(!ID, names_pattern = '([^_]*)_score_week_(.*)', names_to = c('training_type', 'week')) %>%
  group_by(week, training_type) %>%
  summarise(mean_score = mean(value), .groups = 'drop') %>%
  mutate(week = as.numeric(week)) %>%
  ggplot(aes(x = week, y = mean_score, color = training_type, group = training_type)) +
  geom_line()
Created on 2021-07-22 by the reprex package (v2.0.0)

Try this
library(readxl) # library to import excel sheets
df <- t(read_excel('Book1.xlsx')[, -1]) # import data (drop the ID column) and transpose
df_mean <- rowMeans(df) # calculate mean score per variable/week
# build an auxiliary matrix holding the training type and week taken from the row names
aux <- matrix(unlist(strsplit(rownames(df), '_')), nrow = nrow(df), byrow = TRUE)[, c(1, 4)]
colnames(aux) <- c('feature', 'week')
# join everything in a data frame
df <- as.data.frame(cbind(df_mean, aux))
# plot
library(ggplot2)
ggplot(df) +
  geom_point(aes(x = week, y = df_mean, colour = factor(feature)))
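Note that cbind() with the character matrix aux coerces df_mean (and week) to character, or to factor on older R versions, so the plot treats both as discrete. If you want the line graphs over weeks described in the question, a minimal follow-up sketch, continuing from the df built above, might be:
# convert back to numeric so ggplot can draw continuous lines over the weeks
df$df_mean <- as.numeric(as.character(df$df_mean))
df$week <- as.numeric(as.character(df$week))
ggplot(df, aes(x = week, y = df_mean, colour = feature, group = feature)) +
  geom_line()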

library(dplyr)
library(tibble)
library(stringr)
# dt is the original wide data frame from the question (the same data as df above)
dt <- as.data.frame(t(dt))[-1, ] # transpose and drop the ID row
dt %>%
  rownames_to_column() %>%
  rowwise() %>%
  mutate(`training type` = str_split(rowname, "_")[[1]][1],
         week = str_split(rowname, "_")[[1]][4]) %>%
  ungroup() %>%
  mutate(`mean score` = rowMeans(.[, 2:5])) %>%
  select(week, `training type`, `mean score`)
Which results in:
# A tibble: 4 x 3
week `training type` `mean score`
<chr> <chr> <dbl>
1 1 strength 5.5
2 1 agility 6.75
3 2 strength 6.25
4 2 agility 6
If you have training types that contain multiple words, you would need to use a different function instead of str_split. If that is the case I can re-write that part of the code; a sketch of one possible replacement follows.
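For example, assuming the row names always end in _score_week_<number>, a single regular expression via str_match() keeps multi-word training types intact and can replace the two str_split() lines (the rowwise()/ungroup() steps are then unnecessary):
# drop-in replacement for the str_split() mutate in the pipeline above
mutate(`training type` = str_match(rowname, "^(.*)_score_week_(\\d+)$")[, 2],
       week = str_match(rowname, "^(.*)_score_week_(\\d+)$")[, 3]) %>%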

A base R option
do.call(
rbind,
apply(
aggregate(
cbind(strength, agility) ~ time,
reshape(
setNames(df, gsub("_score_", ".", names(df))),
direction = "long",
idvar = "ID",
varying = -1
), mean
), 1, function(x) cbind(week = x[[1]], rev(stack(x[-1])))
)
)
gives
week ind values
1 week_1 strength 5.50
2 week_1 agility 6.75
3 week_2 strength 6.25
4 week_2 agility 6.00
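To make the nested call easier to follow, here is a small sketch (run on the df defined at the top) of what the inner reshape() step alone produces before aggregation: one row per ID and week, with separate strength and agility columns.
# rename e.g. strength_score_week_1 to strength.week_1 so reshape() can split on "."
long <- reshape(
  setNames(df, gsub("_score_", ".", names(df))),
  direction = "long",
  idvar = "ID",
  varying = -1
)
long # columns: ID, time (week_1 / week_2), strength, agility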

I would use a mix of pivot_longer, separate and mutate in this fashion:
data %>%
  pivot_longer(cols = -"ID", names_to = "training_type") %>%
  mutate(training_type = str_remove(training_type, "_score")) %>%
  group_by(training_type) %>%
  summarise(mean_score = mean(value, na.rm = TRUE)) %>%
  separate(
    col = "training_type",
    sep = "_week_",
    into = c("training_type", "week")
  ) %>%
  mutate(week = as.numeric(week))
Which gives you the following output,
# A tibble: 4 x 3
training_type week mean_score
<chr> <dbl> <dbl>
1 agility 1 6.75
2 agility 2 6
3 strength 1 5.5
4 strength 2 6.25
This result is ready to be plotted. Note that it is the summarised data frame above, not the original wide data, that should be piped into ggplot; assigning it to an object (here called mean_scores, a name chosen for illustration) makes that explicit:
mean_scores %>% ggplot(
  mapping = aes(
    x = week,
    y = mean_score,
    color = training_type
  )
) + geom_line()

Related

Calculate the sum of cells in each column when they have the same first 4 letters in R

I have a table with rows named (fam1A, fam1B, fam1D, fam1U, fam2A, fam2B, fam3A, fam3D, etc.).
Now I want to add the sum of (fam1A + fam1B + fam1U) to fam1D (same for fam2, fam3, ...), and at the same time I want to add half (1/2) of fam1U plus half of fam1D to fam1A and to fam1B.
So at the end I will have a table in which famnD is (famnA + famnB + famnU + famnD),
famnA is (famnA + (1/2)famnD + (1/2)famnU), and famnB is (famnB + (1/2)famnD + (1/2)famnU).
rownames  ssss1  ssss2
fam1A     2      5
fam1B     3      8
fam1D     7      3
fam1U     2      5
fam2U     2      4
fam2D     2      5
fam4B     2      5
The desired output is not entirely clear, but this answer does the calculations separately for the ssss1 and ssss2 columns, and if the rows needed for a calculation aren't present (e.g., fam4B can't become fam4B + 1/2 fam4D + 1/2 fam4U because there are no fam4D or fam4U rows), they are omitted. If you want the missing values to be treated as 0, add values_fill = 0 to the first pivot_wider() (a sketch of that variant follows the output below).
library(dplyr)
library(tidyr)
df <- tribble(
  ~rownames, ~ssss1, ~ssss2,
  "fam1A", 2L, 5L,
  "fam1B", 3L, 8L,
  "fam1D", 7L, 3L,
  "fam1U", 2L, 5L,
  "fam2U", 2L, 4L,
  "fam2D", 2L, 5L,
  "fam4B", 2L, 5L
)
df %>%
  separate(rownames, into = c("fam", "number", "letter"),
           sep = c(-2, -1), convert = TRUE) %>%
  pivot_wider(names_from = letter, values_from = c(ssss1, ssss2)) %>%
  pivot_longer(contains("ssss"), names_sep = "_", names_to = c("ssss", ".value")) %>%
  mutate(D_new = A + B + U + D,
         A_new = A + (1/2)*D + (1/2)*U,
         B_new = B + (1/2)*D + (1/2)*U,
         D = D_new,
         A = A_new,
         B = B_new) %>%
  select(-contains("new")) %>%
  pivot_wider(names_from = ssss, values_from = c(A, B, D, U)) %>%
  pivot_longer(cols = -c(fam, number), names_sep = "_", names_to = c("letter", ".value"),
               values_drop_na = TRUE) %>%
  unite(rownames, fam, number, letter, sep = "")
#> # A tibble: 5 × 3
#> rownames ssss1 ssss2
#> <chr> <dbl> <dbl>
#> 1 fam1A 6.5 9
#> 2 fam1B 7.5 12
#> 3 fam1D 14 21
#> 4 fam1U 2 5
#> 5 fam2U 2 4
Created on 2022-05-26 by the reprex package (v2.0.1)
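As mentioned above, treating the absent family members (here fam4D and fam4U) as 0 instead of dropping those rows only requires changing the first pivot_wider(); a minimal sketch of that one call:
# drop-in replacement for the first pivot_wider() in the pipeline above
pivot_wider(names_from = letter, values_from = c(ssss1, ssss2), values_fill = 0)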

Extract data based on time to death

Hi, I'm analysing the pattern of spending of individuals before they died. My dataset contains individuals' monthly spending and their dates of death. The dataset looks similar to this:
ID 2018_11 2018_12 2019_01 2019_02 2019_03 2019_04 2019_05 2019_06 2019_07 2019_08 2019_09 2019_10 2019_11 2019_12 2020_01 date_of_death
A 15 14 6 23 23 5 6 30 1 15 6 7 8 30 1 2020-01-02
B 2 5 6 7 7 8 9 15 12 14 31 30 31 0 0 2019-11-15
Each column denotes the month of the year. For example, "2018_11" means November 2018. The number in each cell denotes the spending in that specific month.
I would like to construct a data frame which contains the spending data of each individual in their last 0-12 months. It will look like this:
ID last_12_month last_11_month ...... last_1_month last_0_month date_of_death
A 6 23 30 1 2020-01-02
B 2 5 30 31 2019-11-15
Each individual died at a different time. For example, individual A died on 2020-01-02, so this person's "last_0_month" value should come from the column "2020_01" and their "last_12_month" value from "2019_01"; individual B died on 2019-11-15, so their "last_0_month" value should come from the column "2019_11" and their "last_12_month" value from "2018_11".
I will be really grateful for your help.
Using data.table and lubridate packages
library(data.table)
library(lubridate)
setDT(dt)
dt <- melt(dt, id.vars = c("ID", "date_of_death"))
dt[, since_death := interval(ym(variable), ymd(date_of_death)) %/% months(1)]
dt <- dcast(dt[since_death %between% c(0, 12)], ID + date_of_death ~ since_death, value.var = "value", fun.aggregate = sum)
setcolorder(dt, c("ID", "date_of_death", rev(names(dt)[3:15])))
setnames(dt, old = names(dt)[3:15], new = paste("last", names(dt)[3:15], "month", sep = "_"))
Results
dt
# ID date_of_death last_12_month last_11_month last_10_month last_9_month last_8_month last_7_month last_6_month last_5_month last_4_month last_3_month
# 1: A 2020-01-02 6 23 23 5 6 30 1 15 6 7
# 2: B 2019-11-15 2 5 6 7 7 8 9 15 12 14
# last_2_month last_1_month last_0_month
# 1: 8 30 1
# 2: 31 30 31
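As a quick sanity check of the since_death step (a small sketch reusing the same lubridate helpers), the 2019_01 column should sit 12 whole months before individual A's date of death:
library(lubridate)
# whole months between the start of 2019_01 and the death date 2020-01-02
interval(ym("2019_01"), ymd("2020-01-02")) %/% months(1)
# [1] 12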
Data
dt <- structure(list(ID = c("A", "B"), `2018_11` = c(15L, 2L), `2018_12` = c(14L,
5L), `2019_01` = c(6L, 6L), `2019_02` = c(23L, 7L), `2019_03` = c(23L,
7L), `2019_04` = c(5L, 8L), `2019_05` = c(6L, 9L), `2019_06` = c(30L,
15L), `2019_07` = c(1L, 12L), `2019_08` = 15:14, `2019_09` = c(6L,
31L), `2019_10` = c(7L, 30L), `2019_11` = c(8L, 31L), `2019_12` = c(30L,
0L), `2020_01` = 1:0, date_of_death = structure(c(18263L, 18215L
), class = c("IDate", "Date"))), row.names = c(NA, -2L), class = c("data.frame"))
Here you can find a similar approach to the one presented by @RuiBarradas, but using lubridate to extract the difference in months:
library(dplyr)
library(tidyr)
library(lubridate)
# Initial data
df <- structure(list(
ID = c("A", "B"),
`2018_11` = c(15, 2),
`2018_12` = c(14, 5),
`2019_01` = c(6, 6),
`2019_02` = c(23, 7),
`2019_03` = c(23, 7),
`2019_04` = c(5, 8),
`2019_05` = c(6, 9),
`2019_06` = c(30, 15),
`2019_07` = c(1, 12),
`2019_08` = c(15, 14),
`2019_09` = c(6, 31),
`2019_10` = c(7, 30),
`2019_11` = c(8, 31),
`2019_12` = c(30, 0),
`2020_01` = c(1, 0),
date_of_death = c("2020-01-02", "2019-11-15")
),
row.names = c(NA, -2L),
class = "data.frame"
)
# Convert to longer all cols that start with 20 (e.g. 2020, 2021)
df_long <- df %>%
  pivot_longer(starts_with("20"), names_to = "month")
# treatment
df_long <- df_long %>%
  mutate(
    # To date, just in case
    date_of_death = as.Date(date_of_death),
    # Need to reformat the colnames from (e.g.) 2021_01 to 2021-01-01
    month_fmt = as.Date(paste0(gsub("_", "-", df_long$month), "-01")),
    # End of month
    month_fmt = ceiling_date(month_fmt, "month") - days(1),
    # End of month for month of death
    date_of_death_eom = ceiling_date(date_of_death, "month") - days(1),
    # Difference in months (using end of months)
    month_diff = round(time_length(
      interval(month_fmt, date_of_death_eom), "month"), 0)) %>%
  # Select only months between 0 and 12
  filter(month_diff %in% 0:12) %>%
  # Create labels for the next step
  mutate(labs = paste0("last_", month_diff, "_month"))
# To wider
end <- df_long %>%
  pivot_wider(
    id_cols = c(ID, date_of_death),
    names_from = labs,
    values_from = value
  )
end
end
#> # A tibble: 2 x 15
#> ID date_of_death last_12_month last_11_month last_10_month last_9_month
#> <chr> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2020-01-02 6 23 23 5
#> 2 B 2019-11-15 2 5 6 7
#> # ... with 9 more variables: last_8_month <dbl>, last_7_month <dbl>,
#> # last_6_month <dbl>, last_5_month <dbl>, last_4_month <dbl>,
#> # last_3_month <dbl>, last_2_month <dbl>, last_1_month <dbl>,
#> # last_0_month <dbl>
Created on 2022-03-09 by the reprex package (v2.0.1)
Here is a tidyverse solution.
Reshape the data to long format, coerce the date columns to class "Date", use Dirk Eddelbuettel's accepted answer to this question to compute the date differences in months and keep the rows with month differences between 0 and 12.
This grouped long format is probably more useful: I compute means by group and plot the spending of the last 12 months prior to death. But since the question asks for a wide format, the output data set spending12_wide is also created.
options(width=205)
df1 <- read.table(text = "
ID 2018_11 2018_12 2019_01 2019_02 2019_03 2019_04 2019_05 2019_06 2019_07 2019_08 2019_09 2019_10 2019_11 2019_12 2020_01 date_of_death
A 15 14 6 23 23 5 6 30 1 15 6 7 8 30 1 2020-01-02
B 2 5 6 7 7 8 9 15 12 14 31 30 31 0 0 2019-11-15
", header = TRUE, check.names = FALSE)
suppressPackageStartupMessages(library(dplyr))
library(tidyr)
library(ggplot2)
# Dirk's functions
monnb <- function(d) {
  lt <- as.POSIXlt(as.Date(d, origin = "1900-01-01"))
  lt$year * 12 + lt$mon
}
# compute a month difference as a difference between two monnb's
diffmon <- function(d1, d2) { monnb(d2) - monnb(d1) }
spending12 <- df1 %>%
  pivot_longer(cols = starts_with('20'), names_to = "month") %>%
  mutate(month = as.Date(paste0(month, "_01"), "%Y_%m_%d"),
         date_of_death = as.Date(date_of_death)) %>%
  group_by(ID, date_of_death) %>%
  mutate(diffm = diffmon(month, date_of_death)) %>%
  filter(diffm >= 0 & diffm <= 12)
spending12 %>% summarise(spending = mean(value), .groups = "drop")
#> # A tibble: 2 x 3
#> ID date_of_death spending
#> <chr> <date> <dbl>
#> 1 A 2020-01-02 12.4
#> 2 B 2019-11-15 13.6
spending12_wide <- spending12 %>%
  mutate(month = zoo::as.yearmon(month)) %>%
  pivot_wider(
    id_cols = c(ID, date_of_death),
    names_from = diffm,
    names_glue = "last_{.name}_month",
    values_from = value
  )
spending12_wide
#> # A tibble: 2 x 15
#> # Groups: ID, date_of_death [2]
#> ID date_of_death last_12_month last_11_month last_10_month last_9_month last_8_month last_7_month last_6_month last_5_month last_4_month last_3_month last_2_month last_1_month last_0_month
#> <chr> <date> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 A 2020-01-02 6 23 23 5 6 30 1 15 6 7 8 30 1
#> 2 B 2019-11-15 2 5 6 7 7 8 9 15 12 14 31 30 31
ggplot(spending12, aes(month, value, color = ID)) +
geom_line() +
geom_point()
Created on 2022-03-09 by the reprex package (v2.0.1)

group categories according to whether they meet a sequence of conditions with tidyverse

I have a problem with how to recategorize a variable according to whether it meets a certain condition or not. That is, if a category does not meet the criteria, it is assigned to another category that does.
My data has the following form:
data = data.frame(firm_size = c("Micro", "Small", "Medium","Big"),
employees = c(5,10,100,1000))
> data
firm_size employees
1 Micro 5
2 Small 10
3 Medium 100
4 Big 1000
So, if my condition is that I must group the companies that have fewer than 10 employees and then combine them with the next category that does meet the criteria, the result would be:
> new_data
firm_size employees
1 Micro-Small 15
3 Medium 100
4 Big 1000
What I'm trying to do is write a function that generalizes this procedure, for example, that also works if my data is
> data
firm_size employees
1 Micro 5
2 Small 8
3 Medium 9
4 Big 1000
> new_data
firm_size employees
1 Micro-Small-Medium 22
4 Big 1000
I think that this can be done with the tools of the tidyverse.
Thanks in advance
Here's an approach with tally:
library(dplyr)
size <- 10
data %>%
  arrange(firm_size, desc(employees)) %>%
  group_by(firm_size = c(as.character(firm_size[employees > size]),
                         rep(paste(firm_size[employees <= size], collapse = "-"),
                             sum(employees <= size)))) %>%
  tally(employees, name = "employees")
## A tibble: 3 x 2
# firm_size employees
# <chr> <dbl>
#1 Big 1000
#2 Medium 100
#3 Small-Micro 15
And for your second set of data:
data2 %>%
  arrange(firm_size, desc(employees)) %>%
  group_by(firm_size = c(as.character(firm_size[employees > size]),
                         rep(paste(firm_size[employees <= size], collapse = "-"),
                             sum(employees <= size)))) %>%
  tally(employees, name = "employees")
## A tibble: 2 x 2
# firm_size employees
# <chr> <int>
#1 Big 1000
#2 Medium-Small-Micro 22
Data
data <- structure(list(firm_size = structure(c(3L, 4L, 2L, 1L), .Label = c("Big",
"Medium", "Micro", "Small"), class = "factor"), employees = c(5,
10, 100, 1000)), class = "data.frame", row.names = c(NA, -4L))
data2 <- structure(list(firm_size = structure(c(3L, 4L, 2L, 1L), .Label = c("Big",
"Medium", "Micro", "Small"), class = "factor"), employees = c(5L,
8L, 9L, 1000L)), class = "data.frame", row.names = c("1", "2",
"3", "4"))
You can use the great forcats package
library(tidyverse)
data <- data.frame(
  firm_size = c("Micro", "Small", "Medium", "Big", "Small"),
  employees = c(5, 10, 100, 1000, 10)
)
# If you need n groups
data %>%
  mutate(firm_size2 = firm_size %>% as_factor() %>% fct_lump(n = 2, w = employees)) %>%
  group_by(firm_size2) %>%
  summarise(sum_emp = sum(employees), .groups = "drop")
#> # A tibble: 3 x 2
#> firm_size2 sum_emp
#> <fct> <dbl>
#> 1 Medium 100
#> 2 Big 1000
#> 3 Other 25
# If you need at least x on the sum of a vector
data %>%
  mutate(firm_size2 = firm_size %>% as_factor() %>% fct_lump_min(min = 10, w = employees)) %>%
  group_by(firm_size2) %>%
  summarise(sum_emp = sum(employees), .groups = "drop")
#> # A tibble: 4 x 2
#> firm_size2 sum_emp
#> <fct> <dbl>
#> 1 Small 20
#> 2 Medium 100
#> 3 Big 1000
#> 4 Other 5
Created on 2020-06-11 by the reprex package (v0.3.0)
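Note that fct_lump() and fct_lump_min() label the collapsed level "Other" rather than the concatenated names shown in the question (e.g. "Micro-Small"). A minimal sketch of one way to rebuild that label after lumping; with this particular data only Micro is lumped, but with several small levels the pasted label would list all of them:
data %>%
  mutate(firm_size2 = firm_size %>% as_factor() %>% fct_lump_min(min = 10, w = employees)) %>%
  group_by(firm_size2) %>%
  summarise(label = paste(unique(firm_size), collapse = "-"),
            sum_emp = sum(employees), .groups = "drop")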
Yet another solution, set into a custom function:
library(tidyverse)
mymerge <- function(dat, min) {
  merged_dat <- dat %>%
    filter(if_else(employees <= min, TRUE, FALSE)) %>%
    summarize(firm_size = str_flatten(firm_size, collapse = " - "),
              employees = sum(employees))
  dat %>%
    filter(if_else(employees <= min, FALSE, TRUE)) %>%
    bind_rows(merged_dat)
}
mymerge(data, 30)
firm_size employees
1 Medium 100
2 Big 1000
3 Micro - Small 15
mymerge(data, 300)
firm_size employees
1 Big 1000
2 Micro - Small - Medium 115

Creating intervals for a frequency table in R

I have a dataframe I've created in the form
FREQ CNT
0 5
1 20
2 1000
3 3
4 3
I want to further group my results to be in the following form:
CUT CNT
0+1 25
2+3 1003
4+5 ...
.....
I've tried using the between and cut functions in dplyr, but they just add a new interval column to my dataframe. Can anyone give me a good indication as to where to go to achieve this?
Here is a way to do it in dplyr:
library(dplyr)
df <- df %>%
  mutate(id = 1:n()) %>%
  mutate(new_freq = ifelse(id %% 2 != 0,
                           paste0(FREQ, "+", lead(FREQ, 1)),
                           paste0(lag(FREQ, 1), "+", FREQ)))
df <- df %>%
  group_by(new_freq) %>%
  mutate(new_cnt = sum(CNT))
unique(df[, 4:5])
# A tibble: 2 x 2
# Groups: new_freq [2]
# new_freq new_cnt
# <chr> <int>
#1 0+1 25
#2 2+3 1003
data
df <- structure(list(FREQ = 0:3, CNT = c(5L, 20L, 1000L, 3L)), class = "data.frame", row.names = c(NA, -4L))
A non-elegant solution using dplyr... there is probably a better way to do this.
dat <- data.frame(FREQ = c(0,1,2,3,4), CNT = c(5,20,1000, 3, 3))
dat2 <- dat %>%
  mutate(index = 0:(nrow(dat) - 1) %/% 2) %>%
  group_by(index)
dat2 %>%
  summarise(new_CNT = sum(CNT)) %>%
  left_join(dat2 %>%
              mutate(CUT = paste0(FREQ[1], "+", FREQ[2])) %>%
              distinct(index, CUT),
            by = "index") %>%
  select(-index)
# A tibble: 3 x 2
new_CNT CUT
<dbl> <chr>
1 25 0+1
2 1003 2+3
3 3 4+NA
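Since the question mentions cut(), a minimal sketch of completing that route (assuming FREQ holds consecutive integers starting at 0): cut FREQ into width-2 bins, then group by the resulting interval and sum CNT. The labels come out as intervals such as [0,2) rather than 0+1.
dat %>%
  mutate(CUT = cut(FREQ, breaks = seq(0, max(FREQ) + 2, by = 2), right = FALSE)) %>%
  group_by(CUT) %>%
  summarise(CNT = sum(CNT))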

Pmax of columns ending with a given string

I would like to conditionally mutate a new column representing the pmax() of columns ending with "_n" for a given row. I know I can do this by explicitly specifying the column names, but I would prefer to have this be the result of a call to ends_with() or similar.
I have tried mutate_at() and plain mutate(). My general thought is that I need to pass a vars(ends_with("_n")) to something, but I'm just missing that something.
Thanks in advance.
library(dplyr)
library(tidyr)
mtcars %>%
  group_by(vs, gear) %>%
  summarize(mean = mean(disp),
            sd = sd(disp),
            n = n()) %>%
  mutate_if(is.double, round, 1) %>%
  mutate(mean_sd = paste0(mean, " (", sd, ")")) %>%
  select(-mean, -sd) %>%
  group_by(vs, gear) %>%
  nest(n, mean_sd, .key = "summary") %>%
  spread(key = vs, value = summary) %>%
  unnest(`0`, `1`, .sep = "_")
gear `0_n` `0_mean_sd` `1_n` `1_mean_sd`
<dbl> <int> <chr> <int> <chr>
1 3 12 357.6 (71.8) 3 201 (72)
2 4 2 160 (0) 10 115.6 (38.5)
3 5 4 229.3 (113.9) 1 95.1 (NA)
edit: both answers are much appreciated. Cheers!
Here's one way using the unquote-splice operator. We can select columns that we want to compare and then splice them as vectors into pmax:
library(tidyverse)
tbl <- structure(list(gear = c(3, 4, 5), `0_n` = c(12L, 2L, 4L), `0_mean_sd` = c("357.6 (71.8)", "160 (0)", "229.3 (113.9)"), `1_n` = c(3L, 10L, 1L), `1_mean_sd` = c("201 (72)", "115.6 (38.5)", "95.1 (NA)")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
tbl %>%
mutate(pmax = pmax(!!!select(., ends_with("_n"))))
#> # A tibble: 3 x 6
#> gear `0_n` `0_mean_sd` `1_n` `1_mean_sd` pmax
#> <dbl> <int> <chr> <int> <chr> <int>
#> 1 3 12 357.6 (71.8) 3 201 (72) 12
#> 2 4 2 160 (0) 10 115.6 (38.5) 10
#> 3 5 4 229.3 (113.9) 1 95.1 (NA) 4
Created on 2019-04-23 by the reprex package (v0.2.1)
A base R version, just as an alternative:
tbl <- structure(list(gear = c(3, 4, 5), `0_n` = c(12L, 2L, 4L), `0_mean_sd` = c("357.6 (71.8)", "160 (0)", "229.3 (113.9)"), `1_n` = c(3L, 10L, 1L), `1_mean_sd` = c("201 (72)", "115.6 (38.5)", "95.1 (NA)")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
tbl$pmax <- do.call(pmax, as.list(tbl[, grepl("_n$", names(tbl))]))
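With dplyr 1.0 or later, a rowwise()/c_across() variant is another option; a short sketch that should give the same row-wise maximum over the "_n" columns:
tbl %>%
  rowwise() %>%
  mutate(pmax = max(c_across(ends_with("_n")))) %>%
  ungroup()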
