How to use regex with names_pattern from pivot_longer - r

I have df like this
ID <- c("A01","B20","C3","D4")
Nb_data <- c(2,2,2,3)
Weight_t1 <- c(70,44,98,65)
Weight_t2 <- c(75,78,105,68)
Weight_t3 <- c(72,52,107,NA)
year1 <- c(20,28,32,50)
year2 <- c(28,32,35,60)
year3 <- c(29,35,38,NA)
LENGTHt1 <- c(175,155,198,165)
LENGTHt2 <- c(175,155,198,163)
LENGTHt3 <- c(176,154,198,NA)
df <- data.frame(ID,Nb_data,Weight_t1,Weight_t2,Weight_t3,year1,year2,year3,LENGTHt1,LENGTHt2,LENGTHt3)
weight/year and length : t1 to t28
I want to tidy my data like :
ID
Nb_data
Weigth
Year
Length
A01
3
70
20
175
A01
3
75
28
175
A01
3
72
29
176
B20
3
44
28
155
B20
3
78
32
155
B20
3
52
35
154
I try
df1 <- df %>%
pivot_longer(cols = -c('ID','Nb_data'),
names_to = c('Weight','Year','Length' ),
names_pattern = '(Weight_t[0-9]*|year[0-9]*|LENGTHt[0-9]*)' ,
values_drop_na = TRUE)
or names_pattern = '(.t[0-9])(.t[0-9])(.t[0-9])'
I have some difficulties to use regex or maybe pivot_longer are not suitable...

You need to extract the common timepoint information from the variable names. Make this information consistent first, with a clear separator (_ in this case), then it becomes much easier.
I would do something like this
library(tidyr)
library(dplyr)
df1 <- df
names(df1) <- gsub("year", "Year_t", names(df1))
names(df1) <- gsub("LENGTH", "Length_", names(df1))
df1 %>%
pivot_longer(cols = -c('ID','Nb_data'),
names_to = c("name", "timepoint"),
names_sep = "_",
values_drop_na = TRUE) %>%
pivot_wider(names_from = name, values_from = value)
EDIT: or shorter, using ".value" in the names_to argument (as #onyambu showed in his answer):
df1 %>%
pivot_longer(cols = -c('ID','Nb_data'),
names_to = c(".value", "timepoint"),
names_sep = "_",
values_drop_na = TRUE)
Output:
ID Nb_data timepoint Weight Year Length
<chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 A01 2 t1 70 20 175
2 A01 2 t2 75 28 175
3 A01 2 t3 72 29 176
4 B20 2 t1 44 28 155
5 B20 2 t2 78 32 155
6 B20 2 t3 52 35 154
7 C3 2 t1 98 32 198
8 C3 2 t2 105 35 198
9 C3 2 t3 107 38 198
10 D4 3 t1 65 50 165
11 D4 3 t2 68 60 163

You could directly use pivot_longer though with abit of complex regex as follows
df %>%
pivot_longer(matches("\\d+$"), names_to = c(".value", "grp"),
names_pattern = "(.*?)[_t]{0,2}(\\d+$)",
values_drop_na = TRUE)
# A tibble: 11 × 6
ID Nb_data grp Weight year LENGTH
<chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 A01 2 1 70 20 175
2 A01 2 2 75 28 175
3 A01 2 3 72 29 176
4 B20 2 1 44 28 155
5 B20 2 2 78 32 155
6 B20 2 3 52 35 154
7 C3 2 1 98 32 198
8 C3 2 2 105 35 198
9 C3 2 3 107 38 198
10 D4 3 1 65 50 165
11 D4 3 2 68 60 163

Related

Rounded averages by group that sum to the same as the group total

I have data that looks like this:
library(dplyr)
Data <- tibble(
ID = c("Code001", "Code001","Code001","Code002","Code002","Code002","Code002","Code002","Code003","Code003","Code003","Code003"),
Value = c(107,107,107,346,346,346,346,346,123,123,123,123))
I need to work out the average value per group per row. However, the value needs to be rounded (so no decimal places) and the group sum needs to equal the group sum of Value.
So solutions like this won't work:
Data %>%
add_count(ID) %>%
group_by(ID) %>%
mutate(Prop_Value_1 = Value/n,
Prop_Value_2 = round(Value/n))
Is there a solution that can produce an output like this:
Data %>%
mutate(Prop_Value = c(35,36,36,69,69,69,69,70,30,31,31,31))
Can use ceiling and then row_number to get there:
Data %>%
group_by(ID) %>%
mutate(count = n(),
ceil_avg = ceiling(Value/count)) %>%
mutate(sum_ceil_avg = sum(ceil_avg),
diff_sum = sum_ceil_avg - Value,
rn = row_number()) %>%
mutate(new_avg = ifelse(rn <= diff_sum,
ceil_avg - 1,
ceil_avg))
# A tibble: 12 × 8
# Groups: ID [3]
ID Value count ceil_avg sum_ceil_avg diff_sum rn new_avg
<chr> <dbl> <int> <dbl> <dbl> <dbl> <int> <dbl>
1 Code001 107 3 36 108 1 1 35
2 Code001 107 3 36 108 1 2 36
3 Code001 107 3 36 108 1 3 36
4 Code002 346 5 70 350 4 1 69
5 Code002 346 5 70 350 4 2 69
6 Code002 346 5 70 350 4 3 69
7 Code002 346 5 70 350 4 4 69
8 Code002 346 5 70 350 4 5 70
9 Code003 123 4 31 124 1 1 30
10 Code003 123 4 31 124 1 2 31
11 Code003 123 4 31 124 1 3 31
12 Code003 123 4 31 124 1 4 31
A first solution is to use integer division:
Data %>%
group_by(ID) %>%
mutate(Prop_Value = ifelse(row_number() <= Value %% n(), Value %/% n() + 1, Value %/% n()))
# A tibble: 12 × 3
# Groups: ID [3]
ID Value Prop_Value
<chr> <dbl> <dbl>
1 Code001 107 36
2 Code001 107 36
3 Code001 107 35
4 Code002 346 70
5 Code002 346 69
6 Code002 346 69
7 Code002 346 69
8 Code002 346 69
9 Code003 123 31
10 Code003 123 31
11 Code003 123 31
12 Code003 123 30

How to order variables in blocks

I have a large data set with variables like this:
df <- data.frame(ID=c(1,2),
Hei3ght1=c(180,192),
Weight1=c(70,90),
Hip1=c(25,29),
hei5ght1=c(160,150),
Hei3ght2=c(167,168),
Weight2=c(50,50),
Hip2=c(23,27),
hei5ght2=c(160,150),
Hei3ght3=c(175,176),
Weight3=c(50,70),
Hip3=c(28,28),
hei5ght3=c(160,150))
I would like to order the variables as follows:
ID, Hei3ght1, Hei3ght2, Hei3ght3, Weight1, Weight2, Weight3, Hip1, Hip2, Hip3, Hei5ght1, Hei5ght2, Hei5ght3
I have tried with:
df <- df[sort(names(df))]
But I do not want all the variables alphabetically.
Thank you so much in advance.
UODATE 2
df <- data.frame(ID=c(1,2),
Hei3ght1=c(180,192),
Weight1=c(70,90),
Hip1=c(25,29),
hei5ght1=c(160,150),
hei5ght21=c(160,150),
Hei3ght2=c(167,168),
Weight2=c(50,50),
Hip2=c(23,27),
hei5ght2=c(160,150),
hei5ght22=c(160,150),
Hei3ght3=c(175,176),
Weight3=c(50,70),
Hip3=c(28,28),
hei5ght3=c(160,150),
hei5ght23=c(160,150))
An option in base R would be to convert the column names to a matrix and then to a vector:
n <- length(unique(sub("_\\d+", "", names(df)[-1])))
df[c('ID', c(matrix(names(df)[-1], ncol = n, byrow = TRUE)))]
Output:
ID Height_1 Height_2 Height_3 Weight_1 Weight_2 Weight_3 Hip_1 Hip_2 Hip_3
1 1 180 167 175 70 50 50 25 23 28
2 2 192 168 176 90 50 70 29 27 28
Or you may use
library(data.table)
library(dplyr)
df %>%
select(ID, order(rowid(readr::parse_number(names(.)[-1])))+1)
Output:
ID Height_1 Height_2 Height_3 Weight_1 Weight_2 Weight_3 Hip_1 Hip_2 Hip_3
1 1 180 167 175 70 50 50 25 23 28
2 2 192 168 176 90 50 70 29 27 28
Update
For the updated data
library(stringr)
df %>%
select(ID, order(rowid(str_extract(names(.)[-1], "\\d+$")))+1)
Output:
ID Hei3ght1 Hei3ght2 Hei3ght3 Weight1 Weight2 Weight3 Hip1 Hip2 Hip3 hei5ght1 hei5ght2 hei5ght3
1 1 180 167 175 70 50 50 25 23 28 160 160 160
2 2 192 168 176 90 50 70 29 27 28 150 150 150
Update2
df %>%
select(ID, order(rowid(str_extract(names(.)[-1], "\\d$")))+1)
ID Hei3ght1 Hei3ght2 Hei3ght3 Weight1 Weight2 Weight3 Hip1 Hip2 Hip3 hei5ght1 hei5ght2 hei5ght3 hei5ght21 hei5ght22 hei5ght23
1 1 180 167 175 70 50 50 25 23 28 160 160 160 160 160 160
2 2 192 168 176 90 50 70 29 27 28 150 150 150 150 150 150
This is the long version of #akrun's solution, the core idea is to make pivot longer, transform to factor, and the arrange and pivot back:
library(tidyverse)
df %>%
pivot_longer(-ID) %>%
mutate(helper = str_replace_all(name, "[[:punct:]][0-9]+", ""),
helper = factor(helper, levels = c("Height", "Weight", "Hip"))) %>%
group_by(ID) %>%
arrange(helper, .by_group = TRUE) %>%
select(-helper) %>%
pivot_wider(names_from = name, values_from = value)
ID Height_1 Height_2 Height_3 Weight_1 Weight_2 Weight_3 Hip_1 Hip_2 Hip_3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 180 167 175 70 50 50 25 23 28
2 2 192 168 176 90 50 70 29 27 28

Calculate area under the curve for time serie data

I want to calculate the area under the curve for the time points for each id and column. Any suggestions? Which R packages to use? Many thanks!
id <- rep(1:3,each=5)
time <- rep(c(10,20,30,40,50),3)
q1 <- sample(100,15, replace=T)
q2 <- sample(100,15, replace=T)
q3 <- sample(100,15, replace=T)
df <- data.frame(id,time,q1,q2,q3)
df
id time q1 q2 q3
1 10 38 55 38
1 20 46 29 88
1 30 16 28 97
1 40 37 20 81
1 50 59 27 42
2 10 82 81 54
2 20 45 3 23
2 30 82 67 59
2 40 27 3 42
2 50 45 71 45
3 10 39 8 29
3 20 12 6 90
3 30 92 11 7
3 40 52 8 37
3 50 81 57 80
Wanted output, something like this:
q1 q2 q3
1 area area area
2 area area area
3 area area area
library(tidyverse)
id <- rep(1:3,each=5)
time <- rep(c(10,20,30,40,50),3)
q1 <- sample(100,15, replace=T)
q2 <- sample(100,15, replace=T)
q3 <- sample(100,15, replace=T)
df <- data.frame(id,time,q1,q2,q3)
df %>%
arrange(time) %>%
pivot_longer(cols = c(q1, q2, q3)) -> longer_df
longer_df %>%
ggplot(aes(x = time, y = value, col = factor(id))) +
geom_line() +
geom_point() +
facet_wrap(. ~ name)
longer_df %>%
group_by(id, name) %>%
mutate(lag_value = lag(value),
midpoint_value = (value + lag_value)/2) %>%
summarize(area = 10*sum(midpoint_value, na.rm = T)) %>%
pivot_wider(values_from = area)
#> `summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
#> # A tibble: 3 x 4
#> # Groups: id [3]
#> id q1 q2 q3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 1960 1980 2075
#> 2 2 1025 2215 2180
#> 3 3 2105 1590 2110
Created on 2021-06-30 by the reprex package (v2.0.0)
Here I will use the trapz function to calculate the integral.
library(data.table)
library(caTools) # integrate with its trapz function
# data
df <- fread("id time q1 q2 q3
1 10 38 55 38
1 20 46 29 88
1 30 16 28 97
1 40 37 20 81
1 50 59 27 42
2 10 82 81 54
2 20 45 3 23
2 30 82 67 59
2 40 27 3 42
2 50 45 71 45
3 10 39 8 29
3 20 12 6 90
3 30 92 11 7
3 40 52 8 37
3 50 81 57 80")
# calculate the area with `trapz`
df[,lapply(.SD[,2:4], function(y) trapz(time,y)),by=id]
#> id q1 q2 q3
#> 1: 1 1475 1180 3060
#> 2: 2 2175 1490 1735
#> 3: 3 2160 575 1885
Created on 2021-06-30 by the reprex package (v2.0.0)

How can I transform multiple repeated measures from wide to long format?

I have a data set that looks like that:
id <- c(1:3)
gender <- factor(c("male","female","female"))
age <- c(51,69,44)
cortisol_1 <- c(23,32,54)
cortisol_2 <- c(34,52,49)
cortisol_3 <- c(34,65,12)
blood_1 <- c(12,64,54)
blood_2 <- c(52,32,75)
blood_3 <- c(12,12,75)
temp_1 <- c(38.5,38.7,37.9)
temp_3 <- c(36.5,36.4,37.1)
df <- data.frame(id,gender,age,cortisol_1,cortisol_2,cortisol_3,blood_1,blood_2,blood_3,temp_1,temp_3)
df
id gender age cortisol_1 cortisol_2 cortisol_3 blood_1 blood_2 blood_3 temp_1 temp_3
1 1 male 51 23 34 34 12 52 12 38.5 36.5
2 2 female 69 32 52 65 64 32 12 38.7 36.4
3 3 female 44 54 49 12 54 75 75 37.9 37.1
So I have cortisol level and blood pressure which were measured annually at three time points. However, body temperature was only assessed at baseline and wave 3.
How can I change the data structure from wide to long? I would hope that the data looks like that:
id gender wave cortisol blood temp
1 1 male 1 23 12 38.5
2 1 male 2 34 52 NA
3 1 male 3 34 12 36.5
4 2 female 1 32 64 37.7
5 2 female 2 52 32 NA
6 2 female 3 65 12 36.4
7 3 female 1 54 54 37.9
8 3 female 2 49 75 NA
9 3 female 3 12 75 37.1
Best
Jascha
We can use pivot_longer
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -c(id, gender, age),
names_to = c('.value', 'grp'), names_sep = "_") %>%
select(-grp)
-output
# A tibble: 9 x 6
# id gender age cortisol blood temp
# <int> <fct> <dbl> <dbl> <dbl> <dbl>
#1 1 male 51 23 12 38.5
#2 1 male 51 34 52 NA
#3 1 male 51 34 12 36.5
#4 2 female 69 32 64 38.7
#5 2 female 69 52 32 NA
#6 2 female 69 65 12 36.4
#7 3 female 44 54 54 37.9
#8 3 female 44 49 75 NA
#9 3 female 44 12 75 37.1

Simplify multiple rowSums looping through columns

I'm currently on R trying to create for a DF multiple columns with the sum of previous one. Imagine I got a DF like this:
df=
sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
1 70 153 NA 28 19
2 57 68 73 118 16
3 29 NA 19 32 36
4 177 36 3 54 53
and I want to add at the end the sum of the rows previous of the month that I'm reporting so for October you end up with the sum of sep and oct, and for November you end up with the sum of sep, oct and november and end up with something like this:
df=
sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-Oct2016 status-Nov 2016
1 70 153 NA 28 19 223 223
2 57 68 73 118 16 105 198
3 29 NA 19 32 36 29 48
4 177 36 3 54 53 213 93
I want to know a efficient way insted of writing a lots of lines of rowSums() and even if I can get the label on the iteration for each month would be amazing!
Thanks!
We can use lapply to loop through the columns to apply the rowSums.
dat2 <- as.data.frame(lapply(2:ncol(dat), function(i){
rowSums(dat[, 1:i], na.rm = TRUE)
}))
names(dat2) <- paste0("status-", names(dat[, -1]))
dat3 <- cbind(dat, dat2)
dat3
# sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1 70 153 NA 28 19 223 223 251 270
# 2 57 68 73 118 16 125 198 316 332
# 3 29 NA 19 32 36 29 48 80 116
# 4 177 36 3 54 53 213 216 270 323
DATA
dat <- read.table(text = " 'sep-2016' 'oct-2016' 'nov-2016' 'dec-2016' 'jan-2017'
1 70 153 NA 28 19
2 57 68 73 118 16
3 29 NA 19 32 36
4 177 36 3 54 53",
header = TRUE, stringsAsFactors = FALSE)
names(dat) <- c("sep-2016", "oct-2016", "nov-2016", "dec-2016", "jan-2017")
Honestly I have no idea why you would want your data in this format, but here is a tidyverse method of accomplishing it. It involves transforming the data to a tidy format before spreading it back out into your wide format. The key thing to note is that in a tidy format, where month is a variable in a single column instead of spread across multiple columns, you can simply use group_by(rowid) and cumsum to calculate all the values you want. The last few lines are constructing the status- column names and spreading the data back out into a wide format.
library(tidyverse)
df <- read_table2(
"sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
70 153 NA 28 19
57 68 73 118 16
29 NA 19 32 36
177 36 3 54 53"
)
df %>%
rowid_to_column() %>%
gather("month", "value", -rowid) %>%
arrange(rowid) %>%
group_by(rowid) %>%
mutate(
value = replace_na(value, 0),
status = cumsum(value)
) %>%
gather("vartype", "number", value, status) %>%
mutate(colname = ifelse(vartype == "value", month, str_c("status-", month))) %>%
select(rowid, number, colname) %>%
spread(colname, number)
#> # A tibble: 4 x 11
#> # Groups: rowid [4]
#> rowid `dec-2016` `jan-2017` `nov-2016` `oct-2016` `sep-2016`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 28.0 19.0 0 153 70.0
#> 2 2 118 16.0 73.0 68.0 57.0
#> 3 3 32.0 36.0 19.0 0 29.0
#> 4 4 54.0 53.0 3.00 36.0 177
#> # ... with 5 more variables: `status-dec-2016` <dbl>,
#> # `status-jan-2017` <dbl>, `status-nov-2016` <dbl>,
#> # `status-oct-2016` <dbl>, `status-sep-2016` <dbl>
Created on 2018-02-16 by the reprex package (v0.2.0).
A clean way to do it is by convert your data in a long format.
library(tibble)
library(tidyr)
library(dplyr)
your_data <- tribble(~"sep_2016", ~"oct_2016", ~"nov_2016", ~"dec_2016", ~"jan_2017",
70, 153, NA, 28, 19,
57, 68, 73, 118, 16,
29, NA, 19, 32, 36,
177, 36, 3, 54, 53)
You can change the format of your data.frame with gather from the tidyr package.
your_data_long <- your_data %>%
rowid_to_column() %>%
gather(key = month_year, value = the_value, -rowid)
head(your_data_long)
#> # A tibble: 6 x 3
#> rowid month_year the_value
#> <int> <chr> <dbl>
#> 1 1 sep_2016 70
#> 2 2 sep_2016 57
#> 3 3 sep_2016 29
#> 4 4 sep_2016 177
#> 5 1 oct_2016 153
#> 6 2 oct_2016 68
Once your data.frame is in a long format. You can compute cumulative sum with cumsumand dplyrfunctions mutate and group_by.
result <- your_data_long %>%
group_by(rowid) %>%
mutate(cumulative_value = cumsum(the_value))
result
#> # A tibble: 20 x 4
#> # Groups: rowid [4]
#> rowid month_year the_value cumulative_value
#> <int> <chr> <dbl> <dbl>
#> 1 1 sep_2016 70 70
#> 2 2 sep_2016 57 57
#> 3 3 sep_2016 29 29
#> 4 4 sep_2016 177 177
#> 5 1 oct_2016 153 223
#> 6 2 oct_2016 68 125
#> 7 3 oct_2016 NA NA
#> 8 4 oct_2016 36 213
#> 9 1 nov_2016 NA NA
#> 10 2 nov_2016 73 198
#> 11 3 nov_2016 19 NA
#> 12 4 nov_2016 3 216
#> 13 1 dec_2016 28 NA
#> 14 2 dec_2016 118 316
#> 15 3 dec_2016 32 NA
#> 16 4 dec_2016 54 270
#> 17 1 jan_2017 19 NA
#> 18 2 jan_2017 16 332
#> 19 3 jan_2017 36 NA
#> 20 4 jan_2017 53 323
If you want to retrieve the starting form, you can do it with spread.
My preferred solution would be:
# library(matrixStats)
DF <- as.matrix(df)
DF[is.na(DF)] <- 0
RES <- matrixStats::rowCumsums(DF)
colnames(RES) <- paste0("status-", colnames(DF))
cbind.data.frame(df, RES)
This is closest to what you are looking for with the rowSums.
One option could be using spread and gather function from tidyverse.
Note: The status column has been added even for the 1st month. And the status columns are not in order but values are correct.
The approach is:
# Data
df <- read.table(text = "sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
70 153 NA 28 19
57 68 73 118 16
29 NA 19 32 36
177 36 3 54 53", header = T, stringsAsFactors = F)
library(tidyverse)
# Just add an row number as sl
df <- df %>% mutate(sl = row_number())
#Calculate the cumulative sum after gathering and arranging by date
mod_df <- df %>%
gather(key, value, -sl) %>%
mutate(key = as.Date(paste("01",key, sep="."), format="%d.%b.%Y")) %>%
arrange(sl, key) %>%
group_by(sl) %>%
mutate(status = cumsum(ifelse(is.na(value),0L,value) )) %>%
select(-value) %>%
mutate(key = paste("status",as.character(key, format="%b.%Y"))) %>%
spread(key, status)
# Finally join cumulative calculated sum columns with original df and then
# remove sl column
inner_join(df, mod_df, by = "sl") %>% select(-sl)
# sep.2016 oct.2016 nov.2016 dec.2016 jan.2017 status Dec.2016 status Jan.2017 status Nov.2016 status Oct.2016 status Sep.2016
#1 70 153 NA 28 19 251 270 223 223 70
#2 57 68 73 118 16 316 332 198 125 57
#3 29 NA 19 32 36 80 116 48 29 29
#4 177 36 3 54 53 270 323 216 213 177
Another base solution where we build a matrix accumulating the row sums :
status <- setNames(
as.data.frame(t(apply(dat,1,function(x) Reduce(sum,'[<-'(x,is.na(x),0),accumulate = TRUE)))),
paste0("status-",names(dat)))
status
# status-sep-2016 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1 70 223 223 251 270
# 2 57 125 198 316 332
# 3 29 29 48 80 116
# 4 177 213 216 270 323
Then bind it to your original data if needed :
cbind(dat,status[-1])

Resources