R: generate value rows for each date extracted - r

I have a dataframe like this:
ID Year Week Monday Tuesday Wednesday
12 2017 42 8 9 8,5
12 2017 43 9 11 7,3
13 2017 43 9 10 6,8
I would like to change it in order to achieve this:
ID day time
12 16/10/2017 8
12 17/10/2017 9
12 18/10/2017 8,5
12 23/10/2017 9
12 24/10/2017 11
12 25/10/2017 7,3
13 23/10/2017 9
13 24/10/2017 10
13 25/10/2017 6,8
I'm trying to do this with dplyr, but I have not found a solution yet.

library(dplyr)
library(tidyr)
# Reshape the weekday columns to long form, then build a Date from
# Year + Week + weekday name and drop the helper columns.
df %>%
  # pivot_longer() replaces the superseded gather(). The weekday columns have
  # mixed types (integer and factor), so each one is converted to character
  # before stacking; decimal-comma values such as "8,5" are kept as text.
  pivot_longer(
    cols = Monday:Wednesday,
    names_to = "day",
    values_to = "time",
    values_transform = list(time = as.character)
  ) %>%
  # %U = week of the year with Sunday as first day, %A = full weekday name.
  # NOTE: %A is matched against the current locale's day names, so the English
  # column names used here require an English LC_TIME setting.
  mutate(date = as.Date(paste(Year, Week, day), format = "%Y %U %A")) %>%
  arrange(ID, Year, Week) %>%
  select(-Year, -Week, -day)
# ID time date
#1 12 8 2017-10-16
#2 12 9 2017-10-17
#3 12 8,5 2017-10-18
#4 12 9 2017-10-23
#5 12 11 2017-10-24
#6 12 7,3 2017-10-25
#7 13 9 2017-10-23
#8 13 10 2017-10-24
#9 13 6,8 2017-10-25
#sample data
> dput(df)
structure(list(ID = c(12L, 12L, 13L), Year = c(2017L, 2017L,
2017L), Week = c(42L, 43L, 43L), Monday = c(8L, 9L, 9L), Tuesday = c(9L,
11L, 10L), Wednesday = structure(c(3L, 2L, 1L), .Label = c("6,8",
"7,3", "8,5"), class = "factor")), .Names = c("ID", "Year", "Week",
"Monday", "Tuesday", "Wednesday"), class = "data.frame", row.names = c(NA,
-3L))

Related

Reshape Wide to Long with 2 time variables

Though there is an abundance of 'wide to long' threads for R, I haven't found an answer that will help me with my issue. Any assistance is greatly appreciated!
Example of my dataframe (in wide format):
CODE NAME M_2010_1 M_2011_1 M_2012_1 M_2010_3 M_2011_3 M_2012_3
1 A 10 11 10 9 10 13
12 B 11 13 15 15 14 11
8 C 9 2 4 2 8 8
Desired dataframe (in long):
CODE NAME YEAR M1 M3
1 A 2010 10 9
1 A 2011 11 10
1 A 2012 10 13
12 B 2010 11 15
12 B 2011 13 14
12 B 2012 15 11
8 C 2010 9 2
8 C 2011 2 8
8 C 2012 4 8
Thanks in advance!
Data
df<-
structure(list(CODE = c(1L, 12L, 8L), NAME = c("A", "B", "C"),
M_2010_1 = c(10L, 11L, 9L), M_2011_1 = c(11L, 13L, 2L), M_2012_1 = c(10L,
15L, 4L), M_2010_3 = c(9L, 15L, 2L), M_2011_3 = c(10L, 14L,
8L), M_2012_3 = c(13L, 11L, 8L)), class = "data.frame", row.names = c(NA,
-3L))
Code
library(dplyr)
library(tidyr)
# Wide -> long -> wide: first split every "M_<year>_<n>" column name into
# three parts, then rebuild one column per measure ("M1", "M3").
df %>%
# names_sep = "_" splits e.g. "M_2010_1" into aux1 = "M", YEAR = "2010", aux2 = "1";
# the cell values go to the default `value` column.
pivot_longer(cols = -c(CODE,NAME),names_sep = "_",names_to = c("aux1","YEAR","aux2")) %>%
# Paste prefix and suffix back together without a separator ("M" + "1" -> "M1").
unite(aux,aux1,aux2,sep = "") %>%
# Spread the measures into their own columns, one row per CODE/NAME/YEAR.
pivot_wider(names_from = aux,values_from = value)
Output
# A tibble: 9 x 5
CODE NAME YEAR M1 M3
<int> <chr> <chr> <int> <int>
1 1 A 2010 10 9
2 1 A 2011 11 10
3 1 A 2012 10 13
4 12 B 2010 11 15
5 12 B 2011 13 14
6 12 B 2012 15 11
7 8 C 2010 9 2
8 8 C 2011 2 8
9 8 C 2012 4 8
A one liner using reshape which allows to define all in one.
# Base R reshape(): columns 3:5 (M_<year>_1) are stacked into M1 and columns
# 6:8 (M_<year>_3) into M3, with the matching year stored in YEAR.
# Argument names are spelled out in full instead of relying on partial
# matching, and the second value column is named M3 to match the source
# columns and the desired output.
reshape(dat, idvar = 1:2, varying = list(3:5, 6:8), direction = "long",
        timevar = "YEAR", times = 2010:2012, v.names = c("M1", "M3"))
# CODE NAME YEAR M1 M3
# 1.A.2010 1 A 2010 10 9
# 12.B.2010 12 B 2010 11 15
# 8.C.2010 8 C 2010 9 2
# 1.A.2011 1 A 2011 11 10
# 12.B.2011 12 B 2011 13 14
# 8.C.2011 8 C 2011 2 8
# 1.A.2012 1 A 2012 10 13
# 12.B.2012 12 B 2012 15 11
# 8.C.2012 8 C 2012 4 8
Data:
dat <- structure(list(CODE = c(1L, 12L, 8L), NAME = c("A", "B", "C"),
M_2010_1 = c(10L, 11L, 9L), M_2011_1 = c(11L, 13L, 2L), M_2012_1 = c(10L,
15L, 4L), M_2010_3 = c(9L, 15L, 2L), M_2011_3 = c(10L, 14L,
8L), M_2012_3 = c(13L, 11L, 8L)), class = "data.frame", row.names = c(NA,
-3L))
We could do this in pivot_longer after we rearrange the substring in the column names
library(dplyr)
library(stringr)
library(tidyr)
# Reorder the pieces of each "M_<year>_<n>" column name so that a single
# pivot_longer() call can produce one column per measure.
df1 %>%
# "_(\\d+)_(\\d+)" matches "_<year>_<n>" and the replacement swaps the two
# groups, e.g. "M_2010_1" becomes "M1_2010".
rename_with(~ str_replace(.x, "_(\\d+)_(\\d+)", "\\2_\\1"),
starts_with("M_")) %>%
# Split the new names at "_": the first part (".value") becomes the output
# column name ("M1", "M3"), the second part is stored in `year`.
pivot_longer(cols = starts_with("M"),
names_to = c(".value", "year"), names_sep = "_")
-output
# A tibble: 9 × 5
CODE NAME year M1 M3
<int> <chr> <chr> <int> <int>
1 1 A 2010 10 9
2 1 A 2011 11 10
3 1 A 2012 10 13
4 12 B 2010 11 15
5 12 B 2011 13 14
6 12 B 2012 15 11
7 8 C 2010 9 2
8 8 C 2011 2 8
9 8 C 2012 4 8
data
df1 <- structure(list(CODE = c(1L, 12L, 8L), NAME = c("A", "B", "C"),
M_2010_1 = c(10L, 11L, 9L), M_2011_1 = c(11L, 13L, 2L), M_2012_1 = c(10L,
15L, 4L), M_2010_3 = c(9L, 15L, 2L), M_2011_3 = c(10L, 14L,
8L), M_2012_3 = c(13L, 11L, 8L)), class = "data.frame", row.names = c(NA,
-3L))

Converting days into weeks in R

I want to convert days into weeks with all the values from that week summed up
Right now I have the following df
Date x
1 2018-02-23 15
2 2018-03-26 4
3 2018-03-29 3
4 2018-03-30 6
5 2018-04-03 5
6 2018-04-04 12
7 2018-04-05 7
8 2018-04-06 5
9 2018-04-07 5
10 2018-04-09 13
11 2018-04-10 8
12 2018-04-11 2
ETC.
The x in this df stands for amount of items sent on a certain day.
There are days in this df where there are no items being transported.
This df has a total of 688 tuples.
What I would like to see is:
Date x
1 Week 8 2018 19
2 Week 9 2018 26
3 Week 10 2018 33
ETC.
Can someone help me out?
You can use aggregate and get the weeks with format %V:
# Sum x per ISO 8601 week. %V is the ISO week number, so it is paired with %G
# (the ISO week-based year) rather than %Y: around New Year the calendar year
# and the ISO year can differ (e.g. 2018-12-31 belongs to week 01 of 2019),
# and "%V %Y" would file such days under the wrong year.
aggregate(df$x, list(Date = format(df$Date, "%V %G")), sum)
# Date x
#1 08 2018 15
#2 13 2018 13
#3 14 2018 34
#4 15 2018 23
Or with Week (Thanks to #sindri-baldur for the comment):
# Same aggregation with a "Week <n> <year>" label. sub() replaces the optional
# leading zero of the week number with the "Week " prefix. %G (ISO week-based
# year) is used with %V so days around New Year get the year their ISO week
# belongs to.
aggregate(df$x, list(Date = sub("^0?", "Week ", format(df$Date, "%V %G"))), sum)
#aggregate(df$x, list(Date=format(df$Date, "Week %-V %G")), sum) #Alternative (the "-" flag is not portable across platforms)
# Date x
#1 Week 13 2018 13
#2 Week 14 2018 34
#3 Week 15 2018 23
#4 Week 8 2018 15
Data:
df <- read.table(header=TRUE, text=" Date x
1 2018-02-23 15
2 2018-03-26 4
3 2018-03-29 3
4 2018-03-30 6
5 2018-04-03 5
6 2018-04-04 12
7 2018-04-05 7
8 2018-04-06 5
9 2018-04-07 5
10 2018-04-09 13
11 2018-04-10 8
12 2018-04-11 2")
df$Date <- as.Date(df$Date)
library(lubridate)
library(tidyverse)
## Random data
df <- data.frame(date=seq.Date(from = as.Date("2018-01-01"), to=as.Date("2018-12-31"), by = "day"),x=runif(n=365,min=0,max=25))
## Aggregating by week
# Total of x per week. The year is part of the grouping key so that the same
# week number from different years is not merged into a single row when the
# data span more than one year.
df2 <- df %>%
  mutate(
    year = lubridate::year(ymd(date)),
    week = lubridate::week(ymd(date))
  ) %>%
  group_by(year, week) %>%
  # .groups = "drop" returns an ungrouped result.
  summarise(total_per_week = sum(x), .groups = "drop")
Using collapse
library(collapse)
library(lubridate)
library(magrittr)
# Total of x per week using collapse. Grouping by year as well as week keeps
# the same week number from different years in separate rows.
df %>%
  ftransform(year = year(ymd(Date)), week = week(ymd(Date))) %>%
  fgroup_by(year, week) %>%
  fsummarise(total_per_week = fsum(x))
# year week total_per_week
#1 2018 8 15
#2 2018 13 13
#3 2018 14 34
#4 2018 15 23
data
df <- structure(list(Date = c("2018-02-23", "2018-03-26", "2018-03-29",
"2018-03-30", "2018-04-03", "2018-04-04", "2018-04-05", "2018-04-06",
"2018-04-07", "2018-04-09", "2018-04-10", "2018-04-11"), x = c(15L,
4L, 3L, 6L, 5L, 12L, 7L, 5L, 5L, 13L, 8L, 2L)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
#akrun
This almost worked. Right now I get 52 rows out of 3 years of data:
week total_per_week
1 1 246
2 2 491
3 3 458
4 4 556
5 5 454
6 6 594
7 7 592
8 8 496
9 9 567
10 10 615

Separate hour and minutes in R

I have a column for time, but it hasn't been separated by : or any thing. It looks like this:
person time
1 356
1 931
1 2017
1 2103
2 256
2 1031
2 1517
2 2206
How do I separate them?
There are different ways of approaching the issue. Which method you choose depends on your desired output.
For example, you could use stringr::str_split to split time into a list vector of hours and minutes using a positive look-ahead
library(tidyverse)
df %>% mutate(time = str_split(time, "(?=\\d{2}$)"))
# person time
#1 1 3, 56
#2 1 9, 31
#3 1 20, 17
#4 1 2, 13
#5 2 2, 56
#6 2 10, 31
#7 2 15, 17
#8 2 2, 26
Or we can use tidyr::separate to create two new columns hours and minutes
df %>% separate(time, c("hours", "minutes"), sep = "(?=\\d{2}$)")
# person hours minutes
#1 1 3 56
#2 1 9 31
#3 1 20 17
#4 1 2 13
#5 2 2 56
#6 2 10 31
#7 2 15 17
#8 2 2 26
In response to your comment you could use stringr::str_replace
df %>% mutate(time = str_replace(time, "(?=\\d{2}$)", ":"))
# person time
#1 1 3:56
#2 1 9:31
#3 1 20:17
#4 1 2:13
#5 2 2:56
#6 2 10:31
#7 2 15:17
#8 2 2:26
And the same in base R using sub
transform(df, time = sub("(?=\\d{2}$)", ":", time, perl = TRUE))
giving the same result.
Sample data
# Sample data: one row per person/time, with time stored as an HMM/HHMM
# integer. `header = TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
df <- read.table(text = "
person time
1 356
1 931
1 2017
1 213
2 256
2 1031
2 1517
2 226", header = TRUE)
We can use strptime with sprintf in base R
df[c("hour", "min")] <- unclass(strptime(sprintf("%04d00", df$time),
"%H%M%S"))[c('hour', 'min')]
df
# person time hour min
#1 1 356 3 56
#2 1 931 9 31
#3 1 2017 20 17
#4 1 213 2 13
#5 2 256 2 56
#6 2 1031 10 31
#7 2 1517 15 17
#8 2 226 2 26
Or if it needs to only create a delimiter
tmp <- sub('(\\d{2})$', ':\\1', df$time)
tmp
#[1] "3:56" "9:31" "20:17" "2:13" "2:56" "10:31" "15:17" "2:26"
and then it can be separated in to two column with read.table
read.table(text = tmp, sep=":", header = FALSE, col.names = c('hour', 'min'))
data
df <- structure(list(person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), time = c(356L,
931L, 2017L, 213L, 256L, 1031L, 1517L, 226L)),
class = "data.frame", row.names = c(NA,
-8L))
Another possibility:
# Split an HMM/HHMM time into hour and minute strings.
# The value is left-padded to at least three digits before the colon is
# inserted, so times with fewer than three digits (e.g. 45 for 00:45) still
# split into two pieces instead of failing on the missing second element.
# Assumes df$time holds whole numbers (integer, or text that parses as one).
res <- strsplit(
  sub("(\\d{2})$", ":\\1", sprintf("%03d", as.integer(df$time))),
  ":",
  fixed = TRUE
)
# vapply() guarantees a character vector with one value per row.
df$Minutes <- vapply(res, `[[`, character(1), 2L)
df$Hr <- vapply(res, `[[`, character(1), 1L)
df
Result:
person time Minutes Hr
1 1 356 56 3
2 1 931 31 9
3 1 2017 17 20
4 1 2103 03 21
5 2 256 56 2
6 2 1031 31 10
7 2 1517 17 15
8 2 2206 06 22
Data:
df <-structure(list(person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), time = c(356L,
931L, 2017L, 2103L, 256L, 1031L, 1517L, 2206L)), row.names = c(NA,
-8L), class = "data.frame")
If you want to show the time in HH:MM format, we can use sprintf to zero-pad the value and sub to insert a colon (:) in between
sub("(\\d{2})(\\d{2})", "\\1:\\2",sprintf("%04d", df$time))
#[1] "03:56" "09:31" "20:17" "21:03" "02:56" "10:31" "15:17" "22:06"

R: cumulative sum over rolling date range

In R, how can I calculate a cumsum over a defined time period prior to the row being calculated? I would prefer dplyr if possible.
For example, if the period was 10 days, then the function would achieve cum_rolling10:
date value cumsum cum_rolling10
1/01/2000 9 9 9
2/01/2000 1 10 10
5/01/2000 9 19 19
6/01/2000 3 22 22
7/01/2000 4 26 26
8/01/2000 3 29 29
13/01/2000 10 39 29
14/01/2000 9 48 38
18/01/2000 2 50 21
19/01/2000 9 59 30
21/01/2000 8 67 38
25/01/2000 5 72 24
26/01/2000 1 73 25
30/01/2000 6 79 20
31/01/2000 6 85 18
A solution using dplyr, tidyr, lubridate, and zoo.
library(dplyr)
library(tidyr)
library(lubridate)
library(zoo)
# Rolling 10-day sum: pad the series to one row per calendar day, sum over a
# 10-row window, then drop the padded rows again.
dt2 <- dt %>%
# Parse the day/month/year strings into Date values.
mutate(date = dmy(date)) %>%
# Running total, computed before padding so it only exists on original rows.
mutate(cumsum = cumsum(value)) %>%
# Insert every missing date; inserted rows get value = 0 and cumsum = NA.
complete(date = full_seq(date, period = 1), fill = list(value = 0)) %>%
# Right-aligned window of 10 rows (the current day and the 9 before it);
# partial = TRUE lets the first rows use the shorter window available.
mutate(cum_rolling10 = rollapplyr(value, width = 10, FUN = sum, partial = TRUE)) %>%
# Remove the padded rows, identified by their NA cumsum.
drop_na(cumsum)
dt2
# A tibble: 15 x 4
date value cumsum cum_rolling10
<date> <dbl> <int> <dbl>
1 2000-01-01 9 9 9
2 2000-01-02 1 10 10
3 2000-01-05 9 19 19
4 2000-01-06 3 22 22
5 2000-01-07 4 26 26
6 2000-01-08 3 29 29
7 2000-01-13 10 39 29
8 2000-01-14 9 48 38
9 2000-01-18 2 50 21
10 2000-01-19 9 59 30
11 2000-01-21 8 67 38
12 2000-01-25 5 72 24
13 2000-01-26 1 73 25
14 2000-01-30 6 79 20
15 2000-01-31 6 85 18
DATA
dt <- structure(list(date = c("1/01/2000", "2/01/2000", "5/01/2000",
"6/01/2000", "7/01/2000", "8/01/2000", "13/01/2000", "14/01/2000",
"18/01/2000", "19/01/2000", "21/01/2000", "25/01/2000", "26/01/2000",
"30/01/2000", "31/01/2000"), value = c(9L, 1L, 9L, 3L, 4L, 3L,
10L, 9L, 2L, 9L, 8L, 5L, 1L, 6L, 6L)), .Names = c("date", "value"
), row.names = c(NA, -15L), class = "data.frame")
I recommend using runner package designed to calculate functions on rolling/running windows. You can achieve this by using sum_run - one liner here:
library(runner)
library(dplyr)
# Rolling sum over a 10-day window with runner::sum_run(); `idx` supplies the
# dates the window is measured in. The result is assigned back to `df` so the
# new column is still there when `df` is printed below, and the columns are
# referenced directly inside mutate() instead of through `df$`.
df <- df %>%
  mutate(
    cum_rolling_10 = sum_run(
      x = value,
      k = 10,
      idx = as.Date(date, format = "%d/%m/%Y")
    )
  )
df
# date value cum_rolling_10
# 1 1/01/2000 9 9
# 2 2/01/2000 1 10
# 3 5/01/2000 9 19
# 4 6/01/2000 3 22
# 5 7/01/2000 4 26
# 6 8/01/2000 3 29
# 7 13/01/2000 10 29
# 8 14/01/2000 9 38
# 9 18/01/2000 2 21
# 10 19/01/2000 9 30
# 11 21/01/2000 8 38
# 12 25/01/2000 5 24
# 13 26/01/2000 1 25
# 14 30/01/2000 6 20
# 15 31/01/2000 6 18
Enjoy!
This solution avoids memory overhead, and migrating it to sparklyr will be easy.
# Window length in days. Note that this name is shared with the lag() function
# called below; R still resolves the call `lag(...)` to the function.
lag <- 7
dt %>%
  mutate(date = dmy(date)) %>%
  # Day offset of each row from the earliest date.
  # NOTE(review): datediff() is not defined in this file -- presumably the
  # Spark SQL function that sparklyr translates; confirm before running this
  # on a local data frame.
  # (The closing parenthesis of this mutate() call was missing.)
  mutate(order = datediff(date, min(date))) %>%
  arrange(desc(order)) %>%
  # Previous row's offset plus the window length (0 for the first row).
  mutate(n_order = lag(order + lag, 1L, default = 0)) %>%
  # Keep the offset where a new group starts, -1 otherwise.
  mutate(b_order = ifelse(order - n_order >= 0, order, -1)) %>%
  # Carry the latest group start forward to use as the group id.
  mutate(m_order = cummax(b_order)) %>%
  group_by(m_order) %>%
  mutate(rolling_value = cumsum(value))
Use slide_index_sum() from slider, which is designed to have the same API as purrr.
library(slider)
library(dplyr)
df <- tibble(
date = c(
"1/01/2000", "2/01/2000", "5/01/2000", "6/01/2000", "7/01/2000",
"8/01/2000", "13/01/2000", "14/01/2000", "18/01/2000", "19/01/2000",
"21/01/2000", "25/01/2000", "26/01/2000", "30/01/2000", "31/01/2000"
),
value = c(9L, 1L, 9L, 3L, 4L, 3L, 10L, 9L, 2L, 9L, 8L, 5L, 1L, 6L, 6L)
)
df <- mutate(df, date = as.Date(date, format = "%d/%m/%Y"))
df %>%
mutate(
cumsum = cumsum(value),
cum_rolling10 = slide_index_sum(value, date, before = 9L)
)
#> # A tibble: 15 × 4
#> date value cumsum cum_rolling10
#> <date> <int> <int> <dbl>
#> 1 2000-01-01 9 9 9
#> 2 2000-01-02 1 10 10
#> 3 2000-01-05 9 19 19
#> 4 2000-01-06 3 22 22
#> 5 2000-01-07 4 26 26
#> 6 2000-01-08 3 29 29
#> 7 2000-01-13 10 39 29
#> 8 2000-01-14 9 48 38
#> 9 2000-01-18 2 50 21
#> 10 2000-01-19 9 59 30
#> 11 2000-01-21 8 67 38
#> 12 2000-01-25 5 72 24
#> 13 2000-01-26 1 73 25
#> 14 2000-01-30 6 79 20
#> 15 2000-01-31 6 85 18

Merge two tables in R; column names differ with A and B options

I have two datasets that I'm trying to merge together. The first one contains information for every test subject with a unique ID (in rows). The second set contains measurements for every test subject (in columns), however each subject was measured twice so the unique ID reads "IDa and IDb." I'd like to find a way to merge these two tables based on the unique ID, regardless of whether it is measurement A or B.
Here's a small sample of the 2 datasets, and a table of the intended output. Any help would be appreciated!
UniqueID Site State Age Height
Tree001 FK OR 23 70
Tree002 FK OR 45 53
Tree003 NM OR 35 84
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1996 4 2
1997 7 8 7 3
1998 3 2 9 4 7
1999 11 9 2 12 3 13
2010 8 8 4 6 11 4
2011 10 5 6 3 8 9
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
Site FK FK FK FK NM NM
State OR OR OR OR OR OR
Age 23 23 45 45 35 35
Height 70 70 53 53 84 84
1996 4 2
1997 7 8 7 3
1998 3 2 9 4 7
1999 11 9 2 12 3 13
2010 8 8 4 6 11 4
2011 10 5 6 3 8 9
This can be one approach.
df1 <- structure(list(UniqueID = structure(1:3, .Label = c("Tree001",
"Tree002", "Tree003"), class = "factor"), Site = structure(c(1L,
1L, 2L), .Label = c("FK", "NM"), class = "factor"), State = structure(c(1L,
1L, 1L), .Label = "OR", class = "factor"), Age = c(23L, 45L,
35L), Height = c(70L, 53L, 84L)), .Names = c("UniqueID", "Site",
"State", "Age", "Height"), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(UniqueID = c(1996L, 1997L, 1998L, 1999L, 2010L,
2011L), Tree001A = c(4L, 7L, 3L, 11L, 8L, 10L), Tree001B = c(NA,
8L, 2L, 9L, 8L, 5L), Tree002A = c(2L, 7L, 9L, 2L, 4L, 6L), Tree002B = c(NA,
NA, 4L, 12L, 6L, 3L), Tree003A = c(NA, 3L, 7L, 3L, 11L, 8L),
Tree003B = c(NA, NA, NA, 13L, 4L, 9L)), .Names = c("UniqueID",
"Tree001A", "Tree001B", "Tree002A", "Tree002B", "Tree003A", "Tree003B"
), class = "data.frame", row.names = c(NA, -6L))
> df1
UniqueID Site State Age Height
1 Tree001 FK OR 23 70
2 Tree002 FK OR 45 53
3 Tree003 NM OR 35 84
> df2
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1 1996 4 <NA> 2 <NA> <NA> <NA>
2 1997 7 8 7 <NA> 3 <NA>
3 1998 3 2 9 4 7 <NA>
4 1999 11 9 2 12 3 13
5 2010 8 8 4 6 11 4
6 2011 10 5 6 3 8 9
# Transpose df1 (without its ID column) so each attribute becomes a row and
# each subject becomes a column.
df3 <- as.data.frame(t(df1[,-1]))
# Name the columns after the subject IDs in df1's first column.
colnames(df3) <- df1[,1]
# Copy the row names (the attribute names) into a UniqueID column
df3$UniqueID <- rownames(df3)
# Replace the row names with plain numbers
rownames(df3) <- c(1:4)
# Put UniqueID first and repeat every subject column twice, so there is one
# column for measurement A and one for measurement B
df3 <- df3[,c(4,1,1,2,2,3,3)]
colnames(df3) <- c("UniqueID", "Tree001A", "Tree001B", "Tree002A",
"Tree002B", "Tree003A", "Tree003B")
# Convert every column of df2 to character so its rows can be stacked under
# the text rows of df3. `df2[] <-` keeps df2 a data frame with the same column
# names. (The previous `class(x) <- as.factor(x)` inside sapply() misused
# `class<-` and only worked through the value of the assignment expression.)
df2[] <- lapply(df2, as.character)
# Now combine the two data frames; their column names match
new <- rbind(df3, df2)
> new
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1 Site FK FK FK FK NM NM
2 State OR OR OR OR OR OR
3 Age 23 23 45 45 35 35
4 Height 70 70 53 53 84 84
5 1996 4 <NA> 2 <NA> <NA> <NA>
6 1997 7 8 7 <NA> 3 <NA>
7 1998 3 2 9 4 7 <NA>
8 1999 11 9 2 12 3 13
9 2010 8 8 4 6 11 4
10 2011 10 5 6 3 8 9

Resources