Reshape Wide to Long with 2 time variables - r

Though there is an abundance of 'wide to long' threads for R, I haven't found an answer that will help me with my issue. Any assistance is greatly appreciated!
Example of my dataframe (in wide format):
CODE NAME M_2010_1 M_2011_1 M_2012_1 M_2010_3 M_2011_3 M_2012_3
1 A 10 11 10 9 10 13
12 B 11 13 15 15 14 11
8 C 9 2 4 2 8 8
Desired dataframe (in long):
CODE NAME YEAR M1 M3
1 A 2010 10 9
1 A 2011 11 10
1 A 2012 10 13
12 B 2010 11 15
12 B 2011 13 14
12 B 2012 15 11
8 C 2010 9 2
8 C 2011 2 8
8 C 2012 4 8
Thanks in advance!

Data
df<-
structure(list(CODE = c(1L, 12L, 8L), NAME = c("A", "B", "C"),
M_2010_1 = c(10L, 11L, 9L), M_2011_1 = c(11L, 13L, 2L), M_2012_1 = c(10L,
15L, 4L), M_2010_3 = c(9L, 15L, 2L), M_2011_3 = c(10L, 14L,
8L), M_2012_3 = c(13L, 11L, 8L)), class = "data.frame", row.names = c(NA,
-3L))
Code
library(dplyr)
library(tidyr)
# Split each "M_<year>_<n>" column name into three pieces, glue the first and
# last pieces back together ("M" + "1" -> "M1"), then spread those labels out
# again so that every CODE/NAME/YEAR combination ends up on one row.
df %>%
  pivot_longer(
    cols = -c(CODE, NAME),
    names_to = c("aux1", "YEAR", "aux2"),
    names_sep = "_"
  ) %>%
  unite(col = aux, aux1, aux2, sep = "") %>%
  pivot_wider(names_from = aux, values_from = value)
Output
# A tibble: 9 x 5
CODE NAME YEAR M1 M3
<int> <chr> <chr> <int> <int>
1 1 A 2010 10 9
2 1 A 2011 11 10
3 1 A 2012 10 13
4 12 B 2010 11 15
5 12 B 2011 13 14
6 12 B 2012 15 11
7 8 C 2010 9 2
8 8 C 2011 2 8
9 8 C 2012 4 8

A one-liner using base R's reshape(), which lets you specify everything in a single call.
# Base-R wide-to-long reshape.
# - idvar:   columns that identify a subject and are repeated on every row.
# - varying: one element per output value column; columns 3:5 are the *_1
#            measurements and columns 6:8 are the *_3 measurements.
# - v.names: output names, in the same order as `varying`, so the *_3
#            measurements are labelled M3 (not M2).
# - times:   the value written to YEAR for each position within `varying`.
# Argument names are spelled out in full instead of relying on partial matching.
reshape(dat, idvar = c("CODE", "NAME"),
        varying = list(3:5, 6:8), v.names = c("M1", "M3"),
        timevar = "YEAR", times = 2010:2012, direction = "long")
# CODE NAME YEAR M1 M3
# 1.A.2010 1 A 2010 10 9
# 12.B.2010 12 B 2010 11 15
# 8.C.2010 8 C 2010 9 2
# 1.A.2011 1 A 2011 11 10
# 12.B.2011 12 B 2011 13 14
# 8.C.2011 8 C 2011 2 8
# 1.A.2012 1 A 2012 10 13
# 12.B.2012 12 B 2012 15 11
# 8.C.2012 8 C 2012 4 8
Data:
dat <- structure(list(CODE = c(1L, 12L, 8L), NAME = c("A", "B", "C"),
M_2010_1 = c(10L, 11L, 9L), M_2011_1 = c(11L, 13L, 2L), M_2012_1 = c(10L,
15L, 4L), M_2010_3 = c(9L, 15L, 2L), M_2011_3 = c(10L, 14L,
8L), M_2012_3 = c(13L, 11L, 8L)), class = "data.frame", row.names = c(NA,
-3L))

We could do this in pivot_longer after we rearrange the substring in the column names
library(dplyr)
library(stringr)
library(tidyr)
# Rewrite "M_<year>_<n>" as "M<n>_<year>" so that a single "_" split yields the
# output column name (".value") followed by the year.
df1 %>%
  rename_with(
    .fn = ~ str_replace(.x, "_(\\d+)_(\\d+)", "\\2_\\1"),
    .cols = starts_with("M_")
  ) %>%
  pivot_longer(
    cols = starts_with("M"),
    names_sep = "_",
    names_to = c(".value", "year")
  )
-output
# A tibble: 9 × 5
CODE NAME year M1 M3
<int> <chr> <chr> <int> <int>
1 1 A 2010 10 9
2 1 A 2011 11 10
3 1 A 2012 10 13
4 12 B 2010 11 15
5 12 B 2011 13 14
6 12 B 2012 15 11
7 8 C 2010 9 2
8 8 C 2011 2 8
9 8 C 2012 4 8
data
df1 <- structure(list(CODE = c(1L, 12L, 8L), NAME = c("A", "B", "C"),
M_2010_1 = c(10L, 11L, 9L), M_2011_1 = c(11L, 13L, 2L), M_2012_1 = c(10L,
15L, 4L), M_2010_3 = c(9L, 15L, 2L), M_2011_3 = c(10L, 14L,
8L), M_2012_3 = c(13L, 11L, 8L)), class = "data.frame", row.names = c(NA,
-3L))

Related

Getting rows in data frame based on multiple ranges in R

Let's say I have this table
a b
1 5 12
2 6 17
3 7 28
4 8 12
5 9 17
6 10 28
7 15 12
8 25 14
9 13 29
Also another table with index ranges:
start end
1 2 3
2 5 7
I want to get the rows in the first table based on the index ranges in the second table with a group name to differentiate, something like this:
a b group
2 6 17 1
3 7 28 1
5 9 17 2
6 10 28 2
7 15 12 2
how do I achieve this in R?
We can subset df1 taking the row index value from df2 using Map.
# For each (start, end) pair in df2, take that slice of df1's rows and tag it
# with the position of the pair; then stack all slices into one data frame.
do.call(
  rbind,
  Map(
    function(from, to, grp) transform(df1[from:to, ], group = grp),
    df2$start,
    df2$end,
    seq_len(nrow(df2))
  )
)
Or in purrr :
# One slice of df1 per (start, end) pair, row-bound into a single data frame;
# .id adds a "group" column identifying which pair each row came from.
purrr::map2_dfr(
  .x = df2$start,
  .y = df2$end,
  .f = function(from, to) df1[from:to, ],
  .id = "group"
)
# group a b
#1 1 6 17
#2 1 7 28
#3 2 9 17
#4 2 10 28
#5 2 15 12
data
df1 <- structure(list(a = c(5L, 6L, 7L, 8L, 9L, 10L, 15L, 25L, 13L),
b = c(12L, 17L, 28L, 12L, 17L, 28L, 12L, 14L, 29L)),
class = "data.frame", row.names = c(NA, -9L))
df2 <- structure(list(start = c(2L, 5L), end = c(3L, 7L)),
class = "data.frame", row.names = c(NA, -2L))
An option using data.table:
# Record each row's position in DT1 (rn) and number the ranges in DT2 (g).
DT1[, rn := .I]
DT2[, g := .I]
# Non-equi join: for every DT1 row, look up the DT2 range with
# start <= rn <= end and copy that range's g into DT1; rows that fall in no
# range get NA and are dropped by the trailing filter.
DT1[, g := DT2[DT1, on=.(start<=rn, end>=rn), x.g]][
!is.na(g)]
output:
a b rn g
1: 6 17 2 1
2: 7 28 3 1
3: 9 17 5 2
4: 10 28 6 2
5: 15 12 7 2
data:
library(data.table)
DT1 <- fread("a b
5 12
6 17
7 28
8 12
9 17
10 28
15 12
25 14
13 29")
DT2 <- fread("start end
2 3
5 7")

R: generate value rows for each date extracted

I have a dataframe like this:
ID Year Week Monday Tuesday Wednesday
12 2017 42 8 9 8,5
12 2017 43 9 11 7,3
13 2017 43 9 10 6,8
I would like to change it in order to achieve this:
ID day time
12 16/10/2017 8
12 17/10/2017 9
12 18/10/2017 8,5
12 23/10/2017 9
12 24/10/2017 11
12 25/10/2017 7,3
12 23/10/2017 9
12 24/10/2017 10
12 25/10/2017 6,8
I'm trying to do this with dplyr, but I have not found a solution yet.
library(dplyr)
library(tidyr)
# Stack the weekday columns into (day, time) pairs, one row per ID/week/day,
# then turn year + week number + weekday name into a Date.
df %>%
gather(day, time, Monday:Wednesday) %>%
# NOTE(review): "%U" numbers weeks starting on Sunday and "%A" matches weekday
# names in the current locale (see ?strptime) -- confirm both suit the real
# data before relying on the resulting dates.
mutate(date = as.Date(paste(Year, Week, day),"%Y %U %A")) %>%
# Sort by subject and week, then drop the helper columns.
arrange(ID, Year, Week) %>%
select(-Year, -Week, -day)
# ID time date
#1 12 8 2017-10-16
#2 12 9 2017-10-17
#3 12 8,5 2017-10-18
#4 12 9 2017-10-23
#5 12 11 2017-10-24
#6 12 7,3 2017-10-25
#7 13 9 2017-10-23
#8 13 10 2017-10-24
#9 13 6,8 2017-10-25
#sample data
> dput(df)
structure(list(ID = c(12L, 12L, 13L), Year = c(2017L, 2017L,
2017L), Week = c(42L, 43L, 43L), Monday = c(8L, 9L, 9L), Tuesday = c(9L,
11L, 10L), Wednesday = structure(c(3L, 2L, 1L), .Label = c("6,8",
"7,3", "8,5"), class = "factor")), .Names = c("ID", "Year", "Week",
"Monday", "Tuesday", "Wednesday"), class = "data.frame", row.names = c(NA,
-3L))

Combine two data frames considering levels of factor of one data frame and column name of another data frame using r

I need to create a new column for an existing data frame based on the levels of a factor. I have 2 data frames called dat_group and dat_price. These data frames look like below.
dat_group
Group
1 A
2 A
3 A
4 A
5 A
6 A
7 A
8 A
9 A
10 A
11 C
12 C
13 C
14 C
15 C
16 C
17 C
18 C
19 C
20 C
21 B
22 B
23 B
24 B
25 B
26 B
27 B
28 B
29 B
30 B
dat_price
A B C
1 21 45 24
2 21 45 24
3 21 45 24
4 21 45 24
5 15 11 10
6 15 11 10
7 15 11 10
8 20 13 55
9 20 13 55
10 20 13 55
I need to paste the values of A,B and C columns considering the level in dat_group. The row sequence should be the same order. If I create new column to dat_group as "price"
dat_group$Price<-NA
Then the data frame should be like ;
Group Price
1 A 21
2 A 21
3 A 21
4 A 21
5 A 15
6 A 15
7 A 15
8 A 20
9 A 20
10 A 20
11 C 24
12 C 24
13 C 24
14 C 24
15 C 10
16 C 10
17 C 10
18 C 55
19 C 55
20 C 55
21 B 45
22 B 45
23 B 45
24 B 45
25 B 11
26 B 11
27 B 11
28 B 13
29 B 13
30 B 13
I tried to do this using some available examples (e.g.1, e.g.2), but they did not work.
Could anybody please help me? The two example data frames can be recreated with the following code. My actual data set has several thousand rows.
dat_group<- structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B", "C"), class = "factor")), .Names = "Group", class = "data.frame", row.names = c(NA,
-30L))
dat_price<-structure(list(A = c(21L, 21L, 21L, 21L, 15L, 15L, 15L, 20L,
20L, 20L), B = c(45L, 45L, 45L, 45L, 11L, 11L, 11L, 13L, 13L,
13L), C = c(24L, 24L, 24L, 24L, 10L, 10L, 10L, 55L, 55L, 55L)), .Names = c("A",
"B", "C"), class = "data.frame", row.names = c(NA, -10L))
library(data.table)
# Asker's attempt: convert dat_price to a data.table, reorder its columns to
# 1, 3, 2 (A, C, B -- the order in which the groups appear in dat_group), and
# melt the reordered table.
dat_price <- as.data.table(dat_price)
dat_price_new <- cbind(dat_price[, c(1,3), with = FALSE],
dat_price[, 2, with = FALSE])
melt(dat_price_new)
A more defensive solution to your problem at hand. Hopefully this will work even if all of your factor's levels are not in identical multiples.
library(dplyr); library(purrr); library(magrittr)
# Remember the incoming row order so it can be restored after the split.
# seq_len() is also safe when dat_group has zero rows.
dat_group$original_order <- seq_len(nrow(dat_group))
dat_group %<>%
  # drop = TRUE skips factor levels that have no rows in dat_group.
  split(.$Group, drop = TRUE) %>%
  map(~ {
    # Look the price column up by the group's *label*. Indexing with the factor
    # itself would use its integer codes, which is only right when the factor
    # levels happen to be in the same order as dat_price's columns.
    prices <- na.omit(dat_price[[as.character(.x$Group[1])]])
    # Repeat the price vector as many times as needed to fill the group.
    mutate(.x, Price = rep(prices, n() / length(prices)))
  }) %>%
  bind_rows() %>%
  arrange(original_order) %>%
  select(-original_order)
dat_group
Group Price
1 A 21
2 A 21
3 A 21
4 A 21
5 A 15
6 A 15
7 A 15
8 A 20
9 A 20
10 A 20
11 C 24
12 C 24
13 C 24
14 C 24
15 C 10
16 C 10
17 C 10
18 C 55
19 C 55
20 C 55
21 B 45
22 B 45
23 B 45
24 B 45
25 B 11
26 B 11
27 B 11
28 B 13
29 B 13
30 B 13
Original (lazy) solution:
# Concatenate the price columns in the order in which the groups first appear
# in dat_group (A, C, B in the example). A plain unlist(dat_price) would use the
# column order (A, B, C) and give group C the prices of B and vice versa.
# Assumes each group occupies one contiguous run of rows in dat_group.
group_order <- as.character(unique(dat_group$Group))
dat_group$Price <- rep(unlist(as.list(dat_price)[group_order], use.names = FALSE),
                       length.out = nrow(dat_group))

Finding value in one data.frame and transfering value from other column

I don't know if I will be able to explain it correctly, but what I want to achieve is really simple.
That's first data.frame. The important value for me is in first column "V1"
> dput(Data1)
structure(list(V1 = c(10L, 5L, 3L, 9L, 1L, 2L, 6L, 4L, 8L, 7L
), V2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "NA", class = "factor"),
V3 = c(18L, 17L, 13L, 20L, 15L, 12L, 16L, 11L, 14L, 19L)), .Names = c("V1",
"V2", "V3"), row.names = c(NA, -10L), class = "data.frame")
Second data.frame:
> dput(Data2)
structure(list(Names = c(9L, 10L, 6L, 4L, 2L, 7L, 5L, 3L, 1L,
8L), Herat = c(30L, 29L, 21L, 25L, 24L, 22L, 28L, 27L, 23L, 26L
), Grobpel = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "NA", class = "factor"), Hassynch = c(19L, 12L,
15L, 20L, 11L, 13L, 14L, 16L, 18L, 17L)), .Names = c("Names",
"Herat", "Grobpel", "Hassynch"), row.names = c(NA, -10L), class = "data.frame"
)
The values in the first column (V1) of the first data.frame can be found in the first column (Names) of the second data.frame. I would like to copy the matching value from its fourth column (Hassynch) into the second column (V2) of the first data.frame.
What is the fastest way to do this?
library(dplyr)
left_join(Data1, Data2, by=c("V1"="Names"))
# V1 V2 V3 Herat Grobpel Hassynch
# 1 10 NA 18 29 NA 12
# 2 5 NA 17 28 NA 14
# 3 3 NA 13 27 NA 16
# 4 9 NA 20 30 NA 19
# 5 1 NA 15 23 NA 18
# 6 2 NA 12 24 NA 11
# 7 6 NA 16 21 NA 15
# 8 4 NA 11 25 NA 20
# 9 8 NA 14 26 NA 17
# 10 7 NA 19 22 NA 13
# if you don't want V2 and V3, you could
left_join(Data1, Data2, by=c("V1"="Names")) %>%
select(-V2, -V3)
# V1 Herat Grobpel Hassynch
# 1 10 29 NA 12
# 2 5 28 NA 14
# 3 3 27 NA 16
# 4 9 30 NA 19
# 5 1 23 NA 18
# 6 2 24 NA 11
# 7 6 21 NA 15
# 8 4 25 NA 20
# 9 8 26 NA 17
# 10 7 22 NA 13
Here's a toy example that I made some time ago to illustrate merge. left_join from dplyr is also good, and data.table almost certainly has another option.
You can subset your reference dataframe so that it contains only the key variable and value variable so that you don't end up with an unmanageable dataframe.
# Key column shared by both toy data frames, plus one value column for each.
id <- as.numeric(1:5)
m <- c("a", "a", "a", "", "")
n <- c("", "", "b", "b", "b")
# Build the data frame column-wise. cbind(id, m) would first coerce both
# vectors into a character matrix, so id would no longer be numeric.
dfm <- data.frame(id, m)
head(dfm)
id m
1 1 a
2 2 a
3 3 a
4 4
5 5
# Build the data frame column-wise so id keeps its numeric type; going through
# cbind(id, n) would coerce it to character first.
dfn <- data.frame(id, n)
head(dfn)
id n
1 1
2 2
3 3 b
4 4 b
5 5 b
dfm$id<-as.numeric(dfm$id)
dfn$id<-as.numeric(dfn$id)
dfm<-subset(dfm,id<4)
head(dfm)
id m
1 1 a
2 2 a
3 3 a
dfn<-subset(dfn,id!=1 & id!=2)
head(dfn)
id n
3 3 b
4 4 b
5 5 b
df.all<-merge(dfm,dfn,by="id",all=TRUE)
head(df.all)
id m n
1 1 a <NA>
2 2 a <NA>
3 3 a b
4 4 <NA> b
5 5 <NA> b
# Left join: keep every row of dfm and fill n with NA where dfn has no match.
df.all.m <- merge(dfm, dfn, by = "id", all.x = TRUE)
head(df.all.m)
id m n
1 1 a <NA>
2 2 a <NA>
3 3 a b
df.all.n<-merge(dfm,dfn,by="id",all.y=TRUE)
head(df.all.n)
id m n
1 3 a b
2 4 <NA> b
3 5 <NA> b

Merge two tables in R; column names differ with A and B options

I have two datasets that I'm trying to merge together. The first one contains information for every test subject with a unique ID (in rows). The second set contains measurements for every test subject (in columns), however each subject was measured twice so the unique ID reads "IDa and IDb." I'd like to find a way to merge these two tables based on the unique ID, regardless of whether it is measurement A or B.
Here's a small sample of the 2 datasets, and a table of the intended output. Any help would be appreciated!
UniqueID Site State Age Height
Tree001 FK OR 23 70
Tree002 FK OR 45 53
Tree003 NM OR 35 84
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1996 4 2
1997 7 8 7 3
1998 3 2 9 4 7
1999 11 9 2 12 3 13
2010 8 8 4 6 11 4
2011 10 5 6 3 8 9
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
Site FK FK FK FK NM NM
State OR OR OR OR OR OR
Age 23 23 45 45 35 35
Height 70 70 53 53 84 84
1996 4 2
1997 7 8 7 3
1998 3 2 9 4 7
1999 11 9 2 12 3 13
2010 8 8 4 6 11 4
2011 10 5 6 3 8 9
This can be one approach.
df1 <- structure(list(UniqueID = structure(1:3, .Label = c("Tree001",
"Tree002", "Tree003"), class = "factor"), Site = structure(c(1L,
1L, 2L), .Label = c("FK", "NM"), class = "factor"), State = structure(c(1L,
1L, 1L), .Label = "OR", class = "factor"), Age = c(23L, 45L,
35L), Height = c(70L, 53L, 84L)), .Names = c("UniqueID", "Site",
"State", "Age", "Height"), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(UniqueID = c(1996L, 1997L, 1998L, 1999L, 2010L,
2011L), Tree001A = c(4L, 7L, 3L, 11L, 8L, 10L), Tree001B = c(NA,
8L, 2L, 9L, 8L, 5L), Tree002A = c(2L, 7L, 9L, 2L, 4L, 6L), Tree002B = c(NA,
NA, 4L, 12L, 6L, 3L), Tree003A = c(NA, 3L, 7L, 3L, 11L, 8L),
Tree003B = c(NA, NA, NA, 13L, 4L, 9L)), .Names = c("UniqueID",
"Tree001A", "Tree001B", "Tree002A", "Tree002B", "Tree003A", "Tree003B"
), class = "data.frame", row.names = c(NA, -6L))
> df1
UniqueID Site State Age Height
1 Tree001 FK OR 23 70
2 Tree002 FK OR 45 53
3 Tree003 NM OR 35 84
> df2
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1 1996 4 <NA> 2 <NA> <NA> <NA>
2 1997 7 8 7 <NA> 3 <NA>
3 1998 3 2 9 4 7 <NA>
4 1999 11 9 2 12 3 13
5 2010 8 8 4 6 11 4
6 2011 10 5 6 3 8 9
# Use transpose function to change df1
# Transpose df1 so that each subject becomes a column and each attribute a row
df3 <- as.data.frame(t(df1[, -1]))
colnames(df3) <- as.character(df1[, 1])
# Keep the attribute names (Site, State, ...) as the UniqueID column
df3$UniqueID <- rownames(df3)
# Reset the row names to the default 1..n
rownames(df3) <- NULL
# Give every measurement column of df2 its subject's attributes: strip the
# trailing A/B replicate suffix to find the subject each column belongs to,
# so this works for any number of subjects and not just three
subject <- sub("[AB]$", "", names(df2)[-1])
df3 <- df3[, c("UniqueID", subject)]
colnames(df3) <- c("UniqueID", names(df2)[-1])
# Convert every column of df2 to text so it can be stacked under the
# (textual) attribute rows; lapply() keeps df2 a data frame with its names
df2[] <- lapply(df2, as.character)
# Now combine the two data frames
new <- rbind(df3, df2)
> new
UniqueID Tree001A Tree001B Tree002A Tree002B Tree003A Tree003B
1 Site FK FK FK FK NM NM
2 State OR OR OR OR OR OR
3 Age 23 23 45 45 35 35
4 Height 70 70 53 53 84 84
5 1996 4 <NA> 2 <NA> <NA> <NA>
6 1997 7 8 7 <NA> 3 <NA>
7 1998 3 2 9 4 7 <NA>
8 1999 11 9 2 12 3 13
9 2010 8 8 4 6 11 4
10 2011 10 5 6 3 8 9

Resources