How to save multiple numbers in one cell in a matrix/dataframe? - r

This is in the R language.
From a matrix called temp_warnings that looks like
row.names row day Tx Hx Tn
1 61 61 30 31.9 36.85 19.1
2 84 84 23 33.5 43.07 20.3
3 85 85 24 31.5 39.82 19.2
4 94 94 2 30.9 41.36 20.0
5 99 99 7 34.0 43.17 21.6
6 101 101 9 34.4 42.45 21.0
7 131 131 8 30.1 38.52 19.6
8 132 132 9 30.7 38.35 21.0
I want to have this information saved, using the row and day columns, into a new matrix called stn.
2001
Tmax >= 30 & Tmin >= 19   61, 84, 85, 94, 99, 101, 131, 132
May
June                      30
July                      23, 24
August                    2, 7, 9
September                 8, 9
So I would like the contents of the row column to be saved in the first cell. There are 153 days being tested for Tx, Hx and Tn (May 1st through September 30th), so the day column gives the day of the month. In the row column, numbers 1-31 are May, 32-61 are June, 62-92 are July, 93-123 are August, and 124-153 are September. I would like the day column numbers to be saved in the correct cells for their months as well.
If you need any other information let me know,
Thanks,
Nick

This is a very unusual format, so things can get messy:
dat <- read.table(header = TRUE, text="row.names row day Tx Hx Tn
1 61 61 30 31.9 36.85 19.1
2 84 84 23 33.5 43.07 20.3
3 85 85 24 31.5 39.82 19.2
4 94 94 2 30.9 41.36 20.0
5 99 99 7 34.0 43.17 21.6
6 101 101 9 34.4 42.45 21.0
7 131 131 8 30.1 38.52 19.6
8 132 132 9 30.7 38.35 21.0")
## creating a column for the months and pasting the days by month
dat <- within(dat, {
  # month boundaries are the cumulative day counts for May-Sep: 31, 61, 92, 123
  m <- cut(row, breaks = c(0, 31, 61, 92, 123, Inf), labels = month.abb[5:9])
  ms <- ave(day, m, FUN = function(x) paste(x, collapse = ', '))
  # 'Tmax >= 30 & Tmin >= 19' <- paste(row, collapse = ', ')
})
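Those break points are just running totals of the month lengths, so they can be computed instead of hard-coded; a small sketch (the month_days vector is illustrative, not part of the original code):
# days in May through September; cumulative sums give each month's last row index
month_days <- c(31, 30, 31, 31, 30)
cumsum(month_days)
# [1]  31  61  92 123 153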
## creating the final data frame to merge into
dat1 <- data.frame(' ' = c('Tmax >= 30 & Tmin >= 19', month.abb[5:9]),
                   '2001' = c(paste(dat$row, collapse = ', '), rep(NA, 5)),
                   check.names = FALSE)
dat1 <- merge(dat1, dat[!duplicated(dat[c('m','ms')]), c('m','ms')],
              by.x = ' ', by.y = 'm', all = TRUE)
## combining the two columns and some clean-up
dat1 <- within(dat1, {
  '2001' <- gsub('NA', '', paste(`2001`, ms))
  ms <- NULL
  ' ' <- factor(` `, levels = c('Tmax >= 30 & Tmin >= 19', month.abb[5:9]))
})
## and ordering the rows as desired
dat1[with(dat1, order(` `)), ]
# 2001
# 6 Tmax >= 30 & Tmin >= 19 61, 84, 85, 94, 99, 101, 131, 132
# 4 May
# 3 Jun 30
# 2 Jul 23, 24
# 1 Aug 2, 7, 9
# 5 Sep 8, 9

This is what I ended up doing:
stn[1, 1] <- toString(temp_warnings$row)                                # all row numbers
stn[2, 1] <- toString(subset(temp_warnings, row <= 31)$day)             # May
stn[3, 1] <- toString(subset(temp_warnings, 31 < row & row <= 61)$day)  # June
stn[4, 1] <- toString(subset(temp_warnings, 61 < row & row <= 92)$day)  # July
stn[5, 1] <- toString(subset(temp_warnings, 92 < row & row <= 123)$day) # August
stn[6, 1] <- toString(subset(temp_warnings, 123 < row)$day)             # September
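For these assignments to work, stn has to exist first. A minimal sketch, assuming a one-column character matrix with one row per label:
stn <- matrix(NA_character_, nrow = 6, ncol = 1,
              dimnames = list(c('Tmax >= 30 & Tmin >= 19', month.name[5:9]), '2001'))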

Related

How to merge a single measurement into a dataframe of multiple measurements in R

I have a long dataframe of multiple measurements per ID, at different time points for variables BP1 and BP2.
ID <- c(1,1,1,2,2,2,3,3,4)
Time <- c(56,57,58,61,62,64,66,67,72)
BP1 <- c(70,73,73,74,75,76,74,74,70)
BP2 <- c(122,122,123,126,124,121,130,132,140)
df1 <- data.frame(ID, Time, BP1, BP2)
I would like to merge another dataframe (df2), which contains a single measurement for BP1 and BP2 per ID.
ID <- c(1,2,3,4)
Time <- c(55, 60, 65, 70)
BP1 <- c(70, 72, 73, 74)
BP2 <- c(120, 124, 130, 134)
df2 <- data.frame(ID, Time, BP1, BP2)
How do I combine these dataframes so that the Time variable is in order within each ID, as in the result shown below?
Any help greatly appreciated, thank you!
In base R, use rbind() to combine and order() to sort, then clean up the rownames:
df3 <- rbind(df1, df2)
df3 <- df3[order(df3$ID, df3$Time), ]
rownames(df3) <- seq(nrow(df3))
df3
Or, using dplyr:
library(dplyr)
bind_rows(df1, df2) %>%
  arrange(ID, Time)
Result from either approach:
ID Time BP1 BP2
1 1 55 70 120
2 1 56 70 122
3 1 57 73 122
4 1 58 73 123
5 2 60 72 124
6 2 61 74 126
7 2 62 75 124
8 2 64 76 121
9 3 65 73 130
10 3 66 74 130
11 3 67 74 132
12 4 70 74 134
13 4 72 70 140

Calculate second highest cumulative value by group

I have data with a grouping variable 'grps' and a value 'x'. I have calculated the cumulative maximum within each group, 'cmx'. Now I need to find the second-highest cumulative value of 'x' within each group, 'scmx'.
Some data, including the desired column scmx:
library(data.table)
d = structure(list(date = structure(rep(c(18690, 18691, 18692, 18693, 18694, 18695, 18696, 18697), 2), class = "Date"),
x = c(18, 70, 57, 94, 94, 13, 98, 23, 20, 72, 59, 96, 96, 15, 100, 25),
grps = c(rep("g1", 8), rep("g2", 8))),
row.names = c(NA, -16L), class = c("data.table", "data.frame"))
d[, cmx := cummax(x), by = .(grps)]
d[, scmx := c(18, 18, 57, 70, 70, 70, 94, 94, 20, 20, 59, 72, 72, 72, 96, 96)]
Context
If x corresponds to a performance rating, what I am trying to do is locate the date when they achieved their best performance and their second best. A similar question of mine where I needed to locate the row which corresponded to the highest cumulative value in a column:
Fill down first row within each cumulative max, with a twist
A data.table alternative:
d[ , scmx2 := {
  c(x[1], sapply(seq(.N)[-1], function(i){
    v = x[1:i]
    v[frank(-v, ties.method = "dense") == 2][1]
  }))
}, by = grps]
# date x grps cmx scmx scmx2
# 1: 2021-03-04 18 g1 18 18 18
# 2: 2021-03-05 70 g1 70 18 18
# 3: 2021-03-06 57 g1 70 57 57
# 4: 2021-03-07 94 g1 94 70 70
# 5: 2021-03-08 94 g1 94 70 70
# 6: 2021-03-09 13 g1 94 70 70
# 7: 2021-03-10 98 g1 98 94 94
# 8: 2021-03-11 23 g1 98 94 94
# 9: 2021-03-04 20 g2 20 20 20
# 10: 2021-03-05 72 g2 72 20 20
# 11: 2021-03-06 59 g2 72 59 59
# 12: 2021-03-07 96 g2 96 72 72
# 13: 2021-03-08 96 g2 96 72 72
# 14: 2021-03-09 15 g2 96 72 72
# 15: 2021-03-10 100 g2 100 96 96
# 16: 2021-03-11 25 g2 100 96 96
Within each group (by = grps), loop (sapply) over a sequence from 2 to the number of rows in the current group (seq(.N)[-1]). In each step, subset 'x' from the start of the vector up to index 'i' (v = x[1:i]).
Calculate the dense rank and check if the rank is 2 (frank(-v, ties.method = "dense") == 2), i.e. the rank of the second-largest number. Use the logical indices to subset 'v' (v[...]). Select the first match ([1]; in case of several values with rank 2). Concatenate the result from this 'expanding window' with the first element of 'x' (c(x[1], ...)).
In the first window, with only one value, there is clearly no second-highest value. Here the OP has chosen to return the first value. The same choice needs to be made for longer windows where all values are equal, which will occur when there are leading runs of equal values. If we would rather return NA than the first value, then replace the x[1] in the line
c(x[1], sapply(seq(.N)[-1], function(i){
...with NA_real_.
Small demo:
d = data.table(grps = c(1, 1, 2, 2, 2), x = c(3, 3, 4, 4, 5))
d[ , scmx2 := {
  c(NA_real_, sapply(seq(.N)[-1], function(i){
    v = x[1:i]
    v[frank(-v, ties.method = "dense") == 2][1]
  }))
}, by = grps]
# grps x scmx2
# 1: 1 3 NA # grp 1: all values equal in all windows -> all NA
# 2: 1 3 NA
# 3: 2 4 NA
# 4: 2 4 NA
# 5: 2 5 4 # grp 2: only the last window has a second highest value
This question is indeed similar to the post I linked to above (Finding cumulative second max per group in R). However, here the OP asked for a data.table solution.
Here is another option using non-equi join:
d[, s2 := .SD[.SD, on=.(grps, date<=date, x<cmx), by=.EACHI, max(x.x)]$V1]
d[is.na(s2), s2 := x][]
output:
date x grps cmx scmx s2
1: 2021-03-04 18 g1 18 18 18
2: 2021-03-05 70 g1 70 18 18
3: 2021-03-06 57 g1 70 57 57
4: 2021-03-07 94 g1 94 70 70
5: 2021-03-08 94 g1 94 70 70
6: 2021-03-09 13 g1 94 70 70
7: 2021-03-10 98 g1 98 94 94
8: 2021-03-11 23 g1 98 94 94
9: 2021-03-04 20 g2 20 20 20
10: 2021-03-05 72 g2 72 20 20
11: 2021-03-06 59 g2 72 59 59
12: 2021-03-07 96 g2 96 72 72
13: 2021-03-08 96 g2 96 72 72
14: 2021-03-09 15 g2 96 72 72
15: 2021-03-10 100 g2 100 96 96
16: 2021-03-11 25 g2 100 96 96
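For readers less familiar with non-equi joins, here is the same statement re-spaced with comments (the logic is unchanged): for each row, the join collects all rows in the same group on or before that row's date whose x lies strictly below that row's running maximum cmx, and max(x.x) takes the largest of them, i.e. the second-highest distinct value so far.
d[, s2 := .SD[.SD,
              on = .(grps, date <= date, x < cmx), # non-equi join: same group,
                                                   # earlier-or-same date, x below cmx
              by = .EACHI,                         # aggregate once per joined row
              max(x.x)                             # largest qualifying x (x. prefix =
                                                   # the table being joined to)
              ]$V1]
d[is.na(s2), s2 := x][]  # no qualifying row (e.g. a group's leading run): fall back to x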
Create a sequence along the column x. For each position, apply a function to x from index 1 up to that position, keeping only the unique values; Rfast::nth can then take the 2nd-highest number in each window. (In the first window there is only one value, so there is no 2nd-highest; the near-zero garbage value in the first slot of the output below reflects that.)
library(Rfast)
sapply(seq_along(d$x), function(i) {
  nth(unique(d$x[1:i]), 2, descending = TRUE)
})
[1] 2.652495e-315 1.800000e+01 5.700000e+01 7.000000e+01
[5] 7.000000e+01 7.000000e+01 9.400000e+01 9.400000e+01
To do it by group for the new data frame, we can still use the idea above: arrange the data frame so that each group's values are in their own column, then use lapply with rollapplyr to capture the 2nd-largest unique value.
library(dplyr)
library(tidyr)  # pivot_wider
library(zoo)    # rollapplyr
d1 <- d %>% select(-cmx) %>%
  pivot_wider(names_from = grps, values_from = x)
lapply(d1[-1], function(col) {
  rollapplyr(col, seq_along(col),
             function(w) nth(unique(w), 2, descending = TRUE))
})

Average over rows pairs and paste the value based on condition

In R, I have a df such as:
a b c
1 124 70 aa
2 129 67 aa
3 139 71 aa
4 125 77 aa
5 125 82 aa
6 121 69 aa
7 135 68 bb
8 137 72 bb
9 137 78 bb
10 140 86 bb
I want to iterate along rows within columns (a, b), computing the mean of each consecutive row pair, and paste this mean into the same two rows of new columns (a_new, b_new) if the difference between the two rows is >= 12. Otherwise, just copy the old value. This behaviour should be restricted to groups as marked by another column (c), i.e. it should not happen when two rows are from different groups.
In this example, it happens in row 3 (because in column a the difference with the next (4th) row is 14) and in row 5 (because in column b the difference with the next row is 13). However, this should not happen with row 6, because row 7 is in another c group.
Thus, resulting df would look like:
a b c a_new b_new
1 124 70 aa 124 70
2 129 67 aa 129 67
3 139 71 aa 132 71
4 125 77 aa 132 77
5 125 82 aa 125 75.5
6 121 69 aa 121 75.5
7 135 68 bb 135 68
8 137 72 bb 137 72
9 137 78 bb 137 78
10 140 86 bb 140 86
I've been struggling with this for a while; I figured that perhaps the lag function could be used, but had no success. Help would be much appreciated (be it base R, dplyr, or whatever).
Dput:
structure(list(a = c(124, 129, 139, 125, 125, 121, 135, 137,
137, 140), b = c(70, 67, 71, 77, 82, 69, 68, 72, 78, 86), c = c("aa",
"aa", "aa", "aa", "aa", "aa", "bb", "bb", "bb", "bb")), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
We can write a function which works for one chunk.
apply_fun <- function(x) {
  inds <- which(abs(diff(x)) >= 12)
  if (length(inds))
    x[sort(c(inds, inds + 1))] <- c(sapply(inds, function(i)
      rep(mean(x[c(i, i + 1)]), 2)))
  return(x)
}
and then apply it for multiple columns by group.
library(dplyr)
df %>% group_by(c) %>% mutate_at(vars(a, b), list(new = apply_fun))
# a b c a_new b_new
# <dbl> <dbl> <chr> <dbl> <dbl>
# 1 124 70 aa 124 70
# 2 129 67 aa 129 67
# 3 139 71 aa 132 71
# 4 125 77 aa 132 77
# 5 125 82 aa 125 75.5
# 6 121 69 aa 121 75.5
# 7 135 68 bb 135 68
# 8 137 72 bb 137 72
# 9 137 78 bb 137 78
#10 140 86 bb 140 86
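As an aside, in dplyr 1.0+ mutate_at() is superseded by across(); the same call can be written as follows (a sketch using the apply_fun defined above):
df %>%
  group_by(c) %>%
  mutate(across(c(a, b), apply_fun, .names = "{.col}_new"))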
What I understood is to apply to each group given by the indicator column "c" the procedure commented in the code below:
pairAverage <- function(x) {
  # x should be a numeric vector of length > 1
  if (is.vector(x) && is.numeric(x) && length(x) > 1) {
    # copy data to an aux vector
    aux <- x
    # get differences of lag 1
    dh <- diff(x, 1)
    # get means of consecutive pairs
    med <- x[2:length(x)] - dh/2
    # get positions (index) of pairs whose absolute difference is >= 12
    idx <- which(abs(dh) >= 12)
    # need 2 reps of each mean to replace consecutive values of x
    valToRepl <- med[sort(rep(idx, 2))]
    # ordered index pairs of consecutive elements of x to be replaced
    idxToRepl <- sort(c(idx, idx + 1))
    # replace pairs of values
    aux[idxToRepl] <- valToRepl
    return(aux)
  } else {
    # do nothing
    warning("parameter x should be a numeric vector of length > 1")
    return(NULL)
  }
}
pairAverageByGroups <- function(x, gr) {
  if (is.vector(x) && is.numeric(x) && length(x) == length(gr)) {
    x.ls <- split(x, as.factor(gr))
    output <- unlist(lapply(x.ls, pairAverage))
    names(output) <- NULL
    output
  } else {
    # do nothing
    warning("parameter x should be a numeric vector of length > 1")
    return(NULL)
  }
}
pairAverageByGroups(dd$a, dd$c)
[1] 124 129 132 132 125 121 135 137 137 140
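To fill both new columns at once, the function can be mapped over the columns; a sketch (assuming dd is the dput data frame above, and noting that split()/unlist() reassembles by factor level, so the rows should already be ordered by group, as they are here):
dd[c('a_new', 'b_new')] <- lapply(dd[c('a', 'b')], pairAverageByGroups, gr = dd$c)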

Regroup lines of a data frame for which a column value is inferior to x

I have this data frame :
> df
Z freq proba
1 17 1 0.0033289263
2 18 4 0.0055569026
3 19 2 0.0087878028
4 20 3 0.0132023556
5 21 16 0.0188900561
6 22 12 0.0257995234
7 23 30 0.0337042731
8 24 41 0.0421963455
9 25 56 0.0507149437
10 26 65 0.0586089198
11 27 65 0.0652230449
12 28 93 0.0699913154
13 29 82 0.0725182432
14 30 94 0.0726318551
15 31 72 0.0703990113
16 32 74 0.0661024717
17 33 58 0.0601873020
18 34 66 0.0531896431
19 35 38 0.0456625487
20 36 45 0.0381117389
21 37 27 0.0309498221
22 38 17 0.0244723502
23 39 15 0.0188543771
24 40 13 0.0141629367
25 41 4 0.0103793600
26 42 1 0.0074254435
27 43 2 0.0051886582
28 45 1 0.0023658767
29 46 1 0.0015453804
30 49 2 0.0003792308
# Here is my data:
> dput(df)
structure(list(Z = c(17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
43, 45, 46, 49), freq = c(1, 4, 2, 3, 16, 12, 30, 41, 56, 65,
65, 93, 82, 94, 72, 74, 58, 66, 38, 45, 27, 17, 15, 13, 4, 1,
2, 1, 1, 2), proba = c(0.0033289262662263, 0.00555690264007235,
0.00878780282243439, 0.0132023555702843, 0.0188900560866825,
0.0257995234198431, 0.0337042730520012, 0.0421963455163949, 0.0507149437492447,
0.0586089198012906, 0.0652230449359029, 0.0699913153996099, 0.0725182432348992,
0.0726318551493006, 0.0703990113442269, 0.0661024716831246, 0.0601873020200862,
0.0531896430528685, 0.045662548708844, 0.0381117389181843, 0.030949822142559,
0.0244723501557229, 0.01885437705459, 0.0141629366839816, 0.0103793599644779,
0.00742544354411115, 0.00518865818999788, 0.00236587669133322,
0.00154538036835848, 0.000379230768851682)), .Names = c("Z",
"freq", "proba"), row.names = c(NA, -30L), class = "data.frame")
And I want to regroup lines for which the "freq" value is < 5 with the next line, and keep doing so while the next line is also < 5.
In case I'm not being clear enough, this is the output I expect:
> df2
labels effectifs pi
1 17;20 10 0.03087599
2 21 16 0.01889006
3 22 12 0.02579952
4 23 30 0.03370427
5 24 41 0.04219635
6 25 56 0.05071494
7 26 65 0.05860892
8 27 65 0.06522304
9 28 93 0.06999132
10 29 82 0.07251824
11 30 94 0.07263186
12 31 72 0.07039901
13 32 74 0.06610247
14 33 58 0.06018730
15 34 66 0.05318964
16 35 38 0.04566255
17 36 45 0.03811174
18 37 27 0.03094982
19 38 17 0.02447235
20 39 15 0.01885438
21 40 13 0.01416294
22 41;49 11 0.02728395
I did it with nested while loops, but I find this solution very painful and unoptimized.
i <- 1
freqs <- c()
labels <- c()
pi <- c()
while (i < nrow(df)) {
  if (df$freq[i] >= 5) {
    freqs <- c(freqs, df$freq[i])
    labels <- c(labels, df$Z[i])
    pi <- c(pi, df$proba[i])
    i <- i + 1
  } else {
    count <- df$freq[i]
    countPi <- df$proba[i]
    k <- i
    j <- i
    while (df$freq[i] < 5 & i < nrow(df)) {
      if (df$freq[i+1] < 5) {
        count <- count + df$freq[i+1]
        countPi <- countPi + df$proba[i+1]
        j <- i + 1
      }
      i <- i + 1
    }
    labels <- c(labels, paste0(df$Z[k], ";", df$Z[j]))
    freqs <- c(freqs, count)
    pi <- c(pi, countPi)
  }
}
df2 <- data.frame(labels, freqs, pi)
I'm sure there is something far better, maybe with dplyr. If you have a better solution, please share. Thanks!
We could use the "devel" version of "data.table", as new functions have been introduced (rleid, which has since made it into the CRAN releases). Here, we convert the "data.frame" to "data.table" (setDT(df)) and create a grouping variable ("gr") based on the logical index (freq < 5) using rleid. The 'Z' column is of 'numeric/integer' class, so we create a character column ("Z1") from "Z". Grouped by 'gr', if "freq" is less than 5 for all the elements of that group, we summarise the rows to a single row by taking the first observation of the columns (.SD[1L]), remove the unwanted columns (as .SD includes "Z1", which would result in duplicate columns), and append the "Z1" that we get from pasting the min and max value of "Z" for that group. Otherwise, leave the group unchanged (else .SD). Finally, remove the columns that we don't need by assigning them to "NULL".
library(data.table) #data.table_1.9.5
res <- setDT(df)[, gr := rleid(freq < 5)][, Z1 := as.character(Z)][,
         if (all(freq < 5)) c(.SD[1L][, -4, with = FALSE],
                              list(Z1 = toString(c(min(Z), max(Z)))))
         else .SD, gr][, 1:2 := NULL][]
head(res,3)
# freq proba Z1
#1: 1 0.003328926 17, 20
#2: 16 0.018890056 21
#3: 12 0.025799523 22
Since this is a dplyr question, here is a dplyr solution. First I use a grouping function to define the groups (similar to the rleid function in data.table). Then the summary is fairly simple.
# grouping function
grouping <- function(condition){
  # calculate runs for grouping
  run <- rle((!condition) * 1:length(condition))
  # revalue
  run$values <- seq_along(run$values)
  # invert to get grouping
  inverse.rle(run)
}
# load dplyr
require(dplyr)
df %>%
  mutate(group = grouping(freq < 5)) %>%        # add groups
  group_by(group) %>%                           # group data
  summarize(freq = sum(freq),                   # sum freq
            proba = sum(proba),                 # sum proba
            Z = toString(unique(range(Z)))) %>% # label from the Z range
  mutate(group = NULL)                          # remove groups
## Source: local data table [22 x 3]
##
## freq proba Z
## 1 10 0.03087599 17, 20
## 2 16 0.01889006 21
## 3 12 0.02579952 22
## 4 30 0.03370427 23
## 5 41 0.04219635 24
## 6 56 0.05071494 25
## 7 65 0.05860892 26
## 8 65 0.06522304 27
## 9 93 0.06999132 28
## 10 82 0.07251824 29
## .. ... ... ...
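To see what grouping() produces on this data, a quick check (each run of freq < 5 collapses into a single group; every other row gets its own group):
grouping(df$freq < 5)
# [1]  1  1  1  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21
# [25] 22 22 22 22 22 22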

R equivalent of Stata's for-loop over local macro list of stubnames

I'm a Stata user transitioning to R, and there's one Stata crutch that I find hard to give up, because I don't know how to do the equivalent with R's "apply" functions.
In Stata, I often generate a local macro list of stubnames and then loop over that list, calling on variables whose names are built off of those stubnames.
For a simple example, imagine that I have the following dataset:
study_id year varX06 varX07 varX08 varY06 varY07 varY08
1 6 50 40 30 20.5 19.8 17.4
1 7 50 40 30 20.5 19.8 17.4
1 8 50 40 30 20.5 19.8 17.4
2 6 60 55 44 25.1 25.2 25.3
2 7 60 55 44 25.1 25.2 25.3
2 8 60 55 44 25.1 25.2 25.3
and so on...
I want to generate two new variables, varX and varY that take on the values of varX06 and varY06 respectively when year is 6, varX07 and varY07 respectively when year is 7, and varX08 and varY08 respectively when year is 8.
The final dataset should look like this:
study_id year varX06 varX07 varX08 varY06 varY07 varY08 varX varY
1 6 50 40 30 20.5 19.8 17.4 50 20.5
1 7 50 40 30 20.5 19.8 17.4 40 19.8
1 8 50 40 30 20.5 19.8 17.4 30 17.4
2 6 60 55 44 25.1 25.2 25.3 60 25.1
2 7 60 55 44 25.1 25.2 25.3 55 25.2
2 8 60 55 44 25.1 25.2 25.3 44 25.3
and so on...
To clarify, I know that I can do this with melt and reshape commands - essentially converting this data from wide to long format, but I don't want to resort to that. That's not the intent of my question.
My question is about how to loop over a local macro list of stubnames in R and I'm just using this simple example to illustrate a more generic dilemma.
In Stata, I could generate a local macro list of stubnames:
local stub varX varY
And then loop over the macro list. I can generate a new variable varX or varY and replace the new variable value with the value of varX06 or varY06 (respectively) if year is 6 and so on.
foreach i of local stub {
display "`i'"
gen `i'=.
replace `i'=`i'06 if year==6
replace `i'=`i'07 if year==7
replace `i'=`i'08 if year==8
}
The last section is the one that I find hardest to replicate in R. When I write `i'06, Stata takes the string "varX", concatenates it with the string "06", and then returns the value of the variable varX06. Additionally, when I write `i', Stata returns the string "varX" and not the literal string "`i'".
How do I do these things with R?
I've searched through Muenchen's "R for Stata Users", googled the web, and searched through previous posts here at StackOverflow but haven't been able to find an R solution.
I apologize if this question is elementary. If it's been answered before, please direct me to the response.
Thanks in advance,
Tara
Well, here's one way. Columns in R data frames can be accessed using their character names, so this will work:
# create sample dataset
set.seed(1) # for reproducible example
df <- data.frame(year = as.factor(rep(6:8, each = 100)), # categorical variable
                 varX06 = rnorm(300), varX07 = rnorm(300), varX08 = rnorm(100),
                 varY06 = rnorm(300), varY07 = rnorm(300), varY08 = rnorm(100))
# you start here...
years <- unique(df$year)
df$varX <- unlist(lapply(years,function(yr)df[df$year==yr,paste0("varX0",yr)]))
df$varY <- unlist(lapply(years,function(yr)df[df$year==yr,paste0("varY0",yr)]))
print(head(df),digits=4)
# year varX06 varX07 varX08 varY06 varY07 varY08 varX varY
# 1 6 -0.6265 0.8937 -0.3411 -0.70757 1.1350 0.3412 -0.6265 -0.70757
# 2 6 0.1836 -1.0473 1.5024 1.97157 1.1119 1.3162 0.1836 1.97157
# 3 6 -0.8356 1.9713 0.5283 -0.09000 -0.8708 -0.9598 -0.8356 -0.09000
# 4 6 1.5953 -0.3836 0.5422 -0.01402 0.2107 -1.2056 1.5953 -0.01402
# 5 6 0.3295 1.6541 -0.1367 -1.12346 0.0694 1.5676 0.3295 -1.12346
# 6 6 -0.8205 1.5122 -1.1367 -1.34413 -1.6626 0.2253 -0.8205 -1.34413
For a given yr, the anonymous function extracts the rows with that yr and the column named "varX0" + yr (the result of paste0(...)). Then lapply(...) "applies" this function for each year, and unlist(...) converts the returned list into a vector.
Maybe a more transparent way:
sub <- c("varX", "varY")
for (i in sub) {
  df[[i]] <- NA
  df[[i]] <- ifelse(df[["year"]] == 6, df[[paste0(i, "06")]], df[[i]])
  df[[i]] <- ifelse(df[["year"]] == 7, df[[paste0(i, "07")]], df[[i]])
  df[[i]] <- ifelse(df[["year"]] == 8, df[[paste0(i, "08")]], df[[i]])
}
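The three ifelse calls can also be collapsed with matrix indexing, building each row's source column name from the stub and the year. A sketch under the same assumptions as above (every stub/year combination exists as a column; this variant is not from the original answer):
for (i in sub) {
  cols <- paste0(i, "0", df$year)  # per-row column name, e.g. "varX06"
  df[[i]] <- df[cbind(seq_len(nrow(df)), match(cols, names(df)))]
}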
This method reorders your data, but involves a one-liner, which may or may not be better for you (assume d is your dataframe):
> do.call(rbind, by(d, d$year, function(x) { within(x, { varX <- x[, paste0('varX0',x$year[1])]; varY <- x[, paste0('varY0',x$year[1])] }) } ))
study_id year varX06 varX07 varX08 varY06 varY07 varY08 varY varX
6.1 1 6 50 40 30 20.5 19.8 17.4 20.5 50
6.4 2 6 60 55 44 25.1 25.2 25.3 25.1 60
7.2 1 7 50 40 30 20.5 19.8 17.4 19.8 40
7.5 2 7 60 55 44 25.1 25.2 25.3 25.2 55
8.3 1 8 50 40 30 20.5 19.8 17.4 17.4 30
8.6 2 8 60 55 44 25.1 25.2 25.3 25.3 44
Essentially, it splits the data based on year, then uses within to create the varX and varY variables within each subset, and then rbind's the subsets back together.
A direct translation of your Stata code, however, would be something like the following (looping over the stubs, as the Stata foreach does):
for (i in c("varX", "varY")) {
  d[[i]] <- ifelse(d$year == 6, d[[paste0(i, "06")]],
            ifelse(d$year == 7, d[[paste0(i, "07")]],
            ifelse(d$year == 8, d[[paste0(i, "08")]], NA)))
}
Here's another option.
Create a 'column selection matrix' based on year, then use that to grab the values you want from any block of columns.
# indexing matrix based on the 'year' column
col_select_mat <-
  t(sapply(your_df$year, function(x) unique(your_df$year) == x))
# make selections from col groups by stub name
sapply(c('varX', 'varY'),
       function(x) your_df[, grep(x, names(your_df))][col_select_mat])
This gives the desired result (which you can cbind to your_df if you like)
varX varY
[1,] 50 20.5
[2,] 60 25.1
[3,] 40 19.8
[4,] 55 25.2
[5,] 30 17.4
[6,] 44 25.3
OP's dataset:
your_df <- read.table(header=T, text=
'study_id year varX06 varX07 varX08 varY06 varY07 varY08
1 6 50 40 30 20.5 19.8 17.4
1 7 50 40 30 20.5 19.8 17.4
1 8 50 40 30 20.5 19.8 17.4
2 6 60 55 44 25.1 25.2 25.3
2 7 60 55 44 25.1 25.2 25.3
2 8 60 55 44 25.1 25.2 25.3')
Benchmarking: to compare the three posted solutions, wrap each one in a function and time the calls.
df <- your_df
d <- your_df
arvi1000 <- function() {
  col_select_mat <- t(sapply(your_df$year, function(x) unique(your_df$year) == x))
  # make selections from col groups by stub name
  cbind(your_df,
        sapply(c('varX', 'varY'),
               function(x) your_df[, grep(x, names(your_df))][col_select_mat]))
}
jlhoward <- function() {
  years <- unique(df$year)
  df$varX <- unlist(lapply(years, function(yr) df[df$year == yr, paste0("varX0", yr)]))
  df$varY <- unlist(lapply(years, function(yr) df[df$year == yr, paste0("varY0", yr)]))
}
Thomas <- function() {
  do.call(rbind, by(d, d$year, function(x) {
    within(x, {
      varX <- x[, paste0('varX0', x$year[1])]
      varY <- x[, paste0('varY0', x$year[1])]
    })
  }))
}
library(microbenchmark)
# the functions must be called with parentheses; passing the bare names
# (microbenchmark(arvi1000, jlhoward, Thomas)) would only time the symbol
# lookup, not the actual computation
microbenchmark(arvi1000(), jlhoward(), Thomas())
