R: expand data frame columnwise with shifted rows of data

- Example Data to work with:
To create a reduced example, this is the output of dput(df):
df <- structure(list(SubjectID = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L), .Label = c("1", "2", "3"), class = "factor"), EventNumber = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor"),
EventType = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("A", "B"), class = "factor"), Param1 = c(0.3,
0.21, 0.87, 0.78, 0.9, 1.2, 1.4, 1.3, 0.6, 0.45, 0.45, 0.04,
0, 0.1, 0.03, 0.01, 0.09, 0.06, 0.08, 0.09, 0.03, 0.04, 0.04,
0.02), Param2 = c(45, 38, 76, 32, 67, 23, 27, 784, 623, 54,
54, 1056, 487, 341, 671, 859, 7769, 2219, 4277, 4060, 411,
440, 224, 57), Param3 = c(1.5, 1.7, 1.65, 1.32, 0.6, 0.3,
2.5, 0.4, 1.4, 0.67, 0.67, 0.32, 0.1, 0.15, 0.22, 0.29, 0.3,
0.2, 0.8, 1, 0.9, 0.8, 0.3, 0.1), Param4 = c(0.14, 0, 1,
0.86, 0, 0.6, 1, 1, 0.18, 0, 0, 0.39, 0, 1, 0.29, 0.07, 0.33,
0.53, 0.29, 0.23, 0.84, 0.61, 0.57, 0.59), Param5 = c(0.18,
0, 1, 0, 1, 0, 0.09, 1, 0.78, 0, 0, 1, 0.2, 0, 0.46, 0.72,
0.16, 0.22, 0.77, 0.52, 0.2, 0.68, 0.58, 0.17), Param6 = c(0,
1, 0.75, 0, 0.14, 0, 1, 0, 1, 0.27, 0, 1, 0, 0.23, 0.55,
0.86, 1, 0.33, 1, 1, 0.88, 0.75, 0, 0), AbsoluteTime = structure(c(1522533600,
1522533602, 1522533604, 1522533604, 1525125600, 1525125602,
1525125604, 1519254000, 1519254002, 1519254004, 1519254006,
1521759600, 1521759602, 1521759604, 1521759606, 1521759608,
1517353224, 1517353226, 1517353228, 1517353230, 1517439600,
1517439602, 1517439604, 1517439606), class = c("POSIXct",
"POSIXt"), tzone = "")), row.names = c(NA, -24L), class = "data.frame")
df
The real data has 20 subjects, EventNumbers ranging from 1 to 100, and parameters from Param1 up to Param40 (depending on the experiment).
There are around 60,000 observations (rows).
- What I want to achieve:
For df, create n * 40 new columns (40, or whatever number of parameters is chosen later).
Think of n as "steps into the future".
Name the 40 * n newly created columns:
Param1_2, Param2_2, Param3_2, ..., Param39_2, Param40_2, ...,
Param1_3, Param2_3, Param3_3, ..., Param39_3, Param40_3, ...,
...,
Param1_n, Param2_n, Param3_n, ..., Param39_n, Param40_n
Resulting in columns
Param1_1, Param2_1, Param1_2, Param2_2, Param1_3, Param2_3, Param1_4, Param2_4, ... Param1_n, Param2_n
So every observation of subset df[X, c(4:9)] will get an additional set of variables with values from df[X+1, c(4:9)] to df[X+n, c(4:9)].
This is what the new df.extended should look like for n = 1:
df.extended <- structure(list(SubjectID = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), EventNumber = c(1, 1,
1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2,
2), EventType = c("A", "A", "A", "A", "B", "B", "B", "A", "A",
"A", "A", "B", "B", "B", "B", "B", "A", "A", "A", "A", "B", "B",
"B", "B"), Param1 = c(0.3, 0.21, 0.87, 0.78, 0.9, 1.2, 1.4, 1.3,
0.6, 0.45, 0.45, 0.04, 0, 0.1, 0.03, 0.01, 0.05, 0.07, 0.06,
0.01, 0.01, 0.01, 0.07, 0.04), Param2 = c(45, 38, 76, 32, 67,
23, 27, 784, 623, 54, 54, 1056, 487, 341, 671, 859, 1858, 640,
8181, 220, 99, 86, 170, 495), Param3 = c(1.5, 1.7, 1.65, 1.32,
0.6, 0.3, 2.5, 0.4, 1.4, 0.67, 0.67, 0.32, 0.1, 0.15, 0.22, 0.29,
1.5, 0.9, 0.8, 0.9, 0.1, 0, 0.8, 0.1), Param4 = c(0.14, 0, 1,
0.86, 0, 0.6, 1, 1, 0.18, 0, 0, 0.39, 0, 1, 0.29, 0.07, 0.64,
0.11, 0.12, 0.32, 0.55, 0.67, 0.83, 0.82), Param5 = c(0.18, 0,
1, 0, 1, 0, 0.09, 1, 0.78, 0, 0, 1, 0.2, 0, 0.46, 0.72, 0.27,
0.14, 0.7, 0.67, 0.23, 0.44, 0.61, 0.76), Param6 = c(0, 1, 0.75,
0, 0.14, 0, 1, 0, 1, 0.27, 0, 1, 0, 0.23, 0.55, 0.86, 1, 0.56,
0.45, 0.5, 0, 0, 0.89, 0.11), AbsoluteTime = c("2018-04-01 00:00:00",
"2018-04-01 00:00:02", "2018-04-01 00:00:04", "2018-04-01 00:00:04",
"2018-05-01 00:00:00", "2018-05-01 00:00:02", "2018-05-01 00:00:04",
"2018-02-22 00:00:00", "2018-02-22 00:00:02", "2018-02-22 00:00:04",
"2018-02-22 00:00:06", "2018-03-23 00:00:00", "2018-03-23 00:00:02",
"2018-03-23 00:00:04", "2018-03-23 00:00:06", "2018-03-23 00:00:08",
"2018-01-31 00:00:24", "2018-01-31 00:00:26", "2018-01-31 00:00:28",
"2018-01-31 00:00:30", "2018-02-01 00:00:00", "2018-02-01 00:00:02",
"2018-02-01 00:00:04", "2018-02-01 00:00:06"), Param1_2 = c(0.21,
0.87, 0.78, NA, 1.2, 1.4, NA, 0.6, 0.45, 0.45, NA, 0, 0.1, 0.03,
0.01, NA, 0.07, 0.07, 0.08, NA, 0.09, 0.06, 0.01, NA), Param2_2 = c(38,
76, 32, NA, 23, 27, NA, 623, 54, 54, NA, 487, 341, 671, 859,
NA, 6941, 4467, 808, NA, 143, 301, 219, NA), Param3_2 = c(1.7,
1.65, 1.32, NA, 0.3, 2.5, NA, 1.4, 0.67, 0.67, NA, 0.1, 0.15,
0.22, 0.29, NA, 1, 1, 0.1, NA, 0.5, 1, 0.3, NA), Param4_2 = c(0,
1, 0.86, NA, 0.6, 1, NA, 0.18, 0, 0, NA, 0, 1, 0.29, 0.07, NA,
0.31, 0.16, 0.68, NA, 0.86, 0.47, 0.47, NA), Param5_2 = c(0,
1, 0, NA, 0, 0.09, NA, 0.78, 0, 0, NA, 0.2, 0, 0.46, 0.72, NA,
0.29, 0.26, 0.1, NA, 0.88, 0.86, 0.95, NA), Param6_2 = c(1, 0,
0, NA, 0, 1, NA, 1, 0.27, 0, NA, 0, 0.23, 0.55, 0.86, NA, 0.68,
0.66, 0, NA, 0.44, 1, 0.22, NA)), row.names = c(NA, 24L), class = "data.frame")
df.extended
How can this be solved without loops or writing column indexes by hand? Should I write a function around Trial 2 and use doBy?
My thoughts and what I have done so far to solve this:
Trial 1:
Cycle through the SubjectIDs in a for-loop
In an inner for-loop, cycle through the EventNumber
In another inner for-loop, cycle through the rows
Get the first row by grabbing df[1, ] and save it into df.temp
Merge df.temp with df[2, parameters]
Merge df.temp with df[3, parameters], and so on
Save all resulting df.temps into df.final
Problems I ran into at step 5:
df.temp <- df[1,]
df.temp <- merge(df.temp, df[2, !(colnames(df) == "AbsoluteTime")], by = c("SubjectID", "EventNumber", "EventType"))
df.temp <- merge(df.temp, df[3, !(colnames(df) == "AbsoluteTime")], by = c("SubjectID", "EventNumber", "EventType"))
df.temp <- merge(df.temp, df[4, !(colnames(df) == "AbsoluteTime")], by = c("SubjectID", "EventNumber", "EventType"))
Warning:
In merge.data.frame(df.temp, df[4, ], by = c("SubjectID", "EventNumber", :
column names ‘Param1.x’, ‘Param2.x’, ‘Param3.x’, ‘Param4.x’, ‘Param5.x’, ‘Param6.x’, ‘AbsoluteTime.x’, ‘Param1.y’, ‘Param2.y’,
‘Param3.y’, ‘Param4.y’, ‘Param5.y’, ‘Param6.y’, ‘AbsoluteTime.y’ are
duplicated in the result.
The column names are repeated, see the warning.
I cannot figure out how to easily create the column names / rename the new columns based on a given column name and a variable.
There must be a better way than this:
n <- 3
names_vector <- c()
for (n in seq(from = 1, to = n)) {
  for (i in names(df[4:9])) {
    names_vector <- c(names_vector, paste0(i, "_", n + 1))
  }
}
names(df.temp)[c(4:9)] <- parameters
names(df.temp)[c(11:ncol(df.temp))] <- names_vector
names(df.temp)
Also, how do I prevent the last n-1 rows from breaking the script? This is a lot of work to do by hand, and it seems quite error-prone.
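One way to build that vector of names without the nested loops is sketched below; it assumes the parameter columns sit in positions 4:9 and that n is the number of steps into the future:
# minimal sketch: build "Param1_2", ..., "Param6_2", "Param1_3", ..., "Param6_4" in one call
n <- 3
params <- names(df)[4:9]
names_vector <- paste0(rep(params, times = n), "_",
                       rep(2:(n + 1), each = length(params)))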
Trial 2:
Cycle through the SubjectIDs in a for-loop
In an inner for-loop, cycle through the EventNumber
Get all rows of parameters into a new data frame except the first row
Append a row with NAs
use cbind() to bind the shifted columns onto df.temp
Repeat n times.
This is the code for one SubjectID and one EventNumber:
df.temp <- df[which(df$SubjectID == "1" & df$EventNumber == "1"), ]
df.temp2 <- df.temp[2:nrow(df.temp), parameters]
df.temp2 <- rbind(df.temp2, NA)
df.temp <- cbind(df.temp, df.temp2)
df.temp2 <- df.temp[3:nrow(df.temp), parameters]
df.temp2 <- rbind(df.temp2, NA, NA)
df.temp <- cbind(df.temp, df.temp2)
df.temp2 <- df.temp[4:nrow(df.temp), parameters]
df.temp2 <- rbind(df.temp2, NA, NA, NA)
df.temp <- cbind(df.temp, df.temp2)
n <- 3
names_vector <- c()
for (n in seq(from = 1, to = n)) {
  for (i in names(df[4:9])) {
    print(i)
    print(n)
    names_vector <- c(names_vector, paste0(i, "_", n + 1))
  }
}
names(df.temp)[c(4:9)] <- parameters
names(df.temp)[c(11:ncol(df.temp))] <- names_vector
df.temp
That solves the problem with the missing rows (NAs are acceptable in my case).
But it is still a lot of manual work and for-loops, and it seems error-prone.

What about something like this:
You can use the developer version of the package dplyr to add and rename variables according to various subsets of interest in your data. dplyr also provides the functions lead() and lag(), which can be used to find the "next" or "previous" values in a vector (or, here, row). You can use lead() in combination with mutate_at() to extract the values from the nth succeeding row and use them to create a new set of variables.
Here I use the data you provided in your example:
# load dplyr package
require(dplyr)
# create new data frame "df.extended"
df.extended <- df
# number of observations per group (e.g., SubjectID)
# or desired number of successions
obs <- 3
# loop until number of successions achieved
for (i in 1:obs) {
  # overwrite df.extended with new information
  df.extended <- df.extended %>%
    # group by subjects and events
    group_by(SubjectID, EventNumber) %>%
    # create new variables for each parameter
    mutate_at(vars(Param1:Param6),
              # using the lead function
              .funs = funs(step = lead),
              # for the nth following row
              n = i) %>%
    # rename the new variables to show the succession number
    rename_at(vars(contains("_step")), funs(sub("step", as.character(i), .)))
}
This should roughly recreate the data you posted as the desired result.
# Look at first part of "df.extended"
> head(df.extended)
# A tibble: 6 x 28
# Groups: SubjectID, EventNumber [2]
SubjectID EventNumber EventType Param1 Param2 Param3 Param4 Param5 Param6 AbsoluteTime Param1_1 Param2_1 Param3_1 Param4_1 Param5_1 Param6_1
<fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 A 0.300 45. 1.50 0.140 0.180 0. 2018-04-01 00:00:00 0.210 38. 1.70 0. 0. 1.00
2 1 1 A 0.210 38. 1.70 0. 0. 1.00 2018-04-01 00:00:02 0.870 76. 1.65 1.00 1.00 0.750
3 1 1 A 0.870 76. 1.65 1.00 1.00 0.750 2018-04-01 00:00:04 0.780 32. 1.32 0.860 0. 0.
4 1 1 A 0.780 32. 1.32 0.860 0. 0. 2018-04-01 00:00:04 NA NA NA NA NA NA
5 1 2 B 0.900 67. 0.600 0. 1.00 0.140 2018-05-01 00:00:00 1.20 23. 0.300 0.600 0. 0.
6 1 2 B 1.20 23. 0.300 0.600 0. 0. 2018-05-01 00:00:02 1.40 27. 2.50 1.00 0.0900 1.00
# ... with 12 more variables: Param1_2 <dbl>, Param2_2 <dbl>, Param3_2 <dbl>, Param4_2 <dbl>, Param5_2 <dbl>, Param6_2 <dbl>, Param1_3 <dbl>,
# Param2_3 <dbl>, Param3_3 <dbl>, Param4_3 <dbl>, Param5_3 <dbl>, Param6_3 <dbl>
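Note that mutate_at() and funs() have since been superseded in dplyr; a rough equivalent using across() (a sketch, under the assumption that all parameter columns start with "Param" and contain no underscore) would be:
library(dplyr)
obs <- 3
df.extended <- df
for (i in 1:obs) {
  df.extended <- df.extended %>%
    group_by(SubjectID, EventNumber) %>%
    # lead the original parameter columns by i rows and suffix the copies with i
    mutate(across(starts_with("Param") & !contains("_"),
                  ~ lead(.x, n = i),
                  .names = paste0("{.col}_", i))) %>%
    ungroup()
}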

For base R, consider by() to slice the data by SubjectID, EventNumber, and EventType, and run a merge() using a helper group_num column. To run across a series of shifts, wrap the by() process in an lapply() to build a list of data frames, chain-merge that list, and finally merge the result back onto the original data frame:
df_list <- lapply(2:3, function(i) {
  # BUILD LIST OF DATAFRAMES
  by_list <- by(df, df[c("SubjectID", "EventNumber", "EventType")], FUN = function(sub) {
    sub$grp_num <- 1:nrow(sub)
    row_less_sub <- transform(sub, AbsoluteTime = NULL, grp_num = grp_num - (i - 1))
    merge(sub, row_less_sub, by = c("SubjectID", "EventNumber", "EventType", "grp_num"),
          all.x = TRUE, suffixes = c("", paste0("_", i)))
  })
  # APPEND ALL DATAFRAMES IN LIST
  grp_df <- do.call(rbind, by_list)
  grp_df <- with(grp_df, grp_df[order(SubjectID, EventNumber), ])
  # KEEP NEEDED COLUMNS
  grp_df <- grp_df[c("SubjectID", "EventNumber", "EventType", "grp_num",
                     names(grp_df)[grep("Param[0-9]_", names(grp_df))])]
  row.names(grp_df) <- NULL
  return(grp_df)
})
# ALL PARAMS_* CHAIN MERGE
params_df <- Reduce(function(x, y) merge(x, y, by = c("SubjectID", "EventNumber", "EventType", "grp_num")), df_list)
# ORIGINAL DF AND PARAMS MERGE
df$grp_num <- ave(df$Param1, df$SubjectID, df$EventNumber, df$EventType,
                  FUN = function(x) cumsum(rep(1, length(x))))
final_df <- transform(merge(df, params_df, by = c("SubjectID", "EventNumber", "EventType", "grp_num")), grp_num = NULL)
Output
head(final_df, 10)
# SubjectID EventNumber EventType Param1 Param2 Param3 Param4 Param5 Param6 AbsoluteTime Param1_2 Param2_2 Param3_2 Param4_2 Param5_2 Param6_2 Param1_3 Param2_3 Param3_3 Param4_3 Param5_3 Param6_3
# 1 1 1 A 0.30 45 1.50 0.14 0.18 0.00 2018-03-31 17:00:00 0.21 38 1.70 0.00 0.00 1.00 0.87 76 1.65 1.00 1.00 0.75
# 2 1 1 A 0.21 38 1.70 0.00 0.00 1.00 2018-03-31 17:00:02 0.87 76 1.65 1.00 1.00 0.75 0.78 32 1.32 0.86 0.00 0.00
# 3 1 1 A 0.87 76 1.65 1.00 1.00 0.75 2018-03-31 17:00:04 0.78 32 1.32 0.86 0.00 0.00 NA NA NA NA NA NA
# 4 1 1 A 0.78 32 1.32 0.86 0.00 0.00 2018-03-31 17:00:04 NA NA NA NA NA NA NA NA NA NA NA NA
# 5 1 2 B 0.90 67 0.60 0.00 1.00 0.14 2018-04-30 17:00:00 1.20 23 0.30 0.60 0.00 0.00 1.40 27 2.50 1.00 0.09 1.00
# 6 1 2 B 1.20 23 0.30 0.60 0.00 0.00 2018-04-30 17:00:02 1.40 27 2.50 1.00 0.09 1.00 NA NA NA NA NA NA
# 7 1 2 B 1.40 27 2.50 1.00 0.09 1.00 2018-04-30 17:00:04 NA NA NA NA NA NA NA NA NA NA NA NA
# 8 2 1 A 1.30 784 0.40 1.00 1.00 0.00 2018-02-21 17:00:00 0.60 623 1.40 0.18 0.78 1.00 0.45 54 0.67 0.00 0.00 0.27
# 9 2 1 A 0.60 623 1.40 0.18 0.78 1.00 2018-02-21 17:00:02 0.45 54 0.67 0.00 0.00 0.27 0.45 54 0.67 0.00 0.00 0.00
# 10 2 1 A 0.45 54 0.67 0.00 0.00 0.27 2018-02-21 17:00:04 0.45 54 0.67 0.00 0.00 0.00 NA NA NA NA NA NA
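A data.table alternative (a sketch of my own, not part of the answers above): shift() accepts a vector of lead distances, so all n sets of new columns can be added in a single grouped call.
library(data.table)
n <- 3
params <- grep("^Param", names(df), value = TRUE)
# shift() returns the leads column by column with the lead distance varying fastest,
# so the new names must follow that order: Param1_2, Param1_3, ..., Param2_2, ...
new_cols <- paste0(rep(params, each = n), "_", rep(2:(n + 1), times = length(params)))
dt <- as.data.table(df)
dt[, (new_cols) := shift(.SD, n = 1:n, type = "lead"),
   by = .(SubjectID, EventNumber), .SDcols = params]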

Related

Use dataframe as filtering criteria in R

I am writing a function that uses a dataframe as filtering criteria for a big dataframe containing model outputs. These are the filtering criteria (as a df):
parameter value
1 alpha 0.1
2 beta 0.1
3 eta 0.1
4 zeta 0.1
5 lambda 0.5
6 phi 5.0
7 kappa 1.0
dput(values)
structure(list(parameter = structure(c(1L, 2L, 3L, 7L, 5L, 6L,
4L), .Label = c("alpha", "beta", "eta", "kappa", "lambda", "phi",
"zeta"), class = "factor"), value = c(0.1, 0.1, 0.1, 0.1, 0.5,
5, 1)), class = "data.frame", row.names = c(NA, -7L))
And this is what the 'outputs' df looks like:
time w x y z alpha beta eta zeta lambda phi kappa
1 0.0 10.00000 10.00000 10.000000 10.000000 0.1 0.1 0.1 0.1 0.95 5 1
1.1 0.1 10.00572 11.04680 9.896057 9.054394 0.1 0.1 0.1 0.1 0.95 5 1
1.2 0.2 10.01983 12.17827 9.592536 8.215338 0.1 0.1 0.1 0.1 0.95 5 1
1.3 0.3 10.04010 13.37290 9.112223 7.483799 0.1 0.1 0.1 0.1 0.95 5 1
1.4 0.4 10.06377 14.60353 8.489174 6.855626 0.1 0.1 0.1 0.1 0.95 5 1
1.5 0.5 10.08778 15.83982 7.764470 6.323152 0.1 0.1 0.1 0.1 0.95 5 1
dput(outputs)
structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 276.5, 276.6,
276.7, 276.8, 276.9, 276.961144437566), w = c(10, 10.0057192322758,
10.0198266325956, 10.040096099625, 10.0637654242843, 10.087779652849,
-1.71585943177118, -2.04004317987084, -2.56315700921588, -3.56775247519687,
-6.37643561014456, -13.828470036737), x = c(10, 11.0467963604334,
12.1782709261765, 13.3728962503142, 14.6035317074526, 15.8398164069251,
27.2774474452024, 26.3099862348669, 24.8705756934881, 22.3379071188018,
15.8960461541267, 3.62452931346518e-144), y = c(10, 9.89605687874935,
9.59253574727296, 9.11222320249057, 8.48917353431654, 7.76447036695841,
-0.604572230605542, -0.878231815857628, -1.46586965791714, -3.20623046085508,
-14.9365932475767, -3.30552834129368e+146), z = c(10, 9.05439359565339,
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183,
42.3149654949179, 43.8836626616462, 46.4372543252026, 51.7183454733949,
72.7027555440752, 3.30552834129368e+146), alpha = c(0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), eta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), zeta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9), lambda = c(0.9,
0.9, 0.5, 0.5, 0.9, 0.9, 0.5, 0.9, 0.5, 0.9, 0.5, 0.5
), phi = c(5, 5, 5, 5, 5, 5, 20, 20, 20, 20, 20, 20), kappa = c(1,
1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10), ode_outputs..iteration.. = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"1.1", "1.2", "1.3", "1.4", "1.5", "2916.2765", "2916.2766",
"2916.2767", "2916.2768", "2916.2769", "2916.2770"), class = "data.frame")
So it should be something like:
filtered_outputs <- outputs %>% filter(all rows in column 1 == all values in column 2)
The names under the 'parameter' column correspond to column names in the 'outputs' df. I'd like this to be not hard-coded, so that I can feed in any filtering criteria as a df and the function will filter 'outputs'. I'd like to use dplyr or baseR preferably.
So you want to select all the rows in the outputs dataframe which match the values in the values dataframe?
Here is a base R approach using sweep and rowSums.
result <- outputs[rowSums(sweep(outputs[as.character(values$parameter)], 2,
values$value, `!=`)) == 0, ]
result
# time w x y z alpha beta eta zeta lambda phi kappa
#1.2 0.2 10.01983 12.17827 9.592536 8.215338 0.1 0.1 0.1 0.1 0.5 5 1
#1.3 0.3 10.04010 13.37290 9.112223 7.483799 0.1 0.1 0.1 0.1 0.5 5 1
# ode_outputs..iteration..
#1.2 NA
#1.3 NA
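Another base R route (a sketch along the same lines as the semi-join idea below): reshape the criteria into a single wide row and let merge() keep only the rows of outputs that match every value.
crit <- setNames(data.frame(t(values$value)), as.character(values$parameter))
merge(outputs, crit)  # inner join on the shared parameter columns keeps only fully matching rows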
A possible dplyr and tidyr solution:
Create a helper data frame by turning the values data frame into wide format, and apply a semi-join to filter by the required conditions.
You could easily wrap this up in one continuous workflow but I think it's easier to understand in separate steps.
library(dplyr)
library(tidyr)
conditions <-
values %>%
pivot_wider(names_from = parameter, values_from = value)
outputs %>%
semi_join(conditions)
#> Joining, by = c("alpha", "beta", "eta", "zeta", "lambda", "phi", "kappa")
#> time w x y z alpha beta eta zeta lambda phi
#> 1.2 0.2 10.01983 12.17827 9.592536 8.215338 0.1 0.1 0.1 0.1 0.5 5
#> 1.3 0.3 10.04010 13.37290 9.112223 7.483799 0.1 0.1 0.1 0.1 0.5 5
#> kappa ode_outputs..iteration..
#> 1.2 1 NA
#> 1.3 1 NA
Created on 2021-07-08 by the reprex package (v2.0.0)
I often find these kinds of things are easier when the data is in long-form format - although this is just a preference:
outputs %>%
tidyr::pivot_longer(
cols = -c(time, w, x, y, z, ode_outputs..iteration..),
names_to="parameter", values_to="value_truth"
) %>%
dplyr::left_join(filter_df) %>%
dplyr::group_by(time) %>%
dplyr::filter(all(value == value_truth)) %>%
dplyr::select(-value) %>%
tidyr::pivot_wider(
names_from="parameter",
values_from="value_truth"
)
Output:
# A tibble: 2 x 13
# Groups: time [2]
time w x y z ode_outputs..iteration.. alpha beta eta zeta lambda phi kappa
<dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.2 10.0 12.2 9.59 8.22 NA 0.1 0.1 0.1 0.1 0.5 5 1
2 0.3 10.0 13.4 9.11 7.48 NA 0.1 0.1 0.1 0.1 0.5 5 1
Data:
outputs = structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 276.5, 276.6,
276.7, 276.8, 276.9, 276.961144437566), w = c(10, 10.0057192322758,
10.0198266325956, 10.040096099625, 10.0637654242843, 10.087779652849,
-1.71585943177118, -2.04004317987084, -2.56315700921588, -3.56775247519687,
-6.37643561014456, -13.828470036737), x = c(10, 11.0467963604334,
12.1782709261765, 13.3728962503142, 14.6035317074526, 15.8398164069251,
27.2774474452024, 26.3099862348669, 24.8705756934881, 22.3379071188018,
15.8960461541267, 3.62452931346518e-144), y = c(10, 9.89605687874935,
9.59253574727296, 9.11222320249057, 8.48917353431654, 7.76447036695841,
-0.604572230605542, -0.878231815857628, -1.46586965791714, -3.20623046085508,
-14.9365932475767, -3.30552834129368e+146), z = c(10, 9.05439359565339,
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183,
42.3149654949179, 43.8836626616462, 46.4372543252026, 51.7183454733949,
72.7027555440752, 3.30552834129368e+146), alpha = c(0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), eta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), zeta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9), lambda = c(0.9,
0.9, 0.5, 0.5, 0.9, 0.9, 0.5, 0.9, 0.5, 0.9, 0.5, 0.5
), phi = c(5, 5, 5, 5, 5, 5, 20, 20, 20, 20, 20, 20), kappa = c(1,
1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10), ode_outputs..iteration.. = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"1.1", "1.2", "1.3", "1.4", "1.5", "2916.2765", "2916.2766",
"2916.2767", "2916.2768", "2916.2769", "2916.2770"), class = "data.frame")
filter_df = fread(' parameter value
1 alpha 0.1
2 beta 0.1
3 eta 0.1
4 zeta 0.1
5 lambda 0.5
6 phi 5.0
7 kappa 1.0') %>% dplyr::select(-V1)

how to make calculations and comparisons with the next line in R

I got stuck on a problem. I have this df:
i got this df:
df <- data.frame(station = c("A", "A", "A", "B", "B"),
Initial_height = c(20, 50, 100, 30, 60),
final_height = c(50, 100, 300, 60, 110),
initial_flow = c(0.5, 1.2, 1.9, 0.8, 0.7),
final_Flow = c(1.21, 1.92, 0.805, 0.7, 1))
Context: each height has a flow value, but it is calculated differently for each line of the data frame.
I would like to compare, for the same station, the flow values where the height is the same.
My ideal data frame:
df.answer <- data.frame(station = c("A", "A", "A", "B", "B"),
Initial_height = c(20, 50, 100, 30, 60),
final_height = c(50, 100, 300, 60, 110),
initial_flow = c(0.5, 1.2, 1.9, 0.8, 0.7),
final_Flow = c(1.21, 1.92, 0.805, 0.7, 1),
diff_flow = c(0.010, 0.020, NA, 0, NA))
NA can be replaced by any other character
EDIT: this can happen:
df <- data.frame(station = c("A", "A", "A", "B", "B"),
Initial_height = c(20, 51, 100, 30, 60),
final_height = c(50, 100, 300, 60, 110),
initial_flow = c(0.5, 1.2, 1.9, 0.8, 0.7),
final_Flow = c(1.21, 1.92, 0.805, 0.7, 1),
diff_flow = c(NA, 0.020, NA, 0, NA))
At station A, the initial and final values do not match, so it should return NA.
We can subtract the lead, i.e. the next value, of 'initial_flow' from 'final_Flow' after grouping by 'station':
library(dplyr)
out <- df %>%
group_by(station) %>%
mutate(diff_flow = final_Flow - lead(initial_flow)) %>%
ungroup
-output
out
# A tibble: 5 x 6
# station Initial_height final_height initial_flow final_Flow diff_flow
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A 20 50 0.5 1.21 0.01
#2 A 50 100 1.2 1.92 0.02
#3 A 100 300 1.9 0.805 NA
#4 B 30 60 0.8 0.7 0
#5 B 60 110 0.7 1 NA
In data.table you can use shift to get next row in each group.
library(data.table)
setDT(df)[,diff_flow := final_Flow - shift(initial_flow, type = 'lead'), station]
# station Initial_height final_height initial_flow final_Flow diff_flow
#1: A 20 50 0.5 1.210 0.01
#2: A 50 100 1.2 1.920 0.02
#3: A 100 300 1.9 0.805 NA
#4: B 30 60 0.8 0.700 0.00
#5: B 60 110 0.7 1.000 NA
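For completeness, a base R sketch of the same lead-based idea (my own addition, not from the answers above); it assumes the rows are already ordered within each station:
# lead initial_flow within each station, then subtract it from final_Flow
df$diff_flow <- with(df, final_Flow -
  ave(initial_flow, station, FUN = function(x) c(x[-1], NA)))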

Divide each column of a dataframe by one row of the dataframe

I would like to divide each column of my dataframe by the values of one row.
I tried to transform my dataframe into a matrix and to extract one row of the dataframe as a vector, then divide the matrix by the vector, but it did not work: only the first row of the matrix got divided by the vector.
Here is my original dataframe.
And this is the code I tried to run :
data <- read_excel("Documents/TFB/xlsx_geochimie/solfatara_maj.xlsx")
View(data)
data.mat <- as.matrix(data[,2:20])
vector <- data[12,2:20]
data.mat/vector
We replicate the vector to make the lengths match and then do the division:
data.mat/unlist(vector)[col(data.mat)]
# FeO Total S SO4 Total N SiO2 Al2O3 Fe2O3 MnO MgO CaO Na2O K2O
#[1,] 0.10 16.5555556 NA NA 0.8908607 0.8987269 0.1835206 0.08333333 0.03680982 0.04175365 0.04823151 0.5738562
#[2,] 0.40 125.8333333 NA NA 0.5510204 0.4456019 0.2359551 0.08333333 0.04294479 0.01878914 0.04501608 0.2588235
#[3,] 0.85 0.6111111 NA NA 1.0021295 1.0162037 0.7715356 1.08333333 0.53987730 0.69728601 1.03858521 1.0457516
#[4,] 0.15 48.0555556 NA NA 1.1027507 0.2569444 NA 0.08333333 0.01840491 0.01878914 0.04180064 0.1647059
#[5,] 0.85 NA NA NA 1.0889086 1.0271991 0.6591760 0.75000000 0.59509202 0.53862213 1.02250804 1.1228758
#[6,] NA NA NA NA 1.3426797 0.6319444 0.0411985 0.08333333 0.03067485 0.11899791 0.65594855 0.7764706
# TiO2 P2O5 LOI LOI2 Total Total 2 Fe2O3(T)
#[1,] 0.7924528 0.3928571 7.0841837 6.6963855 0.9922233 0.9894632 0.14489796
#[2,] 0.5094340 0.3214286 14.5561224 13.7710843 0.9958126 0.9936382 0.31020408
#[3,] 0.8679245 0.6428571 1.5637755 1.5228916 0.9990030 0.9970179 0.80612245
#[4,] 1.4905660 0.2857143 7.4056122 7.0024096 0.9795613 0.9769384 0.05510204
#[5,] 1.0377358 0.2500000 0.3520408 0.3783133 0.9969093 0.9960239 0.74489796
#[6,] 0.3018868 0.2500000 1.2551020 1.1879518 1.0019940 1.0000000 0.04489796
Or use sweep
sweep(data.mat, MARGIN = 2, unlist(vector), FUN = `/`)
Or using mapply with asplit
mapply(`/`, asplit(data.mat, 2), vector)
data
data.mat <- structure(c(0.2, 0.8, 1.7, 0.3, 1.7, NA, 5.96, 45.3, 0.22, 17.3,
NA, NA, NA, 6.72, NA, 4.08, 0.06, 0.16, NA, NA, NA, NA, NA, NA,
50.2, 31.05, 56.47, 62.14, 61.36, 75.66, 15.53, 7.7, 17.56, 4.44,
17.75, 10.92, 0.49, 0.63, 2.06, NA, 1.76, 0.11, 0.01, 0.01, 0.13,
0.01, 0.09, 0.01, 0.06, 0.07, 0.88, 0.03, 0.97, 0.05, 0.2, 0.09,
3.34, 0.09, 2.58, 0.57, 0.15, 0.14, 3.23, 0.13, 3.18, 2.04, 4.39,
1.98, 8, 1.26, 8.59, 5.94, 0.42, 0.27, 0.46, 0.79, 0.55, 0.16,
0.11, 0.09, 0.18, 0.08, 0.07, 0.07, 27.77, 57.06, 6.13, 29.03,
1.38, 4.92, 27.79, 57.15, 6.32, 29.06, 1.57, 4.93, 99.52, 99.88,
100.2, 98.25, 99.99, 100.5, 99.54, 99.96, 100.3, 98.28, 100.2,
100.6, 0.71, 1.52, 3.95, 0.27, 3.65, 0.22), .Dim = c(6L, 19L), .Dimnames = list(
NULL, c("FeO", "Total S", "SO4", "Total N", "SiO2", "Al2O3",
"Fe2O3", "MnO", "MgO", "CaO", "Na2O", "K2O", "TiO2", "P2O5",
"LOI", "LOI2", "Total", "Total 2", "Fe2O3(T)")))
vector <- structure(list(FeO = 2, `Total S` = 0.36, SO4 = NA_real_, `Total N` = NA_real_,
SiO2 = 56.35, Al2O3 = 17.28, Fe2O3 = 2.67, MnO = 0.12, MgO = 1.63,
CaO = 4.79, Na2O = 3.11, K2O = 7.65, TiO2 = 0.53, P2O5 = 0.28,
LOI = 3.92, LOI2 = 4.15, Total = 100.3, `Total 2` = 100.6,
`Fe2O3(T)` = 4.9), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
To divide data frame, df, by the third row:
df/df[rep(3, nrow(df)), ]
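This works because two data frames of identical dimensions are divided elementwise, so repeating the chosen row nrow(df) times lines every row up against the same divisor. Applied to the data posted above (a sketch; the original question divided by row 12 of the full data set, which is not part of this excerpt):
df_num <- as.data.frame(data.mat)
df_num / df_num[rep(3, nrow(df_num)), ]  # e.g. divide every row by row 3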

How to subset columns based on value in a different column?

EDITED:
I have a dataframe that stores information about when particular assessment happened ('when'). This assessment happened at different times (t1 - t3) which vary by participant.
The dataframe also contains all the assessments ever completed by every participant (including the one referenced in the 'when' column). I only want the assessment information represented in the 'when' column. So if the number is 1, I want to keep all the data related to that assessment and remove all the data that was not collected at that assessment. Please note that I have many more variables in my actual data set than are represented in this shortened data set so any solution should not rely on repeating variable names.
Here's the best I can do. The problem with this solution is that it would have to be repeated for every variable name.
df2 <- mutate(.data = df,
a1G_when = if_else(when == 1, a1G_t1, NA_real_))
# here is what we start with
df <- structure(list(id = 1:10, when = c(1, 3, 2, 1, 2, 1, 3, 2, 3,
1), a1G_t1 = c(0.78, 0.21, 0.04, 0.87, 0.08, 0.25, 0.9, 0.77,
0.51, 0.5), Stqo_t1 = c(0.68, 0.77, 0.09, 0.66, 0.94, 0.05, 0.97,
0.92, 1, 0.04), Twcdz_t1 = c(0.95, 0.41, 0.29, 0.54, 0.06, 0.45,
0.6, 0.24, 0.17, 0.55), Kgh_t1 = c(0.25, 0.86, 0.37, 0.34, 0.97,
0.75, 0.73, 0.68, 0.37, 0.66), `2xWX_t1` = c(0.47, 0.52, 0.23,
0.5, 0.88, 0.71, 0.21, 0.98, 0.76, 0.21), `2IYnS_t1` = c(0.32,
0.75, 0.03, 0.46, 0.89, 0.71, 0.51, 0.83, 0.34, 0.32), a1G_t2 = c(0.97,
0.01, 0.58, 0.33, 0.58, 0.37, 0.76, 0.33, 0.39, 0.56), Stqo_t2 = c(0.78,
0.42, 0.5, 0.69, 0.09, 0.72, 0.84, 0.94, 0.46, 0.83), Twcdz_t2 = c(0.62,
0.34, 0.72, 0.62, 0.8, 0.26, 0.3, 0.88, 0.42, 0.53), Kgh_t2 = c(0.99,
0.66, 0.02, 0.17, 0.51, 0.03, 0.03, 0.74, 0.1, 0.26), `2xWX_t2` = c(0.68,
0.97, 0.56, 0.27, 0.66, 0.71, 0.96, 0.24, 0.37, 0.76), `2IYnS_t2` = c(0.24,
0.88, 0.58, 0.31, 0.8, 0.92, 0.91, 0.9, 0.55, 0.52), a1G_t3 = c(0.73,
0.6, 0.66, 0.06, 0.33, 0.34, 0.09, 0.44, 0.73, 0.56), Stqo_t3 = c(0.28,
0.88, 0.56, 0.75, 0.85, 0.33, 0.88, 0.4, 0.63, 0.61), Twcdz_t3 = c(0.79,
0.95, 0.41, 0.07, 0.99, 0.06, 0.74, 0.17, 0.89, 0.4), Kgh_t3 = c(0.06,
0.52, 0.35, 0.91, 0.43, 0.74, 0.72, 0.96, 0.39, 0.4), `2xWX_t3` = c(0.25,
0.09, 0.64, 0.32, 0.15, 0.14, 0.18, 0.33, 0.97, 0.6), `2IYnS_t3` = c(0.92,
0.49, 0.09, 0.95, 0.3, 0.83, 0.82, 0.56, 0.29, 0.36)), row.names = c(NA,
-10L), class = "data.frame")
# here is an example of what I want with the first column. I would also want all other repeating columns to look like this (Stqo_when, Twcdz_when, etc.)
id when a1G_when
1 1 1 0.78
2 2 3 0.88
3 3 2 0.58
4 4 1 0.87
5 5 2 0.58
6 6 1 0.25
7 7 3 0.09
8 8 2 0.33
9 9 3 0.73
10 10 1 0.50
Using data.table, you could do something like:
library(data.table)
cols <- unique(paste0(gsub("_.*", "", setdiff(names(df), c("id", "when"))), "_when"))
setDT(df)[
, (cols) := lapply(cols, function(x) paste0(gsub("_.*", "", x), "_t", when))][
, (cols) := lapply(cols, function(x) as.character(.SD[[get(x)]])), by = cols][
, (cols) := lapply(.SD, as.numeric), .SDcols = cols
]
Output (only first 10 rows and only relevant when columns):
a1G_when Stqo_when Twcdz_when Kgh_when 2xWX_when 2IYnS_when
1: 0.78 0.68 0.95 0.25 0.47 0.32
2: 0.60 0.88 0.95 0.52 0.09 0.49
3: 0.58 0.50 0.72 0.02 0.56 0.58
4: 0.87 0.66 0.54 0.34 0.50 0.46
5: 0.58 0.09 0.80 0.51 0.66 0.80
6: 0.25 0.05 0.45 0.75 0.71 0.71
7: 0.09 0.88 0.74 0.72 0.18 0.82
8: 0.33 0.94 0.88 0.74 0.24 0.90
9: 0.73 0.63 0.89 0.39 0.97 0.29
10: 0.50 0.04 0.55 0.66 0.21 0.32
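A plainer base R sketch of the same lookup (my own addition, not part of either answer); it builds each <var>_when column by picking the matching <var>_t<when> value row by row, without typing any variable names:
vars <- unique(sub("_t[0-9]+$", "", grep("_t[0-9]+$", names(df), value = TRUE)))
for (v in vars) {
  df[[paste0(v, "_when")]] <- mapply(function(w, i) df[[paste0(v, "_t", w)]][i],
                                     df$when, seq_len(nrow(df)))
}
df[c("id", "when", paste0(vars, "_when"))]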
Here is an opportunity to use the new tidyr::pivot_longer. We can use this to reshape the data so that var and t are in their own columns, filter to just the rows with the data we want (i.e. where t equals when) and then pivot the data back out to wide.
library(tidyverse)
df1 <- structure(list(ID = c(101, 102, 103, 104, 105), when = c(1, 2, 3, 1, 2), var1_t1 = c(5, 6, 4, 5, 6), var2_t1 = c(2, 3, 4, 2, 3), var1_t2 = c(7, 8, 9, 7, 8), var2_t2 = c(5, 4, 5, 4, 5), var1_t3 = c(3, 4, 3, 4, 3), var2_t3 = c(6, 7, 6, 7, 6)), row.names = c(NA, 5L), class = "data.frame")
df1 %>%
pivot_longer(
cols = starts_with("var"),
names_to = c("var", "t"),
names_sep = "_t",
values_to = "val",
names_transform = list(t = as.numeric)  # convert "t" to numeric so it compares cleanly with "when"
) %>%
filter(when == t) %>%
select(-t) %>%
pivot_wider(names_from = "var", values_from = "val")
#> # A tibble: 5 x 4
#> ID when var1 var2
#> <dbl> <dbl> <dbl> <dbl>
#> 1 101 1 5 2
#> 2 102 2 8 4
#> 3 103 3 3 6
#> 4 104 1 5 2
#> 5 105 2 8 5
Created on 2019-07-16 by the reprex package (v0.3.0)

R .Last.value error

I have to loop through a list and when I index it, .Last.value stops working.
Example:
A list:
x
[[1]]
cd_entrada qx laplace r
21 1 0.65 50 16.35
141 1 0.65 0 11.24
81 1 0.80 0 2.87
181 0 0.75 25 2.68
[[2]]
cd_entrada qx penalty stepno r
1 0 0.40 10 1500 12.64
2 1 0.45 400 2000 21.58
3 1 0.35 400 750 19.38
4 1 0.45 400 1000 12.78
5 1 0.55 400 1500 4.77
6 1 0.50 400 750 4.79
7 1 0.65 400 750 1.80
8 1 0.75 100 750 1.80
9 2 0.65 100 1500 1.95
10 0 0.40 10 750 9.18
11 0 0.35 200 1000 7.90
12 1 0.45 100 1500 5.65
13 0 0.60 100 750 2.80
Filtering lines of first element
(x[[1]])[.Last.value$cd_entrada==0 & .Last.value$laplace==25,]
cd_entrada qx laplace r
181 0 0.75 25 2.68
It is OK. But when I replace 1 with i, it gives an error:
i=1
(x[[i]])[.Last.value$cd_entrada==0 & .Last.value$laplace==25,]
Error in .Last.value$cd_entrada :
$ operator is invalid for atomic vectors
But .Last.value is still returning the same data.frame:
(x[[i]]); .Last.value
cd_entrada qx laplace r
21 1 0.65 50 16.35
141 1 0.65 0 11.24
81 1 0.80 0 2.87
181 0 0.75 25 2.68
cd_entrada qx laplace r
21 1 0.65 50 16.35
141 1 0.65 0 11.24
81 1 0.80 0 2.87
181 0 0.75 25 2.68
P.S.
> dput(x)
list(structure(list(cd_entrada = c(1, 1, 1, 0), qx = c(0.65,
0.65, 0.8, 0.75), laplace = c(50, 0, 0, 25), r = c(16.35, 11.24,
2.87, 2.68)), .Names = c("cd_entrada", "qx", "laplace", "r"), class = "data.frame", row.names = c(21L,
141L, 81L, 181L)), structure(list(cd_entrada = c(0, 1, 1, 1,
1, 1, 1, 1, 2, 0, 0, 1, 0), qx = c(0.4, 0.45, 0.35, 0.45, 0.55,
0.5, 0.65, 0.75, 0.65, 0.4, 0.35, 0.45, 0.6), penalty = c(10,
400, 400, 400, 400, 400, 400, 100, 100, 10, 200, 100, 100), stepno = c(1500,
2000, 750, 1000, 1500, 750, 750, 750, 1500, 750, 1000, 1500,
750), r = c(12.64, 21.58, 19.38, 12.78, 4.77, 4.79, 1.8, 1.8,
1.95, 9.18, 7.9, 5.65, 2.8)), .Names = c("cd_entrada", "qx",
"penalty", "stepno", "r"), class = "data.frame", row.names = c(NA,
-13L)))
.Last.value will change every time you do another assignment, such as x = 1. At that point, .Last.value will just be a length-1 vector with the value 1. You cannot use .Last.value in a function or in a subsetting operation. It is only reliable when you want to retrieve something that you might not have assigned to a name. As an example, you've just entered a complex ggplot2 call and don't want to scrape the result from your console and then edit out the extra line-continuation "+"-signs. You could just execute:
my_last_plot <- .Last.value
I think you might want tail(), or you might want to subset x[[1]] with a logical expression, but your first result was almost surely a lucky accident based on your having just examined x[[1]] (but then not showing us that code sequence). See what happens when I first assign your dput output to x:
x <-
list(structure(list(cd_entrada = c(1, 1, 1, 0), qx = c(0.65,
0.65, 0.8, 0.75), laplace = c(50, 0, 0, 25), r = c(16.35, 11.24,
2.87, 2.68)), .Names = c("cd_entrada", "qx", "laplace", "r"), class = "data.frame", row.names = c(21L,
141L, 81L, 181L)), structure(list(cd_entrada = c(0, 1, 1, 1,
1, 1, 1, 1, 2, 0, 0, 1, 0), qx = c(0.4, 0.45, 0.35, 0.45, 0.55,
0.5, 0.65, 0.75, 0.65, 0.4, 0.35, 0.45, 0.6), penalty = c(10,
400, 400, 400, 400, 400, 400, 100, 100, 10, 200, 100, 100), stepno = c(1500,
2000, 750, 1000, 1500, 750, 750, 750, 1500, 750, 1000, 1500,
750), r = c(12.64, 21.58, 19.38, 12.78, 4.77, 4.79, 1.8, 1.8,
1.95, 9.18, 7.9, 5.65, 2.8)), .Names = c("cd_entrada", "qx",
"penalty", "stepno", "r"), class = "data.frame", row.names = c(NA,
-13L)))
(x[[1]])[.Last.value$cd_entrada==0 & .Last.value$laplace==25,]
#[1] cd_entrada qx laplace r
#<0 rows> (or 0-length row.names)
Not the result you got, but if we first look at x[[1]]:
x[[1]]
# cd_entrada qx laplace r
#21 1 0.65 50 16.35
#141 1 0.65 0 11.24
#81 1 0.80 0 2.87
#181 0 0.75 25 2.68
(x[[1]])[.Last.value$cd_entrada==0 & .Last.value$laplace==25,]
# cd_entrada qx laplace r
#181 0 0.75 25 2.68
If you wanted to do that operation reliably, use this code instead:
i=1
x[[i]][ x[[i]]$cd_entrada==0 & x[[i]]$laplace==25, ]
#-----------
cd_entrada qx laplace r
181 0 0.75 25 2.68
