Apply custom function to specific row/column - r

I'm trying to solve a much larger problem using this basic example. I need to apply a function based on the location from which() because I need to know the year from df1 where the value is NA or >= 150. Then I subset df2, get the mean, and return it to the exact row. Right now I'm using a for() loop and need something much faster as the data I have is very large. Is there a common way to do this?
dput:
df1 <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, NA, 37.94, 10.94,
NA, 28.04, 64.94, 41, 200, 51.08)), .Names = c("id", "element",
"year", "month", "day", "value"), row.names = c(NA, -10L), class = c("tbl_df",
"data.frame"))
df2 <-structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, 10.94, 37.94, 10.94,
12, 28.04, 64.94, 41, 82.04, 51.08)), row.names = c(NA, -10L
), class = c("tbl_df", "data.frame"), .Names = c("id", "element",
"year", "month", "day", "value"))
Code:
library(dplyr)
check <- function(df, yr){
df_d <- filter(df, year == yr)
m <- mean(df_d$value)
return(m)
}
for (i in which(is.na(df1$value) | df1$value >= 150)){
df1[i,6] <- check(df = df2, yr = as.numeric(df1[i,3]) )
}

I would recommend the efficient binary join from data.table combined with modification in place (using the :=) while specifying by = .EACHI (in order to calculate the mean for each group separately).
library(data.table)
setDT(df1)[setDT(df2),
value := ifelse(is.na(value) | value >= 150, mean(i.value), value),
on = "year",
by = .EACHI]
df1
# id element year month day value
# 1: USC00031632 TMAX 1900 1 1 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08
Alternatively, we could do this in two steps in order to try avoiding the ifelse overhead in each step
setDT(df1)[setDT(df2), value2 := i.value, on = "year"]
df1[is.na(value) | value >= 150, value := mean(value2), by = year]
df1
# id element year month day value value2
# 1: USC00031632 TMAX 1900 1 1 30.02 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08 51.08
You can get rid of value2 afterwards if you wish using df1[, value2 := NULL]

Related

Joining two R dataframes on multiple columns with one column having slightly different values

I have two dataframes in R that I'm trying to join together, but one of the columns has values that are off by one or two (specifically the yardline_100 column in each). Below is the code that I'm using to join the two:
fin_df <- df1 %>%
left_join(df2,
by = c("posteam" = "posteam",
"qtr" = "qtr",
"down" = "down",
"yardline_100" = "yardline_100"))
Is there any way to make it so that they join even if that one column is off by one or two? You'll notice that the last two values rows have different numbers in that column. Below are samples of the dataframes:
df1 <- structure(list(play_id = c(4596, 4629, 4658, 4682, 4723, 4766,
4790, 4828, 4849, 4878, 4899, 4938), posteam = c("MIN", "MIN",
"MIN", "MIN", "MIN", "CIN", "CIN", "CIN", "CIN", "CIN", "CIN",
"CIN"), qtr = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), yardline_100 = c(63,
58, 55, 50, 38, 61, 55, 52, 52, 20, 15, 15), down = c(2, 1, 2,
3, 1, 1, 2, 3, 4, 1, 2, 3)), row.names = c(NA, -12L), class = c("nflverse_data",
"tbl_df", "tbl", "data.table", "data.frame"), nflverse_timestamp = structure(1659046255.35538, class = c("POSIXct",
"POSIXt")), nflverse_type = "play by play", nflfastR_version = structure(list(
c(4L, 3L, 0L, 9020L)), class = c("package_version", "numeric_version"
)), .internal.selfref = <pointer: 0x0000021967f81ef0>)
df2 <- structure(list(posteam = c("MIN", "MIN", "MIN", "MIN", "MIN",
"CIN", "CIN", "CIN", "CIN", "CIN", "CIN", "CIN"), qtr = c(5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), yardline_100 = c(63, 58, 55,
50, 38, 61, 55, 53, 52, 20, 16, 16), down = c(2, 1, 2, 3, 1,
1, 2, 3, 4, 1, 2, 3), play_id_SR = c("a9f97fb0-1407-11ec-ae9a-d77d9ecb2022",
"d49d54d0-1407-11ec-ae9a-d77d9ecb2022", "e8f74ad0-1407-11ec-ae9a-d77d9ecb2022",
"0208ae60-1408-11ec-ae9a-d77d9ecb2022", "257fd030-1408-11ec-ae9a-d77d9ecb2022",
"fe058030-1408-11ec-ae9a-d77d9ecb2022", "0da68200-1409-11ec-ae9a-d77d9ecb2022",
"26a5bd20-1409-11ec-ae9a-d77d9ecb2022", "70eacce0-1409-11ec-ae9a-d77d9ecb2022",
"99e5fb10-1409-11ec-ae9a-d77d9ecb2022", "a7646b00-1409-11ec-ae9a-d77d9ecb2022",
"de2683d0-1409-11ec-ae9a-d77d9ecb2022")), row.names = c(NA, -12L
), class = c("tbl_df", "tbl", "data.frame"))
An option is to use fuzzyjoin.
library(fuzzyjoin)
df1 %>%
fuzzy_left_join(
df2,
by = c("posteam", "qtr", "down", "yardline_100"),
match_fun = list(`==`, `==`, `==`, function(x, y) abs(x - y) <= 2)) %>%
select(-matches("(posteam|qtr|down).y")) %>%
rename_with(~str_remove(.x, "(?<=(posteam|qtr|down)).x"))
## A tibble: 12 x 7
# play_id posteam qtr yardline_100.x down yardline_100.y play_id_SR
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 4596 MIN 5 63 2 63 a9f97fb0-1407-11ec-ae9a-d77d9ecb2022
# 2 4629 MIN 5 58 1 58 d49d54d0-1407-11ec-ae9a-d77d9ecb2022
# 3 4658 MIN 5 55 2 55 e8f74ad0-1407-11ec-ae9a-d77d9ecb2022
# 4 4682 MIN 5 50 3 50 0208ae60-1408-11ec-ae9a-d77d9ecb2022
# 5 4723 MIN 5 38 1 38 257fd030-1408-11ec-ae9a-d77d9ecb2022
# 6 4766 CIN 5 61 1 61 fe058030-1408-11ec-ae9a-d77d9ecb2022
# 7 4790 CIN 5 55 2 55 0da68200-1409-11ec-ae9a-d77d9ecb2022
# 8 4828 CIN 5 52 3 53 26a5bd20-1409-11ec-ae9a-d77d9ecb2022
# 9 4849 CIN 5 52 4 52 70eacce0-1409-11ec-ae9a-d77d9ecb2022
#10 4878 CIN 5 20 1 20 99e5fb10-1409-11ec-ae9a-d77d9ecb2022
#11 4899 CIN 5 15 2 16 a7646b00-1409-11ec-ae9a-d77d9ecb2022
#12 4938 CIN 5 15 3 16 de2683d0-1409-11ec-ae9a-d77d9ecb2022
Note the matching function function(x, y) abs(x - y) <= 2 for column "yardline_100".
The last two lines (select(...) and rename_with(...)) are necessary to remove the duplicate columns: fuzzyjoin seems to create duplicate (i.e. ".x" and ".y"-suffixed) columns even on exact matches; the last two commands remove these duplicate exact match columns.

how to groupby and take mean of value by symetrically looping forward and backward on the date value in r

I need to group data on ID and then replace the missing value of price by iterating on a date value up and down. first, look for 1 date value up and down if no data go 2 date values up and down until there is a mean value for all rows.
Input data :
df1 <- data.frame(id = c(11,11,11,11,11,11,11,11,555,555,555,555,555,555,555,555,555),
Date = c("1-Jun", "18-Jun", "3-Jul", "4-Jul", "25-Jul", "3-Nov", "7-Nov", "28_Nov",
"1-Jun", "18-Jun", "3-Jul", "4-Jul", "25-Jul", "3-Nov", "7-Nov", "28_Nov",
"30-Nov"),
price = c(NA, NA, 100, NA, 25, NA, 50, NA, 400, NA, NA, NA, NA, NA, NA, NA, 200)
)
Updated requirement:
Input data :
df1 <- data.frame(id = c(11,11,11,11,11,11,11,11),
Date = c("1-Jun", "5-Jun", "8-Jun", "9-Jun", "14-Jun", "16-Jun", "20-Jun", "21-Jun"),
price = c(NA, NA,100, NA, 50, NA, 200, NA)
)
I need to impute all missing dates between the available dates for each id's and then go symmetrically up and down to impute missing. Also, not always I need the average between two, eg: when I go 2 dates up and down and I see only 1 value, then I would impute that value.
Please find below with a reprex one possible solution using the data.table library.
I built a function to make it easier to use.
Reprex
Code of the NA_imputations() function
library(data.table)
NA_imputations <- function(x) {
x[, rows := .I]
z <- x[, .I[!is.na(price)]]
id_1 <- z[-length(z)]
id_2 <- z[-1]
values <- x[z, .(price = price, id = id)]
values_1 <- values[-nrow(values)]
names(values_1) <- c("price_1", "id_o1")
values_2 <- values[-1]
names(values_2) <- c("price_2", "id_o2")
subtract <- z[-1] - z[-length(z)]
r <- data.table(id_1, values_1, id_2, values_2, subtract)
Results <- r[, `:=` (id_mean = fifelse(subtract > 2 & subtract %% 2 == 0, id_1+(subtract/2), (id_1+id_2)/2),
mean = fifelse(subtract >= 2 & subtract %% 2 == 0 & id_o1 == id_o2, (price_1+price_2)/2, NA_real_))
][, `:=` (price_1 = NULL, id_o1 = NULL, id_2 = NULL, price_2 = NULL, id_o2 = NULL)
][x, on = .(id_mean = rows)
][, price := fcoalesce(price, mean)
][, mean := NULL
][r[subtract > 2 & subtract %% 2 == 0,id_1]:r[subtract > 2 & subtract %% 2 == 0,id_mean-1], price := lapply(price, nafill, type = "nocb"), by = .(id)
][, price := nafill(price, type = "nocb"), by = .(id)
][, price := nafill(price, type = "locf")
][, `:=` (id_1 = NULL, id_mean = NULL, subtract = NULL)][]
return(Results)
}
Output of the NA_imputations() function
NA_imputations(df1)
#> id Date price
#> <num> <char> <num>
#> 1: 11 1-Jun 100.0
#> 2: 11 18-Jun 100.0
#> 3: 11 3-Jul 100.0
#> 4: 11 4-Jul 62.5
#> 5: 11 25-Jul 25.0
#> 6: 11 3-Nov 37.5
#> 7: 11 7-Nov 50.0
#> 8: 11 28_Nov 50.0
#> 9: 555 1-Jun 400.0
#> 10: 555 18-Jun 400.0
#> 11: 555 3-Jul 400.0
#> 12: 555 4-Jul 400.0
#> 13: 555 25-Jul 300.0
#> 14: 555 3-Nov 200.0
#> 15: 555 7-Nov 200.0
#> 16: 555 28_Nov 200.0
#> 17: 555 30-Nov 200.0
Created on 2021-12-05 by the reprex package (v2.0.1)

How to apply subset function to dataframe using lapply

I have a DataFrame of "dbGet_TRUE_EVENTS_DATA"
> dbGet_TRUE_EVENTS_DATA
LONGITUDE LATITUDE DATE_START DATE_END FLAG EQNUM
1 -39.5 80.5 2008-07-06 2008-07-10 1 1
2 -39.5 81.5 2008-07-06 2008-07-10 1 1
3 -38.5 80.5 2008-07-06 2008-07-10 1 2
4 -38.5 81.5 2008-07-06 2008-07-10 1 2
5 -39.5 79.5 2008-07-06 2008-07-10 1 3
6 -38.5 79.5 2008-07-06 2008-07-10 1 3
7 -39.5 79.5 2008-07-06 2008-07-10 1 4
8 -38.5 79.5 2008-07-06 2008-07-10 1 4
9 -39.5 79.5 2008-07-06 2008-07-10 1 5
10 -38.5 79.5 2008-07-06 2008-07-10 1 6
11 -39.5 79.5 2008-07-06 2008-07-10 1 7
and a list "TRUE_EVENTS_split_up"
> TRUE_EVENTS_split_up
$Fold1
[1] 3 4 6
$Fold2
[1] 5 7
$Fold3
[1] 1 2
I can subset data like the following:
newdata1 <-
subset(dbGet_TRUE_EVENTS_DATA, EQNUM %in% TRUE_EVENTS_split_up$Fold1)
newdata2 <-
subset(dbGet_TRUE_EVENTS_DATA, EQNUM %in% TRUE_EVENTS_split_up$Fold2)
newdata3 <-
subset(dbGet_TRUE_EVENTS_DATA, EQNUM %in% TRUE_EVENTS_split_up$Fold3)
newdata <- list(newdata1 ,newdata2 ,newdata3 )
I would like to apply directly my subset on dbGet_TRUE_EVENTS_DATA. I know that I have to use lapply function but don't know how ?
> dput(dbGet_TRUE_EVENTS_DATA)
structure(list(LONGITUDE = c(-39.5, -39.5, -38.5, -38.5, -39.5,
-38.5, -39.5, -38.5, -39.5, -38.5, -39.5), LATITUDE = c(80.5,
81.5, 80.5, 81.5, 79.5, 79.5, 79.5, 79.5, 79.5, 79.5, 79.5),
DATE_START = structure(c(1215298800, 1215298800, 1215298800,
1215298800, 1215298800, 1215298800, 1215298800, 1215298800,
1215298800, 1215298800, 1215298800), class = c("POSIXct",
"POSIXt")), DATE_END = structure(c(1215644400, 1215644400,
1215644400, 1215644400, 1215644400, 1215644400, 1215644400,
1215644400, 1215644400, 1215644400, 1215644400), class = c("POSIXct",
"POSIXt")), FLAG = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), EQNUM = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 6, 7)), .Names = c("LONGITUDE", "LATITUDE",
"DATE_START", "DATE_END", "FLAG", "EQNUM"), row.names = c(NA,
-11L), class = "data.frame")
> dput(TRUE_EVENTS_split_up)
structure(list(Fold1 = c(3, 4, 6), Fold2 = c(5, 7), Fold3 = c(1,
2)), .Names = c("Fold1", "Fold2", "Fold3"))

Update two columns that are interdependent row-wise using data.table

I want to create a data.table with the departure and arrival times between bus stops. This is the format of my data.table. (reproducible dataset below)
trip_id stop_sequence arrival_time departure_time travel_time
1: a 1 07:00:00 07:00:00 00:00:00
2: a 2 00:00:00 00:00:00 00:02:41
3: a 3 00:00:00 00:00:00 00:01:36
4: a 4 00:00:00 00:00:00 00:02:39
5: a 5 00:00:00 00:00:00 00:02:28
6: b 1 07:00:00 07:00:00 00:00:00
7: b 2 00:00:00 00:00:00 00:00:00
8: b 3 00:00:00 00:00:00 00:01:36
9: b 4 00:00:00 00:00:00 00:00:37
10: b 5 00:00:00 00:00:00 00:03:00
Here is how it should work. The idea is that a vehicle travels following the stop sequence. In trip a, for example, it takes 00:02:41 for the vehicle to travel from stop 1 to stop 2. Given a fixed time of 40 seconds for passangers to enter/leave the vehicle at each stop, the bus would departure from stop 2 at "07:03:21"
The thing here is that this is a row-wise iterative process between two columns. Intuitively, I would would a for set loop in data.table but I couldn't get my head around this. Help?
reproducible dataset:
library(data.table)
library(chron)
dt <- structure(list(trip_id = c("a", "a", "a", "a", "a", "b", "b",
"b", "b", "b"), stop_sequence = c(1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L), arrival_time = structure(c(0.291666666666667, 0,
0, 0, 0, 0.291666666666667, 0, 0, 0, 0), format = "h:m:s", class = "times"),
departure_time = structure(c(0.291666666666667, 0, 0, 0,
0, 0.291666666666667, 0, 0, 0, 0), format = "h:m:s", class = "times"),
travel_time = structure(c(0, 0.00186598685444013, 0.00110857958406301,
0.00183749407361369, 0.00171664297781446, 0, 0.000522388450578203,
0.00111473367541453, 0.000427755975518318, 0.00207918951573377
), format = "h:m:s", class = "times")), .Names = c("trip_id",
"stop_sequence", "arrival_time", "departure_time", "travel_time"
), class = c("data.table", "data.frame"), row.names = c(NA, -10L
))
expected output: first four rows
trip_id stop_sequence arrival_time departure_time travel_time
1: a 1 07:00:00 07:00:00 00:00:00
2: a 2 07:02:41 07:03:21 00:02:41
3: a 3 07:04:57 07:05:37 00:01:36
4: a 4 07:08:16 07:08:56 00:02:39
I think it might be possible to do it without looping. I think you can calculate the departure_time without looping and then once you have that, the arrival_time is just departure_time - 40 seconds:
dt2 <- copy(dt)
dt2[,c("arrival_time", "departure_time") := .(cumsum(arrival_time + ifelse(travel_time==0, 0, travel_time + times("00:00:40"))) - ifelse(travel_time == 0 , 0, times("00:00:40")),
cumsum(arrival_time + ifelse(travel_time==0, 0, travel_time + times("00:00:40")))),
by = trip_id]
dt2
# trip_id stop_sequence arrival_time departure_time travel_time
#1: a 1 07:00:00 07:00:00 00:00:00
#2: a 2 07:02:41 07:03:21 00:02:41
#3: a 3 07:04:57 07:05:37 00:01:36
#4: a 4 07:08:16 07:08:56 00:02:39
#5: a 5 07:11:24 07:12:04 00:02:28
#6: b 1 07:00:00 07:00:00 00:00:00
#7: b 2 07:00:45 07:01:25 00:00:45
#8: b 3 07:03:01 07:03:41 00:01:36
#9: b 4 07:04:18 07:04:58 00:00:37
#10: b 5 07:07:58 07:08:38 00:03:00
Alternatively, so you don't have to repeat the long cumsum for departure_time to get arrival_time you could do:
dt2[,departure_time := cumsum(arrival_time + ifelse(travel_time==0, 0, travel_time + times("00:00:40"))), by = trip_id]
dt2[, arrival_time := departure_time - ifelse(travel_time == 0 , 0, times("00:00:40"))]
A third option posted by #eddi:
dt[, departure_time := arrival_time[1] + cumsum(travel_time) + (0:(.N-1))*times('00:00:40'), by = trip_id]
dt[, arrival_time := c(arrival_time[1], tail(departure_time, -1) - times('00:00:40')), by = trip_id]

haven and dplyr

I'm trying to use dplyr to have the variables that are factors be represented by their values after importing a SPSS dataset using haven.
Two questions:
1) how can I loop over the columns in the dataframe containing labels over the imported dataset using dplyr?
u<-which(sapply(i,function(x) !is.null(attr(x,"labels"))))
n<-mutate_each(i,(as_factor),... = u)
2) how can I set the correct date after importing .sav file from SPSS. i$e3 is a date, but I'm uncertain how I can convert it to proper r-lingo.
Dataset:
> dput(i)
structure(list(e = structure(c(1, 1, 2, 2, 1), label = "Sex", class = c("labelled",
"numeric"), labels = structure(c(1, 2), .Names = c("Male", "Female"
))), e2 = structure(c(3, 3, 3, 3, 3), label = "The time from injury to surgery", class = c("labelled",
"numeric"), labels = structure(c(1, 2, 3), .Names = c("< 12 hours",
"12 to 24 hours", "> 24 hours"))), e3 = structure(c(13254624000,
13431139200, 13437360000, 13493174400, 13233369600), label = "Surgery Date")), .Names = c("e",
"e2", "e3"), row.names = c(NA, -5L), class = "data.frame")
I'm not sure how to adjust your dates properly (you can change the / 10 to / 100 or 1000). You could do this with base r:
i <- structure(list(e = structure(c(1, 1, 2, 2, 1), label = "Sex", class = c("labelled",
"numeric"), labels = structure(c(1, 2), .Names = c("Male", "Female"
))), e2 = structure(c(3, 3, 3, 3, 3), label = "The time from injury to surgery", class = c("labelled",
"numeric"), labels = structure(c(1, 2, 3), .Names = c("< 12 hours",
"12 to 24 hours", "> 24 hours"))), e3 = structure(c(13254624000,
13431139200, 13437360000, 13493174400, 13233369600), label = "Surgery Date")), .Names = c("e",
"e2", "e3"), row.names = c(NA, -5L), class = "data.frame")
i$e3 <- as.POSIXct(i$e3 / 10, origin = '1970-01-01')
# e e2 e3
# 1 1 3 2012-01-01 19:00:00
# 2 1 3 2012-07-24 03:12:00
# 3 2 3 2012-07-31 08:00:00
# 4 2 3 2012-10-03 22:24:00
# 5 1 3 2011-12-08 04:36:00
i <- setNames(i, sapply(i, function(x) attr(x, 'label')))
i[] <- lapply(i, function(x) {
if (!is.null(lab <- attr(x, 'labels')))
names(lab[x])
else x
})
# Sex The time from injury to surgery Surgery Date
# 1 Male > 24 hours 2012-01-01 19:00:00
# 2 Male > 24 hours 2012-07-24 03:12:00
# 3 Female > 24 hours 2012-07-31 08:00:00
# 4 Female > 24 hours 2012-10-03 22:24:00
# 5 Male > 24 hours 2011-12-08 04:36:00

Resources