Related
I have two dataframes in R that I'm trying to join together, but one of the columns has values that are off by one or two (specifically the yardline_100 column in each). Below is the code that I'm using to join the two:
fin_df <- df1 %>%
left_join(df2,
by = c("posteam" = "posteam",
"qtr" = "qtr",
"down" = "down",
"yardline_100" = "yardline_100"))
Is there any way to make it so that they join even if that one column is off by one or two? You'll notice that the last two values rows have different numbers in that column. Below are samples of the dataframes:
df1 <- structure(list(play_id = c(4596, 4629, 4658, 4682, 4723, 4766,
4790, 4828, 4849, 4878, 4899, 4938), posteam = c("MIN", "MIN",
"MIN", "MIN", "MIN", "CIN", "CIN", "CIN", "CIN", "CIN", "CIN",
"CIN"), qtr = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), yardline_100 = c(63,
58, 55, 50, 38, 61, 55, 52, 52, 20, 15, 15), down = c(2, 1, 2,
3, 1, 1, 2, 3, 4, 1, 2, 3)), row.names = c(NA, -12L), class = c("nflverse_data",
"tbl_df", "tbl", "data.table", "data.frame"), nflverse_timestamp = structure(1659046255.35538, class = c("POSIXct",
"POSIXt")), nflverse_type = "play by play", nflfastR_version = structure(list(
c(4L, 3L, 0L, 9020L)), class = c("package_version", "numeric_version"
)), .internal.selfref = <pointer: 0x0000021967f81ef0>)
df2 <- structure(list(posteam = c("MIN", "MIN", "MIN", "MIN", "MIN",
"CIN", "CIN", "CIN", "CIN", "CIN", "CIN", "CIN"), qtr = c(5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), yardline_100 = c(63, 58, 55,
50, 38, 61, 55, 53, 52, 20, 16, 16), down = c(2, 1, 2, 3, 1,
1, 2, 3, 4, 1, 2, 3), play_id_SR = c("a9f97fb0-1407-11ec-ae9a-d77d9ecb2022",
"d49d54d0-1407-11ec-ae9a-d77d9ecb2022", "e8f74ad0-1407-11ec-ae9a-d77d9ecb2022",
"0208ae60-1408-11ec-ae9a-d77d9ecb2022", "257fd030-1408-11ec-ae9a-d77d9ecb2022",
"fe058030-1408-11ec-ae9a-d77d9ecb2022", "0da68200-1409-11ec-ae9a-d77d9ecb2022",
"26a5bd20-1409-11ec-ae9a-d77d9ecb2022", "70eacce0-1409-11ec-ae9a-d77d9ecb2022",
"99e5fb10-1409-11ec-ae9a-d77d9ecb2022", "a7646b00-1409-11ec-ae9a-d77d9ecb2022",
"de2683d0-1409-11ec-ae9a-d77d9ecb2022")), row.names = c(NA, -12L
), class = c("tbl_df", "tbl", "data.frame"))
An option is to use fuzzyjoin.
library(fuzzyjoin)
df1 %>%
fuzzy_left_join(
df2,
by = c("posteam", "qtr", "down", "yardline_100"),
match_fun = list(`==`, `==`, `==`, function(x, y) abs(x - y) <= 2)) %>%
select(-matches("(posteam|qtr|down).y")) %>%
rename_with(~str_remove(.x, "(?<=(posteam|qtr|down)).x"))
## A tibble: 12 x 7
# play_id posteam qtr yardline_100.x down yardline_100.y play_id_SR
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 4596 MIN 5 63 2 63 a9f97fb0-1407-11ec-ae9a-d77d9ecb2022
# 2 4629 MIN 5 58 1 58 d49d54d0-1407-11ec-ae9a-d77d9ecb2022
# 3 4658 MIN 5 55 2 55 e8f74ad0-1407-11ec-ae9a-d77d9ecb2022
# 4 4682 MIN 5 50 3 50 0208ae60-1408-11ec-ae9a-d77d9ecb2022
# 5 4723 MIN 5 38 1 38 257fd030-1408-11ec-ae9a-d77d9ecb2022
# 6 4766 CIN 5 61 1 61 fe058030-1408-11ec-ae9a-d77d9ecb2022
# 7 4790 CIN 5 55 2 55 0da68200-1409-11ec-ae9a-d77d9ecb2022
# 8 4828 CIN 5 52 3 53 26a5bd20-1409-11ec-ae9a-d77d9ecb2022
# 9 4849 CIN 5 52 4 52 70eacce0-1409-11ec-ae9a-d77d9ecb2022
#10 4878 CIN 5 20 1 20 99e5fb10-1409-11ec-ae9a-d77d9ecb2022
#11 4899 CIN 5 15 2 16 a7646b00-1409-11ec-ae9a-d77d9ecb2022
#12 4938 CIN 5 15 3 16 de2683d0-1409-11ec-ae9a-d77d9ecb2022
Note the matching function function(x, y) abs(x - y) <= 2 for column "yardline_100".
The last two lines (select(...) and rename_with(...)) are necessary to remove the duplicate columns: fuzzyjoin seems to create duplicate (i.e. ".x" and ".y"-suffixed) columns even on exact matches; the last two commands remove these duplicate exact match columns.
Can't find it anywhere but I have the below data.frame and need to to look like the second data.frame but struggling with the first row. Any ideas? (in the original .csv I have 18 variables with 28 observations).
Here is a data.frame example of what I have:
#Have this cnames_have <- data.frame(names = c(NA, "Name", "BMC", "MFH", "MCHHS", "CIH"), Official.use.only = c( NA, "Last Updated", "2020-11-10", "2020-10-10", "2020-11-10", "2020-11-09"), X = c("Adult Unit", "Staffed", 8, NA, 0, 62), X1 = c(NA, "Current Available", 3, NA, 0,13), X2 = c("Pediatric Unit", "Staffed", 8, NA, 0, 62), X3 = c(NA, "Current Available", 3, NA, 0,13))
Here is an example of what I need:
#need this cnames <- data.frame(names = c("BMC", "MFH", "MCHHS", "CIH", "BMC", "MFH", "MCHHS", "CIH"), Last_Updated = c("2020-11-10", "2020-10-10", "2020-11-10", "2020-11-09"), beds = c("Adult Unit", "Adult Unit", "Adult Unit", "Adult Unit", "Pediatric Unit", "Pediatric Unit", "Pediatric Unit", "Pediatric Unit"), Staffed = c(8, NA, 0, 62, 8, NA, 0, 62), Current_Available = c(3, NA, 0,13, 3, NA, 0,13))
I have tried transpose, melt, dcast, gather, etc. Here is as far as I was able to get but then I couldn't think of where to go from there or if I just code-blocked myself.
df <- as.data.frame(t(cnames)) #from Have this cnames data frame
df <- df %>% rename(col_1 = "2") %>% fill(col_1)
Any help would be awesome as I need to figure this out so I can include visuals in my situational reports. Thank you in advance!
A dplyr and tidyr solution by pivotting long then back wide
library(dplyr)
cnames_have <- data.frame(names = c(NA, "Name", "BMC", "MFH", "MCHHS", "CIH"), Official.use.only = c( NA, "Last Updated", "2020-11-10", "2020-10-10", "2020-11-10", "2020-11-09"), X = c("Adult Unit", "Staffed", 8, NA, 0, 62), X1 = c(NA, "Current Available", 3, NA, 0,13), X2 = c("Pediatric Unit", "Staffed", 8, NA, 0, 62), X3 = c(NA, "Current Available", 3, NA, 0,13))
cnames_have %>% rename(Last_Updated = Official.use.only,
`Adult Staffed` = X,
`Adult Available` = X1,
`Pediatric Staffed` = X2,
`Pediatric Available` = X3) %>%
slice(-1:-2) %>%
tidyr::pivot_longer(`Adult Staffed`:`Pediatric Available`) %>%
tidyr::separate(., name, into = c("beds", "type")) %>%
tidyr::pivot_wider(names_from = type) %>%
arrange(beds)
#> # A tibble: 8 x 5
#> names Last_Updated beds Staffed Available
#> <chr> <chr> <chr> <chr> <chr>
#> 1 BMC 2020-11-10 Adult 8 3
#> 2 MFH 2020-10-10 Adult <NA> <NA>
#> 3 MCHHS 2020-11-10 Adult 0 0
#> 4 CIH 2020-11-09 Adult 62 13
#> 5 BMC 2020-11-10 Pediatric 8 3
#> 6 MFH 2020-10-10 Pediatric <NA> <NA>
#> 7 MCHHS 2020-11-10 Pediatric 0 0
#> 8 CIH 2020-11-09 Pediatric 62 13
This seems to work but is not as elegant as I would like, I'm sure there is a way to do this using pivot_longer. The data wrangling block of code could be put into a function.
library(dplyr)
library(tidyr)
library(purrr)
# Extract unique names from data
vec_names <- cnames_have[2, 1:4]
au <-
cnames_have %>%
select(names:X1) %>%
set_names(vec_names) %>%
mutate(beds = "Adult Unit") %>%
filter(row_number() > 2)
cnames <-
cnames_have %>%
select(names, Official.use.only, X2, X3) %>%
set_names(vec_names) %>%
mutate(beds = "Pediatric Unit") %>%
filter(row_number() > 2) %>%
bind_rows(au)
cnames
#> Name Last Updated Staffed Current Available beds
#> 1 BMC 2020-11-10 8 3 Pediatric Unit
#> 2 MFH 2020-10-10 <NA> <NA> Pediatric Unit
#> 3 MCHHS 2020-11-10 0 0 Pediatric Unit
#> 4 CIH 2020-11-09 62 13 Pediatric Unit
#> 5 BMC 2020-11-10 8 3 Adult Unit
#> 6 MFH 2020-10-10 <NA> <NA> Adult Unit
#> 7 MCHHS 2020-11-10 0 0 Adult Unit
#> 8 CIH 2020-11-09 62 13 Adult Unit
data
cnames_have <-
data.frame(names = c(NA, "Name", "BMC", "MFH", "MCHHS", "CIH"),
Official.use.only = c( NA, "Last Updated", "2020-11-10", "2020-10-10", "2020-11-10", "2020-11-09"),
X = c("Adult Unit", "Staffed", 8, NA, 0, 62),
X1 = c(NA, "Current Available", 3, NA, 0,13),
X2 = c("Pediatric Unit", "Staffed", 8, NA, 0, 62),
X3 = c(NA, "Current Available", 3, NA, 0,13))
Created on 2020-11-11 by the reprex package (v0.3.0)
cnames_have <- data.frame(names = c(NA, "Name", "BMC", "MFH", "MCHHS", "CIH"), Official.use.only = c( NA, "Last Updated", "2020-11-10", "2020-10-10", "2020-11-10", "2020-11-09"), X = c("Adult Unit", "Staffed", 8, NA, 0, 62), X1 = c(NA, "Current Available", 3, NA, 0,13), X2 = c("Pediatric Unit", "Staffed", 8, NA, 0, 62), X3 = c(NA, "Current Available", 3, NA, 0,13))
colnames(cnames_have) <- c("Name", "Last Updated", "Adult Unit", "Adult Unit Current Available", "Pediatric Unit", "Pediatric Unit Current Available")
cnames_have <- cnames_have[-1, ]
cnames_have <- cnames_have[-1, ]
cnames_have <- cnames_have[, c(1, 2, 3, 5, 4, 6)]
library(tidyr)
cnames_have <- gather(cnames_have, 'Unit', 'Staffed', 3:4)
cnames_have <- cnames_have[, -3]
colnames(cnames_have) <- c("names", "Last_Updated", "Current_Available", "beds", "Staffed")
cnames_have <- cnames_have[, c(1, 2, 4, 5, 3)]
I am trying to create a new column in one of 2 separate lists.
One element of df1 looks like:
$`c(5, 19)`
$`c(5, 19)`[[1]]
Feature Gain Cover Frequency
1: plaza_eliptica 0.948578681145 0.53759794901 0.2794117647
2: wind 0.014083116347 0.10011187610 0.1343137255
3: temp 0.011637657812 0.08581378948 0.1460784314
4: year 0.006344014204 0.12430881478 0.1137254902
5: humidity 0.004941509318 0.03941622272 0.0862745098
6: barometer 0.003729098869 0.03750491037 0.0715686275
7: season 0.003482740507 0.02015837244 0.0254901961
8: month 0.003016223359 0.03462645560 0.0539215686
9: day 0.002926824939 0.00525381171 0.0578431373
10: weekday 0.000644114655 0.01335391670 0.0176470588
11: week_of_month 0.000587970927 0.00074890364 0.0117647059
12: workday_on_holiday 0.000025880557 0.00107281595 0.0009803922
13: holiday 0.000002167362 0.00003216151 0.0009803922
The first element of df2 looks like
[[1]]
[[1]][[1]]
date c_farolillo
[1,] "2016-01-01" "17"
[[1]][[2]]
date c_farolillo
[1,] "2016-01-02" "9"
[[1]][[3]]
date c_farolillo
[1,] "2016-01-03" "8"
[[1]][[4]]
date c_farolillo
[1,] "2016-01-04" "3"
[[1]][[5]]
date c_farolillo
[1,] "2016-01-05" "4"
[[1]][[6]]
date c_farolillo
[1,] "2016-01-06" "4"
I am trying to take the first element of df2 and create a new column in df1. That is, I want to take this element:
[[1]]
[[1]][[1]]
date c_farolillo
[1,] "2016-01-01" "17"
and add a column to element 1 of df1 such as:
$`c(5, 19)`
$`c(5, 19)`[[1]]
Feature Gain Cover Frequency date c_farolillo
1: plaza_eliptica 0.948578681145 0.53759794901 0.2794117647 2016-01-01 17
2: wind 0.014083116347 0.10011187610 0.1343137255 2016-01-01 17
3: temp 0.011637657812 0.08581378948 0.1460784314 2016-01-01 17
4: year 0.006344014204 0.12430881478 0.1137254902 2016-01-01 17
5: humidity 0.004941509318 0.03941622272 0.0862745098 2016-01-01 17
6: barometer 0.003729098869 0.03750491037 0.0715686275 2016-01-01 17
7: season 0.003482740507 0.02015837244 0.0254901961 2016-01-01 17
8: month 0.003016223359 0.03462645560 0.0539215686 2016-01-01 17
9: day 0.002926824939 0.00525381171 0.0578431373 2016-01-01 17
10: weekday 0.000644114655 0.01335391670 0.0176470588 2016-01-01 17
11: week_of_month 0.000587970927 0.00074890364 0.0117647059 2016-01-01 17
12: workday_on_holiday 0.000025880557 0.00107281595 0.0009803922 2016-01-01 17
13: holiday 0.000002167362 0.00003216151 0.0009803922 2016-01-01 17
Then take element 2 of df2:
[[1]][[2]]
date c_farolillo
[1,] "2016-01-02" "9"
and do the same thing but for element 2 of df1.
$`c(5, 19)`[[2]]
Feature Gain Cover Frequency date c_farolillo
1: plaza_eliptica 0.95025739085 0.5490795291 0.283433134 2016-01-02 9
2: temp 0.01236820897 0.0832973356 0.150698603 2016-01-02 9
3: wind 0.01196041617 0.0895609496 0.125748503 2016-01-02 9
4: year 0.00604315510 0.1158975396 0.112774451 2016-01-02 9
5: season 0.00511480982 0.0173938219 0.027944112 2016-01-02 9
6: humidity 0.00500999458 0.0578155014 0.086826347 2016-01-02 9
7: barometer 0.00325812831 0.0340156062 0.071856287 2016-01-02 9
8: month 0.00323898173 0.0354103288 0.062874251 2016-01-02 9
9: day 0.00220665600 0.0067323511 0.058882236 2016-01-02 9
10: weekday 0.00050300478 0.0103857430 0.014970060 2016-01-02 9
11: workday_on_holiday 0.00001964502 0.0002228799 0.000998004 2016-01-02 9
12: week_of_month 0.00001960867 0.0001884139 0.002994012 2016-01-02 9
Such that all 3 lists in each of the 2 main lists contain the merged data. That is, list3, element 3 of df1 would have merged to gether:
[[2]][[3]]
date pza_del_carmen
[1,] "2016-01-03" "10"
$`c(7, 1, 2, 18)`[[3]]
Feature Gain Cover Frequency date pza_del_carmen
1: pza_de_espana 0.75620312440 0.2776437590 0.1729106628 2016-01-03 10
2: retiro 0.21115176179 0.2195341962 0.1498559078 2016-01-03 10
3: escuelas_aguirre 0.01304161322 0.0993235815 0.0970220941 2016-01-03 10
4: wind 0.00497255534 0.0963420148 0.1123919308 2016-01-03 10
5: temp 0.00490558802 0.1068475040 0.1585014409 2016-01-03 10
6: barometer 0.00356537931 0.0580338186 0.0787704131 2016-01-03 10
7: humidity 0.00201778550 0.0233865914 0.0672430355 2016-01-03 10
8: year 0.00177749645 0.0517034409 0.0547550432 2016-01-03 10
9: day 0.00086333491 0.0048338563 0.0509125841 2016-01-03 10
10: month 0.00047874430 0.0234141348 0.0211335255 2016-01-03 10
11: weekday 0.00040798584 0.0168542292 0.0144092219 2016-01-03 10
12: week_of_month 0.00032152928 0.0032202756 0.0105667627 2016-01-03 10
13: season 0.00026657228 0.0186674991 0.0105667627 2016-01-03 10
14: weekend_on_holiday 0.00002652936 0.0001950987 0.0009606148 2016-01-03 10
Data1:
EDIT:
The list2 data is the same data as in the Info_assessment list here.
EDIT:
New data:
list1 <- list(`c(5, 19)` = list(structure(list(Feature = c("plaza_eliptica",
"wind", "temp", "year", "humidity", "barometer", "season", "month",
"day", "weekday", "week_of_month", "workday_on_holiday", "holiday"
), Gain = c(0.948578681144529, 0.0140831163472628, 0.0116376578118342,
0.00634401420383024, 0.0049415093180091, 0.00372909886882749,
0.00348274050673969, 0.00301622335931412, 0.00292682493887959,
0.000644114654618996, 0.000587970926885777, 0.0000258805573006903,
0.00000216736196828243), Cover = c(0.537597949014824, 0.100111876095501,
0.0858137894753769, 0.12430881477959, 0.0394162227230228, 0.0375049103727748,
0.0201583724440218, 0.034626455595298, 0.005253811712761, 0.0133539166971052,
0.000748903637236591, 0.00107281594659352, 0.000032161505893596
), Frequency = c(0.279411764705882, 0.134313725490196, 0.146078431372549,
0.113725490196078, 0.0862745098039216, 0.0715686274509804, 0.0254901960784314,
0.053921568627451, 0.057843137254902, 0.0176470588235294, 0.0117647058823529,
0.000980392156862745, 0.000980392156862745)), row.names = c(NA,
-13L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x560f12912cd0>),
structure(list(Feature = c("plaza_eliptica", "temp", "wind",
"year", "season", "humidity", "barometer", "month", "day",
"weekday", "workday_on_holiday", "week_of_month"), Gain = c(0.950257390847805,
0.0123682089682263, 0.0119604161685161, 0.00604315510198456,
0.00511480981946408, 0.00500999457778123, 0.00325812831159771,
0.00323898173138714, 0.00220665599964529, 0.000503004779421417,
0.0000196450237473069, 0.0000196086704235932), Cover = c(0.549079529057103,
0.0832973355514094, 0.0895609496061689, 0.115897539589901,
0.0173938218615296, 0.0578155014108067, 0.0340156061873294,
0.0354103287593173, 0.00673235113002399, 0.0103857430401735,
0.000222879883826733, 0.000188413922410228), Frequency = c(0.283433133732535,
0.150698602794411, 0.125748502994012, 0.112774451097804,
0.0279441117764471, 0.0868263473053892, 0.0718562874251497,
0.062874251497006, 0.0588822355289421, 0.0149700598802395,
0.000998003992015968, 0.0029940119760479)), row.names = c(NA,
-12L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x560f12912cd0>)),
`c(7, 1, 2, 18)` = list(structure(list(Feature = c("pza_de_espana",
"retiro", "escuelas_aguirre", "wind", "temp", "barometer",
"humidity", "year", "day", "month", "weekday", "week_of_month",
"season", "weekend_on_holiday"), Gain = c(0.762835844259031,
0.205459059740918, 0.0130315791677542, 0.0045078890564497,
0.00444974962904841, 0.00339293826829134, 0.00189508238873358,
0.00187978643588582, 0.00100750177875752, 0.000538521180289064,
0.000402209068457385, 0.000300268511436018, 0.0002522936065142,
0.0000472769084332351), Cover = c(0.293886140204331, 0.227015081557907,
0.0916711798129263, 0.0951455374927713, 0.102043766809557,
0.0520602895145079, 0.0284397058958519, 0.0521635564204478,
0.00571869176893915, 0.0177917404833809, 0.0133466738877007,
0.00350419034156103, 0.0168233263876777, 0.000390119422439669
), Frequency = c(0.175908221797323, 0.154875717017208, 0.101338432122371,
0.111854684512428, 0.136711281070746, 0.0736137667304015,
0.0678776290630975, 0.0583173996175908, 0.0506692160611855,
0.0248565965583174, 0.0181644359464627, 0.011472275334608,
0.0124282982791587, 0.00191204588910134)), row.names = c(NA,
-14L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x560f12912cd0>),
structure(list(Feature = c("pza_de_espana", "retiro",
"escuelas_aguirre", "wind", "temp", "barometer", "humidity",
"year", "day", "month", "weekday", "week_of_month", "season",
"weekend_on_holiday"), Gain = c(0.762803211914528, 0.205468329058334,
0.0130409115957786, 0.00452752500477332, 0.0044356364989903,
0.00339296767537418, 0.00189335858734865, 0.00188130563208465,
0.00100789616848372, 0.000544801039619805, 0.000402657864212878,
0.000301123014739554, 0.000252994653373448, 0.0000472812923590283
), Cover = c(0.293865486823143, 0.227012786737776, 0.0915793870076463,
0.0953268282831992, 0.101871655299658, 0.0520671739749038,
0.0284167576945319, 0.0521819149815037, 0.00571869176893915,
0.0179064814899808, 0.0133374946071727, 0.00349960070129703,
0.0168256212078097, 0.000390119422439669), Frequency = c(0.175908221797323,
0.154875717017208, 0.10038240917782, 0.112810707456979,
0.135755258126195, 0.0736137667304015, 0.0678776290630975,
0.0583173996175908, 0.0506692160611855, 0.0258126195028681,
0.0181644359464627, 0.011472275334608, 0.0124282982791587,
0.00191204588910134)), row.names = c(NA, -14L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x560f12912cd0>)))
New data 2:
list2 <- list(`c(5, 19)` = list(Info_assessment = list(structure(c("2016-01-01",
"17"), .Dim = 1:2, .Dimnames = list(NULL, c("date", "c_farolillo"
))), structure(c("2016-01-02", "9"), .Dim = 1:2, .Dimnames = list(
NULL, c("date", "c_farolillo")))), X_test = list(structure(c(1,
1, 2016, 1, 1, 1, 0, 1, 1, 1, 0, 1, 52.2692307692308, 5.46153846153846,
84.9615384615385, 30.1315384615385, 25), .Dim = c(1L, 17L), .Dimnames = list(
NULL, c("day", "month", "year", "quarter", "semester", "weekday",
"weekend", "season", "holiday", "workday_on_holiday", "weekend_on_holiday",
"week_of_month", "temp", "wind", "humidity", "barometer",
"plaza_eliptica"))), structure(c(2, 1, 2016, 1, 1, 0, 1,
1, 0, 0, 0, 1, 47.7307692307692, 10.4230769230769, 77.0769230769231,
30.1834615384615, 29), .Dim = c(1L, 17L), .Dimnames = list(NULL,
c("day", "month", "year", "quarter", "semester", "weekday",
"weekend", "season", "holiday", "workday_on_holiday", "weekend_on_holiday",
"week_of_month", "temp", "wind", "humidity", "barometer",
"plaza_eliptica")))), Y_test = list(structure(17, .Dim = c(1L,
1L), .Dimnames = list(NULL, "c_farolillo")), structure(9, .Dim = c(1L,
1L), .Dimnames = list(NULL, "c_farolillo")))), `c(7, 1, 2, 18)` = list(
Info_assessment = list(structure(c("2016-01-01", "12"), .Dim = 1:2, .Dimnames = list(
NULL, c("date", "pza_del_carmen"))), structure(c("2016-01-02",
"10"), .Dim = 1:2, .Dimnames = list(NULL, c("date", "pza_del_carmen"
)))), X_test = list(structure(c(1, 1, 2016, 1, 1, 1, 0, 1,
1, 1, 0, 1, 52.2692307692308, 5.46153846153846, 84.9615384615385,
30.1315384615385, 28, 17, 6), .Dim = c(1L, 19L), .Dimnames = list(
NULL, c("day", "month", "year", "quarter", "semester",
"weekday", "weekend", "season", "holiday", "workday_on_holiday",
"weekend_on_holiday", "week_of_month", "temp", "wind",
"humidity", "barometer", "pza_de_espana", "escuelas_aguirre",
"retiro"))), structure(c(2, 1, 2016, 1, 1, 0, 1, 1, 0,
0, 0, 1, 47.7307692307692, 10.4230769230769, 77.0769230769231,
30.1834615384615, 21, 24, 5), .Dim = c(1L, 19L), .Dimnames = list(
NULL, c("day", "month", "year", "quarter", "semester",
"weekday", "weekend", "season", "holiday", "workday_on_holiday",
"weekend_on_holiday", "week_of_month", "temp", "wind",
"humidity", "barometer", "pza_de_espana", "escuelas_aguirre",
"retiro")))), Y_test = list(structure(12, .Dim = c(1L,
1L), .Dimnames = list(NULL, "pza_del_carmen")), structure(10, .Dim = c(1L,
1L), .Dimnames = list(NULL, "pza_del_carmen")))))
It is a list of list, so we can Map over the corresponding list, and do a second Map internally
Map(function(lst1, lst2) Map(function(dat1, dat2) dat1[,
colnames(dat2) := .(dat2[1], dat2[2])][], lst1, lst2), list1,
lapply(list2, `[[`, "Info_assessment") )
I'm an R beginner and it's my first post here. I'm struggling with a problem and would love your advice. Basically, I have a dataset with 3 sets of columns that I need to manipulate altogether in order to obtain the desired outcome, which is an average of the 2 most recent observations (and that these observations must occur after a cutoff date, say, 3/15/2018) that are of high quality, but what makes it complex is that the relevant columns that go into the average differ for all cases.
The first set of data columns has to do with the number of observations each case has, so subject one has 2 observations, subject two has 3, etc.
The second set of columns describe the data quality for each of these observations. So for example, subject 1 has two good observations whereas subject 2 has 1 bad data quality for the first observation and good data quality for the 2 latter ones, and subject 3 has 3 observations that are of good quality and one observation (obs_3)that is of bad data quality.
The third set of columns specify the dates of the observations.
subject_id obs_1 obs_2 obs_3 obs_4 obs_1_dq obs_2_dq obs_3_dq obs_4_dq obs_1_date obs_2_date obs_3_date obs_4_date desired.average
1 1 5 6 NA NA TRUE TRUE NA NA 2018-02-01 2018-03-16 <NA> <NA> NA
2 2 6 8 11 NA FALSE TRUE TRUE NA 2018-02-18 2018-03-16 2018-04-10 <NA> 9.5
3 3 7 9 12 15 TRUE TRUE FALSE TRUE 2018-02-15 2018-03-18 2018-04-02 2018-04-10 12.0
4 4 3 4 8 15 TRUE TRUE TRUE TRUE 2018-02-16 2018-03-08 2018-03-10 2018-03-15 NA
In order to compute an average of TWO latest observations that are of good data quality:
I must first decide which observations are of good quality,
Then, compute an average (and it has to be an average of 2 observations) that occur after 3/15 and they must be the two most recent observations.
Below is my sample dataset. I've tried to do this manually in Excel and it was really painstaking. I'm hoping to do this in R and would very much appreciate your feedback. Thank you!
Here is my sample dataset:
> dput(head(df,5))
structure(list(subject_id = c(1, 2, 3, 4), obs_1 = c(5, 6, 7,
3), obs_2 = c(6, 8, 9, 4), obs_3 = c(NA, 11, 12, 8), obs_4 = c(NA,
NA, 15, 15), obs_1_dq = c(TRUE, FALSE, TRUE, TRUE), obs_2_dq = c(TRUE,
TRUE, TRUE, TRUE), obs_3_dq = c(NA, TRUE, FALSE, TRUE), obs_4_dq =
c(NA,
NA, TRUE, TRUE), obs_1_date = structure(c(17563, 17580, 17577,
17578), class = "Date"), obs_2_date = structure(c(17606, 17606,
17608, 17598), class = "Date"), obs_3_date = structure(c(NA,
17631, 17623, 17600), class = "Date"), obs_4_date = structure(c(NA,
NA, 17631, 17605), class = "Date"), desired.average = c(NA, 9.5,
12, NA)), .Names = c("subject_id", "obs_1", "obs_2", "obs_3",
"obs_4", "obs_1_dq", "obs_2_dq", "obs_3_dq", "obs_4_dq", "obs_1_date",
"obs_2_date", "obs_3_date", "obs_4_date", "desired.average"), row.names
= c(NA,
4L), class = "data.frame")
This should also work, and though a bit verbose it doesn't rely on column indices, so should be robust:
library(dplyr)
library(tidyr)
num_date <- as.numeric(as.Date("2018-03-15"))
df <- df[,-ncol(df)]
df_join <- df %>%
gather(Obs, value, 2:ncol(df)) %>%
mutate(
nr = as.numeric(gsub("[^\\d]", "", Obs, perl = TRUE))
) %>%
group_by(subject_id, nr) %>%
filter(!(is.na(value) | (grepl("_dq", Obs) & value == 0) | any(value[grepl("_date", Obs)] <= num_date))) %>%
ungroup() %>%
group_by(subject_id, Obs) %>%
filter(!row_number() < (max(row_number() - 1))) %>%
ungroup() %>%
group_by(subject_id) %>%
mutate(
desired.average = mean(value[grepl("_date|_dq", Obs) == FALSE], na.rm = TRUE)
) %>%
filter(!max(row_number()) == 3) %>%
distinct(subject_id, desired.average)
df <- left_join(df, df_join)
Result:
subject_id obs_1 obs_2 obs_3 obs_4 obs_1_dq obs_2_dq obs_3_dq obs_4_dq obs_1_date obs_2_date
1 1 5 6 NA NA TRUE TRUE NA NA 2018-02-01 2018-03-16
2 2 6 8 11 NA FALSE TRUE TRUE NA 2018-02-18 2018-03-16
3 3 7 9 12 15 TRUE TRUE FALSE TRUE 2018-02-15 2018-03-18
4 4 3 4 8 15 TRUE TRUE TRUE TRUE 2018-02-16 2018-03-08
obs_3_date obs_4_date desired.average
1 <NA> <NA> NA
2 2018-04-10 <NA> 9.5
3 2018-04-02 2018-04-10 12.0
4 2018-03-10 2018-03-15 NA
See if this works for you. Code is annotated briefly.
df=structure(list(subject_id = c(1, 2, 3, 4), obs_1 = c(5, 6, 7,
3), obs_2 = c(6, 8, 9, 4), obs_3 = c(NA, 11, 12, 8), obs_4 = c(NA,
NA, 15, 15), obs_1_dq = c(TRUE, FALSE, TRUE, TRUE), obs_2_dq = c(TRUE,
TRUE, TRUE, TRUE), obs_3_dq = c(NA, TRUE, FALSE, TRUE), obs_4_dq =
c(NA, NA, TRUE, TRUE), obs_1_date = structure(c(17563, 17580, 17577,
17578), class = "Date"), obs_2_date = structure(c(17606, 17606,
17608, 17598), class = "Date"), obs_3_date = structure(c(NA,
17631, 17623, 17600), class = "Date"), obs_4_date = structure(c(NA,
NA, 17631, 17605), class = "Date"), desired.average = c(NA, 9.5,
12, NA)), .Names = c("subject_id", "obs_1", "obs_2", "obs_3",
"obs_4", "obs_1_dq", "obs_2_dq", "obs_3_dq", "obs_4_dq", "obs_1_date",
"obs_2_date", "obs_3_date", "obs_4_date", "desired.average"), row.names
= c(NA, 4L), class = "data.frame")
# separate each section
obs=df[,2:5]
dq=df[, 6:9]
dt=sapply(df[, 10:13], as.numeric) # for easier calculations
# remove bad quality
obs[dq==F]=NA
# remove dates before 2018-3-15
obs[dt - as.numeric(as.Date("2018-03-15")) <= 0] = NA
# only leave two most recent dates
dt[is.na(obs)]=NA
dt=t(apply(dt,1,function(x){x[x<max(x[x!=max(x, na.rm=T)],na.rm=T)]=NA;x}))
obs[is.na(dt)]=NA
# average
df$avg=apply(obs,1,function(x)ifelse(sum(!is.na(x))>=2, mean(x,na.rm=T), NA))
df
Edits:
Explanation
dt=t(apply(dt,1, function(x){x[x<max(x[x!=max(x, na.rm=T)],na.rm=T)]=NA;x}))
I think this might be a little confusing for x[x<max(x[x!=max(x, na.rm=T)],na.rm=T)]=NA. The na.rm=T meaning remove NA values. max(x[x!=max(x)]) meaning the second largest number. So x[x < 2nd_largest_num]=NA just removed any number except the largest and the 2nd largest. This function is then applied to the data frame row-wise. The final result is dt contains only two largest number in each row (most recent date in numeric format). All "discarded" values (NA in dt) will be removed from obs in the next line obs[is.na(dt)]=NA. After all these, obs only contains the two recent values in each line.
I'm trying to solve a much larger problem using this basic example. I need to apply a function based on the location from which() because I need to know the year from df1 where the value is NA or >= 150. Then I subset df2, get the mean, and return it to the exact row. Right now I'm using a for() loop and need something much faster as the data I have is very large. Is there a common way to do this?
dput:
df1 <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, NA, 37.94, 10.94,
NA, 28.04, 64.94, 41, 200, 51.08)), .Names = c("id", "element",
"year", "month", "day", "value"), row.names = c(NA, -10L), class = c("tbl_df",
"data.frame"))
df2 <-structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, 10.94, 37.94, 10.94,
12, 28.04, 64.94, 41, 82.04, 51.08)), row.names = c(NA, -10L
), class = c("tbl_df", "data.frame"), .Names = c("id", "element",
"year", "month", "day", "value"))
Code:
library(dplyr)
check <- function(df, yr){
df_d <- filter(df, year == yr)
m <- mean(df_d$value)
return(m)
}
for (i in which(is.na(df1$value) | df1$value >= 150)){
df1[i,6] <- check(df = df2, yr = as.numeric(df1[i,3]) )
}
I would recommend the efficient binary join from data.table combined with modification in place (using the :=) while specifying by = .EACHI (in order to calculate the mean for each group separately).
library(data.table)
setDT(df1)[setDT(df2),
value := ifelse(is.na(value) | value >= 150, mean(i.value), value),
on = "year",
by = .EACHI]
df1
# id element year month day value
# 1: USC00031632 TMAX 1900 1 1 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08
Alternatively, we could do this in two steps in order to try avoiding the ifelse overhead in each step
setDT(df1)[setDT(df2), value2 := i.value, on = "year"]
df1[is.na(value) | value >= 150, value := mean(value2), by = year]
df1
# id element year month day value value2
# 1: USC00031632 TMAX 1900 1 1 30.02 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08 51.08
You can get rid of value2 afterwards if you wish using df1[, value2 := NULL]