How to apply subset function to dataframe using lapply - r

I have a DataFrame of "dbGet_TRUE_EVENTS_DATA"
> dbGet_TRUE_EVENTS_DATA
LONGITUDE LATITUDE DATE_START DATE_END FLAG EQNUM
1 -39.5 80.5 2008-07-06 2008-07-10 1 1
2 -39.5 81.5 2008-07-06 2008-07-10 1 1
3 -38.5 80.5 2008-07-06 2008-07-10 1 2
4 -38.5 81.5 2008-07-06 2008-07-10 1 2
5 -39.5 79.5 2008-07-06 2008-07-10 1 3
6 -38.5 79.5 2008-07-06 2008-07-10 1 3
7 -39.5 79.5 2008-07-06 2008-07-10 1 4
8 -38.5 79.5 2008-07-06 2008-07-10 1 4
9 -39.5 79.5 2008-07-06 2008-07-10 1 5
10 -38.5 79.5 2008-07-06 2008-07-10 1 6
11 -39.5 79.5 2008-07-06 2008-07-10 1 7
and a list "TRUE_EVENTS_split_up"
> TRUE_EVENTS_split_up
$Fold1
[1] 3 4 6
$Fold2
[1] 5 7
$Fold3
[1] 1 2
I can subset data like the following:
newdata1 <-
subset(dbGet_TRUE_EVENTS_DATA, EQNUM %in% TRUE_EVENTS_split_up$Fold1)
newdata2 <-
subset(dbGet_TRUE_EVENTS_DATA, EQNUM %in% TRUE_EVENTS_split_up$Fold2)
newdata3 <-
subset(dbGet_TRUE_EVENTS_DATA, EQNUM %in% TRUE_EVENTS_split_up$Fold3)
newdata <- list(newdata1 ,newdata2 ,newdata3 )
I would like to apply directly my subset on dbGet_TRUE_EVENTS_DATA. I know that I have to use lapply function but don't know how ?
> dput(dbGet_TRUE_EVENTS_DATA)
structure(list(LONGITUDE = c(-39.5, -39.5, -38.5, -38.5, -39.5,
-38.5, -39.5, -38.5, -39.5, -38.5, -39.5), LATITUDE = c(80.5,
81.5, 80.5, 81.5, 79.5, 79.5, 79.5, 79.5, 79.5, 79.5, 79.5),
DATE_START = structure(c(1215298800, 1215298800, 1215298800,
1215298800, 1215298800, 1215298800, 1215298800, 1215298800,
1215298800, 1215298800, 1215298800), class = c("POSIXct",
"POSIXt")), DATE_END = structure(c(1215644400, 1215644400,
1215644400, 1215644400, 1215644400, 1215644400, 1215644400,
1215644400, 1215644400, 1215644400, 1215644400), class = c("POSIXct",
"POSIXt")), FLAG = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), EQNUM = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 6, 7)), .Names = c("LONGITUDE", "LATITUDE",
"DATE_START", "DATE_END", "FLAG", "EQNUM"), row.names = c(NA,
-11L), class = "data.frame")
> dput(TRUE_EVENTS_split_up)
structure(list(Fold1 = c(3, 4, 6), Fold2 = c(5, 7), Fold3 = c(1,
2)), .Names = c("Fold1", "Fold2", "Fold3"))

Related

calculating medians per year per ID in R and plotting the outcome

Dataset:
structure(list(ID = c(1234, 1234, 1234, 1234, 1234, 1234, 1234,
1234, 8769, 8769, 8769, 8769, 8769, 7457, 7457, 7457, 7457, 7457,
7457, 55667, 55667, 55667, 55667, 55667, 55667, 55667, 3789,
3789, 3789, 3789, 3789, 3789), date_of_bloods = structure(c(978307200,
981072000, 1173052800, 1175731200, 1367798400, 1465171200, 1467936000,
1659916800, 1072915200, 1075680000, 1173052800, 1175731200, 1367798400,
978307200, 981072000, 1173052800, 1175731200, 1367798400, 1465171200,
978307200, 981072000, 1173052800, 1270425600, 1273104000, 1465171200,
1467936000, 1270425600, 1367798400, 1465171200, 1465257600, 1465344000,
1465430400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
result = c(90, 80, 60, 40, 25, 22, 22, 21, 70, 65, 43, 23,
22, 90, 90, 88, 86, 76, 74, 58, 46, 35, 34, 33, 30, 24, 76,
67, 56, 34, 33, 23), `mutation type` = c(1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -32L), class = "data.frame")
I would like the median of results per year per ID in a format where the year is just 0,1,2,3 etc for uniformity across cohorts and then to plot these lines with some indication of their mutation category.
I have done:
filtered$date_of_bloods <-format(filtered$date_of_bloods,format="%Y")
#split into individual ID groups
a <- with(filtered, split(filtered, list(ID)))
#aggregate median results per year
medianfunc <- function(y) {aggregate(results ~ date_of_bloods, data = y, median)}
medians <- sapply(a, medianfunc)
# do lm per ID cohort and get slope of lines
g<- as.data.frame(medians)
coefLM <- function(x) {coef(lm(date_of_bloods ~ results, data = x))}
coefs<- sapply(g, coefLM)
The actual years don't matter and for uniformity I would like them to be 0,1,2,3,4 etc per ID. I am not sure how to do that? I would then want to plot this data (median yearly bloods per ID) with some form of idea as to which mutational category they belong.
I hope this isn't too broad a question.
Many thanks
You can try this (filtered is the dput() you included). I hope this helps:
library(dplyr)
library(lubridate)
library(ggplot2)
library(broom)
#Data
filtered %>% mutate(year=year(date_of_bloods)) %>%
group_by(ID,year,`mutation type`) %>% summarise(med=median(result)) -> df1
#Variables
df1 %>% ungroup()%>% mutate(ID=as.factor(ID),
year=as.factor(year),
`mutation type`=as.factor(`mutation type`)) -> df1
#Plot
ggplot(df1,aes(x=ID,y=med,fill=`mutation type`,color=year,group=year))+
geom_line()
And for models:
#Models
fits <- df1 %>%group_by(ID) %>%
do(fitmodel = lm(med ~ year, data = .))
#Coefs
dfCoef = tidy(fits, fitmodel)
# A tibble: 10 x 6
# Groups: ID [5]
ID term estimate std.error statistic p.value
<dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1234 (Intercept) 6329. 1546. 4.09 0.0264
2 1234 year -3.13 0.769 -4.07 0.0268
3 3789 (Intercept) 14318. 4746. 3.02 0.204
4 3789 year -7.08 2.36 -3.00 0.205
5 7457 (Intercept) 2409. 403. 5.98 0.0269
6 7457 year -1.16 0.201 -5.78 0.0287
7 8769 (Intercept) 9268. 4803. 1.93 0.304
8 8769 year -4.60 2.39 -1.92 0.306
9 55667 (Intercept) 3294. 759. 4.34 0.0492
10 55667 year -1.62 0.378 -4.29 0.0503
Code for required plot:
#Plot 2
#Data modifications
df1 %>% mutate(year2=as.numeric(year)-1) -> df2
df2 %>% mutate(year2=factor(year2,levels = sort(unique(year2)))) -> df2
#Plot 2
ggplot(df2,aes(x=year2,y=med,color=ID,group=ID))+
facet_wrap(.~`mutation type`)+
geom_line()
Your naming structure is unclear, if the data you provided is called df then you can do:
df$year <-format(df$date_of_bloods,format="%Y")
aggregate(result ~ year + ID, data = df, median)
year ID result
1 2001 1234 85.0
2 2007 1234 50.0
3 2013 1234 25.0
4 2016 1234 22.0
5 2022 1234 21.0
6 2010 3789 76.0
7 2013 3789 67.0
8 2016 3789 33.5
9 2001 7457 90.0
10 2007 7457 87.0
11 2013 7457 76.0
12 2016 7457 74.0
13 2004 8769 67.5
14 2007 8769 33.0
15 2013 8769 22.0
16 2001 55667 52.0
17 2007 55667 35.0
18 2010 55667 33.5
19 2016 55667 27.0

Dumbbell plot: Order and value label

I am working on a dumbbell plot in R inspired by this post, and have two problems:
Ordering the dumbbell plot (I've tried a strategy provided in this post)
Present value labels in the plot in an aesthetically pleasing way.
My data set is formatted as a wide data set with 18 units with the following structure:
> head(ADHD_med_2010_2018_wide, 18)
# A tibble: 18 x 9
age gender county adhd_pr_1000_2010 adhd_pr_1000_2018 county_label adhd_2010 adhd_2018 diff
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
1 2 [10-14] 1 [Both genders] 1 [Østfold] 32.1 24.3 Østfold 32.1 24.3 -7.80
2 2 [10-14] 1 [Both genders] 2 [Akershus] 20.6 23.0 Akershus 20.6 23 2.40
3 2 [10-14] 1 [Both genders] 3 [Oslo] 17.2 33.9 Oslo 17.2 33.9 16.7
4 2 [10-14] 1 [Both genders] 4 [Hedmark] 41.7 30.9 Hedmark 41.7 30.9 -10.8
5 2 [10-14] 1 [Both genders] 5 [Oppland] 24.9 39.0 Oppland 24.9 39 14.1
6 2 [10-14] 1 [Both genders] 6 [Buskerud] 26.7 36.8 Buskerud 26.7 36.8 10.1
7 2 [10-14] 1 [Both genders] 7 [Vestfold] 28.1 27.1 Vestfold 28.1 27 -1.10
8 2 [10-14] 1 [Both genders] 8 [Telemark] 29.2 24.7 Telemark 29.2 24.7 -4.5
9 2 [10-14] 1 [Both genders] 9 [Aust-Agder] 34.9 39.2 Aust-Agder 34.9 39.2 4.30
10 2 [10-14] 1 [Both genders] 10 [Vest-Agder] 17.4 23.8 Vest-Agder 17.4 23.8 6.40
11 2 [10-14] 1 [Both genders] 11 [Rogaland] 29.5 13.8 Rogaland 29.5 13.8 -15.7
12 2 [10-14] 1 [Both genders] 12 [Hordaland] 21.3 14.4 Hordaland 21.3 14.4 -6.90
13 2 [10-14] 1 [Both genders] 14 [Sogn og Fjordane] 21.3 39.7 Sogn og Fjordane 21.3 39.7 18.4
14 2 [10-14] 1 [Both genders] 15 [Møre og Romsdal] 27.0 18.6 Møre og Romsdal 27 18.6 -8.40
15 2 [10-14] 1 [Both genders] 18 [Nordland] 40.1 30.0 Nordland 40.1 30 -10.1
16 2 [10-14] 1 [Both genders] 19 [Troms] 25.8 33.2 Troms 25.8 33.2 7.40
17 2 [10-14] 1 [Both genders] 20 [Finnmark] 19.1 21.3 Finnmark 19.1 21.3 2.20
18 2 [10-14] 1 [Both genders] 50 [Trøndelag] 25.0 36.9 Trøndelag 25 37 12
I've tried two strategies for problem 1:
library("tidyverse")
library("ggalt")
fig2 <- ggplot(ADHD_med_2010_2018_wide, aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
#create a thick line between x and xend instead of using defaut
#provided by geom_dubbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
fig2
Which gives a plot without ordering or values presented in a good way:
To correct ordering, I tried following the strategy provided in the post linked above:
library(dplyr)
ADHD_med_2010_2018_wide%>%
mutate(difference = abs(adhd_2018-adhd_2010)) %>% #creates the variable of differences
top_n(18, wt = difference) %>% # Choose the rows with top 20 difference
ggplot() +
aes(x=adhd_2010, xend=adhd_2018, y=reorder(county_label, difference),
group=county_label) + #reorder the labels by descending difference value
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
This still does not give a plot with a nice ordering, although it seem to order the difference (and there's still the issue with value labels):
Hopefully some of you may have input on these issues.
Data to copy:
> dput(head(ADHD_med_2010_2018_wide, 18))
structure(list(age = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Age groups", labels = c(`5-9` = 1,
`10-14` = 2, `15-19` = 3, `20-24` = 4, `25-29` = 5, `30-34` = 6,
`All ages` = 7), class = "haven_labelled"), gender = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Gender", labels = c(`Both genders` = 1,
Female = 2, Male = 3), class = "haven_labelled"), county = structure(c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 50), labels = c(Østfold = 1,
Akershus = 2, Oslo = 3, Hedmark = 4, Oppland = 5, Buskerud = 6,
Vestfold = 7, Telemark = 8, `Aust-Agder` = 9, `Vest-Agder` = 10,
Rogaland = 11, Hordaland = 12, `Sogn og Fjordane` = 14, `Møre og Romsdal` = 15,
Nordland = 18, Troms = 19, Finnmark = 20, Trøndelag = 50, `Hele landet` = 99
), class = "haven_labelled"), adhd_pr_1000_2010 = c(32.1488990783691,
20.5894756317139, 17.2119483947754, 41.6982574462891, 24.8543014526367,
26.7194156646729, 28.1328239440918, 29.2480430603027, 34.8775291442871,
17.3759765625, 29.4698066711426, 21.340311050415, 21.3308296203613,
27.0334072113037, 40.1140670776367, 25.7862873077393, 19.1311283111572,
25.0325565338135), adhd_pr_1000_2018 = c(24.2834396362305, 23.0037822723389,
33.9068183898926, 30.8641967773438, 39.0195579528809, 36.7909698486328,
27.0642204284668, 24.6901988983154, 39.1978950500488, 23.8095245361328,
13.8218154907227, 14.4400091171265, 39.7175636291504, 18.5994052886963,
29.9642810821533, 33.1638412475586, 21.2596340179443, 36.9249382019043
), county_label = structure(18:1, .Label = c("Trøndelag", "Finnmark",
"Troms", "Nordland", "Møre og Romsdal", "Sogn og Fjordane", "Hordaland",
"Rogaland", "Vest-Agder", "Aust-Agder", "Telemark", "Vestfold",
"Buskerud", "Oppland", "Hedmark", "Oslo", "Akershus", "Østfold"
), class = "factor"), adhd_2010 = c(32.0999984741211, 20.6000003814697,
17.2000007629395, 41.7000007629395, 24.8999996185303, 26.7000007629395,
28.1000003814697, 29.2000007629395, 34.9000015258789, 17.3999996185303,
29.5, 21.2999992370605, 21.2999992370605, 27, 40.0999984741211,
25.7999992370605, 19.1000003814697, 25), adhd_2018 = c(24.2999992370605,
23, 33.9000015258789, 30.8999996185303, 39, 36.7999992370605,
27, 24.7000007629395, 39.2000007629395, 23.7999992370605, 13.8000001907349,
14.3999996185303, 39.7000007629395, 18.6000003814697, 30, 33.2000007629395,
21.2999992370605, 37), diff = c(-7.79999923706055, 2.39999961853027,
16.7000007629395, -10.8000011444092, 14.1000003814697, 10.0999984741211,
-1.10000038146973, -4.5, 4.29999923706055, 6.39999961853027,
-15.6999998092651, -6.89999961853027, 18.4000015258789, -8.39999961853027,
-10.0999984741211, 7.40000152587891, 2.19999885559082, 12)), row.names = c(NA,
-18L), class = c("tbl_df", "tbl", "data.frame"))
Here an easy way to order your plot is to use arrange function from dplyr to sort your dataframe according column(s) of your choice and then format the grouping value (county_label) as factor with the ranked elements:
library(dplyr)
library(ggplot2)
DF %>% arrange(adhd_2010) %>% mutate(county_label = factor(county_label, unique(county_label))) %>%
ggplot(aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
#create a thick line between x and xend instead of using defaut
#provided by geom_dubbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
It is not perfect but at least your values are quite ordered. After you can change the column by which you wish to order your plot (here I order based on adhd_2010)
# Reformat data
DF2<-DF%>% arrange(desc(adhd_2010))
DF3<-DF%>% mutate("key" = "Change in Prescription Rate")
DF3$county_label<-factor(DF3$county_label,DF2$county_label)
DF3$adhd_2018<-signif(DF3$adhd_2018, digits = 3)
DF3$adhd_2010<-signif(DF3$adhd_2010, digits = 3)
# Plot
dumbbell::dumbbell(DF3, id="county_label", key="key", column1="adhd_2010", column2="adhd_2018", lab1="2010", lab2="2018", delt=1, textsize = 2, pt_val =1, ,pointsize = 2) + xlim(13,43) +
labs(x=NULL, y=NULL, title="Dumbbell Chart",subtitle="Change in prescription rate: 2010 vs 2018")
I added in a few bells and whistles, just toggle the options to remove
I hope someone finds it useful
Enjoy!

Conditional Replacing with NA in R (two dataframes)

I have
idx <- c(1397, 2000, 3409, 3415, 4077, 4445, 5021, 5155)
idy <- c( 1397, 2000, 2860, 3029, 3415, 3707, 4077, 4445, 5021, 5155,
5251, 5560)
agex <- c(NA, NA, NA, 35, NA, 62, 35, 46)
agey <- c( 3, 45, 0, 89, 7, 2, 13, 24, 58, 8, 3, 45)
dat1 <- as.data.frame(cbind(idx, agex))
dat2 <- as.data.frame(cbind(idy, agey))
Now I want whenever agex = NA, and idx = idy, that agey = NA, so that
idy agey
1 1397 NA
2 2000 NA
3 2860 0
4 3029 89
5 3415 7
6 3707 2
7 4077 NA
8 4445 24
9 5021 58
10 5155 8
11 5251 3
12 5560 45
I have tried this
ifelse(is.na(dat1$agex) | dat1$idx %in% dat2$idy, NA, dat2$agey)
it returns NAs at the correct indices, but shortens idy to the length of idx.
I want whenever agex = NA, and idx = idy, that agey = NA
With a data.table update join...
library(data.table)
setDT(dat1); setDT(dat2)
dat2[dat1[is.na(agex)], on=.(idy = idx), agey := NA]
dat2
idy agey
1: 1397 NA
2: 2000 NA
3: 2860 0
4: 3029 89
5: 3415 7
6: 3707 2
7: 4077 NA
8: 4445 24
9: 5021 58
10: 5155 8
11: 5251 3
12: 5560 45
How it works
dat1[is.na(agex)] is the subset where agex is NA
DT[mDT, on=, j] is a join where rows of mDT are looked up in DT using on=
j is done in the joined subset of DT
when j is k := expr, column k of DT is updated

Extract unique values from column in dataframe, grouped by ID

I have a dataframe data0 with id's and dates like this:
id date
1 2016-10-20
1 2016-10-19
1 2016-10-20
2 2016-10-21
2 2016-10-22
3 2016-10-21
3 2016-10-21
3 2016-10-22
Reproduce:
data0 <- structure(list(id = c(1, 1, 1, 2, 2, 3, 3, 3), date = structure(c(17094, 17093, 17094, 17095, 17096, 17095, 17095, 17096), class = "Date")), .Names = c("id", "date"), row.names = c(NA, -8L), class = "data.frame")
How can I summarize dates by id so that I come out with a structure with counts like this? :
id 2016-10-19 2016-10-20 2016-10-21 2016-10-22
1 1 2
2 1 1
3 2 1
data0 <- structure(list(id = c(1, 1, 1, 2, 2, 3, 3, 3),
date = structure(c(17094, 17093, 17094, 17095, 17096, 17095, 17095, 17096), class = "Date")),
.Names = c("id", "date"), row.names = c(NA, -8L), class = "data.frame")
Use the built in table function.
> table(data0)
date
id 2016-10-19 2016-10-20 2016-10-21 2016-10-22
1 1 2 0 0
2 0 0 1 1
3 0 0 2 1
Another way around using xtabs:
data0$col <- 1
xtabs(col~id+date, data0)
# date
#id 2016-10-19 2016-10-20 2016-10-21 2016-10-22
# 1 1 2 0 0
# 2 0 0 1 1
# 3 0 0 2 1

Apply custom function to specific row/column

I'm trying to solve a much larger problem using this basic example. I need to apply a function based on the location from which() because I need to know the year from df1 where the value is NA or >= 150. Then I subset df2, get the mean, and return it to the exact row. Right now I'm using a for() loop and need something much faster as the data I have is very large. Is there a common way to do this?
dput:
df1 <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, NA, 37.94, 10.94,
NA, 28.04, 64.94, 41, 200, 51.08)), .Names = c("id", "element",
"year", "month", "day", "value"), row.names = c(NA, -10L), class = c("tbl_df",
"data.frame"))
df2 <-structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = 1900:1909,
month = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1), value = c(30.02, 10.94, 37.94, 10.94,
12, 28.04, 64.94, 41, 82.04, 51.08)), row.names = c(NA, -10L
), class = c("tbl_df", "data.frame"), .Names = c("id", "element",
"year", "month", "day", "value"))
Code:
library(dplyr)
check <- function(df, yr){
df_d <- filter(df, year == yr)
m <- mean(df_d$value)
return(m)
}
for (i in which(is.na(df1$value) | df1$value >= 150)){
df1[i,6] <- check(df = df2, yr = as.numeric(df1[i,3]) )
}
I would recommend the efficient binary join from data.table combined with modification in place (using the :=) while specifying by = .EACHI (in order to calculate the mean for each group separately).
library(data.table)
setDT(df1)[setDT(df2),
value := ifelse(is.na(value) | value >= 150, mean(i.value), value),
on = "year",
by = .EACHI]
df1
# id element year month day value
# 1: USC00031632 TMAX 1900 1 1 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08
Alternatively, we could do this in two steps in order to try avoiding the ifelse overhead in each step
setDT(df1)[setDT(df2), value2 := i.value, on = "year"]
df1[is.na(value) | value >= 150, value := mean(value2), by = year]
df1
# id element year month day value value2
# 1: USC00031632 TMAX 1900 1 1 30.02 30.02
# 2: USC00031632 TMIN 1901 1 1 10.94 10.94
# 3: USC00031632 TMAX 1902 2 1 37.94 37.94
# 4: USC00031632 TMIN 1903 2 1 10.94 10.94
# 5: USC00031632 TMAX 1904 3 1 12.00 12.00
# 6: USC00031632 TMIN 1905 3 1 28.04 28.04
# 7: USC00031632 TMAX 1906 4 1 64.94 64.94
# 8: USC00031632 TMIN 1907 4 1 41.00 41.00
# 9: USC00031632 TMAX 1908 5 1 82.04 82.04
# 10: USC00031632 TMIN 1909 5 1 51.08 51.08
You can get rid of value2 afterwards if you wish using df1[, value2 := NULL]

Resources