Select value from data frame in period - r

I have the following problem - there are two data frames:
observations: 100 observations (= individuals), each with a specific "observation date" - this date is the baseline. There are further time points after 6 months (and possibly after 12 months).
visits: observations (individuals) visit a place several times during a period of 12 months, e.g. observation 1 has 24 visits, observation 2 has 32 visits, etc. During every visit, data are collected - among others the "important value" (numeric). These values are connected to a visit date in the same row, so the "important value" occurs several times for each observation (e.g. 24 times for observation 1, 32 times for observation 2, etc.).
Now I would like to read, from the visits data frame, the "important value" that best fits approximately 6 months (+/- 15 days) after the observation date, and put this value in a new column of the observations data frame. In other words, out of the many visits per observation, I want the important value closest to 6 months after baseline. The +/- 15 days tolerance is needed because the important value is not collected exactly 6 months later - it can be a little earlier or later.
I created a dummy dataset:
library(dplyr)
library(lubridate)
library(data.table)

set.seed(123)
observations <- data.frame(observation_id = seq(1, 100, 1),
                           observation_date = sample(seq(as.Date('1980/01/01'), as.Date('2010/12/31'), by = "day"), 100),
                           age = round(runif(100, min = 1, max = 80)),
                           bmi = round(runif(100, min = 19, max = 30)),
                           amount = round(runif(100, min = 10, max = 10000)),
                           stringsAsFactors = FALSE)
observations$observation_date_6months_later <- observations$observation_date + days(180)
observations$observation_date_6months_later_start <- observations$observation_date_6months_later - days(15)
observations$observation_date_6months_later_end <- observations$observation_date_6months_later + days(15)

visit_data <- data.frame(visit_observation_id = round(runif(10000, min = 1, max = 100)),
                         visit_id = seq(1, 10000, 1),
                         visit_date = sample(seq(as.Date('1980/01/01'), as.Date('2020/12/31'), by = "day"), 10000),
                         important_value = round(runif(10000, min = 0.01, max = 50), 2),
                         var2 = round(runif(10000, min = 1, max = 1000)),
                         var3 = round(runif(10000, min = 1, max = 9000)),
                         stringsAsFactors = FALSE)

observations$observation_date_6months_later_important_value <- NA # to be filled with the best fitting value from "visit_data"

data_joined <-
  dplyr::inner_join(visit_data, observations, by = c('visit_observation_id' = 'observation_id')) %>%
  arrange(visit_date)
I would be glad if someone could help me with this problem!
Thank you!
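One way to attack this, as a sketch rather than a definitive answer: since data.table is already loaded above, a rolling join can snap each observation's target date to the nearest visit date per individual, after which matches outside the +/- 15 day window are blanked out. Column names follow the dummy data; actual_visit_date and target_date are helper columns introduced here.
obs <- as.data.table(observations)
vis <- as.data.table(visit_data)
vis[, actual_visit_date := visit_date]        # keep a copy: a rolling join overwrites the join column
obs[, target_date := observation_date + 180L] # 6 months after baseline
# for each observation, pick the visit whose date is nearest to target_date
nearest <- vis[obs,
               on = .(visit_observation_id = observation_id, visit_date = target_date),
               roll = "nearest"]
# discard matches outside the +/- 15 day tolerance
nearest[abs(as.numeric(actual_visit_date - visit_date)) > 15, important_value := NA_real_]
observations$observation_date_6months_later_important_value <- nearest$important_value
Because a rolling join returns exactly one row per observation, in the same row order, the last line can assign the matched values directly.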

Related

How to merge two dataframes when column values are not exact?

I have:
1. Linearly interpolated the dFe_env data every 1 m and created a data frame (this works)
2. Extracted the 'Depth' (based on sinking rate) in 30-minute intervals (this works)
3. Created a 'Time' column that increases every 30 minutes (this works)
How do I:
Merge the two data frames (Bckgd_env2 and bulk_Fe2)? In 'bulk_Fe2' the Depth increases by 1 m and in 'Bckgd_env2' the Depth increases by 0.8 m. Can I get the closest 'Depth' match, extract the dFe_env at that depth, and create a new data frame with Depth, Time and dFe_env all together?
library(dplyr)
Depth <- c(0, 2, 20, 50, 100, 500, 800, 1000, 1200, 1500)
dFe_env <- c(0.2, 0.2, 0.3, 0.4, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1)
bulk_Fe <- data.frame(Depth, dFe_env)
summary(bulk_Fe)
is.data.frame(bulk_Fe)
do_interp <- function(dat, Depth = seq(0, 1500, by = 1)) {
  out <- tibble(Depth = Depth)
  for (var in c("dFe_env")) {
    # linear interpolation at the requested depths; NA if it fails
    out[[var]] <- tryCatch(approx(dat$Depth, dat[[var]], xout = Depth)$y,
                           error = function(e) NA_real_)
  }
  out
}
bulk_Fe2 <- bulk_Fe %>% do(do_interp(.))
bulk_Fe2
summary(bulk_Fe2)
D0 <- 0 #Starting depth
T0 <- 0 #Starting time of the experiment
r <- 40 #sinking rate per day
r_30min <- r/48 #sinking speed every 30 minutes (There are 48 x 30 minute intervals in 24 hours)
days <- round(1501/(r)) #days 1501 is maximum depth
time <- days * 24 * 60 #minutes
n_steps <- 1501/r_30min
Bckgd_env2 <- data.frame(Depth = seq(from = D0, by = r_30min, length.out = n_steps + 1),
                         Time = seq(from = T0, by = 30, length.out = n_steps + 1))
head(Bckgd_env2)
round(Bckgd_env2, digits = 1)
Bckgd_env3 <- merge(Bckgd_env2, bulk_Fe2)
Bckgd_env3
plot(Bckgd_env3$dFe_env ~ Bckgd_env3$Depth, ylab = "dFe (nmol/L)", xlab = "Depth (m)", las = 1)
You have already built the mechanism for interpolation, which will be useful for the join, but you didn't build it at the right depth values. It is just a matter of reorganizing your code.
Start with building Bckgd_env2, and only afterwards compute bulk_Fe2 and Bckgd_env3:
bulk_Fe2 <- bulk_Fe %>% do(do_interp(., Depth=Bckgd_env2$Depth))
Bckgd_env3 <- merge(Bckgd_env2, bulk_Fe2)
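Put together, the reordered flow looks like this (a sketch restating the answer; the plot at the end is just to check the merged result):
# build the depth/time grid first ...
Bckgd_env2 <- data.frame(Depth = seq(from = D0, by = r_30min, length.out = n_steps + 1),
                         Time = seq(from = T0, by = 30, length.out = n_steps + 1))
# ... then interpolate bulk_Fe onto exactly those depths, so merge() finds exact matches
bulk_Fe2 <- bulk_Fe %>% do(do_interp(., Depth = Bckgd_env2$Depth))
Bckgd_env3 <- merge(Bckgd_env2, bulk_Fe2)
plot(dFe_env ~ Depth, data = Bckgd_env3, ylab = "dFe (nmol/L)", xlab = "Depth (m)", las = 1)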

Conditional structures, R

I am a new user of R.
I have a datafame like:
Month     eqqBio  eqqLi  ...
January       20  20000
February     100    500
...
The two columns eqqBio and eqqLi are the % compared to the previous year.
My aim: if the % compared to the previous year is 1000 (or more), flag it or print a message indicating an outlier.
I just don't understand the conditional structure and don't know how to proceed.
Thanks in advance!
df <- data.frame(Month = c("January", "February"),
                 eqqBio = c(20, 100),
                 eqqLi = c(20000, 500))
df$alert <- df$eqqBio > 1000 | df$eqqLi > 1000
That gives you a new column. If both eqqBio and eqqLi are below 1000, the value in the corresponding row is FALSE. Otherwise it is TRUE.
If you have more than 2 columns you can do it like this:
# sample data
df <- data.frame(Month = c("January", "February", "March"),
                 eqqBio = c(20, 100, 0),
                 eqqLi = c(20000, 500, 0),
                 dummy1 = c(0, 0, 1001),
                 dummy2 = c(0, 0, 0))
# check whether any value in this row is > 1000, looking only at columns 2:5
df$alert <- apply(df[, 2:5], 1, function(x) any(x > 1000))
Adjust the columns you wish to include by changing 2:5 in df[, 2:5].
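If you prefer dplyr, the same row-wise check can be written with if_any (a sketch; if_any requires dplyr 1.0.4 or later):
library(dplyr)
# TRUE if any numeric column in the row exceeds 1000
df <- df %>%
  mutate(alert = if_any(where(is.numeric), ~ .x > 1000))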

How to write a function that collects a specific list of observations from a time series data frame

In the data set created below, assume I randomly picked up 20 flat rocks. Each of these rocks was assigned a unique ID number. I measured the concentration of 7 substances (Copper, Iron, Carbon, Lead, Mg, CaCO, and Zinc) across the surface of the longest axis of each rock. Distance is recorded in mm and is therefore a function of each rock's length. Note that not all rocks are of the same length. Location is a grouping variable that describes where the rock was picked up.
ID <- data.frame(ID = rep(c(12, 122, 242, 329, 595, 130, 145, 245, 654, 878), each = 200))
ID2 <- data.frame(ID = rep(c(863, 425, 24, 92, 75, 3, 200, 300, 40, 500), each = 300))
RockID <- data.frame(RockID = c(unlist(ID), unlist(ID2)))
Location <- rep(c("Alpha", "Beta", "Charlie", "Delta", "Echo"), each = 1000)
a <- rep(c(1:200), times = 10)
b <- rep(c(1:300), times = 10)
Time <- data.frame(Time = c(unlist(a), unlist(b)))
set.seed(1)
Copper <- rnorm(5000, mean = 0, sd = 5)
Iron <- rnorm(5000, mean = 0, sd = 10)
Carbon <- rnorm(5000, mean = 0, sd = 1)
Lead <- rnorm(5000, mean = 0, sd = 4)
Mg <- rnorm(5000, mean = 0, sd = 6)
CaCO <- rnorm(5000, mean = 0, sd = 2)
Zinc <- rnorm(5000, mean = 0, sd = 3)
data <- cbind(RockID, Location, Time, Copper, Iron, Carbon, Lead, Mg, CaCO, Zinc)
data$ID <- as.factor(data$RockID)
I want to create a new data frame that contains the following information:
1. The first observation and the last observation for each individual
2. The average of the first 3 observations and last 3 observations for each individual
3. The same as step 2. for the first and last 5, 7, and 10 observations
I want the new data frame to be set up like this:
ID FirstPt First3 First5 First7 First10 LastPt Last3 Last5 Last7 Last10
12 … … … … … … … … … …
122
242
329
595
130
145
245
654
878
863
425
etc.
How would I write a function to accomplish this?
We can create helper functions to calculate the average of the first and last n values. Use pivot_longer to get the data in long format, group_by each RockID and substance, and calculate the summaries.
library(dplyr)
average_of_first_n_values <- function(value, x) mean(head(value, x))
average_of_last_n_values <- function(value, x) mean(tail(value, x))
data %>%
  tidyr::pivot_longer(cols = Copper:Zinc) %>%
  group_by(RockID, name) %>%
  summarise(first_obs = first(value),
            last_obs = last(value),
            first_3_avg = average_of_first_n_values(value, 3),
            first_5_avg = average_of_first_n_values(value, 5),
            first_7_avg = average_of_first_n_values(value, 7),
            first_10_avg = average_of_first_n_values(value, 10),
            last_3_avg = average_of_last_n_values(value, 3),
            last_5_avg = average_of_last_n_values(value, 5),
            last_7_avg = average_of_last_n_values(value, 7),
            last_10_avg = average_of_last_n_values(value, 10))
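The pipeline above yields one row per rock and substance. To get the wide layout sketched in the question, assign it (say, to a hypothetical result) and spread the substances across columns with pivot_wider (a sketch):
library(tidyr)
# 'result' is assumed to hold the summarised output above
result_wide <- result %>%
  pivot_wider(names_from = name, values_from = first_obs:last_10_avg)
result_wide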

Sample from one of two distributions

I want to repeatedly sample values based on a certain condition. For example, I want to create a sample of 100 values.
With probability 0.7 a value is drawn from one distribution, and from the other distribution otherwise.
Here is a way to do what I want:
set.seed(20)
A <- vector()
for (i in 1:100) {
  A[i] <- ifelse(runif(1, 0, 1) > 0.7,
                 rnorm(1, mean = 100, sd = 20),
                 runif(1, min = 0, max = 1))
}
I am sure there are other more elegant ways, without using for loop.
Any suggestions?
You can sample an indicator, which defines which distribution you draw from.
ind <- sample(0:1, size = 100, prob = c(0.3, 0.7), replace = TRUE)
A <- ind * rnorm(100, mean = 100, sd = 20) + (1 - ind) * runif(100, min = 0, max = 1)
In this case you don't use a for-loop but you need to sample more random variables.
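If the extra draws bother you, a variant of the same idea fills the two groups separately, so each distribution is sampled only as often as needed (a sketch):
ind <- runif(100) < 0.7                           # TRUE with probability 0.7
A <- numeric(100)
A[ind] <- rnorm(sum(ind), mean = 100, sd = 20)    # normal component
A[!ind] <- runif(sum(!ind), min = 0, max = 1)     # uniform component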
If the split does not need to be random, you can draw the right amount from each distribution and then shuffle the result:
n <- 100
A <- sample(c(rnorm(0.7*n, mean = 100, sd = 20), runif(0.3*n, min = 0, max = 1)))
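A middle ground between the two answers (a sketch, not from the original thread): keep the group size random but draw exactly n values, by first drawing the count of normal draws from a binomial distribution. This is statistically equivalent to the indicator approach:
n <- 100
k <- rbinom(1, n, 0.7)  # random number of draws from the normal component
A <- sample(c(rnorm(k, mean = 100, sd = 20), runif(n - k, min = 0, max = 1)))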

Replace values in data frame column with results of aggregate/cut

Edit: Modified simulated data so that price means/medians and neighborhoods don't overlap perfectly.
I have a column in a dataframe, we'll call it Price. I'm just simulating data here:
mydata = data.frame(index = rep(1:1000))
mydata$price[1:300] = rnorm(300, mean = 10000, sd = 1000)
mydata$price[301:550] = rnorm(250, mean = 25000, sd = 1000)
mydata$price[551:775] = rnorm(225, mean = 75000, sd = 1000)
mydata$price[776:1000] = rnorm(225, mean = 100000, sd = 1000)
And a set of neighborhoods, we'll call it Hoods:
mydata$hoods = factor(c(rep('hood1',250),rep('hood2',250),rep('hood3',250),rep('hood4',250)))
Then I aggregate by neighborhood to get each neighborhood's median price, since I'd like to bin neighborhoods by their median:
agg <- aggregate(mydata$price, by = list(mydata$hoods), FUN = median)
Then I create a cut version of the neighborhood medians (in my actual data there are 24 neighborhoods). So something like:
cut_aggregates <- cut(agg$x, breaks = c(0, 25000, 70000, 110000), labels = c('low','medium','high'))
I then want to replace the value of every 'hood1' in the original data with the aggregated price label, and so on for all the neighborhoods. So the first 250 records would be 'low', for example. I know I could write some nested if statement or brute-force hard-coding. Does anyone know a way I can assign all the values in one go more efficiently, as I may use this for datasets larger than 1000 records? Thank you very much for any help you may provide.
Note that in the final output, the categorized neighborhood ('low', 'medium', 'high') won't necessarily be the same as just doing a cut on price from the original data, because some neighborhoods would then have a combination of 'low', 'medium', and 'high'. I want to first categorize each neighborhood based on its aggregate, and THEN recode the neighborhood.
A very simple way to do this, and probably the fastest, is to use data.table.
library(data.table)
# convert mydata into a data.table
setDT(mydata)
# calculate median price by hood
mydata[, med := median(price), by=hoods]
now you can either:
# replace the original data of `hoods` with the new price labels
mydata[, hoods := cut(med, breaks = c(0, 25000, 70000, 110000), labels = c('low','medium','high'))]
# or create new price labels in a new column
mydata[, new_col := cut(med, breaks = c(0, 25000, 70000, 110000), labels = c('low','medium','high'))]
Finally, if you want just a summary table for each hood:
mydata[, .(med = median(price)), by = .(hoods, new_col)]
>    hoods new_col       med
> 1: hood1     low  9916.564
> 2: hood2     low 24696.864
> 3: hood3    high 74749.481
> 4: hood4    high 99852.744
Edit: Approach 1
mydata <- within(mydata, med <- ave(price, hoods, FUN = median) )
mydata$new_label <- cut(mydata$med, breaks = c(0, 25000, 70000, 110000), labels = c('low','medium','high'))
# index price hoods med new_label
# 1 1 10084.756 hood1 10014.38 low
# 2 2 10226.460 hood1 10014.38 low
# 3 3 10432.556 hood1 10014.38 low
# 4 4 10558.065 hood1 10014.38 low
# 5 5 10059.755 hood1 10014.38 low
# 6 6 9885.359 hood1 10014.38 low
Approach2:
Since agg$labs is not unique for each level of hoods in mydata, it is better to reassign the labels individually in a loop, mapping the levels of hoods to the values of agg$labs.
If agg$labs had a unique label for each level of hoods in mydata, it would be as simple as mydata$hoods <- factor(mydata$hoods, levels = agg$Group.1, labels = agg$labs). However, there are duplicated labels in agg$labs, so follow the steps below.
mydata$hoods <- as.character(mydata$hoods)   # convert factor to character
agg$labs <- as.character(agg$labs)           # convert factor to character
for (i in seq_len(nrow(agg))) {              # change labels for hoods in mydata
  mydata[mydata$hoods %in% agg$Group.1[i], "hoods"] <- agg$labs[i]
}
mydata$hoods <- factor(mydata$hoods)         # convert hoods back to factor
unique(mydata$hoods) # output
# [1] low medium high
# Levels: high low medium
Data:
set.seed( 200 )
mydata = data.frame(index = rep(1:1000))
mydata$price[1:250] = rnorm(250, mean = 10000, sd = 1000)
mydata$price[251:500] = rnorm(250, mean = 25000, sd = 1000)
mydata$price[501:750] = rnorm(250, mean = 75000, sd = 1000)
mydata$price[751:1000] = rnorm(250, mean = 100000, sd = 1000)
mydata$hoods = factor(c(rep('hood1',250),rep('hood2',250),rep('hood3',250),rep('hood4',250)))
agg <- with(mydata, aggregate( price, by = list(hoods), FUN = median) )
agg$labs <- cut(agg$x, breaks = c(0, 25000, 70000, 110000), labels = c('low','medium','high'))
agg
# Group.1 x labs
# 1 hood1 10014.38 low
# 2 hood2 25021.96 medium
# 3 hood3 74963.40 high
# 4 hood4 100019.88 high
The data in agg will vary if you choose a different seed in set.seed() function.
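For completeness, the same grouped-median-then-cut idea can be written with dplyr (a sketch using the simulated mydata above):
library(dplyr)
mydata <- mydata %>%
  group_by(hoods) %>%
  mutate(med = median(price)) %>%   # median price per neighborhood
  ungroup() %>%
  mutate(new_label = cut(med, breaks = c(0, 25000, 70000, 110000),
                         labels = c('low', 'medium', 'high')))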
