Select Time Data [duplicate] - r

This question already has answers here:
R: How to filter/subset a sequence of dates
(4 answers)
Closed 3 years ago.
I only want to select the data between 2015-1 and 2018-1.
usefuldata1 <-usefuldata[usefuldata$zeit > "2014 - 12",]
usefuldata2 <-usefuldata1[usefuldata1$zeit < "2018-2",]
Bigger than 2014 works. However, smaller than 2018 does not.
I get the same number as before.
My data looks as follows:
x <- sample(letters, size = 30, replace = T)
y <- paste(sample(2010:2020, size = 30, replace = T),
sample(1:12, size = 30, replace = T), sep = "-")
df <- data.frame(name = x, date = y)

There is a very userfull type in R called Date, by experience about 90% of all problems about dates can be solved by that.
see: ?as.Date for help
In your case, the first thing to do is format y as a Date type adding the days.
x <- sample(letters, size = 30, replace = T)
y <- paste(sample(2010:2020, size = 30, replace = T),
sample(1:12, size = 30, replace = T),
"01", sep = "-") # <- this is the day (01) for all.
Then we have to format it as a Date type.
y = as.Date(y, format = "%Y-%m-%d") # year complete (%Y), month (%m) and day (%d)
df <- data.frame(name = x, date = y)
And subset them.
dateIndex = which(y > "2014-12-31" & y < "2018-01-01")
subset_df = df[dateIndex,]
I hope this helps you.

Related

Getting the last 6 months data in R

i had data frame which contain many row. Here it is:
library(lubridate)
date_ <- date(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
hour_ <- hour(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
game1 <- sort(round(runif(length(date_), 10, 50), 0), decreasing = TRUE)
game2 <- sort(round(runif(length(date_), 20, 100), 0), decreasing = TRUE)
game3 <- sort(round(runif(length(date_), 30, 150), 0), decreasing = TRUE)
game4 <- sort(round(runif(length(date_), 40, 200), 0), decreasing = TRUE)
game_data <- data.frame(date_, hour_, game1, game2, game3, game4)
I just want to subset game_data to get all the last 6 months data. How do i get it?
Last 6 months of data from the max date in the data ?
You can try -
library(lubridate)
library(dplyr)
result <- subset(game_data, date_ > max(date_) %m-% months(6))
Or with dplyr -
result <- game_data %>% filter(date_ > max(date_) %m-% months(6))

10 Day intervals with a overlapping date

I have a annual data set that I would like to break into 10 day intervals. For example I would like to subset 2010-12-26 to 2011-01-04 create a home range using the x and y values for those dates, then get the next 9 days plus an overlapping date between the subsetted data in this case it would be 2011-01-04 (2011-01-04 to 2011-01-13). Is there a good way to do this?
library(lubridate)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-01-2011"), by = "days"), 500)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000))
I separated them by 10 observations, but I am unsure as to how I can do make it more specific to get 10 days instead of just 10 observations.
interval_10 <- lapply(
seq(0, nrow(df), by = 10),
function(k) df[max(k+1, 1):min(k + 10, nrow(df)), ]
)
Thank you
lapply through the unique date vector will do the work:
t <- unique(date)[seq(from = 1, to = length(unique(date)), by = 9)]
interval_10 <- lapply(
1:(length(t)-1),
function(k) df %>% filter(date <= t[k+1], date >= t[k])
)

Use sjlabelled::set_labels with lookup table

I need to label a values in a lot of variables with sjlabelled::set_labels. Here is a reproducable example and what already works:
library(data.table)
library(sjlabelled)
lookup <- data.table(id = paste0("q", 1:5),
answers = paste(paste0("atext", 1:5), paste0("btext", 1:5)
, paste0("ctext", 1:5), sep = ";"))
data <- data.table(q1 = sample(1:3, 10, replace = TRUE),
q2 = sample(1:3, 10, replace = TRUE),
q3 = sample(1:3, 10, replace = TRUE),
q4 = sample(1:3, 10, replace = TRUE),
q5 = sample(1:3, 10, replace = TRUE))
data$q1 <- set_labels(data$q1, labels = unlist(strsplit(lookup[id == "q1", answers], split = ";")))
get_labels(data$q1)
So the labels for the different answers (=values) are seperated by a semicolon. I am able to make it work if I call the variables by id but as you can see in the example code but I am struggling with the task if I want to "loop" through all variables.
The goal is to be able to export the datatable (or dataframe) as an SPSS file. If it works with other packages I would also be happy.
Match the column names of data with id, split the answers on ; and pass the labels as a list.
library(sjlabelled)
data <- set_labels(data, labels = strsplit(lookup$answers[match(names(data), lookup$id)], ';'))
get_labels(data)
#$q1
#[1] "atext1" "btext1" "ctext1"
#$q2
#[1] "atext2" "btext2" "ctext2"
#$q3
#[1] "atext3" "btext3" "ctext3"
#$q4
#[1] "atext4" "btext4" "ctext4"
#$q5
#[1] "atext5" "btext5" "ctext5"

Combine outputs of a function for each index in a for loop in R

I have created a function which completes a calculation based on data from two data frames for an individual.
I want to complete that function for each individual and combine all the outputs in a data frame and export to .csv
Currently the output .csv only has data for person 34, none of the other.
I've noted that when I run this it creates an object ID, which is just the numeric 34. It seems to be only holding on to the last ID in data$ID.
How can I create an output with results for all persons?
library(dplyr)
library(lubridate)
library(date)
screen_function = function(x){
# Select each person and get necessary inputs
data = data%>%filter(ID == x)
demogs = demogs %>% filter (P_ID == x)
data$Age = demogs$Age
data$result = data$test * data$Age
data$Date = as.Date(data$Date,format='%d/%m/%Y') # ensures date column is in correct format
# only include tests from most recent 12-24 months and only proceed if test in last 12 months
Recent_12m = data %>% filter(between(Date, today() - years(1), today()))
Recent_24m = data %>% filter(between(Date, today() - years(2), today()))
if ((nrow(Recent_12m)) > 0) {
data = rbind(Recent_12m,Recent_24m)
Recent_12m$min_date = min(Recent_12m$Date)
Recent_12m$Date = as.Date(Recent_12m$Date,format='%d/%m/%Y')
Recent_12m$min_date = as.Date(Recent_12m$min_date,format='%d/%m/%Y')
Recent_24m$min_date = min(Recent_24m$Date)
Recent_24m$Date = as.Date(Recent_24m$Date,format='%d/%m/%Y')
Recent_24m$min_date = as.Date(Recent_24m$min_date,format='%d/%m/%Y')
Recent_12m$Period = interval(Recent_12m$min_date, Recent_12m$Date)
Recent_12m$Years = as.numeric(Recent_12m$Period, unit = "years")
Recent_24m$Period = interval(Recent_24m$min_date, Recent_24m$Date)
Recent_24m$Years = as.numeric(Recent_24m$Period, unit = "years")
# Latest result
Last = filter(Recent_12m, (Recent_12m$Date == max(Date)))
# linear regression model
Reg_12month <- lm(result ~ Years, data=Recent_12m)
Reg_24month <- lm(result ~ Years, data=Recent_24m)
info = c(x, round(Last$result, digits=1), round(Reg_12month$coefficients["Years"], digits = 1), round(Reg_24month$coefficients["Years"], digits = 1))
newdf = data.frame(matrix(0, ncol = 4))
colnames(newdf) = c("ID", "Latest result", "Trend 12month", "Trend 24 month")
newdf= rbind(newdf, info)
write.csv(newdf, "filepath.csv")
}
}
Date= sample(seq(as.Date('2019/11/01'), as.Date('2020/11/01'), by="day"), 12)
ID= c(12,12,12,450,450,450,1,1,1,34,34,34)
test= rnorm(12, mean=150, sd=60)
data= data.frame(ID, Date, test)
P_ID = c(1,12,34,450)
Age = c(50,45,60,72)
demogs = data.frame(P_ID, Age)
persons = unique(data$ID)
for(ID in persons){
screen_function(paste("", ID,"", sep=""))
}
Created on 2020-11-16 by the reprex package (v0.3.0)
So, I've got around this by using a pre-made .csv, instead of creating a new dataframe. The .csv just contains a single row of 4 columns, with random entries in each cell.
newdf= read.csv(file = "filepath.csv")
info = c(x, round(Last$result, digits=1), round(Reg_12month$coefficients["Years"], digits = 1),
round(Reg_24month$coefficients["Years"], digits = 1))
newdf= rbind(Summary, patient_info)
colnames(newdf) = c("ID", "Latest result", "Trend 12month", "Trend 24 month")
newdf= distinct(newdf, ID, .keep_all = TRUE)
write.csv(Summary, "filepath.csv", row.names = FALSE)}}

Converting a raw data frame into workable time series

I have a spreadsheet documenting prices of 40 similar products at various dates. It looks like this.
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
The problem is, the columns representing dates and prices alternate (convenient for record keeping). How can I transform the above data to a new dataframe consisting solely of prices of these 40 products as rows, with dates as column names? This will generate a lot of NA's because the dates in each column differ but that's OK.
When working with time series data it is often helpful to have it in long form (one row per observation), even if your target output is wide (one row per time series). Here are three possible approaches to get it into long form, then widen:
1. base reshape()
To get long form, base reshape is definitely a powerful option. The following solution improves on the accepted solution because it works for any numbers of products and observations and eliminates an unnecessary step:
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,
date_4,price_4,date_5,price_5,date_6,price_6)
# no need to create an id variable
long_form <- reshape(df, # idvar="id" by default
varying = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
v.names=c("date","price"),
direction="long",
sep="_")
And reshape can also widen it. (We'll use spread in another approach below.)
wide_form <- reshape(long_form, drop='time', timevar='date', direction='wide')
2. data.table melt() and dcast() (likely faster on real dataset)
Make sure you have data.table v1.9.6 or later, which allows you to melt multiple columns.
library(data.table)
setDT(df)
melt.data.table(df[, prod_id := .I], # product id = original row number
measure.vars = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
variable.name = 'sequence',
value.name = c('date','price'),
id.vars = 'prod_id') -> long_form
In this case you don't use the sequence, so to get wide form is just:
dcast.data.table(long_form[, !'sequence', with=FALSE],
value.var = 'price', # optional (function guesses correctly)
prod_id ~ date) -> wide_form
3. tidyr & dplyr split-apply-combine (easy to understand)
It doesn't require the mental gymnastics that reshape does (at least for me). It is a column-wise variant on the "split-apply-combine" paradigm.
library(dplyr); library(tidyr)
# Create long-form time series data
# Split table into sequenced prices and dates, then combine on product and sequence
full_join(
df %>%
select(starts_with('date_')) %>% #~~~~ Left side = date component ~~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ product id = original row number ~
gather(sequence, date, -prod_id) %>% #~ long form = 1 row per prod per seq ~
mutate(sequence = #~~~ Cols: product_id, sequence, date ~~~
sub('^date_(\\d+)$', '\\1', sequence) ) ,
df %>%
select(starts_with('price_')) %>% #~~~ Right side = price component ~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ ~
gather(sequence, price, -prod_id) %>% #~ same idea ~
mutate(sequence = #~~ Cols: product_id, sequence, price ~~~
sub('^price_(\\d+)$', '\\1', sequence) )
) -> long_form
In this case you don't need the sequence, so to get to wide form it's simply:
long_form %>% select(-sequence) %>% spread(date, price) -> wide_form
as noted by others above.
Here is one way I came up with using dplyr/tidyr packages:
library(tidyr)
library(dplyr)
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
dates <- df[, grep('date', names(df))]
dates <- dates %>% gather(date_type, date) %>% select(-date_type)
prices <- df[, grep('price', names(df))]
prices <- prices %>% gather(price_type, price) %>% select(-price_type)
df <- cbind(dates, prices)
Then, to spread dates to columns and prices to rows, you can do something like this:
df <- arrange(df, price)
df <- spread(df, date, price)
Using baseR and tidyr you could do:
library(tidyr)
#add an id to identify the products
df$id=1:40
#transform the data to a long format
long_data <- reshape(df,idvar="id",varying=list(paste0("date_",1:6),paste0("price_",1:6)),v.names=c("date","price"),direction="long",sep="_")
long_data <- long_data[,!grepl("time",colnames(long_data))]
#put it back to a wide format
wide_data <- spread(long_data,date,price)

Resources