I have a spreadsheet documenting the prices of 40 similar products on various dates. It looks like this:
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
The problem is that the date and price columns alternate (convenient for record keeping). How can I transform the above data into a new data frame consisting solely of the prices of these 40 products as rows, with dates as column names? This will generate a lot of NAs because the dates in each column differ, but that's OK.
When working with time series data it is often helpful to have it in long form (one row per observation), even if your target output is wide (one row per time series). Here are three possible approaches to get it into long form, then widen:
1. base reshape()
To get long form, base reshape is definitely a powerful option. The following solution improves on the accepted solution because it works for any number of products and observations and eliminates an unnecessary step:
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,
date_4,price_4,date_5,price_5,date_6,price_6)
# no need to create an id variable
long_form <- reshape(df, # idvar="id" by default
varying = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
v.names=c("date","price"),
direction="long",
sep="_")
And reshape can also widen it. (We'll use spread in another approach below.)
wide_form <- reshape(long_form, drop='time', timevar='date', direction='wide')
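As a quick, hedged sanity check (using only base R functions on the wide_form object created above), the result should have one row per product and one price column per distinct date, plus the id column:
dim(wide_form)          # about 40 rows; one price.<date> column per distinct date, plus id
head(names(wide_form))  # "id" followed by price.<date> column names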
2. data.table melt() and dcast() (likely faster on a real dataset)
Make sure you have data.table v1.9.6 or later, which allows you to melt multiple columns.
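If you are not sure which version you have installed, a quick check:
packageVersion("data.table")  # should report 1.9.6 or later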
library(data.table)
setDT(df)
melt.data.table(df[, prod_id := .I], # product id = original row number
measure.vars = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
variable.name = 'sequence',
value.name = c('date','price'),
id.vars = 'prod_id') -> long_form
In this case you don't use the sequence, so getting to wide form is just:
dcast.data.table(long_form[, !'sequence', with=FALSE],
value.var = 'price', # optional (function guesses correctly)
prod_id ~ date) -> wide_form
3. tidyr & dplyr split-apply-combine (easy to understand)
It doesn't require the mental gymnastics that reshape does (at least for me). It is a column-wise variant on the "split-apply-combine" paradigm.
library(dplyr); library(tidyr)
# Create long-form time series data
# Split table into sequenced prices and dates, then combine on product and sequence
full_join(
df %>%
select(starts_with('date_')) %>% #~~~~ Left side = date component ~~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ product id = original row number ~
gather(sequence, date, -prod_id) %>% #~ long form = 1 row per prod per seq ~
mutate(sequence = #~~~ Cols: product_id, sequence, date ~~~
sub('^date_(\\d+)$', '\\1', sequence) ) ,
df %>%
select(starts_with('price_')) %>% #~~~ Right side = price component ~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ ~
gather(sequence, price, -prod_id) %>% #~ same idea ~
mutate(sequence = #~~ Cols: product_id, sequence, price ~~~
sub('^price_(\\d+)$', '\\1', sequence) )
) -> long_form
In this case you don't need the sequence, so to get to wide form it's simply:
long_form %>% select(-sequence) %>% spread(date, price) -> wide_form
as noted by others above.
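On newer versions of tidyr (1.0 or later), the same long-then-wide idea can be written with pivot_longer()/pivot_wider() instead of gather()/spread(). A minimal sketch, assuming df is the original wide data frame from the question (the date column names become character strings in the wide result):
library(dplyr); library(tidyr)
long_form <- df %>%
  mutate(prod_id = row_number()) %>%                # product id = original row number
  pivot_longer(-prod_id,
               names_to = c(".value", "sequence"),  # "date_1" -> value column "date", sequence "1"
               names_sep = "_")
wide_form <- long_form %>%
  select(-sequence) %>%
  pivot_wider(names_from = date, values_from = price)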
Here is one way I came up with using the dplyr/tidyr packages:
library(tidyr)
library(dplyr)
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
dates <- df[, grep('date', names(df))]
dates <- dates %>% gather(date_type, date) %>% select(-date_type)
prices <- df[, grep('price', names(df))]
prices <- prices %>% gather(price_type, price) %>% select(-price_type)
df <- cbind(dates, prices)
Then, to spread dates to columns and prices to rows, you can do something like this:
df <- arrange(df, price)
df <- spread(df, date, price)
Using base R and tidyr you could do:
library(tidyr)
#add an id to identify the products
df$id <- 1:40
#transform the data to a long format
long_data <- reshape(df,idvar="id",varying=list(paste0("date_",1:6),paste0("price_",1:6)),v.names=c("date","price"),direction="long",sep="_")
long_data <- long_data[,!grepl("time",colnames(long_data))]
#put it back to a wide format
wide_data <- spread(long_data,date,price)
Related
I have a data frame with two columns (one contains a timestamp and the other a specific value).
The timestamps are not exactly equally spaced, but the gaps are approximately the same length (1000 ms +/- 15%).
Every now and then a timestamp and its value are missing, resulting in a timestamp difference of approximately twice the previous difference.
Is there a way to find those gaps and just add an NA row, to later fill it with imputed values?
Thanks!
How about this:
library(tidyverse)
df <- tibble(id = as.character(1:20), t = seq(1000, 20000, by = 1000) + runif(20, -15, 15), x = rnorm(20)) %>%
filter(!(id %in% c(3, 7, 9)))
df
df <- df %>% mutate(delta_t = t - lag(t),
mean_t = (t + lag(t))/2) %>%
filter(delta_t > 1015 | delta_t < 985 ) %>%
mutate(t = mean_t, x = NA) %>%
select(id, t, x) %>%
bind_rows(df) %>%
arrange(t)
df
The id column is there purely so that you can easily see that I've taken out three values when constructing the dummy data.
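If you later want to fill the inserted NAs, one hedged option (assuming the zoo package is available and the signal can be treated as roughly linear between neighbouring points) is linear interpolation with zoo::na.approx(); na.spline() and na.locf() are alternatives in the same package:
library(zoo)
# interpolate x linearly over the (irregular) time axis t; keep any leading/trailing NAs
df$x <- na.approx(df$x, x = df$t, na.rm = FALSE)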
In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomization should generate the same count of TRUE and FALSE values; in the above data frame, 5 TRUEs and 5 FALSEs in each of the 3 categories). How can I do this?
You can sample a vector of TRUE and FALSE values without replacement so you have a randomized and balanced column in your data frame.
sample(rep(c(TRUE, FALSE), each = 5), 10, replace = FALSE)
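To apply that idea to each category in one step while keeping the original row order, one base R sketch (my own addition, not from the answer above, and it still relies on even group sizes like the question's 10 rows per category) uses ave():
set.seed(1)  # only for reproducibility
df$new_col <- ave(rep(TRUE, nrow(df)), df$category,
                  FUN = function(v) sample(rep(c(TRUE, FALSE), each = length(v) / 2)))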
Based on Yacine Hajji's answer:
addRandomBool <- function(df, p){
n <- ceiling(nrow(df) * p)
df$bool <- sample(rep(c(TRUE, FALSE), times = c(n, nrow(df) - n)))
df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where the parameter p determines the proportion of TRUE values.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to sample and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but it still assumes an (approximately) even distribution of TRUE and FALSE within a group. It can accommodate variable group sizes, and even groups with odd numbers of records (but it will favor FALSE for odd numbers of records).
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by/nest, and then use select to restore the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place:
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.
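For what it's worth, one more hedged variant along those lines: grouping and mutating directly, without nesting, also keeps both the row order and the column order (a sketch, assuming df is the original data frame from the question):
library(dplyr)
set.seed(pi)
df %>%
  group_by(category) %>%
  mutate(new_col = sample(rep(c(FALSE, TRUE), length.out = n()))) %>%  # balanced within each group
  ungroup()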
I have repeated-measures data with two ratings (reliable and fast) repeated on two different objects (each survey respondent rates each object using the same two rating measures). I would like to have two columns, one for object 1 and one for object 2, with the ratings displayed in two separate rows.
In the reference manual there is a reference to using a | separator to compare two variables, but the example given is for mrsets, not means, and I'm not sure how to do the same with means and keep them in separate data frame columns.
In the code below, the problem is that instead of placing the means side by side (for comparison), they are stacked on top of each other.
#library
library(expss)
library(magrittr)
#dummy data
set.seed(9)
df <- data.frame(
q1_reliable=sample(c(1,5), 100, replace = TRUE),
q1_fast=sample(c(1,5), 100, replace = TRUE),
q2_reliable=sample(c(1,5), 100, replace = TRUE),
q2_fast=sample(c(1,5), 100, replace = TRUE))
#table
df %>%
tab_cells(q1_reliable,q1_fast) %>%
tab_stat_mean(label = "") %>%
tab_cells(q2_reliable,q2_fast) %>%
tab_stat_mean(label = "") %>%
tab_pivot()
I discovered that if I add variable labels first and use tab_pivot(stat_position = "inside_columns"), it solves the problem.
#library
library(expss)
library(magrittr)
#dummy data
set.seed(9)
df <- data.frame(
q1_reliable=sample(c(1,5), 100, replace = TRUE),
q1_fast=sample(c(1,5), 100, replace = TRUE),
q2_reliable=sample(c(1,5), 100, replace = TRUE),
q2_fast=sample(c(1,5), 100, replace = TRUE)
)
#labels
df = apply_labels(df,
q1_reliable = "reliable",
q1_fast = "fast",
q2_reliable = "reliable",
q2_fast = "fast")
#table
df %>%
tab_cells(q1_reliable,q1_fast) %>%
tab_stat_mean(label = "") %>%
tab_cells(q2_reliable,q2_fast) %>%
tab_stat_mean(label = "") %>%
tab_pivot(stat_position = "inside_columns")
Like this data.table approach?
library(data.table)
#melt first
DT <- melt( setDT(df),
measure.vars = patterns( reliable = "reliable", fast = "fast"),
variable.name = "q")
#then summarise
DT[, lapply(.SD, mean), by = .(q), .SDcols = c("reliable", "fast")]
q reliable fast
1: 1 3.04 2.96
2: 2 2.92 2.96
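If you prefer to stay in the tidyverse, a comparable sketch (my own alternative, assuming tidyr >= 1.0 for pivot_longer(), dplyr >= 1.0 for across(), and df as originally defined in the question) would be:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(everything(),
               names_to = c("q", ".value"),  # "q1_reliable" -> q = "q1", value column "reliable"
               names_sep = "_") %>%
  group_by(q) %>%
  summarise(across(c(reliable, fast), mean))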
This is a sample of my dataset.
library(tidyr)
library(dplyr)
resource <- c("good","good","bad","bad","good","good","bad","bad","good","good","bad","bad","good","good","bad","bad")
fertilizer <- c("none", "nitrogen","none","nitrogen","none", "nitrogen","none","nitrogen","none", "nitrogen","none","nitrogen","none", "nitrogen","none","nitrogen")
t1 <- sample(1:20, 16)
t2 <- sample(1:20, 16)
t3 <- sample(1:20, 16)
t4 <- sample(1:20, 16)
t5 <- sample(1:20, 16)
t6 <- sample(10:100, 16)
t7 <- sample(10:100, 16)
t8 <- sample(10:100, 16)
t9 <- sample(10:100, 16)
t10 <- sample(10:100, 16)
replicates <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)
data <- data.frame(resource, fertilizer,replicates, t1,t2,t3,t4,t5,t6,t7,t8,t9,t10)
data$resource <- as.factor(data$resource)
data$fertilizer <- as.factor(data$fertilizer)
Here t1, t2, ..., t10 are time points. I need to average adjacent time points (non-overlapping), e.g. (t1, t2), (t3, t4), ..., and the new column headings need to be the average of the times, so that the columns read t1.5, t3.5, etc.
Thus, in the end, I need to have only 5 such columns, reading t1.5, t3.5, t5.5, t7.5, t9.5.
Is there any way this can be achieved using a dplyr function, or any other function in R?
Edited for OP's modified request:
If you put everything in a tidy format, you can take advantage of the lag/lead functions to average adjacent rows.
library(stringr)
library(forcats)
data %>%
gather(key = time, value = value, -replicates, -resource, -fertilizer) %>%
mutate(index = as.integer(str_extract(time, "[0-9]+"))) %>%
arrange(replicates, index) %>%
group_by(resource, fertilizer, replicates) %>%
mutate(mid_value = (value + lead(value))/2,
mid_index = (index + lead(index))/2,
mid_time = str_c("t",mid_index)) %>%
ungroup %>%
filter(!is.na(mid_value), index %% 2 == 1) %>%
select(replicates, resource, fertilizer, matches("mid")) %>%
rename(value = mid_value, time = mid_time, index = mid_index) %>%
arrange(index) %>%
mutate(time = as_factor(time)) %>%
select(-index) %>%
spread(key = time, value = value) %>%
arrange(replicates)
Solution using only base R: you need to somehow find the columns you want to calculate the average for. You can do this by searching the column names for the "t" + number pattern. After that, create a list of column-index pairs, corresponding to the column numbers of data you want to calculate the mean for.
relevant_cols <- grep("[0-9]{1,2}", names(data))
start <- min(relevant_cols)
end <- max(relevant_cols)
cols <- split(start:end, rep(1:5, each=2))
If you look at cols, you'll see that it is a list of five elements, each one a pair of columns you want to average. This smells like a use case for sapply():
newdf <- sapply(cols, function(x) rowMeans(data[x]))
colnames(newdf) <- paste0("t", seq(1, diff(range(relevant_cols)), 2) + 0.5)
Edit: I seem to have misunderstood what you want to keep and what not. You can just cbind() (parts of) the old data to newdf:
cbind(data, newdf)
cbind(data[, -relevant_cols], newdf) # This is what you want. I think..
Here ya go:
transmute(data,
t1.5 = (t1 + t2) / 2,
t3.5 = (t3 + t4) / 2,
t5.5 = (t5 + t6) / 2,
t7.5 = (t7 + t8) / 2,
t9.5 = (t9 + t10) / 2)
I am trying to get the total precipitation values for every hour from a personal weather station I have, using the weatherData package. The problem is that the data are collected every five minutes and the values repeat themselves until there is a change in the precipitation value. I have tried the duplicated() function, but a large amount of data is removed when there is no precipitation, which makes it hard for me to get a summary of the hourly precipitation.
Please see the code below.
## Load required libraries
library(weatherData)
library(ggplot2)
library(scales)
library(plyr)
library(reshape2)
library(gridExtra)
library(lubridate)
library(weathermetrics)
library(zoo)
# Get data for PWS using weatherData package
pws <- getWeatherForDate("IPENANGB2", "2014-09-01","2014-09-30", station_type = "id",opt_detailed=T, opt_custom_columns=T, custom_columns=c(1,2,6,7,10))
# Rename columns
colnames(pws)<-c("time","time1","tempc","wdd","wspd","prcp")
## Adding date columns
pws$time<-as.POSIXct(pws$time1,format="%Y-%m-%d %H:%M:%S",tz="Australia/Perth")
pws$year <- as.numeric(format(pws$time,"%Y"))
pws$date <-as.Date(pws$time,format="%Y-%m-%d",tz="Australia/Perth")
pws$year <- as.numeric(as.POSIXlt(pws$date)$year+1900)
pws$month <- as.numeric(as.POSIXlt(pws$date)$mon+1)
pws$monthf <- factor(pws$month,levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE)
pws$weekday <- as.POSIXlt(pws$date)$wday
pws$weekdayf <- factor(pws$weekday,levels=rev(0:6),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE)
pws$yearmonth <- as.yearmon(pws$date)
pws$yearmonthf <- factor(pws$yearmonth)
pws$week <- as.numeric(format(as.Date(pws$date),"%W"))
pws$weekf<- factor(pws$week)
pws$jday<-yday(pws$date)
pws$hour <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%H"))
pws$min <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%M"))
# Remove duplicate values
pws.df <- pws[!duplicated(pws$prcp),]
Assuming you want to get hourly averages of tempc, wdd, wspd, prcp:
# used packages
library(weatherData)
library(lubridate)
library(dplyr)
library(stringr)
# read data
pws <- getWeatherForDate("IPENANGB2",
"2014-09-01",
"2014-09-30",
station_type = "id",
opt_detailed = T,
opt_custom_columns = T,
custom_columns = c(1, 2, 6, 7, 10))
# rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
# cleaning dataset and adding some columns
useful_pws <-
pws %>%
select(2:6) %>%
filter(!str_detect(time1, "<br>")) %>%
mutate(time1 = ymd_hms(time1),
year = year(time1),
month = month(time1),
day = day(time1),
hour = hour(time1)) %>%
tbl_df()
# summarising dataset
useful_pws %>%
select(-time1) %>%
group_by(year, month, day, hour) %>%
summarise(tempc = mean(tempc, na.rm = TRUE),
wdd = mean(wdd, na.rm = TRUE),
wspd = mean(wspd, na.rm = TRUE),
prcp = mean(prcp, na.rm = TRUE))
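If what you are actually after is hourly totals rather than averages, and assuming prcp holds the incremental rainfall for each 5-minute reading (if it is instead a running cumulative value, the within-hour maximum would be closer to what you want), the last step could be adapted like this:
# hourly precipitation totals (assumes prcp is incremental per reading)
useful_pws %>%
  group_by(year, month, day, hour) %>%
  summarise(prcp = sum(prcp, na.rm = TRUE))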