Combine a data frame and a list - r

I have a data frame of dates which I want to merge with a list whose elements contain different numbers of dates (3 or 4). I want to pair each date of the data frame with all the dates in the corresponding list element. For example, pairing the first two dates in df1 with the first two elements of the list would give the following:
date date1 id
1992-09-26 1992-09-05 1
1992-09-26 1992-09-12 1
1992-09-26 1992-09-19 1
1992-09-27 1992-09-06 2
1992-09-27 1992-09-13 2
1992-09-27 1992-09-20 2
How can I do that? I have searched this forum and did not find a similar problem or a comparable solution.
df1 <- structure(c(8304, 8305, 8306, 8307, 8308, 8309, 8310, 8311, 8312,
8313, 8314), class = "Date")
mylist <- list(structure(c(8283, 8290, 8297), class = "Date"), structure(c(8284,
8291, 8298), class = "Date"), structure(c(8285, 8292, 8299), class = "Date"),
structure(c(8279, 8286, 8293, 8300), class = "Date"), structure(c(8280,
8287, 8294, 8301), class = "Date"), structure(c(8316, 8323,
8330, 8337), class = "Date"), structure(c(8317, 8324, 8331,
8338), class = "Date"), structure(c(8318, 8325, 8332, 8339
), class = "Date"), structure(c(8319, 8326, 8333), class = "Date"),
structure(c(8320, 8327, 8334), class = "Date"), structure(c(8321,
8328, 8335), class = "Date"))

Highly inelegant, but it works:
df <- as.data.frame(do.call("rbind",
  mapply(function(x, y) cbind(date = x, date1 = y, id = which(df1 == x)),
         df1, mylist)))
class(df[,1]) <- "Date"
class(df[,2]) <- "Date"
head(df)
date date1 id
1 1992-09-26 1992-09-05 1
2 1992-09-26 1992-09-12 1
3 1992-09-26 1992-09-19 1
4 1992-09-27 1992-09-06 2
5 1992-09-27 1992-09-13 2
6 1992-09-27 1992-09-20 2
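A variant of the same idea that keeps the Date class directly (a sketch on the same df1 and mylist, using Map and an explicit index instead of which()):
# Map() does not simplify, and data.frame() preserves the Date columns,
# so the class<- fix-up above is not needed
df <- do.call(rbind, Map(function(x, y, i) data.frame(date = x, date1 = y, id = i),
                         df1, mylist, seq_along(mylist)))
head(df)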

It sounds like you're just looking for rep and unlist, as in the following:
A <- sapply(mylist, length)
out <- data.frame(date = rep(df1, A),
                  date1 = unlist(mylist),
                  id = rep(seq_along(A), A))
head(out)
# date date1 id
# 1 1992-09-26 8283 1
# 2 1992-09-26 8290 1
# 3 1992-09-26 8297 1
# 4 1992-09-27 8284 2
# 5 1992-09-27 8291 2
# 6 1992-09-27 8298 2
tail(out)
# date date1 id
# 33 1992-10-05 8320 10
# 34 1992-10-05 8327 10
# 35 1992-10-05 8334 10
# 36 1992-10-06 8321 11
# 37 1992-10-06 8328 11
# 38 1992-10-06 8335 11
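Note that unlist() drops the Date class, which is why date1 prints as day counts above. If you want it shown as dates, a small follow-up sketch (assuming the usual 1970-01-01 origin) is:
out$date1 <- as.Date(out$date1, origin = "1970-01-01")
after which head(out) matches the data frame shown in the first answer.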
You could also use data.table
library(data.table)
DT <- data.table(date = df1, date1 = mylist, key = "date")
DT[, id := 1:nrow(DT)]
DT[, as.IDate(unlist(date1), origin = "1970-01-01"), by = list(date, id)]
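If you prefer a named column instead of the default V1 in the last step, the same call can be written as (a minor variant of the code above):
DT[, .(date1 = as.IDate(unlist(date1), origin = "1970-01-01")), by = .(date, id)]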

Related

Remove rows that do not match common dates from a separate data frame

I have two data frames, each containing a column with dates. I would like to modify the first data frame such that its rows (dates) match the second data frame according to common dates. The first data frame has more dates (includes weekend data) than the second (does not include weekend data), so it would require filtering out those dates in the first data frame to match only those dates in the second.
Example of first data frame (df1):
Date        Value
2014-09-19  1
2014-09-20  3
2014-09-21  3
2014-09-22  2
2014-09-23  1
Example of second data frame (df2):
Date        Value
2014-09-19  1
2014-09-22  3
2014-09-23  2
Example of desired output of df1, removing the dates that are not common in df2:
Date        Value
2014-09-19  1
2014-09-22  2
2014-09-23  1
df1 <- read.table(header = T, text = "Date Value
2014-09-19 1
2014-09-20 3
2014-09-21 3
2014-09-22 2
2014-09-23 1")
df2 <- read.table(header = T, text = "Date Value
2014-09-19 1
2014-09-22 3
2014-09-23 2")
library(dplyr)
df1 %>% filter(Date %in% df2$Date)
Output:
Date Value
1 2014-09-19 1
2 2014-09-22 2
3 2014-09-23 1
Assuming that df1 and df2 each have unique dates, and that what is wanted is the rows of df1 for which there is a matching date in df2, use merge like this:
merge(df1, df2[1])
## Date Value
## 1 2014-09-19 1
## 2 2014-09-22 2
## 3 2014-09-23 1
These also work:
library(dplyr)
inner_join(df1, df2[1])
## Joining, by = "Date"
## Date Value
## 1 2014-09-19 1
## 2 2014-09-22 2
## 3 2014-09-23 1
library(sqldf)
sqldf("select a.* from df1 a inner join df2 b using(Date)")
## Date Value
## 1 2014-09-19 1
## 2 2014-09-22 2
## 3 2014-09-23 1
library(data.table)
as.data.table(df1)[df2[[1]],, on = "Date"]
## Date Value
## 1: 2014-09-19 1
## 2: 2014-09-22 2
## 3: 2014-09-23 1
# convert to zoo and output a zoo series
library(zoo)
z1 <- read.zoo(df1)
z2 <- read.zoo(df2)
z1[time(z2)]
## 2014-09-19 2014-09-22 2014-09-23
## 1 2 1
Note
The input in reproducible form is:
df1 <- structure(list(Date = c("2014-09-19", "2014-09-20", "2014-09-21",
"2014-09-22", "2014-09-23"), Value = c(1L, 3L, 3L, 2L, 1L)),
class = "data.frame", row.names = c(NA, -5L))
df2 <- structure(list(Date = c("2014-09-19", "2014-09-22", "2014-09-23"
), Value = c(1L, 3L, 2L)),
class = "data.frame", row.names = c(NA, -3L))
We can use a dplyr::semi_join:
library(dplyr)
df1 %>%
  semi_join(df2, by = "date")
#> # A tibble: 3 x 2
#> date value
#> <date> <dbl>
#> 1 2014-09-19 1
#> 2 2014-09-22 2
#> 3 2014-09-23 1
# the data
df1 <- tribble(~ date, ~ value,
"2014-09-19", 1,
"2014-09-20", 3,
"2014-09-21", 3,
"2014-09-22", 2,
"2014-09-23", 1
) %>%
mutate(date = as.Date(date))
df2 <- tribble(~ date, ~ value,
"2014-09-19", 1,
"2014-09-22", 3,
"2014-09-23", 2
) %>%
mutate(date = as.Date(date))
Created on 2022-06-08 by the reprex package (v2.0.1)
With the base subset option:
df1[df1$Date %in% df2$Date,]
Output:
Date Value
1 2014-09-19 1
4 2014-09-22 2
5 2014-09-23 1
If you are not using any additional libraries, this can also be done as follows (note that the logical test goes in the row index, not the column index):
df1 <- df1[df1$Date %in% df2$Date, ]
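Equivalently, with base subset() (a sketch on the same data):
subset(df1, Date %in% df2$Date)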

Rolling column subtraction of Dataframe

I have a dataframe like this one:
library(lubridate)
set.seed(23)
date_list = seq(ymd('2000-01-15'),ymd('2010-09-18'),by='day')
testframe = data.frame(Date = date_list)
testframe$Day = substr(testframe$Date, start = 6, stop = 10)
testframe$ABC = rnorm(3900)
testframe$DEF = rnorm(3900)
testframe$GHI = seq(from = 10, to = 25, length.out = 3900)
testframe$JKL = seq(from = 5, to = 45, length.out = 3900)
I want to automatically get a rolling subset of this data frame, which should look like this:
testframe_ABC = testframe[,c("Date","Day","ABC")]
testframe_DEF = testframe[,c("Date","Day","DEF")]
testframe_GHI = testframe[,c("Date","Day","GHI")]
testframe_JKL = testframe[,c("Date","Day","JKL")]
The columns Date and Day should always stay; the other columns should be added one at a time. The name of the varying column should be appended to the data frame name to give each new data frame its name. All data frames could also go into a list of data frames, if possible.
Any ideas how to do that?
I assume you want 4 data frames, one each for ABC, DEF, etc. It would be better to put them in a list:
L <- Map(function(nm) testframe[c("Date", "Day", nm)], names(testframe)[-(1:2)])
in which case L$ABC or L[[1]] will refer to the ABC data frame. If you would rather have them as separate objects in the global environment, this will copy the list components there:
list2env(L, .GlobalEnv)
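For reference, a quick look at what was created (using the objects from the code above):
head(L$ABC, 3)        # the ABC data frame from the list
head(L[["JKL"]], 3)   # the same, selected by name
# after list2env(), ABC, DEF, GHI and JKL also exist as separate data frames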
I would not use the term rolling in this context. Typically that term refers to a sliding window such as this:
library(zoo)
rollmeanr(1:10, 3) # 2 is mean of 1:3, 3 is mean of 2:4, etc.
## [1] 2 3 4 5 6 7 8 9
You can use split.default to split on every column and then cbind the first 2 columns to every element, i.e.
lapply(split.default(testframe[-c(1, 2)], seq_along(testframe)[-c(1, 2)]),
       function(i) cbind.data.frame(testframe[c(1, 2)], i))
which gives a list,
$`3`
Date Day ABC
1 2000-01-15 01-15 0.1932123
2 2000-01-16 01-16 -0.4346821
3 2000-01-17 01-17 0.9132671
$`4`
Date Day DEF
1 2000-01-15 01-15 1.7933881
2 2000-01-16 01-16 0.9966051
3 2000-01-17 01-17 1.1074905
$`5`
Date Day GHI
1 2000-01-15 01-15 10.0
2 2000-01-16 01-16 17.5
3 2000-01-17 01-17 25.0
$`6`
Date Day JKL
1 2000-01-15 01-15 5
2 2000-01-16 01-16 25
3 2000-01-17 01-17 45
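If you would rather have the list keyed by the column names (ABC, DEF, ...) instead of "3".."6", a small follow-up sketch on the same result:
res <- lapply(split.default(testframe[-c(1, 2)], seq_along(testframe)[-c(1, 2)]),
              function(i) cbind.data.frame(testframe[c(1, 2)], i))
names(res) <- names(testframe)[-c(1, 2)]
res$ABC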
DATA USED
dput(testframe)
structure(list(Date = structure(c(10971, 10972, 10973), class = "Date"),
Day = c("01-15", "01-16", "01-17"), ABC = c(0.193212333898146,
-0.434682108206693, 0.913267096589322), DEF = c(1.79338809206353,
0.996605106833546, 1.10749048744809), GHI = c(10, 17.5, 25
), JKL = c(5, 25, 45)), row.names = c(NA, -3L), class = "data.frame")

How to combine data with same rownames to one column in R

I'm trying to convert a large set of >200,000 character entries from this:
startTime 1
max 3
min 1
EndTime 2
avg 2
startTime 2
max ..
min ..
EndTime ..
avg ..
..
to a dataframe like this:
startTime max min EndTime avg
1 3 1 2 2
2 .. .. .. ..
I managed it with a for-loop, but it takes too much time. Is there a more efficient way that avoids the for-loop?
Expanding your input data a bit you could use unstack from base R.
Input:
dat
# V1 V2
#1 startTime 1
#2 max 3
#3 min 1
#4 EndTime 2
#5 avg 2
#6 startTime 2
#7 max 3
#8 min 4
#9 EndTime 5
#10 avg 6
Result:
out <- unstack(dat, V2 ~ V1)
out
# avg EndTime max min startTime
#1 2 2 3 1 1
#2 6 5 3 4 2
If you want the column names in the same order as they appear in dat$V1, do
out <- out[unique(dat$V1)]
data
dat <- structure(list(V1 = c("startTime", "max", "min", "EndTime", "avg",
"startTime", "max", "min", "EndTime", "avg"), V2 = c(1L, 3L,
1L, 2L, 2L, 2L, 3L, 4L, 5L, 6L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-10L))
Simply transpose it:
library( data.table )
dt <- data.table::fread(" startTime 1
max 3
min 1
EndTime 2
avg 2
startTime 2", header = FALSE)
as.data.table( t( dt ) )
# V1 V2 V3 V4 V5 V6
# 1: startTime max min EndTime avg startTime
# 2: 1 3 1 2 2 2
This is not an exact duplicate of How to reshape data from long to wide format? so I will answer.
First create a new column ID and then use one of the solutions in the duplicate. I will use the solution based on package reshape2.
pattern <- as.character(df1[1, 1])
ipat <- grep(pattern, df1[[1]])
df1$ID <- rep(seq_along(ipat), each = nrow(df1)/length(ipat))
library(reshape2)
result <- dcast(df1, ID ~ V1, value.var = "V2")[-1]
# avg EndTime max min startTime
#1 2 2 3 1 1
#2 1 3 4 2 2
Final clean-up: put the input dataset df1 back as it was.
df1 <- df1[-ncol(df1)]
Data.
df1 <- read.table(text = "
startTime 1
max 3
min 1
EndTime 2
avg 2
startTime 2
max 4
min 2
EndTime 3
avg 1
")
Here are some alternatives. They do not use any packages.
Assume the input DF shown reproducibly in the Note at the end.
1) xtabs The first line of code converts the first column to character in case it is a factor. We do not need this with the data shown in the Note, but it doesn't hurt and puts the column in a known state.
Then convert the V1 column to a factor having levels in the order that appear so that they don't get rearranged upon output. Also define nicer names and create a Group number vector which numbers the first group of 5 rows as 1, the second group 2 and so on.
Finally use xtabs to create the desired table. If you prefer a data frame as the output rather than a table then use as.data.frame(xt).
DF2 <- transform(DF, V1 = as.character(V1))
DF2 <- transform(DF2, Stat = factor(V1, levels = V1[1:5]),
                      Value = V2,
                      Group = cumsum(V1 == "startTime"))
xt <- xtabs(Value ~ Group + Stat, DF2)
xt
giving:
Stat
Group startTime max min EndTime avg
1 1 3 1 2 2
2 2 4 1 3 2
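One caveat on the data frame conversion mentioned above: as.data.frame() on a table object returns the long Group/Stat/Freq form. To keep the wide layout shown here, use the matrix method explicitly (a sketch):
as.data.frame.matrix(xt)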
2) matrix Even shorter is this one-liner. It gives a matrix. Use as.data.frame(m) if you want a data frame.
m <- matrix(DF$V2,, 5, byrow = TRUE, list(NULL, DF$V1[1:5]))
m
giving:
startTime max min EndTime avg
[1,] 1 3 1 2 2
[2,] 2 4 1 3 2
Note
The input in reproducible form. I have added a few rows.
Lines <- "
startTime 1
max 3
min 1
EndTime 2
avg 2
startTime 2
max 4
min 1
EndTime 3
avg 2"
DF <- read.table(text = Lines, as.is = TRUE)
A tidyverse solution using #markus' data would be :
library(tidyverse)
dat %>%
  group_by(tmp = cumsum(V1 == "startTime")) %>%
  spread(V1, V2) %>%
  ungroup() %>%
  select(-tmp)
# # A tibble: 2 x 5
# avg EndTime max min startTime
# <int> <int> <int> <int> <int>
# 1 2 2 3 1 1
# 2 6 5 3 4 2
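spread() has since been superseded in tidyr; with the tidyverse already loaded, the same reshape via pivot_wider() would be (a sketch on the same dat; note the columns come out in order of first appearance):
dat %>%
  mutate(tmp = cumsum(V1 == "startTime")) %>%
  pivot_wider(names_from = V1, values_from = V2) %>%
  select(-tmp)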

calculate timeline for different subjects in dataframe

I have data like
subject date number
1 1/2/01 4
1 3/2/01 6
1 10/2/01 7
2 1/1/01 2
2 4/1/01 3
I want to get R to work out the number of days since the first sample for each subject. eg:
Subject days
1 0
1 2
1 9
2 0
2 3
How can I do this? I have converted the dates using lubridate.
Something like:
for(i in 1:nrow(data)){
if(data$date[i] != data$date[i -1]) {
data$timeline <- data$date[i] - data$date[i-1]
}
}
I get the error:
argument is of length 0. I think the problem is the first row, where there is no preceding row?
I would use dplyr to do some grouping and data manipulation. Note that we first have to convert your date into something R will recognize as a date.
library(dplyr)
dat$Date <- as.Date(dat$date, '%d/%m/%y')
dat %>%
  group_by(subject) %>%
  mutate(days = Date - min(Date))
# subject date number Date days
# <int> <chr> <int> <date> <time>
# 1 1 1/2/01 4 2001-02-01 0
# 2 1 3/2/01 6 2001-02-03 2
# 3 1 10/2/01 7 2001-02-10 9
# 4 2 1/1/01 2 2001-01-01 0
# 5 2 4/3/01 3 2001-03-04 62
here's the data:
dat <- structure(list(subject = c(1L, 1L, 1L, 2L, 2L), date = c("1/2/01",
"3/2/01", "10/2/01", "1/1/01", "4/3/01"), number = c(4L, 6L,
7L, 2L, 3L), Date = structure(c(11354, 11356, 11363, 11323, 11385
), class = "Date")), .Names = c("subject", "date", "number",
"Date"), row.names = c(NA, -5L), class = "data.frame")
Using the input shown in the Note, convert the date column to Date class (assuming it is in the form dd/mm/yy) and then use ave to subtract the earliest date from all the dates for each subject. If the input is sorted as in the question, we could optionally use x[1] instead of min(x). No packages are used.
data$date <- as.Date(data$date, "%d/%m/%y")
diff1 <- function(x) x - min(x)
with(data, data.frame(subject, days = ave(as.numeric(date), subject, FUN = diff1)))
giving:
subject days
1 1 0
2 1 2
3 1 9
4 2 0
5 2 62
Note
The input used, in reproducible form, is:
Lines <- "
subject date number
1 1/2/01 4
1 3/2/01 6
1 10/2/01 7
2 1/1/01 2
2 4/3/01 3"
data <- read.table(text = Lines, header = TRUE)

Grouping time and counting instances by 12 hour bins in R

I have a dataframe df1 like this :
timestamp
01-12-2015 00:04
01-12-2015 02:20
01-12-2015 02:43
01-12-2015 04:31
01-12-2015 08:51
01-12-2015 11:28
01-12-2015 20:53
01-12-2015 21:28
02-12-2015 00:30
02-12-2015 20:22
This data frame contains timestamps. I want to get counts by binning the hours into 12-hour intervals, i.e. 01-12-2015 [0-9], 01-12-2015 [9-21], and so on.
Output sample:
DayOfMonth Group count
1 1 5
1 2 2
2 1 2
2 2 1
The day of month could also be replaced by a serial number starting at 1. Any help is highly appreciated.
A possible solution in base R:
# convert the 'timestamp' column to a datetime format
df1$timestamp <- as.POSIXct(strptime(df1$timestamp, format = '%d-%m-%Y %H:%M'))
# create day.of.month variable
df1$day.of.month <- format(df1$timestamp, '%d')
# extract the 12 hour interval as am/pm values
df1$group <- gsub('[0-9: ]+', '', format(df1$timestamp, '%r'))
# aggregate
aggregate(. ~ group + day.of.month, df1, length)
which gives:
group day.of.month timestamp
1 am 01 6
2 pm 01 2
3 am 02 1
4 pm 02 1
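A shorter, locale-dependent way to get the same am/pm grouping (a sketch; %p may return an empty string in some locales, so check it on your system):
df1$group <- tolower(format(df1$timestamp, '%p'))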
Another solution using data.table and the pm function of lubridate:
library(lubridate)
library(data.table)
setDT(df1)[, timestamp := dmy_hm(timestamp)
][, group := pm(timestamp)+1
][, .N, .(day.of.month = day(timestamp),group)]
which gives:
day.of.month group N
1: 1 1 6
2: 1 2 2
3: 2 1 1
4: 2 2 1
Used data:
df1 <- structure(list(timestamp = c("01-12-2015 00:04", "01-12-2015 02:20", "01-12-2015 02:43", "01-12-2015 04:31", "01-12-2015 08:51",
"01-12-2015 11:28", "01-12-2015 20:53", "01-12-2015 21:28", "02-12-2015 00:30", "02-12-2015 20:22")),
.Names = "timestamp", class = "data.frame", row.names = c(NA,-10L))
We can use lubridate functions to convert to a datetime class easily, and dplyr to get the output more efficiently than base R methods.
library(lubridate)
library(dplyr)
df1 %>%
  mutate(timestamp = dmy_hm(timestamp)) %>%
  group_by(DayOfMonth = day(timestamp)) %>%
  group_by(Group = as.numeric(cut(timestamp, breaks = "12 hour")),
           add = TRUE) %>%
  summarise(GroupCount = n())
# DayOfMonth Group GroupCount
# <int> <dbl> <int>
#1 1 1 6
#2 1 2 2
#3 2 1 1
#4 2 2 1
Or we can use a compact option with data.table
library(data.table)
setDT(df1)[, {t1 <- dmy_hm(timestamp); .(DayOfMonth = day(t1),
Group = (hour(t1) >= 12) + 1L)}][, .(GroupCount = .N), .(DayOfMonth, Group)]
# DayOfMonth Group GroupCount
#1: 1 1 6
#2: 1 2 2
#3: 2 1 1
#4: 2 2 1
NOTE: The data.table solution is done with just two steps...
data
df1 <- structure(list(timestamp = c("01-12-2015 00:04", "01-12-2015 02:20",
"01-12-2015 02:43", "01-12-2015 04:31", "01-12-2015 08:51", "01-12-2015 11:28",
"01-12-2015 20:53", "01-12-2015 21:28", "02-12-2015 00:30", "02-12-2015 20:22"
)), .Names = "timestamp", class = "data.frame", row.names = c(NA,-10L))
Another possible solution in base R :
timeStamp <- c("01-12-2015 00:04","01-12-2015 02:20","01-12-2015 02:43","01-12-2015 04:31",
"01-12-2015 08:51","01-12-2015 11:28","01-12-2015 20:53","01-12-2015 21:28",
"02-12-2015 00:30","02-12-2015 20:22")
times <- as.POSIXlt(timeStamp,format="%d-%m-%Y %H:%M",tz='GMT')
DF <- data.frame(Times=times)
DF$Group <- (times$hour >= 12) + 1  # 1 = am, 2 = pm
DF$DayOfMonth <- times$mday
res <- aggregate(Times ~ DayOfMonth + Group,data=DF, FUN = length)
# res :
# DayOfMonth Group Times
# 1 1 1 6
# 2 2 1 1
# 3 1 2 2
# 4 2 2 1
Or, if you want timestamps in the [21-24] hour range of one day to be counted with the next day:
timeStamp <- c("01-12-2015 00:04","01-12-2015 02:20","01-12-2015 02:43","01-12-2015 04:31",
"01-12-2015 08:51","01-12-2015 11:28","01-12-2015 20:53","01-12-2015 21:28",
"02-12-2015 00:30","02-12-2015 20:22")
times <- as.POSIXlt(timeStamp,format="%d-%m-%Y %H:%M",tz='GMT')
h <- times$hour + times$min/60 + times$sec/3600
# add 3 hours to the timestamps in the [21-24] hour range,
# which pushes them into the next day
times[h >= 21] <- times[h >= 21] + 3*3600
# recompute the hour after the shift so those timestamps fall into group 1 of the next day
h <- times$hour + times$min/60 + times$sec/3600
DF <- data.frame(Times = times)
DF$Group <- ifelse(h < 9, 1, ifelse(h <= 21, 2, NA))
DF$DayOfMonth <- times$mday
res <- aggregate(Times ~ DayOfMonth + Group,data=na.omit(DF), FUN = length)
# res :
# DayOfMonth Group Times
# 1 1 1 5
# 2 2 1 2
# 3 1 2 2
# 4 2 2 1
Adding to the several already presented options, the stringi package has some date parsing functions as well:
library(stringi)
df1$timestamp <- stri_datetime_parse(df1$timestamp, format = 'dd-MM-yyyy HH:mm')
df1$DayOfMonth <- stri_datetime_format(df1$timestamp, format = 'd')
df1$Group <- stri_datetime_format(df1$timestamp, format = 'a')
After that you can get a count with for example the following two options:
# option 1:
aggregate(. ~ Group + DayOfMonth, df1, length) # copied from #ProcrastinatusMaximus
# option 2a:
library(dplyr)
df1 %>%
  group_by(DayOfMonth, Group) %>%
  tally()
# option 2b:
count(df1, DayOfMonth, Group)
The output of the latter:
DayOfMonth Group n
(chr) (chr) (int)
1 1 a.m. 6
2 1 p.m. 2
3 2 a.m. 1
4 2 p.m. 1
