import and merge dataset all in one step via iterative function? - r

These are the datasets I'm working on.
Since the P-ones have the following structure
# A tibble: 25 x 5
ID GR SES COND `LPP2(1000-1500).POz`
<chr> <chr> <chr> <chr> <dbl>
1 01 RP V NEG-CTR 7.91
2 04 RP V NEG-CTR 1.07
3 06 RP V NEG-CTR 0.742
4 07 RP V NEG-CTR 4.49
5 08 RP V NEG-CTR -2.43
6 09 RP V NEG-CTR 0.649
7 10 RP V NEG-CTR 1.10
8 11 RP V NEG-CTR -2.07
9 12 RP V NEG-CTR 10.9
10 13 RP V NEG-CTR 5.17
While the V-ones have this following ones, with a more extended number of coulmns:
> `RP-V-NEG-CTR`
# A tibble: 25 x 16
ID GR SES COND `P3(400-450).FCz` `P3(400-450).Cz` `P3(400-450).Pz` `LPPearly(500-700)~
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 01 RP V NEG-CTR -11.6 -5.17 11.9 -11.8
2 04 RP V NEG-CTR -0.314 2.23 15.1 -4.02
3 06 RP V NEG-CTR -0.214 -1.30 3.14 4.47
4 07 RP V NEG-CTR -2.83 2.19 13.7 -0.884
5 08 RP V NEG-CTR 4.24 2.45 7.48 1.15
6 09 RP V NEG-CTR 9.57 13.7 23.4 13.3
7 10 RP V NEG-CTR -6.13 -4.13 6.27 0.0229
8 11 RP V NEG-CTR 0.529 5.43 13.0 1.90
9 12 RP V NEG-CTR -7.74 -2.41 15.8 -8.08
10 13 RP V NEG-CTR 1.27 3.48 9.94 1.75
# ... with 15 more rows, and 8 more variables: LPPearly(500-700).Cz <dbl>,
# LPPearly(500-700).Pz <dbl>, LPP1(500-1000).FCz <dbl>, LPP1(500-1000).Cz <dbl>,
# LPP1(500-1000).Pz <dbl>, LPP2(1000-1500).FCz <dbl>, LPP2(1000-1500).Cz <dbl>,
# LPP2(1000-1500).Pz <dbl>
>
I've used the followinf procedure for importing and merging them together:
files <- list.files(pattern = "\\.xls")
for (i in 1:length(files)) {
assign(gsub("\\.xls", "", files[i]), readxl::read_xls(files[i]))
}
data <- `RP-V-NEG-CTR` %>%
add_row(`RP-V-NEG-NOC`) %>%
add_row(`RP-V-NEU-NOC`)%>%
arrange(ID)
data1 <- `RP-POz-NEG-CTR` %>%
add_row(`RP-POz-NEG-NOC`) %>%
add_row(`RP-POz-NEU-NOC`)%>%
arrange(ID)
data <- merge(data, data1, by = c('ID', 'GR', 'SES', 'COND'))
Since trying by this following code I've obtain a non-matched merged dataset:
filenames <- list.files(pattern = '^RP.*\\.xls$')
> data <- purrr::map_df(filenames, readxl::read_excel)
Like this:
In the case I would ike to import already merged in a proper way, what am I suppsed to do/adjust?
Thanks in advance

Maybe we need map2
library(dplyr)
library(purrr)
library(stringr)
filesv <- list.files(pattern = 'RP-V-.*\\.xls', full.names = TRUE)
filesp <- list.files(pattern = 'RP-P-.*\\.xls', full.names = TRUE)
nm1 <- str_c(str_remove(basename(filesv), "\\.xls"),
str_remove(basename(filesp), "\\.xls"), sep="_")
out <- map2(filesv, filesp, ~ {
vdat <- readxl::read_excel(.x)
pdat <- readxl::read_excel(.y)
inner_join(vdat, pdat, by = c('ID', 'GR', 'SES', 'COND'))
}) %>%
setNames(nm1) %>%
bind_rows(.id = 'grp')
Or if we don't need the 'grp' column
out <- map2_dfr(filesv, filesp, ~ {
vdat <- readxl::read_excel(.x)
pdat <- readxl::read_excel(.y)
inner_join(vdat, pdat, by = c('ID', 'GR', 'SES', 'COND'))
})

Related

How to make monthly data to daily data using random techniques in R

I need to transform this monthly data to daily data by using some randomization technique, for example. Here is the dataframe:
library(dplyr)
library(lubridate)
month_year <- c(
"08-2021",
"09-2021",
"10-2021",
"11-2021",
"12-2021"
)
monthly_values_var1 <- c(
598,
532,
736,
956,
780
)
monthly_values_var2 <- c(
18.3179,
62.6415,
11.1033,
30.7443,
74.2076
)
df <- data.frame(month_year, monthly_values_var1, monthly_values_var2)
df
That is the month dataset view:
And the expected result of this, is something like this:
How to do it using R ?
Like this perhaps?
df %>%
mutate(mo_start = dmy(paste(1,month_year))) %>%
tidyr::uncount(days_in_month(mo_start), .id = "day") %>%
mutate(date = dmy(paste(day,month_year))) %>%
mutate(across(contains("var"), ~rnorm(n(), mean = .x, sd = 1)))
# A tibble: 153 x 6
month_year monthly_values_var1 monthly_values_var2 mo_start day date
<chr> <dbl> <dbl> <date> <int> <date>
1 08-2021 599. 18.8 2021-08-01 1 2021-08-01
2 08-2021 598. 17.4 2021-08-01 2 2021-08-02
3 08-2021 596. 18.0 2021-08-01 3 2021-08-03
4 08-2021 598. 19.2 2021-08-01 4 2021-08-04
5 08-2021 600. 18.3 2021-08-01 5 2021-08-05
6 08-2021 597. 19.8 2021-08-01 6 2021-08-06
7 08-2021 599. 18.9 2021-08-01 7 2021-08-07
8 08-2021 597. 17.9 2021-08-01 8 2021-08-08
9 08-2021 597. 16.0 2021-08-01 9 2021-08-09
10 08-2021 596. 17.7 2021-08-01 10 2021-08-10
# … with 143 more rows
It is not a one function question.
There are more compact answers, but it is clearer step by step.
First the data:
month_year <- c(
"08-2021",
"09-2021",
"10-2021",
"11-2021",
"12-2021"
)
monthly_values_var1 <- c(
598,
532,
736,
956,
780
)
monthly_values_var2 <- c(
18.3179,
62.6415,
11.1033,
30.7443,
74.2076
)
df <- data.frame(month_year, monthly_values_var1, monthly_values_var2)
df
Some useful libraries:
library(dplyr)
library(lubridate)
library(stringr)
It needs a similar data frame to save new data:
df$month_year <- lubridate::dmy(paste0('01-',df$month_year))
new.df <- df[0,]
Now the code
counter <- 1
for (i in 1:nrow(df)) {
days_month <- lubridate::days_in_month(df[i, 'month_year'])
mean1 <- df[i, 'monthly_values_var1']/days_month
mean2 <- df[i, 'monthly_values_var2']/days_month
for(j in 1:days_month){
if (j < 10) {
value <- str_pad(string = j, width = length(as.character(j))+1, pad = "0")
} else {
value <- as.character(j)
}
new.df[counter, 'month_year'] <- paste0(lubridate::year(df[i, 'month_year']),'-', lubridate::month(df[i, 'month_year']), '-', value)
new.df[counter, 'monthly_values_var1'] <- rnorm(n = 1, mean = mean1, sd = mean1/3)
new.df[counter, 'monthly_values_var2'] <- rnorm(n = 1, mean = mean2, sd = mean2/3)
counter <- counter + 1
}
}
View(new.df)
lubridate::days_in_month() function shows how many days are in a specific month.
rnorm assign a random number with normal distribution. I choose a mean around each data divided days in month, and a standard deviation mean/3.

How do I use dplyr to correlate each column in a for loop?

I have a dataframe of 19 stocks, including the S&P500 (SPX), throughout time. I want to correlate each one of these stocks with the S&P for each month (Jan-Dec), making 18 x 12 = 216 different correlations, and store these in a list called stockList.
> tokens
# A tibble: 366 x 21
Month Date SPX TZERO .....(16 more columns of stocks)...... MPS
<dbl> <dttm> <dbl> <dbl> <dbl>
1 2020-01-02 3245.50 0.95 176.72
...
12 2020-12-31 3733.42 2.90 .....(16 more columns of stocks)..... 360.73
Here's where my error pops up, by using the index [i], or [[i]], in the cor() function
stockList <- list()
for(i in 1:18) {
stockList[[i]] <- tokens %>%
group_by(Month) %>%
summarize(correlation = cor(SPX, tokens[i+3], use = 'complete.obs'))
}
Error in summarise_impl(.data, dots) :
Evaluation error: incompatible dimensions.
How do I use column indexing in the cor() function when trying to summarize? Is there an alternative way?
First, to recreate data like yours:
library(tidyquant)
# Get gamestop, apple, and S&P 500 index prices
prices <- tq_get(c("GME", "AAPL", "^GSPC"),
get = "stock.prices",
from = "2020-01-01",
to = "2020-12-31")
library(tidyverse)
prices_wide <- prices %>%
select(date, close, symbol) %>%
pivot_wider(names_from = symbol, values_from = close) %>%
mutate(Month = lubridate::month(date)) %>%
select(Month, Date = date, GME, AAPL, SPX = `^GSPC`)
This should look like your data:
> prices_wide
# A tibble: 252 x 5
Month Date GME AAPL SPX
<dbl> <date> <dbl> <dbl> <dbl>
1 1 2020-01-02 6.31 75.1 3258.
2 1 2020-01-03 5.88 74.4 3235.
3 1 2020-01-06 5.85 74.9 3246.
4 1 2020-01-07 5.52 74.6 3237.
5 1 2020-01-08 5.72 75.8 3253.
6 1 2020-01-09 5.55 77.4 3275.
7 1 2020-01-10 5.43 77.6 3265.
8 1 2020-01-13 5.43 79.2 3288.
9 1 2020-01-14 4.71 78.2 3283.
10 1 2020-01-15 4.61 77.8 3289.
# … with 242 more rows
Then I put that data in longer "tidy" format where each row has the stock value and the SPX value so I can compare them:
prices_wide %>%
# I want every row to have month, date, and SPX
pivot_longer(cols = -c(Month, Date, SPX),
names_to = "symbol",
values_to = "price") %>%
group_by(Month, symbol) %>%
summarize(correlation = cor(price, SPX)) %>%
ungroup()
# A tibble: 24 x 3
Month symbol correlation
<dbl> <chr> <dbl>
1 1 AAPL 0.709
2 1 GME -0.324
3 2 AAPL 0.980
4 2 GME 0.874
5 3 AAPL 0.985
6 3 GME -0.177
7 4 AAPL 0.956
8 4 GME 0.873
9 5 AAPL 0.792
10 5 GME -0.435
# … with 14 more rows

Calculate daily mean of data frame in r

I have a data frame in r that contains readings each five minutes of an hour for couple of months. I want to calculate daily mean of the var3 (data frame under) and add into this data frame as var4.
Here is my df:
>df
timestamp Var1 Var2 Var3
1 2018-07-20 13:50:00 32.0358 28.1 3.6
2 2018-07-20 13:55:00 32.0358 28.0 2.5
3 2018-07-20 14:00:00 32.0358 28.1 2.2
I find this solution from searching the forum, but it's raising error.
Here is the solution I am applying:
aggregate(ts(df$var3[, 2], freq = 288), 1, mean)
This is the error I am getting:
Error in df$var3[, 2] : incorrect number of dimensions
I think this should work for my data frame too but not able to remove this error. Please help.
Here's an approach with dplyr and lubridate.
library(dplyr)
library(lubridate)
df %>%
group_by(Day = day(ymd_hms(timestamp))) %>%
mutate(Var4 = mean(Var3))
## A tibble: 1,000 x 6
## Groups: Day [5]
# timestamp Var1 Var2 Var3 Day Var4
# <dttm> <dbl> <dbl> <dbl> <int> <dbl>
# 1 2018-07-20 13:55:30 32.2 22.9 2.35 20 2.99
# 2 2018-07-20 14:00:30 37.7 24.8 2.99 20 2.99
# 3 2018-07-20 14:05:30 38.7 29.6 3.47 20 2.99
# 4 2018-07-20 14:10:30 30.4 24.2 3.02 20 2.99
# 5 2018-07-20 14:15:30 32.0 28.4 2.95 20 2.99
## … with 995 more rows
Sample Data
df <- data.frame(timestamp = ymd_hms("2018-07-20 13:50:30") + 60*5 * 1:1000,
Var1 = runif(100,30,40),
Var2 = runif(100,20,30),
Var3 = runif(100,2,4))

Trouble using object in dataframe after a pipe (decomposition of a msts object)

I do time series decomposition and I want to save the resulting objects in a dataframe. It works if I store the results in a object and use it to make the dataframe afterwards:
# needed packages
library(tidyverse)
library(forecast)
# some "time series"
vec <- 1:1000 + rnorm(1000)
# store pipe results
pipe_out <-
# do decomposition
decompose(msts(vec, start= c(2001, 1, 1), seasonal.periods= c(7, 365.25))) %>%
# relevant data
.$seasonal
# make a dataframe with the stored seasonal data
data.frame(ts= pipe_out)
But doing the same as a one-liner fails:
decompose(msts(vec, start= c(2001, 1, 1), seasonal.periods= c(7, 365.25))) %>%
data.frame(ts= .$seasonal)
I get the error
Error in as.data.frame.default(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors) :
cannot coerce class ‘"decomposed.ts"’ to a data.frame
I thought that the pipe simply moves forward the things that came up in the last step which saves us storing those things in objects. If so, shouldn't both codes result in the very same output?
EDIT (from comments)
The first code works but it is a bad solution because if one wants to extract all the vectors of the decomposed time series one would need to do it in multiple steps. Something like the following would be better:
decompose(msts(vec, start= c(2001, 1, 1),
seasonal.periods= c(7, 365.25))) %>%
data.frame(seasonal= .$seasonal, x=.$x, trend=.$trend, random=.$random)
It's unclear from your example whether you want to extract $x or $seasonal. Either way, you can extract part of a list either with the `[[`() function in base or the alias extract2() in magrittr, as you prefer. You should then use the . when you create a data.frame in the last step.
Cleaning up the code a bit to be consistent with the piping, the following works:
library(magrittr)
library(tidyverse)
library(forecast)
vec <- 1:1000 + rnorm(1000)
vec %>%
msts(start = c(2001, 1, 1), seasonal.periods= c(7, 365.25)) %>%
decompose %>%
`[[`("seasonal") %>%
# extract2("seasonal") %>% # Another option, uncomment if preferred
data.frame(ts = .) %>%
head # Just for the reprex, remove as required
#> ts
#> 1 -1.17332998
#> 2 0.07393265
#> 3 0.37631946
#> 4 0.30640395
#> 5 1.04279779
#> 6 0.20470768
Created on 2019-11-28 by the reprex package (v0.3.0)
Edit based on comment:
To do what you mention in the comments, you need to use curly brackets (see e.g. here for an explanation why). Hence, the following works:
library(magrittr)
library(tidyverse)
library(forecast)
vec <- 1:1000 + rnorm(1000)
vec %>%
msts(start= c(2001, 1, 1), seasonal.periods = c(7, 365.25)) %>%
decompose %>%
{data.frame(seasonal = .$seasonal,
trend = .$trend)} %>%
head
#> seasonal trend
#> 1 -0.4332034 NA
#> 2 -0.6185832 NA
#> 3 -0.5899566 NA
#> 4 0.7640938 NA
#> 5 -0.4374417 NA
#> 6 -0.8739449 NA
However, for your specific use case, it may be clearer and easier to use magrittr::extract and then simply bind_cols:
vec %>%
msts(start= c(2001, 1, 1), seasonal.periods = c(7, 365.25)) %>%
decompose %>%
magrittr::extract(c("seasonal", "trend")) %>%
bind_cols %>%
head
#> # A tibble: 6 x 2
#> seasonal trend
#> <dbl> <dbl>
#> 1 -0.433 NA
#> 2 -0.619 NA
#> 3 -0.590 NA
#> 4 0.764 NA
#> 5 -0.437 NA
#> 6 -0.874 NA
Created on 2019-11-29 by the reprex package (v0.3.0)
With daily data, decompose() does not work well because it will only handle the annual seasonality and will give relatively poor estimates of it. If the data involve human behaviour, it will probably have both weekly and annual seasonal patterns.
Also, msts objects are not great for daily data either because they don't store the dates explicitly.
I suggest you use tsibble objects with an STL decomposition instead. Here is an example using your data.
library(tidyverse)
library(tsibble)
library(feasts)
mydata <- tsibble(
day = as.Date(seq(as.Date("2001-01-01"), length=1000, by=1)),
vec = 1:1000 + rnorm(1000)
)
#> Using `day` as index variable.
mydata
#> # A tsibble: 1,000 x 2 [1D]
#> day vec
#> <date> <dbl>
#> 1 2001-01-01 0.161
#> 2 2001-01-02 2.61
#> 3 2001-01-03 1.37
#> 4 2001-01-04 3.15
#> 5 2001-01-05 4.43
#> 6 2001-01-06 7.35
#> 7 2001-01-07 7.10
#> 8 2001-01-08 10.0
#> 9 2001-01-09 9.16
#> 10 2001-01-10 10.2
#> # … with 990 more rows
# Compute a decomposition
mydata %>% STL(vec)
#> # A dable: 1,000 x 7 [1D]
#> # STL Decomposition: vec = trend + season_year + season_week + remainder
#> day vec trend season_year season_week remainder season_adjust
#> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2001-01-01 0.161 14.7 -14.6 0.295 -0.193 14.5
#> 2 2001-01-02 2.61 15.6 -14.2 0.0865 1.04 16.7
#> 3 2001-01-03 1.37 16.6 -15.5 0.0365 0.240 16.9
#> 4 2001-01-04 3.15 17.6 -13.0 -0.0680 -1.34 16.3
#> 5 2001-01-05 4.43 18.6 -13.4 -0.0361 -0.700 17.9
#> 6 2001-01-06 7.35 19.5 -12.4 -0.122 0.358 19.9
#> 7 2001-01-07 7.10 20.5 -13.4 -0.181 0.170 20.7
#> 8 2001-01-08 10.0 21.4 -12.7 0.282 1.10 22.5
#> 9 2001-01-09 9.16 22.2 -13.8 0.0773 0.642 22.9
#> 10 2001-01-10 10.2 22.9 -12.7 0.0323 -0.0492 22.9
#> # … with 990 more rows
Created on 2019-11-30 by the reprex package (v0.3.0)
The output is a dable (decomposition table) which behaves like a dataframe most of the time. So you can extract the trend column, or either of the seasonal component columns in the usual way.

Format/Generate new table in R using dplyr/tidyr

I have a table calculated using a df which looks like the following.
Month_considered pct `ATC Count`
<fct> <dbl> <fct>
1 Apr-17 54.9 198,337
2 May-17 56.4 227,681
3 Jun-17 58.0 251,664
4 Jul-17 57.7 251,934
5 Aug-17 55.5 259,617
6 Sep-17 55.7 245,588
7 Oct-17 56.6 247,051
8 Nov-17 57.6 256,375
9 Dec-17 56.9 277,784
10 Jan-18 56.7 272,818
Now I want to find the difference in pct between two months.So the desired output would be like
Month_considered pct
<fct> <dbl>
1 Apr-17-May-17 1.5
2 May-17-Jun-17 1.6
3 Jun-17-Jul-17 - 0.3
How do I concatenate the first column like above. I did try using unite in tidyr but it isnt the output what I want to generate.Thank you.
We need to take the difference between the current and the next value
library(dplyr)
library(zoo)
df1 %>%
arrange(as.yearmon(Month_considered, format = "%b-%y")) %>% # to order
mutate_at(vars(Month_considered, pct),
funs(new = lead(., default = last(.)))) %>%
unite(Month_considered, Month_considered, Month_considered_new, sep="-") %>%
transmute(Month_considered, pct = pct_new - pct)
# Month_considered pct
#1 Apr-17-May-17 1.5
#2 May-17-Jun-17 1.6
#3 Jun-17-Jul-17 -0.3
#4 Jul-17-Aug-17 -2.2
#5 Aug-17-Sep-17 0.2
#6 Sep-17-Oct-17 0.9
#7 Oct-17-Nov-17 1.0
#8 Nov-17-Dec-17 -0.7
#9 Dec-17-Jan-18 -0.2
#10 Jan-18-Jan-18 0.0
Or using base R
pct <- df1$pct[-1] - df1$pct[-nrow(df1)]
Month_considered <- paste(df1$Month_considered[-1],
df1$Month_considered[-nrow(df1)], sep="-")
data.frame(Month_considered, pct)
One can try using self-join after adding 1 month from zoo::yearmon type column.
To add a month in yearmon type column simply add 1/12.
The solution is:
library(zoo)
library(dplyr)
df %>% mutate(Month_considered = as.yearmon(Month_considered, "%b-%y"),
Next_Month = Month_considered+(1/12)) %>%
#self join
left_join(.,.,by=c("Next_Month"="Month_considered")) %>%
mutate(Month_considered = paste(Month_considered,Next_Month,sep="-"),
pct = pct.y - pct.x) %>%
select(Month_considered, pct)
# Month_considered pct
# 1 Apr 2017-May 2017 1.5
# 2 May 2017-Jun 2017 1.6
# 3 Jun 2017-Jul 2017 -0.3
# 4 Jul 2017-Aug 2017 -2.2
# 5 Aug 2017-Sep 2017 0.2
# 6 Sep 2017-Oct 2017 0.9
# 7 Oct 2017-Nov 2017 1.0
# 8 Nov 2017-Dec 2017 -0.7
# 9 Dec 2017-Jan 2018 -0.2
# 10 Jan 2018-Feb 2018 NA
Data:
df <- read.table(text=
"Month_considered pct 'ATC Count'
Apr-17 54.9 198337
May-17 56.4 227681
Jun-17 58.0 251664
Jul-17 57.7 251934
Aug-17 55.5 259617
Sep-17 55.7 245588
Oct-17 56.6 247051
Nov-17 57.6 256375
Dec-17 56.9 277784
Jan-18 56.7 272818",
header=TRUE, stringsAsFactors = FALSE)

Resources