Related
I have 2 data sets - one is quarterly which I need to match to monthly data. So the values from the quarterly data will be repeated thrice in the final data set. I have created a one quarter sample below but this would need to be repeated for many quarters.
# One-quarter sample. Month and quarter labels are character strings:
# unquoted, 1/20 would be evaluated as the division 0.05, and the labels
# need plain ASCII double quotes to parse at all.
month <- c("1/20", "2/20", "3/20")
rating <- c(0.5, 0.6, 0.65)
# data.frame() keeps each column's own type; cbind() would coerce the
# numeric columns to character in a single matrix.
df1 <- data.frame(month, rating)
quarter <- c("q1/20")
amount <- c(100)
df2 <- data.frame(quarter, amount)
My final data set should have the following structure
# Target layout: every monthly row carries the value of its quarter.
# Labels are quoted strings (unquoted 1/20 is a division) and the result is
# a data frame so rating/amount stay numeric.
month <- c("1/20", "2/20", "3/20")
rating <- c(0.5, 0.6, 0.65)
quarter <- c("q1/20", "q1/20", "q1/20")
amount <- c(100, 100, 100)
df3 <- data.frame(month, rating, quarter, amount)
In the full quarterly data set (df2), some observations are also monthly, so it would maybe be a case of matching the monthly observations by month and the quarterly observations by quarter?
Thanks in anticipation.
Assuming you have this data.
head(m.dat)
# month rating
# 1 1/18 0.91
# 2 2/18 0.94
# 3 3/18 0.29
# 4 4/18 0.83
# 5 5/18 0.64
# 6 6/18 0.52
head(q.dat)
# quarter amount
# 1 q1/18 1
# 2 q2/18 21
# 3 q3/18 91
# 4 q4/18 61
# 5 q1/19 38
# 6 q2/19 44
You could match month information to quarters using an assignment matrix qm.
# Lookup table: column 1 holds the month number, column 2 the label of the
# quarter that month belongs to. Both columns are character because a matrix
# has a single type.
qm <- matrix(c(1:12, paste0("q", rep(1:4, each = 3))), 12, 2)
# Split every "m/yy" label once into its month and year parts.
month_parts <- strsplit(m.dat$month, "/", fixed = TRUE)
month_num <- vapply(month_parts, `[`, character(1), 1)
year_part <- vapply(month_parts, `[`, character(1), 2)
# Look up each row's own month in the table, giving one quarter per row.
# (Matching the table against the data instead returns only 12 positions that
# are then recycled, which is right only when the data is sorted and starts
# in January.) Comparing as integers also accepts zero-padded months.
quarter_row <- match(as.integer(month_num), as.integer(qm[, 1]))
m.dat$quarter <- paste0(qm[quarter_row, 2], "/", year_part)
This enables you to use merge.
res <- merge(m.dat, q.dat, all=TRUE)
head(res)
# quarter month rating amount
# 1 q1/18 1/18 0.91 1
# 2 q1/18 2/18 0.94 1
# 3 q1/18 3/18 0.29 1
# 4 q1/19 1/19 0.93 38
# 5 q1/19 2/19 0.26 38
# 6 q1/19 3/19 0.46 38
Toy data
m.dat <- structure(list(month = c("1/18", "2/18", "3/18", "4/18", "5/18",
"6/18", "7/18", "8/18", "9/18", "10/18", "11/18", "12/18", "1/19",
"2/19", "3/19", "4/19", "5/19", "6/19", "7/19", "8/19", "9/19",
"10/19", "11/19", "12/19", "1/20", "2/20", "3/20", "4/20", "5/20",
"6/20", "7/20", "8/20", "9/20", "10/20", "11/20", "12/20"), rating = c(0.91,
0.94, 0.29, 0.83, 0.64, 0.52, 0.74, 0.13, 0.66, 0.71, 0.46, 0.72,
0.93, 0.26, 0.46, 0.94, 0.98, 0.12, 0.47, 0.56, 0.9, 0.14, 0.99,
0.95, 0.08, 0.51, 0.39, 0.91, 0.45, 0.84, 0.74, 0.81, 0.39, 0.69,
0, 0.83)), class = "data.frame", row.names = c(NA, -36L))
q.dat <- structure(list(quarter = c("q1/18", "q2/18", "q3/18", "q4/18",
"q1/19", "q2/19", "q3/19", "q4/19", "q1/20", "q2/20", "q3/20",
"q4/20"), amount = c(1, 21, 91, 61, 38, 44, 4, 97, 43, 96, 89,
64)), class = "data.frame", row.names = c(NA, -12L))
Assuming that df1 and df2 are the data frames shown in the Note at the end create a yq column of class yearqtr in each and merge on that:
library(zoo)
# Add a common key `yq` to both data frames: the month labels are parsed with
# the format "%m/%y" and the quarter labels with "q%q/%y".
df1 <- transform(df1, yq = as.yearqtr(month, "%m/%y"))
df2 <- transform(df2, yq = as.yearqtr(quarter, "q%q/%y"))
# Join on that key; all = TRUE keeps rows that have no partner on either side.
merge(df1, df2, by = "yq", all = TRUE)
giving:
yq month rating quarter amount
1 2020 Q1 1/20 0.50 q1/20 100
2 2020 Q1 2/20 0.60 q1/20 100
3 2020 Q1 3/20 0.65 q1/20 100
We could also consider converting the month column into a yearmon class column using
as.yearmon .
Note
df1 <- data.frame(month = c("1/20", "2/20", "3/20"), rating = c(0.5,0.6,0.65))
df2 <- data.frame(quarter = "q1/20", amount = 100)
Say I have some data with 2 numeric variables ranging from 0 to 1 (it1, it2), a name variable, which has the name of the subject the numeric variable belongs to and then some date for every measure, ranging from year 2014 to 2017. Now, what I want to do is create a data set that only contains measures of people that have values for every year of my measure, and then in the future maybe specify that I only want measures for people with data ranging from 2015 to 2017. Does anybody have any hint on what package or code could help me with my problem? Thanks in advance.
date <- c("2015-11-26", "2015-12-30","2016-11-13", "2014-09-22", "2014-01-13", "2014-07-26", "2016-11-26", "2016-04-04", "2017-04-09", "2017-02-23", "2015-03-22")
names <- c("Max", "Allen", "Allen", "Bob", "Max", "Sarah", "Max", "Sarah", "Max", "Sarah", "Sarah")
it1 <- c(0.6, 0.3, 0.1, 0.2, 0.3, 0.8, 0.8, 0.5, 0.5, 0.3, 0.7)
it2 <- c(0.5, 0.8, 0.1, 0.4, 0.4, 0.4, 0.5, 0.8, 0.6, 0.5, 0.4)
date <- as.Date(date, format = "%Y-%m-%d")
myframe <- data.frame(date, names, it1, it2)
Desired output:
date <- c("2015-11-26", "2014-01-13", "2014-07-26", "2016-11-26", "2016-04-04", "2017-04-09", "2017-02-23", "2015-03-22")
names <- c("Max", "Max", "Sarah", "Max", "Sarah", "Max", "Sarah", "Sarah")
it1 <- c(0.6, 0.3, 0.8, 0.8, 0.5, 0.5, 0.3, 0.7)
it2 <- c(0.5, 0.4, 0.4, 0.5, 0.8, 0.6, 0.5, 0.4)
date <- as.Date(date, format = "%Y-%m-%d")
myframe <- data.frame(date, names, it1, it2)
Create a table of year vs. name and for those names in all years select out those rows. No packages are used.
# Count observations per (year, name); rows are years, columns are names.
tab <- table(as.POSIXlt(myframe$date)$year + 1900, myframe$names)
# A name qualifies when its column has a non-zero count in every year row.
present_every_year <- colSums(tab > 0) == nrow(tab)
subset(myframe, names %in% colnames(tab)[present_every_year])
giving:
date names it1 it2
1 2015-11-26 Max 0.6 0.5
5 2014-01-13 Max 0.3 0.4
6 2014-07-26 Sarah 0.8 0.4
7 2016-11-26 Max 0.8 0.5
8 2016-04-04 Sarah 0.5 0.8
9 2017-04-09 Max 0.5 0.6
10 2017-02-23 Sarah 0.3 0.5
11 2015-03-22 Sarah 0.7 0.4
library(lubridate)
# Year window to consider; e.g. set first_year <- 2015 for 2015-2017 only.
first_year <- 2014
last_year <- 2017
# Filter to the window once and keep the result. The logical index below is
# computed on this subset, so it has to be applied to this same subset:
# indexing the full `myframe` with it misaligns rows as soon as the filter
# drops any observation.
in_window <- myframe[year(myframe$date) >= first_year &
                       year(myframe$date) <= last_year, ]
# ave() passes FUN the years of one person at a time, while `date` inside FUN
# is the whole subset's date column; the result is 1 for people who have an
# observation in every year that occurs in the subset, 0 otherwise.
has_every_year <- with(
  in_window,
  ave(year(date), names, FUN = function(x) all(year(date) %in% x))
) == 1
in_window[has_every_year, ]
# date names it1 it2
#1 2015-11-26 Max 0.6 0.5
#5 2014-01-13 Max 0.3 0.4
#6 2014-07-26 Sarah 0.8 0.4
#7 2016-11-26 Max 0.8 0.5
#8 2016-04-04 Sarah 0.5 0.8
#9 2017-04-09 Max 0.5 0.6
#10 2017-02-23 Sarah 0.3 0.5
#11 2015-03-22 Sarah 0.7 0.4
I'm trying to loop through a large dataframe [5413 columns] and run an ANOVA on each column, however I'm getting an error when trying to do so.
I'd like to have the P value from the ANOVA written to a new row in a dataframe containing the column titles. But given my limited current knowledge, I'm writing the P-value outputs to files I can parse through in bash.
Here's an example layout of the data:
data()
Name, Group, aaaA, aaaE, bbbR, cccD
Apple, Fruit, 1.23, 0.45, 0.3, 1.1
Banana, Fruit, 0.54, 0.12, 2.0, 1.32
Carrot, Vegetable, 0.01, 0.05, 0.45, 0.9
Pear, Fruit, 0.1, 0.2, 0.1, 0.3
Fox, Animal, 1.0, 0.9, 1.2, 0.8
Dog, Animal, 1.2, 1.1, 0.8, 0.7
And here is the output from dput:
structure(list(Name = structure(c(1L, 2L, 3L, 6L, 5L, 4L), .Label = c("Apple",
"Banana", "Carrot", "Dog", "Fox", "Pear"), class = "factor"),
Group = structure(c(2L, 2L, 3L, 2L, 1L, 1L), .Label = c(" Animal",
" Fruit", " Vegetable"), class = "factor"), aaaA = c(1.23,
0.54, 0.01, 0.1, 1, 1.2), aaaE = c(0.45, 0.12, 0.05, 0.2,
0.9, 1.1), bbbR = c(0.3, 2, 0.45, 0.1, 1.2, 0.8), cccD = c(1.1,
1.32, 0.9, 0.3, 0.8, 0.7)), class = "data.frame", row.names = c(NA,
-6L))
To get a successful output from one I do:
summary(aov(aaaA ~ Group, data=data))[[1]][["Pr(>F)"]]
I then try to implement that in a loop:
# Question code, kept as posted: this is the loop that raises the error
# quoted below.
# `i` takes each column *name* of data[3:6] as a character string.
for(i in names(data[3:6])){
# NOTE: the formula below contains the symbol `i` itself, not the column it
# names, so the response is the length-1 string held in `i` while Group has
# one value per row -- hence "variable lengths differ".
out <- summary(aov(i ~ Group, data=data))[[1]][["Pr(>F)"]]
# The column name is used as the output file name.
write.csv(out, i)}
Which returns the error:
Error in model.frame.default(formula = i ~ Group, data = test, drop.unused.levels = TRUE) :
variable lengths differ (found for 'Group')
Can anyone help with getting around the error or implementing a per-column ANOVA?
We can do the following and later get the p values:
to_use<-setdiff(names(df),"aaaA")
lapply(to_use,function(x) summary(do.call(aov,list(as.formula(paste("aaaA","~",x)),
data=df))))
This gives you:
[[1]]
Df Sum Sq Mean Sq
Name 5 1.48 0.296
[[2]]
Df Sum Sq Mean Sq F value Pr(>F)
Group 2 0.8113 0.4057 1.819 0.304
Residuals 3 0.6689 0.2230
[[3]]
Df Sum Sq Mean Sq F value Pr(>F)
aaaE 1 0.9286 0.9286 6.733 0.0604 .
Residuals 4 0.5516 0.1379
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
[[4]]
Df Sum Sq Mean Sq F value Pr(>F)
bbbR 1 0.043 0.0430 0.12 0.747
Residuals 4 1.437 0.3593
[[5]]
Df Sum Sq Mean Sq F value Pr(>F)
cccD 1 0.1129 0.1129 0.33 0.596
Residuals 4 1.3673 0.3418
How do I apply a function to many columns of grouped rows? For example;
library(tidyverse)
# Example data: one row per Date / Seq1 / Component / Seq2 combination, with
# the measurements in the X columns.
data <- tribble(
  ~Date, ~Seq1, ~Component, ~Seq2, ~X1, ~X2, ~X3,
  "01/01/18", 1, "Smooth", NA, 3.98, 2.75, 1.82,
  "01/01/18", 2, "Smooth", NA, 1.02, 0.02, -0.04,
  "01/01/18", 3, "Smooth", NA, 3.48, 3.06, 1.25,
  "01/01/18", 3, "Bounce", 1, 2.01, -0.43, -0.52,
  "01/01/18", 3, "Bounce", 2, 1.94, 1.53, 1.92) %>%
  # Convert the grouping columns to factors. The function is passed directly
  # because the funs() wrapper is deprecated in dplyr.
  mutate_at(vars(Date, Seq1, Component, Seq2), factor)
Each column of X values (many more columns, truncated here for clarity) is grouped into Date, Seq1, Component, and Seq2. While Component "Smooth" and Seq2 "NA" are constant, within Component "Bounce" level there are multiple Seq2 levels e.g. "1", "2", etc.
How do I sum each X column, always the constant "NA" with each level of Seq2?
The desired results is:
expected <- tribble(
~Date, ~Seq1, ~Component, ~Seq2, ~X1, ~X2, ~X3,
"01/01/18", 1, "Smooth", NA, 3.98, 2.75, 1.82,
"01/01/18", 2, "Smooth", NA, 1.02, 0.02, -0.04,
"01/01/18", 3, "Smooth", NA, 3.48, 3.06, 1.25,
"01/01/18", 3, "Bounce", 1, 5.49, 3.49, 1.77,
"01/01/18", 3, "Bounce", 2, 5.42, 4.59, 3.17)
The following example only adds each Seq1 level.
# Attempt from the question: replaces every X value with the total of its
# Date/Seq1 group, which is why all three Seq1 == 3 rows end up identical.
# sum is passed directly because the funs() wrapper is deprecated in dplyr.
data %>%
  group_by(Date, Seq1) %>%
  mutate_at(vars(starts_with("X")), sum)
#> # A tibble: 5 x 7
#> # Groups: Date, Seq1 [3]
#> Date Seq1 Component Seq2 X1 X2 X3
#> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl>
#> 1 01/01/18 1 Smooth <NA> 3.98 2.75 1.82
#> 2 01/01/18 2 Smooth <NA> 1.02 0.02 -0.04
#> 3 01/01/18 3 Smooth <NA> 7.43 4.16 2.65
#> 4 01/01/18 3 Bounce 1 7.43 4.16 2.65
#> 5 01/01/18 3 Bounce 2 7.43 4.16 2.65
I am certain there is solution within the purrr or apply function family, however, I have been unsuccessful (for days) in solving this example. The actual data has about 180 X columns, with hundreds of Date and Seq1 combinations, and multiple Seq2 levels.
A similar example could be Summing Multiple Groups of Columns, How to apply a function to a subset of columns in r?, or even perhaps https://github.com/jennybc/row-oriented-workflows.
Created on 2018-10-23 by the reprex package (v0.2.1)
Here's my solution. This problem is not really a purrr task, because there is nothing really that you want to map a single function to. Instead, what I understand the problem to be is that you want to match each X value in a Bounce row with the corresponding Smooth row X values of the same Date and Seq1 (and there is only one such row). This means that it is really a merging or joining problem, and then the approach is to set up the join so that you can match the right values and do the sum. So I go as follows:
Split the data into the Smooth rows and the Bounce rows and gather so that all the X values are in one column
Join the smooths onto the bounces with a left_join, so each original Bounce row now has its corresponding Smooth.
mutate the sum into a new column and select/rename the columns to be as in the original
bind_rows to join the newly summed bounces and spread to return to the original layout.
This should be robust to any number of Date, Seq1, Seq2 and X values.
library(tidyverse)
data <- tribble(
~Date, ~Seq1, ~Component, ~Seq2, ~X1, ~X2, ~X3,
"01/01/18", 1, "Smooth", NA, 3.98, 2.75, 1.82,
"01/01/18", 2, "Smooth", NA, 1.02, 0.02, -0.04,
"01/01/18", 3, "Smooth", NA, 3.48, 3.06, 1.25,
"01/01/18", 3, "Bounce", 1, 2.01, -0.43, -0.52,
"01/01/18", 3, "Bounce", 2, 1.94, 1.53, 1.92)
# Smooth rows in long form: one row per original row and X column, with the
# column name in `X` and its value in `val`.
smooths <- data %>%
filter(Component == "Smooth") %>%
gather(X, val, starts_with("X"))
# Bounce rows in the same long form, each joined to the Smooth row that shares
# its Date, Seq1 and X. After the join the Bounce value is `val.x` and the
# Smooth value is `val.y`; their sum becomes the new `val`, and the Bounce
# side's Component and Seq2 are kept under their original names.
bounces <- data %>%
filter(Component == "Bounce") %>%
gather(X, val, starts_with("X")) %>%
left_join(smooths, by = c("Date", "Seq1", "X")) %>%
mutate(val = val.x + val.y) %>%
select(Date, Seq1, Component = Component.x, Seq2 = Seq2.x, X, val)
# Stack the summed Bounce rows on the unchanged Smooth rows and spread the X
# names back out into columns.
bounces %>%
bind_rows(smooths) %>%
spread(X, val)
#> # A tibble: 5 x 7
#> Date Seq1 Component Seq2 X1 X2 X3
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 01/01/18 1 Smooth NA 3.98 2.75 1.82
#> 2 01/01/18 2 Smooth NA 1.02 0.02 -0.04
#> 3 01/01/18 3 Bounce 1 5.49 2.63 0.73
#> 4 01/01/18 3 Bounce 2 5.42 4.59 3.17
#> 5 01/01/18 3 Smooth NA 3.48 3.06 1.25
Created on 2018-10-31 by the reprex package (v0.2.1)
My data frame is simple (and probably is not strictly a dataframe):
date MAE_f0 MAE_f1
1 20140101 0.2 0.2
2 20140102 1.9 0.1
3 20140103 0.1 0.3
4 20140104 7.8 15.9
5 20140105 1.9 4.6
6 20140106 0.8 0.8
7 20140107 0.5 0.6
8 20140108 0.2 0.2
9 20140109 0.2 0.2
10 20140110 0.8 1.1
11 20140111 0.2 0.2
12 20140112 0.4 0.4
13 20140113 2.8 0.9
14 20140114 5.4 5.8
15 20140115 0.2 0.3
16 20140116 4.9 3.1
17 20140117 3.7 6.0
18 20140118 1.4 2.1
19 20140119 0.9 3.0
20 20140120 0.2 3.6
21 20140121 0.3 0.3
22 20140122 0.4 0.4
23 20140123 0.6 1.7
24 20140124 6.1 4.7
25 20140125 0.1 0.0
26 20140126 7.4 4.9
27 20140127 0.8 0.9
28 20140128 0.3 0.3
29 20140129 3.0 4.2
30 20140130 9.9 17.3
On every day I've 2 variables: MAE for f0, and MAE for f1.
I can calculate frequency for my 2 variables on the whole time period using "cut" with the same intervals for both:
cut(mae.df$MAE_f0,c(0,2,5,10,50))
cut(mae.df$MAE_f1,c(0,2,5,10,50))
Well. Now I can use boxplot to plot each variable versus its frequency distribution:
boxplot(mae.df$MAE_f0~cut(mae.df$MAE_f0,c(0,2,5,10,50)))
boxplot(mae.df$MAE_f1~cut(mae.df$MAE_f1,c(0,2,5,10,50)))
The two boxplots produced are very simple (but I don't show them 'cause I've no "reputation"): on x there are the intervals of frequency (0-2, 2-5, 5-10, 10-50), on y the boxplot value for variable MAE_f0 for each interval.
Well, the question is very trivial: I'd like to have only one box plot, with both variables MAE_f0 and MAE_f1 and their frequency distribution: what I'd like to have is a plot with 2 boxplots for each frequency interval (I mean: 2 for 0-2, 2 for 2-5 and so on).
I know that my knowledge of R, data frames and so on is very poor, and, de facto, I'm missing something important about those topics, especially data frames and reshaping! Sorry in advance for that! But I've seen some nice examples on Stack Overflow about grouped boxplots, all without a time variable, and I'm not able to figure out how I can adjust my data frame to do that.
I hope my question is not misplaced: sorry again for that.
Umbe
Here is how I would do this. I think it makes sense to melt your data first. A quick tutorial on melting your data is available here.
# First, make this reproducible by using dput for the data frame
df <- structure(list(date = 20140101:20140130, MAE_f0 = c(0.2, 1.9, 0.1, 7.8, 1.9, 0.8, 0.5, 0.2, 0.2, 0.8, 0.2, 0.4, 2.8, 5.4, 0.2, 4.9, 3.7, 1.4, 0.9, 0.2, 0.3, 0.4, 0.6, 6.1, 0.1, 7.4, 0.8, 0.3, 3, 9.9), MAE_f1 = c(0.2, 0.1, 0.3, 15.9, 4.6, 0.8, 0.6, 0.2, 0.2, 1.1, 0.2, 0.4, 0.9, 5.8, 0.3, 3.1, 6, 2.1, 3, 3.6, 0.3, 0.4, 1.7, 4.7, 0, 4.9, 0.9, 0.3, 4.2, 17.3)), .Names = c("date", "MAE_f0", "MAE_f1"), row.names = c(NA, -30L), class = "data.frame")
# library() stops with an error when a package is missing; require() only
# returns FALSE with a warning, so the failure would surface later and less
# clearly.
library(ggplot2)
library(reshape2)
# Melt the original data frame
df2 <- melt(df, measure.vars = c("MAE_f0", "MAE_f1"))
head(df2)
# date variable value
# 1 20140101 MAE_f0 0.2
# 2 20140102 MAE_f0 1.9
# 3 20140103 MAE_f0 0.1
# 4 20140104 MAE_f0 7.8
# 5 20140105 MAE_f0 1.9
# 6 20140106 MAE_f0 0.8
# Create a "cuts" variable with the correct breaks
df2$cuts <- cut(df2$value,
breaks = c(-Inf, 2, 5, 10, +Inf),
labels = c("first cut", "second cut", "third cut", "fourth cut"))
head(df2)
# date variable value cuts
# 1 20140101 MAE_f0 0.2 first cut
# 2 20140102 MAE_f0 1.9 first cut
# 3 20140103 MAE_f0 0.1 first cut
# 4 20140104 MAE_f0 7.8 third cut
# 5 20140105 MAE_f0 1.9 first cut
# 6 20140106 MAE_f0 0.8 first cut
# Plotting
ggplot(df2, aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
facet_wrap(~ cuts, nrow = 1)
Result:
Here is one way. You reshape your data. Then, you want to add a fake data point in this case. I noticed that there is no data point for MAE_f0 for (10,50](frequency 10-50). Combine your reshaped data and the fake data. When you draw a figure, use coord_cartesian with the range of y values in the original data set. Hope this gives you an ideal graphic. Here, your data is called mydf
library(dplyr)
library(tidyr)
library(ggplot2)
mydf <- structure(list(V1 = 1:30, V2 = 20140101:20140130, V3 = c(0.2,
1.9, 0.1, 7.8, 1.9, 0.8, 0.5, 0.2, 0.2, 0.8, 0.2, 0.4, 2.8, 5.4,
0.2, 4.9, 3.7, 1.4, 0.9, 0.2, 0.3, 0.4, 0.6, 6.1, 0.1, 7.4, 0.8,
0.3, 3, 9.9), V4 = c(0.2, 0.1, 0.3, 15.9, 4.6, 0.8, 0.6, 0.2,
0.2, 1.1, 0.2, 0.4, 0.9, 5.8, 0.3, 3.1, 6, 2.1, 3, 3.6, 0.3,
0.4, 1.7, 4.7, 0, 4.9, 0.9, 0.3, 4.2, 17.3)), .Names = c("V1",
"V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -30L
))
# Drop the V1 column, give the remaining columns their real names, reshape to
# long form (one row per date and variable) and bin each value with cut().
ana <- select(mydf, -V1) %>%
rename(date = V2, MAE_f0 = V3, MAE_f1 = V4) %>%
gather(variable, value, -date) %>%
mutate(frequency = cut(value, breaks = c(-Inf,2,5,10,50)))
# Fake observation: a single MAE_f0 row placed in the "(10,50]" bin. Its value
# (60) lies above the y-limits set below, which are taken from the real data.
extra <- data.frame(date = 20140101,
variable = "MAE_f0",
value = 60,
frequency = "(10,50]")
# Plot data = real data plus the fake row.
new <- rbind(ana, extra)
# Dodged boxplots per bin, one per variable. coord_cartesian() restricts the
# visible y-range to the range of the real values, padded by 0.25 on each side.
ggplot(data = new, aes(x = frequency, y = value, fill = variable)) +
geom_boxplot(position = "dodge") +
coord_cartesian(ylim = range(ana$value) + c(-0.25, 0.25))