Related
I have the following extract of my dataset:
# dplyr provides the pipe and the arrange()/group_by()/mutate() verbs used
# below; runner provides mean_run().
library(dplyr)
library(runner)
# Example data: irregularly spaced ER observations for three users.
# Columns are named with `=` inside data.frame(); using `<-` there would
# create stray global variables and leave the columns with mangled names.
example <- data.frame(
  Date = as.Date(c(
    "2020-03-24", "2020-04-06", "2020-04-08",
    "2020-04-13", "2020-04-14", "2020-04-15",
    "2020-04-16", "2020-04-18", "2020-04-23",
    "2020-04-24", "2020-04-26", "2020-04-29",
    "2020-03-24", "2020-04-06", "2020-04-08",
    "2020-04-01", "2020-04-12", "2020-04-15",
    "2020-04-17", "2020-04-18", "2020-04-22",
    "2020-05-01", "2020-05-15", "2020-05-29",
    "2020-03-08", "2020-04-06", "2020-04-15",
    "2020-04-22", "2020-04-28", "2020-05-05",
    "2020-05-08", "2020-05-22", "2020-05-23"
  )),
  # 12 rows for steves_, 12 for jules_, 9 for mia, in that order.
  username = rep(c("steves_", "jules_", "mia"), times = c(12, 12, 9)),
  ER = c(
    0.092, 0.08, 0.028,
    0.1, 0.09, 0.02,
    0.02, 0.8, 0.001,
    0.001, 0.1, 0.098,
    0.001, 0.002, 0.02,
    0.0098, 0.002, 0.0019,
    0.002, 0.11, 0.002,
    0.02, 0.01, 0.009,
    0.19, 0.09, 0.21,
    0.22, 0.19, 0.22,
    0.09, 0.19, 0.28
  )
)
str(example)
I would like to calculate the respective average of the ER over a month from the respective dates.
I know that there are similar contributions to this already in the forum - but unfortunately I could not find the solution for me.
I have tried the following solutions:
# Attempt 1 (the asker reports this does not give the desired result):
# rolling mean of ER per user over the preceding month.
# NOTE(review): the whole piped data frame is assigned into a single column,
# and `example$ER` refers to the full, ungrouped column rather than the
# current group's ER.
# NOTE(review): `Date < Date` is FALSE for every row, so the second argument
# is an all-FALSE logical vector rather than a window width.
# NOTE(review): rollmean() and %m-% are presumably from zoo and lubridate,
# neither of which is loaded in this snippet -- confirm.
example$avgER_30days <- example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(rollmean(example$ER, Date > (Date %m-% months(1)) & Date < Date, fill = NA))
or with the package runner:
# Attempt 2 (the asker reports this does not give the desired result):
# 30-day running mean via mean_run(), lagged by one.
# NOTE(review): `example$ER` and `example$Date` pass the full, ungrouped
# columns into mean_run(), so the per-user grouping is bypassed.
# NOTE(review): the whole piped data frame is assigned into a single column.
example$average <- example %>%
group_by(username) %>%
arrange(username, Date) %>%
mutate(mean_run(x = example$ER, k = 30, lag = 1, idx=example$Date)) %>%
ungroup(username)
I would be happy if you could help me!
Here are two equivalent alternatives.
In the first alternative below, the second argument to rollapplyr is a list such that the ith component is the vector of offsets to average over for the ith row of the group.
In the second alternative we can specify the width as a vector of widths, one per row, and then when taking the mean eliminate the last value.
Note that w is slightly different in the two alternatives.
Review ?rollapply for details on the arguments and for further examples.
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
# Alternative 1: give rollapplyr() one vector of offsets per row.
example %>%
  arrange(username, Date) %>%
  group_by(username) %>%
  mutate(
    # findInterval() counts this user's dates that are <= Date - 30, so w is
    # the number of earlier rows dated after Date - 30.
    w = seq_along(Date) - findInterval(Date - 30, Date) - 1,
    # Offsets -w, ..., -1 pick those w preceding rows.
    # NOTE(review): for w == 0, seq(0, to = -1) is c(0, -1), not an empty set
    # of offsets -- confirm this is acceptable for rows with nothing in window.
    avg30 = rollapplyr(
      ER,
      lapply(-w, function(first_offset) seq(first_offset, to = -1)),
      mean,
      fill = NA
    )
  ) %>%
  ungroup()
# Alternative 2: give rollapplyr() one window width per row and drop the
# current (last) value before averaging.
example %>%
  arrange(username, Date) %>%
  group_by(username) %>%
  mutate(
    # Width counts the current row plus the earlier rows dated after Date - 30.
    w = seq_along(Date) - findInterval(Date - 30, Date),
    avg30 = rollapplyr(
      ER,
      w,
      function(window) mean(window[-length(window)]),
      fill = NA
    )
  ) %>%
  ungroup()
For the time series plot below, which is composed of two subplots:
library(tidyverse)
library(lubridate)
library(feasts)
library(tsibble)
library(gghighlight)
# Seasonal plot of both indices: one panel per index, one line per year.
df %>%
  mutate(
    # `date` is a numeric serial; convert it using the 1899-12-30 origin.
    date = as.Date(date, origin = "1899-12-30"),
    year = as.numeric(year(date))
  ) %>%
  pivot_longer(food_index:energy_index) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date, key = name) %>%
  gg_season(value, alpha = 1) +
  geom_line(size = 0.8, alpha = 0.8) +
  geom_point(size = 2, alpha = 1)
Out:
Let's say the current year is 2022: I want to plot the line for that year with alpha=1 and the lines for the other years with a smaller alpha, i.e., alpha=0.3.
How could I do that? Thanks in advance for your help.
Data:
# Sample data (dput output): a 60-row data frame with a numeric `date` serial
# plus `food_index` and `energy_index`. The plotting code in this thread
# converts `date` with as.Date(date, origin = "1899-12-30").
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")
You could achieve this by creating a boolean variable that detects the year you would like to highlight and then passing that as the alpha aesthetic inside your plot:
# Seasonal plot with one year emphasised: the base layer is drawn faintly,
# then lines and points are overlaid with alpha mapped to `highlight`.
df %>%
  mutate(date = as.Date(date, origin = "1899-12-30")) %>%
  mutate(year = as.numeric(year(date))) %>%
  pivot_longer(`food_index`:`energy_index`) %>%
  mutate(
    date = yearmonth(date),
    # `year` is numeric, so compare against a number; the comparison is
    # already logical and needs no ifelse() wrapper.
    highlight = year == 2021
  ) %>%
  as_tsibble(index = date, key = name) %>%
  gg_season(value, alpha = 0.2) +
  geom_line(aes(alpha = highlight),
            size = 0.8) +
  geom_point(aes(alpha = highlight),
             size = 2) +
  # Values are matched to FALSE, TRUE in that order.
  scale_alpha_manual(values = c(0.2, 1)) +
  guides(alpha = "none") +
  theme_bw()
I have an R data frame and would like to make a dot plot in ggplot where the x-axis corresponds to the different groups in the Ensembl_ID column ("ENSG00000000003", "ENSG00000000005", etc. are the groups for the x-axis) and the y-axis corresponds to the respective values in logFC.1 to logFC.9. I would like to display the logFC.1 to logFC.9 values of each group as dots, as in the attached image (to prevent confusion, each group should have the same color and only change in intensity, e.g. if the mean x is bigger it gets more red and if the mean y is bigger it gets more green, or something similar).
If possible, could you also help me assign different colors to positive and negative values?
Can we also assign the dots' color based on value (color intensity)?
# Test sample: ten Ensembl gene IDs with nine logFC measurements each.
df1 <- data.frame(
  # IDs are "ENSG" followed by an 11-digit, zero-padded number.
  Ensembl_ID = sprintf(
    "ENSG%011d",
    c(3L, 5L, 419L, 457L, 460L, 938L, 971L, 1036L, 1084L, 1167L)
  ),
  logFC.1 = c(
    0.147447019707984, -0.278643924528991, 0.00638502079233481,
    0.00248371473862579, 0.0591639590814736, -0.0892578080659792,
    -0.0139042150604349, 0.15210410748665, -0.0273174541997048,
    0.0373813166759115
  ),
  logFC.2 = c(
    0.14237211045168, -0.153847067952652, 0.00806519294435945,
    -0.0243298183425441, 0.0639184480028851, -0.0791126460573967,
    -0.0517704622015086, 0.100033161692714, 0.105136768894399,
    0.0509474174745926
  ),
  logFC.3 = c(
    0.0692402101693023, -0.212626837128185, 0.0665466667502187,
    0.0189664498456434, 0.073631371224761, -0.0642014520794086,
    0.0115060035255512, 0.104767159584613, 0.140378485980222,
    0.0814931176279395
  ),
  logFC.4 = c(
    0.175916688982428, -0.0606440302201137, 0.0862627141013101,
    0.105179938123113, 0.128866411791584, -0.0988927171791539,
    0.128758540724723, 0.0997656895899759, 0.345468063926355,
    0.130898388184307
  ),
  logFC.5 = c(
    0.144743421921328, 0.247159332221974, 0.0232237466183996,
    0.0800788300610377, 0.178887735169961, -0.0592727391427514,
    -0.0723099661837084, 0.0387715967173523, -0.0607793368610136,
    0.110464511693512
  ),
  logFC.6 = c(
    0.0848187321362019, -0.299283590551811, 0.0366788808661408,
    -0.00763280370062748, 0.0145148270035513, -0.0384916970002755,
    -0.0000335640771631606, 0.0851895375297912, -0.00364050261322463,
    0.0602143760128463
  ),
  logFC.7 = c(
    0.305256444042024, -0.274308408751318, 0.0977066795857243,
    -0.0265659018074027, 0.136348613124811, -0.0938364533000299,
    -0.143634179166262, 0.139913812601005, 0.268708965044232,
    0.133427360632365
  ),
  logFC.8 = c(
    0.12744808339884, -0.285015311267508, 0.0459140048745496,
    -0.00976012971218515, 0.13292412700208, -0.184687147498946,
    -0.0411558715447517, 0.165717944056239, 0.323358546432839,
    0.0502386767987279
  ),
  logFC.9 = c(
    0.286824598926274, 0.095530985319937, 0.101370835445593,
    0.0352336819150421, 0.0573659992830985, -0.0739779010955875,
    0.00466993628480923, 0.0486643748696862, 0.0322601740536419,
    0.0873158516027886
  )
)
This is only a test sample; in the real data each group has 1500-2000 values, so dots overlapping each other is not an issue.
Thank you for your kind help. I look forward to hearing from you.
library(tidyverse)
# Beeswarm of every logFC value per gene, coloured by logFC column; the
# columns are ordered by their mean value.
df1 %>%
  pivot_longer(cols = -Ensembl_ID) %>%
  group_by(name) %>%
  mutate(grp_avg = mean(value)) %>%
  # Ungroup first: the reordering has to see all groups at once.
  ungroup() %>%
  mutate(name = fct_reorder(name, grp_avg)) %>%
  ggplot(aes(x = Ensembl_ID, y = value, color = name)) +
  ggbeeswarm::geom_beeswarm() +
  scale_color_brewer() +
  theme(axis.text.x = element_text(angle = 20))
bring your data in long format
define a mid point mean
use geom_jitter to account for overlaying points
map the color and specify scale_color_gradient2
library(tidyverse)
# Long format: one row per (Ensembl_ID, logFC column) pair.
df <- df1 %>%
  pivot_longer(
    cols = -Ensembl_ID
  )
# Midpoint of the diverging colour scale: the overall mean value.
mid <- mean(df$value)
ggplot(df, aes(x = factor(Ensembl_ID), y = value, color = value)) +
  # Jitter horizontally only (height = 0) so overlapping points are spread
  # apart without moving the plotted values along the y-axis.
  geom_jitter(size = 4,
              position = position_jitter(width = 0.2, height = 0)) +
  #scale_color_gradient(low="blue", high="red")
  scale_color_gradient2(midpoint = mid, low = "blue", mid = "grey",
                        high = "red", space = "Lab") +
  theme_classic()
I'm trying to create a monthly time series in ggplot for time series analysis. This is my data:
# Monthly sales data, whitespace-delimited: a "YYYY-MM" date and a revenue
# figure per row. The first revenue value is written without a thousands
# separator so that it matches the format of every other row.
rdata1 <- read_table2("date sales_revenue_incl_credit
2017-07 56037.46
2017-08 38333.9
2017-09 48716.92
2017-10 65447.67
2017-11 134752.57
2017-12 116477.39
2018-01 78167.25
2018-02 75991.44
2018-03 42520.93
2018-04 70489.92
2018-05 121063.35
2018-06 76308.47
2018-07 118085.7
2018-08 96153.38
2018-09 82827.1
2018-10 109288.83
2018-11 145774.52
2018-12 141572.77
2019-01 123055.83
2019-02 104232.24
2019-03 435086.33
2019-04 74304.96
2019-05 117237.82
2019-06 82013.47
2019-07 99382.67
2019-08 138455.2
2019-09 97301.99
2019-10 137206.09
2019-11 109862.44
2019-12 118150.96
2020-01 140717.9
2020-02 127622.3
2020-03 134126.09")
I now use the below code to change the class of date and then plot with breaks and labels much easier using date_labels and date_breaks.
# Asker's attempt: convert `date` to a Date and plot monthly revenue with one
# axis break per month.
# NOTE(review): `date` holds year-month strings such as "2017-07" with no day
# component; presumably ymd() cannot parse these and returns NA, which would
# explain the "'from' must be a finite number" error reported for this code
# -- confirm.
rdata1 %>%
mutate(date = ymd(date)) %>%
ggplot(aes(date, sales_revenue_incl_credit)) +
geom_line() +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust=0.5),
panel.grid.minor = element_blank())
I get the following error:
Error in seq.int(r1$mon, 12 * (to0$year - r1$year) + to0$mon, by) :
'from' must be a finite number
Putting all these concerns together, I performed some data preparation to obtain your desired output. First, as noted in the comments, I appended the first day of the month to each "year-month" so you can work with a proper date variable in R. Next, I used the column_to_rownames() function on the month_year column. I appended the year to the month name because duplicate (non-unique) row names are not permitted. I should caution you against using row labels. Quoting from the documentation (see ?tibble::rownames_to_column):
While a tibble can have row names (e.g., when converting from a regular data frame), they are removed when subsetting with the [ operator. A warning will be raised when attempting to assign non-NULL row names to a tibble. Generally, it is best to avoid row names, because they are basically a character column with different semantics than every other column.
You can manipulate the row names below with different naming conventions. Just make sure the labels are unique! See the R code below:
# Loading the required libraries
library(tibble)
library(ggplot2)
library(dplyr)
library(lubridate)
# Monthly sales revenue, July 2017 through March 2020 (33 rows).
# `date` is kept as a "YYYY-MM" character column.
df <- tibble(
  date = format(
    seq(as.Date("2017-07-01"), by = "month", length.out = 33),
    "%Y-%m"
  ),
  sales_revenue_incl_credit = c(
    # 2017 (Jul-Dec)
    56037.46, 38333.9, 48716.92, 65447.67, 134752.57, 116477.39,
    # 2018
    78167.25, 75991.44, 42520.93, 70489.92, 121063.35, 76308.47,
    118085.7, 96153.38, 82827.1, 109288.83, 145774.52, 141572.77,
    # 2019
    123055.83, 104232.24, 435086.33, 74304.96, 117237.82, 82013.47,
    99382.67, 138455.2, 97301.99, 137206.09, 109862.44, 118150.96,
    # 2020 (Jan-Mar)
    140717.9, 127622.3, 134126.09
  )
)
# Data preparation
# Preview only (the result is not assigned): turn "YYYY-MM" into a Date on
# the first of the month and use a unique "Mon-YYYY" label as the row name.
df %>%
  mutate(
    date = ymd(paste(date, "01", sep = "-")),
    month_year = paste(month(date, label = TRUE), year(date), sep = "-")
  ) %>%
  column_to_rownames(var = "month_year") %>%  # moves the labels into row names
  head(n = 6)
# Preview of the data frame with row names (e.g., Jul-2017, Aug-2017, Sep-2017, etc.)
date sales_revenue_incl_credit
Jul-2017 2017-07-01 56037.46
Aug-2017 2017-08-01 38333.90
Sep-2017 2017-09-01 48716.92
Oct-2017 2017-10-01 65447.67
Nov-2017 2017-11-01 134752.57
Dec-2017 2017-12-01 116477.39
# Reproducing your plot
# The preparation step above only prints a preview and is never assigned, so
# `df$date` is still a "YYYY-MM" string here. Convert it to a Date (first day
# of each month) before handing it to scale_x_date().
df %>%
  mutate(date = ymd(paste0(date, "-01"))) %>%
  ggplot(aes(x = date, y = sales_revenue_incl_credit)) +
  geom_line() +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        panel.grid.minor = element_blank())
A simpler version of @Tom's answer is to use a tsibble object and the feasts package:
# Loading the required libraries
library(tibble)
library(dplyr)
library(ggplot2)
library(lubridate)
library(tsibble)
library(feasts)
# Data preparation
# Monthly sales revenue, July 2017 through March 2020 (33 rows), converted to
# a tsibble indexed by year-month.
df <- tibble(
  date = format(
    seq(as.Date("2017-07-01"), by = "month", length.out = 33),
    "%Y-%m"
  ),
  sales_revenue_incl_credit = c(
    # 2017 (Jul-Dec)
    56037.46, 38333.9, 48716.92, 65447.67, 134752.57, 116477.39,
    # 2018
    78167.25, 75991.44, 42520.93, 70489.92, 121063.35, 76308.47,
    118085.7, 96153.38, 82827.1, 109288.83, 145774.52, 141572.77,
    # 2019
    123055.83, 104232.24, 435086.33, 74304.96, 117237.82, 82013.47,
    99382.67, 138455.2, 97301.99, 137206.09, 109862.44, 118150.96,
    # 2020 (Jan-Mar)
    140717.9, 127622.3, 134126.09
  )
) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)
# Reproducing your plot
# Line plot of the tsibble with rotated x-axis labels.
# NOTE(review): breaks are the integers 1..1000 -- presumably meant to force
# a break at every month; confirm how scale_x_yearmonth() interprets them.
df %>%
  autoplot(sales_revenue_incl_credit) +
  scale_x_yearmonth(breaks = seq_len(1e3)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    panel.grid.minor = element_blank()
  )
Created on 2020-06-19 by the reprex package (v0.3.0)
This is a beginner question. I have spent most of the day trying to work out how to facet my data, but all of the examples of faceting that I have come across seem unsuited to my dataset.
Here are the first five rows from my data:
Date Germany.Yield Italy.Yield Greece.Yield Italy_v_Germany.Spread Greece_v_Germany.Spread
2020-04-19 -0.472 1.820 2.287 2.292 2.759
2020-04-12 -0.472 1.790 2.112 2.262 2.584
2020-04-05 -0.345 1.599 1.829 1.944 2.174
2020-03-29 -0.441 1.542 1.972 1.983 2.413
2020-03-22 -0.475 1.334 1.585 1.809 2.060
I simply want to create two line charts. On both charts the x-axis will be the date. On the first chart, the y-axis should be Italy_v_Germany.Spread and on the second, the y-axis should be Greece_v_Germany.Spread.
The first chart looks like this:
So I want the two charts to appear alongside each other, like this:
The one on the left should be Italy_v_Germany.Spread, and the one on the right should be Greece_v_Germany.Spread.
I really have no idea where to start with this. Hoping that someone can point me in the right direction.
In the interest of making the example reproducible, I will share a link to the CSV files which I'm using: https://1drv.ms/u/s!AvGKDeEV3LOsmmlHkzO6YVQTRiOX?e=mukBVy. Unfortunately these files are converted into Excel format when shared via this link, so you may have to export the files to CSVs so that the code works.
Here is the code that I have so far:
library(ggplot2)
library(scales)
library(extrafont)
library(dplyr)
library(tidyr)
# Folder holding the "<Country> 10-Year Yield Weekly (2007-2020).csv" files.
work_dir <- "D:\\OneDrive\\Documents\\Economic Data\\Historical Yields\\Eurozone"

# Read one country's weekly 10-year yield file, keeping the date and yield.
#
# country: country name as it appears at the start of the file name.
# dir:     folder containing the CSV files; defaults to work_dir.
# Returns a data frame whose first two columns are named "Date" and
# "<country>.Yield"; columns 3 to 6 of the file are dropped.
#
# Files are located with file.path() instead of setwd(), so the session's
# working directory is left untouched.
read_yields <- function(country, dir = work_dir) {
  csv_path <- file.path(
    dir,
    paste0(country, " 10-Year Yield Weekly (2007-2020).csv")
  )
  yields <- read.csv(file = csv_path, stringsAsFactors = FALSE)
  yields <- yields[, -(3:6)]
  colnames(yields)[1:2] <- c("Date", paste0(country, ".Yield"))
  yields
}

germany_yields <- read_yields("Germany")
italy_yields <- read_yields("Italy")
greece_yields <- read_yields("Greece")
#---------------------------------------
# Join data
#---------------------------------------
# Merge the three yield tables on Date (without sorting), Germany first.
combined <- Reduce(
  function(left, right) merge(left, right, by = "Date", sort = FALSE),
  list(germany_yields, italy_yields, greece_yields)
)
combined <- na.omit(combined)
# Dates are parsed as month name, day, comma, four-digit year.
combined$Date <- as.Date(combined$Date, format = "%B %d, %Y")
# Spread = each country's yield minus the German yield.
combined[["Italy_v_Germany.Spread"]] <-
  combined$Italy.Yield - combined$Germany.Yield
combined[["Greece_v_Germany.Spread"]] <-
  combined$Greece.Yield - combined$Germany.Yield
#--------------------------------------------------------------------
# Axis limits: the last and the first Date of `combined`, in that order.
fl_dates <- c(tail(combined$Date, 1), head(combined$Date, 1))
# Italy-Germany spread over time, labelled every two years.
ggplot(data = combined, mapping = aes(x = Date, y = Italy_v_Germany.Spread)) +
  geom_line() +
  scale_x_date(
    limits = fl_dates,
    breaks = seq(as.Date("2008-01-01"), as.Date("2020-01-01"), by = "2 years"),
    expand = c(0, 0),
    date_labels = "%Y"
  )
You need to get your data into a long format, for example, by using pivot_longer. Then it should work.
library(dplyr)
library(tidyr)
library(ggplot2)
# The five sample rows from the question, one row per date (the 2020-04-19
# row appears once only).
data <- tribble(
  ~Date, ~Germany.Yield, ~Italy.Yield, ~Greece.Yield, ~Italy_v_Germany.Spread, ~Greece_v_Germany.Spread,
  "2020-04-19", -0.472, 1.820, 2.287, 2.292, 2.759,
  "2020-04-12", -0.472, 1.790, 2.112, 2.262, 2.584,
  "2020-04-05", -0.345, 1.599, 1.829, 1.944, 2.174,
  "2020-03-29", -0.441, 1.542, 1.972, 1.983, 2.413,
  "2020-03-22", -0.475, 1.334, 1.585, 1.809, 2.060
)
# Stack the two spread columns into long format and draw one panel per
# spread column.
data %>%
  mutate(Date = as.Date(Date)) %>%
  pivot_longer(
    cols = ends_with("Spread"),
    names_to = "country",
    values_to = "Spread_v_Germany",
    values_drop_na = TRUE
  ) %>%
  ggplot(aes(x = Date, y = Spread_v_Germany, group = 1)) +
  geom_line() +
  facet_wrap(~ country)