I have this data set:
# Example data: 11 rows of click and impression counts by date and site.
# `date` and `site` are factors; the dates are m/d/Y strings, not Date objects.
dat <-
structure(list(date = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L), .Label = c("3/31/2014", "4/1/2014", "4/2/2014",
"4/3/2014"), class = "factor"), site = structure(c(1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("a", "b"), class = "factor"),
clicks = c(73L, 64L, 80L, 58L, 58L, 61L, 70L, 60L, 84L, 65L,
77L), impressions = c(55817L, 78027L, 77017L, 68797L, 92437L,
94259L, 88418L, 55420L, 69866L, 86767L, 92088L)), .Names = c("date",
"site", "clicks", "impressions"), class = "data.frame", row.names = c(NA,
-11L))
dat
date site clicks impressions
1 3/31/2014 a 73 55817
2 3/31/2014 b 64 78027
3 3/31/2014 a 80 77017
4 4/1/2014 b 58 68797
...
Is it possible to include the date formatting of one column within the chain? (I've also tried using with, but that only returns the date column.)
library(dplyr)
> dat %.%
+ select(date, clicks, impressions) %.%
+ group_by(date) %.%
+ summarise(clicks = sum(clicks),
+ impressions = sum(impressions)) %.%
+ as.Date(Date, format = '%m/%d/%Y')
Error in as.Date.default(`__prev`, Date, format = "%m/%d/%Y") :
do not know how to convert '__prev' to class “Date”
If I don't include the formatting within the chain, it works. I know it's simple to write this outside of the chain, but I would like to confirm if this is doable.
# Summarise clicks and impressions per day (the result is printed, not stored).
dat %.%
  select(date, clicks, impressions) %.%
  group_by(date) %.%
  summarise(clicks = sum(clicks),
            impressions = sum(impressions))
# Convert the factor column to Date outside the chain. The column is named
# `date` (lower case); `$` is case-sensitive, so `dat$Date` would be NULL.
dat$date <- as.Date(dat$date, format = '%m/%d/%Y')
Is this what you want?
# Total clicks and impressions per day, then parse the date string as the
# final step of the same pipeline.
dat %>%
  select(date, clicks, impressions) %>%
  group_by(date) %>%
  summarise(
    clicks = sum(clicks),
    impressions = sum(impressions)
  ) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y"))
Sometimes the Error: cannot modify grouping variable message comes when you're trying to run group_by() operations on something that has already been grouped. You might try including ungroup first. In the syntax of Robert's answer:
# Same pipeline, but any grouping already present on `dat` is removed first
# so that `date` can be regrouped and later overwritten.
dat %>%
  ungroup() %>%
  select(date, clicks, impressions) %>%
  group_by(date) %>%
  summarise(
    clicks = sum(clicks),
    impressions = sum(impressions)
  ) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y"))
Related
Dataframe "id" has the columns year, id, and matriline, where each row is an incident. I wanted to count the number of incidents by matriline per year, so I did:
# Number of incidents per year and matriline; `.drop = FALSE` keeps
# matriline levels with no incidents as rows with n = 0.
events.bymatr <- dplyr::count(id, year, matr, .drop = FALSE)
events.bymatr
I plotted a line graph of the number of incidents over time, by matriline.
# One line per matriline, coloured by matriline.
ggplot(
  data = events.bymatr,
  mapping = aes(x = year, y = n, group = matr)
) +
  geom_line(mapping = aes(color = matr))
My question is twofold:
Is there a way I could recreate this line graph where the thickness of the lines is determined by how many IDs there were, per matriline? I imagine this would involve reshaping my data above but when I tried to group_by(year,matr,id,.drop=FALSE) my data came out all wonky.
I want to change the color palette so that each color is very distinct - how do I attach a new color palette? I tried using this c25 palette with this code but it makes all my lines disappear.
# NOTE(review): `c25` is presumably a vector of colour values defined outside
# this snippet; inside aes() it is mapped as data rather than applied as a
# palette, and `matr` is no longer mapped to colour -- confirm.
ggplot(events.bymatr, aes(x=year, y=n, group=matr)) + geom_line(aes(color=c25))
Thanks so much in advance!
Output of "id" (shortened to just the first five rows per column):
> dput(id)
structure(list(date = structure(c(8243, 8243, 8243, 8248, 8947,
class = "Date"), year = c(1992L, 1992L, 1992L, 1992L, 1994L),
event.id = c(8L, 8L, 8L, 10L, 11L), id = structure(c(51L, 55L, 59L,
46L, 51L), .Label = c("J11", "J16", "J17", "J2", "J22"),
class = "factor"), sex = structure(c(1L, 2L, 2L, 1L, 1L),
.Label = c("0", "1"), class = "factor"), age = c(28L, 12L, 6L, 42L,
30L), matr = structure(c(20L, 20L, 20L, 11L, 20L), .Label = c("J2",
"J4", "J7", "J9", "K11"), class = "factor"),
matralive = structure(c(2L, 2L, 2L, 2L, 2L),
.Label = c("0", "1"), class = "factor"), pod = structure(c(3L, 3L,
3L, 3L, 3L), .Label = c("J", "K", "L"), class = "factor")),
row.names = c(NA, -134L), class = c("tbl_df", "tbl", "data.frame"))
Output of events.bymatr:
> dput(events.bymatr)
structure(list(year = c(1992L, 1992L, 1992L, 1992L, 1992L),
matr = structure(c(1L, 2L, 3L, 4L, 5L), .Label = c("J2", "J4",
"J7", "J9", "K11"), class = "factor"), n = c(0L, 0L, 0L, 0L, 0L)),
row.names = c(NA, -380L), class = c("tbl_df", "tbl",
"data.frame"))
As @r2evans noted, it is surprisingly hard to distinguish clearly among more than a handful of colors. I used an example 20-color scale here that does a pretty good job, but even so a few can be tricky to distinguish. Here's an attempt using the storms dataset included with dplyr.
library(dplyr)
library(ggplot2) # ggplot(), geom_line() and guides() come from ggplot2

# Count records per storm name and year, fill in missing years with zero,
# then draw one line per name whose thickness reflects the name's total.
storms %>%
  group_by(name, year) %>%
  summarize(n = n(), .groups = "drop") %>% # n = number of records for the name in that year
  tidyr::complete(name, year = 1975:2015, fill = list(n = 0)) %>%
  group_by(name) %>%
  mutate(total = sum(n)) %>% # total = number of records for the name overall
  ungroup() %>%
  filter(total %% 12 == 0) %>% # Arbitrary, to reduce scope of data for example
  ggplot(aes(year, n, color = name, size = total, group = name)) +
  geom_line() +
  # Draw the colour legend keys at a fixed thickness so they stay readable.
  guides(color = guide_legend(override.aes = list(size = 3))) +
  ggthemes::scale_color_tableau(palette = "Tableau 20")
I am trying to plot a line chart in R of the number of tweets against date and time.
library(ggplot2)
# Sample tweet data: six rows with Date, Time and Text, all stored as factors.
# NOTE(review): "Premium policy get activate by company abc" appears twice in
# the Text .Label vector; factor levels are expected to be unique -- confirm.
df1 <- structure(list(Date = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("2020-03-12",
"2020-03-13"), class = "factor"), Time = structure(c(1L, 1L, 2L,
3L, 4L, 5L), .Label = c("00:00:00Z", "00:00:01Z", "00:10:04Z",
"00:25:12Z", "01:00:02Z"), class = "factor"), Text = structure(c(5L,
3L, 6L, 4L, 2L, 1L), .Label = c("The images of demonstrations and gathering", "Premium policy get activate by company abc",
"Launches of rocket", "Premium policy get activate by company abc",
"Technology makes trend", "The images of demonstrations and gatherings",
"Weather forecasting by xyz"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
# NOTE(review): this is the failing attempt quoted in the question. The opening
# `ggplot(` is never closed, and df1 (Date, Time, Text) has no `count` or
# `variable` column.
ggplot(df1, aes(x = Date, y = text(count)) + geom_line(aes(color = variable), size = 1)
I tried the above code to plot the desired result but got an error. The dataset, in CSV format, looks like this:
Date Time Text
2020-03-12 00:00:00Z The images of demonstrations and gatherings
2020-03-12 00:00:00Z Premium policy get activate by company abc
2020-03-12 00:00:01Z Weather forecasting by xyz
2020-03-12 00:10:04Z Technology makes trend
2020-03-12 00:25:12Z Launches of rocket
2020-03-12 01:00:02Z Government launch new policy to different sector improvement
I have a dataset of nearly 15 days and want to plot the line chart to visualize the number of tweets (given in text column) to see the trend of tweets on different time and date.
# The question's data re-entered with Date as m/d/Y strings and unique Text
# levels. Column X is "" except for one "improvement" entry -- presumably the
# word that spilled over from the last CSV row; confirm.
df1 <- structure(list(Date = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("3/12/2020",
"3/13/2020"), class = "factor"), Time = structure(c(1L, 1L, 2L,
3L, 4L, 5L), .Label = c("00:00:00Z", "00:00:01Z", "00:10:04Z",
"00:25:12Z", "01:00:02Z"), class = "factor"), Text = structure(c(5L,
3L, 6L, 4L, 2L, 1L), .Label = c("Government launch new policy to different sector",
"Launches of rocket", "Premium policy get activate by company abc",
"Technology makes trend", "The images of demonstrations and gatherings",
"Weather forecasting by xyz"), class = "factor"), X = structure(c(1L,
1L, 1L, 1L, 1L, 2L), .Label = c("", "improvement"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
After creating the dataset df1 as above, running the following code gives the required plot of tweet counts per hour:
library(tidyverse)
library(lubridate)
# Parse the time and date strings, count rows per hour of day, and draw the
# counts as a single line with points.
df1 %>%
  mutate(
    Time = hms(Time),
    Date = mdy(Date),
    hour = hour(Time)
  ) %>%
  count(hour) %>%
  ggplot(aes(x = hour, y = n, group = 1)) +
  geom_line() +
  geom_point()
Is this what you are after?
library(dplyr)
library(lubridate)
library(stringr)
library(ggplot2)
Answer with your data
To demonstrate data wrangling.
# your data;
# Six rows with Date, Time and Text, all stored as factors.
# NOTE(review): "Premium policy get activate by company abc" appears twice in
# the Text .Label vector; factor levels are expected to be unique -- confirm.
df1 <- structure(list(Date = structure(c(1L, 1L, 2L, 1L, 1L, 1L),
.Label = c("2020-03-12","2020-03-13"),
class = "factor"),
Time = structure(c(1L, 1L, 2L,3L, 4L, 5L),
.Label = c("00:00:00Z", "00:00:01Z", "00:10:04Z","00:25:12Z", "01:00:02Z"),
class = "factor"),
Text = structure(c(5L,3L, 6L, 4L, 2L, 1L),
.Label = c("The images of demonstrations and gathering", "Premium policy get activate by company abc",
"Launches of rocket", "Premium policy get activate by company abc",
"Technology makes trend", "The images of demonstrations and gatherings", "Weather forecasting by xyz"), class = "factor")),
class = "data.frame", row.names = c(NA,-6L))
# data wrangle: build a date-time from Date and Time, bin it by hour, and
# count the tweets in each bin
df2 <-
  df1 %>%
  # turn every factor column into plain character
  mutate(across(everything(), as.character)) %>%
  mutate(
    # strip the trailing "Z" from the time strings
    Time = str_remove(Time, "Z$"),
    # parse "date time" text into a date-time with lubridate::ymd_hms
    dt = ymd_hms(paste(Date, Time, sep = " ")),
    # round up to the end of the hour; kept as its own step for clarity
    dt = ceiling_date(dt, unit = "hour")
  ) %>%
  group_by(dt) %>%
  summarise(nr_tweets = n())
# plot: tweets per hourly bin, with one x-axis break per day labelled day/month
p1 <- ggplot(data = df2, mapping = aes(x = dt, y = nr_tweets)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 day", date_labels = "%d/%m") +
  labs(title = "Data from question `df1`")
Answer with made up large dataset
# Made-up data: 10000 timestamps sampled (with replacement) from every second
# between 1 May and 15 May 2020, each paired with a random letter as text.
tib <- tibble(
  dt = sample(
    seq(ISOdate(2020, 5, 1), ISOdate(2020, 5, 15), by = "sec"),
    10000,
    replace = TRUE
  ),
  text = sample(c(letters, LETTERS), 10000, replace = TRUE)
)
# Round each timestamp to the nearest hour and count tweets per hour.
tib1 <-
  tib %>%
  mutate(dt = round_date(dt, unit = "hour")) %>%
  group_by(dt) %>%
  summarise(nr_tweets = n())
# Tweets per hour, with one x-axis break per day labelled day/month.
p2 <- ggplot(data = tib1, mapping = aes(x = dt, y = nr_tweets)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 day", date_labels = "%d/%m") +
  labs(title = "Result using `tib` data made up to answer the question")
# Stack the two plots vertically. The `/` operator for ggplot objects comes
# from patchwork, which must be attached for this line to work.
library(patchwork)
p1 / p2
Created on 2020-05-13 by the reprex package (v0.3.0)
I have a time series like this:
created_time,reaction_counts
2016-01-18T08:05:44+0000,65
2016-01-18T08:05:44+0000,65
2016-01-18T08:05:44+0000,65
2016-02-23T01:42:48+0000,468
2016-02-23T03:51:37+0000,125
2016-02-23T09:49:01+0000,433
2016-02-23T10:09:32+0000,72
2016-02-26T07:45:10+0000,137
2016-02-26T11:48:09+0000,120
2016-02-27T03:27:39+0000,70
2016-02-28T09:28:16+0000,145
2016-03-02T00:17:14+0000,122
2016-03-02T05:34:41+0000,108
2016-03-02T09:04:45+0000,296
And I want to aggregate it by month (and also by year) and plot a histogram.
How do I do it?
Thanks!
You can use the following code to aggregate the timestamped data into monthly or yearly totals.
library(lubridate)
library(dplyr)
library(hydroTSM)
# The question's data: created_time is a factor of timestamp strings and
# reaction_counts is an integer vector (14 rows).
# Note: the object name `try` is also the name of the base function try().
try <- structure(list(created_time = structure(c(1L, 1L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("2016-01-18T08:05:44+0000",
"2016-02-23T01:42:48+0000", "2016-02-23T03:51:37+0000", "2016-02-23T09:49:01+0000",
"2016-02-23T10:09:32+0000", "2016-02-26T07:45:10+0000", "2016-02-26T11:48:09+0000",
"2016-02-27T03:27:39+0000", "2016-02-28T09:28:16+0000", "2016-03-02T00:17:14+0000",
"2016-03-02T05:34:41+0000", "2016-03-02T09:04:45+0000"), class = "factor"),
reaction_counts = c(65L, 65L, 65L, 468L, 125L, 433L, 72L,
137L, 120L, 70L, 145L, 122L, 108L, 296L)), class = "data.frame", row.names = c(NA,
-14L))
# Parse the timestamp strings into date-times with lubridate::ymd_hms().
df <- mutate(try, created_time = ymd_hms(created_time))
Monthly conversion
# Total reaction count per calendar month (one row per month/year pair).
monthly <- df %>%
  mutate(month = format(created_time, "%m"), year = format(created_time, "%Y")) %>%
  group_by(month, year) %>%
  summarise(total = sum(reaction_counts)) %>%
  # summarise() only removes the last grouping level, so the result would
  # otherwise stay grouped by `month`.
  ungroup() %>%
  # Order rows chronologically rather than by month number across years.
  arrange(year, month)
For histogram plotting of monthly data
# Histogram of the distribution of the monthly totals (each month contributes
# one value); this is not a bar per month.
hist(monthly$total)
Yearly conversion
# Total reaction count per calendar year (one row per year).
yearly <- df %>%
  mutate(year = format(created_time, "%Y")) %>%
  group_by(year) %>%
  summarise(total = sum(reaction_counts))
For histogram plotting of yearly data
# Histogram of the distribution of the yearly totals (each year contributes
# one value); this is not a bar per year.
hist(yearly$total)
I've been stuck on this dplyr manipulation issue for a while now.
Here is a small sample size of my data: dput(test)
structure(list(anon_screen_name = c("40492fd6e817cc25cea942be9eae7c1c5795ffa1",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "862329793fdbcd666d660d9a9d2e3beceb07a0db",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "862329793fdbcd666d660d9a9d2e3beceb07a0db",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "862329793fdbcd666d660d9a9d2e3beceb07a0db",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "a9c8719499b9ef73c78e85bada231591d807a821",
"a9c8719499b9ef73c78e85bada231591d807a821"), resource_display_name = c("Quiz",
"Quiz", "Quiz", "Quiz", "Quiz", "homework", "homework", "final_exam",
"Quiz", "Quiz"), grade = c(0L, 0L, 0L, 3L, 1L, 0L, 1L, 1L, 1L,
2L), max_grade = c(2L, 1L, 0L, 3L, 1L, 10L, 11L, 1L, 1L, 2L),
percent_grade = c("0", "0", "\\N", "100", "100", "0", "9.09",
"100", "100", "100")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L))
Basically, for each anon_screen_name, I want to drop the lowest percent_grade for the homework (in resource_display_name).
I started to write this starter code:
# Lowest percent grade for each resource type. Non-numeric entries such as
# "\\N" become NA during the conversion and are ignored by min().
test %>%
  mutate(percent_grade = as.numeric(percent_grade)) %>%
  group_by(resource_display_name) %>%
  summarise(
    min_percent_grade = min(percent_grade, na.rm = TRUE)
  )
But this only shows me the minimum homework grade without taking out the row with the minimum homework grade
UPDATE:
Basically, borrowing from a comment below, I want to remove the row(s) associated with the lowest value of percent_grade where resource_display_name == 'homework'
Try the following code:
# For each user, drop the homework row(s) with that user's lowest
# percent_grade; every non-homework row is kept unchanged.
test %>%
  # Non-numeric entries such as "\\N" become NA (with a coercion warning).
  mutate(percent_grade = as.numeric(percent_grade)) %>%
  # Group per user AND resource so that min() below is that user's own
  # homework minimum, not the minimum across all users.
  group_by(anon_screen_name, resource_display_name) %>%
  filter(
    resource_display_name != "homework" |
      percent_grade > min(percent_grade, na.rm = TRUE)
  ) %>%
  ungroup()
The following will remove all values equal to the minimum per group of resource_display_name. Note that it's a base R solution, there is no need for an external package such as dplyr.
# Flag (1/0) every row whose grade differs from the minimum of its
# resource_display_name group, then keep the flagged rows. Rows whose flag
# is NA (non-numeric grade) are dropped by which().
grade <- as.numeric(test$percent_grade)
inx <- ave(
  grade,
  test$resource_display_name,
  FUN = function(g) g != min(g, na.rm = TRUE)
)
inx <- which(inx == 1)
test[inx, ]
If you only want to remove a single record, and not all records with the lowest grade, you could do something like the following.
# For each user, drop exactly one row: the homework row with the lowest
# percent_grade (the first such row on ties). Users without homework keep
# all of their rows, and the original row order is preserved.
test %>%
  mutate(percent_grade = as.numeric(percent_grade)) %>%
  group_by(anon_screen_name) %>%
  mutate(
    # Grade for homework rows only; NA elsewhere so those rows are never ranked.
    homework_grade = if_else(
      resource_display_name == "homework", percent_grade, NA_real_
    ),
    # row_number(x) ranks with ties broken by position and keeps NA as NA;
    # %in% never returns NA, so exactly one row per user (at most) is flagged.
    lowest_grade = as.numeric(row_number(homework_grade) %in% 1L)
  ) %>%
  filter(lowest_grade == 0) %>%
  select(-homework_grade) %>%
  ungroup()
I want the x-axis of a ggvis plot to show only whole numbers.
MWE
# Example data: 12 rows with a three-level factor (A, B, C), a non-integer
# numeric X, and an integer Y.
df <-
structure(list(Factor = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L), .Label = c("A", "B", "C"), class = "factor"),
X = c(15.5133333333333, 14.63, 14.41, 14.1266666666667, 13.1833333333333,
12.9466666666667, 13.6133333333333, 13.55, 13.5333333333333,
11.5566666666667, 11.3066666666667, 11.4566666666667), Y = c(20L,
20L, 20L, 40L, 40L, 40L, 70L, 70L, 70L, 100L, 100L, 100L)), .Names = c("Factor",
"X", "Y"), row.names = c(NA, -12L), class = "data.frame")
library(ggvis)
# Points and connecting paths per Factor, ordered by Y; the x-axis format
# string below is the one the question reports as not working.
ggvis(data = df, x = ~X, y = ~Y, fill = ~Factor, stroke = ~Factor) %>%
  arrange(Y) %>%
  group_by(Factor) %>%
  layer_points(shape = ~Factor) %>%
  layer_paths(fill := NA) %>%
  add_axis("x", orient = "bottom", format = "####")
One possibility is to use values=seq(from=10, to=16, by=1) in add_axis(), but this approach is not automated.
Setting the format argument to 'd' will display only integer values in the axis label:
library(ggvis)
library(dplyr)
##
# Same plot as in the question, with the x-axis labels formatted by the d3
# "d" (integer) format specifier.
ggvis(
  data = df,
  x = ~X,
  y = ~Y,
  fill = ~Factor,
  stroke = ~Factor
) %>%
  arrange(Y) %>%
  group_by(Factor) %>%
  layer_points(shape = ~Factor) %>%
  layer_paths(fill := NA) %>%
  add_axis("x", orient = "bottom", format = "d")
More information on d3 formatting specifications is available on this page, as mentioned in the Common Properties section of this ggvis guide.