I have the following data:
require(tidyverse)
(df <- tribble( ~id, ~group, ~event,
1, "A", "2010/01/23",
2, "B", "2010/02/13",
3, "A", "2011/03/21",
4, "B", "2010/01/20",
5, "B", "2012/11/03",
6, "A", "2012/08/12"))
(df$event <- as.POSIXct(df$event, format = "%Y/%m/%d"))
I want to draw a plot with x = time, y = count of events, and color = group.
This is how I did it:
df %>%
  group_by(group) %>%
  arrange(event) %>%
  mutate(N = row_number()) %>%
  ggplot(aes(x = event, y = N, color = group)) +
  geom_line()
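A possible variant of the same pipeline (just a sketch, using the df defined above): geom_step() draws the running count as a step function, so the line only changes at the dates where an event actually occurs.

df %>%
  group_by(group) %>%
  arrange(event) %>%
  mutate(N = row_number()) %>%            # running count of events per group
  ggplot(aes(x = event, y = N, color = group)) +
  geom_step()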
I have a Spark DataFrame that I'm manipulating using sparklyr; it looks like the following:
input_data <- data.frame(id = c(10,10,10,20,20,30,30,40,40,40,50,60,70, 80,80,80,100,100,110,110,120,120,120,130,140,150,160,170),
date = c("2021-01-01","2021-01-02","2021-01-03","2021-01-01","2021-01-02","2021-01-01","2021-01-02","2021-01-02","2021-01-01","2021-01-02","2021-01-01","2021-01-02","2021-01-05","2021-01-01","2021-01-02","2021-01-03","2021-01-01","2021-01-02","2021-01-01","2021-01-02","2021-01-02","2021-01-01","2021-01-02","2021-01-01","2021-01-02","2021-01-05","2021-01-01","2021-01-05"),
group = c("A", "B", "C", "B", "C", "A", "C", "A", "A", "A", "C", "A","B","A", "B", "C", "B", "C", "A", "C", "A", "A", "A", "C", "A", "A", "B","A"),
event = c(1,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,1,0,1,0,0,1,1,1,1,1,0))
I'd like to aggregate the data such that I have a count of the number of "events" (where event == 1) and "non-events" (where event == 0) for each combination of groups within an id, such that the final output looks like the following:
data.frame(group_a = c(1,0,0,1,0,1),
           group_b = c(0,1,0,1,1,0),
           group_c = c(0,0,1,0,1,1),
           event_occured = c(3,1,2,0,2,2),
           event_not_occured = c(4,2,2,0,2,2))
So, for example, there were no combinations where both A and B were groups for the same ID, so that combination gets a 0 for both event and non_event. There were 4 IDs that group A was involved in, of which 3 resulted in an event and 1 resulted in a non-event, and so on.
What approach using sparklyr (or dplyr or pyspark) would allow for aggregation as described above? I tried the following, but I'm getting the exact same number for event as for event_not_occured, so I must be doing something wrong but can't pinpoint it:
combo_path_sdf <- input_data %>%
  group_by(id) %>%
  arrange(date) %>%
  mutate(order_seq = ifelse(event > 0, 1, NA)) %>%
  mutate(order_seq = lag(cumsum(ifelse(is.na(order_seq), 0, order_seq)))) %>%
  mutate(order_seq = ifelse((row_number() == 1) & (event > 0), -1,
                            ifelse(row_number() == 1, 0, order_seq))) %>%
  ungroup()

combo_path_sdf %>%
  group_by(id, order_seq) %>%
  summarize(group_a = max(ifelse(group_a == "A", 1, 0)),
            group_b = max(ifelse(group_b == "B", 1, 0)),
            group_c = max(ifelse(group_c == "C", 1, 0)),
            events = sum(event)) %>%
  group_by(order_seq, group_a, group_b, group_c) %>%
  summarize(event = sum(events),
            total_sequences = n()) %>%
  mutate(event_not_occured = total_sequences - event)
Final output in the following format would be ok too:
data.frame(group_a = c("A", "B", "C", "A,B", "B,C", "A,C"),
event_occured = c(3,1,2,1,2,2),
event_not_occured = c(4,2,2,1,2,2))
(Note: the A,B row in the first expected output above is incorrect; it should be 1,1, not 0,0.)
The following matches your requested output format and processes the data in the way I understand you want, but (as per the comment by @Martin Gal) it does not match the example result you provided.
input_data %>%
  group_by(id) %>%
  summarise(group_a = max(ifelse(group == 'A', 1, 0)),
            group_b = max(ifelse(group == 'B', 1, 0)),
            group_c = max(ifelse(group == 'C', 1, 0)),
            event_occured = sum(ifelse(event == 1, 1, 0)),
            event_not_occured = sum(ifelse(event == 0, 1, 0)),
            .groups = "drop") %>%
  group_by(group_a, group_b, group_c) %>%
  summarise(event_occured = sum(event_occured),
            event_not_occured = sum(event_not_occured),
            .groups = "drop")
The idea is a two-step summary. The first summarise() creates an indicator column for each group and counts the events/non-events per id. The second summarise() then combines all IDs that share the same group combination.
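If the alternative "A,B,C" label format from the question is preferred, the indicator columns can be pasted into a single label after the aggregation. A minimal sketch, assuming the result of the two-step summary above has been collected into a regular data frame named combo_counts (a hypothetical name):

library(dplyr)

combo_counts %>%
  mutate(groups = sub(",$", "",                       # drop any trailing comma
                      paste0(ifelse(group_a == 1, "A,", ""),
                             ifelse(group_b == 1, "B,", ""),
                             ifelse(group_c == 1, "C,", "")))) %>%
  select(groups, event_occured, event_not_occured)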
Regarding the code you are using that produces the same number of events and non-events: take a look at hts_combined. It is not defined in the code you have shared, so your script might be reading a variable from elsewhere.
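Since the question also asks about sparklyr: the same two-step summary should translate to Spark SQL via dbplyr. A rough, untested sketch, assuming an existing Spark connection sc:

library(sparklyr)
library(dplyr)

# Copy the local data frame to Spark (in practice the table may already exist there).
input_sdf <- copy_to(sc, input_data, "input_data", overwrite = TRUE)

input_sdf %>%
  group_by(id) %>%
  summarise(group_a = max(ifelse(group == "A", 1, 0)),
            group_b = max(ifelse(group == "B", 1, 0)),
            group_c = max(ifelse(group == "C", 1, 0)),
            event_occured = sum(ifelse(event == 1, 1, 0)),
            event_not_occured = sum(ifelse(event == 0, 1, 0))) %>%
  group_by(group_a, group_b, group_c) %>%
  summarise(event_occured = sum(event_occured),
            event_not_occured = sum(event_not_occured)) %>%
  collect()   # bring the small aggregated result back into R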
I have two sets of data. Each contains a column for the name of the molecule and a column for the number of times that molecule appears in the sample. I want to create a scatterplot with the number of times a molecule appears in dataset #1 on the x-axis and the number of times it appears in dataset #2 on the y-axis. If a molecule is in one dataset and not the other, it appears 0 times.
Example:
dat1 <- data.frame(
  name = c("A", "B", "D", "E"),
  count = c(10, 1, 30, 10)
)
dat2 <- data.frame(
  name = c("A", "B", "C", "F"),
  count = c(1, 3, 50, 40)
)
Point #1 would be (10,1) corresponding to A, Point #2 would be (1,3), Point #3 would be (0,50) and so on. I don't want to label my points since my datasets contain tens of thousands of molecules.
Try joining the data.frames
library(dplyr)
library(ggplot2)

full_join(dat1, dat2, by = "name") %>%
  mutate_all(function(xx) ifelse(is.na(xx), 0, xx)) %>%   # missing counts become 0
  ggplot(aes(count.x, count.y)) +
  geom_point()
which produces the desired scatterplot.
You would need a full_join():
library(dplyr)
library(ggplot2)
#Data
dat1 <- data.frame(
  name = c("A", "B", "D", "E"),
  count = c(10, 1, 30, 10)
)
dat2 <- data.frame(
  name = c("A", "B", "C", "F"),
  count = c(1, 3, 50, 40)
)
#Code
dat1 %>%
  full_join(dat2 %>% rename(count2 = count)) %>%
  replace(is.na(.), 0) %>%
  ggplot(aes(x = count, y = count2)) +
  geom_point() +
  geom_text(aes(label = name), vjust = -0.5)   # optional: drop this layer for tens of thousands of points
Output:
I have the following input data frame with 4 columns and 3 rows.
The time column can take values from 1 up to the corresponding value of the maturity column for that customer. I want to create more observations for each customer until the value of time equals the value of maturity, with the other columns retaining their original values. Please see the links below for the input and expected output.
Input
Output
Here is a dplyr solution inspired by, but not exactly equal to, this post.
library(dplyr)
df <- data.frame(custno = 1:3, time = 1, dept = c("A", "B", "A"))
df %>%
  slice(rep(1:n(), each = 5)) %>%
  group_by(custno) %>%
  mutate(time = seq_along(time))
Edit
After the comments by the OP, the following seems to be better.
First, the data:
df <- data.frame(custno = 1:3, time = 1,
                 dept = c("A", "B", "A"),
                 maturity = c(5,4,6))
And the solution:
df %>%
  tidyr::uncount(maturity) %>%
  group_by(custno) %>%
  mutate(time = seq_along(time))
We can also use slice() with row_number():
library(dplyr)
library(data.table)
df %>%
  slice(rep(row_number(), maturity)) %>%
  mutate(time = rowid(custno))
data
df <- data.frame(custno = 1:3, time = 1,
                 dept = c("A", "B", "A"),
                 maturity = c(5,4,6))
I'm new to the tidyverse and dplyr and was hoping to get some guidance on how best to concatenate data from the rows below the current row. For example, in the data frame below, I want to use the data in the Grade column to create the data in the Prior3Grades column. The Prior3Grades value for 1/2/2019 would be created by concatenating the grades from 12/3/18, 11/3/18 and 10/4/18.
Can this be achieved in dplyr using mutate or some other means? Also, is this in dplyr's wheelhouse, or would it be better suited to SQL?
Using some basic packages from the tidyverse:
library(dplyr)
library(tidyr)
library(tibble)
df <- tibble(
  Name = "Bob",
  TestDate = seq(as.Date("2019-02-01"), as.Date("2019-05-08"), length.out = 6), ## some random dates
  Grade = c("A", "A", "B", "C", "D", "A")
)
df %>%
  group_by(Name) %>%
  mutate(
    grade1 = lead(Grade),
    grade2 = lead(Grade, 2),
    grade3 = lead(Grade, 3)
  ) %>%
  replace_na(list(grade1 = "", grade2 = "", grade3 = "")) %>%
  mutate(
    Prior3Grades = paste0(grade1, grade2, grade3)
  )
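A slightly more compact variant of the same logic (just a sketch): coalesce() can stand in for the replace_na() step, so everything fits into a single mutate().

df %>%
  group_by(Name) %>%
  mutate(Prior3Grades = paste0(coalesce(lead(Grade), ""),
                               coalesce(lead(Grade, 2), ""),
                               coalesce(lead(Grade, 3), ""))) %>%
  ungroup()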