squashing multiple rows by time difference - r

Assuming these are few timestamped observations in a dataset:
Id Status DateCreated Group
10 Read 2017-11-04 18:24:55 Red
10 Write 2017-11-04 18:24:56 Red
10 Review 2017-11-04 18:25:16 Red
10 Read 2017-11-04 18:26:17 Red
10 Write 2017-11-04 18:26:47 Red
How do I collapse rows that are within 1 minute of each other?
For example, rows 1,2,3 are collapsed into 1 row and rows 4 and 5 are collapsed into second row.
The expected output would look like this:
Id Status DateCreated Date Ended Group
10 Read,Write,Review 2017-11-04 18:24:55 2017-11-04 18:25:16 Red, Red, Red
10 Read,Write 2017-11-04 18:26:17 2017-11-04 18:26:47 Red, Red
Here is the code to reproduce the test dataset in this example:
df <- structure(list(Id = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "10", class = "factor"),
Status = structure(c(1L, 3L, 2L, 1L, 3L), .Label = c("Read",
"Review", "Write"), class = "factor"), DateCreated = structure(1:5, .Label = c("2017-11-04 18:24:55",
"2017-11-04 18:24:56", "2017-11-04 18:25:16", "2017-11-04 18:26:17",
"2017-11-04 18:26:47"), class = "factor"), Group = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "Red", class = "factor")), class = "data.frame", row.names = c(NA,
-5L))

I would do something like that:
df %>%
mutate(DateCreated = ymd_hms(DateCreated))%>%
group_by(minute(DateCreated))%>%
arrange(DateCreated)%>%
summarise(Status = paste(Status,collapse = ", "),DateCreated = DateCreated[1],Date_ended = last(DateCreated),Group = paste(Group,collapse = ", "))

library(lubridate)
library(dplyr)
library(purrr)
df <-
structure(
list(
Id = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "10", class = "factor"),
Status = structure(
c(1L, 3L, 2L, 1L, 3L),
.Label = c("Read",
"Review", "Write"),
class = "factor"
),
DateCreated = structure(
1:5,
.Label = c(
"2017-11-04 18:24:55",
"2017-11-04 18:24:56",
"2017-11-04 18:25:16",
"2017-11-04 18:26:17",
"2017-11-04 18:26:47"
),
class = "factor"
),
Group = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "Red", class = "factor")
),
class = "data.frame",
row.names = c(NA,-5L)
)
df2 <-
df %>%
mutate(DateCreated = as_datetime(df$DateCreated)) %>%
arrange(DateCreated) %>%
mutate(diff = DateCreated - lag(DateCreated))
df2$diff[1] <- 0L
g <- 0
df3 <- mutate(df2, date_groups =
accumulate(df2$diff, function(x, y)
if (y - x < 60)
g
else {
g <<- g + 1
})) %>%
group_by(date_groups) %>%
summarise(
Status = paste(Status, collapse = ", "),
DateCreated = DateCreated[1],
Date_ended = last(DateCreated),
Group = paste(Group, collapse = ", ")
)
df3
#> # A tibble: 2 x 5
#> date_groups Status DateCreated Date_ended Group
#> <dbl> <chr> <dttm> <dttm> <chr>
#> 1 0 Read, Write… 2017-11-04 18:24:55 2017-11-04 18:24:55 Red, Re…
#> 2 1 Read, Write 2017-11-04 18:26:17 2017-11-04 18:26:17 Red, Red
Created on 2019-01-28 by the reprex package (v0.2.1)

Related

appending results together from broom::glance

Say I have this data:
df <- structure(list(a_bracket = structure(c(9L, 8L, 9L,
9L, 9L, 9L), .Label = c("0-15", "16-20", "21-60", "61-100", "101-500",
"501-1000", "1001-3500", "3501-5000", "5001+"), class = "factor"), b_bracket = structure(c(3L,
2L, 3L, 4L, 1L, 4L), .Label = c("18-25", "26-35", "36-40", "41-45",
"46-48", "49-70", "71+"), class = "factor"), gender = structure(c(2L,
2L, 2L, 2L, 1L, 2L), .Label = c("Female", "Male"), class = "factor"),
q1 = structure(c(2L, 2L, 4L, 3L, 1L, 4L
), .Label = c("I don't\nlike a thing",
"I don't\na thing at all", "I like a\nthing",
"Ambivalent about\nthe thing"), class = "factor"), q2 = structure(c(3L,
2L, 1L, 1L, 4L, 1L), .Label = c("Neither like\nnor dislike",
"Somewhat\ndislike", "Somewhat\nlike", "Strongly\ndislike",
"Strongly\nlike"), class = "factor"), q3 = structure(c(2L,
2L, 2L, 3L, 2L, 1L), .Label = c("Moderately", "Not at\nall",
"Quite", "Slightly", "Very"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
df
# A tibble: 6 x 6
a_bracket b_bracket gender q1 q2 q3
<fct> <fct> <fct> <fct> <fct> <fct>
1 5001+ 36-40 Male "I don't\na thing at all" "Somewhat\nlike" "Not at\nall"
2 3501-5000 26-35 Male "I don't\na thing at all" "Somewhat\ndislike" "Not at\nall"
3 5001+ 36-40 Male "Ambivalent about\nthe thing" "Neither like\nnor dislike" "Not at\nall"
4 5001+ 41-45 Male "I like a\nthing" "Neither like\nnor dislike" "Quite"
5 5001+ 18-25 Female "I don't\nlike a thing" "Strongly\ndislike" "Not at\nall"
6 5001+ 41-45 Male "Ambivalent about\nthe thing" "Neither like\nnor dislike" "Moderately"
I'm trying to run a series of models, extract the r-squared and the AIC and append them together in a new df with the name of the dependent variable as the third row.
This is my attempt:
model_stats <- function(data){
mod <- glance(
lm(as.numeric(data) ~
a_bracket +
b_bracket +
gender,
data = df))
tibble(
r_squared = mod %>% select(r.squared),
AIC = mod %>% select(AIC)
)
}
map_dfr(
df %>%
select(starts_with("q")),
model_stats,
.id = "question"
) %>% unnest()
But for some reason I don't understand this repeats the output by N times for the number of models i'm running.
Does anyone know what i'm doing wrong here?
Try this -
library(tidyverse)
library(broom)
model_stats <- function(data){
mod <- glance(
lm(as.numeric(data) ~
a_bracket +
b_bracket +
gender,
data = df))
tibble(
r_squared = mod %>% pull(r.squared),
AIC = mod %>% pull(AIC)
)
df %>%
select(starts_with('q')) %>%
map_df(model_stats, .id = 'question')
# question r_squared AIC
# <chr> <dbl> <dbl>
#1 q1 6.59e- 1 21.8
#2 q2 7.5 e- 1 20.4
#3 q3 2.22e-31 20.4

How to I get accuracy values by group [duplicate]

This question already has answers here:
Mean per group in a data.frame [duplicate]
(8 answers)
Closed 3 years ago.
I can't get the average accuracies (proportion of TRUE values) in Correct_answers columns for the groups chart type and condition.
data
structure(list(Element = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), Correct_answer = structure(c(2L,
2L, 2L, 1L, 2L), .Label = c("FALSE", "TRUE"), class = "factor"),
Response_time = c(25.155, 6.74, 28.649, 16.112, 105.5906238
), Chart_type = structure(c(2L, 2L, 1L, 1L, 1L), .Label = c("Box",
"Violin"), class = "factor"), Condition = structure(c(1L,
2L, 1L, 2L, 1L), .Label = c("0", "1"), class = "factor")), row.names = c(NA,
5L), class = "data.frame")
Average by chart_type
av_data_chartType <- data %>% group_by(Chart_type) %>% summarise_each(funs(mean, sd))
Average by condition
av_data_conition <- data %>% group_by(Condition) %>% summarise_each(funs(mean, sd))
No mean produced for accuracy
NA value is place where accuracy should be.
Reproducing your code I had a warning that led me to the answer : you shouldn't compute statistics on factor variables. If you know what you are doing you can convert them to numeric :
data <- structure(list(Element = structure(c(1L, 1L, 1L, 1L, 1L),
.Label = c("1", "2", "3", "4", "5", "6"),
class = "factor"),
Correct_answer = structure(c(2L, 2L, 2L, 1L, 2L),
.Label = c("FALSE", "TRUE"),
class = "factor"),
Response_time = c(25.155, 6.74, 28.649, 16.112, 105.5906238
),
Chart_type = structure(c(2L, 2L, 1L, 1L, 1L),
.Label = c("Box",
"Violin"),
class = "factor"),
Condition = structure(c(1L, 2L, 1L, 2L, 1L),
.Label = c("0", "1"),
class = "factor")),
row.names = c(NA, 5L), class = "data.frame")
library("dplyr", warn.conflicts = FALSE)
data <- data %>% as_tibble
# av_data_chartType
data %>%
group_by(Chart_type) %>%
mutate_if(.predicate = is.factor, .funs = as.numeric) %>%
summarise_each(list( ~mean, ~sd))
#> `mutate_if()` ignored the following grouping variables:
#> Column `Chart_type`
#> # A tibble: 2 x 9
#> Chart_type Element_mean Correct_answer_~ Response_time_m~ Condition_mean
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 Box 1 1.67 50.1 1.33
#> 2 Violin 1 2 15.9 1.5
#> # ... with 4 more variables: Element_sd <dbl>, Correct_answer_sd <dbl>,
#> # Response_time_sd <dbl>, Condition_sd <dbl>
# av_data_condition
data %>%
group_by(Condition) %>%
mutate_if(.predicate = is.factor, .funs = as.numeric) %>%
summarise_each(list( ~mean, ~sd))
#> `mutate_if()` ignored the following grouping variables:
#> Column `Condition`
#> # A tibble: 2 x 9
#> Condition Element_mean Correct_answer_~ Response_time_m~ Chart_type_mean
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 0 1 2 53.1 1.33
#> 2 1 1 1.5 11.4 1.5
#> # ... with 4 more variables: Element_sd <dbl>, Correct_answer_sd <dbl>,
#> # Response_time_sd <dbl>, Chart_type_sd <dbl>
Created on 2019-06-11 by the reprex package (v0.2.1)
This should work:
a$Correct_answer <- as.logical(a$Correct_answer)
av_data_chartType <- a %>% select(Chart_type, Correct_answer) %>% group_by(Chart_type) %>% summarise_each(funs(mean, sd))
av_data_chartType <- a %>% select(Condition, Correct_answer) %>% group_by(Condition) %>% summarise_each(funs(mean, sd))
You had 2 problems:
Your Correct_answer was a factor.
You tried to calculate your functions over every Column
You probably need
library(dplyr)
data %>%
mutate(Correct_answer = as.logical(Correct_answer)) %>%
group_by(Chart_type, Condition) %>%
summarise(avg = mean(Correct_answer))
Or if you need them separately
data %>%
mutate(Correct_answer = as.logical(Correct_answer)) %>%
group_by(Chart_type) %>%
summarise(avg = mean(Correct_answer))
data %>%
mutate(Correct_answer = as.logical(Correct_answer)) %>%
group_by(Condition) %>%
summarise(avg = mean(Correct_answer))

Calculate max value across multiple columns by multiple groups

I have a data file with numeric values in three columns and two grouping variables (ID and Group) from which I need to calculate a single max value by ID and Group:
structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
The result I am trying to obtain is:
structure(list(ID = structure(c(1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Max = c(11L, 5L, 11L)), class = "data.frame",
row.names = c(NA,
-3L))
I am trying the following in dplyr:
SampTable<-SampDF %>% group_by(ID,Group) %>%
summarize(max = pmax(SampDF$Score1, SampDF$Score2,SampDF$Score3))
But it generates this error:
Error in summarise_impl(.data, dots) :
Column `max` must be length 1 (a summary value), not 4
Is there an easy way to achieve this in dplyr or data.table?
Solution using data.table. Find max value on 3:5 columns (Score columns) by ID and Group.
library(data.table)
setDT(d)
d[, .(Max = do.call(max, .SD)), .SDcols = 3:5, .(ID, Group)]
ID Group Max
1: a1 abc 11
2: a1 def 5
3: a2 def 11
Data:
d <- structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
A solution using tidyverse.
library(tidyverse)
dat2 <- dat1 %>%
gather(Column, Value, starts_with("Score")) %>%
group_by(ID, Group) %>%
summarise(Max = max(Value)) %>%
ungroup()
dat2
# # A tibble: 3 x 3
# ID Group Max
# <fct> <fct> <dbl>
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here are couple of other options with tidyverse
library(tidyverse)
df1 %>%
group_by(ID, Group) %>%
nest %>%
mutate(Max = map_dbl(data, ~ max(unlist(.x)))) %>%
select(-data)
Or using pmax
df1 %>%
mutate(Max = pmax(!!! rlang::syms(names(.)[3:5]))) %>%
group_by(ID, Group) %>%
summarise(Max = max(Max))
# A tibble: 3 x 3
# Groups: ID [?]
# ID Group Max
# <fct> <fct> <dbl>
#1 a1 abc 11
#2 a1 def 5
#3 a2 def 11
Or using base R
aggregate(cbind(Max = do.call(pmax, df1[3:5])) ~ ID + Group, df1, max)
Here is a tidyverse solution using nest :
library(tidyverse)
df %>%
nest(-(1:2),.key="Max") %>%
mutate_at("Max",map_dbl, max)
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
In base R:
res <- aggregate(. ~ ID + Group,df,max)
res <- cbind(res[1:2], Max = do.call(pmax,res[-(1:2)]))
res
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here is a base R solution
# gives 2x2 table
x <- by(df[, !names(df) %in% c("ID", "Group")], list(df$ID, df$Group), max)
# get requested format
tmp <- expand.grid(ID = rownames(x), Group = colnames(x))
tmp$Max <- as.vector(x)
tmp[complete.cases(tmp), ]
#R ID Group Max
#R 1 a1 abc 11
#R 3 a1 def 5
#R 4 a2 def 11
with
df <- structure(list(
ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1", "a2"), class = "factor"),
Group = structure(c(1L, 1L, 2L, 2L), .Label = c("abc", "def"), class = "factor"),
Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L, 0L, 5L, 10L),
Score3 = c(0L, 11L, 2L, 11L)),
class = "data.frame", row.names = c(NA, -4L))

merging and counting similar strings

I have a data with three columns like
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data for example
in df1, I have the first row I have Q92793,Q09472,Q9Y6Q9,Q92831
then I look at df2 and df3 and see if any of these members are in there then in this example, I make the following data
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1, it is the count of strings separated by a ,which is 4
df2 is 0 because there was not any similar string accords df2
df3 is Q92793,Q09472 which paste the string which were similar in here
a desire output looks like below
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The below function does not work , for example, use this data as input
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
This works for your example:
# First convert factors to strings to lists
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,','))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = Position(not.empty, df2.intersect, nomatch=0)
out[r, 'df3'] = Position(not.empty, df3.intersect, nomatch=0)
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(Find(not.empty, df2.intersect, nomatch=0), collapse=',')
out[r, 'df3.1'] = paste(Find(not.empty, df3.intersect, nomatch=0), collapse=',')
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,',\\s*'))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = paste(which(sapply(df2.intersect, not.empty)), collapse=',')
out[r, 'df3'] = paste(which(sapply(df3.intersect, not.empty)), collapse=',')
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(unique(unlist(df2.intersect)), collapse=',')
out[r, 'df3.1'] = paste(unique(unlist(df3.intersect)), collapse=',')
}
out[out==""] = "0"

R program, ?count, rename "freq" to something else

I am studying this webpage, and cannot figure out how to rename freq to something else, say number of times imbibed
Here is dput
structure(list(name = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("Bill", "Llib"), class = "factor"), drink = structure(c(2L,
3L, 1L, 4L, 2L, 3L, 1L, 4L), .Label = c("cocoa", "coffee", "tea",
"water"), class = "factor"), cost = 1:8), .Names = c("name",
"drink", "cost"), row.names = c(NA, -8L), class = "data.frame")
And this is working code with output. Again, I'd like to rename the freq column. Thanks!
library(plyr)
bevs$cost <- as.integer(bevs$cost)
count(bevs, "name")
Output
name freq
1 Bill 4
2 Llib 4
Are you trying to do this?
counts <- count(bevs, "name")
names(counts) <- c("name", "number of times imbibed")
counts
The count() function returns a data.frame. Just rename it like any other data.frame:
counts <- count(bevs, "name")
names(counts)[which(names(counts) == "freq")] <- "number of times imbibed"
print(counts)
# name number of times imbibed
# 1 Bill 4
# 2 Llib 4

Resources