Beloe is my test data and code to summarise the tbl table by counting num of positive values. Then add 5 consecutive row counts using rollapply and FUN sum. I am getting NA at rows 1,2 - 5,6,7,8 - 10,11.
NA at 5,6 and 10, 11 is due to the missing next rows which is expected but I don't understand why I am getting NA at rows 1,2 and 7,8. Can some take a look at the code and point my mistake?
library(tidyverse)
library(zoo)
tbl<-tribble(
~z, ~x, ~y,
"x", "a", 2,
"x", "b", 1,
"x", "b", 3,
"y", "c", 3,
"x", "c", 1,
"x", "d", -1,
"x", "q", 2,
"x", "q", 2,
"x", "a", 2,
"x", "s", -1,
"y", "q", -1,
"y", "b", 3,
"x", "c", 3,
"y", "c", -1,
"y", "q", 1,
"y", "w", 2,
"y", "w", -2,
"y", "t", 2,
"y", "t", 1
)
tbl %>%
group_by(z, x) %>%
summarise(xy = sum(y>0, na.rm = T))%>%
mutate(zzz = rollapply(xy, width=5, sum, fill=NA))
Output:
# A tibble: 11 x 4
# Groups: z [2]
z x xy zzz
<chr> <chr> <int> <dbl>
1 x a 2 NA
2 x b 2 NA
3 x c 2 8
4 x d 0 6
5 x q 2 NA
6 x s 0 NA
7 y b 1 NA
8 y c 1 NA
9 y q 1 6
10 y t 2 NA
11 y w 1 NA
Related
I have the following data example and code:
lt1 <- list(df1 <- data.frame(V1 = c("a", "b"),
V2 = c("b", "c"),
V3 = c(1, 2)),
df2 <- data.frame(V1 = c("x", "y"),
V2 = c("x", "z"),
V3 = c(1, 2)))
lvls_func <- function(x) {
x[1:2] %>%
unlist() %>%
unique() %>%
sort()
}
lt_lvls <- lapply(lt1, lvls_func)
complete_func <- function(x) {
tidyr::complete(x[1] = factor(x[1], levels = lt_lvls),
x[2] = factor(x[2], levels = lt_lvls),
x[3] = x[3],
fill = list(x[3] = 0))
}
lt1_final <- lapply(lt1, complete_func)
I have difficulty building my complete_func().
I getting this error when I run my complete_func()
Error: unexpected '=' in:
"complete_func <- function(x) {
tidyr::complete(x[1] ="
In my final list lt1_final I expect this output:
lt1_final <- list(df1 <- data.frame(V1 = c("a", "b", "a", "a", "b", "b", "c", "c", "c"),
V2 = c("b", "c", "a", "c", "b", "a", "a", "b", "c"),
V3 = c(1, 2, 0, 0, 0, 0, 0, 0, 0)),
df2 <- data.frame(V1 = c("x", "y", "x", "x", "y", "y", "z", "z", "z"),
V2 = c("x", "z", "y", "z", "y", "x", "z", "x", "y"),
V3 = c(1, 2, 0, 0, 0, 0, 0, 0, 0)))
Thanks all help
As the lt_lvls is a list of levels, we may need either Map (from base R) or use purrr::map2.
In addition, create the function by making use of across. There are multiple changes in the function
Add an argument lvls in the function
Convert the columns 1 to 2 to factor by looping across within mutate, specify the lvls
Apply complete on the subset of data using either splicing (!!!) (or could use invoke/exec), and specify the fill as a named list with dplyr::lst (or regular list with setNames)
library(dplyr)
library(tidyr)
library(purrr)
complete_func <- function(x, lvls) {
x %>%
dplyr::mutate(across(1:2, factor, levels =lvls)) %>%
tidyr::complete(!!! .[1:2], fill = dplyr::lst(!! names(.)[3] := 0)) %>%
arrange(across(3, ~ .x == 0))
}
-testing
map2(lt1, lt_lvls, ~ complete_func(.x, .y))
[[1]]
# A tibble: 9 × 3
V1 V2 V3
<fct> <fct> <dbl>
1 a b 1
2 b c 2
3 a a 0
4 a c 0
5 b a 0
6 b b 0
7 c a 0
8 c b 0
9 c c 0
[[2]]
# A tibble: 9 × 3
V1 V2 V3
<fct> <fct> <dbl>
1 x x 1
2 y z 2
3 x y 0
4 x z 0
5 y x 0
6 y y 0
7 z x 0
8 z y 0
9 z z 0
I would like to duplicate each observation based on the count. For example:
If count == 3, duplicate the observation three times but replacing the count with 1 each time.
If count == 1, no changes are required.
# Sample data
df <- tibble(
x = c("A", "C", "C", "B", "C", "A", "A"),
y = c("Y", "N", "Y", "N", "N", "N", "Y"),
count = c(1, 1, 3, 2, 1, 1, 1)
)
# Target output
df <- tibble(
x = c("A", "C", "C", "C", "C", "B", "B", "C", "A", "A"),
y = c("Y", "N", "Y", "Y", "Y", "N", "N", "N", "N", "Y"),
count = (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
)
Using dplyr and tidyr,
df %>% uncount(count, .remove = F) %>%
mutate(count = ifelse(count==3,1, count))
The output is
x y count
<chr> <chr> <dbl>
1 A Y 1
2 C N 1
3 C Y 1
4 C Y 1
5 C Y 1
6 B N 2
7 B N 2
8 C N 1
9 A N 1
10 A Y 1
I have a data set as I've shown below:
data <- tribble(
~book_name, ~clicks, ~type,
"A", 10, "X",
"B", 20, "Y",
"C", 30, "Y",
"A", 10, "Z",
"A", 10, "X",
)
Now, I want to copy and paste the rows if the type is "X". So, my desired data set is something like this:
desired_data <- tribble(
~book_name, ~clicks, ~type,
"A", 10, "X",
"B", 20, "Y",
"C", 30, "Y",
"A", 10, "Z",
"A", 10, "X",
"A", 10, "X",
"A", 10, "X",
)
How to do this?
Filter and bind rows
data_x <- data %>% filter(type == 'X')
desired_data <- bind_rows(data,data_x)
A base R solution. The idea is to prepare the row indices for the desired output. 1:nrow(data) is for all rows. which(data$type == "X") is for the rows you would like to duplicate. By combing these two parts together, we can get the desired output.
data[c(1:nrow(data), which(data$type == "X")), ]
# # A tibble: 7 x 3
# book_name clicks type
# <chr> <dbl> <chr>
# 1 A 10 X
# 2 B 20 Y
# 3 C 30 Y
# 4 A 10 Z
# 5 A 10 X
# 6 A 10 X
# 7 A 10 X
I have dataframe df1:
A|A2|A3
-+--+--
a|ut|x
a|tv|y
a|ut|x
a|pq|y
a|ut|y
b|st|x
b|qp|x
b|nt|y
c|st|x
c|st|x
c|st|y
c|st|z
I want to know the frequency of A2 with unique A3 for each A, i.e. I want following output:
A|A2|freq
-+--+----
a|ut|2
a|tv|1
a|pq|1
b|st|1
b|qp|1
b|nt|1
c|st|3
I tried
count(df1, A, A2, A3)
but get the error
Error in count(df1, A, A2, A3) : unused argument A3
With dplyr, you can use distinct to remove duplicated rows, then count to aggregate:
library(dplyr)
df1 <- data_frame(A = c("a", "a", "a", "a", "a", "b", "b", "b", "c", "c", "c", "c"),
A2 = c("ut", "tv", "ut", "pq", "ut", "st", "qp", "nt", "st", "st", "st", "st"),
A3 = c("x", "y", "x", "y", "y", "x", "x", "y", "x", "x", "y", "z"))
df2 <- df1 %>% distinct() %>% count(A, A2)
df2
#> # A tibble: 7 x 3
#> A A2 n
#> <chr> <chr> <int>
#> 1 a pq 1
#> 2 a tv 1
#> 3 a ut 2
#> 4 b nt 1
#> 5 b qp 1
#> 6 b st 1
#> 7 c st 3
or more generalizably, use n_distinct:
df1 %>% group_by(A, A2) %>% summarise(freq = n_distinct(A3))
Here is an option with data.table
library(data.table)
setDT(df1)[, .(freq = uniqueN(A3)), .(A, A2)]
# A A2 freq
#1: a ut 2
#2: a tv 1
#3: a pq 1
#4: b st 1
#5: b qp 1
#6: b nt 1
#7: c st 3
data
df1 <- structure(list(A = c("a", "a", "a", "a", "a", "b", "b", "b",
"c", "c", "c", "c"), A2 = c("ut", "tv", "ut", "pq", "ut", "st",
"qp", "nt", "st", "st", "st", "st"), A3 = c("x", "y", "x", "y",
"y", "x", "x", "y", "x", "x", "y", "z")), .Names = c("A", "A2",
"A3"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L))
You can use aggregate for this:
> aggregate(A3 ~ A+A2, data=df1, FUN=function(x) length(unique(x)))
A A2 A3
1 b nt 1
2 a pq 1
3 b qp 1
4 b st 1
5 c st 3
6 a tv 1
7 a ut 2
I have paired data for 10 subjects (with some missing and some ties). My goal is to select the eye with the best disc_grade (A > B > C) and label ties accordingly from the data frame below.
I'm stuck on how to use R code to select the rows with the best disc_grade for each subject.
df <- structure(list(patientID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10), eye = c("R", "L", "R", "L", "R",
"L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
"R", "L"), disc_grade = c(NA, "B", "C", "B", "B", "C", "B", "C",
"B", "A", "B", "B", "C", "B", NA, NA, "B", "C", "B", "C")), .Names = c("patientID", "eye", "disc_grade"), class = c("tbl_df", "data.frame"), row.names = c(NA, -20L))
The desired output is:
patientID eye disc_grade
2 1 L B
4 2 L B
5 3 R B
7 4 R B
10 5 L A
11 6 Tie B
14 7 L B
17 9 R B
19 10 R B
This seems to work:
df %>%
group_by(patientID) %>%
filter(disc_grade == min(disc_grade, na.rm=TRUE)) %>%
summarise(eye = if (n()==1) eye else "Tie", disc_grade = first(disc_grade))
patientID eye disc_grade
(dbl) (chr) (chr)
1 1 L B
2 2 L B
3 3 R B
4 4 R B
5 5 L A
6 6 Tie B
7 7 L B
8 9 R B
9 10 R B
There is a warning for group 8, but we get the desired result thanks to how filter works on NAs.
With data.table:
setDT(df)[,
.SD[ disc_grade == min(disc_grade, na.rm=TRUE) ][,
.( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
]
, by=patientID]
Again, there's a warning, but now we do get a row for group 8, since [ does not ignore NAs. To get around this, you could filter the NAs before or after the operation (as in other answers). My best idea for doing it during the main operation is pretty convoluted:
setDT(df)[,
.SD[ which(disc_grade == min(disc_grade, na.rm=TRUE)) ][,
if (.N >= 1) list( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
]
, by=patientID]
One option with data.table
library(data.table)
na.omit(setDT(df))[, eye:=if(uniqueN(disc_grade)==1 &
.N >1) 'Tie' else eye, patientID
][order(factor(disc_grade, levels=c('A', 'B', 'C'))),
.SD[1L] ,patientID][order(patientID)]
# patientID eye disc_grade
#1: 1 L B
#2: 2 L B
#3: 3 R B
#4: 4 R B
#5: 5 L A
#6: 6 Tie B
#7: 7 L B
#8: 9 R B
#9: 10 R B
library(dplyr)
df <- structure(list(patientID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10), eye = c("R", "L", "R", "L", "R",
"L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
"R", "L"), disc_grade = c(NA, "B", "C", "B", "B", "C", "B", "C",
"B", "A", "B", "B", "C", "B", NA, NA, "B", "C", "B", "C")), .Names = c("patientID", "eye", "disc_grade"), class = c("tbl_df", "data.frame"), row.names = c(NA, -20L))
df %>%
filter(!is.na(disc_grade)) %>% ## remove rows with NAs
group_by(patientID) %>% ## for each patient
filter(disc_grade == min(disc_grade)) %>% ## keep the row (his eye) that has the best score
mutate(eye_upd = ifelse(n() > 1, "tie", eye)) %>% ## if you kept both eyes you have a tie
select(patientID,eye_upd,disc_grade) %>%
distinct()
# patientID eye_upd disc_grade
# (dbl) (chr) (fctr)
# 1 1 L B
# 2 2 L B
# 3 3 R B
# 4 4 R B
# 5 5 L A
# 6 6 tie B
# 7 7 L B
# 8 9 R B
# 9 10 R B
There's certainly a better way to do this, but this gets the job done...need more coffee...
df_orig <- df
library(dplyr)
df %>%
filter(!is.na(disc_grade)) %>%
group_by(patientID) %>%
summarise(best = min(disc_grade)) %>%
left_join(., df_orig, by = c("patientID" = "patientID",
"best" = "disc_grade")) %>%
group_by(patientID) %>%
mutate(eye = ifelse(n() > 1, "tie", eye)) %>%
distinct(patientID) %>%
select(patientID, eye, best)
Note: I am able to get away with min(disc_grade) because of type conversation. Consider looking at as.numeric(as.factor(df$disc_grade)).