I have paired data for 10 subjects (with some missing and some ties). My goal is to select the eye with the best disc_grade (A > B > C) and label ties accordingly from the data frame below.
I'm stuck on how to use R code to select the rows with the best disc_grade for each subject.
df <- structure(list(patientID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10), eye = c("R", "L", "R", "L", "R",
"L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
"R", "L"), disc_grade = c(NA, "B", "C", "B", "B", "C", "B", "C",
"B", "A", "B", "B", "C", "B", NA, NA, "B", "C", "B", "C")), .Names = c("patientID", "eye", "disc_grade"), class = c("tbl_df", "data.frame"), row.names = c(NA, -20L))
The desired output is:
patientID eye disc_grade
2 1 L B
4 2 L B
5 3 R B
7 4 R B
10 5 L A
11 6 Tie B
14 7 L B
17 9 R B
19 10 R B
This seems to work:
# For each patient keep the eye(s) holding the smallest disc_grade string,
# then collapse to one row per patient.
df %>%
group_by(patientID) %>%
# min() on the grade letters picks the alphabetically first one ("A" before "B");
# rows whose comparison evaluates to NA are dropped by filter().
filter(disc_grade == min(disc_grade, na.rm=TRUE)) %>%
# More than one surviving row means both eyes share the best grade: label it "Tie".
summarise(eye = if (n()==1) eye else "Tie", disc_grade = first(disc_grade))
patientID eye disc_grade
(dbl) (chr) (chr)
1 1 L B
2 2 L B
3 3 R B
4 4 R B
5 5 L A
6 6 Tie B
7 7 L B
8 9 R B
9 10 R B
There is a warning for group 8, but we get the desired result thanks to how filter works on NAs.
With data.table:
# data.table version: within each patient, subset .SD to the rows holding the
# smallest disc_grade, then build the one-row summary from that subset.
setDT(df)[,
.SD[ disc_grade == min(disc_grade, na.rm=TRUE) ][,
.( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
]
, by=patientID]
Again, there's a warning, but now we do get a row for group 8, since [ does not ignore NAs. To get around this, you could filter the NAs before or after the operation (as in other answers). My best idea for doing it during the main operation is pretty convoluted:
# Same idea, but without emitting a row for patients whose grades are all NA.
setDT(df)[,
# which() keeps only the positions where the comparison is TRUE, so NA
# comparisons no longer select rows.
.SD[ which(disc_grade == min(disc_grade, na.rm=TRUE)) ][,
# With no surviving rows the `if` has no else branch and yields NULL,
# so nothing is returned for that patient.
if (.N >= 1) list( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
]
, by=patientID]
One option with data.table
library(data.table)
# Drop rows with a missing grade, relabel the eye as "Tie" for patients whose
# remaining rows all share one grade, then keep the best-graded row per patient.
# `&&` is used because the `if` condition is a single TRUE/FALSE per group.
na.omit(setDT(df))[
  ,
  eye := if (uniqueN(disc_grade) == 1 && .N > 1) "Tie" else eye,
  by = patientID
][
  # sort by the explicit grade ranking A > B > C so the best row comes first
  order(factor(disc_grade, levels = c("A", "B", "C"))),
  .SD[1L],
  by = patientID
][order(patientID)]
# patientID eye disc_grade
#1: 1 L B
#2: 2 L B
#3: 3 R B
#4: 4 R B
#5: 5 L A
#6: 6 Tie B
#7: 7 L B
#8: 9 R B
#9: 10 R B
library(dplyr)
df <- structure(list(patientID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10), eye = c("R", "L", "R", "L", "R",
"L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
"R", "L"), disc_grade = c(NA, "B", "C", "B", "B", "C", "B", "C",
"B", "A", "B", "B", "C", "B", NA, NA, "B", "C", "B", "C")), .Names = c("patientID", "eye", "disc_grade"), class = c("tbl_df", "data.frame"), row.names = c(NA, -20L))
# Pick, per patient, the eye with the best (alphabetically smallest) disc_grade
# and label patients whose eyes share that grade as a tie.
df %>%
  filter(!is.na(disc_grade)) %>%                    # remove rows with NAs
  group_by(patientID) %>%                           # for each patient
  filter(disc_grade == min(disc_grade)) %>%         # keep the eye(s) with the best score
  mutate(eye_upd = if (n() > 1) "tie" else eye) %>% # both eyes kept means a tie
  ungroup() %>%                                     # do not leak the grouping downstream
  select(patientID, eye_upd, disc_grade) %>%
  distinct()                                        # tied patients collapse to one row
# patientID eye_upd disc_grade
# (dbl) (chr) (fctr)
# 1 1 L B
# 2 2 L B
# 3 3 R B
# 4 4 R B
# 5 5 L A
# 6 6 tie B
# 7 7 L B
# 8 9 R B
# 9 10 R B
There's certainly a better way to do this, but this gets the job done...need more coffee...
# Find each patient's best grade, join back to the original rows to recover
# which eye(s) hold it, and collapse to one row per patient.
df_orig <- df
library(dplyr)
df %>%
  filter(!is.na(disc_grade)) %>%
  group_by(patientID) %>%
  summarise(best = min(disc_grade)) %>%
  left_join(., df_orig, by = c("patientID" = "patientID",
                               "best" = "disc_grade")) %>%
  group_by(patientID) %>%
  # a patient matched by more than one eye has a tie
  mutate(eye = ifelse(n() > 1, "tie", eye)) %>%
  # .keep_all = TRUE retains eye and best; without it distinct() returns only
  # patientID and the select() below fails
  distinct(patientID, .keep_all = TRUE) %>%
  ungroup() %>%
  select(patientID, eye, best)
Note: I am able to get away with min(disc_grade) because of type conversion. Consider looking at as.numeric(as.factor(df$disc_grade)).
Related
I would like to duplicate each observation based on the count. For example:
If count == 3, duplicate the observation three times but replacing the count with 1 each time.
If count == 1, no changes are required.
# Sample data
df <- tibble(
x = c("A", "C", "C", "B", "C", "A", "A"),
y = c("Y", "N", "Y", "N", "N", "N", "Y"),
count = c(1, 1, 3, 2, 1, 1, 1)
)
# Target output
# Expected result: each original row repeated `count` times, with count set to 1.
df <- tibble(
  x = c("A", "C", "C", "C", "C", "B", "B", "C", "A", "A"),
  y = c("Y", "N", "Y", "Y", "Y", "N", "N", "N", "N", "Y"),
  count = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
)
Using dplyr and tidyr,
# Repeat every row `count` times, keeping the count column (.remove = FALSE).
# Only a count of exactly 3 is reset to 1 here; other values are left as they are.
df %>%
  uncount(count, .remove = FALSE) %>%
  mutate(count = ifelse(count == 3, 1, count))
The output is
x y count
<chr> <chr> <dbl>
1 A Y 1
2 C N 1
3 C Y 1
4 C Y 1
5 C Y 1
6 B N 2
7 B N 2
8 C N 1
9 A N 1
10 A Y 1
I have a data set something like this:
df_A <- tribble(
~product_name, ~position, ~cat_id, ~pr,
"A", 1, 1, "X",
"A", 4, 2, "X",
"A", 3, 3, "X",
"B", 4, 5, NA,
"B", 6, 6, NA,
"C", 3, 1, "Y",
"C", 5, 2, "Y",
"D", 6, 2, "Z",
"D", 4, 8, "Z",
"D", 3, 9, "Z",
)
Now, I want to look up 1 and 2 in cat_id and find the corresponding position for each product_name. If there is no 1 or 2 in cat_id, then these three variables should just be NA. Please see my desired data set to get a better understanding:
desired <- tribble(
~product_name, ~position_1, ~position_2, ~pr,
"A", 1, 4, "X",
"B", NA, NA, NA,
"C", 3, 5, "Y",
"D", NA, 6, "Z",
)
How can I get it?
We can filter the rows based on the 'cat_id', then if some of the 'product_name' are missing, use complete to expand the dataset and use pivot_wider to reshape into 'wide' format
library(dplyr)
library(tidyr)
library(stringr)
# Build one row per product_name with the positions of cat_id 1 and 2 as columns.
df_A %>%
# keep only the rows for cat_id 1 or 2
filter(cat_id %in% 1:2) %>%
# turn the id into the future column name, e.g. "position_1"
mutate(cat_id = str_c('position_', cat_id)) %>%
# re-add products that lost all their rows in the filter; their other columns are NA
complete(product_name = unique(df_A$product_name)) %>%
pivot_wider(names_from = cat_id, values_from = position) %>%
# the re-added rows have cat_id NA, which pivots into a column literally named `NA`
select(-`NA`)
# A tibble: 4 x 4
# product_name pr position_1 position_2
# <chr> <chr> <dbl> <dbl>
#1 A X 1 4
#2 B <NA> NA NA
#3 C Y 3 5
#4 D Z NA 6
Or using reshape/subset from base R
reshape(merge(data.frame(product_name = unique(df_A$product_name)),
subset(df_A, cat_id %in% 1:2), all.x = TRUE),
idvar = c('product_name', 'pr'), direction = 'wide', timevar = 'cat_id')[-5]
This question already has answers here:
Collapse text by group in data frame [duplicate]
(2 answers)
Aggregating by unique identifier and concatenating related values into a string [duplicate]
(4 answers)
Closed 3 years ago.
I have one data frame. I want to find the rows where both columns A and B are duplicated, and then combine those rows by concatenating the elements in column C.
My example:
DF = cbind.data.frame(A = c(1, 1, 2, 3, 3),
B = c("a", "b", "a", "c", "c"),
C = c("M", "N", "X", "M", "N"))
My expected result:
DFE = cbind.data.frame(A = c(1, 1, 2, 3),
B = c("a", "b", "a", "c"),
C = c("M", "N", "X", "M; N"))
Thanks a lot
Without packages:
# Collapse the C values of every (A, B) combination into one "; "-separated
# string; `collapse` is forwarded by aggregate() to paste().
DF <- aggregate(C ~ A + B, data = DF, FUN = paste, collapse = "; ")
Output:
A B C
1 1 a M
2 2 a X
3 1 b N
4 3 c M; N
Or with data.table:
# For every (A, B) group, join the C values into a single "; "-separated string.
setDT(DF)[, .(C = paste(C, collapse = "; ")), by = .(A, B)]
This is a tidyverse based solution where you can use paste with collapse after grouping it.
library(dplyr)
DF = cbind.data.frame(A = c(1, 1, 2, 3, 3),
B = c("a", "b", "a", "c", "c"),
C = c("M", "N", "X", "M", "N"))
DFE = cbind.data.frame(A = c(1, 1, 2, 3),
B = c("a", "b", "a", "c"),
C = c("M", "N", "X", "M; N"))
# Group by the A/B key and join each group's C values with "; " so the result
# matches the expected DFE defined above.
DF %>%
  group_by(A, B) %>%
  summarise(C = paste(C, collapse = "; "))
#> # A tibble: 4 x 3
#> # Groups: A [3]
#> A B C
#> <dbl> <fct> <chr>
#> 1 1 a M
#> 2 1 b N
#> 3 2 a X
#> 4 3 c M; N
Created on 2019-03-19 by the reprex package (v0.2.1)
The following code:
df <- data.frame(
"letter" = c("a", "b", "c", "d", "e", "f"),
"score" = seq(1,6)
)
Results in the following dataframe:
letter score
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
I want to get the scores for a sequence of letters, for example the scores of c("f", "a", "d", "e"). It should result in c(6, 1, 4, 5).
What's more, I want to get the scores for c("c", "o", "f", "f", "e", "e"). Now the o is not in the letter column so it should return NA, resulting in c(3, NA, 6, 6, 5, 5).
What is the best way to achieve this? Can I use dplyr for this?
We can use match to create an index and extract the corresponding 'score'. If there is no match, then by default it gives NA.
df$score[match(v1, df$letter)]
#[1] 3 NA 6 6 5 5
df$score[match(v2, df$letter)]
#[1] 6 1 4 5
data
v1 <- c("c", "o", "f", "f", "e", "e")
v2 <- c("f", "a", "d", "e")
If you want to use dplyr I would use a join:
# Lookup table mapping each letter to its score (1 to 6).
# seq_len(6) replaces seq(1:6), which passed a whole vector to seq().
df <- data.frame(
  "letter" = c("a", "b", "c", "d", "e", "f"),
  "score" = seq_len(6)
)
library(dplyr)
df2 <- data.frame(letter = c("c", "o", "f", "f", "e", "e"))
left_join(df2, df, by = "letter")
letter score
1 c 3
2 o NA
3 f 6
4 f 6
5 e 5
6 e 5
Below is my test data and code to summarise the tbl table by counting the number of positive values, and then sum the counts over 5 consecutive rows using rollapply with FUN sum. I am getting NA at rows 1,2 - 5,6,7,8 - 10,11.
The NAs at rows 5,6 and 10,11 are due to the missing next rows, which is expected, but I don't understand why I am getting NA at rows 1,2 and 7,8. Can someone take a look at the code and point out my mistake?
library(tidyverse)
library(zoo)
tbl<-tribble(
~z, ~x, ~y,
"x", "a", 2,
"x", "b", 1,
"x", "b", 3,
"y", "c", 3,
"x", "c", 1,
"x", "d", -1,
"x", "q", 2,
"x", "q", 2,
"x", "a", 2,
"x", "s", -1,
"y", "q", -1,
"y", "b", 3,
"x", "c", 3,
"y", "c", -1,
"y", "q", 1,
"y", "w", 2,
"y", "w", -2,
"y", "t", 2,
"y", "t", 1
)
# Count the positive y values per (z, x), then take a 5-row rolling sum of
# those counts within each z.
tbl %>%
  group_by(z, x) %>%
  # summarise() drops the last grouping level, so the result stays grouped by z
  summarise(xy = sum(y > 0, na.rm = TRUE)) %>%
  # rollapply() centres the window by default, so with width = 5 the first two
  # and the last two rows of every z group have no complete window and are
  # filled with NA
  mutate(zzz = rollapply(xy, width = 5, sum, fill = NA))
Output:
# A tibble: 11 x 4
# Groups: z [2]
z x xy zzz
<chr> <chr> <int> <dbl>
1 x a 2 NA
2 x b 2 NA
3 x c 2 8
4 x d 0 6
5 x q 2 NA
6 x s 0 NA
7 y b 1 NA
8 y c 1 NA
9 y q 1 6
10 y t 2 NA
11 y w 1 NA