Related
I have a dataframe which looks like this.
Name info.1 info.2
ab a 1
123 a 1
de c 4
456 c 4
fg d 5
789 d 5
The two rows that need to be combined are identical aside from the name column and are together in the dataframe. I want the new dataframe to look like this:
Name ID info.1 info.2
ab 123 a 1
de 456 c 4
fg 789 d 5
I have no clue how to do this and google search hasn't been helpful so far
In base R you could do:
# Rows come in consecutive pairs: each odd row holds the name and the even row
# that follows it holds the matching ID. The info columns are identical within
# a pair, so they are taken from the even rows only.
data.frame(Name = df[seq_len(nrow(df)) %% 2 == 1, 1],
           ID = df[seq_len(nrow(df)) %% 2 == 0, 1],
           df[seq_len(nrow(df)) %% 2 == 0, 2:3])
#>   Name  ID info.1 info.2
#> 2   ab 123      a      1
#> 4   de 456      c      4
#> 6   fg 789      d      5
Created on 2022-07-20 by the reprex package (v2.0.1)
A possible solution:
library(tidyverse)
# Collapse each pair of rows (which share info.1) into a single "Name_ID"
# string, then split that string back into separate Name and ID columns.
df %>%
  group_by(info.1) %>%
  summarise(Name = str_c(Name, collapse = "_"), info.2 = first(info.2)) %>%
  # Split on exactly the "_" inserted above rather than on any
  # non-alphanumeric character. TRUE is spelled out because T is an ordinary
  # variable that can be reassigned.
  separate(Name, into = c("Name", "ID"), sep = "_", convert = TRUE) %>%
  relocate(info.1, .before = info.2)
#> # A tibble: 3 × 4
#> Name ID info.1 info.2
#> <chr> <int> <chr> <int>
#> 1 ab 123 a 1
#> 2 de 456 c 4
#> 3 fg 789 d 5
Assuming the Name column is consistently ordered Name-ID-Name-ID then:
library(tidyverse)
data <- tibble(Name = c('ab', 123, 'de', 456, 'fg', 789),
info.1 = c('a', 'a', 'c', 'c', 'd', 'd'),
info.2 = c(1, 1, 4, 4, 5, 5))
# remove the troublesome column and make a tibble
# with the unique combos of info1 and 2
data_2 <- data %>% select(info.1, info.2) %>% distinct()
# add columns for name and ID by skipping every other row in the
# original tibble
data_2$Name <- data$Name[seq(from = 1, to = nrow(data), by = 2)]
data_2$ID <- data$Name[seq(from = 2, to = nrow(data), by = 2)]
We could also use summarise and extract first as name and last as id:
data |>
group_by(info.1, info.2) |>
summarise(name = first(Name), ID = last(Name)) |>
ungroup() #|>
#relocate(3:4,1:2)
Output:
# A tibble: 3 × 4
info.1 info.2 name ID
<chr> <dbl> <chr> <chr>
1 a 1 ab 123
2 c 4 de 456
3 d 5 fg 789
We could also use
library(dplyr)
library(stringr)
data %>%
group_by(across(starts_with('info'))) %>%
mutate(ID = str_subset(Name, "^\\d+$"), .before = 2) %>%
ungroup %>%
filter(str_detect(Name, '^\\d+$', negate = TRUE))
-output
# A tibble: 3 × 4
Name ID info.1 info.2
<chr> <chr> <chr> <dbl>
1 ab 123 a 1
2 de 456 c 4
3 fg 789 d 5
data
data <- structure(list(Name = c("ab", "123", "de", "456", "fg", "789"
), info.1 = c("a", "a", "c", "c", "d", "d"), info.2 = c(1, 1,
4, 4, 5, 5)), row.names = c(NA, -6L), class = "data.frame")
I'm close but don't have the syntax correct. I'm trying to select all columns of a data table based on selection of unique combinations of two variables (columns) based on the maximum value of a third. MWE of progress thus far. Thx. J
library(dplyr)
dt1 <- tibble (var1 = c("num1", "num2", "num3", "num4", "num5"),
var2 = rep("A", 5),
var3 = c(rep("B", 2), rep("C", 3)),
var4 = c(5, 10, 3, 7, 19))
dt1 %>% distinct(var2, var3, max(var4), .keep_all = TRUE)
# A tibble: 2 x 5
var1 var2 var3 var4 `max(var4)`
<chr> <chr> <chr> <dbl> <dbl>
1 num1 A B 5 19
2 num3 A C 3 19
which is close, but I want the row where the value of var4 is the max value, within the unique combination of var2 and var3. I'm attempting to get:
# A tibble: 2 x 5
var1 var2 var3 var4 `max(var4)`
<chr> <chr> <chr> <dbl> <dbl>
1 num2 A B 5 10
2 num5 A C 3 19
Do I need a formula for the third argument of the distinct function?
We can add an arrange statement before the distinct
library(dplyr)
dt1 %>%
arrange(var2, var3, desc(var4)) %>%
distinct(var2, var3, .keep_all = TRUE)
-output
# A tibble: 2 x 4
var1 var2 var3 var4
<chr> <chr> <chr> <dbl>
1 num2 A B 10
2 num5 A C 19
Or another option is slice_max
dt1 %>%
group_by(var2, var3) %>%
mutate(var4new = first(var4)) %>%
slice_max(order_by= var4, n = 1) %>%
ungroup
-output
# A tibble: 2 x 5
var1 var2 var3 var4 var4new
<chr> <chr> <chr> <dbl> <dbl>
1 num2 A B 10 5
2 num5 A C 19 3
slice() will do what you want, though you lose the var4 values 5 and 3 that appear in your desired output (not sure whether that is important).
tibble (var1 = c("num1", "num2", "num3", "num4", "num5"),
var2 = rep("A", 5),
var3 = c(rep("B", 2), rep("C", 3)),
var4 = c(5, 10, 3, 7, 19)) %>%
group_by(var2, var3) %>%
slice(which.max(var4)) %>%
ungroup()
# A tibble: 2 x 4
var1 var2 var3 var4
<chr> <chr> <chr> <dbl>
1 num2 A B 10
2 num5 A C 19
Does this work:
library(dplyr)
dt1 %>% group_by(var2, var3) %>% filter(dense_rank(desc(var4)) == 1)
# A tibble: 2 x 4
# Groups: var2, var3 [2]
var1 var2 var3 var4
<chr> <chr> <chr> <dbl>
1 num2 A B 10
2 num5 A C 19
I want to conditionally summarize several variables by group. The following code does that, but I'm not sure how to do this without specifying each variable and the conditions in the summarize step.
library(tidyverse)
dat <- data.frame(group = c("A", "A", "A", "B", "B", "B"),
indicator = c(1, 2, 3, 1, 2, 3),
var1 = c(1, 0, 1, 2, 1, 2),
var2 = c(1, 0, 1, 1, 2, 1))
# dat
# group indicator var1 var2
#1 A 1 1 1
#2 A 2 0 0
#3 A 3 1 1
#4 B 1 2 1
#5 B 2 1 2
#6 B 3 2 1
dat %>%
group_by(group) %>%
summarise(var1 = sum(var1[indicator==1 | indicator==2]),
var2 = sum(var2[indicator==1 | indicator==2]))
# A tibble: 2 x 3
# group var1 var2
#* <chr> <dbl> <dbl>
#1 A 1 1
#2 B 3 3
Use across :
library(dplyr)
dat %>%
group_by(group) %>%
summarise(across(starts_with('var'), ~sum(.[indicator %in% 1:2])))
# group var1 var2
#* <chr> <dbl> <dbl>
#1 A 1 1
#2 B 3 3
My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
# Reshape the base-count columns (A:G) to long form, keep only the rows whose
# base matches that row's ref or var, label them, and reshape back to wide
# form before joining the two new columns onto the original data.
df1 %>%
  pivot_longer(cols = A:G) %>%
  filter(name == ref | name == var) %>%
  # Label each kept row by what it matched rather than by its position, so the
  # result stays correct when the var base comes before the ref base in A:G
  # (e.g. ref = "G", var = "A").
  mutate(nm1 = if_else(name == ref, 'ref_count', 'var_count')) %>%
  select(id, value, nm1) %>%
  # names_sort = TRUE fixes the column order as ref_count, var_count
  pivot_wider(names_from = nm1, values_from = value, names_sort = TRUE) %>%
  left_join(df1, ., by = 'id')
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
# Matrix indexing with a two-column (row, column) index picks, for every row,
# the count stored in the column named by that row's ref / var value.
# Columns 2:5 are the base-count columns A, T, C, G.
df1$ref_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$ref, names(df1)[2:5]))]
df1$var_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$var, names(df1)[2:5]))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
df1 %>%
nest(data = -id) %>%
mutate(
data = map(
data,
~mutate(., refcount = .[[ref]], var_count = .[[var]])
)
) %>%
unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
nest(data = -c(ref, var)) %>%
mutate(
data = pmap(
list(data, ref, var),
function(df, ref, var) {
mutate(df, refcount = df[[ref]], var_count = df[[var]])
}
)
) %>%
unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
library(dplyr)
mydat1 <- data.frame(ID = c(1, 1, 2, 2),
Gender = c("Male", "Female", "Male", "Male"),
Score = c(30, 40, 20, 60))
mydat1 %>%
group_by(ID, Gender) %>%
slice(which.min(Score))
# A tibble: 3 x 3
# Groups: ID, Gender [3]
ID Gender Score
<dbl> <fctr> <dbl>
1 1 Female 40
2 1 Male 30
3 2 Male 20
I'm trying to group the rows by ID and Gender. And then I want to only keep the row with the lowest Score. The above code works perfectly because when ID == 2, I only kept the entry with the lower score.
mydat2 <- data.frame(ID = c(1, 1, 2, 2),
Gender = c("Male", "Female", "Male", "Male"),
Score = c(NA, NA, 20, 60))
mydat2 %>%
group_by(ID, Gender) %>%
slice(which.min(Score))
# A tibble: 1 x 3
# Groups: ID, Gender [1]
ID Gender Score
<dbl> <fctr> <dbl>
1 2 Male 20
However, when I have NAs, which.min doesn't work like I want it to because it'll not return a valid index. Instead, all of my ID == 1 entries are erased. My desired output in this scenario is:
# A tibble: 3 x 3
# Groups: ID, Gender [3]
ID Gender Score
<dbl> <fctr> <dbl>
1 1 Female NA
2 1 Male NA
3 2 Male 20
How can I modify my code to account for this?
Edit:
df2 <- structure(list(pubmed_id = c(23091106L, 23091106L), Gender = structure(c(4L,
4L), .Label = c("", "Both", "female", "Female", "Male"), class = "factor"),
Total_Carrier = c(NA, 1107)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -2L), vars = "pubmed_id", drop = TRUE, indices = list(
0:1), group_sizes = 2L, biggest_group_size = 2L, labels = structure(list(
pubmed_id = 23091106L), class = "data.frame", row.names = c(NA,
-1L), vars = "pubmed_id", drop = TRUE, .Names = "pubmed_id"), .Names = c("pubmed_id",
"Gender", "Total_Carrier"))
> df2
# A tibble: 2 x 3
# Groups: pubmed_id [1]
pubmed_id Gender Total_Carrier
<int> <fctr> <dbl>
1 23091106 Female NA
2 23091106 Female 1107
In this example, I would want the desired output to only contain row 2 (i.e. the row with carrier sample size of 1107). However, I get the following result:
> df2 %>%
group_by(pubmed_id, Gender) %>%
slice(which.min(Total_Carrier) || 1)
# A tibble: 1 x 3
# Groups: pubmed_id, Gender [1]
pubmed_id Gender Total_Carrier
<int> <fctr> <dbl>
1 23091106 Female NA
which.min ignores the missing values, and returns integer(0) when the input vector contains solely NAs. You can add a condition check in the slice, i.e. when all Scores are NAs in a group, pick the first row:
mydat2 %>%
group_by(ID, Gender) %>%
slice({idx <- which.min(Score); if(length(idx) > 0) idx else 1})
# A tibble: 3 x 3
# Groups: ID, Gender [3]
# ID Gender Score
# <dbl> <fctr> <dbl>
#1 1 Female NA
#2 1 Male NA
#3 2 Male 20
You could also use arrange to sort your scores within your groups, and then slice to select the first row of each group. That way, if there are only NAs in the group, you would still select the first row:
mydat2 %>%
group_by(ID, Gender) %>%
arrange(ID,Gender,Score) %>%
slice(1)
ID Gender Score
<dbl> <fctr> <dbl>
1 1 Female NA
2 1 Male NA
3 2 Male 20
Here is another option with which and pmin
# Keep, per (ID, Gender) group, the row with the smallest non-missing Score;
# groups whose Scores are all NA keep their first row.
mydat2 %>%
  group_by(ID, Gender) %>%
  # which(...)[1] is the position of the group's minimum, or NA when every
  # Score in the group is NA (min() then returns Inf with a warning).
  # pmax(1, idx, na.rm = TRUE) keeps that position and falls back to row 1
  # for all-NA groups. pmin(1, idx) would always return 1 and silently pick
  # the first row even when the minimum sits further down.
  slice(pmax(1, which(Score == min(Score, na.rm = TRUE))[1], na.rm = TRUE))
# A tibble: 3 x 3
# Groups: ID, Gender [3]
# ID Gender Score
# <dbl> <fctr> <dbl>
#1 1 Female NA
#2 1 Male NA
#3 2 Male 20
A solution using data.table
library(data.table)
setDT(mydat2)
mydat2[, .(Score = sort(Score)[1]), by = .(ID, Gender)]
# ID Gender Score
# 1: 1 Male NA
# 2: 1 Female NA
# 3: 2 Male 20