Subset tibble based on column sums, while retaining character columns - r

I have a feeling this is a pretty stupid issue, but I haven't been able to find the solution either
I have a tibble where each row is a sample and the first column is a character variable containing the sample ID and all subsequent columns are variables with numeric variables.
For example:
id <- c("a", "b", "c", "d", "e")
x1 <- rep(1,5)
x2 <- seq(1,5,1)
x3 <- rep(2,5)
x4 <- seq(0.1, 0.5, 0.1)
tb <- tibble(id, x1, x2, x3, x4)
I want to subset this to include only the columns with a sum greater than 5, and the id column. With the old dataframe structure, I know the following worked:
df <- as.data.frame(tb)
df2 <- cbind(df$id, df[,colSums(df[,2:5])>5)
colnames(df2)[1] <- "id"
However, when I try to subset this way with a tibble, I get the error message:
Error: Length of logical index vector must be 1 or 5, got: 4
Does anyone know how to accomplish this task without converting to the old data frame format? Preferably without creating an intermediate tibble with the id variable missing, because separating my ids from my data is just asking for trouble down the road.
Thanks!

# install.packages(c("tidyverse"), dependencies = TRUE)
library(tibble)
df <- tibble(id = letters[1:5], x1 = 1, x2 = 1:5, x3 = 2, x4 = seq(.1, .5, len = 5))
### two additional examples of how to generate the Tibble data
### exploiting that its arguments are evaluated lazily and sequentially
# df <- tibble(id = letters[1:5], x1 = 1, x2 = 1:5, x3 = x1 + 1, x4 = x2/10)
# df <- tibble(x2 = 1:5, id = letters[x2], x3 = 2, x1 = x3-1, x4 = x2/10) %>%
# select(id, num_range("x", 1:4))
base R solution, cf. HubertL's comment above,
### HubertL's base solution
df[c(TRUE,colSums(df[2:5])>5)]
#> # A tibble: 5 x 3
#> id x2 x3
#> <chr> <int> <dbl>
#> 1 a 1 2
#> 2 b 2 2
#> 3 c 3 2
#> 4 d 4 2
#> 5 e 5 2
dplyr solution, cf David Klotz's comment,
### Klotz's dplyr solution
library(dplyr)
df %>% select_if(function(x) is.character(x) || sum(x) > 5)
#> # A tibble: 5 x 3
#> id x2 x3
#> <chr> <int> <dbl>
#> 1 a 1 2
#> 2 b 2 2
#> 3 c 3 2
#> 4 d 4 2
#> 5 e 5 2

Related

Substitute specific rows based on another dataframe in R

Lets say I have:
# Create a, b, c, d variables
x1 <- c("g", "a","c","d","e","f","h", "b")
x2 <- c(1,1,1,1,1,1,1,1)
x7 <- c(10,10,10,10,10,10, 10, 10)
# Join the variables to create a data frame
dataframeA <- data.frame(x1, x2, x7)
# Create a, b, c, d variables
x3 <- c("z", "k" ,"a", "b","c","d")
x4 <- c(5, 19, 6,7,8,9)
# Join the variables to create a data frame
dataframeB <- data.frame(x3, x4)
And I want to substitute values on column x2 of dataframe A with values of column x4 on dataframe b based on matching of a vector, such as dataframe A be:
matchingids = c("a", "b")
dataframeA$x2[which(dataframeA$x1 %in% matchingids)] <- dataframeB$x4[which(dataframeB$x3 %in% matchingids)]
dataframeA turns to:
structure(list(x1 = c("g", "a", "c", "d", "e", "f", "h", "b"),
x2 = c(1, 6, 1, 1, 1, 1, 1, 7), x7 = c(10, 10, 10, 10, 10,
10, 10, 10)), row.names = c(NA, -8L), class = "data.frame")
Which works, but then,
# Create a, b, c, d variables
x1 <- c("g", "a","c","d","e","f","h", "b")
x2 <- c(1,1,1,1,1,1,1,1)
x7 <- c(10,10,10,10,10,10, 10, 10)
# Join the variables to create a data frame
dataframeA <- data.frame(x1, x2, x7)
(here i changed "b" and "a" order
# Create a, b, c, d variables
x3 <- c("z", "k" ,"b", "a","c","d")
x4 <- c(5, 19, 6,7,8,9)
# Join the variables to create a data frame
dataframeB <- data.frame(x3, x4)
matchingids = c("a", "b")
dataframeA$x2[which(dataframeA$x1 %in% matchingids)] <- dataframeB$x4[which(dataframeB$x3 %in% matchingids)]
which gives:
structure(list(x1 = c("g", "a", "c", "d", "e", "f", "h", "b"),
x2 = c(1, 6, 1, 1, 1, 1, 1, 7), x7 = c(10, 10, 10, 10, 10,
10, 10, 10)), row.names = c(NA, -8L), class = "data.frame")
Which does not work, because it is substituing a on first dataframe to b in the second dataframe (order of the objects is wrong)
In the second case, a is being change to b in the first dataframe (value should be a = 7, b = 6)
As you can observe, i get the same result even if I change the position of "a" in dataframeB
This seems like a merge/join operation.
### base R
merge(dataframeA, subset(dataframeB, x3 %in% matchingids),
by.x="x1", by.y="x3", all.x=TRUE) |>
transform(x2 = ifelse(is.na(x4), x2, x4)) |>
subset(select = -x4)
# x1 x2 x7
# 1 a 6 10
# 2 b 7 10
# 3 g 1 10
# 4 c 1 10
# 5 d 1 10
# 6 e 1 10
# 7 f 1 10
# 8 h 1 10
### dplyr
library(dplyr)
filter(dataframeB, x3 %in% matchingids) %>%
right_join(dataframeA, by = c("x3"="x1")) %>%
mutate(x2 = coalesce(x4, x2)) %>%
select(-x4)
# x3 x2 x7
# 1 a 6 10
# 2 b 7 10
# 3 g 1 10
# 4 c 1 10
# 5 d 1 10
# 6 e 1 10
# 7 f 1 10
# 8 h 1 10
(FYI, base::merge doesn't do a good job preserving the original order. If it is very important, I suggest you preface that code with adding a row-number field, then sorting post-merge on that field. Adding sort=FALSE to base::merge does not solve it for me.)
Similarly with the second sets of frames:
merge(dataframeA, subset(dataframeB, x3 %in% matchingids),
by.x="x1", by.y="x3", all.x=TRUE) |>
transform(x2 = ifelse(is.na(x4), x2, x4)) |>
subset(select = -x4)
# x1 x2 x7
# 1 a 7 10
# 2 b 6 10
# 3 g 1 10
# 4 c 1 10
# 5 d 1 10
# 6 e 1 10
# 7 f 1 10
# 8 h 1 10
filter(dataframeB, x3 %in% matchingids) %>%
right_join(dataframeA, by = c("x3"="x1")) %>%
mutate(x2 = coalesce(x4, x2)) %>%
select(-x4)
# x3 x2 x7
# 1 b 6 10
# 2 a 7 10
# 3 g 1 10
# 4 c 1 10
# 5 d 1 10
# 6 e 1 10
# 7 f 1 10
# 8 h 1 10
Note: the |> is in R-4 and later. If you're on an earlier version, you'll need to shift to use intermediate objects.
For more discussions about the concepts of merge/join, see: How to join (merge) data frames (inner, outer, left, right), What's the difference between INNER JOIN, LEFT JOIN, RIGHT JOIN and FULL JOIN?, (pandas) Pandas Merging 101. It's a very powerful process and can pay huge dividends once you become more comfortable with using it.
It works with:
a$x2[order(a$x1)][which(a$x1[order(a$x1)] %in% matchingids)] <- b$x4[order(b$x3)][which(b$x3[order(b$x3)] %in% matchingids)]
But there might be plobems with it, mainly when
matchingids
have IDs that don't match dataframeA or dataframeB, or neither. If the number of IDs are different from dataframe to dataframe, it will also not work
Might only work when dataframeA and dataframeB contains all
matchingids

Find unique entries in otherwise identical rows

I am currently trying to find a way to find unique column values in otherwise duplicate rows in a dataset.
My dataset has the following properties:
The dataset's columns comprise an identifier variable (ID) and a large number of response variables (x1 - xn).
Each row should represent one individual, meaning the values in the ID column should all be unique (and not repeated).
Some rows are duplicated, with repeated entries in the ID column and seemingly identical response item values (x1 - xn). However, the dataset is too large to get a full overview over all variables.
As demonstrated in the code below, if rows are truly identical for all variables, then the duplicate row can be removed with the dplyr::distinct() function. In my case, not all "duplicate" rows are removed by distinct(), which can only mean that not all entries are identical.
I want to find a way to identify which entries are unique in these otherwise duplicate rows.
Example:
library(dplyr)
library(janitor)
df <- data.frame(
"ID" = rep(1:3, each = 2),
"x1" = rep(4:6, each = 2),
"x2" = c("a", "a", "b", "b", "c", "d"),
"x3" = c(7, 10, 8, 8, 9, 11),
"x4" = rep(letters[4:6], each = 2),
"x5" = c("x", "p", "y", "y", "z", "q"),
"x6" = rep(letters[7:9], each = 2)
)
# The dataframe with all entries
df
A data.frame: 6 × 7
ID x1 x2 x3 x4 x5 x6
1 4 a 7 d x g
1 4 a 10 d p g
2 5 b 8 e y h
2 5 b 8 e y h
3 6 c 9 f z i
3 6 d 11 f q i
# The dataframe
df %>%
# with duplicates removed
distinct() %>%
# filtered for columns only containing duplicates in the ID column
janitor::get_dupes(ID)
ID dupe_count x1 x2 x3 x4 x5 x6
1 2 4 a 7 d x g
1 2 4 a 10 d p g
3 2 6 c 9 f z i
3 2 6 d 11 f q i
In the example above I demonstrate how dplyr::distinct() will remove fully duplicate rows (ID = 2), but not rows that are different in some columns (rows where ID = 1 and 3, and columns x2, x3 and x5).
What I want is an overview over which columns that are not duplicates for each value:
df %>%
distinct() %>%
janitor::get_dupes(ID) %>%
# Here I want a way to find columns with unidentical entries:
find_nomatch()
ID x2 x3 x5
1 7 x
1 10 p
3 c 9 z
3 d 11 q
A data.table alternative. Coerce data frame to a data.table (setDT). Melt data to long format (melt(df, id.vars = "ID")).
Within each group defined by 'ID' and 'variable' (corresponding to the columns in the wide format) (by = .(ID, variable)), count number of unique values (uniqueN(value)) and check if it's equal to the number of rows in the subgroup (== .N). If so (if), select the entire subgroup (.SD).
Finally, reshape the data back to wide format (dcast).
library(data.table)
setDT(df)
d = melt(df, id.vars = "ID")
dcast(d[ , if(uniqueN(value) == .N) .SD, by = .(ID, variable)], ID + rowid(ID, variable) ~ variable)
# ID ID_1 x2 x3 x5
# 1: 1 1 <NA> 7 x
# 2: 1 2 <NA> 10 p
# 3: 3 1 c 9 z
# 4: 3 2 d 11 q
A bit more simple than yours I think:
library(dplyr)
library(janitor)
df <- data.frame(
"ID" = rep(1:3, each = 2),
"x1" = rep(4:6, each = 2),
"x2" = c("a", "a", "b", "b", "c", "d"),
"x3" = c(7, 10, 8, 8, 9, 11),
"x4" = rep(letters[4:6], each = 2),
"x5" = c("x", "p", "y", "y", "z", "q"),
"x6" = rep(letters[7:9], each = 2)
)
d <- df %>%
distinct() %>%
janitor::get_dupes(ID)
d %>%
group_by(ID) %>%
# Check for each id which row elements are different from the of the first
group_map(\(.x, .id) apply(.x, 1, \(.y) .x[1, ] != .y))%>%
do.call(what = cbind) %>% # Bind results for all ids
apply(1, any) %>% # return true if there are differences anywhere
c(T, .) %>% # Keep id column
`[`(d, .)
#> ID x2 x3 x5
#> 1 1 a 7 x
#> 2 1 a 10 p
#> 3 3 c 9 z
#> 4 3 d 11 q
Created on 2022-01-18 by the reprex package (v2.0.1)
Edit
d %>%
group_by(ID) %>%
# Check for each id which row elements are different from the of the first
group_map(\(.x, .id) apply(.x, 1, \(.y) !Vectorize(identical)(unlist(.x[1, ]), .y))) %>%
do.call(what = cbind) %>% # Bind results for all ids
apply(1, any) %>% # return true if there are differences anywhere
c(T, .) %>% # Keep id column
`[`(d, .)
#> ID x2 x3 x5
#> 1 1 a 7 x
#> 2 1 a 10 p
#> 3 3 c 9 z
#> 4 3 d 11 q
Created on 2022-01-19 by the reprex package (v2.0.1)
I have been working on this issue for some time and I found a solution, though it tooks more step than I would've though necessary. I can only presume there's a more elegant solution out there. Anyway, this should work:
df <- df %>%
distinct() %>%
janitor::get_dupes(ID)
# Make vector of unique values from the duplicated ID values
l <- distinct(df, ID) %>% unlist()
# Lapply on each ID
df <- lapply(
l,
function(x) {
# Filter rows for the duplicated ID
dplyr::filter(df, ID == x) %>%
# Transpose dataframe (converts it into a matrix)
t() %>%
# Convert back to data frame
as.data.frame() %>%
# Filter columns that are not identical
dplyr::filter(!if_all(everything(), ~ . == V1)) %>%
# Transpose back
t() %>%
# Convert back to data frame
as.data.frame()
}
) %>%
# Bind the dataframes in the list together
bind_rows() %>%
# Finally the columns are moved back in ascending order
relocate(x2, .before = x3)
#Remove row names (not necessary)
row.names(df) <- NULL
df
A data.frame: 4 × 3
x2 x3 x5
NA 7 x
NA 10 p
c 9 z
d 11 q
Feel free to comment
If you just want to keep the first instance of each identifier:
df <- data.frame(
"ID" = rep(1:3, each = 2),
"x1" = rep(4:6, each = 2),
"x2" = rep(letters[1:3], each = 2),
"x3" = c(7, 10, 8, 8, 9, 11),
"x4" = rep(letters[4:6], each = 2)
)
df %>%
distinct(ID, .keep_all = TRUE)
Output:
ID x1 x2 x3 x4
1 1 4 a 7 d
2 2 5 b 8 e
3 3 6 c 9 f

How to rearrange a tibble by rownames

Traditional dataframes support rearrangement of rows by rownames:
> df <- data.frame(c1 = letters[1:3], c2 = 1:3, row.names = paste0("x", 1:3))
> df
c1 c2
x1 a 1
x2 b 2
x3 c 3
#' If we want, say, row "x3" and "x1":
> df[c("x3", "x1"), ]
c1 c2
x3 c 3
x1 a 1
When it comes to tibble, since it drops the concept of rownames, I wonder what the standard way is to achieve similar goal.
> tb <- as_tibble(rownames_to_column(df))
> tb
# A tibble: 3 x 3
rowname c1 c2
<chr> <fct> <int>
1 x1 a 1
2 x2 b 2
3 x3 c 3
> ?
Thanks.
Update
I can come up with the following solution:
> tb[match(c("x3", "x1"), tb[["rowname"]]), ]
# A tibble: 2 x 3
rowname c1 c2
<chr> <fct> <int>
1 x3 c 3
2 x1 a 1
But it seems clumsy. Does anyone have better idea?
Update 2
In a more generalized sense, my question can be rephrased as: by the syntax of tidyverse, what is the most neat and quick equivalent to
df[c("x3", "x1"), ]
that is, subsetting and rearranging rows of a dataframe.
As joran described, you can use filter to select rows of interest and then to arrange a tibble in a specific order, manually defined, you can use arrange with factor:
tibble(rowname = paste0("x", 1:3), c1 = letters[1:3], c2 = 1:3) %>%
filter(rowname %in% c("x3", "x1")) %>%
arrange(factor(rowname, levels = c("x3", "x1")))

Binding data frames from a list with different column types

Trying to figure out a way in purrr to bind rows over different elements of lists where the column types are not consistent. For example, my data looks a little like this...
d0 <- list(
data_frame(x1 = c(1, 2), x2 = c("a", "b")),
data_frame(x1 = c("P1"), x2 = c("c"))
)
d0
# [[1]]
# # A tibble: 2 x 2
# x1 x2
# <dbl> <chr>
# 1 1 a
# 2 2 b
#
# [[2]]
# # A tibble: 1 x 2
# x1 x2
# <chr> <chr>
# 1 P1 c
I can use a for loop and then map_df with bind_rows to get the output I want (map_df will not work if the columns are of different types)...
for(i in 1:length(d0)){
d0[[i]] <- mutate_if(d0[[i]], is.numeric, as.character)
}
map_df(d0, bind_rows)
# # A tibble: 3 x 2
# x1 x2
# <chr> <chr>
# 1 1 a
# 2 2 b
# 3 P1 c
but I think I am missing a trick somewhere that would allow me to avoid the for loop. My attempts along these lines...
d0 %>%
map(mutate_if(., is.numeric, as.character)) %>%
map_df(.,bind_rows)
# Error in UseMethod("tbl_vars") :
# no applicable method for 'tbl_vars' applied to an object of class "list"
... do not seem to work (still getting my head around purrr)
You can use rbindlist() from data.table in this case
data.table::rbindlist(d0) %>%
dplyr::as_data_frame()
# A tibble: 3 x 2
x1 x2
<chr> <chr>
1 1 a
2 2 b
3 P1 c
There may be circumstances where you will want to make sure the fill argument is TRUE
Documentation reference:
If column i of input items do not all have the same type; e.g, a
data.table may be bound with a list or a column is factor while others
are character types, they are coerced to the highest type (SEXPTYPE).
How about this?
library(purrr)
map_df(lapply(d0, function(x) data.frame(lapply(x, as.character))), bind_rows)
Output is:
x1 x2
1 1 a
2 2 b
3 P1 c
Sample data:
d0 <- list(structure(list(x1 = c(1, 2), x2 = c("a", "b")), .Names = c("x1",
"x2"), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(x1 = "P1", x2 = "c"), .Names = c("x1", "x2"
), row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame"
)))
With tidyverse, the option would be
library(tidyverse)
d0 %>%
map_df(~ .x %>%
mutate_if(is.numeric, as.character))
# A tibble: 3 x 2
# x1 x2
# <chr> <chr>
#1 1 a
#2 2 b
#3 P1 c
It's a good opportunity to use purrr::modify_depth :
library(purrr)
library(dplyr)
bind_rows(modify_depth(d0,2,as.character))
# # A tibble: 3 x 2
# x1 x2
# <chr> <chr>
# 1 1 a
# 2 2 b
# 3 P1 c

writing table from a list in R

I have a SNP file and i want to count how many they in each column. while writing a table from the list it shows error as "arguments imply differing number of rows". I want a solution so that i can write the list into a table.
Please help me.
input file : image file is added
input file contain 830 row and 210 column
#1 R code
require(gdata)
library(plyr)
df = read.xls ("jTest_file.xlsx", sheet = 1, header = TRUE)
combine = c()
for(i in 1:v){
vec = count(df[,i])
colnames(vec) <- c (colnames(df[i]),"freq")
combine = c(combine,vec)
}
write.table(combine,file="test_output.xls",sep="\t",quote=FALSE,row.names =FALSE)
but there are some blank values in the input so i substitued the blank with XX so that the row number can be maintain but it does not worked.
#2 R code
require(gdata)
library(plyr)
df = read.xls ("jTest_file.xlsx", sheet = 1, header = TRUE)
combine = c()
for(i in 1:v){
data=sub("^$", "XX", df[,i])
vec = count(data)
colnames(vec) <- c (colnames(df[i]),"freq")
combine = c(combine,vec)
}
write.table(combine,file="test_output.xls",sep="\t",quote=FALSE,row.names =FALSE)
There is a much cleaner way to do these counts using the dplyr and tidyr packages.
Since you did not provide sample data, I will make some first:
#Make sample data
li = lapply(1:10, function(X) {
sample(x = c("A", "C", "G", "T"), size = 10,
replace = TRUE)
})
df = data.frame(li, stringsAsFactors = FALSE)
names(df) = paste("X", 1:10, sep = "")
head(df, 3)
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 T G C T C A T T C T
# 2 A A A G G G T G C A
# 3 C C A T A A C A T G
Now the actual answer - doing the counts:
library(tidyr)
library(dplyr)
df_long = gather(df, var, value)
df_groups = group_by(df_long, var, value)
df_counts = summarise(df_groups, count = n())
df_wide = spread(df_counts, value, count, fill = 0)
df_wide
# Source: local data frame [10 x 5]
# Groups: var [10]
#
# var A C G T
# * <chr> <dbl> <dbl> <dbl> <dbl>
# 1 X1 3 4 0 3
# 2 X10 5 0 2 3
# 3 X2 3 2 2 3
# 4 X3 4 3 1 2
# 5 X4 2 1 4 3
# 6 X5 2 3 3 2
# 7 X6 4 2 1 3
# 8 X7 2 4 2 2
# 9 X8 2 3 2 3
# 10 X9 2 2 2 4
I encourage you to explore individual steps (df_long, df_groups, df_counts, df_wide). This will give you a sense of what is going on with the data.

Resources