Number of shared components in R dataframe - r

I have a dataframe as follows:
deput(data)
structure(list(genome = c("A", "A", "A", "A", "B", "B", "B",
"B"), gene = c("esaA", "esaB", "esaC", "esaC", "essA", "essB",
"essC", "esaA")), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L), spec = structure(list(cols = list(
genome = structure(list(), class = c("collector_character",
"collector")), gene = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
I am interested in the number of items which are shared between each genome, e.g here between A and B it is 1. I can do this (very very slowly) in a loop:
out=NULL
for(i in guuids){
print(i)
m<-filter(data,genome==i)
g<-unique(m$gene)
for(z in guuids){
print(z)
p<-filter(data,genome==z)
p<-filter(p,gene %in% g)
p<-nrow(p)
out=rbind(out,data.frame(i,z,p))
}
}
but this obviously does not scale and there is surely a better way?

table can count them for you:
table(df)
gene
genome esaA esaB esaC essA essB essC
A 1 1 2 0 0 0
B 1 0 0 1 1 1
table(df[,c('gene','genome')])
genome
gene A B
esaA 1 1
esaB 1 0
esaC 2 0
essA 0 1
essB 0 1
essC 0 1
The returned object is a matrix, so you can simply test for which rows/matrices have more than one non-zero entry.

You can do this based on regex:
First you define all unique genes in group A as an alternation pattern; you do the same for B. Then you subset df on those rows for which grepl finds the same gene matches:
gene_A <- paste0("(", paste0(unique(df$gene[df$genome=="A"]), collapse = "|"), ")")
gene_B <- paste0("(", paste0(unique(df$gene[df$genome=="B"]), collapse = "|"), ")")
df[grepl(gene_B, df$gene) == grepl(gene_A, df$gene),]
# A tibble: 2 x 2
genome gene
<chr> <chr>
1 A esaA
2 B esaA
Data:
df <- structure(list(genome = c("A", "A", "A", "A", "B", "B", "B",
"B"), gene = c("esaA", "esaB", "esaC", "esaC", "essA", "essB",
"essC", "esaA")), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L), spec = structure(list(cols = list(
genome = structure(list(), class = c("collector_character",
"collector")), gene = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

Perhaps you can try intersect + unstack
> do.call(intersect, unname(unstack(rev(data))))
[1] "esaA"

Related

Start and end of a period in a data frame

I'd like to return the value of the start and end of a data frame based on the data it contains. If there are only zeros than I would like to fill in the start and end column with NA.
Data structure:
Output:
Sample data:
structure(list(ID = c(1, 2, 3), A1 = c(1, 1,0), A2 = c(1, 1,0), A3 = c(0,
1,0), A4 = c(0, 1,0), A5 = c(0, 1,0)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -3L), spec = structure(list(
cols = list(ID = structure(list(), class = c("collector_double",
"collector")), A1 = structure(list(), class = c("collector_double",
"collector")), A2 = structure(list(), class = c("collector_double",
"collector")), A3 = structure(list(), class = c("collector_double",
"collector")), A4 = structure(list(), class = c("collector_double",
"collector")), A5 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Sample code (does not work on O rows):
start <- names(df1)[-1][max.col(df1[-1], "first")]
end <- names(df1)[-1][max.col(df1[-1], "last")]
data.frame(ID = df1$ID, start, end)
Does this work:
library(dplyr)
library(tidyr)
library(stringr)
df %>% pivot_longer(-ID) %>% group_by(ID) %>%
mutate(s = cumsum(value)) %>% mutate(s = na_if(s,0)) %>%
transmute(start = str_c('A',min(s)), end = str_c('A',max(s))) %>% distinct()
# A tibble: 3 x 3
# Groups: ID [3]
ID start end
<dbl> <chr> <chr>
1 1 A1 A2
2 2 A1 A5
3 3 NA NA
Using the base functions and a for-loop, you can loop through all the rows and note the lowest and highest column that contains 1. It would, however, not note any breaks in the streak. If your streak of 1 is interrupted by a 0, this would not show in the result.
id = c()
start = c()
end = c()
for(i in 1:dim(df)[1]){
id = c(id,df$ID[i])
row = df[i,-1]
start = c(start,names(row)[min((1:length(row))[row==1])])
end = c(end,names(row)[max((1:length(row))[row==1])])
}
out = data.frame(ID=id,
start=start,
end=end)
The output is:
> out
ID start end
1 1 A1 A2
2 2 A1 A5
3 3 <NA> <NA>
library(tidyverse)
df1 %>% group_by(ID) %>% #rowwise() %>%
summarise(start = list(names(cur_data())[as.logical(cur_data())]),
end = unlist(map(start, ~last(.x))),
start = unlist(map(start, ~first(.x))),
.groups = 'drop')
#> # A tibble: 3 x 3
#> ID start end
#> <dbl> <chr> <chr>
#> 1 1 A1 A2
#> 2 2 A1 A5
#> 3 3 <NA> <NA>
Created on 2021-06-16 by the reprex package (v2.0.0)
Below a little program which might help. However, this will just work if you are sure that there are no zeros within a row of ones. Your example data and sample code suggest that.
#your data
df1 <- structure(list(ID = c(1, 2, 3), A1 = c(1, 1,0), A2 = c(1, 1,0), A3 = c(0, 1,0), A4 = c(0, 1,0), A5 = c(0, 1,0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L), spec = structure(list(cols = list(ID = structure(list(), class = c("collector_double", "collector")), A1 = structure(list(), class = c("collector_double", "collector")), A2 = structure(list(), class = c("collector_double", "collector")), A3 = structure(list(), class = c("collector_double", "collector")), A4 = structure(list(), class = c("collector_double", "collector")), A5 = structure(list(), class = c("collector_double", "collector"))), default = structure(list(), class = c("collector_guess", "collector")), skip = 1L), class = "col_spec"))
#use the library data.table
library(data.table)
df1 <- data.table(din)
#make a sum of by ID (by row)
df1[,sumUSE:=sum(.SD), by=ID]
#last
df1[,end:=names(df1)[(df1[,sumUSE]+1)]]
df1[end=="ID", end:=NA]
#first
df1[,start:=names(df1)[2]]
df1[is.na(end), start:=NA]
print(df1)
# ID A1 A2 A3 A4 A5 sumUSE end start
#1: 1 1 1 0 0 0 2 A2 A1
#2: 2 1 1 1 1 1 5 A5 A1
#3: 3 0 0 0 0 0 0 <NA> <NA>

Merge 2 data frame with respect to columns

I have 2 dataframes as shown. Can we merge with rep
df1
a b c
X a 2
X b 4
X c 1
Y a 2
Y b 1
df2
a1 c1
X 12
Y 10
Expected output (Because X and Y are top level values. Under X , we have a, b and c. Under Y, we have a and b. So we need to place them above these values.
Also, in another dataframe df2, we have values for both X and Y that need to populated into dataframe df1. Is this possible to acheive?
a b c
X 12
X a 2
X b 4
X c 1
Y 10
Y a 2
Y b 1
You could use dplyr:
library(dplyr)
df2 %>%
transmute(a = a1, b = a1, c = c1, prio = 1) %>%
bind_rows(df1 %>% mutate(prio = 2)) %>%
arrange(a, prio, b) %>%
mutate(a = ifelse(prio == 1, NA_character_, a)) %>%
select(-prio)
returns
# A tibble: 7 x 3
a b c
<chr> <chr> <dbl>
1 NA X 12
2 X a 2
3 X b 4
4 X c 1
5 NA Y 10
6 Y a 2
7 Y b 1
If you prefer an empty string over NA, just replace NA_character_ with "".
Data
df1 <- structure(list(a = c("X", "X", "X", "Y", "Y"), b = c("a", "b",
"c", "a", "b"), c = c(2, 4, 1, 2, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), spec = structure(list(
cols = list(a = structure(list(), class = c("collector_character",
"collector")), b = structure(list(), class = c("collector_character",
"collector")), c = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2L), class = "col_spec"))
df2 <- structure(list(a1 = c("X", "Y"), c1 = c(12, 10)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(a1 = structure(list(), class = c("collector_character",
"collector")), c1 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Convert all coumns ending in 'ID' to character in tidyverse

I have numerous dataframes with many columns where the name of the column ends in "ID". What's the simplest way to change the type of every column ending in "ID". Ideally I'd pass the imported dataframe to a function which would return the same dataframe but with the column types changed. I definitely can't hardcode the column names as I will not know in advance what the columns are.
Here's some sample data:
test_data <- structure(list(ContactID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
), SystemID = c(3, 1, 5, 4, 3, 5, 35, 1, 55, 52, 9), Value1 = c("A",
"B", "C", "D", "E", "F", "E", "G", "D", "S", "C"), Value2 = c("1/01/2020",
"2/01/2020", "3/01/2020", "4/01/2020", "5/01/2020", "6/01/2020",
"7/01/2020", "8/01/2020", "9/01/2020", "10/01/2020", "11/01/2020"
), OtherID = c(10004, 10009, 10002, 10007, 10099, 10010, 10002,
10004, 10002, 10007, 10099)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(
cols = list(ContactID = structure(list(), class = c("collector_double",
"collector")), SystemID = structure(list(), class = c("collector_double",
"collector")), Value1 = structure(list(), class = c("collector_character",
"collector")), Value2 = structure(list(), class = c("collector_character",
"collector")), OtherID = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Columns ContactID, SystemID and OtherID have been imported from a CSV file (using read_csv from readr) and so have been designated numeric. I want a function where I can pass this (or any other dataframe) to change any columns ending in ID to character.
I've tried this but it seems very clumsy. Looking for a neater solution.
change_ID_cols <- function(x) {
id_cols <- grep("ID$", colnames(x))
for (i in id_cols) {
for (j in 1:nrow(x)) {
x[j,i] <- as.character(x[j,i])
}
}
x
}
Does this work:
library(dplyr)
test_data %>% mutate(across(ends_with('ID'), as.character))
# A tibble: 11 x 5
ContactID SystemID Value1 Value2 OtherID
<chr> <chr> <chr> <chr> <chr>
1 1 3 A 1/01/2020 10004
2 2 1 B 2/01/2020 10009
3 3 5 C 3/01/2020 10002
4 4 4 D 4/01/2020 10007
5 5 3 E 5/01/2020 10099
6 6 5 F 6/01/2020 10010
7 7 35 E 7/01/2020 10002
8 8 1 G 8/01/2020 10004
9 9 55 D 9/01/2020 10002
10 10 52 S 10/01/2020 10007
11 11 9 C 11/01/2020 10099
>
You don't have to change each value individually to character. You can turn the complete column into character at once. To do this for multiple columns use lapply.
change_ID_cols <- function(x) {
id_cols <- grep("ID$", colnames(x))
x[id_cols] <- lapply(x[id_cols], as.character)
x
}
An option with data.table would be
library(data.table)
nm <- grep('ID$', names(test_data), value = TRUE)
setDT(test_data)[, (nm) := lapply(.SD, as.character), .SDcols = nm]

counts of combinations of values in a dataframe R

I have a dataframe like so:
df<-structure(list(id = c("A", "A", "A", "B", "B", "C", "C", "D",
"D", "E", "E"), expertise = c("r", "python", "julia", "python",
"r", "python", "julia", "python", "julia", "r", "julia")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L), .Names = c("id",
"expertise"), spec = structure(list(cols = structure(list(id = structure(list(), class = c("collector_character",
"collector")), expertise = structure(list(), class = c("collector_character",
"collector"))), .Names = c("id", "expertise")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
df
id expertise
1 A r
2 A python
3 A julia
4 B python
5 B r
6 C python
7 C julia
8 D python
9 D julia
10 E r
11 E julia
I can get the overall counts of "expertise" by using:
library(dplyr)
df %>% group_by(expertise) %>% mutate (counts_overall= n())
However what I want is the counts for combinations of expertise values. In other words how many "id" had the same combination of two expertise e.g. "r" and"julia"?
Here is a desired output:
df_out<-structure(list(expertise1 = c("r", "r", "python"), expertise2 = c("python",
"julia", "julia"), count = c(2L, 2L, 3L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -3L), .Names = c("expertise1",
"expertise2", "count"), spec = structure(list(cols = structure(list(
expertise1 = structure(list(), class = c("collector_character",
"collector")), expertise2 = structure(list(), class = c("collector_character",
"collector")), count = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("expertise1", "expertise2", "count"
)), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
df_out
expertise1 expertise2 count
1 r python 2
2 r julia 2
3 python julia 3
The linked answer from latemail's comment creates a matrix
crossprod(table(df) > 0)
expertise
expertise julia python r
julia 4 3 2
python 3 4 2
r 2 2 3
while the OP expects a dataframe in long format.
1) cross join
Below is a data.table solution which uses the CJ() (cross join) function:
library(data.table)
setDT(df)[, CJ(expertise, expertise)[V1 < V2], by = id][
, .N, by = .(expertise1 = V1, expertise2 = V2)]
expertise1 expertise2 N
1: julia python 3
2: julia r 2
3: python r 2
CJ(expertise, expertise)[V1 < V2] is the data.table equivalent for t(combn(df$expertise, 2)) or combinat::combn2(df$expertise).
2) self-join
Here is another variant which uses a self-join:
library(data.table)
setDT(df)[df, on = "id", allow = TRUE][
expertise < i.expertise, .N, by = .(expertise1 = expertise, expertise2 = i.expertise)]
expertise1 expertise2 N
1: python r 2
2: julia r 2
3: julia python 3
A solution not as efficient as crossprod-table approach but easy to understand:
library(tidyr)
df %>% group_by(id) %>%
summarize(expertise = list(combn(sort(expertise), 2, FUN = paste, collapse = '_'))) %>%
unnest(expertise) %>%
group_by(expertise) %>%
summarize(count = n()) %>%
separate(expertise, c('expertise1', 'expertise2'), sep = '_')
# # A tibble: 3 x 3
# expertise1 expertise2 count
# <chr> <chr> <int>
# 1 julia python 3
# 2 julia r 2
# 3 python r 2

Copying a cell down in R

I need to transform this table:
1 a b 0.689723476 0.149916917
a b 0.200907662 0.109557062
a b 0.684007597 0.703492299
a b 0.437375902 0.074223984
a b 0.090612241 0.146617232
a b 0.526821187 0.895595247
2 a b 0.507794544 0.731096076
a b 0.688987918 0.405801748
a b 0.462341505 0.203533346
a b 0.750096552 0.889744763
a b 0.555074241 0.06610472
into this:
1 a b 0.689723476 0.149916917
1 a b 0.200907662 0.109557062
1 a b 0.684007597 0.703492299
1 a b 0.437375902 0.074223984
1 a b 0.090612241 0.146617232
1 a b 0.526821187 0.895595247
2 a b 0.507794544 0.731096076
2 a b 0.688987918 0.405801748
2 a b 0.462341505 0.203533346
2 a b 0.750096552 0.889744763
2 a b 0.555074241 0.06610472
Basically, the I need to repeat the first row downwards until a new number appears, where that number will repeat. I normally do this in Alteryx using a tool called Multi-Row Formula where I do a quick if statement that is like this:
if IsNull([Row0:Column1]) then [Row-1:NewColumn] else [Row0:Column1 endif
This will create a new column that will the take the value in the above cell if it is null, else it will copy the new value.
Is there anything similar in R or any other solutions to this problem?
UPDATE
Including the dput data:
structure(list(X1 = c(1L, NA, NA, NA, NA, NA, 2L, NA, NA, NA,
NA, NA, NA, NA, NA), X2 = c("a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a"), X3 = c("b", "b", "b",
"b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b"),
X4 = c(0.057520727, 0.54421869, 0.695381681, 0.866708518,
0.764246757, 0.622363342, 0.023082188, 0.140441859, 0.404509135,
0.806008319, 0.649705949, 0.607341742, 0.275179259, 0.199698605,
0.420250037), X5 = c(0.826617034, 0.338437818, 0.069566611,
0.95791903, 0.900005669, 0.533003641, 0.075841125, 0.200099759,
0.858293828, 0.271342591, 0.308531235, 0.344739272, 0.259006154,
0.824994839, 0.610793113)), .Names = c("X1", "X2", "X3",
"X4", "X5"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-15L), spec = structure(list(cols = structure(list(X1 = structure(list(), class = c("collector_integer",
"collector")), X2 = structure(list(), class = c("collector_character",
"collector")), X3 = structure(list(), class = c("collector_character",
"collector")), X4 = structure(list(), class = c("collector_double",
"collector")), X5 = structure(list(), class = c("collector_double",
"collector"))), .Names = c("X1", "X2", "X3", "X4", "X5")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
We can use the fill function from the tidyr package. dt is your original data frame. dt2 is the final output.
library(tidyr)
dt2 <- fill(dt, X1)

Resources