R Merge Tabulations of data from different files - r

I am using R to analyze multiple experiments in which the results are stored in multiple CSV files. I run table() to tabulate the data and get results like the following:
Tabulations of Combination1.csv
A 1000
B 50
C 200
Tabulations of Combination2.csv
A 25
B 1500
D 30
Tabulations of Combination3.csv
B 19
C 500
E 2000
I want to build a table that combines these tabulations.
Combination A B C D E
c1 1000 50 200 N/A N/A
c2 25 1500 N/A 30 N/A
c3 N/A 19 500 N/A 2000

Here's how I would do it using tidyr and plyr:
Data
c1 <- rep(LETTERS[1:3], c(1000, 50, 200))
c2 <- rep(LETTERS[c(1:2, 4)], c(25, 1500, 30))
c3 <- rep(LETTERS[c(2:3, 5)], c(19, 500, 2000))
Code
library(tidyr)
library(plyr)
# Collect the per-file vectors in a named list; the names become the
# Combination labels.
allC <- list(c1 = c1, c2 = c2, c3 = c3)
# Tabulate each vector into a long data.frame (one row per combination and
# element), stack the pieces with ldply(), then widen to one column per
# element. Elements absent from a combination come out as NA.
spread(
  ldply(names(allC), function(nm) {
    counts <- table(allC[[nm]])
    data.frame(Combination = nm, element = names(counts), Freq = c(counts))
  }),
  element,
  Freq
)
# Combination A B C D E
# 1 c1 1000 50 200 NA NA
# 2 c2 25 1500 NA 30 NA
# 3 c3 NA 19 500 NA 2000
Explanation
You transform all your tables to a data.frame first, where you append the name of the respective element. Then you use spread to spread out the values.

library(dplyr)
library(tidyr)
x <- table(c(rep("A", 1000), rep("B", 50), rep("C", 200)))
y <- table(c(rep("A", 25), rep("B", 1500), rep("D", 30)))
z <- table(c(rep("B", 19), rep("C", 500), rep("E", 2000)))
X <- data.frame(x) %>% spread(Var1, Freq)
Y <- data.frame(y) %>% spread(Var1, Freq)
Z <- data.frame(z) %>% spread(Var1, Freq)
X %>% full_join(Y) %>% full_join(Z) %>%
mutate(Combination = paste0("c", seq(1,3)))
Result:
> X %>% full_join(Y) %>% full_join(Z) %>%
+ mutate(Combination = paste0("c", seq(1,3)))
Joining, by = c("A", "B")
Joining, by = c("B", "C")
A B C D E Combination
1 1000 50 200 NA NA c1
2 25 1500 NA 30 NA c2
3 NA 19 500 NA 2000 c3
Next time, please remember to provide the x, y and z objects so that the example is reproducible :)

Related

R using combn with apply

I have a data frame that has percentage values for a number of variables and observations, as follows:
obs <- data.frame(Site = c("A", "B", "C"), X = c(11, 22, 33), Y = c(44, 55, 66), Z = c(77, 88, 99))
I need to prepare this data as an edge list for network analysis, with "Site" as the nodes and the remaining variables as the edges. The result should look like this:
Node1 Node2 Weight Type
A B 33 X
A C 44 X
...
B C 187 Z
So that for "Weight" we are calculating the sum of all possible pairs, and this separately for each column (which ends up in "Type").
I suppose the answer to this has to be using apply on a combn expression, like here Applying combn() function to data frame, but I haven't quite been able to work it out.
I can do this all by hand taking the combinations for "Site"
sites <- combn(obs$Site, 2)
Then the individual columns like so
# Pairwise sums of one measurement column (obs has columns X, Y and Z).
combA <- combn(obs$X, 2, sum)
and binding those datasets together, but this obviously becomes annoying very soon.
I have tried to do all the variable columns in one go like this
b <- apply(newdf[, -1], 1, function(x){
sum(utils::combn(x, 2))
}
)
but there is something wrong with that.
Can anyone help, please?
One option would be to create a function and then map that function to all the columns that you have.
# Build the edge list for one measurement column of the global `obs`.
#
# var: name (string) of the column in `obs` whose values are summed pairwise.
# Returns a data.frame with one row per unordered pair of sites: Node1,
# Node2, Weight (sum of the two sites' values) and Type (`var`).
func1 <- function(var) {
  # combn() needs at least two items, and it reinterprets a single positive
  # number as 1:n, so return an empty edge list early.
  if (nrow(obs) < 2) {
    return(data.frame(
      Node1 = character(0),
      Node2 = character(0),
      Weight = numeric(0),
      Type = character(0),
      stringsAsFactors = FALSE
    ))
  }
  site_pairs <- combn(obs$Site, 2)
  # Build a fresh data frame instead of transmute()-ing `obs`: there are
  # choose(n, 2) pairs, which equals nrow(obs) only when n == 3.
  data.frame(
    Node1 = site_pairs[1, ],
    Node2 = site_pairs[2, ],
    Weight = combn(obs[[var]], 2, FUN = sum),
    Type = var,
    stringsAsFactors = FALSE
  )
}
map(colnames(obs)[-1], func1) %>% bind_rows()
Here is an example using combn
# For every pair of rows in `obs`, put the two site labels next to the
# column-wise sums of that pair (one output row per measurement column),
# then stack all pairs into a single data frame.
do.call(
  rbind,
  lapply(
    # seq_len() rather than 1:nrow(obs), which yields c(1, 0) on an empty frame
    combn(seq_len(nrow(obs)), 2, simplify = FALSE),
    function(k) {
      labels <- data.frame(t(obs[k, 1]))
      totals <- stack(data.frame(as.list(colSums(obs[k, -1]))))
      cbind(labels, totals)
    }
  )
)
which gives
X1 X2 values ind
1 A B 33 X
2 A B 99 Y
3 A B 165 Z
4 A C 44 X
5 A C 110 Y
6 A C 176 Z
7 B C 55 X
8 B C 121 Y
9 B C 187 Z
try it this way
library(tidyverse)
obs_long <- obs %>% pivot_longer(-Site, names_to = "type")
sites <- combn(obs$Site, 2) %>% t() %>% as_tibble()
Type <- tibble(type = c("X", "Y", "Z"))
merge(sites, Type) %>%
left_join(obs_long, by = c("V1" = "Site", "type" = "type")) %>%
left_join(obs_long, by = c("V2" = "Site", "type" = "type")) %>%
mutate(res = value.x + value.y) %>%
select(-c(value.x, value.y))
V1 V2 type res
1 A B X 33
2 A C X 44
3 B C X 55
4 A B Y 99
5 A C Y 110
6 B C Y 121
7 A B Z 165
8 A C Z 176
9 B C Z 187

How to multiply by a constant and then sum across rows

I am having an issue multiplying 3 columns by 3 different constants (i.e, 2,3,4, respectively) and then summing each row after applying the conversion.
I am using dplyr
variable <- df %>% transmute(df, sum(col1, col2*2, col3*3, col4*4))
We could do
library(dplyr)
df %>%
mutate(a = a * 2,
b = b * 3,
c = c * 4,
total = a + b + c)
# a b c total
#1 2 18 44 64
#2 4 21 48 73
#3 6 24 52 82
#4 8 27 56 91
#5 10 30 60 100
Using rowSums
df %>%
mutate(a = a * 2,
b = b * 3,
c = c * 4) %>%
mutate(total = rowSums(.))
It is important to note that when using rowSums(.), it must go in a separate mutate() call rather than the same one; otherwise it would sum the original df and not the changed one.
Or in base R
df1 <- transform(df, a = a*2, b = b * 3, c = c *4)
df1$total <- rowSums(df1)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)
In base R, we can do this more compactly with %*%
df$total <- c(as.matrix(df) %*% 2:4)
df
# a b c total
#1 1 6 11 64
#2 2 7 12 73
#3 3 8 13 82
#4 4 9 14 91
#5 5 10 15 100
Or with crossprod
df$total <- c(crossprod(t(df), 2:4))
--
Or with tidyverse
library(tidyverse)
map2(df, 2:4, ~ .x * .y) %>%
reduce(`+`) %>%
bind_cols(df, total = .)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)
variable <- df %>%
rowwise() %>%
mutate(new_var = sum(col1, col2*2, col3*3, col4*4))
Try that instead.
add rowwise() to have data analyzed at each row
use mutate() to get the new calculation

Merge two database based on values between other values

I would like to use a category from one data frame and apply it to another based on a similar column (merge). But, the merge needs to consider a range of data points that are found between two columns. I have an example below.
set.seed(123)
df_1 <- tibble(
x = c(0, 500, 1000, 1500, 2000),
y = c(499, 999, 1499, 1999, 99999),
desc = LETTERS[1:5]
)
> df_1
# A tibble: 5 x 3
x y desc
<dbl> <dbl> <chr>
1 0 499 A
2 500 999 B
3 1000 1499 C
4 1500 1999 D
5 2000 99999 E
df_2 <- tibble(
code = sample(1:2500,5,F)
)
>df_2
# A tibble: 5 x 1
code
<int>
1 719
2 1970
3 1022
4 2205
5 2348
## desired output
df_2 %>%
mutate(desc = c('B', 'D', 'C', 'E', 'E'))
# A tibble: 5 x 2
code desc
<int> <chr>
1 719 B
2 1970 D
3 1022 C
4 2205 E
5 2348 E
My first thought was to split df_1 and merge somehow, but I'm stuck on how to deal with the range of values found in x and y. Any ideas?
This is an easy problem to handle in SQL, so one option would be to use the sqldf package, with this query:
SELECT t2.code, COALESCE(t1.desc, '') AS desc
FROM df_2 t2
LEFT JOIN df_1 t1
ON t2.code BETWEEN t1.x AND t1.y;
R code:
library(sqldf)
sql <- paste0("SELECT t2.code, COALESCE(t1.desc, '') AS desc ",
"FROM df_2 t2 LEFT JOIN df_1 t1 ON t2.code BETWEEN t1.x AND t1.y")
result <- sqldf(sql)
library(tidyverse)
set.seed(123)
df_1 <- tibble(
x = c(0, 500, 1000, 1500, 2000),
y = c(499, 999, 1499, 1999, 99999),
desc = LETTERS[1:5]
)
df_2 <- tibble(
code = sample(1:2500,5,F)
)
df_1 %>%
  mutate(code = map2(x, y, ~seq(.x, .y, 1))) %>% # list column: every value from x to y in steps of 1
  unnest(cols = code) %>% # one row per (range, code); name the column explicitly, bare unnest() is deprecated
  inner_join(df_2, by = "code") %>% # keep only the codes present in df_2
  select(-x, -y) # drop the range bounds
# # A tibble: 5 x 2
# desc code
# <chr> <dbl>
# 1 B 719
# 2 C 1022
# 3 D 1970
# 4 E 2205
# 5 E 2348
This seems to work, but is not very tidyverse-ish:
df_2 %>% mutate(v = with(df_1, desc[ findInterval(code, x) ]))
code v
1 719 B
2 1970 D
3 1022 C
4 2205 E
5 2348 E
This only uses the x column, so the assumption is that there are no gaps in the ranges (y is always one below the next x).

How to split my columns using a unique and tidyR

I'm working on a data.table with a column like this:
A <- c("a;b;c","a;a;b","d;a;b","f;f;f")
df <- data.frame(A)
I would like to separate this column into 3 columns like this:
seg1 seg2 seg3
1 a b c
2 a b <NA>
3 d a b
4 f <NA> <NA>
The thing here is that when I split each row by ";", I need to keep only the unique values of that row.
Here's a tidyverse approach. We split the character in A, keep only the unique values, paste the result back together and separate into three columns:
library(tidyverse)
# Split each string on ";", drop repeated pieces, glue the survivors back
# together and spread them over three columns.
df %>%
  # map_chr() returns a character vector; map() would leave a list column
  mutate(A = map_chr(strsplit(as.character(A), ";"),
                     ~ paste(unique(.x), collapse = ";"))) %>%
  # explicit separator; fill = "right" pads short rows with NA without a warning
  separate(A, into = c("seg1", "seg2", "seg3"), sep = ";", fill = "right")
Which gives:
# seg1 seg2 seg3
#1 a b c
#2 a b <NA>
#3 d a b
#4 f <NA> <NA>
library(stringr)
A <- c("a;b;c","a;a;b","d;a;b","f;f;f")
df <- data.frame(A)
df <- str_split_fixed(df$A, ";", 3)
df <- apply(X = df,
FUN = function(x){
return(x[!duplicated(x)][1:ncol(df)])
},
MARGIN = 1)
df <- t(df)
df <- as.data.frame(df)
names(df) <- c("seg1", "seg2", "seg3")
df
# seg1 seg2 seg3
# 1 a b c
# 2 a b <NA>
# 3 d a b
# 4 f <NA> <NA>

combination of pairs of columns BUT not rows in a data frame

How to calculate the combinations of pairs of columns in a data frame, but restrict it so that it does not consider combinations among rows?
I have a data frame like the following, where each column is a variable.
ID A B C D E F G H I J
1 12 185 NA NA NA NA NA NA NA NA
2 35 20 11 NA NA NA NA NA NA NA
3 45 NA NA NA NA NA NA NA NA NA
I want an output like this:
Var1
12, 185
35, 20
35, 11
20, 11
45, 45
I tried the following code, but it considers ALL possible pairs of combinations among columns and rows. I want each row to be considered independently of the others. Does anyone have an idea? Thanks.
numNetList <- read.csv2("abd.csv", sep=";")
comb <- lapply(numNetList, function(x) if (length(x) > 1)
combn(sort(as.numeric(x)), 2))
combb <- do.call(cbind, comb)
pajek_list <- as.data.frame(table(paste(combb[1,], combb[2,], sep = ',')))
Not the most efficient method, but it solves the problem:
# Return every pair of non-missing values found in one row.
#
# x: a vector (one row of the data); NA entries are ignored.
# Returns a 2-row character matrix with one column per pair. A single
# value is paired with itself; a row with no values yields a 2 x 0 matrix.
func <- function(x) {
  vals <- as.character(x[!is.na(x)])
  if (length(vals) == 0) {
    # combn() would stop with "n < m" on an empty vector.
    return(matrix(character(0), nrow = 2, ncol = 0))
  }
  if (length(vals) == 1) {
    vals <- rep(vals, 2)
  }
  combn(vals, 2)
}
l = apply(df[-1], 1, func)
l1 <- as.data.frame(l)
colnames(l1) = NULL
l2= data.frame(t(l1))
library(tidyr)
unite(l2, "new_col", X1,X2 ,sep = ",")
# new_col
# 12,185
# 35,20
# 35,11
# 20,11
# 45,45
I would go with a combination of dplyr and tidyr:
library(dplyr)
library(tidyr)
df <- tibble(A = c(12,35,45), B = c(185, 20, NA), C = c(NA, 11, NA))
df %>%
mutate(group = 1:n()) %>%
gather(col, val, -group) %>%
group_by(group) %>%
expand(col, val) %>%
distinct(val) %>%
summarise(val = toString(val))

Resources