I have the following dataset:
      A..B A..C B..C
value    2    5    9
and I would like to reshape it so that I get the following output:
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Any ideas on how I can do this in R?
Maybe you can try the base R code below
# split the column names into the two matrix labels, e.g. "A..B" -> c("A", "B")
dn <- strsplit(names(df), "..", fixed = TRUE)
# start from an identity matrix (1s on the diagonal), labelled with the unique letters
mat <- `dimnames<-`(diag(rep(1, ncol(df))), replicate(2, list(unique(unlist(dn)))))
# character index matrix with each pair in both orders, e.g. ("A","B") and ("B","A")
inds <- do.call(rbind, lapply(dn, function(x) rbind(x, rev(x))))
# each value is assigned twice, once per orientation, keeping the matrix symmetric
mat[inds] <- rep(unlist(df), each = 2)
or
dn <- strsplit(names(df), "..", fixed = TRUE)
mat <- `dimnames<-`(diag(rep(1, ncol(df))), replicate(2, list(unique(unlist(dn)))))
# fill each pair's cell using the split names as a character row/column index
for (k in seq_along(dn)) {
  mat[do.call(cbind, as.list(dn[[k]]))] <- df[, k]
}
# mirror the filled upper triangle into the lower triangle
mat[lower.tri(mat)] <- t(mat)[lower.tri(mat)]
such that
> mat
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Data
> dput(df)
structure(list(A..B = 2L, A..C = 5L, B..C = 9L), class = "data.frame", row.names = "value")
An option with tidyverse
library(dplyr)
library(tidyr)
library(tibble)
df %>%
  pivot_longer(cols = everything()) %>%
  separate(name, into = c('name1', 'name2')) %>%
  complete(name1 = LETTERS[1:3], name2 = LETTERS[1:3],
           fill = list(value = 0)) %>%
  pivot_wider(names_from = name2, values_from = value) %>%
  column_to_rownames('name1') %>%
  as.matrix %>%
  {. + t(.)} %>%
  `diag<-`(., 1)
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
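In case the last two steps of the pipe look opaque, here is a minimal sketch with a hand-built matrix m standing in for what comes out of column_to_rownames/as.matrix: adding the matrix to its transpose mirrors the upper triangle into the lower one, and `diag<-` then sets the diagonal to 1.
# hypothetical stand-in for the matrix produced by the pipe up to as.matrix
m <- matrix(c(0, 2, 5,
              0, 0, 9,
              0, 0, 0), 3, 3, byrow = TRUE,
            dimnames = list(LETTERS[1:3], LETTERS[1:3]))
m + t(m)               # symmetric, but still 0 on the diagonal
`diag<-`(m + t(m), 1)  # the same matrix with the diagonal set to 1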
data
df <- structure(list(A..B = 2L, A..C = 5L, B..C = 9L),
class = "data.frame", row.names = "value")
Here's another option that uses matrix indexing to fill in the values:
library(splitstackshape)
# stack your dataset and split the names into two columns
x <- cSplit(stack(df), "ind", "..")
# ij is going to be your index of row and column combinations
ij <- as.matrix(x[, 2:3])
u <- unique(c(ij))
# initialze a matrix of 1s
m <- matrix(1, nrow = length(u), ncol = length(u),
dimnames = list(u, u))
# replace the relevant indices with values
m[rbind(ij, ij[, 2:1])] <- x$values
m
# A B C
# A 1 2 5
# B 2 1 9
# C 5 9 1
In base R you can use strsplit to split the names, unique to get all levels, and then create a matrix of 1L whose dimensions match the number of levels. You can then fill the matrix by using the split names as row/column indices for the values.
i <- do.call(rbind, strsplit(names(x), "..", TRUE))
u <- unique(as.vector(i))
m <- matrix(1L, length(u), length(u), dimnames = list(u, u))
m[rbind(i, i[,2:1])] <- unlist(x)
#m[rbind(i, i[,2:1])] <- x #Alternative in case of a vector
m
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
Data:
x <- data.frame(A..B = 2L, A..C = 5L, B..C = 9L, row.names = "value")
#x <- c(A..B = 2L, A..C = 5L, B..C = 9L) #Alternative as a vector
I have a dataframe with count information (df1)
rownames  sample1  sample2  sample3
m1              0        5        1
m2              1        7        5
m3              6        2        0
m4              3        1        0
and a second with sample information (df2)
rownames  batch  total count
sample1   a      10
sample2   b      15
sample3   a      6
I also have two lists with information about the m values (they could easily be turned into another data frame if necessary, but I would rather not add them to the count information as it is quite large). No patterns (such as even and odd) exist; I am just using a very simplistic example:
x <- c("m1", "m3") and y <- c("m2", "m4")
What I would like to do is add another two columns to the sample information: for each sample, a count of the m values above 5 that appear in list x or y.
rownames  batch  total count  x  y
sample1   a      10           1  0
sample2   b      15           1  1
sample3   a      6            0  1
My current strategy is to make a list of values for both x and y and then append them to df2. Here are my attempts so far:
numX <- colSums(df1[sum(rownames(df1)>10 %in% x),]) and numX <- colSums(df1[sum(rownames(df1)>10 %in% x),]) both return a list of 0s
numX <- colSums(df1[rownames(df1)>10 %in% x,]) returns a list of the sum of count values meeting the conditions for each column
numX <- length(df1[rownames(df1)>10 %in% novel,]) returns the number of times the condition is met (in this example 2L)
I am not really sure how to approach this so I have just been throwing around attempts. I've tried looking for answers but maybe I am just struggling to find the proper wording.
We may do this with rowwise
library(dplyr)
df2 %>%
  rowwise %>%
  # within rowwise, `rownames` is the current sample name, so df1[[rownames]]
  # selects that sample's column of df1; it is subset to the m's listed in x
  # (or y), summed, and flagged (0/1) if the total is at least 5
  mutate(x = +(sum(df1[[rownames]][df1$rownames %in% x]) >= 5),
         y = +(sum(df1[[rownames]][df1$rownames %in% y]) >= 5)) %>%
  ungroup
-output
# A tibble: 3 × 5
rownames batch totalcount x y
<chr> <chr> <int> <int> <int>
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
Or based on the data, a base R option would be
# label each m row as 'x' or 'y', sum the counts per sample within each label,
# then flag (0/1) whether each per-sample group sum is at least 5
out <- aggregate(. ~ grp, FUN = sum,
                 transform(df1, grp = c('x', 'y')[1 + (rownames %in% y)])[-1])
df2[out$grp] <- +(t(out[-1]) >= 5)
-output
> df2
rownames batch totalcount x y
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
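For intuition, the intermediate out holds the per-group column sums for each sample, which are then transposed and compared against 5 (a sketch, assuming the data below):
> out
  grp sample1 sample2 sample3
1   x       6       7       1
2   y       4       8       5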
data
df1 <- structure(list(rownames = c("m1", "m2", "m3", "m4"), sample1 = c(0L,
1L, 6L, 3L), sample2 = c(5L, 7L, 2L, 1L), sample3 = c(1L, 5L,
0L, 0L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(rownames = c("sample1", "sample2", "sample3"),
batch = c("a", "b", "a"), totalcount = c(10L, 15L, 6L)),
class = "data.frame", row.names = c(NA,
-3L))
How about using dplyr and reshape2::melt
library(dplyr)
library(reshape2)

df3 <- df1 %>%
  melt %>%
  filter(value >= 5) %>%
  mutate(x = as.numeric(rownames %in% c("m1", "m3")),
         y = as.numeric(rownames %in% c("m2", "m4"))) %>%
  select(-rownames, -value) %>%
  group_by(variable) %>%
  summarise(x = sum(x), y = sum(y))
df2 %>% left_join(df3, by = c("rownames" = "variable"))
rownames batch total_count x y
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
You can create a named list of vectors and for each rownames count how many values of x and y in the respective sample is >= 5.
Base R option -
list_vec <- list(x = x, y = y)

cbind(df2, do.call(rbind, lapply(df2$rownames, function(x)
  sapply(list_vec, function(y) {
    sum(df1[[x]][df1$rownames %in% y] >= 5)
  }))))
# rownames batch total.count x y
#1 sample1 a 10 1 0
#2 sample2 b 15 1 1
#3 sample3 a 6 0 1
Using tidyverse -
library(dplyr)
library(purrr)
list_vec <- lst(x, y)
df2 %>%
  bind_cols(map_df(df2$rownames, function(x)
    map(list_vec, ~sum(df1[[x]][df1$rownames %in% .x] >= 5))))
I have a data frame df
m n o p
a 1 1 2 5
b 1 2 0 4
c 3 3 3 3
I can extract column m by:
df[,"m"]
Now the problem is, the column name was generated somewhere else (multiple times, in a for loop). For example, column name m was generated by choosing a specific element in the data frame gen in one loop:
> gen[i,1]
[1] m
How do I extract the column based on gen[i,1]?
Just nest the subsetting.
dat[,"m"]
# [1] 1 1 3
i <- 13
gen[i, 1]
# [1] "m"
dat[, gen[i, 1]]
# [1] 1 1 3
Or, if you don't want the column to be dropped:
dat[, gen[i, 1], drop=FALSE]
# m
# a 1
# b 1
# c 3
Data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)
We can use select from dplyr
library(dplyr)
i <- 13
dat %>%
select(gen[i, 1])
# m
#a 1
#b 1
#c 3
data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)
Let's say in the R environment, I have this data frame with n rows:
a b c classes
1 2 0 a
0 0 2 b
0 1 0 c
The result that I am looking for is the number of non-zero values in each row and their average, e.g.:
size_of_a = 2
average_of_a = 1.5
size_of_b = 1
average_of_b = 2
...and the same for the other rows.
I have tried rowSums(dt[-c(4)] != 0) for finding the non-zero elements, but I can't be sure that the 'classes' column will be the 4th column.
I would appreciate your help with acquiring these results.
Thanks
First, I create the data frame.
df <- read.table(text = "a b c classes
1 2 0 a
0 0 2 b
0 1 0 c", header = TRUE)
Then, I replace zeros with NAs to make life easier, since functions often have na.rm to ignore them.
df[df==0] <- NA
Finally, I bind together the sum of non-zero elements, the mean values, and the class names into a data frame.
data.frame(classes = df[,4],
size = rowSums(df[, -4]>0, na.rm = TRUE),
mean = rowMeans(df[, -4], na.rm = TRUE))
which gives,
# classes size mean
# 1 a 2 1.5
# 2 b 1 2.0
# 3 c 1 1.0
Edit: to avoid relying on the column position, refer to the classes column by name instead.
data.frame(classes = df[,"classes"],
size = rowSums(df[, names(df) != "classes"]>0, na.rm = TRUE),
mean = rowMeans(df[, names(df) != "classes"], na.rm = TRUE))
# classes size mean
# 1 a 2 1.5
# 2 b 1 2.0
# 3 c 1 1.0
You can do it with apply and rowSums:
# Generate some fake data
set.seed(1)
n = 10
k = 5
x = matrix(runif(n * k), n, k)
x[x < 0.5] = 0
# Get number of nonzero entries in each row
nonzeros = apply(x, 1, function(z) sum(z != 0))
# Take row sums and divide by number of non-zero entries
rowSums(x) / nonzeros
Or, using the data.frame you provided, it would look like this
# The data
x = structure(list(a = c(1L, 0L, 0L), b = c(2L, 0L, 1L), c = c(0L,
2L, 0L), classes = structure(1:3, .Label = c("a", "b", "c"), class = "factor")), .Names = c("a",
"b", "c", "classes"), class = "data.frame", row.names = c(NA,
-3L))
column = which(names(x) == "classes")
nonzeros = apply(x[-column], 1, function(z) sum(z != 0))
rowSums(x[-column]) / nonzeros
Another syntax to create the data frame, using the tibble function (from the tibble package, re-exported by dplyr):
library(dplyr)
df <- tibble(
  a = c(1, 0, 0),
  b = c(2, 0, 1),
  c = c(0, 2, 0),
  classes = c("a", "b", "c")
)
To count the elements in a row that are equal to zero, you can evaluate the whole row, even though the classes column is not numeric:
rowSums( df == 0 )
Conversely, the number of elements different from zero in the whole row can be calculated through rowSums( df != 0 ).
Therefore, the average you are looking for is:
rowSums(df[, 1:3]) / rowSums(df[, 1:3] != 0)
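As a quick sanity check (assuming the tibble df defined above, and using only the numeric columns), this should reproduce the sizes and averages from the question:
rowSums(df[, 1:3] != 0)
# [1] 2 1 1
rowSums(df[, 1:3]) / rowSums(df[, 1:3] != 0)
# [1] 1.5 2.0 1.0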
Cheers!
I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new data frame containing only the rows where the X, Y, Z, T values are simultaneously greater than specific thresholds.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective; I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# subset data.frame
dat[Reduce("&", Map(">", dat[names(vals)], vals)), ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
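To make the intermediates concrete, here is roughly what the two calls return for the data below:
Map(">", dat[names(vals)], vals)
# $X
# [1]  TRUE FALSE  TRUE
# $Y
# [1]  TRUE FALSE FALSE
# $Z
# [1]  TRUE FALSE FALSE
# $T
# [1]  TRUE FALSE  TRUE
Reduce("&", Map(">", dat[names(vals)], vals))
# [1]  TRUE FALSE FALSE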
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loops through each column in dat and the corresponding value in vals with the supplied function. ~ifelse(.x > .y | is.na(.y), .x, NA) means: if the number in the column is larger than the corresponding value in vals, or the value in vals is NA, keep the original value from the column; otherwise, replace it with NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is therefore a data frame with NA values in the rows where the condition is not met. Finally, na.omit removes those rows.
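For reference, the intermediate result before na.omit() should look roughly like this:
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA))
# # A tibble: 3 x 5
#       X     Y     Z     T     J
#   <int> <int> <int> <int> <int>
# 1     3     4     5     6     1
# 2    NA    NA    NA    NA     1
# 3     5    NA    NA     5     1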
Update
Here I demonstrate how to convert the dfTwo data frame into the vals vector in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr package.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use the stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frames.
dfTwo3 <- dfTwo2 %>% left_join(threshold_df, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now for the third step. Notice that dat has a column called J which does not need any threshold, so we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding the columns of dat that are missing from dfTwo3, with NA as their Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is to arrange dfTwo4 in the right order. We can achieve this by turning Col into a factor whose levels follow the order of the column names in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
The above is the breakdown of the steps. We can combine them into the following code for simplicity.
library(dplyr)
library(tidyr)
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group)) %>%
left_join(threshold_df, by = "Group") %>%
complete(Col = colnames(dat)) %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
vals <- dfTwo2$Value
names(vals) <- dfTwo2$Col
Another base R possibility is to intersect the row indices that satisfy each condition:
dfOne[Reduce(intersect, list(which(dfOne["X"] > 2),
                             which(dfOne["Y"] > 2),
                             which(dfOne["Z"] > 4),
                             which(dfOne["T"] > 4))),]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
vals = c(X = 2, Y = 2, Z = 4, T = 4) # from #lmo's answer
dfOne[Reduce(intersect, lapply(names(vals), function(x) which(dfOne[x] > vals[x]))),]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
dfNew = dfOne[dfOne$X > 2 & dfOne$Y > 2 & dfOne$Z > 4 & dfOne$T > 4, ]
Or, using dplyr:
library(dplyr)
dfNew = dfOne %>% filter(X > 2 & Y > 2 & Z > 4 & T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.
I have a data frame (df):
group col
a 12
a 15
a 13
b 21
b 23
The desired output is also a data frame (df1):
col1 col2
12 21
15 23
13 0
Namely, I want to partition "col" of "df" by "group" into multiple columns, "col1" and "col2".
When the columns have unequal lengths, "0" must be appended to the end of each shorter column until it reaches the maximum column length.
We could use the base R functions split or unstack to split 'col' by 'group' into a list, then pad with NA the list elements that are shorter than the longest one. Finally, change the column names and replace NA with 0.
lst <- unstack(df1, col~group)
# pad each vector with NA up to the length of the longest one via `length<-`
d1 <- as.data.frame(sapply(lst, `length<-`, max(sapply(lst, length))))
d1[is.na(d1)] <- 0
colnames(d1) <- paste0('col', 1:ncol(d1))
d1
# col1 col2
#1 12 21
#2 15 23
#3 13 0
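In case the `length<-` call looks unfamiliar: assigning a longer length to a vector pads it with NA, which is what does the padding here. A tiny illustration with a made-up vector v:
v <- c(12, 15, 13)
length(v) <- 5   # extending the length pads with NA
v
# [1] 12 15 13 NA NA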
Or use stri_list2matrix from stringi
library(stringi)
d1 <- as.data.frame(stri_list2matrix(unstack(df1, col~group),
fill=0), stringsAsFactors=FALSE)
d1[] <- lapply(d1, as.numeric)
Or using data.table/splitstackshape
library(splitstackshape)
setnames(dcast(getanID(df1, 'group'), .id~group, value.var='col',
fill=0L)[, .id:= NULL], paste0('col', 1:2))[]
# col1 col2
#1: 12 21
#2: 15 23
#3: 13 0
How to do it with dplyr...
library(dplyr)
library(tidyr)
df1 %>%
group_by(group) %>%
mutate(n = row_number()) %>%
spread(group, col) %>%
select(-n) %>%
(function(x) { x[is.na(x)] <- 0; x })
Since you fill with zeroes, another idea:
xtabs(col ~ ave(DF$col, DF$group, FUN = seq_along) + group, DF)
# group
#ave(DF$col, DF$group, FUN = seq_along) a b
# 1 12 21
# 2 15 23
# 3 13 0
Where "DF":
DF = structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), col = c(12L, 15L, 13L, 21L, 23L)), .Names = c("group",
"col"), class = "data.frame", row.names = c(NA, -5L))