I have a data frame df
m n o p
a 1 1 2 5
b 1 2 0 4
c 3 3 3 3
I can extract column m by:
df[,"m"]
Now the problem is that the column name is generated somewhere else (multiple times, in a for loop). For example, the column name m was generated by choosing a specific element of the data frame gen in one iteration of the loop:
> gen[i,1]
[1] m
How do I extract the column based on gen[i,1]?
Just nest the subsetting.
dat[,"m"]
# [1] 1 1 3
i <- 13
gen[i, 1]
# [1] "m"
dat[, gen[i, 1]]
# [1] 1 1 3
Or, if you don't want the column to be dropped:
dat[, gen[i, 1], drop=FALSE]
# m
# a 1
# b 1
# c 3
Data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)
We can use select from dplyr
library(dplyr)
i <- 13
# The column name lives outside `dat`, so pull it into a local and pass it to
# select() through all_of() to mark it as an external character vector.
col_name <- gen[i, 1]
select(dat, all_of(col_name))
# m
#a 1
#b 1
#c 3
data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)
Related
I have a data set and I want to check whether all of my variables have unique values in a specific row.
let's say i want to analyze row D
my data
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
> TRUE (because all the three variables have unique value)
Second example
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 4
>False (because F and T have the same value in row D )
In base R do
# Report whether every value in the selected row(s) of `dat` is distinct.
# The first column is excluded from the comparison.
#   dat: a data frame
#   ind: row index passed to `[`
# Returns a single TRUE/FALSE.
f1 <- function(dat, ind) {
  row_values <- unlist(dat[ind, -1])
  # anyDuplicated() returns 0 when no value is repeated.
  anyDuplicated(row_values) == 0L
}
-testing
> f1(df, 4)
[1] TRUE
> f1(df1, 4)
[1] FALSE
data
df <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = 3:6), class = "data.frame", row.names = c(NA, -4L))
df1 <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = c(3L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA,
-4L))
You can use dplyr for this:
# Take row "D", drop the Name column, and check that every remaining value in
# that row is distinct. (Counting distinct values per column and comparing
# with nrow(df) tests column-wise uniqueness, which is a different property.)
row_vals <- df %>%
  filter(Name == "D") %>%
  select(-Name) %>%
  unlist()
n_distinct(row_vals) == length(row_vals)
I have the following dataset:
A..B A..C B..C
value 2 5 9
and I would like to break it in a way such as I get the following output:
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Any ideas on how I can do this in R?
Maybe you can try the base R code below
# Split each "X..Y" column name into its two labels.
dn <- strsplit(names(df), "..", fixed = TRUE)
# Size the matrix by the number of distinct labels, not by ncol(df): the two
# only coincide when there are exactly three labels.
lv <- unique(unlist(dn))
mat <- diag(1, nrow = length(lv))
dimnames(mat) <- list(lv, lv)
# Index both (row, col) and (col, row) for every pair so the result is
# symmetric; each value is therefore written twice.
inds <- do.call(rbind, lapply(dn, function(x) rbind(x, rev(x))))
mat[inds] <- rep(unlist(df), each = 2)
or
# Split each "X..Y" column name into its two labels.
dn <- strsplit(names(df), "..", fixed = TRUE)
# Size the matrix by the number of distinct labels, not by ncol(df): the two
# only coincide when there are exactly three labels.
lv <- unique(unlist(dn))
mat <- diag(1, nrow = length(lv))
dimnames(mat) <- list(lv, lv)
for (k in seq_along(dn)) {
  pair <- dn[[k]]
  # Write both mirrored cells so the result does not depend on the order in
  # which the two labels appear in the column name.
  mat[pair[1], pair[2]] <- df[[k]]
  mat[pair[2], pair[1]] <- df[[k]]
}
such that
> mat
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Data
> dput(df)
structure(list(A..B = 2L, A..C = 5L, B..C = 9L), class = "data.frame", row.names = "value")
An option with tidyverse
library(dplyr)
library(tidyr)
library(tibble)
# Reshape the one-row data frame into a symmetric matrix with a unit diagonal.
# NOTE: the label set is hard-coded to LETTERS[1:3] (A, B, C) below.
df %>%
# one row per original column: `name` holds the column name, `value` its entry
pivot_longer(cols = everything()) %>%
# split each column name into its two labels
separate(name, into = c('name1', 'name2')) %>%
# add the label pairs that are absent from the data, with value 0
complete(name1 = LETTERS[1:3], name2 = LETTERS[1:3],
fill = list(value = 0)) %>%
# spread name2 across the columns, keeping name1 as the row key
pivot_wider(names_from = name2, values_from = value) %>%
column_to_rownames('name1') %>%
as.matrix %>%
# adding the transpose mirrors the filled entries across the diagonal
{. + t(.)} %>%
# finally set the diagonal to 1
`diag<-`(., 1)
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
data
df <- structure(list(A..B = 2L, A..C = 5L, B..C = 9L),
class = "data.frame", row.names = "value")
Here's another option that uses matrix indexing to fill in the values:
library(splitstackshape)
# stack your dataset and split the names into two columns
x <- cSplit(stack(df), "ind", "..")
# ij is going to be your index of row and column combinations
ij <- as.matrix(x[, 2:3])
u <- unique(c(ij))
# initialze a matrix of 1s
m <- matrix(1, nrow = length(u), ncol = length(u),
dimnames = list(u, u))
# replace the relevant indices with values
m[rbind(ij, ij[, 2:1])] <- x$values
m
# A B C
# A 1 2 5
# B 2 1 9
# C 5 9 1
In base you can use strsplit to get the names, use unique to get all levels and create a matrix initialized with 1L and the size of the levels. Then you can fill up the matrix by using the names to find the position of the values.
i <- do.call(rbind, strsplit(names(x), "..", TRUE))
u <- unique(as.vector(i))
m <- matrix(1L, length(u), length(u), dimnames = list(u, u))
m[rbind(i, i[,2:1])] <- unlist(x)
#m[rbind(i, i[,2:1])] <- x #Alternative in case of a vector
m
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
Data:
x <- data.frame(A..B = 2L, A..C = 5L, B..C = 9L, row.names = "value")
#x <- c(A..B = 2L, A..C = 5L, B..C = 9L) #Alternative as a vector
I have a column of numbers (index) in a dataframe like the below. I am attempting to check if these numbers are in ascending order by the value of 1. For example, group B and C do not ascend by 1. While I can check by sight, my dataframe is thousands of rows long, so I'd prefer to automate this. Does anyone have advice? Thank you!
group index
A 0
A 1
A 2
A 3
A 4
B 0
B 1
B 2
B 2
C 0
C 3
C 1
C 2
...
I think this works. diff calculates the difference between the two subsequent numbers, and then we can use all to see if all the differences are 1. dat2 is the final output.
library(dplyr)
# For every group, Result is TRUE only when each index exceeds the previous
# one by exactly 1 (diff() gives the successive differences within the group).
dat2 <- dat %>%
group_by(group) %>%
summarize(Result = all(diff(index) == 1)) %>%
ungroup()
dat2
# # A tibble: 3 x 2
# group Result
# <chr> <lgl>
# 1 A TRUE
# 2 B FALSE
# 3 C FALSE
DATA
dat <- read.table(text = "group index
A 0
A 1
A 2
A 3
A 4
B 0
B 1
B 2
B 2
C 0
C 3
C 1
C 2",
header = TRUE, stringsAsFactors = FALSE)
Maybe aggregate could help
> aggregate(.~group,df1,function(v) all(diff(v)==1))
group index
1 A TRUE
2 B FALSE
3 C FALSE
We can do a group by group, get the difference between the current and previous value (shift) and check if all the differences are equal to 1.
library(data.table)
setDT(df1)[, .(Result = all((index - shift(index))[-1] == 1)), group]
# group Result
#1: A TRUE
#2: B FALSE
#3: C FALSE
data
df1 <- structure(list(group = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "C", "C", "C", "C"), index = c(0L, 1L, 2L, 3L, 4L, 0L, 1L,
2L, 2L, 0L, 3L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-13L))
I want to loop through a large dataframe counting in the first column how many values >0, removing those rows that were counted.... then moving on to column 2 counting the number of values>0 and removing those rows etc...
the data frame
taxonomy A B C
1 cat 0 2 0
2 dog 5 1 0
3 horse 3 0 0
4 mouse 0 0 4
5 frog 0 2 4
6 lion 0 0 2
can be generated with
# Example data: one taxonomy label per row plus three integer count columns.
DF1 <- structure(
  list(
    taxonomy = c("cat", "dog", "horse", "mouse", "frog", "lion"),
    A = c(0L, 5L, 3L, 0L, 0L, 0L),
    B = c(2L, 1L, 0L, 0L, 2L, 0L),
    C = c(0L, 0L, 0L, 4L, 4L, 2L)
  ),
  row.names = c(NA, -6L),
  class = "data.frame"
)
and i expect the outcome to be
A B C
count 2 2 2
i wrote this loop but it does not remove the rows as it goes
res <- data.frame(DF1[1,], row.names = c('count'))
for(n in 1:ncol(DF1)) {
res[colnames(DF1)[n]] <- sum(DF1[n])
DF1[!DF1[n]==1]
}
it gives this incorrect result
A B C
count 2 3 3
You could do ...
# Keep only the count columns and flag the non-zero cells.
DF <- DF1[, -1]
cond <- DF != 0
# For each row, the position of the first column holding a non-zero value.
p <- max.col(cond, ties.method = "first")
# Turn positions into a factor over all column names so that columns that are
# never first still appear in the table with a count of 0.
fp <- factor(p, levels = seq_along(DF), labels = names(DF))
table(fp)
# A B C
# 2 2 2
To account for rows that are all zeros, I think this works:
fp[rowSums(cond) == 0] <- NA
We can update the dataset in each run. Create a temporary dataset without the 'taxonomy' column ('tmp'). Initiate a named vector ('n'), loop through the columns of 'tmp', get a logical index based on whether the column is greater than 0 ('i1'), get the sum of TRUE values, update the 'n' for the corresponding column, then update the 'tmp' by removing those rows using 'i1' as row index
# Work on the count columns only (drop the first column).
tmp <- DF1[-1]
# One counter per column, initialised to 0 and named after the column.
n <- setNames(numeric(ncol(tmp)), names(tmp))
for(i in seq_len(ncol(tmp))) {
# rows still in `tmp` whose value in column i is positive
i1 <- tmp[[i]] > 0
n[i] <- sum(i1)
# remove the rows just counted so later columns cannot count them again
tmp <- tmp[!i1, ]}
n
# A B C
# 2 2 2
It can also be done with Reduce
# Build cumulative "row already has a positive value" masks, one per count
# column. Every mask keeps the full row length, so no shortened index is ever
# recycled against a full-length column. The rows newly claimed by each column
# are the successive differences of the mask totals.
seen <- Reduce(`|`, lapply(DF1[-1], function(col) col > 0), accumulate = TRUE)
diff(c(0L, vapply(seen, sum, integer(1))))
#[1] 2 2 2
Or using accumulate from purrr
library(purrr)
# Cumulative "row already counted" masks, one per count column. Every mask
# keeps the full row length, so no shortened index is recycled. The rows newly
# claimed by each column are the successive differences of the mask totals.
seen <- accumulate(map(DF1[-1], ~ .x > 0), `|`)
diff(c(0L, unname(map_int(seen, sum))))
#[1] 2 2 2
This is easy with Reduce and sapply:
# keep[[k]] flags the rows that are still zero in every column before column k.
# All masks have nrow(df) elements, so they always line up with the columns.
> keep <- Reduce(function(k, col) k & col == 0, df[-1], init = rep(TRUE, nrow(df)), accumulate=TRUE)
# For each column, the values of the rows not yet removed by an earlier column.
> first <- Map(function(col, k) col[k], df[-1], keep[seq_along(df[-1])])
> first
$A
[1] 0 5 3 0 0 0
$B
[1] 2 0 2 0
$C
[1] 4 2
> then <- sapply(first, function(x) length(x[x>0]))
> then
A B C
2 2 2
I have multiple inputs like:
a <- x y z
1 2 2
2 3 2
3 2 4
4 2 4
5 2 1
b <- c(1,2)
c <- c(2,3)
I want to subset this data based on the condition that a$x is greater than or equal to b[i] and less than or equal to c[i].
output should look like:
d <- x y z
1 2 2
2 3 2
2 3 2
3 2 4
i have tried this:
# Start from an empty data frame with the same column names as `a`, then
# append the rows of `a` whose x lies within each [b[i], c[i]] interval.
d <- as.data.frame(matrix(ncol = ncol(a), nrow = 0))
names(d) <- names(a)
# seq_along() also copes with an empty `b` (1:length(b) would yield c(1, 0)).
for (i in seq_along(b)) {
  d <- rbind(d, a[which(a$x >= b[i] & a$x <= c[i]), ])
}
Using dplyr::filter function:
# One filtered copy of `a` per [b[i], c[i]] interval, stacked in interval order.
# seq_along() makes the iteration empty when `b` is empty.
sub_list <- lapply(seq_along(b), function(i) {
  a %>% filter(x >= b[i] & x <= c[i])
})
do.call(rbind, sub_list)
x y z
1 1 2 2
2 2 3 2
3 2 3 2
4 3 2 4
Input data:
a <- structure(list(x = 1:5, y = c(2L, 3L, 2L, 2L, 2L), z = c(2L,
2L, 4L, 4L, 1L)), .Names = c("x", "y", "z"), class = "data.frame", row.names = c(NA,
-5L))
b <- c(1,2)
c <- c(2,3)