Data frame of tables from a list - r

Suppose I have a list with observations:
foo <- list(c("C", "E", "A", "F"), c("B", "D", "B", "A", "C"), c("B",
"C", "C", "F", "A", "F"), c("D", "A", "A", "D", "D", "F", "B"
))
> foo
[[1]]
[1] "C" "E" "A" "F"
[[2]]
[1] "B" "D" "B" "A" "C"
[[3]]
[1] "B" "C" "C" "F" "A" "F"
[[4]]
[1] "D" "A" "A" "D" "D" "F" "B"
And a vector with each unique element:
vec <- LETTERS[1:6]
> vec
[1] "A" "B" "C" "D" "E" "F"
I want to obtain a data frame with the counts of each element of vec in each element of foo. I can do this with plyr in a very ugly unvectorized way:
> ldply(foo,function(x)sapply(vec,function(y)sum(y==x)))
A B C D E F
1 1 0 1 0 1 1
2 1 2 1 1 0 0
3 1 1 2 0 0 2
4 2 1 0 3 0 1
But that's obviously slow. How can this be done faster? I know of table() but haven't really figured out how to use it due to 0-counts in some of the elements of foo.

One solution (off the top of my head):
# convert foo to a list of factors
lfoo <- lapply(foo, factor, levels=LETTERS[1:6])
# apply table() to each list element
t(sapply(lfoo, table))
A B C D E F
[1,] 1 0 1 0 1 1
[2,] 1 2 1 1 0 0
[3,] 1 1 2 0 0 2
[4,] 2 1 0 3 0 1

or with reshape:
cast(melt(foo), L1 ~ value, length)[-1]

Related

Replace characters in a column with numbers R

I have a matrix with last column contains characters:
A
B
B
A
...
I would like to replace A with 1 and B with 2 in R. The expected result should be:
1
2
2
1
...
If you are 100% confident only "A" and "B" appear
sample_data = c("A", "B", "B", "A")
sample_data
# [1] "A" "B" "B" "A"
as.numeric(gsub("A", 1, gsub("B", 2, sample_data)))
# [1] 1 2 2 1
Using factor or a simple lookup table would be much more flexible:
sample_data = c("A", "B", "B", "A")
Recommended:
as.numeric(factor(sample_data))
# [1] 1 2 2 1
Possible alternative:
as.numeric(c("A" = "1", "B" = "2")[sample_data])
# [1] 1 2 2 1

how to create edgelist from vector in R

I have a vector like:
c("A", "B", "C", "D", "E", "F")
and I'd like to create a dataframe like
"from" "to"
A B
B C
C D
D E
E F
how can I accomplish that?
Another way:
data.frame(from = vec[-length(vec)], to = vec[-1])
na.omit(data.frame(from = vec, to = dplyr::lead(vec)))
from to
1 A B
2 B C
3 C D
4 D E
5 E F
Another way is to use the zoo package:
library(zoo)
rollapply(vec, 2, by = 1, paste)
Here is one method using embed and rearranging columns:
# data
temp <- c("A", "B", "C", "D", "E", "F")
embed(temp, 2)[, c(2,1)]
[,1] [,2]
[1,] "A" "B"
[2,] "B" "C"
[3,] "C" "D"
[4,] "D" "E"
[5,] "E" "F"
To put this into a data.frame, wrap it in data.frame() and set the column names with setNames():
setNames(data.frame(embed(temp, 2)[, c(2,1)]), c("from", "to"))
from to
1 A B
2 B C
3 C D
4 D E
5 E F
We could also do:
# Build the edge list by indexing `vec` with the pattern 1 2 2 3 ... (n-1) n.
vec <- c("A", "B", "C", "D", "E", "F")
# Repeat every index twice, then drop the first and last entries so that
# consecutive pairs (i, i + 1) line up.
x <- rep(seq_along(vec), each = 2)[-c(1, 2 * length(vec))]
# [1] 1 2 2 3 3 4 4 5 5 6
# Fill row by row so each row is one (from, to) pair. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
data.frame(matrix(vec[x], ncol = 2, byrow = TRUE))
Or alternatively:
data.frame(t(sapply(seq(length(vec)-1), function(i) c(vec[i], vec[i+1]))))
# X1 X2
# 1 A B
# 2 B C
# 3 C D
# 4 D E
# 5 E F

how to deal with missing value in if else statement?

I have a dataframe, mydata, constructed as follows:
col1<-c(8.20e+07, 1.75e+08, NA, 4.80e+07,
3.40e+07, NA, 5.60e+07, 3.00e+06 )
col2<-c(1960,1960,1965,1986,1960
,1969,1960,1993)
col3<-c ( NA,2.190,NA,NA, 5.000, NA,
1.700,4.220)
mydata<-data.frame(col1,col2,col3)
mydata
# col1 col2 col3
# 1 8.20e+07 1960 NA
# 2 1.75e+08 1960 2.19
# 3 NA 1965 NA
# 4 4.80e+07 1986 NA
# 5 3.40e+07 1960 5.00
# 6 NA 1969 NA
# 7 5.60e+07 1960 1.70
# 8 3.00e+06 1993 4.22
I want to create a col4 that has the values "a", "b" and "c",
if col1 is smaller than 4.00e+07, then col4 should be "a"; if col1 is not less than 4.00e+07, then col4 should be "b"; otherwise (col1 is NA) col4 should be "c".
Here is my code:
col4 <-ifelse(col1<4.00e+07, "a",
ifelse(col1 >=4.00e+07, "b",
ifelse(is.na(col1 =4.00e+07), "b", "c" )))
but this evaluates to:
# [1] "b" "b" NA "b" "a" NA "b" "a"
It doesn't turn the NA values in col1 into "c".
The outcome should be:
# [1] "b" "b" "c" "b" "a" "c" "b" "a"
What is the problem in my code? Any suggestion would be appreciated!
You have to check is.na first, because NA < 4.00e+07 results in NA. If the first argument of ifelse() is NA, the result will be NA as well:
ifelse(c(NA, TRUE, FALSE), "T", "F")
## [1] NA "T" "F"
As you can see, for the first vector element the result is indeed NA. Even if the other arguments of ifelse() have special code that would take care of this situation, it won't help because that code is never taken into account.
For your example, checking for NA first gives you the desired result:
col4 <- ifelse(is.na(col1), "c",
ifelse(col1 < 4.00e+07, "a","b"))
col4
## [1] "b" "b" "c" "b" "a" "c" "b" "a"
This can also be done with cut:
# Bin col1 with cut(); NA inputs stay NA and are relabelled "c" afterwards.
# right = FALSE makes the intervals [-Inf, 4e7) and [4e7, Inf), so a value of
# exactly 4.00e+07 is labelled "b" ("not less than"), matching the ifelse()
# version. With the default right = TRUE it would be labelled "a".
v1 <- with(mydata, as.character(cut(col1,
breaks=c(-Inf, 4.00e+07, Inf), labels=c("a", "b"), right=FALSE)))
v1[is.na(v1)] <- "c"
v1
#[1] "b" "b" "c" "b" "a" "c" "b" "a"

Appending values with different order in R

I have two data elements in R:
data1
1 M
2 T
3 Z
4 A
5 J
data2 values
[1,] "A" "aa"
[2,] "J" "ab"
[3,] "M" "ac"
[4,] "T" "ad"
[5,] "Z" "ae"
I would like to get:
data1 values
[1,] "M" "ac"
[2,] "T" "ad"
[3,] "Z" "ae"
[4,] "A" "aa"
[5,] "J" "ab"
How can I append the values to data1 such that they are sorted according to the different order in data1?
You can get this behavior with the match function:
dat1 = data.frame(data1=c("M", "T", "Z", "A", "J"), stringsAsFactors=FALSE)
dat2 = data.frame(data2=c("A", "J", "M", "T", "Z"),
values=c("aa", "ab", "ac", "ad", "ae"), stringsAsFactors=FALSE)
dat2[match(dat1$data1, dat2$data2),]
# data2 values
# 3 M ac
# 4 T ad
# 5 Z ae
# 1 A aa
# 2 J ab

How to extract unique levels from 2 columns in a data frame in r

I have the data.frame
df<-data.frame("Site.1" = c("A", "B", "C"),
"Site.2" = c("D", "B", "B"),
"Tsim" = c(2, 4, 7),
"Jaccard" = c(5, 7, 1))
# Site.1 Site.2 Tsim Jaccard
# 1 A D 2 5
# 2 B B 4 7
# 3 C B 7 1
I can get the unique levels for each column using
top.x<-unique(df[1:2,c("Site.1")])
top.x
# [1] A B
# Levels: A B C
top.y<-unique(df[1:2,c("Site.2")])
top.y
# [1] D B
# Levels: B D
How do I get the unique levels for both columns and turn them into a vector i.e:
v <- c("A", "B", "D")
v
# [1] "A" "B" "D"
# Select only the two site columns before unlist(): flattening the whole row
# range would also pull in the numeric Tsim/Jaccard columns.
top.xy <- unique(unlist(df[1:2, c("Site.1", "Site.2")]))
top.xy
[1] A B D
Levels: A B C D
Try union:
union(top.x, top.y)
# [1] "A" "B" "D"
union(unique(df[1:2, c("Site.1")]),
unique(df[1:2, c("Site.2")]))
# [1] "A" "B" "D"
You can get the unique levels for the first two columns:
de<- apply(df[,1:2],2,unique)
de
# $Site.1
# [1] "A" "B" "C"
# $Site.2
# [1] "D" "B"
Then you can take the symmetric difference of the two sets:
union(setdiff(de$Site.1,de$Site.2), setdiff(de$Site.2,de$Site.1))
# [1] "A" "C" "D"
If you're interested in just the first two rows (as in your example):
de<- apply(df[1:2,1:2],2,unique)
de
# Site.1 Site.2
# [1,] "A" "D"
# [2,] "B" "B"
union(de[,1],de[,2])
# [1] "A" "B" "D"

Resources