summarise information reported by a named vector - r

I want to summarise the information carried by a named character vector; see below:
X<- c("BB", "BB", "CC", "CC", "CC", "EE", "EE")
names(X) <- c(1, 2, 2, 2, 3, 3, 4)
Character is below:
X
1 2 2 2 3 3 4
"BB" "BB" "CC" "CC" "CC" "EE" "EE"
"CC" in position 2 occurs twice; this should be reported in the "Times" line. Expected output:
1 2 2 3 3 4 # Position
1 1 2 1 1 1 # Times
"BB" "BB" "CC" "CC" "EE" "EE" # Character
Tried:
table (names(X))
data.frame(X)

We can group by both the values of 'X' and the names of 'X' and get the frequency of each combination
library(data.table)
data.table(X, nm = names(X))[, .N, .(X, nm)]
# X nm N
#1: BB 1 1
#2: BB 2 1
#3: CC 2 2
#4: CC 3 1
#5: EE 3 1
#6: EE 4 1
Or similar option with tidyverse
library(dplyr)
# Build a two-column tibble (value + name) and count each combination.
# tibble() replaces the deprecated data_frame() constructor.
tibble(X, nm = names(X)) %>%
  count(X, nm)
Or with aggregate from base R
aggregate(cbind(n = rep(1, length(X))) ~ X + names(X), FUN = sum)

Related

Match vectors in sequence

I have 2 vectors.
x=c("a", "b", "c", "d", "a", "b", "c")
y=structure(c(1, 2, 3, 4, 5, 6, 7, 8), .Names = c("a", "e", "b",
"c", "d", "a", "b", "c"))
I would like to match a to a, b to b in sequence accordingly, so that x[2] matches y[3] rather than y[7]; and x[5] matches y[6] rather than y[1], so on and so forth.
lapply(x, function(z) grep(z, names(y), fixed=T))
gives:
[[1]]
[1] 1 6
[[2]]
[1] 3 7
[[3]]
[1] 4 8
[[4]]
[1] 5
[[5]]
[1] 1 6
[[6]]
[1] 3 7
[[7]]
[1] 4 8
which matches all instances. How do I get this sequence:
1 3 4 5 6 7 8
So that elements in x can be mapped to the corresponding values in y accordingly?
You are actually looking for pmatch
pmatch(x,names(y))
[1] 1 3 4 5 6 7 8
You can change the names attributes according to the number of times each element appeared and then subset y:
x2 <- paste0(x, ave(x, x, FUN=seq_along))
#[1] "a1" "b1" "c1" "d1" "a2" "b2" "c2"
names(y) <- paste0(names(y), ave(names(y), names(y), FUN=seq_along))
y[x2]
#a1 b1 c1 d1 a2 b2 c2
# 1 3 4 5 6 7 8
Another option using Reduce
Reduce(function(v, k) y[-seq_len(v)][k],
x=x[-1L],
init=y[x[1L]],
accumulate=TRUE)
Well, I did it with a for-loop
# Initialise the result vector with the same length as x.
answer <- numeric(length(x))
for (i in seq_along(x)) {
  # Position of the first not-yet-consumed name in y that equals x[i].
  answer[i] <- match(x[i], names(y))
  # Blank out the name at the *matched* position (not at position i), so the
  # next occurrence of the same value finds the following index.
  # Note: this modifies names(y) in place.
  if (!is.na(answer[i])) {
    names(y)[answer[i]] <- ""
  }
}
answer
#[1] 1 3 4 5 6 7 8
Another possibility:
l <- lapply(x, grep, x = names(y), fixed = TRUE)
i <- as.integer(ave(x, x, FUN = seq_along))
mapply(`[`, l, i)
which gives:
[1] 1 3 4 5 6 7 8
Similar solution to Ronak, but it does not persist changes to y
yFoo<-names(y)
sapply(x,function(u){res<-match(u,yFoo);yFoo[res]<<-"foo";return(res)})
Result
#a b c d a b c
#1 3 4 5 6 7 8

Pivot table of concatenated string in r

I have the following dataset:
mydata<- data.frame(Factors= c("a,b" , "c,d" , "a,c"), Valu = c ("2,3" , "7,8" , "9,1"))
Factors Valu
1 a,b 2,3
2 c,d 7,8
3 a,c 9,1
and I wish to convert it to the following, which lists all the values that occurred with each factor:
My ideal output
a b c d
2 2 7 7
3 3 8 8
9 9
1 1
I need a pivot table. However, I first need to prepare the data and then use melt and dcast to get my desired output. One of my failed attempts at preparing the data is:
mydata2 <- cSplit(mydata, c("Factors","Valu") , ",", "long")
But they lose their connections.
Here is a one-line solution with cSplit
library(splitstackshape)
with(cSplit(cSplit(mydata, 1, ",", "long"), 2, ",", "long"), split(Valu, Factors))
#$a
#[1] 2 3 9 1
#$b
#[1] 2 3
#$c
#[1] 7 8 9 1
#$d
#[1] 7 8
If we need a data.table/data.frame, use dcast to convert the 'long' format to 'wide'.
dcast(cSplit(cSplit(mydata, 1, ",", "long"), 2, ",", "long"),
rowid(Factors)~Factors, value.var="Valu")[, Factors := NULL][]
# a b c d
#1: 2 2 7 7
#2: 3 3 8 8
#3: 9 NA 9 NA
#4: 1 NA 1 NA
NOTE: splitstackshape loads the data.table. Here, we used data.table_1.10.0. The dcast from data.table is also very fast
Using a couple of *applys, strsplit and grep
## convert columns to characters so you can use strsplit
mydata$Factors <- as.character(mydata$Factors)
mydata$Valu <- as.character(mydata$Valu)
## get all the unique factor values by splitting them
f <- unique(unlist(strsplit(unique(mydata$Factors), split = ",")))
## filter 'mydata' by using 'grep' to search for each individual factor value
## (using sapply for one at a time)
l <- sapply(f, function(x) mydata[grep(x, mydata$Factors), "Valu"])
This gives a list, where each element is named by the 'Factor' value, and it contains all the 'Valu' values associated with it
l
# $a
# [1] "2,3" "9,1"
#
# $b
# [1] "2,3"
#
# $c
# [1] "7,8" "9,1"
#
# $d
# [1] "7,8"
Another lapply on this list will split the 'Valu's
result <- lapply(l, function(x) unlist(strsplit(x, split = ",")))
result
# $a
# [1] "2" "3" "9" "1"
#
# $b
# [1] "2" "3"
#
# $c
# [1] "7" "8" "9" "1"
#
# $d
# [1] "7" "8"
Edit
To get the result in a data.frame, you can make each list element the same length (by filling with NA), then call data.frame on the result
## the number of rows required for each column
maxLength <- max(sapply(result, length))
## append 'NA's to list elements with fewer than maxLength elements
result <- data.frame(sapply(result, function(x) c(x, rep(NA, maxLength - length(x))) ))
result
# a b c d
# 1 2 2 7 7
# 2 3 3 8 8
# 3 9 <NA> 9 <NA>
# 4 1 <NA> 1 <NA>
Edit
In response to the comment, if you have 'similar' strings, you can make your grep regex explicit by using ( ) (see any regex cheatsheet for explanations)
mydata <- data.frame(Factors = c("a,b", "c,d", "a,c", "bo,ao"), Valu = c("2,3", "7,8", "9,1", "x,y"))
mydata$Factors <- as.character(mydata$Factors)
mydata$Valu <- as.character(mydata$Valu)
f <- unique(unlist(strsplit(unique(mydata$Factors), split = ",")))
## filter 'mydata' by using 'grep' to search for each individual factor value
## (using sapply for one at a time).
## The pattern requires the value to sit between commas / string ends, so "a"
## no longer matches inside "ao" (a bare "(a)" group would still match it).
## simplify = FALSE guarantees a named list even if all results have equal length.
l <- sapply(f, function(x) mydata[grep(paste0("(^|,)", x, "(,|$)"), mydata$Factors), "Valu"],
            simplify = FALSE)
Another base R attempt:
# character conversion first
mydata[] <- lapply(mydata, as.character)
long <- do.call(rbind,
do.call(Map, c(expand.grid, lapply(mydata, strsplit, ","), stringsAsFactors=FALSE))
)
split(long$Valu, long$Factors)
#$a
#[1] "2" "3" "9" "1"
#
#$b
#[1] "2" "3"
#
#$c
#[1] "7" "8" "9" "1"
#
#$d
#[1] "7" "8"
I misunderstood in my comment above; if you want every Factor to match every Valu, you need to separate the columns independently to get the combinations. If you add indices to spread by, it's not too bad:
library(tidyverse)
mydata %>%
separate_rows(Factors) %>% separate_rows(Valu, convert = TRUE) %>%
# add indices to give row order when spreading
group_by(Factors) %>% mutate(i = row_number()) %>%
spread(Factors, Valu) %>%
select(-i) # clean up extra column
## # A tibble: 4 × 4
## a b c d
## * <int> <int> <int> <int>
## 1 2 2 7 7
## 2 3 3 8 8
## 3 9 NA 9 NA
## 4 1 NA 1 NA

Mapping column values

I would like to transform the values of a given column using some mapping function. Example:
df <- data.frame(A = 1:5, B = sample(1:20, 10))
df
A B
1 1 17
2 2 5
3 3 3
4 4 11
5 5 19
6 1 16
7 2 4
8 3 7
9 4 6
10 5 9
My goal is to map all elements of column A as following:
1 -> "tt"
2 -> "ff"
3 -> "ss"
4 -> "fs"
5 -> "sf"
I have written the following:
mappingList <- c("tt", "ff", "ss", "fs", "sf")
df$A <- unlist(lapply(df$A, function(x){replace(x, x>0, mappingList[x])}))
df
A B
1 tt 17
2 ff 5
3 ss 3
4 fs 11
5 sf 19
6 tt 16
7 ff 4
8 ss 7
9 fs 6
10 sf 9
The code as above worked fine.
Now let's assume another dataframe where column A is not made of integers 1,2,3,4,5 but rather any other 'generic' items, say:
df <- data.frame(A = paste("str",1:5,sep=""), B = sample(1:20, 10))
or
df <- data.frame(A = seq(5, 25, by=5), B = sample(1:20, 10))
Question: How would you write the mapping ?
Did you look at factor?
df$A_2 <- factor(df$A, levels = 1:5, labels = c("tt", "ff", "ss", "fs", "sf"))
df
# A B A_2
# 1 1 17 tt
# 2 2 5 ff
# 3 3 3 ss
# 4 4 11 fs
# 5 5 19 sf
# 6 1 16 tt
# 7 2 4 ff
# 8 3 7 ss
# 9 4 6 fs
# 10 5 9 sf
Basically, your levels argument should have the original values to match, and your labels argument should have the replacement values.
You could also create a look-up table with a named vector.
Example:
df <- data.frame(A = paste0("str", 1:5), B = sample(1:20, 10))
# Look-up table: the *names* are the original values, the *elements* are the
# replacements, so indexing by an original value returns its replacement.
NamedVec <- setNames(c("tt", "ff", "ss", "fs", "sf"), paste0("str", 1:5))
NamedVec
# str1 str2 str3 str4 str5
# "tt" "ff" "ss" "fs" "sf"
# as.character() guards against A being a factor, which would otherwise be
# used as integer codes rather than matched by label.
NamedVec[as.character(df$A)]
# str1 str2 str3 str4 str5 str1 str2 str3 str4 str5
# "tt" "ff" "ss" "fs" "sf" "tt" "ff" "ss" "fs" "sf"
unname(NamedVec[as.character(df$A)])
# [1] "tt" "ff" "ss" "fs" "sf" "tt" "ff" "ss" "fs" "sf"
Try:
mappingList[df$A]
#[1] "tt" "ff" "ss" "fs" "sf" "tt" "ff" "ss" "fs" "sf"
For the two other datasets:
df1 <- data.frame(A = paste("str",1:5,sep=""), B = sample(1:20, 10))
df2 <- data.frame(A = seq(5, 25, by=5), B = sample(1:20, 10))
# Go through factor() explicitly: as.numeric() on the character values
# "str1", ... would give NA; the factor codes (sorted levels) give 1..5.
mappingList[as.numeric(factor(df1$A))]
#[1] "tt" "ff" "ss" "fs" "sf" "tt" "ff" "ss" "fs" "sf"
mappingList[as.numeric(factor(df2$A))]
#[1] "tt" "ff" "ss" "fs" "sf" "tt" "ff" "ss" "fs" "sf"

Find names of columns which contain missing values

I want to find all the names of columns with NA or missing data and store these column names in a vector.
# create matrix
a <- c(1,2,3,4,5,NA,7,8,9,10,NA,12,13,14,NA,16,17,18,19,20)
cnames <- c("aa", "bb", "cc", "dd", "ee")
mymatrix <- matrix(a, nrow = 4, ncol = 5, byrow = TRUE)
colnames(mymatrix) <- cnames
mymatrix
# aa bb cc dd ee
# [1,] 1 2 3 4 5
# [2,] NA 7 8 9 10
# [3,] NA 12 13 14 NA
# [4,] 16 17 18 19 20
The desired result: columns "aa" and "ee".
My attempt:
bad <- character()
for (j in 1:4){
tmp <- which(colnames(mymatrix[j, ]) %in% c("", "NA"))
bad <- tmp
}
However, I keep getting integer(0) as my output. Any help is appreciated.
Like this?
colnames(mymatrix)[colSums(is.na(mymatrix)) > 0]
# [1] "aa" "ee"
Or as suggested by @thelatemail:
names(which(colSums(is.na(mymatrix)) > 0))
# [1] "aa" "ee"
R 3.1 introduced an anyNA function, which is more convenient and faster:
colnames(mymatrix)[ apply(mymatrix, 2, anyNA) ]
Old answer:
If it's a very long matrix, apply + any can short circuit and run a bit faster.
apply(is.na(mymatrix), 2, any)
# aa bb cc dd ee
# TRUE FALSE FALSE FALSE TRUE
colnames(mymatrix)[apply(is.na(mymatrix), 2, any)]
# [1] "aa" "ee"
If you have a data frame with non-numeric columns, this solution is more general (building on previous answers):
R 3.1 +
# Iterate over columns, not cells: coerce to a data frame first so this works
# for both a matrix and a data frame (sapply over a matrix visits each element).
names(which(vapply(as.data.frame(mymatrix), anyNA, logical(1))))
or
# Same column-wise check without anyNA(); as.data.frame() makes the iteration
# run over columns even when the input is a matrix.
names(which(vapply(as.data.frame(mymatrix), function(x) any(is.na(x)), logical(1))))

split columns in to two while recoding in r

I have following data:
set.seed(123)
M1 <- c(sample(c("AA", "AB", "BB"), 5, replace = T))
M2k <- c(sample (c("AG", "GG", "AA"), 5, replace = T))
M3l <- c(sample (c("AT", "TT", "AA"), 5, replace = T))
M4 <- c(sample (c("CT", "TT", "CC"), 5, replace = T))
#in real data M1 .......M1000
myd <- data.frame (M1, M2k, M3l, M4)
I want split each M into two M1a, M1b for M1, M2ka, M2kb for M2k and so on. Similarly content of cell will be split AB will be A in M1a column and another M1b column. Also I want to re-code A = 1, B = 2, C = 3, G = 4, T = 5, else = NA.
EDIT reshape::colsplit will split by ''
Using reshape::colsplit.
library(reshape)
# Split one column of `data` with reshape::colsplit().
#
# .col: name of the column to split (character scalar).
# data: data frame (or list) holding that column.
# Returns the colsplit() result, with the new columns named <.col>a and <.col>b.
split_col <- function(.col, data) {
  # Return the result directly instead of assigning it to an unused local,
  # which made the function return its value invisibly.
  colsplit(data[[.col]], names = paste0(.col, letters[1:2]))
}
# split each column and combine
new_data <- do.call(cbind,lapply(names(myd), split_col, data = myd))
# convert each new column to a factor with levels 1:5 as requested.
new_data_2 <- do.call(data.frame,
lapply(new_data, factor, levels = c('A','B','C','G','T'), labels= 1:5))
M1a M1b M2ka M2kb M3la M3lb M4a M4b
1 1 1 1 4 1 1 3 3
2 2 2 4 4 5 5 3 5
3 1 2 1 1 1 1 3 5
4 2 2 4 4 5 5 3 5
5 2 2 4 4 1 5 3 3
Here is another possible solution, with no particular advantage except that I find it easy to follow:
myd$M5 <- c("AB", "GT", "GA", "QW", "CK") # Extra test column containing unknown letters.
mat <- as.matrix(myd) # Work on a matrix for speed and simple indexing.
n_col <- ncol(mat)
# Build the new column names, interleaved as <name>a, <name>b, ...
new_names <- as.vector(rbind(paste0(colnames(mat), "a"), paste0(colnames(mat), "b")))
# Allocate the result matrix (two output columns per input column).
newmat <- matrix(ncol = n_col * 2, nrow = nrow(mat))
colnames(newmat) <- new_names
# First character goes to the odd column, second character to the even one.
for (j in seq_len(n_col)) {
  newmat[, 2 * j - 1] <- substr(mat[, j], 1, 1)
  newmat[, 2 * j] <- substr(mat[, j], 2, 2)
}
# Recode through a named look-up vector; letters without an entry become NA.
recode <- c(A = 1, B = 2, C = 3, G = 4, T = 5)
newmat[] <- recode[newmat]
newmat
# M1a M1b M2ka M2kb M3la M3lb M4a M4b M5a M5b
# [1,] "1" "1" "1" "4" "1" "1" "3" "3" "1" "2"
# [2,] "2" "2" "4" "4" "5" "5" "3" "5" "4" "5"
# [3,] "1" "2" "1" "1" "1" "1" "3" "5" "4" "1"
# [4,] "2" "2" "4" "4" "5" "5" "3" "5" NA NA
# [5,] "2" "2" "4" "4" "1" "5" "3" "3" "3" NA
mnel already gave a pretty straightforward answer. This is me playing with my in-progress package (qdap), which is on GitHub though not on CRAN yet:
To Install qdap
# install.packages("devtools")
library(devtools)
# Use the "user/repo" form: a second positional argument is interpreted as the
# git ref by current devtools, not as the GitHub user name.
install_github("trinker/qdap")
Solving the problem:
lapply(seq_along(myd), function(i){
myd <<- colsplit2df(myd, (i+i-1), paste0(names(myd)[i+i-1],
letters[1:2]), sep="")
})
data.frame(apply(myd, 2, function(x) as.numeric(text2color(x,
c("A", "B", "C", "G", "T"), c(1:5, NA)))))
The workhorses of this code are colsplit2df (returns a data.frame) and text2color (designed to recode text for wordcloud coloring; really a dictionary lookup tool). This really isn't what these functions are designed to do; I'm just playing and seeing how they can be extended.
Using qdap with more stable solution:
x <- colsplit2df(myd, seq_len(ncol(myd)), sep = "")
# Name the split columns M1a, M1b, M2ka, ... with no separator, matching the
# requested layout (sep = "." would have produced M1.a, M1.b, ...).
colnames(x) <- paste0(rep(colnames(myd), each = 2), letters[1:2])
## M1a M1b M2ka M2kb M3la M3lb M4a M4b
## 1 1 1 1 4 1 1 3 3
## 2 2 2 4 4 5 5 3 5
## 3 1 2 1 1 1 1 3 5
## 4 2 2 4 4 5 5 3 5
## 5 2 2 4 4 1 5 3 3

Resources