A function that allows referral to columns with both index and name - r

I tried to create an easier way to refer to columns with the function below, by allowing both indexes and names. See also link.
So this one works:
df <- data.table::fread("a b c d e f g h i j
1 2 3 4 5 6 7 8 9 10",
header = TRUE)
columns <- c(1:8, "i", 9, "j")
# Convert a mix of column positions and column names into column positions.
# In the example call, `columns` is a character vector, because c(1:8, "i", ...)
# coerces every element to character.
col2num <- function(df, columns){
# Entries that are not numbers become NA (this is what triggers the coercion warning).
nums <- as.numeric(columns)
# NOTE(review): `==` compares names(df) with the name entries position by position,
# recycling the shorter vector; it does not look each name up individually.
# With c("i", "j") the recycled values happen to line up with columns 9 and 10.
nums[is.na(nums)] <- which(names(df)==columns[is.na(nums)])
return(nums)
}
col2num(df, columns)
#> Warning in col2num(df, columns): NAs introduced by coercion
#> [1] 1 2 3 4 5 6 7 8 9 9 10
And this one works too:
# Convert a mix of column positions and column names into column names.
col2name <- function(df, columns){
# Entries that are not numbers become NA (this is what triggers the coercion warning).
nums <- as.numeric(columns)
# NOTE(review): element-wise comparison with recycling, not a per-name lookup;
# it only gives the right positions when the names line up with names(df).
nums[is.na(nums)] <- which(names(df)==columns[is.na(nums)])
# Index names(df) by position to get the names back.
return(names(df)[nums])
}
col2name(df, columns)
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "i" "j"
Warning message:
In col2name(df, columns) : NAs introduced by coercion
But when I do the following, it no longer works:
columns <- c(1:7, "j", 8, "i")
# Same definition as before; it fails for columns <- c(1:7, "j", 8, "i").
col2name <- function(df, columns){
nums <- as.numeric(columns)
# The name entries are c("j", "i"). Recycled against names(df) = a..j they are
# compared as a=="j", b=="i", ..., i=="j", j=="i", so nothing matches,
# which() returns integer(0), and the assignment stops with
# "replacement has length zero".
nums[is.na(nums)] <- which(names(df)==columns[is.na(nums)])
return(names(df)[nums])
}
col2name(df, columns)
Error in nums[is.na(nums)] <- which(names(df) == columns[is.na(nums)]) :
replacement has length zero
Also, this one does not work:
columns <- c("a", "j", 8, "i")
# Same definition again; for columns <- c("a", "j", 8, "i") it returns wrong names.
col2name <- function(df, columns){
nums <- as.numeric(columns)
# The name entries are c("a", "j", "i"). Recycled against the ten names of df,
# only positions 1 ("a") and 9 ("i") line up, so which() returns c(1, 9).
# Those two values are then recycled into the three NA slots as 1, 9, 1,
# giving nums = 1, 9, 8, 1 and hence "a" "i" "h" "a".
nums[is.na(nums)] <- which(names(df)==columns[is.na(nums)])
return(names(df)[nums])
}
col2name(df, columns)
[1] "a" "i" "h" "a"
How can I fix this?

We just need to loop over the columns:
# Translate a mix of column positions and column names into column positions.
#
# df:      an object whose names() are its column names (e.g. a data.frame).
# columns: positions and/or names; mixing them with c() yields a character
#          vector such as c("1", "2", "i").
# Returns a numeric vector of positions in the same order as `columns`.
# Stops with an error if a name is not present in names(df).
col2num <- function(df, columns) {
  columns <- as.character(columns)
  # Names are expected to coerce to NA, so silence only this one coercion.
  nums <- suppressWarnings(as.numeric(columns))
  is_name <- is.na(nums)
  # match() looks up every name individually (first hit), independent of order.
  nums[is_name] <- match(columns[is_name], names(df))
  if (anyNA(nums)) {
    stop(
      "Unknown column name(s): ",
      paste(columns[is.na(nums)], collapse = ", "),
      call. = FALSE
    )
  }
  nums
}
# Translate a mix of column positions and column names into column names.
#
# df:      an object whose names() are its column names (e.g. a data.frame).
# columns: positions and/or names; mixing them with c() yields a character
#          vector such as c("1", "2", "i").
# Returns a character vector of column names in the same order as `columns`.
# Stops with an error if a name is not present in names(df).
col2name <- function(df, columns) {
  columns <- as.character(columns)
  # Names are expected to coerce to NA, so silence only this one coercion.
  nums <- suppressWarnings(as.numeric(columns))
  is_name <- is.na(nums)
  # match() looks up every name individually (first hit), independent of order.
  nums[is_name] <- match(columns[is_name], names(df))
  if (anyNA(nums)) {
    stop(
      "Unknown column name(s): ",
      paste(columns[is.na(nums)], collapse = ", "),
      call. = FALSE
    )
  }
  names(df)[nums]
}
columns1 <- c(1:8, "i", 9, "j")
columns2 <- c(1:7, "j", 8, "i")
suppressWarnings(col2name(df, columns1))
#> [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "i" "j"
suppressWarnings(col2num(df, columns1))
#> [1] 1 2 3 4 5 6 7 8 9 9 10
suppressWarnings(col2num(df, columns2))
#> [1] 1 2 3 4 5 6 7 10 8 9
suppressWarnings(col2name(df, columns2))
#> [1] "a" "b" "c" "d" "e" "f" "g" "j" "h" "i"
I am using suppressWarnings to avoid getting the following warnings each time I run the function:
Warning messages:
1: In col2name(df, columns) : NAs introduced by coercion
2: In lapply(X = X, FUN = FUN, ...) : NAs introduced by coercion

An alternative with the disadvantage of necessitating that the data be a data.frame object:
# Select columns of a data.frame given a mix of positions and names.
#
# df:   a data.frame (list-style `df[i]` indexing is used to pick columns).
# cols: positions and/or names; mixing them with c() yields a character vector.
# Returns the selected columns as a data.frame, in the order requested and
# with repeated requests kept (duplicated names are made unique by `[`).
# Stops with an error if a name is not present in names(df).
indexr <- function(df, cols) {
  cols <- as.character(cols)
  # Anything that does not parse as a number is treated as a column name;
  # names are expected to coerce to NA, so silence only this one coercion.
  nums <- suppressWarnings(as.numeric(cols))
  is_name <- is.na(nums)
  # Fill positions in place so the requested order is preserved.
  nums[is_name] <- match(cols[is_name], names(df))
  if (anyNA(nums)) {
    stop(
      "Unknown column name(s): ",
      paste(cols[is.na(nums)], collapse = ", "),
      call. = FALSE
    )
  }
  df[nums]
}
indexr(iris,c(1,"Sepal.Width"))
Sepal.Length Sepal.Width
1 5.1 3.5
2 4.9 3.0
3 4.7 3.2
With your data (the drawback is that we go back to a data.frame; one might define a method to handle that):
data.table::setDF(df)
indexr(df,columns)
a b c d e f g h i i.1 j
1 1 2 3 4 5 6 7 8 9 9 10
Edit: To return names instead:
# Resolve a mix of column positions and names to the names of the selected
# columns of a data.frame.
#
# df:   a data.frame (list-style `df[i]` indexing is used to pick columns).
# cols: positions and/or names; mixing them with c() yields a character vector.
# Returns a character vector of column names, in the order requested and with
# repeated requests kept (duplicated names are made unique by `[`).
# Stops with an error if a name is not present in names(df).
indexr <- function(df, cols) {
  cols <- as.character(cols)
  # Anything that does not parse as a number is treated as a column name;
  # names are expected to coerce to NA, so silence only this one coercion.
  nums <- suppressWarnings(as.numeric(cols))
  is_name <- is.na(nums)
  # Fill positions in place so the requested order is preserved.
  nums[is_name] <- match(cols[is_name], names(df))
  if (anyNA(nums)) {
    stop(
      "Unknown column name(s): ",
      paste(cols[is.na(nums)], collapse = ", "),
      call. = FALSE
    )
  }
  names(df[nums])
}
indexr(mtcars, c("mpg", 5))
#> [1] "mpg"  "drat"
indexr(df,columns)
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "i.1"
[11] "j"

Related

Within a list of vectors, convert each vector to a string then convert to dataframe in R

I have a list of vectors, j, that looks like this:
>j
[[1]]
[1] "a" "b" "c"
[[2]]
[1] "c" "c"
[[3]]
[1] "d" "d" "d" "a" "a"
.
.
.
I would like to transform this into a dataframe that has one column with each vectors contents concatenated together. So the column would look like:
Column_Name
1 a b c
2 c c
3 d d d a a
I have tried using the Replace() function, as well as the following loop followed by a conversion to a data frame:
# Attempt to collapse each list element into one string, then build a data frame.
# NOTE(review): `j[x]` (single bracket) is a one-element list, not the character
# vector stored in it, so paste() is applied to the list rather than to the vector.
for (x in 1:length(j)){
j[x] = paste(j[x], collapse = " ")
}
# NOTE(review): the closing parenthesis of data.frame( is missing on the next line.
j <- data.frame(matrix(unlist(j), nrow=length(j), byrow=T)
Any guidance would be greatly appreciated.
Thank you.
As you have tried yourself, the sapply function together with the collapse argument of paste should do it all wrapped into a data.frame:
# Toy data: five character vectors of random length (1 to 10), each repeating
# one randomly drawn letter
set.seed(1)
# simplify = FALSE guarantees a list; by default replicate() would return a
# matrix whenever all five vectors happen to have the same length
j <- replicate(5, rep(sample(letters, 1), sample(1:10, 1)), simplify = FALSE)
print(j)
#[[1]]
#[1] "g" "g" "g" "g"
#
#[[2]]
# [1] "o" "o" "o" "o" "o" "o" "o" "o" "o" "o"
#
#[[3]]
#[1] "f" "f" "f" "f" "f" "f" "f" "f" "f"
#
#[[4]]
#[1] "y" "y" "y" "y" "y" "y" "y"
#
#[[5]]
#[1] "q"
# Collapse each element and wrap into a data.frame
res <- data.frame("Column_name" = sapply(j, paste, collapse = " "))
print(res)
# Column_name
#1 g g g g
#2 o o o o o o o o o o
#3 f f f f f f f f f
#4 y y y y y y y
#5 q
The sapply applies the paste-function on each element of the list to create a character vector of the concatenated list elements. The data.frame constructor simply converts that output to the wanted output.
First provide names for the list elements, then use stack to convert the list into a data.frame. Finally, the dplyr package is used to collapse the values belonging to each list element into a single string separated by a space.
Sample data is taken from @AndersEllernBilgrau's answer.
# Toy data: a list of character vectors of random length
set.seed(1)
j <- replicate(5, rep(sample(letters, 1), sample(1:10,1)))
# Name the elements 1..n so that stack() can record which element each value came from
names(j) <- seq_along(j)
library(dplyr)
# stack() gives one row per value (columns `values` and `ind`); group by the
# originating element, paste its values into one string, then drop the grouping column
stack(j) %>% group_by(ind) %>%
summarise(Column_Name = paste0(values, collapse = " ")) %>%
ungroup() %>% select(-ind)
# # A tibble: 5 x 1
# Column_Name
# <chr>
# 1 g g g g
# 2 o o o o o o o o o o
# 3 f f f f f f f f f
# 4 y y y y y y y
# 5 q
#

splitting vector every two indices

Given vector of N elements:
LETTERS[1:10]
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J"
How can one get a data.table/frame (df) as follows?
>df
one two
A B
C D
E F
G H
I J
EDIT
Generalizing I would like to know given a vector to split as follows:
[A B C],[D E],[F G H I J]
and obtaining:
V1 V2 V3 V4 V5
A B C NA NA
D E NA NA NA
F G H I J
One option is the matrix way
as.data.frame(matrix(LETTERS[1:10], ncol=2,byrow=TRUE,
dimnames = list(NULL, c('one', 'two'))), stringsAsFactors=FALSE)
# one two
#1 A B
#2 C D
#3 E F
#4 G H
#5 I J
If we need to create an index, we can use gl to split the vector and then rbind the pieces:
do.call(rbind, split(v1, as.integer(gl(length(v1), 2, length(v1)))))
where
v1 <- LETTERS[1:10]
Update
Based on the update in OP's post
lst <- split(v1, rep(1:3, c(3, 2, 5)))
do.call(rbind, lapply(lst, `length<-`, max(lengths(lst))))
# [,1] [,2] [,3] [,4] [,5]
#1 "A" "B" "C" NA NA
#2 "D" "E" NA NA NA
#3 "F" "G" "H" "I" "J"
Or otherwise
library(stringi)
stri_list2matrix(lst, byrow = TRUE)
Update2
If we are using a 'splitVec'
lst <- split(v1, cumsum(seq_along(v1) %in% splitVec))
and then proceed as above

paste header and first 2 rows with a new line in R

I have 3 rows
first <- LETTERS[1:9]
second <- c(1:9)
third <- letters[1:9]
> first
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I"
> second
[1] 1 2 3 4 5 6 7 8 9
> third
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i"
I want an output vector of 9 elements, each spanning several lines, like this:
"A "B
1 2
a" b" and so on
I tried to use a loop, but it doesn't work:
# Attempt to join the three vectors element-wise with newline separators.
y <-c()
z <-c()
# NOTE(review): the loop index i is never used in the body; paste() already works
# on whole vectors, so every iteration recomputes the same y and z.
for(i in 1:length(first)){
y <- paste(first, second, sep = "\n")
z <- paste(y, third, sep = "\n")
}
I got this output:
[1] "A\n1\na" "B\n2\nb" "C\n3\nc" "D\n4\nd"
[5] "E\n5\ne" "F\n6\nf" "G\n7\ng" "H\n8\nh"
[9] "I\n9\ni"
This does not give me an actual new line. My report needs colnames(data) in that format. Thanks in advance.
Am I understanding you correctly that you are looking for this:
first <- LETTERS[1:9]
second <- c(1:9)
third <- letters[1:9]
cat(c(first, "\n", second, "\n", third))
A B C D E F G H I
1 2 3 4 5 6 7 8 9
a b c d e f g h i
?
EDIT:
Will this solve your problem?
Printing R data frame with column names in multiple lines

Gather connected IDs across different rows of data frame

Given an R data frame like this:
DF.a <- data.frame(ID1 = c("A","B","C","D","E","F","G","H"),
ID2 = c("D",NA,"G",NA,NA,NA,"H",NA),
ID3 = c("F",NA,NA,NA,NA,NA,NA,NA))
> DF.a
ID1 ID2 ID3
1 A D F
2 B <NA> <NA>
3 C G <NA>
4 D <NA> <NA>
5 E <NA> <NA>
6 F <NA> <NA>
7 G H <NA>
8 H <NA> <NA>
I would like to simplify/reshape it into the following:
DF.b <- data.frame(ID1 = c("A","B","C","E"),
ID2 = c("D",NA,"G",NA),
ID3 = c("F",NA,"H",NA))
> DF.b
ID1 ID2 ID3
1 A D F
2 B <NA> <NA>
3 C G H
4 E <NA> <NA>
It does not seem like a straightforward reshape. The goal is to get all "connected" ID values together on a single row. Note how the connection between "C" and "H" is indirect, as both are connected to "G", but they don't appear together on the same row of DF.a. The order of the ID values in rows of DF.b does not matter.
Really you could think of this as trying to get all the connected components of a graph. The first step I would take would be to convert your data into a more natural structure -- a vector of nodes and matrix of edges:
(nodes <- as.character(sort(unique(unlist(DF.a)))))
# [1] "A" "B" "C" "D" "E" "F" "G" "H"
(edges <- do.call(rbind, apply(DF.a, 1, function(x) {
x <- x[!is.na(x)]
cbind(head(x, -1), tail(x, -1))
})))
# [,1] [,2]
# ID1 "A" "D"
# ID2 "D" "F"
# ID1 "C" "G"
# ID1 "G" "H"
Now you are ready to build a graph and compute its components:
library(igraph)
g <- graph.data.frame(edges, FALSE, nodes)
(comp <- split(nodes, components(g)$membership))
# $`1`
# [1] "A" "D" "F"
#
# $`2`
# [1] "B"
#
# $`3`
# [1] "C" "G" "H"
#
# $`4`
# [1] "E"
The output of the split function is a list, where each list element is all the nodes in one of the components of the graph. Personally I think this is the most useful representation of the output data, but if you really wanted the NA-padded structure you describe you could try something like:
# Pad every component with NA up to the size of the largest component, then
# stack the components as rows of one matrix.
# lengths() always returns an integer vector, unlike sapply(comp, length).
max.len <- max(lengths(comp))
do.call(rbind, lapply(comp, function(x) { length(x) <- max.len ; x }))
# [,1] [,2] [,3]
# 1 "A" "D" "F"
# 2 "B" NA NA
# 3 "C" "G" "H"
# 4 "E" NA NA

Unlist data frame column preserving information from other column

I have a data frame which consists of two column: a character vector col1 and a list column, col2.
# Example data: a data frame with a row id, a character column (col1) and a
# list column (col2) whose elements are numeric vectors of different lengths.
myVector <- LETTERS[1:4]
myList <- list(
  c(1, 4, 6, 7),
  c(2, 7, 3),
  c(5, 5, 3, 9, 6),
  c(7, 9)
)
myDataFrame <- data.frame(row = as.numeric(1:4))
# Added after construction so that col2 is stored as a list column
myDataFrame$col1 <- myVector
myDataFrame$col2 <- myList
myDataFrame
# row col1 col2
# 1 1 A 1, 4, 6, 7
# 2 2 B 2, 7, 3
# 3 3 C 5, 5, 3, 9, 6
# 4 4 D 7, 9
I want to unlist my col2 still keeping for each element of the vectors in the list the information stored in col1. To phrase it differently, in commonly used data frame reshape terminology: the "wide" list column should be converted to a "long" format.
Then at the end of the day I want two vectors of length equal to length(unlist(myDataFrame$col2)). In code:
# unlist myList
unlist.col2 <- unlist(myDataFrame$col2)
unlist.col2
# [1] 1 4 6 7 2 7 3 5 5 3 9 6 7 9
# unlist myVector to obtain
# unlist.col1 <- ???
# unlist.col1
# [1] A A A A B B B C C C C C D D
I can't think of any straightforward way to get it.
You may also use unnest from package tidyr:
library(tidyr)
unnest(myDataFrame, col2)
# row col1 col2
# (dbl) (chr) (dbl)
# 1 1 A 1
# 2 1 A 4
# 3 1 A 6
# 4 1 A 7
# 5 2 B 2
# 6 2 B 7
# 7 2 B 3
# 8 3 C 5
# 9 3 C 5
# 10 3 C 3
# 11 3 C 9
# 12 3 C 6
# 13 4 D 7
# 14 4 D 9
You can use the "data.table" to expand the whole data.frame, and extract the column of interest.
library(data.table)
## expand the entire data.frame (uncomment to see)
# as.data.table(myDataFrame)[, unlist(col2), by = list(row, col1)]
## expand and select the column of interest:
as.data.table(myDataFrame)[, unlist(col2), by = list(row, col1)]$col1
# [1] "A" "A" "A" "A" "B" "B" "B" "C" "C" "C" "C" "C" "D" "D"
In newer versions of R, you can now use the lengths function instead of the sapply(list, length) approach. The lengths function is considerably faster.
with(myDataFrame, rep(col1, lengths(col2)))
# [1] "A" "A" "A" "A" "B" "B" "B" "C" "C" "C" "C" "C" "D" "D"
Here, the idea is to first get the length of each list element using sapply and then use rep to replicate the col1 with that length
l1 <- sapply(myDataFrame$col2, length)
unlist.col1 <- rep(myDataFrame$col1, l1)
unlist.col1
#[1] "A" "A" "A" "A" "B" "B" "B" "C" "C" "C" "C" "C" "D" "D"
Or, as suggested by @Ananda Mahto, the above could also be done with vapply:
with(myDataFrame, rep(col1, vapply(col2, length, 1L)))
#[1] "A" "A" "A" "A" "B" "B" "B" "C" "C" "C" "C" "C" "D" "D"

Resources