R: Convert delimited string into variables - r

I have a data frame with a column containing a space-delimited list of character codes:
"Ab B C"
""
"X C"
"N Ab F S"
:
I want to convert this into multiple columns, one for each distinct value, indicating (with 1 or 0) that the value was found in the list. Desired result given the above example:
df$Ab = 1,0,0,1
df$B = 1,0,0,0
df$C = 1,0,1,0
df$F = 0,0,0,1
df$N = 0,0,0,1
What is the best way to do this?

Assuming you are starting with:
df <- data.frame(v1 = c("Ab B C", "", "X C", "N Ab F S"))
You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
cSplit_e(df, "v1", sep = " ", type = "character", fill = 0)
# v1 v1_Ab v1_B v1_C v1_F v1_N v1_S v1_X
# 1 Ab B C 1 1 1 0 0 0 0
# 2 0 0 0 0 0 0 0
# 3 X C 0 0 1 0 0 0 1
# 4 N Ab F S 1 0 0 1 1 1 0

You can try
library(qdapTools)
lst <- strsplit(df1$Col1, ' ')
cbind(df1, mtabulate(lst))
# Col1 Ab B C F N S X
#1 Ab B C 1 1 1 0 0 0 0
#2 0 0 0 0 0 0 0
#3 X C 0 0 1 0 0 0 1
#4 N Ab F S 1 0 0 1 1 1 0
Or using base R
lvls <- sort(unique(unlist(lst)))
cbind(df1, t(vapply(lst, function(x) table(factor(x, levels=lvls)),
numeric(length(lvls)))))
data
df1 <- structure(list(Col1 = c("Ab B C", "", "X C", "N Ab F S")),
.Names = "Col1", row.names = c(NA, -4L), class = "data.frame")

In base R, another approach:
lst = strsplit(df$Col1, ' ')
cols = unique(unlist(lst))
m = do.call(rbind, lapply(lst, function(u) cols %in% u +0))
colnames(m) = cols
#> m
# Ab B C X N F S
#[1,] 1 1 1 0 0 0 0
#[2,] 0 0 0 0 0 0 0
#[3,] 0 0 1 1 0 0 0
#[4,] 1 0 0 0 1 1 1

Related

What code should be written to create a new variable from observations containing the same content in the R column?

My variable is as follows
variable
D
D
B
C
B
D
C
C
D
I want to make the column in the above figure below
variable
B
C
D
D
0
0
1
D
0
0
1
B
1
0
0
C
0
1
0
B
1
0
0
D
0
0
1
C
0
1
0
C
0
1
0
D
0
0
1
But I don't want a code like the one below. Because the number of factors in the variable column is too many
data = data %>% mutate(B=ifelse(variable=="B", 1,0),
C=ifelse(variable=="C", 1,0),
D=ifelse(variable=="D", 1,0))
Here is a base R approach. We can first find all unique variable values from the data frame. Then, sapply over that vector and generate a new column for each value. Finally, we can rbind this new data frame of 0/1 valued columns to the original data frame.
cols <- sort(unique(df$variable))
df2 <- sapply(cols, function(x) ifelse(df$variable == x, 1, 0))
df <- cbind(df, df2)
df
variable B C D
1 D 0 0 1
2 D 0 0 1
3 B 1 0 0
4 C 0 1 0
5 B 1 0 0
6 D 0 0 1
7 C 0 1 0
8 C 0 1 0
9 D 0 0 1
Data:
df <- data.frame(variable=c("D", "D", "B", "C", "B",
"D", "C", "C", "D"),
stringsAsFactors=FALSE)
Try this with reshaping and duplicating the original variable in order to have a reference for values. Then, you can reshape to obtain the expected output:
library(dplyr)
library(tidyr)
#Code
new <- df %>% mutate(Var=variable,Val=1,id=row_number()) %>%
pivot_wider(names_from = Var,values_from=Val,values_fill = 0) %>%
select(-id)
Output:
# A tibble: 9 x 4
variable D B C
<chr> <dbl> <dbl> <dbl>
1 D 1 0 0
2 D 1 0 0
3 B 0 1 0
4 C 0 0 1
5 B 0 1 0
6 D 1 0 0
7 C 0 0 1
8 C 0 0 1
9 D 1 0 0
Some data used:
#Data
df <- structure(list(variable = c("D", "D", "B", "C", "B", "D", "C",
"C", "D")), class = "data.frame", row.names = c(NA, -9L))
1) model.matrix
model.matrix will generate column names like variableB so the last line removes the variable part to ensure that the column names are exactly the same as in the question. Omit the last line if it is not important that the column names be exactly as shown there.
dat2 <- cbind(dat, model.matrix(~ variable - 1, dat))
names(dat2) <- sub("variable(.)", "\\1", names(dat2))
giving:
> dat2
variable B C D
1 D 0 0 1
2 D 0 0 1
3 B 1 0 0
4 C 0 1 0
5 B 1 0 0
6 D 0 0 1
7 C 0 1 0
8 C 0 1 0
9 D 0 0 1
2) outer
This can also be done using outer as shown. Each component of variable is compared to each level. We name the levels so that outer uses them as column names. The output is the same.
levs <- sort(unique(dat$variable))
names(levs) <- levs
cbind(dat, +outer(dat$variable, levs, `==`))
Note
The input in reproducible form:
Lines <- "
variable
D
D
B
C
B
D
C
C
D"
dat <- read.table(text = Lines, header = TRUE)

How to match characters change in R

I have below-mentioned dataframe in R:
ID source_field_1 field_1 source_field_3 field_3
ER-1 AC45U CD34I 1992-01-23 23/01/1992
ER-2 AB15X 1971-01-23 23/1/1971
ER-3 DB22U AC22Z 1962-11-13 3/11/1962
ER-4 CF12R BA23D 1992-01-23 23/01/1992
I need a group by count of change of characters from column source_field_1 to field_1, from A to Z and from 0 to 9.
Required Output:
source_field_1 A B C D E . . . Z
A 1
B 1
C 1 1
D 1
E
F 1
.
. 1
. 1
Z
Need the same structure for numerical characters as well for both field_1 and field_3.
df1 <- na.omit(df)
create <- function(from,to,nm)
{
s <- sprintf("[^%s]",paste0(nm,collapse = ""))
from <- unlist(strsplit(gsub(s,"",from),""))
to <- unlist(strsplit(gsub(s,"",to),""))
table(from,to)
}
create(df1$source_field_1,df1$field_1,0:9)
to
from 2 3 4
1 1 0 0
2 2 1 0
4 0 1 0
5 0 0 1
create(df1$source_field_1,df1$field_1,LETTERS)
to
from A B C D I Z
A 0 0 1 0 0 0
B 0 0 1 0 0 0
C 0 1 0 1 0 0
D 1 0 0 0 0 0
F 1 0 0 0 0 0
R 0 0 0 1 0 0
U 0 0 0 0 1 1
This is rather simple to achieve by splitting up each character and using the table function.
library(stringr)
df <- [your df]
out <- vector('list', nrow(df))
for(i in seq_along(out)){
#Split both columns
splitted_str <- str_split(unlist(df[i, c('source_field_1', 'field_1')]), '')
#Alternative in base R:
#gsub(LETTERS, '', unlist(df[i, c('source_field_1', 'field_1')]))
#convert to factors, "levels" will be used in our columns
splitted_str <- lapply(splitted_str, factor, levels = LETTERS)
#Create table. dnn sets the names shown for column/rows
out[[i]] <- table(splitted_str, dnn = c('source_field_1', 'field_1'))
}
note that i abuse the fact that factor(...) sets all values not in levels to NA, and by default table(...) excludes these in the table.
Obviously this could all be combined into a single line
out <- lapply(seq(nrow(df)),
function(x) table(lapply(str_split(unlist(df[i, c('source_field_1', 'field_1')]), ''), factor, levels = LETTERS), dnn = c('source_field_1', 'field_1'))
)

Concatenate column names based on row-wise values

I have an R dataframe with 3 columns containing values 0 or 1. I need to create a column as the concatenation of column names when the value is 1 separated by '&'. The following code works with empty space '' as the separator but fails when I change it to '&'.
Code:
A = c(1,0,1,0,0,1)
B = c(1,1,1,0,1,0)
C = c(0,0,0,1,1,1)
data = data.frame(A, B, C)
data$New = paste(ifelse(data$A == 1, "A", ""),
ifelse(data$B == 1, "B", ""),
ifelse(data$C == 1, "C", ""), sep = '')
data
Output:
A B C New
1 1 1 0 AB
2 0 1 0 B
3 1 1 0 AB
4 0 0 1 C
5 0 1 1 BC
6 1 0 1 AC
Code & Output with '&' Separator:
A = c(1,0,1,0,0,1)
B = c(1,1,1,0,1,0)
C = c(0,0,0,1,1,1)
data = data.frame(A, B, C)
data$New = paste(ifelse(data$A == 1, "A", ""),
ifelse(data$B == 1, "B", ""),
ifelse(data$C == 1, "C", ""), sep = '&')
data
A B C New
1 1 1 0 A&B&
2 0 1 0 &B&
3 1 1 0 A&B&
4 0 0 1 &&C
5 0 1 1 &B&C
6 1 0 1 A&&C
Expected Output:
A B C New
1 1 1 0 A&B
2 0 1 0 B
3 1 1 0 A&B
4 0 0 1 C
5 0 1 1 B&C
6 1 0 1 A&C
Is there a way to do this in R?
In case of a large number of columns, is there a way to do the same without writing explicit ifelse condition on each column?
We can subset the names by looping through the rows
data$New <- apply(data[1:3], 1, function(x) paste(names(x[x!=0]), collapse="&"))
data$New
#[1] "A&B" "B" "A&B" "C" "B&C" "A&C"
it can also be done column wise
library(tidyverse)
data[1:3] %>%
na_if(0) %>%
`*`(col(.)) %>%
imap(~ rep(.y, length(.x))[.x]) %>%
reduce(paste, sep= "&") %>%
str_remove("(NA&)+|(&NA)+") %>%
str_remove("&NA")
#[1] "A&B" "B" "A&B" "C" "B&C" "A&C"
You can use apply with paste to do it.
nms <- names(data)
data$New <- apply(data, 1, function(x){
paste(nms[as.logical(x)], collapse = "&")
})
data
# A B C New
#1 1 1 0 A&B
#2 0 1 0 B
#3 1 1 0 A&B
#4 0 0 1 C
#5 0 1 1 B&C
#6 1 0 1 A&C
Using which with arr.ind = TRUE, and then aggregate:
cbind(data,
new = aggregate(col ~ row, data = which(data == 1, arr.ind = TRUE),
function(x) paste(names(data)[x], collapse = "&"))[ , "col"])
# A B C new
# 1 1 1 0 A&B
# 2 0 1 0 B
# 3 1 1 0 A&B
# 4 0 0 1 C
# 5 0 1 1 B&C
# 6 1 0 1 A&C
Similar, using tapply:
ix <- which(data == 1, arr.ind = TRUE)
cbind(data,
new = tapply(ix[ , "col"], ix[ , "row"],
function(x) paste(names(data)[x], collapse = "&")))

multicolumn csv text into presence absence matri

I have a csv file like this:
col1 col2 col3
r1 a,b,c e,f g
r2 h,i j,k
r3 l m,n,o
some cells have multiple text comma separated, some have single and some have none.I want to convert this like:
col1 col2 col3
a 1 0 0
b 1 0 0
c 1 0 0
e 0 1 0
f 0 1 0
g 0 0 1
h 1 0 0
i 1 0 0
j 0 0 1
k 0 0 1
l 1 0 0
m 0 1 0
n 0 1 0
o 0 1 0
Any suggestion? I tried pivot table in excel but not getting the desired output.
Thanks in advance.
Best Regards
Zillur
not sure whether this is the shortest solution (probably not) but it produces the desired output. Basically, we go through all three columns and count the occurrences of the strings and get a long format data frame that we then flip to the wide format you want.
library(tidyr)
library(purrr)
df <- data_frame(col1 = c("a,b,c", "h,i", "l"),
col2 = c("e,f", "", "m,n,o"),
col3 = c("g", "j,k", ""))
let_df <- map_df(df, function(col){
# map_df applies the function to each column of df
# split strings at "," and unlist to get vector of letters
letters <- unlist(str_split(col, ","))
# delete ""
letters <- letters[nchar(letters) > 0]
# count occurrences for each letter
tab <- table(letters)
# replace with 1 if occurs more often
tab[tab > 1] <- 1
# create data frame from table
df <- data_frame(letter = names(tab), count = tab)
return(df)
}, .id = "col") # id adds a column col that contains col1 - col3
# bring data frame into wide format
let_df %>%
spread(col, count, fill = 0)
Such a great problem to solve. Here is my take on it in base R:
col1 <- c("a,b,c","h,i","l")
col2 <- c("e,f","","m,n,o")
col3 <- c("g","j,k","")
data <- data.frame(col1, col2, col3, stringsAsFactors = F)
restructure <- function(df){
df[df==""] <- "missing"
result_rows <- as.character()
l <- list()
for (i in seq_along(colnames(df)) ){
df_col <- sort(unique(unlist(strsplit(gsub(" ", "",toString(df[[i]])), ","))))
df_col <- df_col[!df_col %in% "missing"]
result_rows <- sort(unique(c(result_rows, df_col)))
l[i] <- list(df_col)
}
result <- data.frame(result_rows)
for (j in seq_along(l)){
result$temp <- NA
result$temp[match(l[[j]], result_rows)] <- 1
colnames(result)[colnames(result)=="temp"] <- colnames(df)[j]
}
result[is.na(result)] <- 0
return(result)
}
> restructure(data)
# result_rows col1 col2 col3
#1 a 1 0 0
#2 b 1 0 0
#3 c 1 0 0
#4 e 0 1 0
#5 f 0 1 0
#6 g 0 0 1
#7 h 1 0 0
#8 i 1 0 0
#9 j 0 0 1
#10 k 0 0 1
#11 l 1 0 0
#12 m 0 1 0
#13 n 0 1 0
#14 o 0 1 0

Optimizing a "set in a string list" to a "set as a matrix" operation

I have a set of strings which contain space-separated elements. I want to build a matrix which will tell me which elements were part of which strings. For example:
""
"A B C"
"D"
"B D"
Should give something like:
A B C D
1
2 1 1 1
3 1
4 1 1
Now I've got a solution, but it runs slow as molasse, and I've run out of ideas on how to make it faster:
reverseIn <- function(vector, value) {
return(value %in% vector)
}
buildCategoryMatrix <- function(valueVector) {
allClasses <- c()
for(classVec in unique(valueVector)) {
allClasses <- unique(c(allClasses,
strsplit(classVec, " ", fixed=TRUE)[[1]]))
}
resMatrix <- matrix(ncol=0, nrow=length(valueVector))
splitValues <- strsplit(valueVector, " ", fixed=TRUE)
for(cat in allClasses) {
if(cat=="") {
catIsPart <- (valueVector == "")
} else {
catIsPart <- sapply(splitValues, reverseIn, cat)
}
resMatrix <- cbind(resMatrix, catIsPart)
}
colnames(resMatrix) <- allClasses
return(resMatrix)
}
Profiling the function gives me this:
$by.self
self.time self.pct total.time total.pct
"match" 31.20 34.74 31.24 34.79
"FUN" 30.26 33.70 74.30 82.74
"lapply" 13.56 15.10 87.86 97.84
"%in%" 12.92 14.39 44.10 49.11
So my actual questions would be:
- Where are the 33% spent in "FUN" coming from?
- Would there be any way to speed up the %in% call?
I tried turning the strings into factors prior to going into the loop so that I'd be matching numbers instead of strings, but that actually makes R crash. I've also tried going for partial matrix assignment (IE, resMatrix[i,x] <- 1) where i is the number of the string and x is the vector of factors. No dice there either, as it seems to keep on running infinitely.
In the development version of my "splitstackshape" package, there's a helper function called charBinaryMat that can be used for something like this:
Here's the function (since the version of the package on CRAN doesn't have it yet):
charBinaryMat <- function(listOfValues, fill = NA) {
lev <- sort(unique(unlist(listOfValues, use.names = FALSE)))
m <- matrix(fill, nrow = length(listOfValues), ncol = length(lev))
colnames(m) <- lev
for (i in 1:nrow(m)) {
m[i, listOfValues[[i]]] <- 1
}
m
}
The input is expected to be the output of strsplit:
And here it is in use:
str <- c("" , "A B C" , "D" , "B D" )
## Fill is `NA` by default
charBinaryMat(strsplit(str, " ", fixed=TRUE))
# A B C D
# [1,] NA NA NA NA
# [2,] 1 1 1 NA
# [3,] NA NA NA 1
# [4,] NA 1 NA 1
## Can easily be set to another value
charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
# A B C D
# [1,] 0 0 0 0
# [2,] 1 1 1 0
# [3,] 0 0 0 1
# [4,] 0 1 0 1
Benchmarking
Since your question is about a faster approach, let's benchmark.
The functions for benchmarking:
CBM <- function() {
charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
}
BCM <- function() {
buildCategoryMatrix(str)*1L
}
Sapply <- function() {
y <- unique( unlist( strsplit( str , " " ) ) )
out <- t(sapply(str, function(x) y %in% unlist(strsplit(x , " " )),
USE.NAMES = FALSE )) * 1L
colnames(out) <- y
out
}
Some sample data:
set.seed(1)
A = sample(10, 100000, replace = TRUE)
str <- sapply(seq_along(A), function(x)
paste(sample(LETTERS[1:10], A[x]), collapse = " "))
head(str)
# [1] "H G C" "F H J G" "H D J A I B"
# [4] "A C F H J B E G D I" "F C H" "I C G B J D F A E"
Some sample output:
## Automatically sorted
head(CBM())
# A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1
## Sorting just for comparison
head(BCM())[, LETTERS[1:10]]
# A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1
## Sorting just for comparison
head(Sapply())[, LETTERS[1:10]]
# A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1
Benchmarking:
library(microbenchmark)
microbenchmark(CBM(), BCM(), Sapply(), times=20)
# Unit: milliseconds
# expr min lq median uq max neval
# CBM() 675.0929 718.3454 777.2423 805.3872 858.6609 20
# BCM() 11059.6305 11267.9888 11367.3283 11595.1758 11792.5950 20
# Sapply() 3536.7755 3687.0308 3759.7388 3813.4233 3968.3192 20
This is pretty easy to do with vapply:
x <- c("" , "A B C" , "D" , "B D" )
lines <- strsplit(x, " ", fixed = TRUE)
all <- sort(unique(unlist(lines)))
t(vapply(lines, function(x) all %in% x, numeric(length(all))))
This is a little slower than #Ananda's approach: https://gist.github.com/hadley/7169138
Here's one way of doing this. There is a lot going on in the line where out is assigned. Basically, we loop over each element of your input vector. We split each element into individual characters, then we look to see which of these is present in a vector of all the unique values in your dataset. This returns either TRUE or FALSE. We use * 1L at the end to turn logical values into integer but you could just wrap the whole thing in as.integer instead. sapply returns the results column-wise but you want them row-wise so we use the transpose function t() to achieve this.
The final line converts to a data.frame and applies column names.
# Data
str <- c("" , "A B C" , "D" , "B D" )
# Unique column headers (excluding empty strings as in example)
y <- unique( unlist( strsplit( str , " " ) ) )
# Results
out <- t( sapply( str , function(x) y %in% unlist( strsplit( x , " " ) ) , USE.NAMES = FALSE ) ) * 1L
# Combine to a data.frame
setNames( data.frame( out ) , y )
# A B C D
#1 0 0 0 0
#2 1 1 1 0
#3 0 0 0 1
#4 0 1 0 1

Resources