Parsing Delimited Data In a DataFrame Into Separate Columns in R - r

I have a data frame which looks as such
A B C
1 3 X1=7;X2=8;X3=9
2 4 X1=10;X2=11;X3=12
5 6 X1=13;X2=14
I would like to parse the C column into separate columns as such...
A B X1 X2 X3
1 3 7 8 9
2 4 10 11 12
5 6 13 14 NA
How would one go about doing this in R?

First, here's the sample data in data.frame form
# Sample data: two integer id columns plus a column C holding
# ";"-separated "key=value" pairs (the last row has no X3 entry).
dd <- data.frame(
  A = c(1L, 2L, 5L),
  B = c(3L, 4L, 6L),
  C = c("X1=7;X2=8;X3=9", "X1=10;X2=11;X3=12", "X1=13;X2=14"),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
Now I define a small helper function to take vectors like c("A=1","B=2") and change them into named vectors like c(A="1", B="2").
# Convert a character vector of "key=value" strings into a named character
# vector, e.g. c("A=1", "B=2") becomes c(A = "1", B = "2").
#
# x: character vector whose elements look like "key=value".
# Returns a character vector of the values, named by the keys. An element
# without "=" yields NA as its value.
namev <- function(x) {
  # "=" is matched literally, so no regular expression is needed.
  parts <- strsplit(x, "=", fixed = TRUE)
  # vapply() guarantees a character vector even for empty input, where
  # sapply() would have returned an empty list.
  keys <- vapply(parts, `[`, character(1), 1L)
  values <- vapply(parts, `[`, character(1), 2L)
  setNames(values, keys)
}
and now I perform the transformations
# Turn each row's "key=value" pieces into a named character vector.
vv <- lapply(strsplit(dd$C, ";", fixed = TRUE), namev)
# Collect every key seen in any row, in order of first appearance.
# lapply() always returns a list, whereas sapply() returns a matrix when
# all rows happen to have the same number of keys.
nm <- unique(unlist(lapply(vv, names)))
# Index every row by the full key set; keys a row lacks come back as NA.
nv <- do.call(rbind, lapply(vv, `[`, nm))
# rbind() takes column names from the first row, which would contain NA
# if that row is missing a key, so set them explicitly.
colnames(nv) <- nm
# Convert everything to numeric (optional).
storage.mode(nv) <- "double"
# Rejoin with the original data, dropping the unparsed C column.
cbind(dd[names(dd) != "C"], nv)
and that gives you
A B X1 X2 X3
1 1 3 7 8 9
2 2 4 10 11 12
3 5 6 13 14 NA

My cSplit function makes solving problems like these fun. Here it is in action:
## Load some packages
library(data.table)
library(devtools) ## Just for source_gist, really
library(reshape2)
## Load `cSplit`
source_gist("https://gist.github.com/mrdwab/11380733")
First, split your values up and create a "long" dataset:
ddL <- cSplit(cSplit(dd, "C", ";", "long"), "C", "=")
ddL
# A B C_1 C_2
# 1: 1 3 X1 7
# 2: 1 3 X2 8
# 3: 1 3 X3 9
# 4: 2 4 X1 10
# 5: 2 4 X2 11
# 6: 2 4 X3 12
# 7: 5 6 X1 13
# 8: 5 6 X2 14
Next, use dcast.data.table (or just dcast) to go from "long" to "wide":
dcast.data.table(ddL, A + B ~ C_1, value.var="C_2")
# A B X1 X2 X3
# 1: 1 3 7 8 9
# 2: 2 4 10 11 12
# 3: 5 6 13 14 NA

Here's one possible approach:
dat <- read.table(text="A B C
1 3 X1=7;X2=8;X3=9
2 4 X1=10;X2=11;X3=12
5 6 X1=13;X2=14", header=TRUE, stringsAsFactors = FALSE)
library(qdapTools)
# Split each row's C string into its "key=value" pieces.
dat_C <- strsplit(dat$C, ";", fixed = TRUE)
# For every row, repeat each key as many times as its value, so that
# tabulating the keys afterwards recovers the values as counts.
# NOTE: rep() needs non-negative counts and truncates fractions, so this
# trick only works for non-negative whole-number values.
# lapply() keeps the result a list even when every row expands to the same
# length; sapply() would collapse that case into a matrix.
dat_C2 <- lapply(dat_C, function(x) {
  y <- strsplit(x, "=", fixed = TRUE)
  keys <- vapply(y, `[`, character(1), 1L)
  counts <- as.numeric(vapply(y, `[`, character(1), 2L))
  rep(keys, counts)
})
data.frame(dat[, -3], mtabulate(dat_C2))
## A B X1 X2 X3
## 1 1 3 7 8 9
## 2 2 4 10 11 12
## 3 5 6 13 14 0
EDIT To obtain the NA values
m <- mtabulate(dat_C2)
m[m==0] <- NA
data.frame(dat[, -3], m)

Here's a nice, somewhat hacky way to get you there.
## read your data
> dat <- read.table(h=T, text = "A B C
1 3 X1=7;X2=8;X3=9
2 4 X1=10;X2=11;X3=12
5 6 X1=13;X2=14", stringsAsFactors = FALSE)
## ---
> s <- strsplit(dat$C, ";|=")
> xx <- unique(unlist(s)[grepl('[A-Z]', unlist(s))])
> sap <- t(sapply(seq(s), function(i){
wh <- which(!xx %in% s[[i]]); n <- suppressWarnings(as.numeric(s[[i]]))
nn <- n[!is.na(n)]; if(length(wh)){ append(nn, NA, wh-1) } else { nn }
})) ## see below for explanation
> data.frame(dat[1:2], sap)
# A B X1 X2 X3
# 1 1 3 7 8 9
# 2 2 4 10 11 12
# 3 5 6 13 14 NA
Basically what's happening in sap is:
1. check which values are missing
2. change each list element of s to numeric
3. remove the NA values from (2)
4. insert NA into the correct position with append
5. transpose the result

Related

Add ID column to a list of data frames

I have a list of 142 dataframes file_content and a list from id_list <- list(as.character(1:length(file_content)))
I am trying to add a new column period to each data frame in file_content.
All data frames are similar to 2021-03-16 below.
`2021-03-16` <- file_content[[1]] # take a look at 1/142 dataframes in file_content
head(`2021-03-16`)
author_id created_at id tweet
1 3.304380e+09 2018-12-01 22:58:55+00:00 1.069003e+18 #Acosta I hope he didn’t really say “muck”.
2 5.291559e+08 2018-12-01 22:57:31+00:00 1.069003e+18 #Acosta I like Mattis, but why does he only speak this way when Individual-1 isn't around?
3 2.195313e+09 2018-12-01 22:56:41+00:00 1.069002e+18 #Acosta What did Mattis say about the informal conversation between Trump and Putin at the G20?
4 3.704188e+07 2018-12-01 22:56:41+00:00 1.069002e+18 #Acosta Good! Tree huggers be damned!
5 1.068995e+18 2018-12-01 22:56:11+00:00 1.069002e+18 #Acosta #NinerMBA_01
6 9.983321e+17 2018-12-01 22:55:13+00:00 1.069002e+18 #Acosta Really?
I have tried to add the period column using the following code but it adds all 142 values from the id_list to every row in every data frame in file_content.
for (id in length(id_list)) {
file_content <- lapply(file_content, function(x) { x$period <- paste(id_list[id], sep = "_"); x })
}
You were close, the mistake is you need double brackets in id_list[[id]].
for (id in length(id_list)) {
file_content <- lapply(file_content, function(x) {
x$period <- paste(id_list[[id]], sep = "_")
x
})
}
# $`1`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`2`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`3`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
You could also try Map() and save a few lines.
Map(`[<-`, file_content, 'period', value=id_list)
# $`1`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`2`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
#
# $`3`
# X1 X2 X3 X4 period
# 1 1 4 7 10 1
# 2 2 5 8 11 2
# 3 3 6 9 12 3
Data:
file_content <- replicate(3, data.frame(matrix(1:12, 3, 4)), simplify=F) |> setNames(1:3)
id_list <- list(as.character(1:length(file_content)))
We may use imap
library(purrr)
library(dplyr)
imap(file_content, ~ .x %>%
mutate(period = .y))
Or with Map from base R
Map(cbind, file_content, period = names(file_content))
In the OP's code, the id_list is created as a single list element by wrapping with list i.e.
list(1:5)
vs
as.list(1:5)
Here, we don't need to convert to list as a vector is enough
id_list <- seq_along(file_content)
Also, the for loop is looping on a single element i.e. the last element with length
for (id in length(id_list)) {
^^
Instead, it should loop over every index, i.e. 1:length(id_list) or, more safely, seq_along(id_list). In addition, the assignment should be on the single list element file_content[[id]] and not on the entire list
for(id in seq_along(id_list)) {
file_content[[id]]$period <- id_list[id]
}

how to subset every 6 rows in R?

I have to subset the data of 6 rows every time. How to do that in R?
data:
col1 : 1,2,3,4,5,6,7,8,9,10
col2 : a1,a2,a3,a4,a5,a6,a7,a8,a9,a10
I want to do subset of 6 rows every time. First subset of the rows will have 1:6 ,next subset of the rows will have 7:nrow(data). I have tried using seq function.
seqData <- seq(1,nrow(data),6)
output: It is giving 1 and 7th row but I want 1 to 6 rows first, next onwards 7 to nrow(data).
How to get output like that.
Will this work:
set.seed(1)
dat <- data.frame(c1 = sample(1:5,12,T),
c2 = sample(1:5,12,T))
dat
c1 c2
1 1 2
2 4 2
3 1 1
4 2 5
5 5 5
6 3 1
7 2 1
8 3 5
9 3 5
10 1 2
11 5 2
12 5 1
# Label rows in consecutive chunks of 6. `length.out` trims the labels to
# nrow(dat), so a short final chunk (e.g. 10 rows) does not leave the
# grouping vector longer than the data and trigger a recycling warning.
split(
  dat,
  rep(seq_len(ceiling(nrow(dat) / 6)), each = 6, length.out = nrow(dat))
)
$`1`
c1 c2
1 1 2
2 4 2
3 1 1
4 2 5
5 5 5
6 3 1
$`2`
c1 c2
7 2 1
8 3 5
9 3 5
10 1 2
11 5 2
12 5 1
The function below creates a numeric vector of integers that increases by 1 every n rows and uses this vector to split the data as needed.
data <- data.frame(col1 = 1:10, col2 = paste0("a", 1:10))
# Split `x` into consecutive chunks of `n` rows (the last chunk may be
# shorter).
#
# x: object accepted by split(), e.g. a data frame or a vector.
# n: number of rows per chunk.
# Returns the list produced by split(), named "1", "2", ...
split_nrows <- function(x, n) {
  # A 1 followed by n - 1 zeros marks the first row of a chunk; the pattern
  # is repeated (and truncated) to cover every row of x.
  chunk_start <- rep_len(c(1, numeric(n - 1)), NROW(x))
  # The running total of the markers is the chunk number of each row.
  split(x, cumsum(chunk_start))
}
split_nrows(data, 6)
Here's a simple example with mtcars that yields a list of 6 subset dfs.
# Split mtcars into consecutive chunks of 6 rows; the last chunk holds
# whatever rows remain.
nrows <- nrow(mtcars)
# First row index of every chunk: 1, 7, 13, ...
breaks <- seq(1, nrows, by = 6)
listdfs <- lapply(breaks, function(start) {
  # Clamp the end index to nrows so the last chunk never indexes past the
  # data (which would create all-NA rows needing manual removal).
  mtcars[start:min(start + 5, nrows), ]
})

Combining elements of one column into two columns by group in R

Given a two column data.frame with one containing group labels and a second containing integer values ordered from smallest to largest. How can the data be expanded creating pairs of combinations of the integer column?
Not sure the best way to state this. I'm not interested in all possible combinations but instead all unique combinations starting from the lowest value.
In r, the combn function gives the desired output not considering groups, for example:
t(combn(seq(1:4),2))
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 1 4
[4,] 2 3
[5,] 2 4
[6,] 3 4
Since the first value is 1 we get the unique combination of (1,2) and not the additional combination of (2,1) which I don't need. How would one then apply a similar method by groups?
for example given a data.frame
test <- data.frame(Group = rep(c("A","B"),each=4),
Val = c(1,3,6,8,2,4,5,7))
test
Group Val
1 A 1
2 A 3
3 A 6
4 A 8
5 B 2
6 B 4
7 B 5
8 B 7
I was able to come up with this solution that gives the desired output:
test <- data.frame(Group = rep(c("A","B"),each=4),
Val = c(1,3,6,8,2,4,5,7))
j=1
for(i in unique(test$Group)){
if(j==1){
one <- filter(test,i == Group)
two <- data.frame(t(combn(one$Val,2)))
test1 <- data.frame(Group = i,Val1=two$X1,Val2=two$X2)
j=j+1
}else{
one <- filter(test,i == Group)
two <- data.frame(t(combn(one$Val,2)))
test2 <- data.frame(Group = i,Val1=two$X1,Val2=two$X2)
test1 <- rbind(test1,test2)
}
}
test1
Group Val1 Val2
1 A 1 3
2 A 1 6
3 A 1 8
4 A 3 6
5 A 3 8
6 A 6 8
7 B 2 4
8 B 2 5
9 B 2 7
10 B 4 5
11 B 4 7
12 B 5 7
However, this is not elegant and is really slow as the number of groups and length of each group become large. It seems like there should be a more elegant and efficient solution but so far I have not come across anything on SO.
I would appreciate any ideas!
here is a data.table approach
library( data.table )
#make test a data.table
setDT(test)
#split by group
L <- split( test, by = "Group")
#get unique combinations of 2 Vals
L2 <- lapply( L, function(x) {
as.data.table( t( combn( x$Val, m = 2, simplify = TRUE ) ) )
})
#merge them back together
data.table::rbindlist( L2, idcol = "Group" )
# Group V1 V2
# 1: A 1 3
# 2: A 1 6
# 3: A 1 8
# 4: A 3 6
# 5: A 3 8
# 6: A 6 8
# 7: B 2 4
# 8: B 2 5
# 9: B 2 7
#10: B 4 5
#11: B 4 7
#12: B 5 7
You can set simplify = F in combn() and then use unnest_wider() from tidyr.
library(dplyr)
library(tidyr)
test %>%
group_by(Group) %>%
summarise(Val = combn(Val, 2, simplify = F)) %>%
unnest_wider(Val, names_sep = "_")
# Group Val_1 Val_2
# <chr> <dbl> <dbl>
# 1 A 1 3
# 2 A 1 6
# 3 A 1 8
# 4 A 3 6
# 5 A 3 8
# 6 A 6 8
# 7 B 2 4
# 8 B 2 5
# 9 B 2 7
# 10 B 4 5
# 11 B 4 7
# 12 B 5 7
library(tidyverse)
# Build one row per unordered pair of Val within each Group.
# `test` is the example data frame from the question; the original snippet
# referred to an undefined `df`.
df2 <- split(test$Val, test$Group) %>%
  # n must equal the number of values in the group rather than a fixed 4,
  # otherwise groups of any other size fail.
  map(~gtools::combinations(n = length(.x), r = 2, v = .x)) %>%
  # Each result is an unnamed matrix; "unique" repair gives it column names.
  map(~as_tibble(.x, .name_repair = "unique")) %>%
  # Stack the per-group tables, recording the group label in `Group`.
  bind_rows(.id = "Group")

Keep columns of a data frame based on a data frame

I have a data frame, called df, which contains 4000 values. I have a list of 1000 column numbers, in a data frame called list, which is 1000 rows by 1 column. How can I keep the rows with the numbers in list in the data frame df and throw the rest out. I already tried using:
listv <- as.vector(list)
and then using
dfnew <- df[,listv]
but I get the error
Error in .subset(x, j) : invalid subscript type 'list'
You're mixing up rows and columns subsetting. Here is a minimal example:
df <- data.frame(matrix(1:21, ncol = 3))
df
# X1 X2 X3
# 1 1 8 15
# 2 2 9 16
# 3 3 10 17
# 4 4 11 18
# 5 5 12 19
# 6 6 13 20
# 7 7 14 21
list <- data.frame(V1 = c(1, 4, 6))
list
# V1
# 1 1
# 2 4
# 3 6
df[list[, 1], ]
# X1 X2 X3
# 1 1 8 15
# 4 4 11 18
# 6 6 13 20
df[unlist(list), ]
# X1 X2 X3
# 1 1 8 15
# 4 4 11 18
# 6 6 13 20
Note also that as.vector(list) doesn't create a vector, as you thought it would. You need unlist here (as I used in the last example).

Splitting data in a cell

I have a data set that looks like this
Code Product
1 A|B
2 A|B|C
3 A|B|C|D|E
When I split the column Product using colsplit function, duplication occurs. The output of colsplit function looks like this:
Code Product.1 Product.2 Product.3 Product.4 Product.5
1 A B A B A
2 A B C A B
3 A B C D E
This happens because one of the cells had five elements. Is there any way to avoid this duplication?
Thanks and regards
Jayaram
Update (21 Oct 2013)
The concepts below have been rolled into a family of functions called concat.split.* in my "splitstackshape" package. Here is a very straightforward solution using concat.split.multiple:
library(splitstackshape)
concat.split.multiple(temp, "Product", "|", "long")
# Code time Product
# 1 1 1 A
# 2 2 1 A
# 3 3 1 A
# 4 1 2 B
# 5 2 2 B
# 6 3 2 B
# 7 1 3 <NA>
# 8 2 3 C
# 9 3 3 C
# 10 1 4 <NA>
# 11 2 4 <NA>
# 12 3 4 D
# 13 1 5 <NA>
# 14 2 5 <NA>
# 15 3 5 E
Remove the "long" argument if you want the wide format, but your comments indicated that ultimately you wanted a long format for your output.
Original answer (17 Dec 2012)
You can do this with strsplit and sapply as follows:
# Your data
temp <- data.frame(
  Code = 1:3,
  Product = c("A|B", "A|B|C", "A|B|C|D|E"),
  stringsAsFactors = FALSE
)
# Split each Product cell on the literal "|" character.
parts <- strsplit(temp$Product, "|", fixed = TRUE)
# The widest row decides how many columns are needed; compute it once
# instead of inside every iteration.
width <- max(lengths(parts))
# `length<-` pads each row's pieces with NA up to `width`; rbind() stacks
# them into a matrix with one row per Code (this also works when width
# is 1).
padded <- do.call(rbind, lapply(parts, `length<-`, width))
temp1 <- data.frame(Code = temp$Code, padded)
temp1
# Code X1 X2 X3 X4 X5
# 1 1 A B <NA> <NA> <NA>
# 2 2 A B C <NA> <NA>
# 3 3 A B C D E
Or... use rbind.fill from the "plyr" package, after making each of your rows into a single column data.frame:
temp1 <- strsplit(temp$Product, "\\|")
library(plyr)
data.frame(Code = temp$Code,
rbind.fill(lapply(temp1, function(x) data.frame(t(x)))))
# Code X1 X2 X3 X4 X5
# 1 1 A B <NA> <NA> <NA>
# 2 2 A B C <NA> <NA>
# 3 3 A B C D E
Or... inspired by @DWin's great answer here, re-read the second column as a data.frame in itself.
newcols <- max(sapply(strsplit(temp$Product, "\\|"), length))
temp2 <- data.frame(Code = temp$Code,
read.table(text = as.character(temp$Product),
sep="|", fill=TRUE,
col.names=paste("Product", seq(newcols))))
temp2
# Code Product.1 Product.2 Product.3 Product.4 Product.5
# 1 1 A B
# 2 2 A B C
# 3 3 A B C D E

Resources