split columns by character in list of data frames - r

I have the following list of data frames:
df1 <- data.frame(x = 1:3, y=c("1,2","1,2,3","1,5"))
df2 <- data.frame(x = 4:6, y=c("1,2","1,4","1,6,7,8"))
filelist <- list(df1,df2)
> filelist
[[1]]
x y
1 1 1,2
2 2 1,2,3
3 3 1,5
[[2]]
x y
1 4 1,2
2 5 1,4
3 6 1,6,7,8
Now I want to split each column 'y' by character ',' and store the output in new columns in the dataframe.
The output should look like this:
> filelist
[[1]]
x y_ref y_alt1 y_alt2
1 1 1 2
2 2 1 2 3
3 3 1 5
[[2]]
x y_ref y_alt2 y_alt3 y_alt4
1 4 1 2
2 5 1 4
3 6 1 6 7 8
How should I do this? I know there is 'strsplit' to split a string by character. But I don't see how I can store the output then in different columns.

apply cSplit on "y" column of each dataframe in filelist
lapply(filelist, splitstackshape::cSplit, "y")
#[[1]]
# x y_1 y_2 y_3
#1: 1 1 2 NA
#2: 2 1 2 3
#3: 3 1 5 NA
#[[2]]
# x y_1 y_2 y_3 y_4
#1: 4 1 2 NA NA
#2: 5 1 4 NA NA
#3: 6 1 6 7 8

Here's a solution that relies on tstrsplit from data.table
library(data.table)
lapply(filelist,
function(DF) {
commas = max(nchar(as.character(DF$y)) -nchar( gsub(",", "", DF$y)))
DF[, c('y_ind', paste0('y_alt', seq_len(commas)))] = tstrsplit(as.character(DF$y), ',')
DF
})
#> [[1]]
#> x y y_ind y_alt1 y_alt2
#> 1 1 1,2 1 2 <NA>
#> 2 2 1,2,3 1 2 3
#> 3 3 1,5 1 5 <NA>
#>
#> [[2]]
#> x y y_ind y_alt1 y_alt2 y_alt3
#> 1 4 1,2 1 2 <NA> <NA>
#> 2 5 1,4 1 4 <NA> <NA>
#> 3 6 1,6,7,8 1 6 7 8
Created on 2019-09-17 by the reprex package (v0.3.0)

Also with dplyr you can use separate() like this:
df %>%
separate(y, into = c(y_ind,y_alt1,...), sep = ",")
Note that into can also be used more "programmatically" to generate the needed amount of resulting columns with a proper indexing without manually defining each result column.

Related

Add column to each data frame within list with function rowSums and range of columns

SO. The following might serve as a small example of the real list.
a <- data.frame(
x = c("A","A","A","A","A"),
y = c(1,2,3,4,5),
z = c(1,2,3,4,5))
b <- data.frame(
x = c("A","A","A","A","A"),
y = c(1,2,3,4,5),
z = c(1,2,3,4,5))
c <- data.frame(
x = c("A","A","A","A","A"),
y = c(1,2,3,4,5),
z = c(1,2,3,4,5))
l <- list(a,b,c)
From the second column to last column - on every data frame - i want to add the sums as a new column to each data frame.
I tried:
lapply(l, function(x) rowSums(x[2:ncol(x)]))
which returns the correct sums, but doesn't add them to the data frames.
I also tried:
lapply(l, transform, sum = y + z)
which gives me the correct results but is not flexible enough, because i don't always know how many columns there are for each data frame and what names they have. The only thing i know, is, that i have to start from second column to end. I tried to combine these two approaches but i can't figure out, how to do it exactly.
Thanks
Try this. You can play around index in columns and exclude the first variable so that there is not issues about how many additional variables you have in order to obtain the rowsums. Here the code:
#Compute rowsums
l1 <- lapply(l,function(x) {x$RowSum<-rowSums(x[,-1],na.rm=T);return(x)})
Output:
l1
[[1]]
x y z RowSum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[2]]
x y z RowSum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[3]]
x y z RowSum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
Here's how to combine your attempts. I used data[-1] instead of data[2:ncol(data)] because it seems simpler, but either should work.
lapply(l, function(data) transform(data, sum = rowSums(data[-1])))
Unfortunately, transform will be confused if the name of the argument to your anonymous function is the same as a column name - data[-1] needs to look at the data frame, not a particular column. (I originally use function(x) instead of function(data), and this caused an error because there is a column named x. From this perspective, Duck's answer is a little safer.)
Does this work:
> add_col <- function(df){
+ df[(ncol(df)+1)] = rowSums(df[2:ncol(df)])
+ df
+ }
> lapply(l, add_col)
[[1]]
x y z V4
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[2]]
x y z V4
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[3]]
x y z V4
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
>
With sum as column name:
> add_col <- function(df){
+ df['sum'] = rowSums(df[2:ncol(df)])
+ df
+ }
> lapply(l, add_col)
[[1]]
x y z sum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[2]]
x y z sum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
[[3]]
x y z sum
1 A 1 1 2
2 A 2 2 4
3 A 3 3 6
4 A 4 4 8
5 A 5 5 10
use tidyverse
library(tidyverse)
map(l, ~.x %>% mutate(Sum := apply(.x[-1], 1, sum)))
#> [[1]]
#> x y z Sum
#> 1 A 1 1 2
#> 2 A 2 2 4
#> 3 A 3 3 6
#> 4 A 4 4 8
#> 5 A 5 5 10
#>
#> [[2]]
#> x y z Sum
#> 1 A 1 1 2
#> 2 A 2 2 4
#> 3 A 3 3 6
#> 4 A 4 4 8
#> 5 A 5 5 10
#>
#> [[3]]
#> x y z Sum
#> 1 A 1 1 2
#> 2 A 2 2 4
#> 3 A 3 3 6
#> 4 A 4 4 8
#> 5 A 5 5 10
Created on 2020-09-30 by the reprex package (v0.3.0)
We can use map with mutate
library(purrr)
library(dplyr)
map(l, ~ .x %>%
mutate(sum = rowSums(select(., -1))))
Or with c_across
map(l, ~ .x %>%
rowwise() %>%
mutate(sum = sum(c_across(-1), na.rm = TRUE)) %>%
ungroup)

Remove trailing NA by group in a data.frame

I have a data.frame with a grouping variable, and some NAs in the value column.
df = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,6,2,NA,NA,1,NA))
I can use zoo::na.trim to remove NA at the end of a column: this will remove the last line of the data.frame:
library(zoo)
library(dplyr)
df %>% na.trim(sides="right")
Now I want to remove the trailing NAs by group; how can I achieve this using dplyr?
Expected output for value2 column: c(NA, 4,9,6,2,1)
You could write a little helper function that checks for trailing NAs of a vector and then use group_by and filter.
f <- function(x) { rev(cumsum(!is.na(rev(x)))) != 0 }
library(dplyr)
df %>%
group_by(group) %>%
filter(f(value2))
# A tibble: 6 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 1 NA
2 1 2 4
3 2 3 9
4 2 4 6
5 2 5 2
6 3 8 1
edit
If we need to remove both leading and trailing zero we need to extend that function a bit.
f1 <- function(x) { cumsum(!is.na(x)) != 0 & rev(cumsum(!is.na(rev(x)))) != 0 }
Given df1
df1 = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,NA,2,NA,NA,1,NA))
df1
# group value1 value2
#1 1 1 NA
#2 1 2 4
#3 2 3 9
#4 2 4 NA
#5 2 5 2
#6 2 6 NA
#7 2 7 NA
#8 3 8 1
#9 3 9 NA
We get this result
df1 %>%
group_by(group) %>%
filter(f1(value2))
# A tibble: 5 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 2 4
2 2 3 9
3 2 4 NA
4 2 5 2
5 3 8 1
Using lapply, loop through group:
do.call("rbind", lapply(split(df, df$group), na.trim, sides = "right"))
# group value1 value2
# 1.1 1 1 NA
# 1.2 1 2 4
# 2.3 2 3 9
# 2.4 2 4 6
# 2.5 2 5 2
# 3 3 8 1
Or using by, as mentioned by #Henrik:
do.call("rbind", by(df, df$group, na.trim, sides = "right"))

Shift certain values in a column of dataframe to a next column in R

I have a dataframe that looks like the following:
x y z
1 2 3
1 2 3
1 2 3
2 3
1 2 3
1 3
I would like to ask if there is a command in R that will allow to obtain the following dataframe (by shifting and aligning similar values)
x y z
1 2 3
1 2 3
1 2 3
NA 2 3
1 2 3
1 NA 3
An alternative solution, where the main idea is to capture the pattern of your dataset based on rows that don't have NAs and then perform some reshaping using the pattern you captured.
df = read.table(text = "
x y z
1 2 3
1 2 3
1 2 3
2 3 NA
1 2 3
1 3 NA
", header= T)
library(tidyverse)
# get the column names of your dataset
names = names(df)
# get unique values after omitting rows with NAs
value = unlist(unique(na.omit(df)))
# create a dataset with names and values
# (this is the pattern you want to follow)
df3 = data.frame(names, value)
df %>%
mutate(id = row_number()) %>% # flag the row number
gather(v,value,-id) %>% # reshape
na.omit() %>% # remove rows with NAs
left_join(df3, by="value") %>% # join info about your pattern
select(-v) %>% # remove that column
spread(names, value) %>% # reshape
select(-id) # remove row number
# x y z
# 1 1 2 3
# 2 1 2 3
# 3 1 2 3
# 4 NA 2 3
# 5 1 2 3
# 6 1 NA 3
library(tidyverse)
df %>%
pmap_dfr(~{ x <- list(...)
if(any(is.na(x))) intersect(x, df[1,]) # match with first row's values to assign names
else x})
Output:
# # A tibble: 6 x 3
# x y z
# <int> <int> <int>
# 1 1 2 3
# 2 1 2 3
# 3 1 2 3
# 4 NA 2 3
# 5 1 2 3
# 6 1 NA 3
reading your data:
df<- fread("x y z
1 2 3
1 2 3
1 2 3
2 3 NA
1 2 3
1 3 NA") %>% setDF
code:
library(magrittr)
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
pattern <- sapply(df,getmode)
df[!complete.cases(df),] %<>% apply(1,function(x){tmp<-pattern;tmp[!(tmp%in%x)] <- NA;return(tmp)}) %>% t %>% data.frame
result:
> df
x y z
1 1 2 3
2 1 2 3
3 1 2 3
4 NA 2 3
5 1 2 3
6 1 NA 3

Add a column to a dataframe using (extracting unique values) from existing columns

I am new to R, and was not able to search answers for the specific problem I have encountered.
If my dataframe looks like below:
d <- data.frame(Name = c("Jon", "Jon", "Jon", "Kel", "Kel", "Kel", "Don", "Don", "Don"),
No1 = c(1,2,3,1,1,1,3,3,3),
No2 = c(1,1,1,2,2,2,3,3,3))
Name No1 No2
Jon 1 1
Jon 2 1
Jon 3 1
Kel 1 2
Kel 1 2
Kel 1 2
Don 3 3
Don 3 3
Don 3 3
...
How would I add be able to add new columns to the dataframe, where the columns would indicate the unique values in column No1 and No2: which would be (1,2,3), (1,2), (3) for John, Kelly, Don, respectively
So, if the new columns are named ID#, The desired results should be
d2 <- data.frame(Name = c("Jon", "Jon", "Jon", "Kel", "Kel", "Kel", "Don", "Don", "Don"),
No1 = c(1,2,3,1,1,1,3,3,3),
No2 = c(1,1,1,2,2,2,3,3,3),
ID1 = c(1,1,1,1,1,1,3,3,3),
ID2 = c(2,2,2,2,2,2,NA,NA,NA),
ID3 = c(3,3,3,NA,NA,NA,NA,NA,NA))
Name No1 No2 ID1 ID2 ID3
Jon 1 1 1 2 3
Jon 2 1 1 2 3
Jon 3 1 1 2 3
Kel 1 2 1 2 NA
Kel 1 2 1 2 NA
Kel 1 2 1 2 NA
Don 3 3 3 NA NA
Don 3 3 3 NA NA
Don 3 3 3 NA NA
A tidyverse approach:
library(dplyr)
library(tidyr)
# evaluate separately for each name
d %>% group_by(Name) %>%
# add a column of the unique values pasted together into a string
mutate(ID = paste(unique(c(No1, No2)), collapse = ' ')) %>%
# separate the string into individual columns, filling with NA and converting to numbers
separate(ID, into = paste0('ID', 1:3), fill = 'right', convert = TRUE)
## Source: local data frame [9 x 6]
## Groups: Name [3]
##
## Name No1 No2 ID1 ID2 ID3
## * <fctr> <dbl> <dbl> <int> <int> <int>
## 1 Jon 1 1 1 2 3
## 2 Jon 2 1 1 2 3
## 3 Jon 3 1 1 2 3
## 4 Kel 1 2 1 2 NA
## 5 Kel 1 2 1 2 NA
## 6 Kel 1 2 1 2 NA
## 7 Don 3 3 3 NA NA
## 8 Don 3 3 3 NA NA
## 9 Don 3 3 3 NA NA
Here's a nice base version with a basic split-apply-combine approach:
# store distinct values in No1 and No2
cols <- unique(unlist(d[,-1]))
# split No1 and No2 by Name,
ids <- data.frame(t(sapply(split(d[,-1], d$Name),
# find unique values for each split,
function(x){y <- unique(unlist(x))
# pad with NAs,
c(y, rep(NA, length(cols) - length(y)))
# and return a data.frame
})))
# fix column names
names(ids) <- paste0('ID', cols)
# turn rownames into column
ids$Name <- rownames(ids)
# join two data.frames on Name columns
merge(d, ids, sort = FALSE)
## Name No1 No2 ID1 ID2 ID3
## 1 Jon 1 1 1 2 3
## 2 Jon 2 1 1 2 3
## 3 Jon 3 1 1 2 3
## 4 Kel 1 2 1 2 NA
## 5 Kel 1 2 1 2 NA
## 6 Kel 1 2 1 2 NA
## 7 Don 3 3 3 NA NA
## 8 Don 3 3 3 NA NA
## 9 Don 3 3 3 NA NA
And just for kicks, here's a creative alternate base version that leverages table instead of splitting/grouping:
# copy d so as not to distort original with factor columns
d_f <- d
# make No* columns factors to ensure similar table structure
d_f[, -1] <- lapply(d[,-1], factor, levels = unique(unlist(d[, -1])))
# make tables of cols, sum to aggregate occurrences, and set as boolean mask for > 0
tab <- Reduce(`+`, lapply(d_f[, -1], table, d_f$Name)) > 0
# replace all TRUE values with values they tabulated
tab <- tab * matrix(as.integer(rownames(tab)), nrow = nrow(tab), ncol = ncol(tab))
# replace 0s with NAs
tab[tab == 0] <- NA
# store column names
cols <- paste0('ID', rownames(tab))
# sort each row, keeping NAs
tab <- data.frame(t(apply(tab, 2, sort, na.last = T)))
# apply stored column names
names(tab) <- cols
# turn rownames into column
tab$Name <- rownames(tab)
# join two data.frames on Name columns
merge(d, tab, sort = FALSE)
Results are identical.
library(dplyr)
library(tidyr)
d %>%
group_by(Name) %>%
mutate(unique_id = paste0(unique(c(No1, No2)), collapse = ",")) %>%
separate(., unique_id, paste0("id_", 1:max(c(.$No1, .$No2))), fill = "right")
We can use a single external package i.e. data.table and get the output. Convert the 'data.frame' to 'data.table' (setDT(d)), grouped by 'Name', we unlist the columns mentioned in the .SDcols, get the unique values, and dcast from 'long' to 'wide' format, do a join with the original dataset on the "Name" column.
library(data.table)
dcast(setDT(d)[, unique(unlist(.SD)) , Name, .SDcols = No1:No2],
Name~paste0("ID", rowid(Name)), value.var="V1")[d, on = "Name"]
# Name ID1 ID2 ID3 No1 No2
#1: Jon 1 2 3 1 1
#2: Jon 1 2 3 2 1
#3: Jon 1 2 3 3 1
#4: Kel 1 2 NA 1 2
#5: Kel 1 2 NA 1 2
#6: Kel 1 2 NA 1 2
#7: Don 3 NA NA 3 3
#8: Don 3 NA NA 3 3
#9: Don 3 NA NA 3 3
Or this can be done in one-line by first pasteing the unique elements in 'No1' and 'No2', grouped by 'Name', and then split it to three columns by using cSplit from splitstackshape.
library(splitstackshape)
cSplit(setDT(d)[, ID:= paste(unique(c(No1, No2)), collapse=" ") , Name], "ID", " ")
# Name No1 No2 ID_1 ID_2 ID_3
#1: Jon 1 1 1 2 3
#2: Jon 2 1 1 2 3
#3: Jon 3 1 1 2 3
#4: Kel 1 2 1 2 NA
#5: Kel 1 2 1 2 NA
#6: Kel 1 2 1 2 NA
#7: Don 3 3 3 NA NA
#8: Don 3 3 3 NA NA
#9: Don 3 3 3 NA NA
Or using the baseVerse just for kicks
d1 <- read.table(text=ave(unlist(d[-1]), rep(d$Name, 2),
FUN = function(x) paste(unique(x), collapse=" "))[1:nrow(d)],
header=FALSE, fill=TRUE, col.names= paste0("ID", 1:3))
cbind(d, d1)
# Name No1 No2 ID1 ID2 ID3
#1 Jon 1 1 1 2 3
#2 Jon 2 1 1 2 3
#3 Jon 3 1 1 2 3
#4 Kel 1 2 1 2 NA
#5 Kel 1 2 1 2 NA
#6 Kel 1 2 1 2 NA
#7 Don 3 3 3 NA NA
#8 Don 3 3 3 NA NA
#9 Don 3 3 3 NA NA
NOTE: No packages used and without much effort in splitting.

R language check missing data for columns and rows

I have a data frame sells and I want to check the missing data in both rows and columns
What I did for rows is:
sells[, complete.cases(sells)]
nrows(sells[, complete.cases(sells)])
but I didn't know who to solve if for columns
Help please
First let's take the iris dataframe and insert randomly some NA's:
iris.demo <- iris
iris.nas <- matrix(as.logical(sample(FALSE:TRUE, size = 150*5,
prob = c(.9,.1),replace = TRUE)),ncol = 5)
iris.demo[iris.nas] <- NA
For rows, it is pretty straightforward:
sum(complete.cases(iris.demo))
# [1] 75
For columns, two possibilities (among several possible others):
Transposing the whole dataframe
sum(complete.cases(t(iris.demo)))
# [1] 0 # 0 columns are complete
Using lapply to count the "non-missing" on every column and see if it's equal to nrow:
sum(lapply(iris.demo, function(x) sum(!is.na(x))) == nrow(iris.demo))
# [1] 0
You could do it like this:
set.seed(1)
(sells <- data.frame(replicate(2, sample(c(1:3, NA), 10, T)), x3 = 1:10))
# X1 X2 x3
# 1 NA 2 1
# 2 1 3 2
# 3 3 2 3
# 4 1 1 4
# 5 2 NA 5
# 6 2 3 6
# 7 1 NA 7
# 8 2 1 8
# 9 NA 3 9
# 10 2 2 10
Rows:
sells[complete.cases(sells), ]
# X1 X2 x3
# 1 2 1 1
# 2 2 1 2
# 3 3 3 3
# 9 3 2 9
nrow(sells[complete.cases(sells), ])
# [1] 6
Columns:
sells[, sapply(sells, function(col) any(is.na(col)))]
# X1 X2
# 1 2 1
# 2 2 1
# 3 3 3
# 4 NA 2
# 5 1 NA
# 6 NA 2
# 7 NA 3
# 8 3 NA
# 9 3 2
# 10 1 NA
sum(sapply(sells, function(col) any(is.na(col))))
# [1] 2

Resources