Related
I am working with the R programming language.
I have a column in a data frame that looks something like this (the column is a CHARACTER variable):
head(b$`New Col`)
[1] "1073680,, 781230,, 292455," "128485,, 62890,, 65595," "372475,, 184745,, 187730,"
The first row contains a single element: 1073680,, 781230,, 292455,
The second row contains a single element: 128485,, 62890,, 65595,
The third row contains a single element: 372475,, 184745,, 187730,
I want to split this column into 3 columns:
id col1 col2 col3
1 1 1073680 781230 292455
2 2 128485 62890 65595
3 3 372475 184745 187730
I know how to do this in Excel (e.g. remove the last comma, and then use "fixed width" / "delimited" splitting on the double commas).
But can someone please show me how to do this in R?
Thanks!
In addition to Maël's answer: if you're strictly after fixed-width separation, use separate with position indices instead of delimiter strings.
Your example data:
b <- structure(list(New.Col = c("1073680,, 781230,, 292455,", " 128485,, 62890,, 65595,",
" 372475,, 184745,, 187730,")), class = "data.frame", row.names = c(NA,
-3L))
separate by fixed widths:
library(tidyr)
b <- b %>%
separate(col = `New.Col`,
into = c('col1', 'drop1', 'col2', 'drop2', 'col3'),
sep = c(7, 10, 17, 21, 27)
)
drop the garbage columns (containing the delimiters):
b %>% select(-starts_with('drop'))
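If you'd rather not create the helper columns at all, recent tidyr versions also accept NA entries in into, which drops those pieces directly. A small variant sketch (starting again from the original b, and assuming a tidyr version that supports NA in into):
b %>%
separate(col = `New.Col`,
into = c('col1', NA, 'col2', NA, 'col3'),
sep = c(7, 10, 17, 21, 27)
)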
Just for fun. Another way:
library(tidyverse)
df %>%
separate_rows(col, sep = ",, ") %>%
mutate(col = parse_number(col), .keep="unused") %>%
group_by(ID) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = ID,
values_from = col,
names_glue = "col_{ID}")
id col_1 col_2 col_3
<int> <dbl> <dbl> <dbl>
1 1 1073680 128485 372475
2 2 781230 62890 184745
3 3 292455 65595 187730
Using base R
df <- cbind(df[1], read.csv(text = trimws(gsub(",+\\s+", ",", df$col),
whitespace = ","), header = FALSE, col.names = paste0("col", 1:3))
)
Output:
df
ID col1 col2 col3
1 1 1073680 781230 292455
2 2 128485 62890 65595
3 3 372475 184745 187730
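If the cleaning step looks opaque, you can inspect what the gsub/trimws pair produces before read.csv parses it (just an illustrative check on the same df, not part of the original answer):
trimws(gsub(",+\\s+", ",", df$col), whitespace = ",")
# [1] "1073680,781230,292455" "128485,62890,65595"    "372475,184745,187730"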
You can do the same in R, with str_remove_all and separate. Here, I remove all commas, and separate by whitespace. Use convert = TRUE to convert the separated values to numeric.
library(dplyr)
library(stringr)
library(tidyr)
df %>%
mutate(col = str_remove_all(col, ",")) %>%
separate(col, into = str_c("col", 1:3), convert = TRUE)
# ID col1 col2 col3
# 1 1 1073680 781230 292455
# 2 2 128485 62890 65595
# 3 3 372475 184745 187730
Edit: you actually don't need the first step, since separate only keeps as many pieces as there are names in the into parameter (here, 3) and drops the rest. It also defaults to splitting on any non-alphanumeric character(s), so the separator doesn't need to be specified.
df %>%
separate(col, into = str_c("col", 1:3), convert = TRUE)
# ID col1 col2 col3
# 1 1 1073680 781230 292455
# 2 2 128485 62890 65595
# 3 3 372475 184745 187730
Data
df <- data.frame(ID = 1:3, col = c("1073680,, 781230,, 292455,", "128485,, 62890,, 65595,", "372475,, 184745,, 187730," ))
Having a dataframe like this:
data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"))
How is it possible to detect whether a row in the text1 column is blank, fill the blank with a value that exists in one of the other text columns (text2, text3, ...), and blank out the cell the value was taken from afterwards?
Example of expected output
data.frame(id = c(1,2,3,4), text1 = c("sth","another","another","all"), text2 = c("more","","add",""), text3 = c("final","and","where",""))
A vectorized base R approach :
#Get indices where text1 is empty
inds <- which(df$text1 == '')
#get values to replace from the corresponding rows
#(ties.method = "first" reliably picks the left-most non-blank column)
vals <- cbind(inds, max.col(df[inds, 3:ncol(df)] != "", ties.method = "first") + 2)
#Replace the values
df$text1[inds] <- df[vals]
#Change the replaced value with blank.
df[vals] <- ''
df
# id text1 text2 text3
#1 1 sth more final
#2 2 another and
#3 3 another add where
#4 4 all
data
df <- data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"), stringsAsFactors = FALSE)
In base R you could do the following (the sub() call moves the first run of ,NA fields behind the non-NA field that follows it, so the first available value ends up in text1 before read.csv re-parses each row):
txt <- do.call(paste,c(sep = ',',`is.na<-`(df,df=="")))
df1 <- read.csv(text = sub("((?:,NA)+)(,\\w+)","\\2\\1",txt),
header = FALSE,
col.names = names(df),
stringsAsFactors = FALSE)
df1[is.na(df1)] <- ""
df1
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
here is a data.table approach...
explanation in comments below
#sample data
df <- data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"), stringsAsFactors = FALSE)
library( data.table )
#create data.table
setDT( df )
#paste together columns by id
ans <- df[, .(string = paste0( .SD, collapse =";")), by = .(id) ][]
# id string
# 1: 1 sth;more;final
# 2: 2 ;another;and
# 3: 3 another;add;where
# 4: 4 ;;all
#remove leading;'s
ans[, string := gsub("^;+", "", string) ]
# id string
# 1: 1 sth;more;final
# 2: 2 another;and
# 3: 3 another;add;where
# 4: 4 all
#split string back to columns, remove the temporary string-column
ans[, paste0( "text", 1:length( tstrsplit(ans$string, ";") ) ) :=
tstrsplit( string, ";") ][, string := NULL ]
# id text1 text2 text3
# 1: 1 sth more final
# 2: 2 another and <NA>
# 3: 3 another add where
# 4: 4 all <NA> <NA>
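If you want empty strings rather than <NA>, to match the expected output in the question exactly, a small follow-up sketch using data.table::set on the three text columns (column names assumed as created above):
for (j in c("text1", "text2", "text3"))
  set(ans, i = which(is.na(ans[[j]])), j = j, value = "")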
You can use dplyr + purrr:
df %>%
tidyr::nest(-id) %>%
dplyr::mutate(
new_text = purrr::map_chr(
data, ~
as.vector(t(.x[1,])) %>%
.[. != ""] %>%
dplyr::first())) %>%
tidyr::unnest()
A tibble: 4 x 5
id text1 text2 text3 new_text
<dbl> <fct> <fct> <fct> <chr>
1 1 sth more final sth
2 2 "" another and another
3 3 another add where another
4 4 "" "" all all
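A side note on syntax (an assumption about your tidyr version, not part of the original answer): nest(-id) and a bare unnest() are superseded in tidyr >= 1.0 and trigger warnings there; the equivalent spelling is roughly:
df %>%
  tidyr::nest(data = -id) %>%
  dplyr::mutate(
    new_text = purrr::map_chr(
      data, ~
        as.vector(t(.x[1, ])) %>%
        .[. != ""] %>%
        dplyr::first())) %>%
  tidyr::unnest(cols = data)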
At this stage, why not also a dplyr approach? Admittedly, with a sprinkle of base R in the middle.
df <- data.frame(id = c(1,2,3,4),
text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"))
library("dplyr")
library("tidyr")
df_filled <- df %>%
pivot_longer(cols = starts_with("text"),
names_to = "text_id",
values_to = "value") %>%
mutate(value = as.character(value)) %>%
group_by(id) %>%
mutate(value = if_else(value=="", as.character(NA), value)) %>%
mutate(previously_missing = value) %>%
tidyr::fill(value, .direction = "downup")
df_filled$value[which(is.na(df_filled$previously_missing)&df_filled$text_id!="text3")+1] <- NA
df_filled %>%
ungroup() %>%
pivot_wider(id_cols = id,
names_from = "text_id",
values_from = "value")
#> # A tibble: 4 x 4
#> id text1 text2 text3
#> <dbl> <chr> <chr> <chr>
#> 1 1 sth more final
#> 2 2 another <NA> and
#> 3 3 another add where
#> 4 4 all <NA> <NA>
Created on 2020-02-19 by the reprex package (v0.3.0)
Another base R solution is to define your custom function swap and apply it by rows, i.e.,
swap <- function(v) {v[inds]<-v[rev(inds <- c(1,head(which(nchar(v)>0),1)))];v}
df[-1]<-t(apply(df[-1], 1, swap))
such that
> df
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
I have a numeric vector with names following a pattern. The name for each element consists of two parts. There are a fixed number of variations on the first part and a fixed number of variations on the second part per the below.
x <- c(2, 4, 3, 7, 6, 9)
names(x) <- c("a.0", "b.0", "c.0", "a.1", "b.1", "c.1")
From this I want to create and print a table where the first part of the names is the rows and the second part the columns per the below.
a b c
0 2 4 3
1 7 6 9
Here are some possibilities. The first 3 only use base R.
1) tapply Use tapply with the row and column parts specified in the second argument.
nms <- names(x)
tapply(x, list(row = sub(".*\\.", "", nms), col = sub("\\..*", "", nms)), c)
giving the following matrix with the indicated row and column names.
col
row a b c
0 2 4 3
1 7 6 9
2) xtabs Another possibility is to use xtabs:
dnms <- read.table(text = names(x), sep = ".", as.is = TRUE,
col.names = c("col", "row"))[2:1]
xtabs(x ~ ., dnms)
giving this xtabs/table object:
col
row a b c
0 2 4 3
1 7 6 9
3) reshape
long <- cbind(x, read.table(text = names(x), sep = ".", as.is = TRUE,
col.names = c("col", "row")))
r <- reshape(long, dir = "wide", idvar = "row", timevar = "col")[-1]
dimnames(r) <- lapply(long[3:2], unique)
r
giving this data.frame:
a b c
0 2 4 3
1 7 6 9
4) dplyr/tidyr/tibble Using the indicated packages we can form the following pipeline:
library(dplyr)
library(tidyr)
library(tibble)
x %>%
stack %>%
separate(ind, c("col", "rowname")) %>%
pivot_wider(names_from = col, values_from = values) %>%
column_to_rownames
giving this data.frame:
a b c
0 2 4 3
1 7 6 9
If you are using an older version of tidyr replace the pivot_wider line with
spread(col, values) %>%
As per @d.b.'s comment, this would also work:
x %>%
data.frame %>%
rownames_to_column %>%
separate(rowname, c("col", "rowname")) %>%
pivot_wider(names_from = col, values_from = ".") %>%
column_to_rownames
do.call(rbind, split(x, gsub(".*\\.(.*)", "\\1", names(x))))
# a.0 b.0 c.0
#0 2 4 3
#1 7 6 9
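If you also want the column names to match the requested a / b / c (rather than a.0, b.0, c.0), one extra line would do it; res here is just a hypothetical name for the matrix built above:
res <- do.call(rbind, split(x, gsub(".*\\.(.*)", "\\1", names(x))))
colnames(res) <- sub("\\..*", "", colnames(res))
res
#   a b c
# 0 2 4 3
# 1 7 6 9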
I have a data frame that looks like this:
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE)
group1_a_mu group1_b_sd group1_c_xx group2_a_mu group2_b_sd group2_c_xx
1 10 4 5 1 2 14
and I would like to transform it to this:
mu sd xx
group1 10 4 5
group2 1 2 14
how can one do that?
You could try the following (based on the data from the original post):
library(dplyr)
library(tidyr)
data.frame(group1_a = 10, group1_b = 4, group1_c = 5, group2_a=1, group2_b=2, group2_c = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
separate(key, c('group_name', 'subgroup_name'), sep = '_') %>%
spread(subgroup_name, val)
## group_name a b c
## 1 group1 10 4 5
## 2 group2 1 2 14
For the situation where there are 2 _ characters (updated post), the following approach temporarily changes the first _ to another character. The alternative is to use look-ahead or look-behind operators in the separate regular expression (sep).
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
mutate(key = sub('_', '|', key)) %>% ## Temporary change of '_' to '|'
separate(key, c('group_name', 'subgroup_name'), sep = '_') %>%
spread(subgroup_name, val) %>%
mutate(group_name = sub('[|]', '_', group_name)) ## Change back to '_'
## group_name mu sd xx
## 1 group1_a 10 NA NA
## 2 group1_b NA 4 NA
## 3 group1_c NA NA 5
## 4 group2_a 1 NA NA
## 5 group2_b NA 2 NA
## 6 group2_c NA NA 14
Using the positive look behind operator will give the same results.
data.frame(group1_a_mu = 10, group1_b_sd = 4, group1_c_xx = 5, group2_a_mu=1, group2_b_sd=2, group2_c_xx = 14, stringsAsFactors = FALSE) %>%
gather(key, val) %>%
separate(key, c('group_name', 'subgroup_name'), sep = '(?<=[a-z])_') %>%
spread(subgroup_name, val)
Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData` to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convoluted, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
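On a current dplyr (>= 1.0), where group_by_ is deprecated, the same loop can be written with across() and all_of() — a hedged sketch of an equivalent, not part of the original answer:
results <- lapply(myGroups, function(x) {
  myData %>%
    group_by(across(all_of(x))) %>%
    summarise(n = length(var1),
              avgVar1 = mean(var1),
              .groups = "drop")
})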
I have created a function based on @Gregor's answer and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=FALSE keeps the original a and b columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg for 1_3 = mean(c(1, 5)) = 3; avg for 1_4 = mean(c(2, 6)) = 4
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6
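For comparison, grouping on both columns directly gives the same averages without the helper column (a quick equivalent sketch using the same df):
df %>% group_by(a, b) %>%
  summarize(avg_val = mean(val)) %>% print()
# a=1,b=3 -> 3 ; a=1,b=4 -> 4 ; a=2,b=3 -> 5 ; a=2,b=4 -> 6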