I am working with the R programming language.
I have a column in a data frame that looks something like this (the column is a CHARACTER variable):
head(b$`New Col`)
[1] "1073680,, 781230,, 292455," "128485,, 62890,, 65595," "372475,, 184745,, 187730,"
The first row contains a single element: 1073680,, 781230,, 292455,
The second row contains a single element: 128485,, 62890,, 65595,
The third row contains a single element : 372475,, 184745,, 187730,
I want to split this column into 3 columns:
id col1 col2 col3
1 1 1073680 781230 292455
2 2 128485 62890 65595
3 3 372475 184745 187730
I know how to do this in Excel (e.g. remove last comma, and then used "fixed width delimited" using double commas).
But can someone please show me how to do this in R?
Thanks!
In addition to Maël's answer: if you're strictly after fixed-width separation, use separate with position indices instead of delimiter strings.
Your example data:
b <- structure(list(New.Col = c("1073680,, 781230,, 292455,", " 128485,, 62890,, 65595,",
" 372475,, 184745,, 187730,")), class = "data.frame", row.names = c(NA,
-3L))
separate by fixed widths:
library(tidyr)
# Split New.Col at fixed character positions. Five split positions produce six
# pieces, so six names are supplied: three value columns and three throw-away
# "drop" columns holding the delimiters (the last one takes whatever follows
# position 27).
# NOTE(review): the positions assume the padded, fixed-width layout of the
# original data - confirm against the real column widths.
b <- b %>%
  separate(
    col = `New.Col`,
    into = c("col1", "drop1", "col2", "drop2", "col3", "drop3"),
    sep = c(7, 10, 17, 21, 27)
  )
drop the garbage columns (containing the delimiters):
# Remove every helper column whose name starts with "drop".
# select() is namespace-qualified because only tidyr is attached in this snippet.
b %>% dplyr::select(-starts_with("drop"))
Just for fun. Another way:
# Alternative that goes through long format and back to wide.
# NOTE(review): this snippet reads columns named `ID` and `col1` from `df` -
# confirm they match the data actually used.
library(tidyverse)
df %>%
# one row per ",, "-separated piece of col1
separate_rows(col1, sep = ",, ") %>%
# pull the number out of each piece; .keep = "unused" drops col1 afterwards
mutate(col = parse_number(col1), .keep="unused") %>%
group_by(ID) %>%
# position of each value within its original row
mutate(id = row_number()) %>%
# one output column per original ID (col_1, col_2, ...), so the printed result
# is the transpose of the layout asked for in the question
pivot_wider(names_from = ID,
values_from = col,
names_glue = "col_{ID}")
id col_1 col_2 col_3
<int> <dbl> <dbl> <dbl>
1 1 1073680 128485 372475
2 2 781230 62890 184745
3 3 292455 65595 187730
Using base R
# Rebuild df as its first column plus three parsed value columns.
# local() keeps the intermediate objects out of the calling environment.
df <- local({
  # turn every "<commas><whitespace>" separator into a single comma
  flattened <- gsub(",+\\s+", ",", df$col)
  # strip the dangling comma(s) at either end of each string
  flattened <- trimws(flattened, whitespace = ",")
  # parse the cleaned strings as header-less CSV rows
  values <- read.csv(
    text = flattened,
    header = FALSE,
    col.names = paste0("col", 1:3)
  )
  cbind(df[1], values)
})
-output
df
ID col1 col2 col3
1 1 1073680 781230 292455
2 2 128485 62890 65595
3 3 372475 184745 187730
You can do the same in R, with str_remove_all and separate. Here, I remove all commas, and separate by whitespace. Use convert = TRUE to convert the separated values to numeric.
library(dplyr)
library(stringr)
library(tidyr)
df %>%
mutate(col = str_remove_all(col, ",")) %>%
separate(col, into = str_c("col", 1:3), convert = TRUE)
# ID col1 col2 col3
# 1 1 1073680 781230 292455
# 2 2 128485 62890 65595
# 3 3 372475 184745 187730
Edit: you actually don't need the first step, since separate only keeps as many pieces as there are names in the into parameter (here, 3). By default it also splits on any run of non-alphanumeric characters, so there is no need to specify the separator.
df %>%
separate(col, into = str_c("col", 1:3), convert = TRUE)
# ID col1 col2 col3
# 1 1 1073680 781230 292455
# 2 2 128485 62890 65595
# 3 3 372475 184745 187730
Data
df <- data.frame(ID = 1:3, col = c("1073680,, 781230,, 292455,", "128485,, 62890,, 65595,", "372475,, 184745,, 187730," ))
Related
is it possible with dplyr and separate to create new lines if the separation produces more values than the specified "into" columns?
i.e.
df <- data.frame(values = c("1,2,3,4,5,6"))
sep <- separate(
data = df,
col = values,
into = c("Part1", "Part2", "Part3"),
sep = ","
)
sep
Part1 Part2 Part3
1 2 3
The expected result looks like
sep
Part1 Part2 Part3
1 2 3
4 5 6
1) Replace comma with semicolon after every third number, separate into rows and then separate into fields.
library(dplyr)
library(tidyr)
df %>%
mutate(values = gsub("(\\d+,\\d+,\\d+),", "\\1;", values)) %>%
separate_rows(values, sep = ";") %>%
separate(values, into = paste0("Part", 1:3), convert = TRUE)
giving:
# A tibble: 2 x 3
Part1 Part2 Part3
<int> <int> <int>
1 1 2 3
2 4 5 6
2) Alternately, replace the comma after every third field with newline and then use read.table to read it in.
library(dplyr)
df$values %>%
gsub("(\\d+,\\d+,\\d+),", "\\1\n", .) %>%
read.table(text = ., sep = ",", col.names = paste0("Part", 1:3))
giving:
Part1 Part2 Part3
1 1 2 3
2 4 5 6
3) A variation of (2) is to scan it in, convert to matrix and then data frame and add column names.
df$values %>%
scan(text = ., sep = ",", quiet = TRUE) %>%
matrix(ncol = 3, byrow = TRUE) %>%
as.data.frame %>%
setNames(paste0("Part", 1:ncol(.)))
giving:
Part1 Part2 Part3
1 1 2 3
2 4 5 6
I want to see whether the text column has elements outside the specified values of "a" and "b"
# Values that are allowed to appear in `text`
specified_value <- c("a", "b")
# Input: one comma-separated string per key
df <- data.frame(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d", "1,2", "a,b"))
# Desired output: per key, the elements not in `specified_value` (NA if none)
df_out <- data.frame(key = c(1, 2, 3, 4), text = c("c", "d", "1,2", NA))
This is what I have tried:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
But this doesn't work, any suggestions?
We can split the 'text' by the delimiter , with separate_rows, grouped by 'key', get the elements that are not in 'specified_value' with setdiff and paste them together (toString), then do a join to get the other columns in the original dataset
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
  # one row per comma-separated element of `text`
  separate_rows(text, sep = ",") %>%
  group_by(key) %>%
  # keep only the elements that are not in `specified_value`
  summarise(extra = toString(setdiff(text, specified_value))) %>%
  # bring back the original `text`; naming the key avoids the join-by message
  left_join(df, by = "key") %>%
  # toString() of an empty set is "", which is reported as NA
  mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
Using setdiff.
# For every row, list the elements of `text` that are not in `specified_value`.
# Rows with nothing left over get a real missing value (NA_character_) rather
# than the literal string "NA" that paste() would produce from an NA.
df$outside <- vapply(
  strsplit(df$text, ","),
  function(parts) {
    extra <- setdiff(parts, specified_value)
    if (length(extra) == 0L) NA_character_ else paste(extra, collapse = ",")
  },
  character(1)
)
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b <NA>
Data:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
use stringi::stri_split_fixed
library(stringi)
# TRUE when the string contains at least one element outside `specified_value`.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) # FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) # TRUE
An option using regex without splitting the data on comma :
# Collapse the specified values into one alternation and remove them from text.
# The \\b anchors restrict matches to whole elements, so "a" is not stripped
# out of a longer element such as "cab".
# NOTE(review): values containing regex metacharacters would need escaping.
df$text1 <- gsub(
  paste0("\\b(?:", paste0(specified_value, collapse = "|"), ")\\b"),
  "",
  df$text,
  perl = TRUE
)
# Remove the extra commas: squeeze runs left by removed elements, then trim
# a leading or trailing comma
df$text1 <- gsub(",{2,}", ",", df$text1)
df$text1 <- gsub("^,|,$", "", df$text1)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b
library(purrr)
library(tibble)
library(dplyr)
Starting list of dataframes
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
heading = letters[1:2]),
df2 = data.frame(X.32 = as.character(3:4),
another.topic = paste("Line ", 1:2)))
lst
#> $df1
#> X.1 heading
#> 1 1 a
#> 2 2 b
#>
#> $df2
#> X.32 another.topic
#> 1 3 Line 1
#> 2 4 Line 2
Expected "combined" dataframe, with new consistent variable names, and old variable names in the first row of each constituent dataframe.
#> id h1 h2
#> 1 df1 X.1 heading
#> 2 df1 1 a
#> 3 df1 2 b
#> 4 df2 X.32 another.topic
#> 5 df2 3 Line 1
#> 6 df2 4 Line 2
add_row requires "Name-value pairs, passed on to tibble(). Values can be defined only for columns that already exist in .data and unset columns will get an NA value."
Which is what I think I have achieved with this:
df_nms <-
map(lst, names) %>%
map(set_names)
#> $df1
#> X.1 heading
#> "X.1" "heading"
#>
#> $df2
#> X.32 another.topic
#> "X.32" "another.topic"
But I cannot tie up the last bit, using a purrr function to add the names to the head of each dataframe. I've tried numerous variations with map2 and pmap the closest I can get at present (if I treat add_row as a formula , prefixing it with ~ and remove the .y I get a new first row populated with NAs). I think I'm missing how to pass the name-value pairs to the add_row function.
map2(lst, df_nms, add_row(.x, .y, .before = 1)) %>%
map(set_names, c("h1", "h2")) %>%
map_dfr(bind_rows, .id = "id")
#> Error in add_row(.x, .y, .before = 1): object '.x' not found
A pointer to resolve this last step would be most appreciated.
Not quite sure how to do this via purrr map functions, but here is an alternative,
library(dplyr)
bind_rows(lapply(lst, function(i){d1 <- as.data.frame(matrix(names(i), ncol = ncol(i)));
rbind(d1, setNames(i, names(d1)))}), .id = 'id')
# id V1 V2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Here's an approach using map, rbindlist from data.table and some base R functions:
library(purrr)
library(dplyr)
library(data.table)
map(lst, ~ as.data.frame(unname(rbind(colnames(.x),as.matrix(.x))))) %>%
rbindlist(idcol = "id")
# id V1 V2
#1: df1 X.1 heading
#2: df1 1 a
#3: df1 2 b
#4: df2 X.32 another.topic
#5: df2 3 Line 1
#6: df2 4 Line 2
Alternatively we could use map_df if we use colnames<-:
map_df(lst, ~ as.data.frame(rbind(colnames(.x),as.matrix(.x))) %>%
`colnames<-`(.,paste0("h",seq(1,dim(.)[2]))), .id = "id")
# id h1 h2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Key things here are:
Use as.matrix to get rid of the factor / character incompatibility.
Remove names with unname or set them with colnames<-
Use the idcol = or .id = feature to get the names of the list as a column.
I altered your sample data a bit, setting stringsAsFactors to FALSE when creating the data.frames in lst.
here is a solution using data.table::rbindlist().
#sample data
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
heading = letters[1:2],
stringsAsFactors = FALSE), # !! <--
df2 = data.frame(X.32 = as.character(3:4),
another.topic = paste("Line ", 1:2),
stringsAsFactors = FALSE) # !! <--
)
# Prepend each data frame's column names as its first row, then stack the
# pieces by position (use.names = FALSE); idcol records the list name.
DT <- data.table::rbindlist(
  lapply(lst, function(x) rbind(names(x), x)),
  use.names = FALSE, idcol = "id"
)
# The stacked columns carry the first data frame's names; rename them.
# setnames() is namespace-qualified because data.table is not attached here.
data.table::setnames(DT, names(lst[[1]]), c("h1", "h2"))
# id h1 h2
# 1: df1 X.1 heading
# 2: df1 1 a
# 3: df1 2 b
# 4: df2 X.32 another.topic
# 5: df2 3 Line 1
# 6: df2 4 Line 2
I have the list below (with sublists as well). The sublists have unequal lengths: list "a" has 2 columns and list "b" has 3 columns.
f <- list(a=list(1,2.5,9.5),b=list("2","-true","3",4))
I need to append this list keeping references like below. For example,
COl1 COl2 COl3 Col4
a 1 false NA
b 2 true 3
As you can see above, there is a reference in col 1 from where the data object the lists is taken. Please guide
1) data.table Set names on the list giving the new list fnam and then use rbindlist from data.table:
library(data.table)
# Name each sub-list's elements COL2, COL3, ... so rbindlist can align them;
# length.out is spelled out rather than relying on partial argument matching.
fnam <- lapply(f, function(x) {
  setNames(x, paste0("COL", seq(2, length.out = length(x))))
})
# fill = TRUE pads the shorter sub-lists with NA; COL1 holds the list names
cbind(COL1 = names(f), rbindlist(fnam, fill = TRUE))
giving:
COL1 COL2 COL3 COL4
1: a 1 false <NA>
2: b 2 true 3
2) base R This alternative uses no packages. We create a character vector out of f and then read it in using read.table.
# One text line per top-level element: its name followed by its values.
# vapply() guarantees exactly one string per element.
Lines <- paste(names(f), vapply(f, paste, character(1), collapse = " "))
# Widest sub-list plus one column for the name
nc <- max(lengths(f)) + 1
col.names <- paste0("COL", seq_len(nc))
# fill = TRUE pads the shorter rows with NA
read.table(text = Lines, header = FALSE, fill = TRUE, col.names = col.names)
giving:
COL1 COL2 COL3 COL4
1 a 1 false NA
2 b 2 true 3
Use some separator not appearing in the data if the data can contain spaces.
One option would be to set the names of the list elements using map and specify the .id as 'COL1' to create a new column based on the names of 'f'. Note that map returns a list, while map_df returns a tbl_df/data.frame
1)
library(tidyverse)
f %>%
map_df(~ set_names(., paste0("COL", seq_along(.)+1)), .id = 'COL1')
# A tibble: 2 x 4
# COL1 COL2 COL3 COL4
# <chr> <dbl> <chr> <chr>
#1 a 1 false <NA>
#2 b 2 true 3
2) If the types are different, retype (from hablar) and then do
library(hablar)
f1 %>%
map_df(~ set_names(.x, paste0("COL", seq_along(.)+1)) %>%
map(retype), .id = 'COL1')
# A tibble: 2 x 4
# COL1 COL2 COL3 COL4
# <chr> <int> <chr> <int>
#1 a 1 false NA
#2 b 2 true 3
3) Or with type.convert
# Convert each element to its natural type, then name the values COL2, COL3, ...
# The "+ 1" offset keeps the value columns from colliding with the id column
# that .id = "COL1" adds.
f1 %>%
  map_df(~ map(.x, type.convert, as.is = TRUE) %>%
    set_names(paste0("COL", seq_along(.x) + 1)), .id = "COL1")
# A tibble: 2 x 4
# COL1 COL2 COL3 COL4
# <chr> <int> <chr> <int>
#1 a 1 false NA
#2 b 2 true 3
4) if the integer/numeric is giving an issue, then convert it to common type ie. to numeric
# Same as the type.convert variant, but integers are widened to numeric so the
# pieces bind to a common type. Value columns start at COL2 ("+ 1") so they do
# not collide with the id column that .id = "COL1" adds.
f1 %>%
  map_df(~ map(.x, type.convert, as.is = TRUE) %>%
    map_if(is.integer, as.numeric) %>%
    set_names(paste0("COL", seq_along(.x) + 1)), .id = "COL1")
5) As the types are mixed up, it may be better to do the retype after converting to a single data.frame
f %>%
map_df(~ map(.x, as.character) %>%
set_names(paste0("COL", seq_along(.x) + 1)), .id = "COL1") %>%
retype
data
f <- list(a = list(1, "false"), b = list(2, "true", "3"))
f1 <- list(a=list(1,"false"),b=list("2","true","3"))
How about another simple base R solution.
f <- list(a = list(1, 2.5, 9.5), b = list("2", "-true", "3", 4))
# One row per top-level element, wide enough for the longest sub-list;
# cells beyond a sub-list's length stay NA.
m <- matrix(NA, ncol = max(lengths(f)), nrow = length(f))
for (i in seq_len(nrow(m))) {
  # flatten the sub-list; mixed types are coerced to a common type by unlist()
  u <- unlist(f[[i]])
  # an empty sub-list has nothing to assign, so leave its row as NA
  if (length(u) > 0) {
    m[i, seq_along(u)] <- u
  }
}
your_data_frame <- as.data.frame(m)
I have a simple question about aggregating values in R.
Suppose I have a dataframe:
DF <- data.frame(col1=c("Type 1", "Type 1B", "Type 2"), col2=c(1, 2, 3))
which looks like this:
col1 col2
1 Type 1 1
2 Type 1B 2
3 Type 2 3
I notice that I have Type 1 and Type 1B in the data, so I would like to combine Type 1B into Type 1.
So I decide to use dplyr:
filter(DF, col1=='Type 1' | col1=='Type 1B') %>%
summarise(n = sum(col2))
But now I need to keep going with it:
DF2 <- data.frame('Type 1', filter(DF, col1=='Type 1' | col1=='Type 1B') %>%
summarise(n = sum(col2)))
I guess I want to cbind this new DF2 back to the original DF, but that means I have to set the column names to be consistent:
names(DF2) <- c('col1', 'col2')
OK, now I can rbind:
rbind(DF2, DF[3,])
The result? It worked....
col1 col2
1 Type 1 3
3 Type 2 3
...but ugh! That was awful! There has to be a better way to simply combine values.
Here's a possible dplyr approach:
library(dplyr)
DF %>%
group_by(col1 = sub("(.*\\d+).*$", "\\1", col1)) %>%
summarise(col2 = sum(col2))
#Source: local data frame [2 x 2]
#
# col1 col2
#1 Type 1 3
#2 Type 2 3
Using sub() with aggregate(), removing anything other than a digit from the end of col1,
do.call("data.frame",
aggregate(col2 ~ cbind(col1 = sub("\\D+$", "", col1)), DF, sum)
)
# col1 col2
# 1 Type 1 3
# 2 Type 2 3
The do.call() wrapper is there so that the first column after aggregate() is properly changed from a matrix to a vector. This way there aren't any surprises later on down the road.
In my opinion, aggregate() is the perfect function for this purpose, but you shouldn't have to do any text processing (e.g. gsub()). I would do this in a two-step process:
Overwrite col1 with the new desired grouping.
Compute the aggregation using the new col1 to specify the grouping.
# Recode both variants to 'Type 1'; every other row keeps its own value.
# as.character() supplies the per-row fallback whether col1 is a factor or a
# character vector (levels() is NULL for character columns and is not aligned
# with rows for factors).
DF$col1 <- ifelse(
  DF$col1 %in% c('Type 1', 'Type 1B'),
  'Type 1',
  as.character(DF$col1)
)
DF
## col1 col2
## 1 Type 1 1
## 2 Type 1 2
## 3 Type 2 3
DF <- aggregate(col2~col1, DF, FUN=sum );
DF;
## col1 col2
## 1 Type 1 3
## 2 Type 2 3
You can try:
library(data.table)
setDT(transform(DF, col1=gsub("(.*)[A-Z]+$","\\1",DF$col1)))[,list(col2=sum(col2)),col1]
# col1 col2
# 1: Type 1 3
# 2: Type 2 3
Or even more directly:
setDT(DF)[, .(col2 = sum(col2)), by = .(col1 = sub("[[:alpha:]]+$", "", col1))]