Having a dataframe like this:
data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"))
How is it possible to detect whether a row in the text1 column is blank, fill that blank with a value that exists in the text2, text3 or text4 column of the same row, and leave the cell the value was taken from empty (NA) after the process?
Example of expected output
data.frame(id = c(1,2,3,4), text1 = c("sth","another","another","all"), text2 = c("more","","add",""), text3 = c("final","and","where",""))
A vectorized base R approach :
# Get indices of the rows where text1 is empty
inds <- which(df$text1 == "")
# For each of those rows, locate the first non-empty column among
# columns 3:ncol(df). ties.method = "first" is required here: the default
# ("random") picks an arbitrary column whenever a row has more than one
# non-empty candidate. "+ 2" converts the position within 3:ncol(df) back
# to a column index of df; drop = FALSE keeps a single candidate column
# as a data frame.
vals <- cbind(
  inds,
  max.col(df[inds, 3:ncol(df), drop = FALSE] != "", ties.method = "first") + 2
)
# Copy the located values into text1
df$text1[inds] <- df[vals]
# Blank out the cells the values were taken from
df[vals] <- ""
df
# id text1 text2 text3
#1 1 sth more final
#2 2 another and
#3 3 another add where
#4 4 all
data
df <- data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"), stringsAsFactors = FALSE)
In base R you could do:
txt <- do.call(paste,c(sep = ',',`is.na<-`(df,df=="")))
df1 <- read.csv(text = sub("((?:,NA)+)(,\\w+)","\\2\\1",txt),
header = FALSE,
col.names = names(df),
stringsAsFactors = FALSE)
df1[is.na(df1)] <- ""
df1
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
here is a data.table approach...
explanation in comments below
#sample data
df <- data.frame(id = c(1,2,3,4), text1 = c("sth","","another",""), text2 = c("more","another","add",""), text3 = c("final","and","where","all"), stringsAsFactors = FALSE)
library( data.table )
#create data.table
setDT( df )
#paste together columns by id
ans <- df[, .(string = paste0( .SD, collapse =";")), by = .(id) ][]
# id string
# 1: 1 sth;more;final
# 2: 2 ;another;and
# 3: 3 another;add;where
# 4: 4 ;;all
#remove leading;'s
ans[, string := gsub("^;+", "", string) ]
# id string
# 1: 1 sth;more;final
# 2: 2 another;and
# 3: 3 another;add;where
# 4: 4 all
#split string back to columns, remove the temporary string-column
ans[, paste0( "text", 1:length( tstrsplit(ans$string, ";") ) ) :=
tstrsplit( string, ";") ][, string := NULL ]
# id text1 text2 text3
# 1: 1 sth more final
# 2: 2 another and <NA>
# 3: 3 another add where
# 4: 4 all <NA> <NA>
You can use dplyr + purrr:
df %>%
tidyr::nest(-id) %>%
dplyr::mutate(
new_text = purrr::map_chr(
data, ~
as.vector(t(.x[1,])) %>%
.[. != ""] %>%
dplyr::first())) %>%
tidyr::unnest()
A tibble: 4 x 5
id text1 text2 text3 new_text
<dbl> <fct> <fct> <fct> <chr>
1 1 sth more final sth
2 2 "" another and another
3 3 another add where another
4 4 "" "" all all
At this stage, why not also a dplyr approach? Admittedly, with a sparkle of base R in the middle
df <- data.frame(id = c(1,2,3,4),
text1 = c("sth","","another",""),
text2 = c("more","another","add",""),
text3 = c("final","and","where","all"))
library("dplyr")
library("tidyr")
df_filled <- df %>%
pivot_longer(cols = starts_with("text"),
names_to = "text_id",
values_to = "value") %>%
mutate(value = as.character(value)) %>%
group_by(id) %>%
mutate(value = if_else(value=="", as.character(NA), value)) %>%
mutate(previously_missing = value) %>%
tidyr::fill(value, .direction = "downup")
df_filled$value[which(is.na(df_filled$previously_missing)&df_filled$text_id!="text3")+1] <- NA
df_filled %>%
ungroup() %>%
pivot_wider(id_cols = id,
names_from = "text_id",
values_from = "value")
#> # A tibble: 4 x 4
#> id text1 text2 text3
#> <dbl> <chr> <chr> <chr>
#> 1 1 sth more final
#> 2 2 another <NA> and
#> 3 3 another add where
#> 4 4 all <NA> <NA>
Created on 2020-02-19 by the reprex package (v0.3.0)
Another base R solution is to define your custom function swap and apply it by rows, i.e.,
# Swap the first element of `v` with its first non-empty element.
# If the first element is already non-empty, or no element is non-empty,
# `v` is returned unchanged.
swap <- function(v) {
  first_filled <- head(which(nchar(v) > 0), 1)
  pair <- c(1, first_filled)
  v[pair] <- v[rev(pair)]
  v
}
df[-1]<-t(apply(df[-1], 1, swap))
such that
> df
id text1 text2 text3
1 1 sth more final
2 2 another and
3 3 another add where
4 4 all
Related
I am working with the R programming language.
I have a column in a data frame that looks something like this (the column is a CHARACTER variable):
head(b$`New Col`)
[1] "1073680,, 781230,, 292455," "128485,, 62890,, 65595," "372475,, 184745,, 187730,"
The first row contains a single element: 1073680,, 781230,, 292455,
The second row contains a single element: 128485,, 62890,, 65595,
The third row contains a single element : 372475,, 184745,, 187730,
I want to split this column into 3 columns:
id col1 col2 col3
1 1 1073680 781230 292455
2 2 128485 62890 65595
3 3 372475 184745 187730
I know how to do this in Excel (e.g. remove last comma, and then used "fixed width delimited" using double commas).
But can someone please show me how to do this in R?
Thanks!
In addition to Maël's answer: if you're strictly after fixed-width separation, use separate with position indices instead of delimiter strings.
Your example data:
b <- structure(list(New.Col = c("1073680,, 781230,, 292455,", " 128485,, 62890,, 65595,",
" 372475,, 184745,, 187730,")), class = "data.frame", row.names = c(NA,
-3L))
separate by fixed widths:
library(tidyr)
b <- b %>%
separate(col = `New.Col`,
into = c('col1', 'drop1', 'col2', 'drop2', 'col3'),
sep = c(7, 10, 17, 21, 27)
)
drop the garbage columns (containing the delimiters):
b %>% select(-starts_with('drop'))
Just for fun. Another way:
library(tidyverse)
df %>%
separate_rows(col1, sep = ",, ") %>%
mutate(col = parse_number(col1), .keep="unused") %>%
group_by(ID) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = ID,
values_from = col,
names_glue = "col_{ID}")
id col_1 col_2 col_3
<int> <dbl> <dbl> <dbl>
1 1 1073680 128485 372475
2 2 781230 62890 184745
3 3 292455 65595 187730
Using base R
df <- cbind(df[1], read.csv(text = trimws(gsub(",+\\s+", ",", df$col),
whitespace = ","), header = FALSE, col.names = paste0("col", 1:3))
)
-output
df
ID col1 col2 col3
1 1 1073680 781230 292455
2 2 128485 62890 65595
3 3 372475 184745 187730
You can do the same in R, with str_remove_all and separate. Here, I remove all commas, and separate by whitespace. Use convert = TRUE to convert the separated values to numeric.
library(dplyr)
library(stringr)
library(tidyr)
df %>%
mutate(col = str_remove_all(col, ",")) %>%
separate(col, into = str_c("col", 1:3), convert = TRUE)
# ID col1 col2 col3
# 1 1 1073680 781230 292455
# 2 2 128485 62890 65595
# 3 3 372475 184745 187730
Edit: you actually don't need the first step, since separate only converts the first pieces it finds, as many as the length of the into parameter (here, 3). By default it also splits on non-alphanumeric character(s), so the separator does not need to be specified.
df %>%
separate(col, into = str_c("col", 1:3), convert = TRUE)
# ID col1 col2 col3
# 1 1 1073680 781230 292455
# 2 2 128485 62890 65595
# 3 3 372475 184745 187730
Data
df <- data.frame(ID = 1:3, col = c("1073680,, 781230,, 292455,", "128485,, 62890,, 65595,", "372475,, 184745,, 187730," ))
Is it possible with dplyr and separate to create new rows if the separation produces more values than the specified "into" columns?
i.e.
df <- data.frame(values = c("1,2,3,4,5,6"))
sep <- separate(
data = df,
col = values,
into = c("Part1", "Part2", "Part3"),
sep = ","
)
sep
Part1 Part2 Part3
1 2 3
The expected result looks like
sep
Part1 Part2 Part3
1 2 3
4 5 6
1) Replace comma with semicolon after every third number, separate into rows and then separate into fields.
library(dplyr)
library(tidyr)
df %>%
mutate(values = gsub("(\\d+,\\d+,\\d+),", "\\1;", values)) %>%
separate_rows(values, sep = ";") %>%
separate(values, into = paste0("Part", 1:3), convert = TRUE)
giving:
# A tibble: 2 x 3
Part1 Part2 Part3
<int> <int> <int>
1 1 2 3
2 4 5 6
2) Alternately, replace the comma after every third field with newline and then use read.table to read it in.
library(dplyr)
df$values %>%
gsub("(\\d+,\\d+,\\d+),", "\\1\n", .) %>%
read.table(text = ., sep = ",", col.names = paste0("Part", 1:3))
giving:
Part1 Part2 Part3
1 1 2 3
2 4 5 6
3) A variation of (2) is to scan it in, convert to matrix and then data frame and add column names.
df$values %>%
scan(text = ., sep = ",", quiet = TRUE) %>%
matrix(ncol = 3, byrow = TRUE) %>%
as.data.frame %>%
setNames(paste0("Part", 1:ncol(.)))
giving:
Part1 Part2 Part3
1 1 2 3
2 4 5 6
I want to see whether the text column has elements outside the specified values of "a" and "b"
# Values that are allowed to appear in the comma-separated `text` column
specified_value <- c("a", "b")
# Input: one comma-separated string per key
df <- data.frame(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d", "1,2", "a,b"))
# Desired output: for each key, the elements of `text` that are not in
# `specified_value` (NA when there are none)
df_out <- data.frame(key = c(1, 2, 3, 4), text = c("c", "d", "1,2", NA))
This is what I have tried:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
But this doesn't work, any suggestions?
We can split the 'text' by the delimiter , with separate_rows, grouped by 'key', get the elements that are not in 'specified_value' with setdiff and paste them together (toString), then do a join to get the other columns in the original dataset
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
separate_rows(text) %>%
group_by(key) %>%
summarise(extra = toString(setdiff(text, specified_value))) %>%
left_join(df) %>%
mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
Using setdiff.
# Per row, the elements of `text` that are not in `specified_value`
extras <- lapply(strsplit(df$text, ","), setdiff, specified_value)
# Collapse each set back into one comma-separated string. Rows with no
# extra element get a real missing value: pasting NA would produce the
# string "NA", which is.na() does not detect.
df$outside <- vapply(
  extras,
  function(x) if (length(x) > 0) paste(x, collapse = ",") else NA_character_,
  character(1)
)
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b <NA>
Data:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
use stringi::stri_split_fixed
library(stringi)
# TRUE when at least one comma-separated element is not in specified_value
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) # FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) # TRUE
An option using regex without splitting the data on comma :
#Collapse the specified_value in one string and remove from text
df$text1 <- gsub(paste0(specified_value, collapse = "|"), '', df$text)
#Remove extra commas
df$text1 <- gsub('(?<![a-z0-9]),', '', df$text1, perl = TRUE)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b
library(purrr)
library(tibble)
library(dplyr)
Starting list of dataframes
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
heading = letters[1:2]),
df2 = data.frame(X.32 = as.character(3:4),
another.topic = paste("Line ", 1:2)))
lst
#> $df1
#> X.1 heading
#> 1 1 a
#> 2 2 b
#>
#> $df2
#> X.32 another.topic
#> 1 3 Line 1
#> 2 4 Line 2
Expected "combined" dataframe, with new consistent variable names, and old variable names in the first row of each constituent dataframe.
#> id h1 h2
#> 1 df1 X.1 heading
#> 2 df1 1 a
#> 3 df1 2 b
#> 4 df2 X.32 another.topic
#> 5 df2 3 Line 1
#> 6 df2 4 Line 2
add_row requires "Name-value pairs, passed on to tibble(). Values can be defined only for columns that already exist in .data and unset columns will get an NA value."
Which is what I think I have achieved with this:
df_nms <-
map(lst, names) %>%
map(set_names)
#> $df1
#> X.1 heading
#> "X.1" "heading"
#>
#> $df2
#> X.32 another.topic
#> "X.32" "another.topic"
But I cannot tie up the last bit, using a purrr function to add the names to the head of each dataframe. I've tried numerous variations with map2 and pmap the closest I can get at present (if I treat add_row as a formula , prefixing it with ~ and remove the .y I get a new first row populated with NAs). I think I'm missing how to pass the name-value pairs to the add_row function.
map2(lst, df_nms, add_row(.x, .y, .before = 1)) %>%
map(set_names, c("h1", "h2")) %>%
map_dfr(bind_rows, .id = "id")
#> Error in add_row(.x, .y, .before = 1): object '.x' not found
A pointer to resolve this last step would be most appreciated.
Not quite sure how to do this via purrr map functions, but here is an alternative,
library(dplyr)
bind_rows(lapply(lst, function(i){d1 <- as.data.frame(matrix(names(i), ncol = ncol(i)));
rbind(d1, setNames(i, names(d1)))}), .id = 'id')
# id V1 V2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Here's an approach using map, rbindlist from data.table and some base R functions:
library(purrr)
library(dplyr)
library(data.table)
map(lst, ~ as.data.frame(unname(rbind(colnames(.x),as.matrix(.x))))) %>%
rbindlist(idcol = "id")
# id V1 V2
#1: df1 X.1 heading
#2: df1 1 a
#3: df1 2 b
#4: df2 X.32 another.topic
#5: df2 3 Line 1
#6: df2 4 Line 2
Alternatively we could use map_df if we use colnames<-:
map_df(lst, ~ as.data.frame(rbind(colnames(.x),as.matrix(.x))) %>%
`colnames<-`(.,paste0("h",seq(1,dim(.)[2]))), .id = "id")
# id h1 h2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Key things here are:
Use as.matrix to get rid of the factor / character incompatibility.
Remove names with unname or set them with colnames<-
Use the idcols = or .id = feature to get the names of the list as a column.
I altered your sample data a bit, setting stringsAsFactors to FALSE when creating the data.frames in lst.
here is a solution using data.table::rbindlist().
#sample data
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
heading = letters[1:2],
stringsAsFactors = FALSE), # !! <--
df2 = data.frame(X.32 = as.character(3:4),
another.topic = paste("Line ", 1:2),
stringsAsFactors = FALSE) # !! <--
)
# Prepend each data frame's column names as its first row, then stack the
# data frames by position (use.names = FALSE), storing the list names in "id"
DT <- data.table::rbindlist(
  lapply(lst, function(x) rbind(names(x), x)),
  use.names = FALSE, idcol = "id"
)
# data.table is not attached in this snippet, so setnames() must be
# namespace-qualified just like rbindlist() above
data.table::setnames(DT, names(lst[[1]]), c("h1", "h2"))
# id h1 h2
# 1: df1 X.1 heading
# 2: df1 1 a
# 3: df1 2 b
# 4: df2 X.32 another.topic
# 5: df2 3 Line 1
# 6: df2 4 Line 2
I have a joining problem that I'm struggling with in that the join IDs I want to use for separate dataframes are spread out across three possible ID columns. I'd like to be able to join if at least one join ID matches. I know the _join and merge functions accept a vector of column names but is it possible to make this work conditionally?
For example, if I have the following two data frames:
df_A <- data.frame(dta = c("FOO", "BAR", "GOO"),
id1 = c("abc", "", "bcd"),
id2 = c("", "", "xyz"),
id3 = c("def", "fgh", ""), stringsAsFactors = F)
df_B <- data.frame(dta = c("FUU", "PAR", "KOO"),
id1 = c("abc", "", ""),
id2 = c("", "xyz", "zzz"),
id3 = c("", "", ""), stringsAsFactors = F)
> df_A
dta id1 id2 id3
1 FOO abc def
2 BAR fgh
3 GOO bcd xyz
> df_B
dta id1 id2 id3
1 FUU abc
2 PAR xyz
3 KOO zzz
I hope to end up with something like this:
dta.x dta.y id1 id2 id3
1 FOO FUU abc "" def [matched on id1]
2 BAR "" "" "" fgh [unmatched]
3 GOO PAR bcd xyz "" [matched on id2]
4 KOO "" "" zzz "" [unmatched]
So that unmatched dta1 and dta2 values are retained, but where there is a match (rows 1 and 3 above) both dta1 and dta2 are joined in the new table. I have a sense that neither _join, merge, nor match will work as is and that I'd need to write a function, but I'm not sure where to start. Any help or ideas appreciated. Thank you
Basically, what you want to do is join by corresponding IDs. To do that, you can convert the original id columns into long format with an id_column and an id_value column; because you don't want to join on "", I dropped those rows.
library(tidyverse)
df_A_long <- df_A %>%
pivot_longer(
cols = -dta,
names_to = "id_column",
values_to = "id_value"
) %>%
dplyr::filter(id_value != "")
df_B_long <- df_B %>%
pivot_longer(
cols = -dta,
names_to = "id_column",
values_to = "id_value"
) %>%
dplyr::filter(id_value != "")
We always use id_column and id_value to join A & B.
> df_B_long
# A tibble: 3 x 3
dta id_column id_value
<chr> <chr> <chr>
1 FUU id1 abc
2 PAR id2 xyz
3 KOO id2 zzz
The joining part is clear, but to create your desired output, we need to do some data wrangling to make it look identical.
df_joined <- df_A_long %>%
  # join using id_column and id_value
  full_join(df_B_long, by = c("id_column", "id_value"), suffix = c("1", "2")) %>%
  # pivot back to wide format: one column per id_column
  pivot_wider(
    id_cols = c(dta1, dta2),
    names_from = id_column,
    values_from = id_value
  ) %>%
  # if dta1 is missing, then in the same row, move value from dta2 to dta1
  mutate(
    dta1_has_value = !is.na(dta1), # helper column
    dta1 = ifelse(dta1_has_value, dta1, dta2),
    dta2 = ifelse(!dta1_has_value & !is.na(dta2), NA, dta2)
  ) %>%
  select(-dta1_has_value) %>%
  group_by(dta1) %>%
  # condense multiple rows into one row: first non-missing value per column,
  # or "" when the whole group is missing
  summarise_all(
    ~ifelse(all(is.na(.x)), "", .x[!is.na(.x)])
  ) %>%
  # reorder columns alphabetically; use the piped value (.) here, because
  # df_joined does not exist yet while its own definition is being evaluated
  {
    .[sort(colnames(.))]
  }
Result:
> df_joined
# A tibble: 4 x 5
dta1 dta2 id1 id2 id3
<chr> <chr> <chr> <chr> <chr>
1 BAR "" "" "" fgh
2 FOO FUU abc "" def
3 GOO PAR bcd xyz ""
4 KOO "" "" zzz ""
library(sqldf)
one <-
sqldf('
select a.*
, b.dta as dta_b
from df_A a
left join df_B b
on a.id1 <> ""
and (
a.id1 = b.id1
or a.id2 = b.id2)
')
two <-
sqldf('
select b.*
from df_B b
left join one
on b.dta = one.dta
or b.dta = one.dta_b
where one.dta is null
')
dplyr::bind_rows(one, two)
# dta id1 id2 id3 dta_b
# 1 FOO abc def FUU
# 2 BAR fgh <NA>
# 3 GOO bcd xyz PAR
# 4 KOO zzz <NA>