Splitting strings into components - r

For example, I have a data table with several columns:
column A column B
key_500:station and loc 2
spectra:key_600:type 9
alpha:key_100:number 12
I want to split the rows of column A into components and create new columns, guided by the following rules:
the value between "key_" and ":" will be var1,
the next value after ":" will be var2,
the original column A should retain the part of string that is prior to ":key_". If it is empty (as in the first line), then replace "" with an "effect" word.
My expected final data table should be like this one:
column A column B var1 var2
effect 2 500 station and loc
spectra 9 600 type
alpha 12 100 number

Using tidyr extract you can extract specific part of the string using regex.
tidyr::extract(df, columnA, into = c('var1', 'var2'), 'key_(\\d+):(.*)',
convert = TRUE, remove = FALSE) %>%
dplyr::mutate(columnA = sub(':?key_.*', '', columnA),
columnA = replace(columnA, columnA == '', 'effect'))
# columnA var1 var2 columnB
#1 effect 500 station and loc 2
#2 spectra 600 type 9
#3 alpha 100 number 12
If you want to use data.table you can break this down in steps :
library(data.table)
# Convert df to a data.table by reference so := can add/modify columns in place.
setDT(df)
# var1: the digits that follow "key_"; var2: everything after "key_<digits>:".
df[, c('var1', 'var2') := .(sub('.*key_(\\d+).*', '\\1',columnA),
sub('.*key_\\d+:', '', columnA))]
# Drop "key_" (with an optional leading ":") and everything after it, leaving
# only the prefix; the result is "" when the value starts with "key_".
df[, columnA := sub(':?key_.*', '', columnA)]
# Replace the empty prefixes with the placeholder word "effect".
df[, columnA := replace(columnA, columnA == '', 'effect')]
data
df <- structure(list(columnA = c("key_500:station and loc",
"spectra:key_600:type", "alpha:key_100:number"),
columnB = c(2L, 9L, 12L)), class = "data.frame", row.names = c(NA, -3L))

You can use tidyr's separate(), which by default splits the string on runs of non-alphanumeric characters and puts the pieces into the columns named in into.
# library() fails loudly if a package is missing; require() would only return
# FALSE and let the script break later with a less helpful error.
library(tidyr)
library(dplyr)
# Sample data from the question; the first row has nothing before "key_".
df <- tribble(
  ~"column A", ~"column B",
  "key_500:station and loc", 2,
  "spectra:key_600:type", 9,
  "alpha:key_100:number", 12
)
df %>%
  # Split on runs of characters that are neither alphanumeric nor a space, so
  # multi-word values such as "station and loc" stay in one piece (the default
  # separator would also split on the spaces). Rows with only three pieces are
  # padded with NA on the left, i.e. in `column A`.
  separate("column A", into = c("column A", "key", "var1", "var2"),
           sep = "[^[:alnum:] ]+", fill = "left") %>%
  # Keep the wanted columns in the requested order; this also drops the
  # helper column `key`.
  select("column A", "column B", var1, var2) %>%
  # A missing prefix becomes the placeholder word "effect".
  mutate(`column A` = ifelse(is.na(`column A`), "effect", `column A`))
And this is a modified version to work with data.tables
# library() stops with an error if a package is missing, unlike require().
library(tidyr)
library(data.table)
DT <- data.table(
  "column A" = c("key_500:station and loc",
                 "spectra:key_600:type",
                 "alpha:key_100:number"),
  "column B" = c(2, 9, 12))
# Split on runs of characters that are neither alphanumeric nor a space, so
# "station and loc" is kept whole; three-piece rows are padded with NA on the
# left, i.e. in `column A`.
DT <- separate(DT, "column A", into = c("column A", "key", "var1", "var2"),
               sep = "[^[:alnum:] ]+", fill = "left")
# The literal "key" piece carries no information.
DT$key <- NULL
# A missing prefix becomes the placeholder word "effect".
DT$`column A` <- ifelse(is.na(DT$`column A`), "effect", DT$`column A`)
# Reorder by name rather than by position so the result does not depend on
# where separate() happened to put the new columns.
DT <- DT[, c("column A", "column B", "var1", "var2")]

Related

R: adding matching vector values from two dataframes in one column

I have a data frame which is configured roughly like this:
df <- cbind(c('hello', 'yes', 'example'),c(7,8,5),c(0,0,0))
words
frequency
count
hello
7
0
yes
8
0
example
5
0
What I'm trying to do is add values to the third column from a different data frame, which is similar but looks like this:
df2 <- cbind(c('example','hello') ,c(5,6))
words
frequency
example
5
hello
6
My goal is to find matching values for the first column in both data frames (they have the same column name) and add matching values from the second data frame to the third column of the first data frame.
The result should look like this:
df <- cbind(c('hello', 'yes', 'example'),c(7,8,5),c(6,0,5))
words
frequency
count
hello
7
6
yes
8
0
example
5
5
What I've tried so far is:
df <- merge(df,df2, by = "words", all.x=TRUE)
However, it doesn't work.
I could use some help understanding how could it be done. Any help will be welcome.
This is an "update join". My favorite way to do it is in dplyr:
library(dplyr)
df %>% rows_update(rename(df2, count = frequency), by = "words")
In base R you could do the same thing like this:
# Rename df2's value column so it does not clash with df$frequency in the merge.
names(df2)[2] <- "count2"
# Left join: every row of df is kept; words absent from df2 get NA in count2.
# Note that merge() sorts the result by the key column by default.
df <- merge(df, df2, by = "words", all.x = TRUE)
# Use the new value where one was matched, otherwise keep the existing count.
df$count <- ifelse(is.na(df$count2), df$count, df$count2)
# Drop the helper column.
df$count2 <- NULL
Here is an option with data.table:
library(data.table)
setDT(df)[setDT(df2), on = "words", count := i.frequency]
Output
words frequency count
<char> <num> <num>
1: hello 7 6
2: yes 8 0
3: example 5 5
Or using match in base R:
df$count[match(df2$words, df$words)] <- df2$frequency
Or another option with tidyverse using left_join and coalesce:
library(tidyverse)
# Left join keeps all rows of df; words without a match get NA in count.y.
left_join(df, df2 %>% rename(count.y = frequency), by = "words") %>%
  # coalesce() takes the joined value when there is one and falls back to the
  # existing count otherwise. pmax() would instead keep the larger of the two,
  # which is wrong whenever the new value is smaller than the old one.
  mutate(count = coalesce(count.y, count)) %>%
  select(-count.y)
Data
df <- structure(list(words = c("hello", "yes", "example"), frequency = c(7,
8, 5), count = c(0, 0, 0)), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(words = c("example", "hello"), frequency = c(5, 6)), class = "data.frame", row.names = c(NA,
-2L))

R - Comma delimit a column while adding others

I have a data frame which looks like this:
ID Age
1 19
2 20
3 56
4 81
I want to add up the column age and comma delimit ID:
ID Age
1,2,3,4 176
I have tried this:
aggregate(ID ~., data, toString) as per this solution:
Collapse / concatenate / aggregate a column to a single comma separated string within each group
But, this is not producing desired result.
You can make a new data frame by applying different functions to each column.
# Example input: four IDs with their ages
df <- data.frame(ID = 1:4, Age = c(19, 20, 56, 81))
# One-row summary: IDs joined with commas, ages added up
resul <- with(df, data.frame(ID = paste(ID, collapse = ","), Age = sum(Age)))
# ID Age
#1 1,2,3,4 176
We can use dplyr
library(dplyr)
df %>% summarise(ID = toString(ID), Age = sum(Age))
# ID Age
#1 1, 2, 3, 4 176
and data.table
library(data.table)
setDT(df)[, .(ID = toString(ID), Age = sum(Age))]

Pattern Searching in R

I have two data frames as below. DF1 is slightly messy (as you can see below): it has multiple values from DF2 combined into one column.
DF1
SRNo. Value
1 1ABCD2EFGH3IJKL
2 1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV
3 3ABCD4EFGH5IJKL
4 3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL
5 7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH
DF2
SRNo. Value
1 1ABCD2EFGH3IJKL
2 3ABCD4EFGH5IJKL
3 6PQRS7TUVW8XYZA
4 5FGHI9XUZX1RATP
5 9AGTY6UGFW0AAUU
6 6TEYD7RARA8MHAT
7 9IUYT7HGFD3LKJH
I want to do a look up using values column in both the data set. Here is what I am trying to accomplish.
i) For rows 1 & 3 in DF1 it is a simple look up in DF2. I expect the code to return those looked up values.
ii) For row #2 in DF1, only the first part of the string matches a value in DF2. I expect the code to return only the first part.
iii) For row#4 in DF1, both the parts in the string matches with values in DF2. In this case I want the first part of the string that is matching to be retained
iv) For Row #5, the second part in the string matches with the value in DF2. I would expect the code to return the 2nd part of the string.
I have around 47,000 rows in the first dataset and over 300,000 in the second, and of course there are other columns in both datasets. I have tried this in multiple ways using str_split/str_match but could not accomplish what I want. Every suggestion is appreciated. The rest of my code is in R.
Thank You
The first step is to tidyr::separate() the Value column of DF1 at "/". Then I used dplyr::case_when() with %in% to check whether the first of the listed items appears in DF2; if it doesn't, the second item is checked instead. I used dplyr::mutate() to append the result to DF1 as a new column, dat.
library(dplyr)
library(tidyr)
# Sample data from the question. as_tibble() replaces the deprecated tbl_df().
DF1 <- data.frame(
  "SRNo." = 1:5,
  Value = c("1ABCD2EFGH3IJKL", "1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV",
            "3ABCD4EFGH5IJKL", "3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL",
            "7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH"),
  stringsAsFactors = FALSE
) %>%
  as_tibble()
DF2 <- data.frame(
  "SRNo." = 1:7,
  Value = c("1ABCD2EFGH3IJKL", "3ABCD4EFGH5IJKL", "6PQRS7TUVW8XYZA",
            "5FGHI9XUZX1RATP", "9AGTY6UGFW0AAUU", "6TEYD7RARA8MHAT",
            "9IUYT7HGFD3LKJH"),
  stringsAsFactors = FALSE
) %>%
  as_tibble()
DF1 %>%
  # Split at "/"; fill = "right" pads rows that hold a single value with NA in
  # Value2 instead of emitting a "missing pieces" warning.
  separate(Value, c("Value1", "Value2"), sep = "/", fill = "right") %>%
  # case_when() takes the first matching branch, so the first part wins when
  # both parts are present in DF2; rows with no match at all get NA.
  mutate(dat = case_when(
    Value1 %in% DF2$Value ~ Value1,
    Value2 %in% DF2$Value ~ Value2,
    TRUE ~ NA_character_
  ))
# # A tibble: 5 x 4
# SRNo. Value1 Value2 dat
# <int> <chr> <chr> <chr>
# 1 1 1ABCD2EFGH3IJKL NA 1ABCD2EFGH3IJKL
# 2 2 1ABCD2EFGH3IJKL 7MLPO0OKMN8MNBV 1ABCD2EFGH3IJKL
# 3 3 3ABCD4EFGH5IJKL NA 3ABCD4EFGH5IJKL
# 4 4 3ABCD4EFGH5IJKL 1ABCD2EFGH3IJKL 3ABCD4EFGH5IJKL
# 5 5 7MLPO0OKMN8MNBV 9IUYT7HGFD3LKJH 9IUYT7HGFD3LKJH
Data.table solution
df1 <- read.table(text="SRNo. Value
1 1ABCD2EFGH3IJKL
2 1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV
3 3ABCD4EFGH5IJKL
4 3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL
5 7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH", header = T, stringsAsFactors = F)
df2 <- read.table( text = "SRNo. Value
1 1ABCD2EFGH3IJKL
2 3ABCD4EFGH5IJKL
3 6PQRS7TUVW8XYZA
4 5FGHI9XUZX1RATP
5 9AGTY6UGFW0AAUU
6 6TEYD7RARA8MHAT
7 9IUYT7HGFD3LKJH", header = T, stringsAsFactors = F )
library( data.table )
# Split df1$Value at the literal "/" into Value1 (first part) and Value2
# (second part, NA when there is no "/"); columns are added by reference.
setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)]
setDT(df2)
# Join df1 onto df2 once per part; nomatch = 0L drops df1 rows whose part is
# not found in df2$Value.
resultv1 <- df2[ df1, on = c( Value = "Value1"), nomatch = 0L ]
resultv2 <- df2[ df1, on = c( Value = "Value2"), nomatch = 0L ]
# Stack first-part matches ahead of second-part matches, then keep only the
# first row per df1 row (i.SRNo. is df1's SRNo.), so a first-part match takes
# precedence when both parts match.
result <- rbindlist( list( resultv1, resultv2 ) )[!duplicated( i.SRNo.)]
Benchmarking it against the solution from @Paul shows similar runtimes (~2.5 milliseconds). But data.table sometimes surprises me on larger data sets.
If memory becomes an issue, you can do it all in one go:
rbindlist( list( setDT(df2)[ setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)],
on = c( Value = "Value1"), nomatch = 0L ],
setDT(df2)[ setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)],
on = c( Value = "Value2"), nomatch = 0L ] ) )[!duplicated( i.SRNo.)]

Create a sequence of strings

Given a data set similar to the following
dat = structure(list(OpportunityId = c("006a000000zLXtZAAW", "006a000000zLXtZAAW",
"006a000000gst", "006a000000gstg", "006a000000gstg",
"006a000000zLXtZAAW"), IsWon = c(1, 1, 1, 1, 1, 1),
sequence = c("LLLML", "LHHHL", "LLLML", "HMLLL", "LLLLL", "LLLLL")),
.Names = c("OpportunityId","IsWon", "sequence"), row.names = c(NA, 6L), class = "data.frame")
dat
How would one go about adding each sequence that is associated with a particular opportunity id, such that the final looks like.
oppid sequence
006... LLL, LML, MMM
007... MMM, MML, MMH, LLL, HHH
007... LML, MMM
Any ideas?
We can paste the 'sequence' after grouping by 'OpportunityId'
library(data.table)
setDT(dat)[, .(sequence = toString(unique(sequence))) ,
by = .(oppid = OpportunityId)]
Maybe a combination of aggregate and unique could help.
aggregate(sequence ~ OpportunityId, dat, unique)
# OpportunityId sequence
#1 006a000000gst LLLML
#2 006a000000gstg HMLLL, LLLLL
#3 006a000000zLXtZAAW LLLML, LHHHL, LLLLL
As pointed out by @akrun in a comment, the sequence column is stored as a list in this case.
If necessary, the list in the sequence column can be converted into character format (a single string for each row) by means of:
# The aggregate() call above only prints its result, so store it first; the
# list column lives in the aggregated table, not in the original `dat`.
res <- aggregate(sequence ~ OpportunityId, dat, unique)
# Collapse each group's unique sequences into one comma-separated string;
# vapply() guarantees a character vector with one element per row.
res$sequence <- vapply(res$sequence, paste, character(1), collapse = ", ")
With dplyr
library(dplyr)
dat_new <- dat %>%
  group_by(OpportunityId) %>%
  # unique() drops sequences repeated within an opportunity before they are
  # collapsed into one string. summarise() already returns a single row per
  # OpportunityId, so no distinct() step is needed afterwards.
  summarise(sequence = toString(unique(sequence)))
Output
# OpportunityId sequence
# 1 006a000000gst LLLML
# 2 006a000000gstg HMLLL, LLLLL
# 3 006a000000zLXtZAAW LLLML, LHHHL, LLLLL

Converting key value pair into a data frame

I have a key value data in the following format now
column1 column2 column3
length:30 width:20
length:20 height:10 width:10
Now i want to convert this into a data frame in the following format
Length width height
32 20
40 30 10
Thanks in advance
You can remove the text with sub
setNames(data.frame(lapply(dat, function(x) sub("[a-z]+:", "", x))),
c("length", "width"))
# length width
# 1 32 20
# 2 40 30
Edit
For the updated question,
# Flatten the data frame into a plain character vector of "key:value" strings
# (unlist() returns a vector, not a list).
dat <- as.character(unlist(dat, use.names = FALSE))
# The key is everything before the first ":".
keys <- unique(sub(":.*", "", dat))
keys <- keys[keys != ""] # remove empty strings like in your example
## Key-values in list
keyvals <- setNames(lapply(keys, function(x) {
  # Match the key as a prefix followed by ":" so that one key cannot pick up
  # entries of another key that merely contains it as a substring.
  hits <- dat[startsWith(dat, paste0(x, ":"))]
  as.numeric(sub("^[^:]*:", "", hits))
}), keys)
## Convert to data.frame
# Pad the shorter value vectors with NA so every column has the same length.
n_rows <- max(lengths(keyvals))
as.data.frame(do.call(cbind, lapply(keyvals, `length<-`, n_rows)))
# length width height
# 1 30 20 10
# 2 20 10 NA
An option using dplyr/tidyr. We convert the 'wide' format to 'long' with gather, remove the blank rows ('') with filter, use separate to create two variables ('Val1' and 'Val2') by separating strings at the : delimiter, remove the unwanted columns (select(-Var)), grouped by one of the variables ('Val1') create a sequence column ('indx'), and convert back from 'long' to 'wide' format (spread).
library(dplyr)
library(tidyr)
gather(df1, Var, Val) %>%
filter(Val!='') %>%
separate(Val, c('Val1', 'Val2'), convert=TRUE) %>%
select(-Var) %>%
group_by(Val1) %>%
mutate(indx=row_number()) %>%
spread(Val1, Val2) %>%
select(-indx)
# height length width
#1 10 30 20
#2 NA 20 10
Or a similar approach using data.table. We unlist the initial dataset, and convert it to 'data.table' with a single column (setDT). Using the tstrsplit from the devel version of 'data.table' i.e. v1.9.5, we split at the :. A sequence column is created ('indx') based on the grouping variable 'V1', removed the 'NA' rows and use dcast from data.table to convert back from 'long' to 'wide' format.
library(data.table)#v1.9.5+
DT <- setDT(list(unlist(df1)))[, tstrsplit(V1, ':', type.convert=TRUE)
][, ind:=1:.N, V1][!is.na(V1)]
dcast(DT, ind~V1, value.var='V2')
# ind height length width
#1: 1 10 30 20
#2: 2 NA 20 10
data
df1 <- structure(list(column1 = c("length:30", "length:20"),
column2 = c("width:20",
"height:10"), column3 = c("", "width:10")), .Names = c("column1",
"column2", "column3"), class = "data.frame", row.names = c(NA, -2L))

Resources