column of unique values between of two other columns - r

sample data:
col1 col2
<NA> cc
a a
ab a
z a
I want to add a column unique with these values -- any valued that isn't shared between col1 and col2.
col1 col2 unique
<NA> cc cc
a a
ab a b
z a za
I tried using setdiff but
(for replication purposes:)
df <- read.table(header=TRUE, stringsAsFactors = FALSE, text =
"col1 col2
NA cc
a a
ab a
z a
")
Like this:
df$unique <- paste0(setdiff(df$col1, df$col2), setdiff(df$col2, df$col1))
But it returns
Error in `$<-.data.frame`(`*tmp*`, "unique", value = c("<NA>cc", "abcc" :
replacement has 2 rows, data has 3
From the error it looks like it's generating a vector of the differences between the columns, instead of the differences between the elements...
Edit: Added z and a sample data in last row.

You could do this using setdiff and Reduce in base R:
cols <- c(1,2)
df$unique <- unlist(lapply(apply(df[cols], 1, function(x)
Reduce(setdiff, strsplit(na.omit(x), split = ""))), paste0, collapse=""))
# col1 col2 unique
# 1 <NA> cc cc
# 2 a a
# 3 ab a b

Here is a length method with apply.
apply(df, 1, function(i) {
i <- i[!is.na(i)] # remove NAs
if(length(i[!is.na(i)]) == 1) i # check length and return singletons untouched
else { # for non-singletons
i <- unlist(strsplit(i, split="")) # strsplit and turn into a vector
i <- i[!(duplicated(i) | duplicated(i, fromLast=TRUE))] # drop duplicates
paste(i, collapse="")}}) # return collapsed singleton set of characters
[1] "cc" "" "b"
Note that for c("cc", "a", "c"), this will return "a" because "cc" and "c" will be marked as duplicates.

We need to split the string first:
df$unique <- mapply(function(x, y){
u <- setdiff(union(x, y), intersect(x, y))
paste0(u[!is.na(u)], collapse = '')
}, strsplit(df$col1, ''), strsplit(df$col2, ''))
# >df
# col1 col2 unique
# 1 <NA> cc c
# 2 a a
# 3 ab a b

Related

how to parse a text file that contains the column names at the beginning of the file?

My text file looks like the following
"
file1
cols=
col1
col2
# this is a comment
col3
data
a,b,c
d,e,f
"
As you can see, the data only starts after the data tag and the rows before that essentially tell me what the column names are. There could be some comments which means the number of rows before the data tag is variable.
How can I parse that in R? Possibly with some tidy tools?
Expected output is:
# A tibble: 2 x 3
col1 col2 col3
<chr> <chr> <chr>
1 a b c
2 d e f
Thanks!
Here is a base way with scan(). strip.white = T to remove blank lines and comment.char = "#" to remove lines leading with #.
text <- scan("test.txt", "", sep = "\n", strip.white = T, comment.char = "#")
text
# [1] "file1" "cols=" "col1" "col2" "col3" "data" "a,b,c" "d,e,f"
ind1 <- which(text == "cols=")
ind2 <- which(text == "data")
df <- read.table(text = paste(text[-seq(ind2)], collapse = "\n"),
sep = ",", col.names = text[(ind1 + 1):(ind2 - 1)])
df
# col1 col2 col3
# 1 a b c
# 2 d e f
I saved your file as ex_text.txt on my machine, removing the start and end quotes. Here's a solution. I don't know how extendable this is, and it might not work for "weirder" data.
# initialize
possible_names <- c()
not_data <- TRUE # stop when we find "data"
n <- 20 # lines to check the txt file
while (not_data){
# read txt line by line
possible_names <- readLines("ex_text.txt", n = n)
not_data <- all(possible_names != "data") # find data?
n <- n + 20 # increment to read more lines if necessary
}
# where does ddata start?
data_start <- which(possible_names == "data")
# remove unnecessary text and find actual column names
possible_names <- possible_names[2:(data_start-1)]
possible_names <- possible_names[""!= possible_names] # remove any blank space
col_names <- possible_names[!grepl("#.*", possible_names)] # remove comments
# read data
read.delim("ex_text.txt",
skip = data_start,
sep = ",",
col.names = col_names,
header = FALSE)
# col1 col2 col3
# 1 a b c
# 2 d e f

Check for string in dataframe containing list of strings

I'm trying to find a way to lookup multiple values in a dataframe and return a value. Simplified example:
df1 <- read.table(text="chk1 chk2 chk3 value
xx aa;bb;cc jj 1
xx;yy dd;ee;ff kk 2
zz gg;hh;ii ll;nn 3", header=T)
df2 <- read.table(text="val1 val2 val3
xx bb jj
xx dd kk
yy ee kk
zz hh jj
", header=T)
Lookup values val1, val2, and val3 from df2 in df1, return value from df1.
Desired results:
df2 <- read.table(text="
val1 val2 val3 value
xx bb jj 1
xx dd kk 2
yy ee kk 2
zz hh jj NA
")
Tried match x %in% y and looping over the rows, can't get it to work.
Here is one possibility:
library(tidyverse)
df3 <- df2 %>% rowwise %>%
mutate(rowmatch=which(grepl(val1, df1$chk1) &
grepl(val2, df1$chk2) &
grepl(val3, df1$chk3))[1],
value=df1$value[rowmatch])
Result:
# A tibble: 4 x 5
val1 val2 val3 rowmatch value
<chr> <chr> <chr> <int> <int>
1 xx bb jj 1 1
2 xx dd kk 2 2
3 yy ee kk 2 2
4 zz hh jj NA NA
Notes:
the [1] is to ensure that only first of the matching rows is used.
note that although rowmatch and value are identical in this example this is only because df1$value is equal to the row number.
tibble behaves like a data.frame, but if you really prefer a data frame, add %>% as.data.frame
The same can be done with base R and apply:
df2$rowmatch <- with(df1, apply(df2, 1, function(x)
which(grepl(x["val1"], chk1) &
grepl(x["val2"], chk2) &
grepl(x["val3"], chk3))[1]))
df2$value <- df1$value[df2$rowmatch]
another option would be splitting the values first:
df1 <- df1 %>%
splitstackshape::cSplit("chk1", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE) %>%
splitstackshape::cSplit("chk2", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE) %>%
splitstackshape::cSplit("chk3", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE)
and then using join
You can also do it using two nested for loops. The logic is to take first row of df2 and then start going through rows of df1 to see if df2$val1 matches df1$chk, df2$val2 matches df1$chk2 and df2$val3 matches df1$chk3. I consider all values a match if there is at least one match per column. The caverat here is that if df2 does not have unique rows, the last matching row from df1 will be written to df2. But this can be changed by breaking out of the loop as soon as the match is found.
for (i in 1:nrow(df2)) {
for (j in 1:nrow(df1)) {
# Take i-th row and split by ;. Result is a vector of strings against
# which we'll use match.
i.split <- strsplit(as.character(unlist(df1[j, , drop = TRUE][-4])), ";")
# Pairwise check columns from df1 and df2.
all.ok <- all(mapply(FUN = function(x, y) {
any(x %in% y)
}, x = i.split, y = as.list(df2[i, 1:3])
))
if (all.ok) {
# If a match is found, write the value to df2.
df2[i, "value"] <- df1[j, "value"]
}
}
}
Output:
val1 val2 val3 value
1 xx bb jj 1
2 xx dd kk 2
3 yy ee kk 2
4 zz hh jj NA

Subset a dataframe by matching it to a list and include non-match value too in the output using R

I have a dataframe (myDF) that has 2 columns "A" and "B" and a function (myfunc) which takes a list as an input and if it finds a match in column "A" then it returns a new dataframe that is a subset of myDF containing the value match and the corresponding "B" column.
But I want the function to also return the non-matching value in column A and NULL string in column B.
myDF:
A B
1 11
2 22
3 33
myfunc:
myfunc <- function(x) {
r<- with(myDF, myDF[a %in% x, c("a", "b")])
return(data.frame(r))
}
Input: mylist = c(1,2,"E")
Expected Output:
A B
1 11
2 22
E NULL
We create a logical index and assign
i1 <- with(myDF, !A %in% mylist)
myDF$B[i1] <- "NULL"
myDF$A[i1] <- mylist[i1]
myDF
# A B
#1 1 11
#2 2 22
#3 E NULL
Note: By assigning a character string to 'B' column, it effectively changes the type from numeric to character. A better option would be to assign it to NA
myDF$B[i1] <- NA
Or
data.frame(A= mylist, B = myDF$B[match(mylist, myDF$A)])
This is a join operation, which can be done in base R with merge, if you make the list a data.frame first. The all.y = T argument includes rows of mylistDF with no matching rows in myDF in the output.
mylistDF <- data.frame(A = mylist, stringsAsFactors = F)
merge(myDF, mylistDF, by = 'A', all.y = T)
# A B
# 1 1 11
# 2 2 22
# 3 E NA
Since you tagged tidyr, here's a tidyverse solution (same output)
library(tidyverse)
mylistDF <- tibble(A = mylist)
myDF %>%
mutate_at('A', as.character) %>%
right_join(mylistDF, by = 'A')

Spacing vector by regular pattern

I have a vector
vec <- c("ab", "#4", "gw", "#29", "mp", "jq", "#35", "ez")
which generally follows the pattern of alternating between two different sequences of strings (the first sequence being all alphabetical, the second being numerical with the symbol #).
However there are cases where no # term appears: so in the above between mp and jq, and then again after ez. I would like to define a function which "fills the gaps" with the character string #, so that I would have the output:
[1] "ab" "#4" "gw" "#29" "mp" "#" "jq" "#35" "ez" "#"
which I would then convert to a data frame
V1 V2
1 ab #4
2 gw #29
3 mp #
4 jq #35
5 ez #
My attempt so far is rather clunky and relies on looping through the vector and filling the gaps. I'd be interested to see more elegant solutions.
My Solution
greplSpace <- function(pattern, replacement, x){
j <- 1
while( j < length(x) ){
if(grepl(pattern, x[j+1]) ){
j <- j+2
} else {
x <- c( x[1:j], replacement, x[(j+1):length(x)] )
j <- j+2
}
}
if( ! grepl(pattern, tail(x,1) ) ){ x <- c(x, replacement) }
return(x)
}
library(magrittr)
vec <- c("ab", "#4", "gw", "#29", "mp", "jq", "#35", "ez")
vec %>% greplSpace("#", "#", . ) %>%
matrix(ncol = 2, byrow = TRUE) %>%
as.data.frame
Start with your vec, we can create your expected data frame directly with some functions from the dplyr, tidyr, and stringr.
library(dplyr)
library(tidyr)
library(stringr)
vec <- c("ab", "#4", "gw", "#29", "mp", "jq", "#35", "ez")
dat <- data_frame(Value = vec)
dat2 <- dat %>%
mutate(String = !str_detect(vec, "#"),
Key = ifelse(String, "V1", "V2"),
Row = cumsum(String)) %>%
select(-String) %>%
spread(Key, Value, fill = "#") %>%
select(-Row)
dat2
# # A tibble: 5 x 2
# V1 V2
# <chr> <chr>
# 1 ab #4
# 2 gw #29
# 3 mp #
# 4 jq #35
# 5 ez #
Here is a base R option with split. Create a logical index by checking the "#" in each of the strings, get the cumulative sum and split the original vector by this grouping variable into a list ('lst'). For those list elements that don't have two (maximum length) elements are appended with NA at the end by assignment with length<-. Then, rbind, the list elements into a two column matrix. If needed, convert those NA to #
lst <- split(vec, cumsum(!grepl("#", vec)))
out <- do.call(rbind, lapply(lst, `length<-`, max(lengths(lst))))
out[,2][is.na(out[,2])] <- "#" #not recommended though
out
# [,1] [,2]
#1 "ab" "#4"
#2 "gw" "#29"
#3 "mp" "#"
#4 "jq" "#35"
#5 "ez" "#"
Wrap it with as.data.frame if we need a data.frame output
You can use Base R:
First Collapse the vector into a string while replaceing # where needed.
Then just read using read.csv
vec1=gsub("([a-z]),\\s*([a-z])|$","\\1,#,\\2",toString(vec))
read.csv(text=gsub("(#.*?),","\\1\n",vec1),h=F)
V1 V2
1 ab #4
2 gw #29
3 mp #
4 jq #35
5 ez #
Explanation:
First collapse the vector into a string by toString
Then if there are alphabets on either side of the , ie [a-z],\s*[a-z] or at the end ie |$ you insert an #.
Then create line breaks after numbers or # and read in the data as a table
You can also do:
a=read.csv(h=F,text=toString(sub("([a-z]+)","\n\\1",vec)),na=c(" ",""))[1:2]
a
V1 V2
1 ab #4
2 gw #29
3 mp <NA>
4 jq #35
5 ez <NA>
data.frame(replace(as.matrix(a),is.na(a),"#"))
V1 V2
1 ab #4
2 gw #29
3 mp #
4 jq #35
5 ez #
Another base possibility:
do.call(rbind, tapply(vec, cumsum(!grepl("^#", vec)), FUN = function(x){
if(length(x) == 1) c(x, "#") else x}))
# [,1] [,2]
# 1 "ab" "#4"
# 2 "gw" "#29"
# 3 "mp" "#"
# 4 "jq" "#35"
# 5 "ez" "#"
Explanation:
Check if elements in vec starts with #, and negate it: !grepl("^#", vec); creates a logical vector.
Create a grouping variable by applying cumsum to the logical vector (note: 1 & 2 similar to #akrun).
Use tapply to apply a function to each subset of vec, defined by the grouping variable. Check if the length is 1. If so, pad by a trailing #, else just return the subset: if(length(x) == 1) c(x, "#") else x
Bind the resulting list together by row: do.call(rbind,
Another one:
# create a row index
ri <- cumsum(!grepl("^#", vec))
# create a column index
ci <- ave(ri, ri, FUN = seq_along)
# create an empty matrix of desired dimensions
m <- matrix(nrow = max(ri), ncol = 2)
# assign 'vec' to matrix at relevant indices
m[cbind(ri, ci)] <- vec
# replace NA with '#'
m[is.na(m)] <- "#"
Using data.table. Create a grouping variable as above, and reshape from long to wide.
library(data.table)
d <- data.table(vec)
d[ , g := cumsum(!grepl("^#", vec))]
dcast(d, g ~ rowid(g), value.var = "vec", fill = "#")
# g 1 2
# 1: 1 ab #4
# 2: 2 gw #29
# 3: 3 mp #
# 4: 4 jq #35
# 5: 5 ez #

split rows by condition in R data.table

I have a data table containing 3 columns, one of them
contains a key:value list of different lengths.
I wish to rearrange the table such that each row will have only one key, conditioned on the value
for example, suppose that I wish to get all rows for whom the value is <= 2 so that each key is on its own row:\
input_tbl <-
data.table::data.table(a=c("AA"),b=c("{\"ha:llo\":1,\"wor:ld\":2,\"doog:bye\":3}"),
c=c(1))
the wanted table then should be
tbl_output <- data.table::data.table(a=c("AA",
"AA"),b=c("ha:llo","wor:ld"), c=c(1,1), s=c(1,2))
I had tried the following function:
data_table_clean <- function(dt){
dt[ ,"b" := data.table::tstrsplit(b, ',', fixed = T),by=c(a, c)]
dt[,c('b', 's'):= data.table::tstrsplit(b, ':', fixed=TRUE)]
return(dt[s <=2,])
}
this produces the following error
"Error in eval(expr, envir, enclos) : object 'a' not found"
Any suggestions are welcome, off course.
The keys are actually of the form :
input2_tbl <-
data.table::data.table(a=c("AA"),b=c("{\"99:1d:3u:7y:89:67\":1,\"99:1D:34:YY:T6:Y6\":2,\"ll:5Y:UY:56:R5:R6\":3}"),
c=c(1))
and accordingly the output table should be:
tbl2_output <- data.table::data.table(a=c("AA",
"AA"),b=c(""99:1d:3u:7y:89:67","99:1D:34:YY:T6:Y6"),
c=c(1,1), s=c(1,2))
Thank you!
update
data_table_clean <- function(dt){
res <- dt[, data.table::tstrsplit(unlist(strsplit(gsub('[{}"]', '', b),',', fixed=TRUE)), ":(?=[^:]+$)", perl=TRUE),
by = .(a, c)][V2 > -100]
data.table::setnames(res, 3:4, c("b", "s"))
res
}
when running this I get the following error:
Error in .subset(x, j) : invalid subscript type 'list'
One option would be to extract the characters that we need in the final output. We use str_extract to do that after grouping by 'a', 'c'. The output is a list, which we unlist, get the non-numeric and numeric into two columns and then subset the rows with the condition s<3.
library(stringr)
library(data.table)
input_tbl[, {
tmp <- unlist(str_extract_all(b, "[A-Za-z]+:[A-Za-z]+|\\d+"))
list(b=tmp[c(TRUE, FALSE)], s=tmp[c(FALSE, TRUE)])
}, by = .(a,c)][s<3]
# a c b s
#1: AA 1 ha:llo 1
#2: AA 1 wor:ld 2
Or if we are using strsplit/tstrsplit, grouped by 'a', 'c', we remove the curly brackets and quotes ([{}]") with gsub, split by , (strsplit), unlist the output, and then use tstrsplit to split by : that is followed by a number. The subset part is similar as above.
res <- input_tbl[, tstrsplit(unlist(strsplit(gsub('[{}"]', '',
b), ',', fixed=TRUE)), ":(?=\\d)", perl=TRUE) ,.(a,c)][V2<3]
setnames(res, 3:4, c("b", "s"))
res
# a c b s
#1: AA 1 ha:llo 1
#2: AA 1 wor:ld 2
Update
For the updated dataset, we can do the tstrsplit on the last delimiter (:)
res1 <- input2_tbl[, tstrsplit(unlist(strsplit(gsub('[{}"]', '',
b),',', fixed=TRUE)), ":(?=[^:]+$)", perl=TRUE) ,
by = .(a, c)][V2 < 3]
setnames(res1, 3:4, c("b", "s"))
res1
# a c b s
# 1: AA 1 99:1d:3u:7y:89:67 1
# 2: AA 1 99:1D:34:YY:T6:Y6 2
Since it seems like you are working with a JSON object, why not use something that parses the JSON, for example, the "jsonlite" package?
With that, you can make a simple function, that looks like this:
myFun <- function(invec) {
require(jsonlite)
x <- fromJSON(invec)
list(b = names(x), s = unlist(x))
}
Now, applied to your dataset, you would get:
input_tbl[, myFun(b), by = .(a, c)]
# a c b s
# 1: AA 1 ha:llo 1
# 2: AA 1 wor:ld 2
# 3: AA 1 doog:bye 3
And, for the subsetting:
input_tbl[, myFun(b), by = .(a, c)][s <= 2]
# a c b s
# 1: AA 1 ha:llo 1
# 2: AA 1 wor:ld 2
You can probably also even rewrite the myFun function to add a "threshold" argument that lets you subset within the function itself.

Resources