separate_rows with unequal size of strings in R - r

Suppose I have a dataset like this:
a b
"1/2/3" "a/b/c"
"3/5" "e/d/s"
"1" "f"
I want to use separate_rows, but I can't because of the second row, where the two columns contain different numbers of items. How can I find rows like this?

You can find the rows with unequal numbers of '/' symbols by doing:
which(lengths(strsplit(df$a, '/')) != lengths(strsplit(df$b, '/')))
#> [1] 2
Presumably these rows contain data input mistakes, since the number of rows implied by each entry is different.

Or you can directly count the number of "/" in each column, and output the row that does not have equal number of "/".
library(stringr)
with(df, which(str_count(a, "/") != str_count(b, "/")))
[1] 2
Input data
df <- structure(list(a = c("1/2/3", "3/5", "1"), b = c("a/b/c", "e/d/s",
"f")), class = "data.frame", row.names = c(NA, -3L))

Perhaps cSplit would help
library(splitstackshape)
library(dplyr)
cSplit(df, c("a", "b"), sep = "/", "long") %>%
filter(if_any(c(a, b), complete.cases))
-output
a b
<int> <char>
1: 1 a
2: 2 b
3: 3 c
4: 3 e
5: 5 d
6: NA s
7: 1 f
data
df <- structure(list(a = c("1/2/3", "3/5", "1"), b = c("a/b/c", "e/d/s",
"f")), class = "data.frame", row.names = c(NA, -3L))

Related

Rename columns of a dataframe based on another dataframe except columns not in that dataframe in R

Given two dataframes df1 and df2 as follows:
df1:
df1 <- structure(list(A = 1L, B = 2L, C = 3L, D = 4L, G = 5L), class = "data.frame", row.names = c(NA,
-1L))
Out:
A B C D G
1 1 2 3 4 5
df2:
df2 <- structure(list(Col1 = c("A", "B", "C", "D", "X"), Col2 = c("E",
"Q", "R", "Z", "Y")), class = "data.frame", row.names = c(NA,
-5L))
Out:
Col1 Col2
1 A E
2 B Q
3 C R
4 D Z
5 X Y
I need to rename the columns of df1 using df2, except column G, since it is not in df2's Col1.
I use df2$Col2[match(names(df1), df2$Col1)] based on the answer from here, but it returns "E" "Q" "R" "Z" NA; as you can see, column G becomes NA. I would like it to keep its original name.
The expected result:
E Q R Z G
1 1 2 3 4 5
How could I deal with this issue? Thanks.
By using na.omit (it's a little bit messy):
# Caveat: the left-hand index is a position in df2$Col1 but is used to index
# colnames(df1), so this is only correct when each matched name sits at the
# same position in both objects (as in this example).
colnames(df1)[na.omit(match(names(df1), df2$Col1))] <- df2$Col2[na.omit(match(names(df1), df2$Col1))]
df1
E Q R Z G
1 1 2 3 4 5
I was able to reproduce your error with
df2 <- data.frame(
Col1 = c("H","I","K","A","B","C","D"),
Col2 = c("a1","a2","a3","E","Q","R","Z")
)
The problem is location of df2$Col1 and names(df1) in match.
na.omit(match(names(df1), df2$Col1))
gives [1] 4 5 6 7, but indices 6 and 7 do not exist in df1, which has only 5 columns.
For df1, we should swap the order of the arguments to match: na.omit(match(df2$Col1, names(df1))) gives [1] 1 2 3 4
# Position in df1 of every old name listed in df2$Col1 (NA when df1 has no
# such column). Selecting df2$Col2 with the same mask keeps each new name
# paired with its own column, whatever the row order of df2.
pos <- match(df2$Col1, names(df1))
found <- !is.na(pos)
colnames(df1)[pos[found]] <- df2$Col2[found]
This will work.
A solution using the rename_with function from the dplyr package.
library(dplyr)
# Keep only the mappings whose old name is actually a column of df1.
df3 <- df2 %>%
  filter(Col1 %in% names(df1))
# Look each selected name up in the mapping with match(), so the new names
# line up with the columns whatever order rename_with() passes them in.
# all_of() marks df3$Col1 as an external character vector of column names.
df4 <- df1 %>%
  rename_with(.fn = function(x) df3$Col2[match(x, df3$Col1)],
              .cols = all_of(df3$Col1))
df4
# E Q R Z G
# 1 1 2 3 4 5

Verifying whether at least two columns have the same value in a specific row

I have a dataset and I want to check whether all of my variables have unique values in a specific row.
Let's say I want to analyze row D.
my data
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
> TRUE (because all the three variables have unique value)
Second example
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 4
>False (because F and T have the same value in row D )
In base R do
# Check whether every value in one row is distinct across the non-name columns.
#
# dat: data frame whose first column is dropped before the comparison.
# ind: index of the row to inspect.
# Returns TRUE when no value occurs twice in that row, FALSE otherwise.
f1 <- function(dat, ind) {
  row_values <- unlist(dat[ind, -1])
  # anyDuplicated() is 0 when nothing repeats
  anyDuplicated(row_values) == 0L
}
-testing
> f1(df, 4)
[1] TRUE
> f1(df1, 4)
[1] FALSE
data
df <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = 3:6), class = "data.frame", row.names = c(NA, -4L))
df1 <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = c(3L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA,
-4L))
You can use dplyr for this:
library(dplyr)
# Take the row of interest (Name == "D"), drop the identifier column and
# test whether any value is repeated across the remaining variables.
# Counting distinct values per column would instead test uniqueness down
# each column, which is a different question.
row_d <- df %>%
  filter(Name == "D") %>%
  select(-Name)
anyDuplicated(unlist(row_d)) == 0

Add two R data frames of different sizes [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 2 years ago.
If two data frames are
symbol wgt
1 A 2
2 C 4
3 D 6
symbol wgt
1 A 20
2 D 10
how can I add them so that missing observations for a "symbol" in either data frame are treated as zero, giving
symbol wgt
1 A 22
2 C 4
3 D 16
You can join the two dataframes by symbol , replace NA with 0 and add the two weights.
library(dplyr)
# full_join() keeps symbols that appear in either data frame; a left join
# would silently drop symbols present only in df2.
df1 %>%
  full_join(df2, by = "symbol") %>%
  # a symbol missing on one side contributes a weight of zero
  mutate(wgt.x = replace(wgt.x, is.na(wgt.x), 0),
         wgt.y = replace(wgt.y, is.na(wgt.y), 0),
         wgt = wgt.x + wgt.y) %>%
  select(-wgt.x, -wgt.y)
# symbol wgt
#1 A 22
#2 C 4
#3 D 16
data
df1 <- structure(list(symbol = c("A", "C", "D"), wgt = c(2L, 4L, 6L)),
class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(symbol = c("A", "D"), wgt = c(20L, 10L)),
class = "data.frame", row.names = c(NA, -2L))
Try this one line solution by pipes:
#Data
library(dplyr)
df1 <- structure(list(symbol = c("A", "C", "D"), wgt = c(2L, 4L, 6L)), class = "data.frame", row.names = c("1",
"2", "3"))
df2 <- structure(list(symbol = c("A", "D"), wgt = c(20L, 10L)), class = "data.frame", row.names = c("1",
"2"))
#Code
# full_join() keeps symbols found in only one of the data frames; rowSums()
# with na.rm = TRUE then treats the missing weight as zero.
df1 %>%
  full_join(df2, by = "symbol") %>%
  mutate(wgt = rowSums(across(c(wgt.x, wgt.y)), na.rm = TRUE)) %>%
  select(symbol, wgt)
symbol wgt
1 A 22
2 C 4
3 D 16
With data.table and the data provided in the answers of @RonakShah and @Duck, the solution could be a simple aggregation:
# Convert data.frame to data.table (very fast since inplace)
setDT(df1)
setDT(df2)
# combine both data.frames into one data.frame, group by symbol, apply the sum (NAs are ignored = counted as zero)
rbind(df1,df2)[, sum(wgt, na.rm = TRUE), by = symbol]
# Output
symbol V1
1: A 22
2: C 4
3: D 16
Note: If you want to use base R only (without data.table) you could use aggregate instead:
aggregate(wgt ~ symbol, rbind(df1,df2), sum)

Subtract two strings from each other

I have the following input
#mydata
ID variable1 variable2
1 a,b,c,d c,a
2 g,f,h h
3 p,l,m,n,c c,l
I wish to subtract the strings of variable2 from variable1 and I'd like to have the following output:
#Output
ID Output
1 b,d
2 g,f
3 p,m,n
#dput
structure(list(ID = 1:3, variable1 = structure(1:3, .Label = c("a,b,c,d",
"g,f,h", "p,l,m,n,c"), class = "factor"), variable2 = structure(c(1L,
3L, 2L), .Label = c("c,a", "c,l", "h"), class = "factor")), .Names = c("ID",
"variable1", "variable2"), class = "data.frame", row.names = c(NA,
-3L))
You can try,
# Split both columns on commas and take, row by row, the items of variable1
# that do not occur in variable2.
Map(setdiff,
    strsplit(as.character(df$variable1), ","),
    strsplit(as.character(df$variable2), ","))
We can use Map after splitting each of the columns by , get the setdiff, paste them together, set the names of the list output with 'ID' column, stack it to 'data.frame' and set the names to 'ID' and 'Output' for the columns.
setNames(stack(setNames(Map(function(x,y) toString(setdiff(x,y)),
strsplit(as.character(df1$variable1), ","),
strsplit(as.character(df1$variable2), ",")),
df1$ID))[2:1], c("ID", "Output"))
# ID Output
#1 1 b, d
#2 2 g, f
#3 3 p, m, n
Or a compact option would be
library(splitstackshape)
cSplit(df1, 2:3, ",", "long")[, .(Output = toString(setdiff(variable1, variable2))) , ID]
# ID Output
#1: 1 b, d
#2: 2 g, f
#3: 3 p, m, n
Using grepl instead of setdiff
library(stringr)
a1 <- str_split(d$variable1, ",")
a2 <- str_split(d$variable2, ",")
# Anchor the alternation so that only whole items match; an unanchored
# pattern such as "a" would also drop longer items like "ab".
drop_matches <- function(x, y) {
  pattern <- paste0("^(", paste(y, collapse = "|"), ")$")
  paste(x[!grepl(pattern, x)], collapse = ",")
}
do.call("rbind", Map(drop_matches, a1, a2))
[,1]
[1,] "b,d"
[2,] "g,f"
[3,] "p,m,n"
Using Dplyr
# Row by row, split both columns on commas and keep the items of variable1
# that are absent from variable2. Each pipe sits at the end of its line: a
# line that starts with %>% does not parse.
mydata %>%
  rowwise() %>%
  mutate(output = paste0(
    setdiff(
      strsplit(as.character(variable1), split = ",")[[1]],
      strsplit(as.character(variable2), split = ",")[[1]]
    ),
    collapse = ","
  )) %>%
  ungroup() %>%
  select(ID, output)
output:
ID output
(int) (chr)
1 1 b,d
2 2 g,f
3 3 p,m,n

R paste0 2 columns if not NA

I would like to paste0 two columns if the element in one column is not NA.If one element of one columns is NA then keep the element of the other column only.
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"), col2 = c(1, NA, 3)), .Names = c("col1", "col2"),
class = "data.frame",row.names = c(NA, -3L))
# col1 col2
# 1 A 1
# 2 B NA
# 3 C 3
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"),col2 = c(1, NA, 3), col3 = c("A|1", "B", "C|3")),
.Names = c("col1", "col2", "col3"), row.names = c(NA,-3L),
class = "data.frame")
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3
you can also do it with regular expressions:
# Paste the columns, then strip a leading "NA|" or a trailing "|NA".
# The ^ and $ anchors stop the pattern from eating part of a real value
# that merely ends in "NA" (e.g. "DNA|1").
df$col3 <- sub("^NA\\||\\|NA$", "", with(df, paste0(col1, "|", col2)))
That is, paste them in regular way and then replace any "NA|" or "|NA" with "". Note that | needs to be "double escaped" because it means "OR" in regexps, that's why the strange pattern NA\\||\\|NA means actually "NA|" OR "|NA".
As @Roland says, this is easy using ifelse (just translate the mental logic into a series of nested ifelse statements):
x <- transform(x,col3=ifelse(is.na(col1),as.character(col2),
ifelse(is.na(col2),as.character(col1),
paste0(col1,"|",col2))))
update: need as.character in some cases.
Try:
> df$col1 = as.character(df$col1)
> df$col3 = with(df, ifelse(is.na(col1),col2, ifelse(is.na(col2), col1, paste0(col1,'|',col2))))
> df
col1 col2 col3
1 A 1 A|1
2 B NA B
3 C 3 C|3
You could also do:
library(stringr)
df$col3 <- apply(df, 1, function(x)
paste(str_trim(x[!is.na(x)]), collapse="|"))
df
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3

Resources