I have the following input
#mydata
ID variable1 variable2
1 a,b,c,d c,a
2 g,f,h h
3 p,l,m,n,c c,l
I wish to subtract the strings of variable2 from variable1, and I'd like to have the following output:
#Output
ID Output
1 b,d
2 g,f
3 p,m,n
#dput
structure(list(ID = 1:3, variable1 = structure(1:3, .Label = c("a,b,c,d",
"g,f,h", "p,l,m,n,c"), class = "factor"), variable2 = structure(c(1L,
3L, 2L), .Label = c("c,a", "c,l", "h"), class = "factor")), .Names = c("ID",
"variable1", "variable2"), class = "data.frame", row.names = c(NA,
-3L))
You can try,
Map(setdiff, strsplit(as.character(df$variable1), ','), strsplit(as.character(df$variable2), ','))
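This returns a list of character vectors. To get the posted ID/Output shape, one option (a sketch, assuming df is the dput data frame) is to collapse each element and bind the IDs back on:
res <- Map(setdiff, strsplit(as.character(df$variable1), ','), strsplit(as.character(df$variable2), ','))
data.frame(ID = df$ID, Output = sapply(res, paste, collapse = ","))
#  ID Output
#1  1    b,d
#2  2    g,f
#3  3  p,m,n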
We can use Map after splitting each of the columns by ',': get the setdiff, paste the elements together, set the names of the list output with the 'ID' column, stack it into a 'data.frame', and set the column names to 'ID' and 'Output'.
setNames(stack(setNames(Map(function(x,y) toString(setdiff(x,y)),
strsplit(as.character(df1$variable1), ","),
strsplit(as.character(df1$variable2), ",")),
df1$ID))[2:1], c("ID", "Output"))
# ID Output
#1 1 b, d
#2 2 g, f
#3 3 p, m, n
Or a compact option would be
library(splitstackshape)
cSplit(df1, 2:3, ",", "long")[, .(Output = toString(setdiff(variable1, variable2))) , ID]
# ID Output
#1: 1 b, d
#2: 2 g, f
#3: 3 p, m, n
Using grepl instead of setdiff
library(stringr)
a1 <- str_split(d$variable1, ",")
a2 <- str_split(d$variable2, ",")
do.call("rbind",Map(function(x,y) paste(x[!grepl(paste(y, collapse="|"), x)], collapse=","), a1, a2))
[,1]
[1,] "b,d"
[2,] "g,f"
[3,] "p,m,n"
Using dplyr
library(dplyr)
mydata %>%
  rowwise() %>%
  mutate(output = paste0(setdiff(strsplit(as.character(variable1), split = ",")[[1]],
                                 strsplit(as.character(variable2), ",")[[1]]), collapse = ",")) %>%
  select(ID, output)
output:
ID output
(int) (chr)
1 1 b,d
2 2 g,f
3 3 p,m,n
Suppose I have a dataset like this:
a b
"1/2/3" "a/b/c"
"3/5" "e/d/s"
"1" "f"
I want to use separate_rows, but I can't because of the second row. How can I find rows like this?
You can find the rows with unequal numbers of '/' symbols by doing:
which(lengths(strsplit(df$a, '/')) != lengths(strsplit(df$b, '/')))
#> [1] 2
Presumably these rows contain data input mistakes, since the number of rows implied by each entry is different.
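If those rows should simply be excluded before reshaping, a small sketch (assuming the mismatched rows can just be dropped) is:
ok <- lengths(strsplit(df$a, '/')) == lengths(strsplit(df$b, '/'))
tidyr::separate_rows(df[ok, ], a, b, sep = "/")
#  a b
#1 1 a
#2 2 b
#3 3 c
#4 1 f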
Or you can directly count the number of "/" in each column and output the rows that do not have an equal number of "/".
library(stringr)
with(df, which(str_count(a, "/") != str_count(b, "/")))
[1] 2
Input data
df <- structure(list(a = c("1/2/3", "3/5", "1"), b = c("a/b/c", "e/d/s",
"f")), class = "data.frame", row.names = c(NA, -3L))
Perhaps cSplit would help
library(splitstackshape)
library(dplyr)
cSplit(df, c("a", "b"), sep = "/", "long") %>%
filter(if_any(c(a, b), complete.cases))
-output
a b
<int> <char>
1: 1 a
2: 2 b
3: 3 c
4: 3 e
5: 5 d
6: NA s
7: 1 f
data
df <- structure(list(a = c("1/2/3", "3/5", "1"), b = c("a/b/c", "e/d/s",
"f")), class = "data.frame", row.names = c(NA, -3L))
Given two dataframes df1 and df2 as follows:
df1:
df1 <- structure(list(A = 1L, B = 2L, C = 3L, D = 4L, G = 5L), class = "data.frame", row.names = c(NA,
-1L))
Out:
A B C D G
1 1 2 3 4 5
df2:
df2 <- structure(list(Col1 = c("A", "B", "C", "D", "X"), Col2 = c("E",
"Q", "R", "Z", "Y")), class = "data.frame", row.names = c(NA,
-5L))
Out:
Col1 Col2
1 A E
2 B Q
3 C R
4 D Z
5 X Y
I need to rename the columns of df1 using df2, except column G, since it is not in df2's Col1.
I use df2$Col2[match(names(df1), df2$Col1)] based on the answer from here, but it returns "E" "Q" "R" "Z" NA; as you can see, column G becomes NA. I want it to keep its original name.
The expected result:
E Q R Z G
1 1 2 3 4 5
How could I deal with this issue? Thanks.
By using na.omit (it's a little bit messy):
colnames(df1)[na.omit(match(names(df1), df2$Col1))] <- df2$Col2[na.omit(match(names(df1), df2$Col1))]
df1
E Q R Z G
1 1 2 3 4 5
I was able to reproduce your error with
df2 <- data.frame(
Col1 = c("H","I","K","A","B","C","D"),
Col2 = c("a1","a2","a3","E","Q","R","Z")
)
The problem is the order of df2$Col1 and names(df1) inside match.
na.omit(match(names(df1), df2$Col1))
gives [1] 4 5 6 7, which contains indices that do not exist in df1 (it has only 5 columns).
For indexing into df1, we should swap the order of the arguments in match: na.omit(match(df2$Col1, names(df1))) gives [1] 1 2 3 4.
colnames(df1)[na.omit(match(df2$Col1, names(df1)))] <- df2$Col2[na.omit(match(names(df1), df2$Col1))]
This works.
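A hedged base-R alternative that only computes the match once and keeps the original name wherever there is no match (idx is just an illustrative name):
idx <- match(names(df1), df2$Col1)
names(df1) <- ifelse(is.na(idx), names(df1), as.character(df2$Col2)[idx])
df1
#  E Q R Z G
#1 1 2 3 4 5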
A solution using the rename_with function from the dplyr package.
library(dplyr)
df3 <- df2 %>%
filter(Col1 %in% names(df1))
df4 <- df1 %>%
rename_with(.cols = df3$Col1, .fn = function(x) df3$Col2[df3$Col1 %in% x])
df4
# E Q R Z G
# 1 1 2 3 4 5
I want to fill df2 with information from df1.
df1 as below
ID Mutation
1 A
2 B
2 C
3 A
df2 as below
ID A B C
1
2
3
For example, if mutation A is found in ID 1, then I want it marked as "Y" in df2.
So the df2 result should be
ID A B C
1 Y
2 Y Y
3 Y
I have hundreds of IDs and more than 20 mutations. How can I efficiently achieve this in R? Thanks!
Using data.table you can try
library(data.table)
setDT(df)
df2 <- dcast(df, formula = ID ~ Mutation)
df2[, c("A", "B", "C") := lapply(.SD, function(x) ifelse(is.na(x), " ", "Y")), ID]
df2
#Output
ID A B C
1: 1 Y
2: 2 Y Y
3: 3 Y
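A one-step variant may also work by letting dcast fill the cells directly (a sketch, not a drop-in part of the answer above; value.var and a constant aggregate are spelled out explicitly):
dcast(setDT(df), ID ~ Mutation, value.var = "Mutation",
      fun.aggregate = function(x) "Y", fill = "")
#   ID A B C
#1:  1 Y
#2:  2   Y Y
#3:  3 Y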
Create a new column with value 'Y' and cast the data in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(value = 'Y') %>%
pivot_wider(names_from = Mutation, values_from = value, values_fill = '')
# ID A B C
# <int> <chr> <chr> <chr>
#1 1 "Y" "" ""
#2 2 "" "Y" "Y"
#3 3 "Y" "" ""
data
df <- structure(list(ID = c(1L, 2L, 2L, 3L), Mutation = c("A", "B",
"C", "A")), class = "data.frame", row.names = c(NA, -4L))
I want to perform a join.
df1=structure(list(id = 1:3, group_id = c(10L, 20L, 40L)), class = "data.frame", row.names = c(NA,
-3L))
df2 has a different structure: its group_id field contains many groups, for example {10,100,400}.
Here is the dput():
df2=structure(list(id = 1:3, group_id = structure(c(1L, 3L, 2L), .Label = c("{`10`,100,`40`}",
"{3,`40`,600,100}", "{4}"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 has group_id 10 and 40, but they are inside braces together with other groups.
How can I get the desired joined output?
id group_id
1 10
1 40
3 40
You can clean group_id in df2 using gsub, bring each group id into a separate row, and filter.
library(dplyr)
df2 %>%
mutate(group_id = gsub('[{}`]', '', group_id)) %>%
tidyr::separate_rows(group_id) %>%
filter(group_id %in% df1$group_id)
# id group_id
#1 1 10
#2 1 40
#3 3 40
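If an actual join is preferred over filtering, the cleaned long form of df2 can be inner-joined to df1's group_id (a sketch; convert = TRUE turns group_id into an integer so the key types match):
library(dplyr)
df2 %>%
  mutate(group_id = gsub('[{}`]', '', group_id)) %>%
  tidyr::separate_rows(group_id, convert = TRUE) %>%
  inner_join(df1["group_id"], by = "group_id")
#  id group_id
#1  1       10
#2  1       40
#3  3       40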
Here's a data.table alternative:
library(data.table)
setDT(df2)
df2[, strsplit(gsub('[{}`]', '', group_id), ','), by = id][V1 %in% df1$group_id]
# id V1
#1: 1 10
#2: 1 40
#3: 3 40
Here is an option with base R using regmatches/gregexpr
subset(setNames(stack(setNames(regmatches(df2$group_id, gregexpr("\\d+", df2$group_id)),
df2$id))[2:1], c('id', 'group_id')), group_id %in% df1$group_id)
# id group_id
#1 1 10
#3 1 40
#6 3 40
I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new dataframe whose rows simultaneously have X, Y, Z, T values greater than specific thresholds.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective; I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# subset data.frame
dat[Reduce("&", Map(">", dat[names(vals)], vals)), ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
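To see the two intermediate pieces, the same call can be run in steps (cmp is just an illustrative name; the logical values are for the dput data):
cmp <- Map(">", dat[names(vals)], vals)
cmp$X
#[1]  TRUE FALSE  TRUE
Reduce("&", cmp)
#[1]  TRUE FALSE FALSE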
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loops through each column in dat and each value in vals, one pair at a time, with the supplied function. ~ifelse(.x > .y | is.na(.y), .x, NA) means that if the number in a column is larger than the corresponding value in vals, or the vals entry is NA, the output is the original value from the column; otherwise, the value is replaced with NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is a data frame with NA values in the rows where the condition is not met. Finally, na.omit removes those rows.
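For reference, the intermediate result before na.omit() looks like this for the example data (rows 2 and 3 pick up NA values and are then dropped):
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA))
# A tibble: 3 x 5
      X     Y     Z     T     J
  <int> <int> <int> <int> <int>
1     3     4     5     6     1
2    NA    NA    NA    NA     1
3     5    NA    NA     5     1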
Update
Here I demonstrate how to convert the dfTwo dataframe to the vals vector used in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr packages.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use the stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frames.
dfTwo3 <- dfTwo2 %>% left_join(threshold_df, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now for the third step. Notice that there is a column called J which does not need any threshold, so we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding the columns of dat that are not in dfTwo3, with NA as the Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is to arrange dfTwo4 in the right order. We can achieve this by turning Col into a factor and assigning the levels based on the order of the column names in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
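With vals in place, the filtering step is the same map2_dfc call shown earlier:
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
      X     Y     Z     T     J
  <int> <int> <int> <int> <int>
1     3     4     5     6     1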
The above is the breakdown of the steps. We can combine all these steps into the following code for simplicity.
library(dplyr)
library(tidyr)
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group)) %>%
left_join(threshold_df, by = "Group") %>%
complete(Col = colnames(dat)) %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
vals <- dfTwo2$Value
names(vals) <- dfTwo2$Col
dfOne[Reduce(intersect, list(which(dfOne["X"] > 2),
which(dfOne["Y"] > 2),
which(dfOne["Z"] > 4),
which(dfOne["T"] > 4))),]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
vals = c(X = 2, Y = 2, Z = 4, T = 4) # from #lmo's answer
dfOne[Reduce(intersect, lapply(names(vals), function(x) which(dfOne[x] > vals[x]))),]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
dfNew = dfOne[dfOne$X > 2 & dfOne$Y > 2 & dfOne$Z > 4 & dfOne$T > 4, ]
Or, using dplyr:
library(dplyr)
dfNew = dfOne %>% filter(X > 2 & Y > 2 & Z > 4 & T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.