R - generate dynamic number of columns and substring column values - r

looking for some help with data manipulation in R. I have data in the following format;
ID L1 L2 L3
1 BBCBCACCBCB CBCBBBB BEBBBAAB
2 BBCBCCCCBCB CBCCCBC BBAACCCB
3 BBCBCACCBCB CBCBBBB BEBBBAAB
4 BBCBCACCBCB CBCBBBB BEBBBAAB
5 BBCBACBCCCB BBCCCBC BBCBAAAAB
6 BBCBBCCBBCB BBCBCEB BBBBCAACB
7 BBCBBCCBBCB BBCBCEB BBBBCAACB
8
9 BBCBCACCBCB CBCBBBB BEBBBAAB
10 BBCBBCCBBCB BBCBCEB BBBBCAACB
11 BBCBBCCBBCB BBCBCEB BBBBCAACB
The values in each column will be strings of varying length. I want an R function that for each column above, will
1) generate a dynamic number of columns based on the maximum length of any string in the column e.g. L1 max length = 11, therefore 11 new columns each labelled L1_1:L1_11
2) then split the strings into triplets, e.g.
ID L1 L2 L3 L1_1 L1_2 L1_3 L1_4 L1_5 L1_6 L1_7 L1_8 L1_9
1 BBCBCACCBCB CBCBBBB BEBBBAAB BBC BCB CBC BCA CAC ACC CCB CBC BCB
3) perform a calculation on this triplet i.e. (number of 'a' * 1) + (number of 'b' * 3) + (number of 'c'*7) in the triplet.
4) return the value of this calculation in the new column.
I have found that the code suggested does exactly what I need when run for columns L1, L2 but does not work for L3. The error I receive is 'Error in as.data.frame.matrix(passed.args[[i]], stringsAsFactors = st : missing value where TRUE/FALSE needed'
Any ideas?
Thanks very much.
EDIT
dput(df):
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))

#DATA
df = structure(list(ID = 1:4, L1 = c("abbbcc", "aabacd", "abbda",
"bbad")), .Names = c("ID", "L1"), class = "data.frame", row.names = c(NA,
-4L))
#Go through the strings and split into subgroups of 3 characters.
#Put the substrings in a list
temp = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
#Obtain the length of the subgroup with the most triplets
temp_l = max(lengths(temp))
#Subset the subgroups from 1 to temp_l so that remianing values are NA
cbind(df, setNames(data.frame(do.call(rbind, lapply(temp, function(a)
a[1:temp_l]))), nm = paste0("L1_",1:temp_l)))
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc abb bbb bbc bcc
#2 2 aabacd aab aba bac acd
#3 3 abbda abb bbd bda <NA>
#4 4 bbad bba bad <NA> <NA>
If you want calculation based on triplets, run the following before doing the cbind step
temp_L1 = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0("L1_",1:temp_L1_length))
#REPEAT FOR L2, L3, ...
cbind(df, temp_L1) #Run cbind(df, temp_L1, temp_L2, ...)
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc 7 9 13 17
#2 2 aabacd 5 5 11 8
#3 3 abbda 7 6 4 NA
#4 4 bbad 7 4 NA NA
UPDATE
You could create a function and use it like shown below
#FUNCTION
foo = function(data, column){
temp_L1 = lapply(as.character(data[[column]]), function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y, ignore.case = TRUE))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y, ignore.case = TRUE))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y, ignore.case = TRUE))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0(column,"_",1:temp_L1_length))
return(temp_L1)
}
#USING ON NEW DATA
cbind(df, do.call(cbind, lapply(colnames(df)[-1], function(x) foo(data = df, column = x))))

If you want to use tidyverse verbs
library(tidyverse)
df1 <- df %>%
mutate(L2=L1) %>% # copies L1
nest(L2) %>% # nest L1
mutate(data=map(data,~sapply(1:(nchar(.x)-2), function(y) substr(.x, y, y+2)))) %>% # makes triplets
unnest(data) %>% # unnest triplets
group_by(ID) %>% # perform next operations group wise
mutate(rn=letters[row_number()]) %>% # make future column names
spread(rn,data) # spread long format into wide format (columns)
ID L1 a b c d
1 1 abbbcc abb bbb bbc bcc
2 2 aabacd aab aba bac acd
3 3 abbda abb bbd bda <NA>
4 4 bbad bba bad <NA> <NA>

Related

calculating multiple variable in column based on comparing dates in rows as well as header of columns

i am unable to think of an easy method to do this.
Sample data is :
set.seed(101)
b=sample(seq(as.Date("2010/1/1"), as.Date("2010/1/10"), "days"), 3)
f1=data.frame(a=1:length(b), b=b)
col_names=paste(c('x', 'y'), sort(rep(seq(as.Date("2010/1/1"), as.Date("2010/1/10"), "days"), 2)), sep = '')
set.seed((102))
f2 <- data.frame(matrix(sample(0:5,30, replace = T), ncol = length(col_names), nrow = nrow(f1)))
names(f2)=col_names
f3=data.frame(f1, f2)
or
dput(f3)
structure(list(a = 1:3, b = structure(c(14613, 14610, 14615), class = "Date"),
x2010.01.01 = c(3L, 2L, 4L), y2010.01.01 = c(3L, 0L, 2L),
x2010.01.02 = c(5L, 1L, 5L), y2010.01.02 = c(2L, 5L, 4L),
x2010.01.03 = c(4L, 2L, 3L), y2010.01.03 = c(5L, 4L, 2L),
x2010.01.04 = c(5L, 5L, 5L), y2010.01.04 = c(3L, 3L, 3L),
x2010.01.05 = c(1L, 2L, 0L), y2010.01.05 = c(2L, 2L, 2L),
x2010.01.06 = c(3L, 2L, 4L), y2010.01.06 = c(3L, 0L, 2L),
x2010.01.07 = c(5L, 1L, 5L), y2010.01.07 = c(2L, 5L, 4L),
x2010.01.08 = c(4L, 2L, 3L), y2010.01.08 = c(5L, 4L, 2L),
x2010.01.09 = c(5L, 5L, 5L), y2010.01.09 = c(3L, 3L, 3L),
x2010.01.10 = c(1L, 2L, 0L), y2010.01.10 = c(2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-3L))
Im trying to create new columns based on comparing b date to column header. im calculating 1 day avg, 3 day avg and so on.
In 1st case the date is 4th jan which imply that 1 day col would be x2010.01.04, 3 day avg would include (x2010.01.04,x2010.01.03,x2010.01.02) and so on. This need to be done for x and y variable both.
Finally op should look like
a b oneday_avg_x oneday_avg_y threeday_avg_x threeday_avg_y
1 1 2010-01-04 5 3 (5+4+5)/3=4.6 3.3
2 2 2010-01-01 2 0 2 0
3 3 2010-01-06 4 2 3 2.3
Let me know if anything is missing.
We can use apply for "x" and "y" values. We remove first leading "x" and "y" from column names, convert it to Date and match it with b value. Return that index along with mean of previous 3 index from the apply call. As apply converts everything to characters, we use type.convert to convert columns to appropriate classes.
x_cols <- grep("^x", names(f3))
y_cols <- grep("^y", names(f3))
out <- f3[1:2]
out[c("oneday_avg_x", "threeday_avg_x")] <- t(apply(f3[c(2, x_cols)], 1, function(x) {
inds <- match(as.Date(x[[1]]), as.Date(sub("^x", "", names(x)), "%Y.%m.%d"))
c(x[inds], mean(as.numeric(x[(inds - 2) : inds]), na.rm = TRUE))
}))
out[c("oneday_avg_y", "threeday_avg_y")] <- t(apply(f3[c(2, y_cols)], 1, function(x) {
inds <- match(as.Date(x[[1]]), as.Date(sub("^y", "", names(x)), "%Y.%m.%d"))
c(x[inds], mean(as.numeric(x[(inds - 2) : inds]), na.rm = TRUE))
}))
out <- type.convert(out)
out
# a b oneday_avg_x threeday_avg_x oneday_avg_y threeday_avg_y
#1 1 2010-01-04 5 4.6667 3 3.3333
#2 2 2010-01-01 2 2.0000 0 0.0000
#3 3 2010-01-06 4 3.0000 2 2.3333
EDIT
A more scalable solution which can handle multiple day averages by using single apply
x_cols <- grep("^x", names(f3))
y_cols <- grep("^y", names(f3))
names(f3)[-(1:2)] <- gsub("\\.", "-", sub(".{1}", "", names(f3)[-(1:2)]))
out <- f3[1:2]
num <- c(1, 3)
new_cols <- c(outer(num, c("x", "y"), function(x, y) paste0(x, "_day_avg_", y)))
out[new_cols] <- t(apply(f3, 1, function(x) {
x_ind <- match(x[[2]], names(x)[x_cols])
x_vals <- sapply(num, function(y)
mean(as.numeric(x[x_cols][max((x_ind - y + 1), 1):x_ind])))
y_ind <- match(x[[2]], names(x)[y_cols])
y_vals <- sapply(num, function(y)
mean(as.numeric(x[y_cols][max((y_ind - y + 1), 1):y_ind])))
c(x_vals, y_vals)
}))
out
# a b 1_day_avg_x 3_day_avg_x 1_day_avg_y 3_day_avg_y
#1 1 2010-01-04 5 4.666667 3 3.333333
#2 2 2010-01-01 2 2.000000 0 0.000000
#3 3 2010-01-06 4 3.000000 2 2.333333

How to loop through a list of lists, whilst subsetting the lists based on a variable

I have a list of lists similar to the toy example given here. I would like to loop through this list to return a new list which has had elements removed based on a variable.
dput(head(list)):
list(FEB_gems = list(GAME1 = structure(list(GAME1_Class = structure(c(2L,
1L, 5L, 4L, 3L), .Label = c("fighter", "paladin", "rouge", "sorcerer",
"wizard"), class = "factor"), GAME1_Race = structure(c(3L, 1L,
4L, 3L, 2L), .Label = c("elf", "gnome", "human", "orc"), class = "factor"),
GAME1_Alignment = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("CE",
"CG", "LG", "NE", "NN"), class = "factor"), GAME1_Level = c(6,
7, 6, 7, 7), GAME1_Alive = structure(c(1L, 1L, 1L, 1L, 1L
), .Label = "y", class = "factor")), class = "data.frame", row.names = c(NA,
-5L)), GAME2 = structure(list(GAME2_Class = structure(c(3L, 5L,
2L, 4L, 1L), .Label = c("bard", "cleric", "fighter", "monk",
"wizard"), class = "factor"), GAME2_Race = structure(c(2L, 3L,
2L, 4L, 1L), .Label = c("dwarf", "elf", "half-elf", "human"), class = "factor"),
GAME2_Alignment = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("CE",
"CG", "LG", "NE", "NN"), class = "factor"), GAME2_Level = c(5,
5, 5, 5, 5), GAME2_Alive = structure(c(1L, 2L, 2L, 2L, 2L
), .Label = c("n", "y"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))), MAR_gems = list(GAME3 = structure(list(GAME3_Class = structure(c(2L,
1L, 5L, 4L, 3L), .Label = c("barbarian", "cleric", "monk", "ranger",
"warlock"), class = "factor"), GAME3_Race = structure(c(2L, 3L,
2L, 4L, 1L), .Label = c("dwarf", "elf", "half-elf", "human"), class = "factor"),
GAME3_Alignment = structure(c(2L, 2L, 1L, 3L, 2L), .Label = c("CE",
"LG", "LN"), class = "factor"), GAME3_Level = c(1, 1, 1,
1, 1), GAME3_Alive = structure(c(2L, 2L, 2L, 1L, 2L), .Label = c("n",
"y"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L)), GAME4 = structure(list(GAME4_Class = structure(c(2L, 1L,
5L, 4L, 3L), .Label = c("fighter", "paladin", "rouge", "sorcerer",
"wizard"), class = "factor"), GAME4_Race = structure(c(2L, 3L,
2L, 4L, 1L), .Label = c("dwarf", "elf", "half-elf", "human"), class = "factor"),
GAME4_Alignment = structure(c(1L, 2L, 1L, 4L, 3L), .Label = c("CE",
"CG", "LG", "LN"), class = "factor"), GAME4_Level = c(5,
5, 5, 5, 5), GAME4_Alive = structure(c(1L, 2L, 2L, 2L, 2L
), .Label = c("n", "y"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))))
I have made some attempt at manually sub-setting the list of lists. A function would be preferable is because I have multiple types of data to subset.
1) sub-setting Level columns based on interger
df1 <- Games.Split[[1]][[1]]
Level <- df1[which(df1[4] > 6),]
Games.Split[[1]][[1]] <- Level
df1:
GAME1_Class GAME1_Race GAME1_Alignment GAME1_Level GAME1_Alive
1 paladin human NE 6 y
2 fighter elf CG 7 y
3 wizard orc CE 6 y
4 sorcerer human NN 7 y
5 rouge gnome LG 7 y
Level:
GAME1_Class GAME1_Race GAME1_Alignment GAME1_Level GAME1_Alive
2 fighter elf CG 7 y
4 sorcerer human NN 7 y
5 rouge gnome LG 7 y
2) sub-setting Alive columns based on string
df2 <- Games.Split[[1]][[2]]
Alive <- df2[which(df2[5] == 'y'),]
Games.Split[[1]][[2]] <- Alive
df2:
GAME2_Class GAME2_Race GAME2_Alignment GAME2_Level GAME2_Alive
1 fighter elf NE 5 n
2 wizard half-elf CG 5 y
3 cleric elf CE 5 y
4 monk human NN 5 y
5 bard dwarf LG 5 y
Alive:
GAME2_Class GAME2_Race GAME2_Alignment GAME2_Level GAME2_Alive
2 wizard half-elf CG 5 y
3 cleric elf CE 5 y
4 monk human NN 5 y
5 bard dwarf LG 5 y
However I'm struggling to put this into action in a for loop to perform these sub-setting tasks on the entire list.
for (i in Games.Split){
for (j in i){
Alive = j[which(j[5] == 'y'),]
j <- Alive
}
}
Overall, a function/ method that can iterate through the whole list to provide a new list which no longer has the removed elements.
Since you have two levels of lists to organizing the dataframes, this will require a nested list apply function (lapply)-- same as a loop but a little neater. Here is an example that creates a function to subset the game list (gameList) based on maximum level (maxLevel):
listSubset <- function(x, maxLevel){
lapply(x, function(ls){
lapply(ls, function(df) df[df[[grep('Level', names(df), value = TRUE)]] < maxLevel, ])
})
}
listSubset(x = gameList, maxLevel = 6)
Output:
$`FEB_gems`
$`FEB_gems`$`GAME1`
[1] GAME1_Class GAME1_Race GAME1_Alignment GAME1_Level GAME1_Alive
<0 rows> (or 0-length row.names)
$`FEB_gems`$GAME2
GAME2_Class GAME2_Race GAME2_Alignment GAME2_Level GAME2_Alive
1 fighter elf NE 5 n
2 wizard half-elf CG 5 y
3 cleric elf CE 5 y
4 monk human NN 5 y
5 bard dwarf LG 5 y
$MAR_gems
$MAR_gems$`GAME3`
GAME3_Class GAME3_Race GAME3_Alignment GAME3_Level GAME3_Alive
1 cleric elf LG 1 y
2 barbarian half-elf LG 1 y
3 warlock elf CE 1 y
4 ranger human LN 1 n
5 monk dwarf LG 1 y
$MAR_gems$GAME4
GAME4_Class GAME4_Race GAME4_Alignment GAME4_Level GAME4_Alive
1 paladin elf CE 5 n
2 fighter half-elf CG 5 y
3 wizard elf CE 5 y
4 sorcerer human LN 5 y
5 rouge dwarf LG 5 y
All the functions are in base R, so no need to install and learn new packages.
If there are only two nested lists, and want different filtering conditions, apply on it individually and assign the output back to the list element. We loop through the master list with map and then apply the logical conditions
library(purrr)
library(dplyr)
lst2 <- map(lst1, ~ {
.x[[1]] <- .x[[1]] %>%
filter_at(4, all_vars(. > 6))
.x[[2]] <- .x[[2]] %>%
filter_at(5, all_vars(. == 'y'))
.x
})
I'd argue that life would be easier if you restructure your data, then use dplyr's filter to pull out what you want (or omit what you don't want). Assuming your original data is called foo:
# Load libraries
library(dplyr)
library(purrr)
# Remove one list
bar <- unlist(foo, recursive = FALSE)
# Get names of campaigns and games
campaign_games <- data.frame(do.call(rbind, strsplit(names(bar), "\\.")))
# Add campaigns and games numbers to data frames
ls_games <- pmap(list(campaign_games[, 1], campaign_games[, 2], bar), cbind)
# Rename all columns
ls_games <- lapply(ls_games, function(x){names(x) <- c("Campaign", "Game_n", "Class", "Race", "Alignment", "Level", "Alive"); x})
# Convert to data frame
df <- bind_rows(ls_games)
# Look at result
head(df)
Now your data looks like this:
# Campaign Game_n Class Race Alignment Level Alive
# 1 FEB_gems GAME1 paladin human NE 6 y
# 2 FEB_gems GAME1 fighter elf CG 7 y
# 3 FEB_gems GAME1 wizard orc CE 6 y
# 4 FEB_gems GAME1 sorcerer human NN 7 y
# 5 FEB_gems GAME1 rouge gnome LG 7 y
# 6 FEB_gems GAME2 fighter elf NE 5 n
which is easy to handle. For example, pull those that are alive in game 1 of FEB gems and are level 7 or higher.
df %>% filter(Alive == "y", Campaign == "FEB_gems",
Level >= 7, Game_n == "GAME1")
# Campaign Game_n Class Race Alignment Level Alive
# 1 FEB_gems GAME1 fighter elf CG 7 y
# 2 FEB_gems GAME1 sorcerer human NN 7 y
# 3 FEB_gems GAME1 rouge gnome LG 7 y

merging and counting similar strings

I have a data with three columns like
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data for example
in df1, I have the first row I have Q92793,Q09472,Q9Y6Q9,Q92831
then I look at df2 and df3 and see if any of these members are in there then in this example, I make the following data
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1, it is the count of strings separated by a ,which is 4
df2 is 0 because there was not any similar string accords df2
df3 is Q92793,Q09472 which paste the string which were similar in here
a desire output looks like below
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The below function does not work , for example, use this data as input
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
This works for your example:
# First convert factors to strings to lists
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,','))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = Position(not.empty, df2.intersect, nomatch=0)
out[r, 'df3'] = Position(not.empty, df3.intersect, nomatch=0)
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(Find(not.empty, df2.intersect, nomatch=0), collapse=',')
out[r, 'df3.1'] = paste(Find(not.empty, df3.intersect, nomatch=0), collapse=',')
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,',\\s*'))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = paste(which(sapply(df2.intersect, not.empty)), collapse=',')
out[r, 'df3'] = paste(which(sapply(df3.intersect, not.empty)), collapse=',')
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(unique(unlist(df2.intersect)), collapse=',')
out[r, 'df3.1'] = paste(unique(unlist(df3.intersect)), collapse=',')
}
out[out==""] = "0"

splitting a data into several parts

My data is below which I want to split it based on IDs into several parts
df1<- structure(list(Ids1 = 1:7, string1 = structure(c(3L, 2L, 4L,
1L, 1L, 1L, 1L), .Label = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf",
"ydis"), class = "factor"), Ids2 = c(1L, 3L, 4L, 9L, 10L, NA,
NA), string2 = structure(c(4L, 6L, 2L, 3L, 5L, 1L, 1L), .Label = c("",
"gdyijq,udyhfs", "gqdtr", "hishsgd,gugddf", "nlrshf", "ydis"), class = "factor")), .Names = c("Ids1",
"string1", "Ids2", "string2"), class = "data.frame", row.names = c(NA,
-7L))
The first I want to make df.1 when I keep only those that have similar Ids and count how many of string1 is similar to string2 (they are separated by a comma).
Ids1 string1 ids2 string2 Similar
1 hishsgd,gugddf 1 hishsgd,gugddf 2
3 ydis 3 ydis 1
4 gdyijq,udyhfs,gqdtr 4 gdyijq,udyhfs 2
I do this
df.1 <- df1[which(df1$Ids1 == df1$Ids2), ]
which only gives me the first row and nothing else
Then I want to have those that there are only ids 1 which dont exist in ids2
Ids1 string1
2 hdydg
5 gdyijq,udyhfs,gqdtr
6 gdyijq,udyhfs,gqdtr
7 gdyijq,udyhfs,gqdtr
I do this but also does not work
df.2<- df1[which(df1$Ids1 != df1$Ids2), ]
and the last I want to keep those that are only in ids2 and not ids1
Ids1 string1
9 gqdtr
10 nlrshf
which I do this but also does not work
df.3<- df1[which(df1$Ids2 != df1$Ids1), ]
Here is one solution I could come up with based on joins using dplyr package:
library(dplyr)
df.1 <- inner_join(select(df1, Ids1, string1), select(df1, Ids2, string2), by = c('Ids1' = 'Ids2'))
df.1$Similar <- apply(df.1[, -1], 1, function(x) sum(unlist(strsplit(x[1], ',')) %in% unlist(strsplit(x[2], ','))))
df.2 <- anti_join(select(df1, Ids1, string1), select(df1, Ids2, string2), by = c('Ids1' = 'Ids2'))
df.3 <- anti_join(select(df1, Ids2, string2), select(df1, Ids1, string1), by = c('Ids2' = 'Ids1'))
df.3 <- df.3[complete.cases(df.3), ]
You can also do something different for df.2 and df.3 as follows:
df.2 <- df1[!df1$Ids1 %in% df1$Ids2, c('Ids1', 'string1')]
df.3 <- df1[!df1$Ids2 %in% df1$Ids1, c('Ids2', 'string2')]
df.3 <- df.3[complete.cases(df.3), ]

Select the nth value of aggregated column after group by in R

Given df as follows:
# group value
# 1 A 8
# 2 A 1
# 3 A 7
# 4 B 3
# 5 B 2
# 6 B 6
# 7 C 4
# 8 C 5
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
And a vector of indices (possibly with NA):
inds <- c(2,1,NA)
How we can get the nth element of column value per group, preferably in base R?
For example, based on inds, we want the second element of value in group A, first element in group B, NA in group C. So the result would be:
#[1] 1 3 NA
Here is a solution with mapply and split:
mapply("[", with(df, split(value, group)), inds)
which returns a named vector
A B C
1 3 NA
with(df, split(value, group)) splits the data frame by group and returns a list of data frames. mapply takes that list and "inds" and applies the subsetting function "[" to each pairs of arguments.
Using levels and sapply you could do:
DF <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
inds <- c(2,1,NA)
lvls = levels(DF$group)
groupInds = sapply(1:length(lvls),function(x) DF$value[DF$group==lvls[x]][inds[x]] )
groupInds
#[1] 1 3 NA
Using again mapply (but not nearly as elegant as IMO's answer):
mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds)
I know you said preferably in base R, but just for the record, here is a data.table way
setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
#[1] 1 3 NA
I just did come up with another solution:
diag(aggregate(value~group, df, function(x) x[inds])[,-1])
#[1] 1 3 NA
Benchmarking
library(microbenchmark)
library(data.table)
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
inds <- c(2,1,NA)
f_Imo <- function(df) as.vector(mapply("[", with(df, split(value, group)), inds))
f_Osssan <- function(df) {lvls = levels(df$group);sapply(1:length(lvls),function(x) df$value[df$group==lvls[x]][inds[x]])}
f_User2321 <- function(df) unlist(mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds))
f_dww <- function(df) setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
f_m0h3n <- function(df) diag(aggregate(value~group, df, function(x) x[inds])[,-1])
all.equal(f_Imo(df), f_Osssan(df), f_User2321(df), f_dww(df), f_m0h3n(df))
# [1] TRUE
microbenchmark(f_Imo(df), f_Osssan(df), f_m0h3n(df), f_User2321(df), f_dww(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_Imo(df) 71.004 85.1180 91.52996 91.748 96.8810 121.048 100
# f_Osssan(df) 252.788 276.5265 318.70529 287.648 301.5495 2651.492 100
# f_m0h3n(df) 1422.627 1555.4365 1643.47184 1618.740 1670.7095 4729.827 100
# f_User2321(df) 2889.738 3000.3055 3148.44916 3037.945 3118.7860 6013.442 100
# f_dww(df) 2960.740 3086.2790 3206.02147 3143.381 3250.9545 5976.229 100

Resources