Compare first element of a list with another list - r

I am using R and need a hint to solve my problem:
I have two lists and I want to compare the values of the first column of list "a" with the values of the first column of list "b". If a value from "a" also exists in "b", I want to write the corresponding value from the second column of list "b" into the second column of list "a".
So, here is list "a":
X.WORD FREQ
abase 0
abased 0
abasing 0
abashs 0
here list "b"
V1 V2
arthur 11
abased 29
turtle 9
abash 2
The result should be
X.WORD FREQ
abase 0
abased 29
abasing 0
abashs 0
Thanks for your answers

That's just a task for simple merge in base R
# Left-join b onto a by word, keeping every row of a; words absent from b get NA.
Res <- merge(x = a, y = b, by.x = "X.WORD", by.y = "V1", all.x = TRUE)
# Keep the key and the looked-up count, dropping a's original FREQ column.
Res <- Res[, c("X.WORD", "V2")]
# A word that is missing from b has no count, so report it as 0.
Res$V2 <- replace(Res$V2, is.na(Res$V2), 0)
Res
# X.WORD V2
# 1 abase 0
# 2 abased 29
# 3 abashs 0
# 4 abasing 0
Data
a <- structure(list(X.WORD = structure(c(1L, 2L, 4L, 3L), .Label = c("abase",
"abased", "abashs", "abasing"), class = "factor"), FREQ = c(0L,
0L, 0L, 0L)), .Names = c("X.WORD", "FREQ"), class = "data.frame", row.names = c(NA,
-4L))
b <- structure(list(V1 = structure(c(3L, 1L, 4L, 2L), .Label = c("abased",
"abash", "arthur", "turtle"), class = "factor"), V2 = c(11L,
29L, 9L, 2L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-4L))

Here is one approach.
library(dplyr)
# Drop the placeholder FREQ first, look the counts up in foo2, then give the
# looked-up column the FREQ name and turn "no match" (NA) into 0.
ana <- foo %>%
  select(-FREQ) %>%
  left_join(foo2, by = c("X.WORD" = "V1")) %>%
  rename(FREQ = V2) %>%
  mutate(FREQ = replace(FREQ, is.na(FREQ), 0))
# X.WORD FREQ
#1 abase 0
#2 abased 29
#3 abasing 0
#4 abashs 0
Data
foo <- structure(list(X.WORD = structure(c(1L, 2L, 4L, 3L), .Label = c("abase",
"abased", "abashs", "abasing"), class = "factor"), FREQ = c(0L,
0L, 0L, 0L)), .Names = c("X.WORD", "FREQ"), class = "data.frame", row.names = c(NA,
-4L))
foo2 <- structure(list(V1 = structure(c(3L, 1L, 4L, 2L), .Label = c("abased",
"abash", "arthur", "turtle"), class = "factor"), V2 = c(11L,
29L, 9L, 2L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-4L))

Related

overlapping unique dataframes in R

My two dataframes are:
df1<-structure(list(header1 = structure(1:4, .Label = c("a", "b",
"c", "d"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
and
df2<-structure(list(sample_x = structure(c(1L, 1L, 2L, 3L), .Label = c("0",
"a", "c"), class = "factor"), sample_y = structure(c(1L, 3L,
2L, 4L), .Label = c("0", "a", "m", "t"), class = "factor"), sample_z = structure(c(3L,
2L, 1L, 1L), .Label = c("0", "a", "c"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
0s in df2 means no values.
Now I want to overlap df1 and df2 to make an output dataframe(df3):
df3<-structure(list(sample_x = c(2L, 2L, 0L), sample_y = c(1L, 3L,
2L), sample_z = c(2L, 2L, 0L)), class = "data.frame", row.names = c("overlap_df1_df2",
"unique_df1", "unique_df2"))
I tried the datatable function foverlaps:
setkeyv(df1, names(df1))
setkeyv(df2, names(df2))
df3<-foverlaps(df1,df2)
But seems like I need to have some common column names in these two dataframes, which is obviously not the case.
Thank you!
Loop through columns, and use set operations:
# For each column of df2, count the values shared with df1$header1 (o), the
# df1 values absent from that column (u_df1) and the column's own values that
# are absent from df1 (u_df2). The result is a 3-row matrix with one column
# per column of df2.
vapply(df2, function(i) {
  ref <- as.character(df1$header1)
  x <- as.character(i)
  # "0" is df2's placeholder for "no value", so discard it together with real
  # NAs; otherwise it would be counted as a value unique to df2.
  x <- x[!is.na(x) & x != "0"]
  o <- intersect(ref, x)
  u_df1 <- setdiff(ref, o)
  u_df2 <- setdiff(x, o)
  c(o = length(o),
    u_df1 = length(u_df1),
    u_df2 = length(u_df2))
}, FUN.VALUE = c(o = 0L, u_df1 = 0L, u_df2 = 0L))
# sample_x sample_y sample_z
# o 2 1 2
# u_df1 2 3 2
# u_df2 0 2 0
A solution using map:
library(purrr)
# One row per statistic, one column per column of df2. Every row is computed
# directly from the data: the arguments of rbind() cannot refer to each other
# by name, so a row cannot be derived from the rows listed before it.
rbind(
  # values present in both df1$header1 and the df2 column
  overlap = map_dbl(df2, ~ length(intersect(as.character(df1$header1), as.character(.x)))),
  # df1 values that the df2 column does not contain
  unique_df1 = map_dbl(df2, ~ length(setdiff(as.character(df1$header1), as.character(.x)))),
  # df2 values that df1 does not contain; "0" only marks "no value" in df2
  unique_df2 = map_dbl(df2, ~ length(setdiff(as.character(.x), c(as.character(df1$header1), "0"))))
)
sample_x sample_y sample_z
overlap 2 1 2
unique_df1 2 3 2
unique_df2 0 2 0

Loop to create bivariate/cross table

I am trying to create a loop that computes the frequency table between column 1 and column 2, column 1 and column 3, and so on up to column 1 and column 30.
Col1 col2 col3
0 A 25
1 A 30
0 A 30
1 B 20
0 B 20
Output.
0 1 0 1
A 2 1 25 0 0
B 1 1 30 1 1
20 1 1
Use lapply to loop over columns and then table to calculate frequency
# Cross-tabulate every column except the first against the first column.
# The result is a list with one contingency table per remaining column, where
# the column's values index the rows and the first column's values the columns.
lapply(df[-1], function(x) table(x, df[, 1]))
#$col2
#x 0 1
# A 2 1
# B 1 1
#$col3
#x 0 1
# 20 1 1
# 25 1 0
# 30 1 1
Or a shorter version using Map
# Map() recycles the one-column df[1] against each column of df[-1], so this
# calls table(<first column>, <other column>) once per remaining column.
# The first column is the first argument here, so it indexes the rows.
Map(table, df[1], df[-1])
data
df <- structure(list(Col1 = c(0L, 1L, 0L, 1L, 0L), col2 = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("A", "B"), class = "factor"), col3 = c(25L,
30L, 30L, 20L, 20L)), class = "data.frame", row.names = c(NA, -5L))
We can use tidyverse
library(tidyverse)
# For each column other than Col1: count the (Col1, column) combinations, then
# turn the Col1 values into columns, filling combinations that never occur
# with 0. pivot_wider() replaces the superseded spread().
map(names(df)[-1], ~ cbind(df[1], df[.x]) %>%
  count(Col1, !!rlang::sym(.x)) %>%
  pivot_wider(names_from = Col1, values_from = n, values_fill = list(n = 0L)))
data
df <- structure(list(Col1 = c(0L, 1L, 0L, 1L, 0L), col2 = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("A", "B"), class = "factor"), col3 = c(25L,
30L, 30L, 20L, 20L)), class = "data.frame", row.names = c(NA, -5L))

How to subtract one record from another data frame in R

I have two data frames. One data frame has only 1 record and 3 columns. The other data frame has 6 rows and 3 columns.
Now I want to subtract data frame 1 values from data frame 2 values.
Sample data:
df1 = structure(list(col1 = 2L, col2 = 3L, col3 = 4L), .Names = c("col1",
"col2", "col3"), class = "data.frame", row.names = c(NA, -1L))
df2 = structure(list(col1 = c(1L, 2L, 4L, 5L, 6L, 3L), col2 = c(1L,
2L, 4L, 3L, 5L, 7L), col3 = c(6L, 4L, 3L, 6L, 4L, 6L)), .Names = c("col1", "col2", "col3"), class = "data.frame", row.names = c(NA, -6L))
Final output should be like,
output = structure(list(col1 = c(-1L, 0L, 2L, 3L, 4L, 1L), col2 = c(-2L,
-1L, 1L, 0L, 2L, 4L), col3 = c(2L, 0L, -1L, 2L, 0L, 2L)), .Names = c("col1","col2", "col3"), class = "data.frame", row.names = c(NA, -6L))
Try this..
# Creating Datasets
df1 <- structure(list(col1 = 2L, col2 = 3L, col3 = 4L),
  .Names = c("col1", "col2", "col3"), class = "data.frame",
  row.names = c(NA, -1L))
df2 <- structure(list(col1 = c(1L, 2L, 4L, 5L, 6L, 3L),
  col2 = c(1L, 2L, 4L, 3L, 5L, 7L),
  col3 = c(6L, 4L, 3L, 6L, 4L, 6L)),
  .Names = c("col1", "col2", "col3"), class = "data.frame",
  row.names = c(NA, -6L))
# Output
# Subtract df1's single value from the same-named column of df2. A named list
# built with lapply() (rather than sapply()) keeps one output column per input
# column even when df2 has a single row, where sapply() would simplify the
# result to a plain vector and data.frame() would turn it into one column.
data.frame(lapply(setNames(nm = names(df1)), function(i) df2[[i]] - df1[[i]]))
# col1 col2 col3
# 1 -1 -2 2
# 2 0 -1 0
# 3 2 1 -1
# 4 3 0 2
# 5 4 2 0
# 6 1 4 2
If you do df2 - df1 directly you get
df2 - df1
Error in Ops.data.frame(df2, df1) :
‘-’ only defined for equally-sized data frames
So let us make df1 the same size as df2 by repeating rows and then subtract
df2 - df1[rep(seq_len(nrow(df1)), nrow(df2)), ]
# col1 col2 col3
#1 -1 -2 2
#2 0 -1 0
#3 2 1 -1
#4 3 0 2
#5 4 2 0
#6 1 4 2
Or another option is using mapply without replicating rows
mapply("-", df2, df1)
This would return a matrix, if you want a dataframe back
data.frame(mapply("-", df2, df1))
# col1 col2 col3
#1 -1 -2 2
#2 0 -1 0
#3 2 1 -1
#4 3 0 2
#5 4 2 0
#6 1 4 2
We can use sweep:
# Subtract column-wise (MARGIN = 2): unlist(df1) supplies one value per
# column of df2, and "-" is applied between each column and its value.
x <- sweep(df2, 2, unlist(df1), "-")
#test if same as output
identical(output, x)
# [1] TRUE
Note that sweep is about twice as slow as mapply:
# Larger input for timing: 100,000 rows of uniform random numbers in the same
# three columns as df2.
df2big <- data.frame(col1 = runif(100000),
col2 = runif(100000),
col3 = runif(100000))
# Time the three approaches on the same input; each expression is labelled
# with the name of the function it exercises.
microbenchmark::microbenchmark(
mapply = data.frame(mapply("-", df2big, df1)),
sapply = data.frame(sapply(names(df1), function(i){df2big[[i]] - df1[[i]]})),
sweep = sweep(df2big, 2, unlist(df1), "-"))
# Unit: milliseconds
# expr min lq mean median uq max neval
# mapply 5.239638 7.645213 11.49182 8.514876 9.345765 60.60949 100
# sapply 5.250756 5.518455 10.94827 8.706027 10.091841 59.09909 100
# sweep 10.572785 13.912167 21.18537 14.985525 16.737820 64.90064 100

merging and counting similar strings

I have a data with three columns like
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data for example
in df1, I have the first row I have Q92793,Q09472,Q9Y6Q9,Q92831
then I look at df2 and df3 and see if any of these members are in there then in this example, I make the following data
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1, it is the count of strings separated by a ,which is 4
df2 is 0 because there was not any similar string accords df2
df3 is Q92793,Q09472 which paste the string which were similar in here
a desire output looks like below
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The below function does not work , for example, use this data as input
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
This works for your example:
# Parse every cell into a character vector of IDs. strsplit() always returns a
# list with one element per row, whereas sapply() would simplify to a matrix
# whenever every row happens to hold the same number of IDs. Whitespace around
# the commas is ignored, and a column that is already a list (e.g. because the
# data was parsed before) is only trimmed instead of being split again.
split_ids <- function(col) {
  if (is.list(col)) {
    return(lapply(col, trimws))
  }
  strsplit(trimws(as.character(col)), "\\s*,\\s*")
}
Inputdf[] <- lapply(Inputdf, split_ids)
not.empty <- function(x) length(x) > 0

# Build the result row for row r of df1: the position of the first df2/df3 row
# that shares an ID with it (0 if none), the number of IDs in the df1 row, and
# the shared IDs of that first match ("0" if none).
match_row <- function(r) {
  ids <- Inputdf$df1[[r]]
  df2.intersect <- lapply(Inputdf$df2, intersect, ids)
  df3.intersect <- lapply(Inputdf$df3, intersect, ids)
  data.frame(
    df1 = r,
    df2 = Position(not.empty, df2.intersect, nomatch = 0L),
    df3 = Position(not.empty, df3.intersect, nomatch = 0L),
    Numberdf1 = length(ids),
    df2.1 = paste(Find(not.empty, df2.intersect, nomatch = 0), collapse = ","),
    df3.1 = paste(Find(not.empty, df3.intersect, nomatch = 0), collapse = ","),
    stringsAsFactors = FALSE
  )
}
# seq_len() iterates zero times on an empty input (1:0 would not); the rows
# are built separately and bound once instead of growing `out` cell by cell.
rows <- lapply(seq_len(nrow(Inputdf)), match_row)
out <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
# Parse every cell into a character vector of IDs. strsplit() always returns a
# list with one element per row, whereas sapply() would simplify to a matrix
# whenever every row happens to hold the same number of IDs. Whitespace around
# the commas is ignored, and a column that is already a list (e.g. because the
# data was parsed before) is only trimmed instead of being deparsed by
# as.character() and split again.
split_ids <- function(col) {
  if (is.list(col)) {
    return(lapply(col, trimws))
  }
  strsplit(trimws(as.character(col)), "\\s*,\\s*")
}
Inputdf[] <- lapply(Inputdf, split_ids)
not.empty <- function(x) length(x) > 0

# Build the result row for row r of df1, reporting every df2/df3 row that
# shares an ID with it (comma-separated positions) and all shared IDs.
match_row_all <- function(r) {
  ids <- Inputdf$df1[[r]]
  df2.intersect <- lapply(Inputdf$df2, intersect, ids)
  df3.intersect <- lapply(Inputdf$df3, intersect, ids)
  # vapply() keeps a logical vector even for an empty list, which which() needs
  hit_rows <- function(shared) {
    paste(which(vapply(shared, not.empty, logical(1))), collapse = ",")
  }
  shared_ids <- function(shared) {
    paste(unique(unlist(shared)), collapse = ",")
  }
  data.frame(
    df1 = r,
    df2 = hit_rows(df2.intersect),
    df3 = hit_rows(df3.intersect),
    Numberdf1 = length(ids),
    df2.1 = shared_ids(df2.intersect),
    df3.1 = shared_ids(df3.intersect),
    stringsAsFactors = FALSE
  )
}
# seq_len() iterates zero times on an empty input (1:0 would not); the rows
# are built separately and bound once instead of growing `out` cell by cell.
rows <- lapply(seq_len(nrow(Inputdf)), match_row_all)
if (length(rows) > 0) {
  out <- do.call(rbind, rows)
  # Rows without any match produce empty strings; show them as "0".
  out[out == ""] <- "0"
} else {
  out <- data.frame()
}

Replacing loop in dplyr R

So I am trying to program a function with dplyr without a loop, and here is something I do not know how to do.
Say we have tv stations (x,y,z) and months (2,3). If I group by this say we get
this output also with summarised numeric value
TV months value
x 2 52
y 2 87
z 2 65
x 3 180
y 3 36
z 3 99
This is for evaluated Brand.
Then I will have many Brands I need to filter to get only those which get value >=0.8*value of evaluated brand & <=1.2*value of evaluated brand
So for example from this down I would only want to filter first two, and this should be done for all months&TV combinations
brand TV MONTH value
sdg x 2 60
sdfg x 2 55
shs x 2 120
sdg x 2 11
sdga x 2 5000
As @akrun said, you need to use a combination of merging and subsetting. Here's a base R solution.
# Pair every brand row with the evaluated brand's value for the same TV
# station and month (value.x = brand value, value.y = evaluated value).
m <- merge(x = df, y = data, by.x = c("TV", "MONTH"), by.y = c("TV", "months"))
# Keep the brands within 80%-120% of the evaluated value, then leave the
# evaluated value itself out of the result.
m[with(m, value.x >= value.y * 0.8 & value.x <= value.y * 1.2),
  names(m) != "value.y"]
# TV MONTH brand value.x
#1 x 2 sdg 60
#2 x 2 sdfg 55
Data
data <- structure(list(TV = structure(c(1L, 2L, 3L, 1L, 2L, 3L), .Label = c("x",
"y", "z"), class = "factor"), months = c(2L, 2L, 2L, 3L, 3L,
3L), value = c(52L, 87L, 65L, 180L, 36L, 99L)), .Names = c("TV",
"months", "value"), class = "data.frame", row.names = c(NA, -6L
))
df <- structure(list(brand = structure(c(2L, 1L, 4L, 2L, 3L), .Label = c("sdfg",
"sdg", "sdga", "shs"), class = "factor"), TV = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "x", class = "factor"), MONTH = c(2L,
2L, 2L, 2L, 2L), value = c(60L, 55L, 120L, 11L, 5000L)), .Names = c("brand",
"TV", "MONTH", "value"), class = "data.frame", row.names = c(NA,
-5L))

Resources