I would like to do a few column operations using mutate in more elegant way as I have more than 200 columns in my table that I would like transform using mutate.
here is an example
Sample data:
df <- data.frame(treatment=rep(letters[1:2],10),
c1_x=rnorm(20),c2_y=rnorm(20),c3_z=rnorm(20),
c4_x=rnorm(20),c5_y=rnorm(20),c6_z=rnorm(20),
c7_x=rnorm(20),c8_y=rnorm(20),c9_z=rnorm(20),
c10_x=rnorm(20),c11_y=rnorm(20),c12_z=rnorm(20),
c_n=rnorm(20))
sample code:
dfm<-df %>%
mutate(cx=(c1_x*c4_x/c_n+c7_x*c10_x/c_n),
cy=(c2_y*c5_y/c_n+c8_y*c11_y/c_n),
cz=(c3_z*c6_z/c_n+c9_z*c12_z/c_n))
Despite the tangent, the initial recommendations for using tidyr functions is where you need to go. This pipe of functions seems to do the job based on what you've provided.
Your data:
df <- data.frame(treatment=rep(letters[1:2],10),
c1_x=rnorm(20), c2_y=rnorm(20), c3_z=rnorm(20),
c4_x=rnorm(20), c5_y=rnorm(20), c6_z=rnorm(20),
c7_x=rnorm(20), c8_y=rnorm(20), c9_z=rnorm(20),
c10_x=rnorm(20), c11_y=rnorm(20), c12_z=rnorm(20),
c_n=rnorm(20))
library(dplyr)
library(tidyr)
This first auxiliary data.frame is used to translate your c#_[xyz] variable into a unified one. I'm sure there are other ways to handle this, but it works and is relatively easy to reproduce and extend based on your 200+ columns.
variableTransform <- data_frame(
cnum = paste0("c", 1:12),
cvar = rep(paste0("a", 1:4), each = 3)
)
head(variableTransform)
# Source: local data frame [6 x 2]
# cnum cvar
# <chr> <chr>
# 1 c1 a1
# 2 c2 a1
# 3 c3 a1
# 4 c4 a2
# 5 c5 a2
# 6 c6 a2
Here's the pipe all at once. I'll explain the steps in a sec. What you're looking for is likely a combination of the treatment, xyz, and ans columns.
df %>%
tidyr::gather(cnum, value, -treatment, -c_n) %>%
tidyr::separate(cnum, c("cnum", "xyz"), sep = "_") %>%
left_join(variableTransform, by = "cnum") %>%
select(-cnum) %>%
tidyr::spread(cvar, value) %>%
mutate(
ans = a1 * (a2/c_n) + a3 * (a4/c_n)
) %>%
head
# treatment c_n xyz a1 a2 a3 a4 ans
# 1 a -1.535934 x -0.3276474 1.45959746 -1.2650369 1.02795419 1.15801448
# 2 a -1.535934 y -1.3662388 -0.05668467 0.4867865 -0.10138979 -0.01828831
# 3 a -1.535934 z -2.5026018 -0.99797169 0.5181513 1.20321878 -2.03197283
# 4 a -1.363584 x -0.9742016 -0.12650863 1.3612361 -0.24840493 0.15759418
# 5 a -1.363584 y -0.9795871 1.52027017 0.5510857 1.08733839 0.65270681
# 6 a -1.363584 z 0.2985557 -0.22883439 0.1536078 -0.09993095 0.06136036
First, we take the original data and turn all (except two) columns into two columns of "column name" and "column values" pairs:
df %>%
tidyr::gather(cnum, value, -treatment, -c_n) %>%
# treatment c_n cnum value
# 1 a 0.20745647 c1_x -0.1250222
# 2 b 0.01015871 c1_x -0.4585088
# 3 a 1.65671028 c1_x -0.2455927
# 4 b -0.24037137 c1_x 0.6219516
# 5 a -1.16092349 c1_x -0.3716138
# 6 b 1.61191700 c1_x 1.7605452
It will be helpful to split c1_x into c1 and x in order to translate the first and preserve the latter:
tidyr::separate(cnum, c("cnum", "xyz"), sep = "_") %>%
# treatment c_n cnum xyz value
# 1 a 0.20745647 c1 x -0.1250222
# 2 b 0.01015871 c1 x -0.4585088
# 3 a 1.65671028 c1 x -0.2455927
# 4 b -0.24037137 c1 x 0.6219516
# 5 a -1.16092349 c1 x -0.3716138
# 6 b 1.61191700 c1 x 1.7605452
From here, let's translate the c1, c2, and c3 variables into a1 (repeat for other 9 variables) using variableTransform:
left_join(variableTransform, by = "cnum") %>%
select(-cnum) %>%
# treatment c_n xyz value cvar
# 1 a 0.20745647 x -0.1250222 a1
# 2 b 0.01015871 x -0.4585088 a1
# 3 a 1.65671028 x -0.2455927 a1
# 4 b -0.24037137 x 0.6219516 a1
# 5 a -1.16092349 x -0.3716138 a1
# 6 b 1.61191700 x 1.7605452 a1
Since we want to deal with multiple variables simultaneously (with a simple mutate), we need to bring some of the variables back into columns. (The reason we gathered and will now spread helps me with keeping things organized and named well. I'm confident somebody can come up with another way to do it.)
tidyr::spread(cvar, value) %>% head
# treatment c_n xyz a1 a2 a3 a4
# 1 a -1.535934 x -0.3276474 1.45959746 -1.2650369 1.02795419
# 2 a -1.535934 y -1.3662388 -0.05668467 0.4867865 -0.10138979
# 3 a -1.535934 z -2.5026018 -0.99797169 0.5181513 1.20321878
# 4 a -1.363584 x -0.9742016 -0.12650863 1.3612361 -0.24840493
# 5 a -1.363584 y -0.9795871 1.52027017 0.5510857 1.08733839
# 6 a -1.363584 z 0.2985557 -0.22883439 0.1536078 -0.09993095
From here, we just need to mutate to get the right answer.
Similar to r2evans's answer, but with more manipulation instead of the joins (and less explanation).
library(tidyr)
library(stringr)
library(dplyr)
# get it into fully long form
gather(df, key = cc_xyz, value = value, c1_x:c12_z) %>%
# separate off the xyz and the c123
separate(col = cc_xyz, into = c("cc", "xyz")) %>%
# extract the number
mutate(num = as.numeric(str_replace(cc, pattern = "c", replacement = "")),
# mod it by 4 for groupings and add a letter so its a good col name
num_mod = paste0("v", (num %% 4) + 1)) %>%
# remove unwanted columns
select(-cc, -num) %>%
# go into a reasonable data width for calculation
spread(key = num_mod, value = value) %>%
# calculate
mutate(result = v1 + v2/c_n + v3 + v4 / c_n)
# treatment c_n xyz v1 v2 v3 v4 result
# 1 a -1.433858289 x 1.242153708 -0.985482158 -0.0240414692 1.98710285 0.51956295
# 2 a -1.433858289 y -0.019255516 0.074453615 -1.6081599298 1.18228939 -2.50389188
# 3 a -1.433858289 z -0.362785313 2.296744655 -0.0610463292 0.89797526 -2.65188998
# 4 a -0.911463819 x -1.088308527 -0.703388193 0.6308253909 0.22685013 0.06534405
# 5 a -0.911463819 y 1.284513516 1.410276163 0.5066869590 -2.07263912 2.51790289
# 6 a -0.911463819 z 0.957778345 -1.136532104 1.3959561507 -0.50021647 4.14947069
# ...
Related
I want to find out what the dominant class of of each id_b is. To calculate it I need to find out the sum of size per class for each id_b. Whichever class is largest is the new dominant class assigned to id_b.
The script below does what I want it to do but it feels quite clunky and over complicated. I've not worked with nested data much before so I'm not sure if I've used the best methods possible Can anyone think of a neater way of achieving the same output in tidyverse or data.table?
Thanks!
library(tidyverse)
# sample data
set.seed(123)
input <- tibble(id_a = c(letters[seq(1,10)]),
size = runif(10, min = 10, max = 50),
class = c("x","x","y","x","y",
"y","x","y","x","x"),
id_b = c("A1","A1","B1","B1","B1",
"C1","C1","C1","D1","E1"))
print(input)
id_a size class id_b
<chr> <dbl> <chr> <chr>
1 a 23.6 x A1
2 b 43.6 x A1
3 c 23.9 y B1
4 d 23.4 x B1
5 e 29.1 y B1
6 f 45.7 y C1
7 g 44.6 x C1
8 h 25.6 y C1
9 i 41.1 x D1
10 j 48.4 x E1
# nest input to create a nested tibble for each id_b
input_nest <- input %>% group_by(id_b) %>% nest()
# calculate dominant class
input_nest_dominant <- input_nest %>% mutate(DOMINANT_CLASS = lapply(data, function(x){
# group each nested tibble by class, and calculate total size. Then find the biggest size and extract
# the class value
output <- x %>% group_by(class) %>%
summarise(total_size = sum(size)) %>%
top_n(total_size, n = 1) %>%
pull(class)
return(output)
} ))
# unnest to end up with a tibble
input_nest_dominant_clean <- input_nest_dominant %>%
unnest(cols = c(DOMINANT_CLASS)) %>%
select(-data) %>%
ungroup()
print(input_nest_dominant_clean)
id_b DOMINANT_CLASS
<chr> <chr>
1 A1 x
2 B1 y
3 C1 y
4 D1 x
5 E1 x
From this example, you don't need nest at all, just calculate it using group_by and summarize.
input %>%
group_by(id_b, class) %>%
summarize(size = sum(size)) %>%
group_by(id_b) %>%
summarize(DOMINANT_CLASS = class[which.max(size)])
#> # A tibble: 5 x 2
#> id_b DOMINANT_CLASS
#> <chr> <chr>
#> 1 A1 x
#> 2 B1 y
#> 3 C1 y
#> 4 D1 x
#> 5 E1 x
Here is a base R solution, which used aggregate twice, i.e.,
agg <-aggregate(size ~ class + id_b, input, FUN = sum)
output <- aggregate(agg[-2],agg[2],FUN = max)[-3]
or a more compact version
output <- aggregate(.~id_b,
aggregate(size ~ class + id_b,
input,
FUN = function(v) sum(v)),
FUN = function(v) tail(sort(v),1))[-3]
such that
> output
id_b class
1 A1 x
2 B1 y
3 C1 y
4 D1 x
5 E1 x
You can do just 1 sort, and remove all duplicates. Something like:
input %>% arrange(desc(size)) %>% filter(!duplicated(id_b)) %>% arrange(id_b)
# A tibble: 5 x 4
id_a size class id_b
<chr> <dbl> <chr> <chr>
1 b 41.5 x A1
2 e 47.6 y B1
3 h 45.7 y C1
4 i 32.1 x D1
5 j 28.3 x E1
If the order of id_b is not important you can omit the last arrange
Or in base R:
input = input[order(-input$size),]
input[!duplicated(input$id_b),]
I have two data frames v1 and V2. I need to add column y from v2 to data frame v1 but want the matched value to be max. for example
v1 <- data.frame(x = c("a1","b2"))
v2 <- data.frame(x = c("a1","a1","b2","b2"), y= c(1,3,4,6))
I am using below line to populate y column in v1.
v1$y <-v2$y[match(v1$x,v2$x)]
which outputs below.
> v1
x y
1 a1 1
2 b2 4
match is taking y based on the first occurrence but I need it based on max. something like below
> v1
x y
1 a1 3
2 b2 6
As match returns first match, you can order the data such that the first match is the max match
v2 <- v2[order(v2$x, -v2$y), ]
v1$y <- v2$y[match(v1$x, v2$x)]
v1
# x y
#1 a1 3
#2 b2 6
You can first aggregate to find the max and then match it to v1.
tt <- aggregate(y ~ x, data=v2, FUN=max)
v1$y <-tt$y[match(v1$x,tt$x)]
v1
# x y
#1 a1 3
#2 b2 6
Try to aggregate first and then join (or match),
merge(v1, aggregate(y~x, v2, max), by = 'x')
or
max_v2 <- aggregate(y~x, v2, max)
max_v2$y[match(v1$x, max_v2$x)]
A possible base solution:
new_df<-merge(v1,v2, by="x")
aggregate(.~x, new_df,max)
Or with dplyr:
v1 %>%
left_join(v2, "x") %>%
group_by(x) %>%
summarise(y=max(y))
# A tibble: 2 x 2
x y
<fct> <dbl>
1 a1 3
2 b2 6
Or another base option:
aggregate(.~x,v2[v1$x %in% v2$x,],max)
x y
1 a1 3
2 b2 6
First filter v2 for for max values and then match
library(dplyr)
v1 <- data.frame(x = c("a1","b2"))
v2 <- data.frame(x = c("a1","a1","b2","b2"), y= c(1,3,4,6))
v2.sub <- v2 %>%
group_by(x) %>%
filter(y==max(y))
v1$y <-v2.sub$y[match(v1$x,v2.sub$x)]
Here is a solution with data.table
library("data.table")
v1 <- data.table(x = c("a1","b2"))
v2 <- data.table(x = c("a1","a1","b2","b2"), y= c(1,3,4,6))
v2[, .(y=max(y)), x][v1, on="x"]
# > v2[, .(y=max(y)), x][v1, on="x"]
# x y
# 1: a1 3
# 2: b2 6
I would like to combine a set of data frames into a single data frame by summing columns that have matching variables (instead of appending columns).
For example, given
df1 <- data.frame(A = c(0,0,1,1,1,2,2), B = c(1,2,1,2,3,1,5), x = c(2,3,1,5,3,7,0))
df2 <- data.frame(A = c(0,1,1,2,2,2), B = c(1,1,3,2,4,5), x = c(4,8,4,1,0,3))
df3 <- data.frame(A = c(0,1,2), B = c(5,4,2), x = c(5,3,1))
I want to match by "A" and "B" and sum the values of "x". For this example, I can get the desired result as follows:
library(plyr)
library(dplyr)
# rename columns so that join_all preserves them all:
colnames(df1)[3] <- "x1"
colnames(df2)[3] <- "x2"
colnames(df3)[3] <- "x3"
# join the data frames by matching "A" and "B" values:
res <- join_all(list(df1, df2, df3), by = c("A", "B"), type = "full")
# get the sums and drop superfluous columns:
arrange(res, A, B) %>%
rowwise() %>%
mutate(x = sum(x1, x2, x3, na.rm = TRUE)) %>%
select(A, B, x)
Result:
A B x
<dbl> <dbl> <dbl>
1 0 1 6
2 0 2 3
3 0 5 5
4 1 1 9
5 1 2 5
6 1 3 7
7 1 4 3
8 2 1 7
9 2 2 2
10 2 4 0
11 2 5 3
A more general solution is
library(dplyr)
# function to get the desired result for two data frames:
my_merge <- function(df1, df2)
{
m1 <- merge(df1, df2, by = c("A", "B"), all = TRUE)
m1 <- rowwise(res) %>%
mutate(x = sum(x.x, x.y, na.rm = TRUE)) %>%
select(A, B, x)
return(m1)
}
l1 <- list(df2, df3) # omit the first data frame
res <- df1 # initial value of the result
for(df in l1) res <- my_merge(res, df) # call the function repeatedly
Is there a more efficient option for combining a large set of data frames? Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
An easier option is to bind the rows of the datasets, then group by the columns of interest and get the summarised output by getting the sum of 'x'
library(tidyverse)
bind_rows(df1, df2, df3) %>%
group_by(A, B) %>%
summarise(x = sum(x))
# A tibble: 11 x 3
# Groups: A [?]
# A B x
# <dbl> <dbl> <dbl>
# 1 0 1 6
# 2 0 2 3
# 3 0 5 5
# 4 1 1 9
# 5 1 2 5
# 6 1 3 7
# 7 1 4 3
# 8 2 1 7
# 9 2 2 2
#10 2 4 0
#11 2 5 3
If there are many objects in the global environment with the pattern "df" followed by some digits
mget(ls(pattern= "^df\\d+")) %>%
bind_rows %>%
group_by(A, B) %>%
summarise(x = sum(x))
As the OP mentioned about memory constraints, if we do the join first and then use rowSums or + with reduce, it would be more efficient
mget(ls(pattern= "^df\\d+")) %>%
reduce(full_join, by = c("A", "B")) %>%
transmute(A, B, x = rowSums(.[3:5], na.rm = TRUE)) %>%
arrange(A, B)
# A B x
#1 0 1 6
#2 0 2 3
#3 0 5 5
#4 1 1 9
#5 1 2 5
#6 1 3 7
#7 1 4 3
#8 2 1 7
#9 2 2 2
#10 2 4 0
#11 2 5 3
This could also be done with data.table
library(data.table)
rbindlist(mget(ls(pattern= "^df\\d+")))[, .(x = sum(x)), by = .(A, B)]
Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
If you're memory constrained and willing to sacrifice speed (vs #akrun's data.table approach), use one table at a time in a loop:
library(data.table)
tabs = c("df1", "df2", "df3")
# enumerate all combos for the results table
# initializing sum to 0
res = CJ(A = 0:2, B = 1:5, x = 0)
# loop over tabs, adding on
for (i in seq_along(tabs)){
tab = get(tabs[[i]])
res[tab, on=.(A, B), x := x + i.x][]
rm(tab)
}
If you need to read tables from disk, change tabs to file names and get to fread or whatever function.
I am skeptical that you can fit all the tables in memory, but cannot also fit an rbind-ed copy of them together.
Similarly (thanks to #akrun's comment), use his approach pairwise:
res = data.table(get(tabs[[1]]))[0L]
for (i in seq_along(tabs)){
tab = get(tabs[[i]])
res = rbind(res, tab)[, .(x = sum(x)), by=.(A,B)]
rm(tab)
}
Consider the following two data.frames:
a1 <- data.frame(A = c(1:5, 2, 4, 2), B = letters[c(1:5, 2, 4, 2)])
a2 <- data.frame(A = c(1:3,2), B = letters[c(1:3,2)])
I would like to remove the exact rows of a1 that are in a2 so that the result should be:
A B
4 d
5 e
4 d
2 b
Note that one row with 2 b in a1 is retained in the final result. Currently, I use a looping statement, which becomes extremely slow as I have many variables and thousands of rows in my data.frames. Is there any built-in function to get this result?
The idea is, add a counter for duplicates to each file, so you can get a unique match for each occurrence of a row. Data table is nice because it is easy to count the duplicates (with .N), and it also gives the necessary function (fsetdiff) for set operations.
library(data.table)
a1 <- data.table(A = c(1:5, 2, 4, 2), B = letters[c(1:5, 2, 4, 2)])
a2 <- data.table(A = c(1:3,2), B = letters[c(1:3,2)])
# add counter for duplicates
a1[, i := 1:.N, .(A,B)]
a2[, i := 1:.N, .(A,B)]
# setdiff gets the exception
# "all = T" allows duplicate rows to be returned
fsetdiff(a1, a2, all = T)
# A B i
# 1: 4 d 1
# 2: 5 e 1
# 3: 4 d 2
# 4: 2 b 3
You could use dplyr to do this. I set stringsAsFactors = FALSE to get rid of warnings about factor mismatches.
library(dplyr)
a1 <- data.frame(A = c(1:5, 2, 4, 2), B = letters[c(1:5, 2, 4, 2)], stringsAsFactors = FALSE)
a2 <- data.frame(A = c(1:3,2), B = letters[c(1:3,2)], stringsAsFactors = FALSE)
## Make temp variables to join on then delete later.
# Create a row number
a1_tmp <-
a1 %>%
group_by(A, B) %>%
mutate(tmp_id = row_number()) %>%
ungroup()
# Create a count
a2_tmp <-
a2 %>%
group_by(A, B) %>%
summarise(count = n()) %>%
ungroup()
## Keep all that have no entry int a2 or the id > the count (i.e. used up a2 entries).
left_join(a1_tmp, a2_tmp, by = c('A', 'B')) %>%
ungroup() %>% filter(is.na(count) | tmp_id > count) %>%
select(-tmp_id, -count)
## # A tibble: 4 x 2
## A B
## <dbl> <chr>
## 1 4 d
## 2 5 e
## 3 4 d
## 4 2 b
EDIT
Here is a similar solution that is a little shorter. This does the following: (1) add a column for row number to join both data.frame items (2) a temporary column in a2 (2nd data.frame) that will show up as null in the join to a1 (i.e. indicates it's unique to a1).
library(dplyr)
left_join(a1 %>% group_by(A,B) %>% mutate(rn = row_number()) %>% ungroup(),
a2 %>% group_by(A,B) %>% mutate(rn = row_number(), tmpcol = 0) %>% ungroup(),
by = c('A', 'B', 'rn')) %>%
filter(is.na(tmpcol)) %>%
select(-tmpcol, -rn)
## # A tibble: 4 x 2
## A B
## <dbl> <chr>
## 1 4 d
## 2 5 e
## 3 4 d
## 4 2 b
I think this solution is a little simpler (perhaps very little) than the first.
I guess this is similar to DWal's solution but in base R
a1_temp = Reduce(paste, a1)
a1_temp = paste(a1_temp, ave(seq_along(a1_temp), a1_temp, FUN = seq_along))
a2_temp = Reduce(paste, a2)
a2_temp = paste(a2_temp, ave(seq_along(a2_temp), a2_temp, FUN = seq_along))
a1[!a1_temp %in% a2_temp,]
# A B
#4 4 d
#5 5 e
#7 4 d
#8 2 b
Here's another solution with dplyr:
library(dplyr)
a1 %>%
arrange(A) %>%
group_by(A) %>%
filter(!(paste0(1:n(), A, B) %in% with(arrange(a2, A), paste0(1:n(), A, B))))
Result:
# A tibble: 4 x 2
# Groups: A [3]
A B
<dbl> <fctr>
1 2 b
2 4 d
3 4 d
4 5 e
This way of filtering avoids creating extra unwanted columns that you have to later remove in the final output. This method also sorts the output. Not sure if it's what you want.
I've got two data frames in which the unique identifiers common to both frames differ in the number of observations. I would like to create a dataframe from both in which the observations from each frame are taken if they have more observations for a common identifier. For example:
f1 <- data.frame(x = c("a", "a", "b", "c", "c", "c"), y = c(1,1,2,3,3,3))
f2 <- data.frame(x = c("a","b", "b", "c", "c"), y = c(4,5,5,6,6))
I would like this to generate a merge based on the longer x such that it produces:
x y
a 1
a 1
b 5
b 5
c 3
c 3
c 3
Any and all thoughts would be great.
Here's a solution using split
dd<-rbind(cbind(f1, s="f1"), cbind(f2, s="f2"))
keep<-unsplit(lapply(split(dd$s, dd$x), FUN=function(x) {
y<-table(x)
x == names(y[which.max(y)])
}), dd$x)
dd <- dd[keep,]
Normally i'd prefer to use the ave function here but because i'm changing data.types from a factor to a logical, it wasn't as appropriate so I basically copied the idea that ave uses and used split.
dplyr solution
library(dplyr)
First we combine the data:
with rbind() and introduce a new variable called ref to know where each observation came from:
both <- rbind( f1, f2 )
both$ref <- rep( c( "f1", "f2" ) , c( nrow(f1), nrow(f2) ) )
then count the observations:
make another new variable that contains how many observations for each ref and x combination:
both_with_counts <- both %>%
group_by( ref ,x ) %>%
mutate( counts = n() )
then filter for the largest count:
both_with_counts %>% group_by( x ) %>% filter( n==max(n) )
note: you could also select only the x and y cols with select(x,y)...
this gives:
## Source: local data frame [7 x 4]
## Groups: x
##
## x y ref counts
## 1 a 1 f1 2
## 2 a 1 f1 2
## 3 c 3 f1 3
## 4 c 3 f1 3
## 5 c 3 f1 3
## 6 b 5 f2 2
## 7 b 5 f2 2
Altogether now...
what_I_want <-
rbind(cbind(f1,ref = "f1"),cbind(f2,ref = "f2")) %>%
group_by(ref,x) %>%
mutate(counts = n()) %>%
group_by( x ) %>%
filter( counts==max(counts) ) %>%
select( x, y )
and thus:
> what_I_want
# Source: local data frame [7 x 2]
# Groups: x
#
# x y
# 1 a 1
# 2 a 1
# 3 c 3
# 4 c 3
# 5 c 3
# 6 b 5
# 7 b 5
Not a elegant answer but still give the desired result. Hope this help.
f1table <- data.frame(table(f1$x))
colnames(f1table) <- c("x","freq")
f1new <- merge(f1,f1table)
f2table <- data.frame(table(f2$x))
colnames(f2table) <- c("x","freq")
f2new <- merge(f2,f2table)
table <- rbind(f1table, f2table)
table <- table[with(table, order(x,-freq)), ]
table <- table[!duplicated(table$x), ]
data <-rbind(f1new, f2new)
merge(data, table, by=c("x","freq"))[,c(1,3)]
x y
1 a 1
2 a 1
3 b 5
4 b 5
5 c 3
6 c 3
7 c 3