I have 3 data frames with three variables each and the name of the player
# Three example tables keyed by (player, year); each one contributes three
# measurement columns. df_2 spells a few players differently from df_1/df_3.
# The random draws happen in the order a, b, c, d, e, f, g, h, i.

# df_1: measurements a, b, c
a <- rnorm(n = 16, mean = 3, sd = 2)
b <- rnorm(n = 16, mean = 1, sd = 3)
c <- rpois(n = 16, lambda = 3)
year <- rep(c(2015, 2016, 2017), times = c(5, 5, 6))
player <- c(
  "Alex", "CT", "Bill", "Brian", "Collin", "Chez", "Adam", "Danny III",
  "Lee", "Chris", "Erik", "Axel", "Louis", "Justin", "Dustin", "Johnson"
)
df_1 <- data.frame(player, year, a, b, c)

# df_2: measurements d, e, f (note "Alexander", "C.T." and "Danny IV")
d <- rnorm(n = 16, mean = 3, sd = 2)
e <- rnorm(n = 16, mean = 1, sd = 3)
f <- rpois(n = 16, lambda = 3)
year <- rep(c(2015, 2016, 2017), times = c(5, 5, 6))
player <- c(
  "Alexander", "C.T.", "Bill", "Brian", "Collin", "Chez", "Adam", "Danny IV",
  "Lee", "Chris", "Erik", "Axel", "Louis", "Justin", "Dustin", "Johnson"
)
df_2 <- data.frame(player, year, d, e, f)

# df_3: measurements g, h, i (same spellings as df_1)
g <- rnorm(n = 16, mean = 3, sd = 2)
h <- rnorm(n = 16, mean = 1, sd = 3)
i <- rpois(n = 16, lambda = 3)
year <- rep(c(2015, 2016, 2017), times = c(5, 5, 6))
player <- c(
  "Alex", "CT", "Bill", "Brian", "Collin", "Chez", "Adam", "Danny III",
  "Lee", "Chris", "Erik", "Axel", "Louis", "Justin", "Dustin", "Johnson"
)
df_3 <- data.frame(player, year, g, h, i)
This data frame contains the name of the player corresponding to each data set of variables.
For example, Alex is the same as Alexander in variables from d to f, and it is the same as Alex in observations from g to i. Danny III is named Danny IV in variables from d to f and it is named Danny III in variables from g to i.
# Spelling dictionary: one row per player, one column per group of variables
# (a_to_c, d_to_f, g_to_i).
a_to_c <- c("Alex", "CT", "Danny III")
d_to_f <- c("Alexander", "C.T.", "Danny IV")
g_to_i <- c("Alex", "CT", "Danny III")
names_palyer <- data.frame(
  a_to_c = a_to_c,
  d_to_f = d_to_f,
  g_to_i = g_to_i
)
I want to merge the three data frames by year and player into a single data frame. I need to use the information from "names_palyer" to correctly match each player with the data.
I made this example for simplicity; in reality, I have thousands of observations, so I need a way to automatically match the players' names so that I end up with a single data frame containing the information from all three data frames.
Initialize the output ('out') as the first data frame ('df_1'). Loop over the column indices of 'names_palyer' (excluding the last column) and get the value of the corresponding 'df_' object (index i + 1, assuming the objects are named df_1, df_2, etc.). Then select the relevant pair of columns of 'names_palyer' ('keydat') and use match to get the index of the values that match the 'player' column of the 'tmp' data. Replace 'player' with the first-column values of 'keydat' based on that index, then do the merge (left join - all.x = TRUE). At the end, change the 'player' values in the output that match the first column of 'keydat' to the second-column values of 'keydat' (so that they are ready for the next iteration).
# Left-join df_2, df_3, ... onto df_1 by (player, year). Consecutive column
# pairs of `names_palyer` act as a spelling dictionary: column k holds the
# spelling currently used in `out`, column k + 1 the spelling used in df_<k+1>.
out <- df_1
for (k in seq_len(ncol(names_palyer) - 1)) {
  # Plain assignment is enough: modifying `tmp` never alters the original.
  tmp <- get(paste0("df_", k + 1))

  # Keep only dictionary rows whose df_<k+1> spelling actually occurs in `tmp`.
  keydat <- names_palyer[c(k, k + 1)]
  keydat <- keydat[keydat[[2]] %in% tmp$player, , drop = FALSE]
  spelling_out <- as.character(keydat[[1]])
  spelling_tmp <- as.character(keydat[[2]])

  # Recode EVERY matching row, not just the first hit, so a player who
  # appears in several years is renamed in all of them.
  hit <- match(tmp$player, spelling_tmp)
  tmp$player[!is.na(hit)] <- spelling_out[hit[!is.na(hit)]]

  out <- merge(out, tmp, by = c("player", "year"), all.x = TRUE)

  # Switch `out` to the spelling of the data frame just merged, which is the
  # "current" spelling expected by the next pass.
  hit <- match(out$player, spelling_out)
  out$player[!is.na(hit)] <- spelling_tmp[hit[!is.na(hit)]]
}
-output
out
player year a b c d e f g h i
1 Adam 2016 0.03587367 -0.57907496 3 5.1149009 2.47064240 2 2.3325348 0.62526907 6
2 Alex 2015 1.27778013 0.05809471 0 4.1932959 4.37934704 0 4.3226737 -0.33523019 5
3 Axel 2017 2.56466723 0.43108713 2 5.9970138 -2.19947169 4 0.9717511 2.05843957 3
4 Bill 2015 2.05594607 3.96167974 3 2.5232810 3.87191286 3 3.1726895 3.43683108 0
5 Brian 2015 3.44690732 0.35032810 4 4.7287671 0.08108714 2 2.8519495 -0.08249603 2
6 CT 2015 5.85679299 -1.57623304 2 3.9653678 1.68389034 3 3.0328709 1.04315644 2
7 Chez 2016 0.73604605 -2.58101736 1 4.0642894 0.04941299 3 5.4688474 -1.82831432 3
8 Chris 2016 0.95621081 2.05206411 4 2.7249987 2.42911270 8 -0.4515070 -2.12097504 0
9 Collin 2015 7.14194691 0.74030236 5 4.7879545 5.41397214 4 1.4835656 0.92897125 2
10 Danny III 2016 4.59832890 0.60355092 5 4.4822495 4.15865653 0 2.4950848 3.31059942 3
11 Dustin 2017 0.26640646 -0.23381080 4 5.3164916 3.67001803 1 0.7011976 2.59135173 4
12 Erik 2017 0.27363760 -4.50110125 3 4.9495033 3.31417537 3 4.1907692 5.57914934 6
13 Johnson 2017 7.12013083 2.52775367 3 1.9192381 4.33916287 2 3.3836699 -2.37444447 3
14 Justin 2017 3.41710305 -3.82843506 4 5.5590782 0.56030426 1 0.1670448 5.99934712 6
15 Lee 2016 -1.02002976 -3.24576311 4 0.9538381 -0.91783716 5 2.5668076 -0.67247680 2
16 Louis 2017 1.94420093 0.47369179 3 2.8249960 -1.28630731 7 3.0070664 1.25132019 5
With the OP's new data
# Left-join df_2, df_3, ... onto df_1 by (player, year), translating player
# spellings through consecutive column pairs of `names_palyer`.
# No copy() is needed for a data.frame; the unqualified call also failed
# unless data.table happened to be attached.
out <- df_1
for (k in seq_len(ncol(names_palyer) - 1)) {
  tmp <- get(paste0("df_", k + 1))

  # Column 1 = spelling used in `out`, column 2 = spelling used in `tmp`;
  # keep only the dictionary rows that occur in `tmp`.
  keydat <- names_palyer[c(k, k + 1)]
  keydat <- keydat[keydat[[2]] %in% tmp$player, , drop = FALSE]
  spelling_out <- as.character(keydat[[1]])
  spelling_tmp <- as.character(keydat[[2]])

  # Recode every matching row (a player may occur once per year).
  hit <- match(tmp$player, spelling_tmp)
  tmp$player[!is.na(hit)] <- spelling_out[hit[!is.na(hit)]]

  out <- merge(out, tmp, by = c("player", "year"), all.x = TRUE)

  # Move the matched players in `out` to the spelling of the data frame just
  # merged, ready for the next pass.
  hit <- match(out$player, spelling_out)
  out$player[!is.na(hit)] <- spelling_tmp[hit[!is.na(hit)]]
}
library(dplyr)
library(purrr)
# Group the value columns (everything after player/year) by their name up to
# the first ".", coalesce each group into one column, and put the two key
# columns back in front.
bind_cols(
  out[1:2],
  map_dfc(
    split.default(out[-(1:2)], sub("\\..*", "", names(out)[-(1:2)])),
    reduce,
    coalesce
  )
)
Related
I have three data frames. The first data frame, df_1, is the baseline; the other data frames contain information that I want to add to the matching observations of df_1. The problem is that the observations don't have the same names, so I have a fourth data frame with the corresponding name for each set of variables. I want to recognize the values that have matching observations in the other data frames, to get a single data frame with all the observations.
set.seed(123)

# Baseline table: six players, measurements a, b, c.
a <- rnorm(n = 6, mean = 3, sd = 2)
b <- rnorm(n = 6, mean = 1, sd = 3)
c <- rpois(n = 6, lambda = 3)
year <- rep(2014, times = 6)
player <- c(
  "Aaron Badaley", "Andrew Loupe", "Ben Crane",
  "Ben Curtis", "Ben Martin", "Brendon de Jonge"
)
df_1 <- data.frame(player, year, a, b, c)

# Second table: three players, measurements d, e, f.
d <- rnorm(n = 3, mean = 3, sd = 2)
e <- rnorm(n = 3, mean = 1, sd = 3)
f <- rpois(n = 3, lambda = 3)
year <- rep(2014, times = 3)
player <- c("Andrew Loupe IV", "Ben Crane", "Brendon de Jonge")
df_2 <- data.frame(player, year, d, e, f)

# Third table: four players, measurements g, h, i.
g <- rnorm(n = 4, mean = 3, sd = 2)
h <- rnorm(n = 4, mean = 1, sd = 3)
i <- rpois(n = 4, lambda = 3)
year <- rep(2014, times = 4)
player <- c("Aron Badelay", "Ben Crane 3", "Brendon de Jonge", "Ben Curt")
df_3 <- data.frame(player, year, g, h, i)

# Name dictionary: one column of spellings per group of variables.
a_to_c <- c(
  "Aaron Badaley", "CT", "Andrew Loupe", "Ben Crane",
  "Brendon de Jonge", "Ben Curtis", "Ben Martin"
)
d_to_f <- c(
  "Aron Badaley", "C.T.", "Andrew Loupe", "Ben Crane",
  "Brendon de Jonge", "Ben Curt", "Ben"
)
g_to_i <- c(
  "Aron Badelay", "CT", "Andrew Loupe", "Ben Crane 3",
  "Brendon de Jonge", "Ben Curt", "Ben Martin"
)
names_palyer <- data.frame(a_to_c, d_to_f, g_to_i)
These are the three data frames. They are of different lengths. There are names, years, and variables. For this example, there is only one year, but in reality there are more years and I have thousands of observations. The main problem is that not all observations contained in df_1 have a matching observation in df_2 and/or df_3, but all observations in df_2 and df_3 should be inside df_1. The matching data frame has more observations than any of the data frames, since it acts like a dictionary for names.
This is what I tried to do:
# Start from a copy of the baseline table; each pass left-joins the next df_<i+1>.
out <- data.table::copy(df_1)
for(i in 1:(ncol(names_palyer)-1)) {
# Fetch df_2, df_3, ... by name.
tmp <- data.table::copy(get(paste0('df_', i + 1)))
# Pair dictionary column i (the spelling expected in `out`) with column i + 1
# (the spelling used in `tmp`).
keydat <- names_palyer[c(i, i + 1)]
# Keep only the dictionary rows whose column-(i + 1) spelling occurs in `tmp`.
keydat <- keydat[keydat[[2]] %in% tmp$player,, drop = FALSE]
# Rewrite those players in `tmp` to the column-i spelling.
i1 <- match(keydat[[2]], tmp$player, nomatch = 0)
tmp$player[i1] <- keydat[[1]]
print(tmp)
# Left join: rows of `out` without a partner in `tmp` get NA in the new columns.
out <- merge(out, tmp, by = c('player', 'year'), all.x = TRUE)
# Rename the matched players in `out` to the column-(i + 1) spelling for the
# next pass.
# NOTE(review): only players that survived the filter above are renamed, so a
# player that is absent from df_2 keeps the df_1 spelling; on the next pass
# df_3 is rewritten to the column-2 spelling and no longer matches that player.
i2 <- match(keydat[[1]], out$player, nomatch = 0)
out$player[i2] <- keydat[[2]][keydat[[1]] %in% out$player]
}
library(dplyr)
library(purrr)
# Group the value columns by their name up to the first ".", coalesce each
# group into a single column, then put the player/year columns back in front.
split.default(out[-(1:2)], sub("\\..*", "", names(out)[-(1:2)])) %>%
map_dfc(reduce, coalesce) %>%
bind_cols( out[1:2], .)
This output doesn't work because it does not recognize the observations that are present in df_1 and df_3 but not in df_2.
This is the output I need:
head(out,1)
player year a b c d e f g h i
1 Aaron Badaley 2014 1.879049 2.382749 4 NA NA NA 0.829 -2.4966 1
If we always translate the names to the first column of 'names_palyer' (instead of chaining from one column to the next), it seems to work
# Left-join df_2, df_3, ... onto df_1 by (player, year). Before each join the
# player names of df_<k+1> are translated to the spelling in the FIRST column
# of `names_palyer`, which is the spelling `out` keeps throughout.
# No copy() is needed for data.frames; the unqualified copy() call also failed
# unless data.table happened to be attached.
out <- df_1
for (k in seq_len(ncol(names_palyer) - 1)) {
  tmp <- get(paste0("df_", k + 1))

  # Dictionary rows whose spelling for this data frame occurs in `tmp`.
  keydat <- names_palyer[names_palyer[[k + 1]] %in% tmp$player, , drop = FALSE]

  # Recode every matching row (a player may occur once per year), not only
  # the first occurrence.
  hit <- match(tmp$player, as.character(keydat[[k + 1]]))
  tmp$player[!is.na(hit)] <- as.character(keydat[[1]])[hit[!is.na(hit)]]

  out <- merge(out, tmp, by = c("player", "year"), all.x = TRUE)
  # `out` stays in first-column spelling, so nothing is renamed back here.
}
-checking
out[1, ]
# player year a b c d e f g h i
#1 Aaron Badaley 2014 1.879049 2.382749 4 NA NA NA 0.8286017 -2.496635 1
Or maybe we could bind the rows together, change the 'player' names, and then, grouped by 'player', replace the column values with the first non-NA value
# Stack the three tables, map the second- and third-column spellings of
# `names_palyer` back to the first-column spelling, and within each
# (player, year) group fill every value column with its first non-NA entry.
bind_rows(df_1, df_2, df_3) %>%
  mutate(
    player = recode(
      player,
      !!!setNames(names_palyer[[1]], names_palyer[[2]])
    )
  ) %>%
  group_by(player, year) %>%
  mutate(across(a:f, ~ .x[!is.na(.x)][1])) %>%
  mutate(
    player = recode(
      player,
      !!!setNames(names_palyer[[1]], names_palyer[[3]])
    )
  ) %>%
  group_by(player, year) %>%
  mutate(across(a:i, ~ .x[!is.na(.x)][1])) %>%
  ungroup() %>%
  distinct()
# A tibble: 7 x 11
# player year a b c d e f g h i
# <chr> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int>
#1 Aaron Badaley 2014 1.88 2.38 4 NA NA NA 0.829 -2.50 1
#2 Andrew Loupe 2014 2.54 -2.80 4 NA NA NA NA NA NA
#3 Ben Crane 2014 6.12 -1.06 3 4.00 -0.418 2 2.83 -1.46 2
#4 Ben Curtis 2014 3.14 -0.337 3 NA NA NA 2.71 0.0398 4
#5 Ben Martin 2014 3.26 4.67 2 NA NA NA NA NA NA
#6 Brendon de Jonge 2014 6.43 2.08 1 -0.933 -2.20 1 5.14 3.05 2
#7 Andrew Loupe IV 2014 NA NA NA 6.57 3.10 2 NA NA NA
I have a cbind of 2 data.frames called DATA. Using BASE R, I was wondering how I could extract and then, cbind similarly named variables in DATA and store them as a list?
For the example below, I want all variable AAs, and separately all variable BBs in DATA be separately cbinded and stored as a list?
Note: names could be anything, and the number of variables could be any number. A function(al) solution is highly appreciated.
Note: suppose we have NO ACCESS to r, the only input is DATA.
# Four data.frames with identical column names (Name, X, Y, Z, out).
r <- list(
  data.frame(
    Name = rep("Jacob", times = 6),
    X = c(2, 2, 1, 1, NA, NA),
    Y = c(1, 1, 1, 2, 1, NA),
    Z = rep(3, times = 6),
    out = rep(1, times = 6)
  ),
  data.frame(
    Name = rep("Jon", times = 6),
    X = c(1, NA, 3, 1, NA, NA),
    Y = c(1, 1, 1, 2, NA, NA),
    Z = rep(2, times = 6),
    out = rep(1, times = 6)
  ),
  data.frame(
    Name = rep("Jon", times = 6),
    X = c(1, NA, 3, 1, NA, NA),
    Y = c(1, 1, 1, 2, 2, NA),
    Z = rep(2, times = 6),
    out = rep(2, times = 6)
  ),
  data.frame(
    Name = rep("Jim", times = 6),
    X = c(1, NA, 3, 1, NA, NA),
    Y = c(1, 1, 1, 2, 2, NA),
    Z = rep(2, times = 6),
    out = rep(1, times = 6)
  )
)
# DATA: column-wise bind of every data.frame in `r`.
DATA <- do.call(cbind, r)
Here is an option with split. I wouldn't recommend having duplicate column names in the dataset. But if it is really needed, after the split, change the column names by removing the trailing "." followed by one or more digits with sub
# Column names shared by every data.frame in `r`.
nm1 <- Reduce(intersect, lapply(r, names))
# Split the shared columns of DATA into one data.frame per column name.
shared <- names(DATA) %in% nm1
lst1 <- split.default(DATA[shared], names(DATA)[shared])
# Strip the ".<digits>" suffix from the column names of each piece.
lapply(lst1, function(block) {
  names(block) <- sub("\\.\\d+$", "", names(block))
  block
})
Or, if we need to use only 'DATA' and not 'r' to find the common column names: it is more difficult, but we can tabulate how often each column name occurs and select the names with the highest frequency
# Count how often each column name occurs and keep the most frequent ones.
tbl <- table(names(DATA))
nm1 <- names(tbl)[tbl == max(tbl)]
Use that in the split.default as before
# Split the selected columns of DATA into one data.frame per column name.
shared <- names(DATA) %in% nm1
lst1 <- split.default(DATA[shared], names(DATA)[shared])
# Strip the ".<digits>" suffix from the column names of each piece.
lapply(lst1, function(block) {
  names(block) <- sub("\\.\\d+$", "", names(block))
  block
})
Using OP's new example
# Three data.frames; AA and BB occur in all of them, CC and DD only once.
r <- list(
  data.frame(AA = c(2, 2, 1, 1, 3, 2), BB = c(1, 1, 1, 2, 2, NA), CC = 1:6),
  data.frame(AA = c(1, NA, 3, 1, 3, 2), BB = c(1, 1, 1, 2, 2, 2)),
  data.frame(AA = c(1, NA, 3, 1, 3, 2), BB = c(1, 1, 1, 2, 2, 2), DD = 0:5)
)
DATA <- do.call(cbind, r)
# Keep the column names that occur most often in DATA.
tbl <- table(names(DATA))
nm1 <- names(tbl)[tbl == max(tbl)]
# One data.frame per kept column name.
shared <- names(DATA) %in% nm1
lst1 <- split.default(DATA[shared], names(DATA)[shared])
# Strip the ".<digits>" suffix from the column names of each piece.
lapply(lst1, function(block) {
  names(block) <- sub("\\.\\d+$", "", names(block))
  block
})
#$AA
# AA AA AA
#1 2 1 1
#2 2 NA NA
#3 1 3 3
#4 1 1 1
#5 3 3 3
#6 2 2 2
#$BB
# BB BB BB
#1 1 1 1
#2 1 1 1
#3 1 1 1
#4 2 2 2
#5 2 2 2
#6 NA 2 2
I apologize if this is a duplicated question. I tried to find my question but I may not be using the right terminology. Feel free to change the title of this post if there is a better way to ask this question.
I have two dataframes
# df: locations with the scores that should be kept.
df <- data.frame(
  Location = c(
    "chr1:123", "chr6:2452", "chr8:4352", "chr11:8754",
    "chr3:76345", "chr7:23454", "chr18:23452"
  ),
  Score = c(
    "tolered(1)", "tolerated(2)", "", "",
    "deleterious(0.1)", "", "deleterious(0.2)"
  )
)
# df2: partly overlapping locations, all scores still empty.
df2 <- data.frame(
  Location = c(
    "chr7:23454", "chr9:243256", "chr8:4352", "chr2:6795452",
    "chr11:8754", "chr18:23452", "chr3:76345"
  ),
  Score = rep("", times = 7)
)
df has locations and values in the "score" column that I want to keep.
df2 has the data from df plus some new data.
I want the scores from df for any values that are in df2 and make a
new dataframe called df3.
Desired result:
# Desired result: df2's locations, with scores filled in from df.
df3 <- data.frame(
  Location = c(
    "chr7:23454", "chr9:243256", "chr8:4352", "chr2:6795452",
    "chr11:8754", "chr18:23452", "chr3:76345"
  ),
  Score = c(rep("", times = 5), "deleterious(0.2)", "deleterious(0.1)")
)
I am just not sure what the best/fastest method to do this. I am not quite sure where to begin. I feel like you can do this with dplyr but I have never done this before
Using a left_join() from dplyr:
library(dplyr)
# Drop df2's empty Score column, then pull the scores in from df by Location.
df3 <- left_join(
  dplyr::select(df2, -Score),
  df,
  by = "Location"
)
I was able to sort of force this.
I started with this
# Rows of df2 whose Location is not in df, followed by all rows of df.
df3 <- rbind(anti_join(df2, df, by = "Location"), df)
but that gave me some extra data that I didn't want/need so I filtered back with df2
# Keep only the locations that occur in df2.
df3 <- filter(df3, Location %in% df2$Location)
This isn't the prettiest method so if anyone else has a cleaner method, please feel free to answer!
df
Location Score
1 A 1
2 B 2
3 C NA
4 D NA
5 E 5
6 F NA
7 G 7
df2
Location Score
1 E NA
2 F NA
3 G NA
4 H NA
5 I NA
6 J NA
7 K 11
df3
Location Score
1 H NA
2 I NA
3 J NA
4 K 11
5 E 5
6 F NA
7 G 7
Code
library(dplyr)
# Rows of df2 with no Location match in df, followed by the rows of df whose
# Location also occurs in df2.
df3 <- bind_rows(
  anti_join(df2, df, by = "Location"),
  inner_join(df, select(df2, 1), by = "Location")
)
Data
# df: locations A-G with some missing scores.
df <- data.frame(
  Location = LETTERS[seq(1, 7)],
  Score = c(1, 2, NA, NA, 5, NA, 7),
  stringsAsFactors = FALSE
)
# df2: locations E-K; only the last one has a score.
df2 <- data.frame(
  Location = LETTERS[seq(5, 11)],
  Score = replace(rep(NA_real_, 7), 7, 11),
  stringsAsFactors = FALSE
)
I have data like this:
Name Rating
Tom 3
Tom 4
Tom 2
Johnson 5
Johnson 7
But I'd like it so each unique name is instead a column, with the ratings below, in each row. How can I approach this?
Here is a good way of doing it
# Spread the ratings into one column per name; shorter columns are padded
# with "" at the bottom.
x <- data.frame(
  Name = c("Tom", "Tom", "Tom", "Johnson", "Johnson"),
  Rating = c(3, 4, 2, 5, 7)
)
n <- unique(x$Name)      # one output column per distinct name
m <- max(table(x$Name))  # number of rows = largest group size

# The result keeps its original name `c`, which shadows base::c(); the
# combine call below is therefore written as base::c() to stay unambiguous.
c <- data.frame(matrix(NA, nrow = m, ncol = length(n)))
for (i in seq_along(n)) {
  l <- x$Rating[x$Name == n[i]]
  # NOTE: padding with "" turns the whole column into character; pad with NA
  # instead if the ratings must stay numeric.
  l2 <- rep("", m - length(l))
  c[[i]] <- base::c(l, l2)
}
colnames(c) <- n
Results:
Tom Johnson
1 3 5
2 4 7
3 2
Here is a way using the CRAN package reshape2.
library(reshape2)
# Cast to one column per Name (one row per Rating), then drop the leading
# Rating column.
d <- dcast(mydata, Rating ~ Name, value.var = "Rating")
d <- d[-1]
d
# Johnson Tom
#1 NA 2
#2 NA 3
#3 NA 4
#4 5 NA
#5 7 NA
As you can see, there are too many NA values in this result. One way of getting rid of them could be:
# Drop the NAs from every column, then pad each column with NA at the end
# up to the length of the longest one and bind them back together.
d <- lapply(d, function(col) col[!is.na(col)])
n <- max(lengths(d))
d <- do.call(
  cbind.data.frame,
  lapply(d, function(col) {
    length(col) <- n
    col
  })
)
d
# Johnson Tom
#1 5 2
#2 7 3
#3 NA 4
Well, this does the job but introduces some NAs.
Edit: Replace the NAs with some other Rating.
# Example input: five ratings for two names.
mydata<-data.frame(Name=c("Tom","Tom","Tom","Johnson","Johnson"),Rating=c(3,4,2,5,7))
library(reshape2)
library(tidyverse)
# Reshape to one numeric column per name.
mydata1<-mydata %>%
mutate(Name=as.factor(Name)) %>%
# NOTE(review): `id.var` presumably relies on partial matching of melt()'s
# `id.vars` argument - confirm against the reshape2 documentation.
melt(id.var="Name") %>%
# NOTE(review): no value.var is given, so dcast() has to pick the value
# column itself - confirm it picks `value`.
dcast(variable+value~Name) %>%
select(-value) %>%
rename(Name=variable) %>%
# Keep only the numeric columns.
select_if(is.numeric)
# Turn both columns into factors and give missing values the explicit level
# "No Rating".
mydata1 %>%
mutate(Johnson=as.factor(Johnson),Tom=as.factor(Tom)) %>%
mutate(Johnson=fct_explicit_na(Johnson,na_level = "No Rating"),
Tom=fct_explicit_na(Tom,na_level = "No Rating"))
Johnson Tom
1 No Rating 2
2 No Rating 3
3 No Rating 4
4 5 No Rating
5 7 No Rating
This is a followup question of this question.
Imagine the following data frame:
# Eight rows; several (a, b) combinations occur more than once.
a <- rep(c("A", "B", "A"), times = c(3, 3, 2))
b <- c(1, 1, 2, 4, 1, 1, 2, 2)
df <- data.frame(a, b)
which gives
a b
1 A 1
2 A 1
3 A 2
4 B 4
5 B 1
6 B 1
7 A 2
8 A 2
I reduce it to its unique rows by:
# Keep the first occurrence of every distinct row.
df_unique <- df[!duplicated(df), , drop = FALSE]
Now, I am wondering how can I keep track of the merged rows. I would like to create a new column in which each component has a list of row names that have been merged. Something like the following:
df_unique_informative =
a b track
1 A 1 [1,2]
3 A 2 [3,7,8]
4 B 4 [4]
5 B 1 [5,6]
# For every (a, b) combination, collect the row numbers that share it.
res <- aggregate(
  x = list(track = seq_len(NROW(df))),
  by = list(a = df$a, b = df$b),
  FUN = identity
)
# OR perhaps you want
#res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x)
# paste(x, collapse = ", "))
res
# a b track
#1 A 1 1, 2
#2 B 1 5, 6
#3 A 2 3, 7, 8
#4 B 4 4
#Shorter code
res <- aggregate(list(track = seq_len(NROW(df))), by = df[, 1:2], FUN = "[")
Update
# Same grouping as before, but track the values of column `c` instead of the
# row numbers.
a <- rep(c("A", "B", "A"), times = c(3, 3, 2))
b <- c(1, 1, 2, 4, 1, 1, 2, 2)
c <- letters[1:8]
df <- data.frame(a, b, c, stringsAsFactors = FALSE)
res <- aggregate(
  x = list(track = seq_len(NROW(df))),
  by = list(a = df$a, b = df$b),
  FUN = function(rows) df$c[rows]
)
res
# a b track
#1 A 1 a, b
#2 B 1 e, f
#3 A 2 c, g, h
#4 B 4 d
Here is one option with tidyverse
library(tidyverse)
# Move the row names into column `rn`, then collect them per (a, b) group.
summarise(
  group_by(rownames_to_column(df, "rn"), a, b),
  track = list(rn)
)