Return row(i) if two columns match - r

I have two datasets:
df1
ID paddock cow ID
90/123 10 09/123
90/124 11 09/124
90/125 11 09/124
df2
ID paddock
09/123 20
09/124 21
I would like to match df1$cowID with df2$ID and return df2$paddock for whatever row matches. My current code is as follows:
dt <- ifelse(df1$cowID %in% df2$ID, df2$paddock[i], NA)
But I'm getting a return error. Could someone direct me in the right direction please? Thanks in advance!

You might consider joining the datasets.
# Left join: keep every row of df1 and attach the df2 columns whose ID equals
# df1's cow ID. A *named* `by` vector maps the key column in df1 (the name)
# to the key column in df2 (the value).
# NOTE: adjust 'cow ID' to the actual column name in your df1 (e.g. 'cow_ID').
dplyr::left_join(df1, df2, by = c('cow ID' = 'ID'))

You should probably use match :
df1$df2_paddock <- df2$paddock[match(df1$cow_ID, df2$ID)]
df1
# ID paddock cow_ID df2_paddock
#1 90/123 10 09/123 20
#2 90/124 11 09/124 21
data
df1 <- structure(list(ID = structure(1:2, .Label = c("90/123", "90/124"
), class = "factor"), paddock = 10:11, cow_ID = structure(1:2, .Label = c("09/123",
"09/124"), class = "factor")), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = 20:21), class = "data.frame",
row.names = c(NA, -2L))

You can do that by joining the two dataframes and getting the column that you want.
Using Base R
df1 <-
  data.frame(
    ID = c("90/123", "90/124"),
    paddock = c(10, 11),
    cow_ID = c("09/123", "09/124")
  )
# df2 is keyed by the cow ID (as in the question), not by df1's own ID.
df2 <-
  data.frame(
    ID = c("09/123", "09/124"),
    paddock = c(20, 21)
  )
# Join df1$cow_ID against df2$ID, then choose the column of interest.
# Both inputs have a `paddock` column, so merge() disambiguates them with the
# suffixes: paddock.x comes from df1, paddock.y from df2.
merge(
  df1, df2,
  by.x = "cow_ID", by.y = "ID",
  suffixes = c(".x", ".y")
)["paddock.y"]
# paddock.y
# 20
# 21
Using Dplyr
library(dplyr)
df1 <-
  data.frame(
    ID = c("90/123", "90/124"),
    paddock = c(10, 11),
    cow_ID = c("09/123", "09/124")
  )
# df2 is keyed by the cow ID (as in the question), not by df1's own ID.
df2 <-
  data.frame(
    ID = c("09/123", "09/124"),
    paddock = c(20, 21)
  )
# Join df1$cow_ID against df2$ID, then choose the column of interest.
# dplyr's argument is `suffix` (not `suffixes`, which is base merge()'s name);
# paddock.x comes from df1 and paddock.y from df2.
df1 %>%
  inner_join(df2, by = c("cow_ID" = "ID"), suffix = c(".x", ".y")) %>%
  select(paddock = paddock.y) # select() can rename while selecting
# paddock
# 20
# 21

If you would like to use ifelse(), maybe you can use the following code to make it
with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
such that
> with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
[1] 20 21
DATA
df1 <- structure(list(ID = structure(1:3, .Label = c("90/123", "90/124",
"90/125"), class = "factor"), paddock = c(10, 11, 11), cow_ID = structure(c(1L,
2L, 2L), .Label = c("09/123", "09/124"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = c(20, 21)), class = "data.frame", row.names = c(NA,
-2L))

Related

how to identify whether all data frame in a list has unique ID or not

I have a list of data frames. Is there a smart way to tell whether each data frame in lst has unique IDs, and to create a summary table like the one below?
Sample data:
lst<-list(structure(list(ID = c("Tom", "Jerry", "Mary"), Score = c(85,
85, 96)), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame")), structure(list(ID = c("Tom", "Jerry", "Mary",
"Jerry"), Score = c(75, 65, 88, 98)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(ID = c("Tom", "Jerry",
"Tom"), Score = c(97, 65, 96)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")))
We could loop over the list and check with n_distinct
library(dplyr)
library(stringr)
library(purrr)
map_dfr(setNames(lst, str_c("df", seq_along(lst))),
~.x %>%
summarise(UniqueID = c("N", "Y")[1 + (n_distinct(ID) == n())]), .id= 'Data')
-output
# A tibble: 3 × 2
Data UniqueID
<chr> <chr>
1 df1 Y
2 df2 N
3 df3 N
In base R:
# One row per data frame in `lst`: "Y" when no ID value is repeated in it.
# seq_along()/vapply()/sprintf() keep lengths and types consistent, so an
# empty `lst` yields a zero-row result instead of an error.
data.frame(
  Data = sprintf("df%d", seq_along(lst)),
  UniqueID = c("N", "Y")[
    vapply(lst, \(x) anyDuplicated(x$ID) == 0L, logical(1)) + 1L
  ]
)
Data UniqueID
1 df1 Y
2 df2 N
3 df3 N

how to get the variable list from each data frame in a list

I have a list of df, and I would like to rename the df as df1, df2, df3. and then create a summary like below to capture the variables in each df. What should I do?
I tried to use map with setNames on the data frames in lst, but I must be doing it the wrong way: my current code sets the variable names inside each data frame to df1, df2, df3 instead. 😅
lst<- map( lst ~
setNames(.x, str_c("df", seq_along(lst))))
sample data:
lst<-list(structure(list(ID = c("Tom", "Jerry", "Mary"), Score = c(85,
85, 96), Test = c("Y", "N", "Y")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(ID = c("Tom", "Jerry",
"Mary", "Jerry"), Score = c(75, 65, 88, 98), try = c("Y", NA,
"N", NA)), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame")), structure(list(ID = c("Tom", "Jerry", "Tom"),
Score = c(97, 65, 96), weight = c("A", NA, "C")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")))
We get the column names with names/colnames by looping, paste to a single string with toString, convert to a data.frame column and bind the elements (_dfr).
library(purrr)
library(dplyr)
library(stringr)
setNames(lst, str_c("df", seq_along(lst))) %>%
map_dfr(~ tibble(Var = toString(names(.x))), .id = 'Data')
-output
# A tibble: 3 × 2
Data Var
<chr> <chr>
1 df1 ID, Score, Test
2 df2 ID, Score, try
3 df3 ID, Score, weight

Merge List containing Lists with the same structure

I have a list containing multiple lists which all have the same structure:
ls <- list(
one = list(df = data.frame(var1_1 = c(1, 1, 1),
var1_2 = c('a', 'a', 'a')),
ls = list(n_df_1 = data.frame(var3_1 = c('x', 'x', 'x'),
var3_2 = c(4, 4, 4))),
name = c("one", "one", "one")),
two = list(df = data.frame(var1_1 = c(1, 1, 1),
var1_2 = c('a', 'a', 'a')),
ls = list(n_df_1 = data.frame(var3_1 = c('x', 'x', 'x'),
var3_2 = c(4, 4, 4))),
name = c("two", "two", "two")))
I want to merge all these nested lists like stated here: Merge Two Lists in R
It does exactly what I want if I do this:
merged <- mapply(c, ls[[1]], ls[[2]], SIMPLIFY = FALSE)
The problem is that the main list (ls) doesn't always contain exactly two nested lists. How can I make this code more modular?
I tried to make a vector containing all indexes of the nested lists:
sapply(seq_along(ls), function(x) paste0("ls[[", x, "]]"))
Which output this:
[1] "ls[[1]]" "ls[[2]]"
I thought I could unquote this character vector so that R sees its elements as objects, but I can't figure out how to do that (if it's even possible). I looked at tidy eval for this, but I don't know if this is the way to do it.
Any suggestions?
You can use Reduce to do it on an abstract number of list elements, i.e.
# Fold the element-wise combiner over however many sub-lists l1 holds.
Reduce(function(...)Map(c, ...), l1) # Map(f, ...) behaves like mapply(f, ..., SIMPLIFY = FALSE)
which gives,
$df
$df$var1_1
[1] 1 1 1
$df$var1_2
[1] a a a
Levels: a
$df$var1_1
[1] 1 1 1
$df$var1_2
[1] a a a
Levels: a
$ls
$ls$n_df_1
var3_1 var3_2
1 x 4
2 x 4
3 x 4
$ls$n_df_1
var3_1 var3_2
1 x 4
2 x 4
3 x 4
$name
[1] "one" "one" "one" "two" "two" "two"
DATA:
dput(l1)
list(one = list(df = structure(list(var1_1 = c(1, 1, 1), var1_2 = structure(c(1L,
1L, 1L), .Label = "a", class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), ls = list(n_df_1 = structure(list(var3_1 = structure(c(1L,
1L, 1L), .Label = "x", class = "factor"), var3_2 = c(4, 4, 4)), class = "data.frame", row.names = c(NA,
-3L))), name = c("one", "one", "one")), two = list(df = structure(list(
var1_1 = c(1, 1, 1), var1_2 = structure(c(1L, 1L, 1L), .Label = "a", class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), ls = list(n_df_1 = structure(list(var3_1 = structure(c(1L,
1L, 1L), .Label = "x", class = "factor"), var3_2 = c(4, 4, 4)), class = "data.frame", row.names = c(NA,
-3L))), name = c("two", "two", "two")))

Reshape dataframe to be three columns wide without knowing variable names

I have a dataframe with unknown column names, but a consistent format. How can I reshape it to be three columns wide without using column names?
cpu[,1:6]
Datapoints.Timestamp Datapoints.Maximum Datapoints.Unit Datapoints.Timestamp.1 Datapoints.Maximum.1 Datapoints.Unit.1
1 2019-03-05T08:00:00Z 7.833333 Percent 2019-03-11T22:00:00Z 24.25 Percent
GOAL
Timestamp Maximum Unit
2019-03-05T08:00:00Z 7.833333 Percent
.....
Dataset:
> dput(cpu[,1:6])
structure(list(Datapoints.Timestamp = structure(1L, .Label = "2019-03-05T08:00:00Z", class = "factor"),
Datapoints.Maximum = 7.83333333332848, Datapoints.Unit = structure(1L, .Label = "Percent", class = "factor"),
Datapoints.Timestamp.1 = structure(1L, .Label = "2019-03-11T22:00:00Z", class = "factor"),
Datapoints.Maximum.1 = 24.2500000000048, Datapoints.Unit.1 = structure(1L, .Label = "Percent", class = "factor")), .Names = c("Datapoints.Timestamp",
"Datapoints.Maximum", "Datapoints.Unit", "Datapoints.Timestamp.1",
"Datapoints.Maximum.1", "Datapoints.Unit.1"), class = "data.frame", row.names = c(NA,
-1L))
do.call(rbind, lapply(seq(1, NCOL(df1), 3), function(i)
setNames(df1[,i+(0:2)], colnames(df1)[1:3])))
# Datapoints.Timestamp Datapoints.Maximum Datapoints.Unit
#1 2019-03-05T08:00:00Z 7.833333 Percent
#2 2019-03-11T22:00:00Z 24.250000 Percent

Merging two dataframes using only columns from one dataframe and ignoring others in R

I am trying to merge two dataframes, I've been reading the different posts but I couldn't find a way to obtain my desired output.
dfA:
Name Surname C
Ja Men T
Ale Bu T
Ge Men
dfB:
Name Surname C Ex
Ge Men T hello
Je Di T hello
Desired output:
Merge:
Name Surname C
Ja Men T
Ale Bu T
Ge Men T
Je Di T
That is, fill the columns in dfA with the available columns in dfB and ignore the columns from dfB that are not present in dfA.
I tried:
merge(dfA,dfB, by=c("Name", "Surname", "Caracter"), all.x = T)
And other combinations of the merge. I tried using dplyr but couldn't get a satisfactory results.
Any help would be appreciated.
Thanks in advance
Data:
dfA <- data.frame(
name=c("Ja", "Ale", "Ge"),
surname=c("Men", "Bu", "Men"),
C= c("T", "T", NA))
dfB <- data.frame(
name=c("Ge", "Je"),
surname=c("Men","Di"),
C= c("T","T"),
X = c("hello","hello"))
Using dput():
# based on dput(dfA)
dfA <- structure(list(name = structure(c(3L, 1L, 2L), .Label = c("Ale",
"Ge", "Ja"), class = "factor"), surname = structure(c(2L, 1L,
2L), .Label = c("Bu", "Men"), class = "factor"), C = structure(c(1L,
1L, NA), .Label = "T", class = "factor")), .Names = c("name",
"surname", "C"), row.names = c(NA, -3L), class = "data.frame")
# based on dput(dfB)
dfB <- structure(list(name = structure(1L, .Label = "Ge", class = "factor"),
surname = structure(1L, .Label = "Men", class = "factor"),
C = "T", X = structure(1L, .Label = "hello", class = "factor")),
.Names = c("name", "surname", "C", "X"),
row.names = c(NA, -1L), class = "data.frame")
Assuming that the input is as in the output shown at the end of the question, we perform a left join of dfA with dfB. Note that coalesce returns its first non-null argument -- NAs are regarded as SQL nulls:
library(sqldf)
sqldf("select A.Name, A.Surname, coalesce(A.C, B.C) C
from dfA A left join dfB B on A.Name = B.Name and A.Surname = B.Surname")
giving:
name surname C
1 Ja Men T
2 Ale Bu T
3 Ge Men T
We could use safe_full_join from my package safejoin, and resolve column conflicts using dplyr::coalesce :
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)
safe_full_join(dfA, dfB[names(dfA)], by=c("name","surname"), conflict = coalesce, check="")
# name surname C
# 1 Ja Men T
# 2 Ale Bu T
# 3 Ge Men T
# 4 Je Di T
check = "" suppresses the warning that would otherwise be displayed, as we're joining on factor columns with different levels

Resources