Select rows based on rows of another data.frame - r

I have these following data.frames:
dt1
Id Mother Weight
1 elly 10
2 bina 20
3 sirce 30
4 tina 30
5 lina 40
and
dt2
Id Mother Weight sex
1 elly 10 M
2 bina 20 F
3 sirce 30 F
And I would like to select the rows of dt1 whose Id does not appear in dt2, like this:
new.dt
Id Mother Weight sex
4 tina 30 NA
5 lina 40 NA

Here is one option with anti_join
library(dplyr)
anti_join(dt1 %>%
mutate(sex = NA), dt2, by = 'Id')
# Id Mother Weight sex
#1 4 tina 30 NA
#2 5 lina 40 NA
data
dt1 <- structure(list(Id = 1:5, Mother = c("elly", "bina", "sirce",
"tina", "lina"), Weight = c(10L, 20L, 30L, 30L, 40L)),
class = "data.frame", row.names = c(NA,
-5L))
dt2 <- structure(list(Id = 1:3, Mother = c("elly", "bina", "sirce"),
Weight = c(10L, 20L, 30L), sex = c("M", "F", "F")),
class = "data.frame", row.names = c(NA,
-3L))

transform(dt1[!dt1$Id %in% dt2$Id,], sex = NA)
# Id Mother Weight sex
#4 4 tina 30 NA
#5 5 lina 40 NA
d = merge(dt1, dt2, all = TRUE)
d[is.na(d$sex),]
# Id Mother Weight sex
#4 4 tina 30 <NA>
#5 5 lina 40 <NA>

Related

dplyr join with three data frame

I have 3 data frames like this:
df1 <- structure(list(Vehicle = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
df3 <- structure(list(Vehicle = c("Car1", "Car2", "Car9"), Year = c(20L,
21L, 92L), type = c("I", "I", "I")), class = "data.frame", row.names = c(NA, -3L))
And I need to make a new table as follows
Vehicle Year type
Car1 20 A/M/I
Car2 21 A/M/I
Car7 90 M
Car8 20 A
Car9 92 I
For this purpose I used the following dplyr code, but it does not work with 3 data frames:
dplyr::full_join(df1, df2, df3, by = c('Vehicle', 'Year')) %>%
tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)
Try this approach. Instead of merging, it looks like you want to combine all the data frames and then aggregate. Here is the code using dplyr (use collapse = '/' instead of '|' to match the separator in the requested output):
library(dplyr)
#Code
newdf <- bind_rows(df1,df2,df3) %>%
group_by(Vehicle,Year) %>%
summarise(type=paste0(type,collapse='|'))
Output:
# A tibble: 5 x 3
# Groups: Vehicle [5]
Vehicle Year type
<chr> <int> <chr>
1 Car1 20 A|M|I
2 Car2 21 A|M|I
3 Car7 90 M
4 Car8 20 A
5 Car9 92 I
Generally, to merge >2 data.frame's/tibble's you'd use either base R's Reduce or purrr::reduce; for example using the latter:
list(df1, df2, df3) %>%
purrr::reduce(dplyr::full_join, by = c("Vehicle", "Year")) %>%
tidyr::unite(type, dplyr::starts_with("type"), sep = "/", na.rm = TRUE)
# Vehicle Year type
#1 Car1 20 A/M/I
#2 Car2 21 A/M/I
#3 Car8 20 A
#4 Car7 90 M
#5 Car9 92 I
Using base R
aggregate(type ~ Vehicle + Year, rbind(df1, df2, df3) ,
FUN = paste, collapse="|")
-output
# Vehicle Year type
#1 Car1 20 A|M|I
#2 Car8 20 A
#3 Car2 21 A|M|I
#4 Car7 90 M
#5 Car9 92 I

How to use regex for ifelse tidyverse

My df is as follows
monday_A monday_B tuesday_A tuesday_B
1 2 4 100
6 7 8 5
I want to reorder this so it becomes
date Group quantitive
Monday A 1
Monday A 6
Monday B 2
Monday B 7
Tuesday A 4
Tuesday A 8
Tuesday B 100
Tuesday B 5
What i've done
df %>% pivot_longer(monday_A:tuesday_B, names_to="tempGroup", values_to="quantitive")
This made it
tempGroup quantitive
monday_A 1
monday_A 6
monday_B 2
monday_B 7
tuesday_A 4
tuesday_A 8
tuesday_B 100
tuesday_B 5
Now how do I separate tempGroup? I think a regex with ifelse could do it by splitting on the underscore.
Use names_sep :
tidyr::pivot_longer(df, cols = everything(),
names_sep = "_",
names_to= c("date", "tempGroup"),
values_to="quantitative")
# A tibble: 8 x 3
# date tempGroup quantitative
# <chr> <chr> <int>
#1 monday A 1
#2 monday B 2
#3 tuesday A 4
#4 tuesday B 100
#5 monday A 6
#6 monday B 7
#7 tuesday A 8
#8 tuesday B 5
data
df <- structure(list(monday_A = c(1L, 6L), monday_B = c(2L, 7L),
tuesday_A = c(4L, 8L), tuesday_B = c(100L, 5L)),
class = "data.frame", row.names = c(NA, -2L))
Base R Solution:
# Transpose dataframe matrix: tpd => as.data.frame
tpd <- as.data.frame(t(df))
# Restructure the dataframe into the desired format: df_td => data.frame
df_td <-
data.frame(
day = gsub("_.*", "", rep(row.names(tpd), ncol(tpd))),
group = gsub(".*_", "", rep(row.names(tpd), ncol(tpd))),
quantitative = unlist(tpd),
row.names = NULL
)
Data
# Create re-usable data: df => data.frame
df <-
structure(
list(
monday_A = c(1L, 6L),
monday_B = c(2L, 7L),
tuesday_A = c(4L,
8L),
tuesday_B = c(100L, 5L)
),
row.names = c(NA,-2L),
class = "data.frame"
)

R split each row of a dataframe into two rows

I would like to split each row of a data frame (numeric) into two rows. For example, part of the original data frame looks like this (nrow(original data frame) > 2800000):
ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47
And after splitting each row, we get:
ID X Y Z
1 3 2 6
22 54 NA NA
6 11 5 9
52 71 NA NA
3 7 2 5
2 34 NA NA
5 10 7 1
23 47 NA NA
the "value_1" and "value_2" columns are split and each element is set to a new row. For example, value_1 = 22 and value_2 = 54 are set to a new row.
Here is one option with data.table. We convert the 'data.frame' to 'data.table' by creating a column of rownames (setDT(df1, keep.rownames = TRUE)). Subset the columns 1:5 and 1, 6, 7 in a list, rbind the list element with fill = TRUE option to return NA for corresponding columns that are not found in one of the datasets, order by the row number ('rn') and assign (:=) the row number column to 'NULL'.
library(data.table)
setDT(df1, keep.rownames = TRUE)[]
rbindlist(list(df1[, 1:5, with = FALSE], setnames(df1[, c(1, 6:7),
with = FALSE], 2:3, c("ID", "X"))), fill = TRUE)[order(rn)][, rn:= NULL][]
# ID X Y Z
#1: 1 3 2 6
#2: 22 54 NA NA
#3: 6 11 5 9
#4: 52 71 NA NA
#5: 3 7 2 5
#6: 2 34 NA NA
#7: 5 10 7 1
#8: 23 47 NA NA
A tidyverse equivalent of the above logic would be
library(dplyr)
tibble::rownames_to_column(df1[1:4]) %>%
bind_rows(., setNames(tibble::rownames_to_column(df1[5:6]),
c("rowname", "ID", "X"))) %>%
arrange(rowname) %>%
select(-rowname)
# ID X Y Z
#1 1 3 2 6
#2 22 54 NA NA
#3 6 11 5 9
#4 52 71 NA NA
#5 3 7 2 5
#6 2 34 NA NA
#7 5 10 7 1
#8 23 47 NA NA
data
df1 <- structure(list(ID = c(1L, 6L, 3L, 5L), X = c(3L, 11L, 7L, 10L
), Y = c(2L, 5L, 2L, 7L), Z = c(6L, 9L, 5L, 1L), value_1 = c(22L,
52L, 2L, 23L), value_2 = c(54L, 71L, 34L, 47L)), .Names = c("ID",
"X", "Y", "Z", "value_1", "value_2"), class = "data.frame",
row.names = c(NA, -4L))
Here's a (very slow) pure R solution using no extra packages:
# Replicate your matrix
input_df <- data.frame(
  ID = rnorm(10000),
  X = rnorm(10000),
  Y = rnorm(10000),
  Z = rnorm(10000),
  value_1 = rnorm(10000),
  value_2 = rnorm(10000)
)
# Preallocate an all-NA numeric data frame: two output rows per input row,
# and two fewer columns than the input (value_1/value_2 are folded into rows).
n_in <- nrow(input_df)
output_df <- data.frame(
  matrix(NA_real_, nrow = n_in * 2, ncol = ncol(input_df) - 2)
)
colnames(output_df) <- c("ID", "X", "Y", "Z")
# Input and output are indexed separately: input row k maps to output rows
# 2k - 1 and 2k. Using the output row number to index the input would skip
# every other input row and run past its last row.
first_rows <- seq(1, by = 2, length.out = n_in)
# Odd output rows receive the first four input columns.
output_df[first_rows, ] <- input_df[, 1:4]
# Even output rows receive value_1/value_2 in the first two columns; the
# remaining two columns keep their preallocated NA.
output_df[first_rows + 1, 1:2] <- input_df[, 5:6]
Which results in
> head(output_df)
X1 X2 X3 X4
1 0.5529417 -0.93859275 2.0900276 -2.4023800
2 0.9751090 0.13357075 NA NA
3 0.6753835 0.07018647 0.8529300 -0.9844643
4 1.6405939 0.96133195 NA NA
5 0.3378821 -0.44612782 -0.8176745 0.2759752
6 -0.8910678 -0.37928353 NA NA
This should work
# Example input: four key columns followed by two value columns.
data <- read.table(text = "ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47", header = TRUE)
# First half: the four columns that form the odd output rows.
data1 <- data[, 1:4]
# Second half: the remaining columns, selected by name. Calling setdiff() on
# the data frames themselves de-duplicates rows, which would drop repeated
# (value_1, value_2) pairs and misalign the two halves.
data2 <- data[, setdiff(names(data), names(data1)), drop = FALSE]
# Rename the value columns so they line up under the leading key columns.
names(data2) <- names(data1)[seq_len(ncol(data2))]
# Pad the missing trailing columns with NA so both halves share one layout.
data2[setdiff(names(data1), names(data2))] <- NA
# Stack the halves: rows 1..n come from data1, rows n+1..2n from data2.
combined <- rbind(data1, data2)
n <- nrow(data1)
# Interleave: 1, n + 1, 2, n + 2, ... puts each value row after its source row.
combined[kronecker(seq_len(n), c(0, n), "+"), ]
Though why you would need to do this beats me.

Update rows of data frame in R

Suppose I start with a data frame:
ID Measurement1 Measurement2
1 45 104
2 34 87
3 23 99
4 56 67
...
Then I have a second data frame which is meant to be used to update records in the first:
ID Measurement1 Measurement2
2 10 11
4 21 22
How do I use R to end up with:
ID Measurement1 Measurement2
1 45 104
2 10 11
3 23 99
4 21 22
...
The data frames in reality are very large datasets.
We can use match to get the row index. Using that index to subset the rows, we replace the 2nd and 3rd columns of the first dataset with the corresponding columns of second dataset.
# Position in df1 of each ID listed in df2 (NA where df1 has no such ID).
ind <- match(df2$ID, df1$ID)
# Data-frame assignment rejects NA row subscripts, so only update the rows
# whose ID was actually found; unmatched df2 rows are left out.
found <- !is.na(ind)
# Overwrite the 2nd and 3rd columns of the matched rows with the new values.
df1[ind[found], 2:3] <- df2[found, 2:3]
df1
# ID Measurement1 Measurement2
#1 1 45 104
#2 2 10 11
#3 3 23 99
#4 4 21 22
Or we can use data.table to join the dataset on the 'ID' column (after converting the first dataset to 'data.table' i.e. setDT(df1)), and assign the 'Cols' with the 'iCols' from the second dataset.
library(data.table)#v1.9.6+
Cols <- names(df1)[-1]
iCols <- paste0('i.', Cols)
setDT(df1)[df2, (Cols) := mget(iCols), on= 'ID'][]
# ID Measurement1 Measurement2
#1: 1 45 104
#2: 2 10 11
#3: 3 23 99
#4: 4 21 22
data
df1 <- structure(list(ID = 1:4, Measurement1 = c(45L, 34L, 23L, 56L),
Measurement2 = c(104L, 87L, 99L, 67L)), .Names = c("ID",
"Measurement1", "Measurement2"), class = "data.frame",
row.names = c(NA, -4L))
df2 <- structure(list(ID = c(2L, 4L), Measurement1 = c(10L, 21L),
Measurement2 = c(11L,
22L)), .Names = c("ID", "Measurement1", "Measurement2"),
class = "data.frame", row.names = c(NA, -2L))
library(dplyr)
df1 %>%
anti_join(df2, by = "ID") %>%
bind_rows(df2) %>%
arrange(ID)
dplyr 1.0.0 introduced a family of SQL-inspired functions for modifying rows. In this case you can now use rows_update():
library(dplyr)
df1 %>%
rows_update(df2, by = "ID")
ID Measurement1 Measurement2
1 1 45 104
2 2 10 11
3 3 23 99
4 4 21 22

R - How to transform a list of lists to matrix with unique indexes

I have a list which looks like this
[[1]]
users V1
1 28 3
2 33 1
3 35 4
4 260 1
[[2]]
users V1
1 33 2
2 260 1
3 285 13
How can I create a table like this one using R?
users V1 V2
1 28 3 NA
2 33 1 2
3 35 4 NA
4 260 1 1
5 285 NA 13
You can try
Reduce(function(...) merge(..., by='users', all=TRUE), lst)
# users V1.x V1.y
#1 28 3 NA
#2 33 1 2
#3 35 4 NA
#4 260 1 1
#5 285 NA 13
Another option would be to use join_all from plyr. But, this requires the column names other than the one used in the by= to be named differently.
library(plyr)
nm1 <- make.names(sapply(lst, colnames)[-1,],unique=TRUE)
join_all(Map(function(x,y) {names(x)[-1] <- y; x}, lst, nm1),
by='users', type='full')
# users V1 V1.1
#1 28 3 NA
#2 33 1 2
#3 35 4 NA
#4 260 1 1
#5 285 NA 13
data
lst <- list(structure(list(users = c(28L, 33L, 35L, 260L), V1 = c(3L,
1L, 4L, 1L)), .Names = c("users", "V1"), class = "data.frame",
row.names = c("1", "2", "3", "4")), structure(list(users = c(33L, 260L, 285L),
V1 = c(2L, 1L, 13L)), .Names = c("users", "V1"), class = "data.frame",
row.names = c("1", "2", "3")))
You can use do.call with merge on your list.
l <- list(structure(list(users = c(28, 33, 35, 260), V1 = c(3, 1,
4, 1)), .Names = c("users", "V1"), row.names = c(NA, -4L), class = "data.frame"),
structure(list(users = c(33, 260, 285), V1 = c(2, 1, 13)), .Names = c("users",
"V1"), row.names = c(NA, -3L), class = "data.frame"))
do.call(merge, c(l, list(all = TRUE, by = "users")))
# users V1.x V1.y
#1 28 3 NA
#2 33 1 2
#3 35 4 NA
#4 260 1 1
#5 285 NA 13

Resources